├── LICENSE
├── README.md
├── data
    ├── example_video.mp4
    └── example_video.wav
├── environment.yml
├── header.py
├── inference.py
└── model
    ├── ImageBind
        ├── CODE_OF_CONDUCT.md
        ├── CONTRIBUTING.md
        ├── LICENSE
        ├── README.md
        ├── __init__.py
        ├── __pycache__
        │   ├── __init__.cpython-39.pyc
        │   └── data.cpython-39.pyc
        ├── bpe
        │   └── bpe_simple_vocab_16e6.txt.gz
        ├── data.py
        ├── model_card.md
        ├── models
        │   ├── __init__.py
        │   ├── __pycache__
        │   │   ├── __init__.cpython-39.pyc
        │   │   ├── helpers.cpython-39.pyc
        │   │   ├── imagebind_model.cpython-39.pyc
        │   │   ├── multimodal_preprocessors.cpython-39.pyc
        │   │   └── transformer.cpython-39.pyc
        │   ├── helpers.py
        │   ├── imagebind_model.py
        │   ├── multimodal_preprocessors.py
        │   └── transformer.py
        └── requirements.txt
    ├── Qformer.py
    ├── __init__.py
    ├── __pycache__
        ├── Qformer.cpython-39.pyc
        ├── __init__.cpython-39.pyc
        ├── agent.cpython-39.pyc
        ├── eva_vit.cpython-39.pyc
        ├── modeling_llama.cpython-39.pyc
        ├── modeling_whisper.cpython-39.pyc
        └── openllama.cpython-39.pyc
    ├── agent.py
    ├── eva_vit.py
    ├── modeling_llama.py
    ├── modeling_whisper.py
    └── openllama.py


/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # FAVOR
 2 | Fine-grained Audio-Visual Joint Representations for Multimodal Large Language Models
 3 | 
 4 | <a href='https://881c5a6a6db84b1a2f.gradio.live'><img src='https://img.shields.io/badge/gradio-demo-blue'></a>
 5 | 
 6 | Button Specifications:
 7 | 
 8 | `Clear All`: clear the chat history as well as all modality inputs. **Please always click `Clear All` before uploading or updating any image, audio or video.**
 9 | 
10 | `Clear history`: only clear chat history. The modality input will remain unchanged unless you click `Clear All`.
11 | 
12 | `Submit`: submit the text in the text box to get a response
13 | 
14 | `Resubmit`: clear the previous conversation turn and then submit the text in the text box
15 | 
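16 | `maximum length`, `top p` and `temperature` control generation: the maximum length of the response, the nucleus (top-p) sampling threshold, and the sampling temperature.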
17 | 
18 | Examples mentioned in the paper are provided. Please feel free to start with those.
19 | 
20 | 
21 | ## Getting started
22 | ```
23 | cd AudioVisualLLM
24 | conda env create -f environment.yml
25 | mkdir ckpt
26 | ```
27 | 
28 | Download the model checkpoint [here](https://drive.google.com/drive/folders/166g9WVWXwYP77VJyOd3isi_UvRpmX_cv?usp=sharing) and put the downloaded folder under `ckpt/`.
29 | 
30 | 
31 | ## Inference
32 | ```
33 | conda activate favor
34 | python inference.py
35 | ==========================Output=============================
36 | The video is a romantic scene of a man and a woman on a boat. The man is holding the woman in his arms, and they are both looking at the sunset. The audio is a song that adds to the romantic atmosphere. The woman says "I'm flying" and "Jack," which suggests that they are happy and enjoying the moment. The setting of the boat and the sunset create a beautiful and serene environment that enhances the romantic feel of the video. The man and the woman's body language and facial expressions also convey their love and affection for each other. Overall, the video is a perfect representation of a romantic and intimate moment between two people.
37 | ```
38 | 
39 | ## Reference
40 | Please cite our paper if you use our model
41 | ```
42 | @article{sun2023finegrained,
43 |       title={Fine-grained Audio-Visual Joint Representations for Multimodal Large Language Models}, 
44 |       author={Guangzhi Sun and Wenyi Yu and Changli Tang and Xianzhao Chen and Tian Tan and Wei Li and Lu Lu and Zejun Ma and Chao Zhang},
45 |       year={2023},
46 |       journal={arXiv:2310.05863},
47 | }
48 | ```
49 | 


--------------------------------------------------------------------------------
/data/example_video.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BriansIDP/AudioVisualLLM/52ab7d14b499e7c951df26c89ce2d3673976be9e/data/example_video.mp4


--------------------------------------------------------------------------------
/data/example_video.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BriansIDP/AudioVisualLLM/52ab7d14b499e7c951df26c89ce2d3673976be9e/data/example_video.wav


--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
  1 | name: favor
  2 | channels:
  3 |   - nvidia
  4 |   - conda-forge
  5 |   - pytorch
  6 |   - defaults
  7 | dependencies:
  8 |   - _libgcc_mutex=0.1=main
  9 |   - _openmp_mutex=5.1=1_gnu
 10 |   - _sysroot_linux-64_curr_repodata_hack=3=haa98f57_10
 11 |   - binutils_impl_linux-64=2.38=h2a08ee3_1
 12 |   - binutils_linux-64=2.38.0=hc2dff05_0
 13 |   - blas=1.0=mkl
 14 |   - brotli-python=1.0.9=py39h6a678d5_7
 15 |   - bzip2=1.0.8=h7b6447c_0
 16 |   - ca-certificates=2023.7.22=hbcca054_0
 17 |   - cffi=1.15.1=py39h5eee18b_3
 18 |   - cuda=11.6.1=0
 19 |   - cuda-cccl=11.6.55=hf6102b2_0
 20 |   - cuda-command-line-tools=11.6.2=0
 21 |   - cuda-compiler=11.6.2=0
 22 |   - cuda-cudart=11.6.55=he381448_0
 23 |   - cuda-cudart-dev=11.6.55=h42ad0f4_0
 24 |   - cuda-cuobjdump=11.6.124=h2eeebcb_0
 25 |   - cuda-cupti=11.6.124=h86345e5_0
 26 |   - cuda-cuxxfilt=11.6.124=hecbf4f6_0
 27 |   - cuda-driver-dev=11.6.55=0
 28 |   - cuda-gdb=12.3.52=0
 29 |   - cuda-libraries=11.6.1=0
 30 |   - cuda-libraries-dev=11.6.1=0
 31 |   - cuda-memcheck=11.8.86=0
 32 |   - cuda-nsight=12.3.52=0
 33 |   - cuda-nsight-compute=12.3.0=0
 34 |   - cuda-nvcc=11.6.124=hbba6d2d_0
 35 |   - cuda-nvdisasm=12.3.52=0
 36 |   - cuda-nvml-dev=11.6.55=haa9ef22_0
 37 |   - cuda-nvprof=12.3.52=0
 38 |   - cuda-nvprune=11.6.124=he22ec0a_0
 39 |   - cuda-nvrtc=11.6.124=h020bade_0
 40 |   - cuda-nvrtc-dev=11.6.124=h249d397_0
 41 |   - cuda-nvtx=11.6.124=h0630a44_0
 42 |   - cuda-nvvp=12.3.52=0
 43 |   - cuda-runtime=11.6.1=0
 44 |   - cuda-samples=11.6.101=h8efea70_0
 45 |   - cuda-sanitizer-api=12.3.52=0
 46 |   - cuda-toolkit=11.6.1=0
 47 |   - cuda-tools=11.6.1=0
 48 |   - cuda-visual-tools=11.6.1=0
 49 |   - ffmpeg=4.3=hf484d3e_0
 50 |   - freetype=2.12.1=h4a9f257_0
 51 |   - gcc_impl_linux-64=11.2.0=h1234567_1
 52 |   - gcc_linux-64=11.2.0=h5c386dc_0
 53 |   - gds-tools=1.8.0.34=0
 54 |   - giflib=5.2.1=h5eee18b_3
 55 |   - gmp=6.2.1=h295c915_3
 56 |   - gnutls=3.6.15=he1e5248_0
 57 |   - gxx_impl_linux-64=11.2.0=h1234567_1
 58 |   - gxx_linux-64=11.2.0=hc2dff05_0
 59 |   - idna=3.4=py39h06a4308_0
 60 |   - intel-openmp=2023.1.0=hdb19cb5_46305
 61 |   - jpeg=9e=h5eee18b_1
 62 |   - kernel-headers_linux-64=3.10.0=h57e8cba_10
 63 |   - lame=3.100=h7b6447c_0
 64 |   - lcms2=2.12=h3be6417_0
 65 |   - ld_impl_linux-64=2.38=h1181459_1
 66 |   - lerc=3.0=h295c915_0
 67 |   - libcublas=11.9.2.110=h5e84587_0
 68 |   - libcublas-dev=11.9.2.110=h5c901ab_0
 69 |   - libcufft=10.7.1.112=hf425ae0_0
 70 |   - libcufft-dev=10.7.1.112=ha5ce4c0_0
 71 |   - libcufile=1.8.0.34=0
 72 |   - libcufile-dev=1.8.0.34=0
 73 |   - libcurand=10.3.4.52=0
 74 |   - libcurand-dev=10.3.4.52=0
 75 |   - libcusolver=11.3.4.124=h33c3c4e_0
 76 |   - libcusparse=11.7.2.124=h7538f96_0
 77 |   - libcusparse-dev=11.7.2.124=hbbe9722_0
 78 |   - libdeflate=1.17=h5eee18b_1
 79 |   - libffi=3.4.4=h6a678d5_0
 80 |   - libgcc-devel_linux-64=11.2.0=h1234567_1
 81 |   - libgcc-ng=11.2.0=h1234567_1
 82 |   - libgfortran-ng=7.5.0=h14aa051_20
 83 |   - libgfortran4=7.5.0=h14aa051_20
 84 |   - libgomp=11.2.0=h1234567_1
 85 |   - libiconv=1.16=h7f8727e_2
 86 |   - libidn2=2.3.4=h5eee18b_0
 87 |   - libnpp=11.6.3.124=hd2722f0_0
 88 |   - libnpp-dev=11.6.3.124=h3c42840_0
 89 |   - libnvjpeg=11.6.2.124=hd473ad6_0
 90 |   - libnvjpeg-dev=11.6.2.124=hb5906b9_0
 91 |   - libpng=1.6.39=h5eee18b_0
 92 |   - libstdcxx-devel_linux-64=11.2.0=h1234567_1
 93 |   - libstdcxx-ng=11.2.0=h1234567_1
 94 |   - libtasn1=4.19.0=h5eee18b_0
 95 |   - libtiff=4.5.1=h6a678d5_0
 96 |   - libunistring=0.9.10=h27cfd23_0
 97 |   - libwebp=1.3.2=h11a3e52_0
 98 |   - libwebp-base=1.3.2=h5eee18b_0
 99 |   - lz4-c=1.9.4=h6a678d5_0
100 |   - mkl=2023.1.0=h213fc3f_46343
101 |   - mkl-service=2.4.0=py39h5eee18b_1
102 |   - mkl_fft=1.3.8=py39h5eee18b_0
103 |   - mkl_random=1.2.4=py39hdb19cb5_0
104 |   - mpi=1.0=mpich
105 |   - mpi4py=3.1.4=py39hfc96bbd_0
106 |   - mpich=3.3.2=hc856adb_0
107 |   - ncurses=6.4=h6a678d5_0
108 |   - nettle=3.7.3=hbbd107a_1
109 |   - nsight-compute=2023.3.0.12=0
110 |   - openh264=2.1.1=h4ff587b_0
111 |   - openjpeg=2.4.0=h3ad879b_0
112 |   - openssl=3.0.11=h7f8727e_2
113 |   - pycparser=2.21=pyhd3eb1b0_0
114 |   - pysocks=1.7.1=py39h06a4308_0
115 |   - python=3.9.16=h955ad1f_3
116 |   - pytorch=1.13.1=py3.9_cuda11.6_cudnn8.3.2_0
117 |   - pytorch-cuda=11.6=h867d48c_1
118 |   - pytorch-mutex=1.0=cuda
119 |   - readline=8.2=h5eee18b_0
120 |   - requests=2.31.0=py39h06a4308_0
121 |   - setuptools=67.8.0=py39h06a4308_0
122 |   - sqlite=3.41.2=h5eee18b_0
123 |   - sysroot_linux-64=2.17=h57e8cba_10
124 |   - tbb=2021.8.0=hdb19cb5_0
125 |   - tk=8.6.12=h1ccaba5_0
126 |   - torchaudio=0.13.1=py39_cu116
127 |   - torchvision=0.14.1=py39_cu116
128 |   - typing_extensions=4.7.1=py39h06a4308_0
129 |   - wheel=0.38.4=py39h06a4308_0
130 |   - xz=5.4.2=h5eee18b_0
131 |   - zlib=1.2.13=h5eee18b_0
132 |   - zstd=1.5.5=hc292b87_0
133 |   - pip:
134 |       - accelerate==0.20.3
135 |       - aiofiles==23.1.0
136 |       - aiohttp==3.8.4
137 |       - aiosignal==1.3.1
138 |       - altair==5.0.1
139 |       - antlr4-python3-runtime==4.9.3
140 |       - anyio==3.7.0
141 |       - asttokens==2.2.1
142 |       - async-timeout==4.0.2
143 |       - attrs==23.1.0
144 |       - audioread==3.0.0
145 |       - av==10.0.0
146 |       - backcall==0.2.0
147 |       - blis==0.7.9
148 |       - braceexpand==0.1.7
149 |       - catalogue==2.0.10
150 |       - certifi==2023.5.7
151 |       - charset-normalizer==3.1.0
152 |       - click==8.1.3
153 |       - confection==0.1.0
154 |       - contourpy==1.1.0
155 |       - cryptography==38.0.4
156 |       - cycler==0.11.0
157 |       - cymem==2.0.7
158 |       - data==0.4
159 |       - datasets==2.16.1
160 |       - decorator==5.1.1
161 |       - decord==0.6.0
162 |       - deepspeed==0.9.2
163 |       - deprecated==1.2.14
164 |       - dill==0.3.7
165 |       - dnspython==2.4.0
166 |       - docker-pycreds==0.4.0
167 |       - docstring-parser==0.15
168 |       - einops==0.6.1
169 |       - evaluate==0.4.1
170 |       - exceptiongroup==1.1.1
171 |       - executing==1.2.0
172 |       - fastapi==0.99.1
173 |       - ffmpy==0.3.0
174 |       - filelock==3.12.2
175 |       - fonttools==4.40.0
176 |       - frozenlist==1.3.3
177 |       - fsspec==2023.10.0
178 |       - ftfy==6.1.1
179 |       - funcsigs==1.0.2
180 |       - fvcore==0.1.5.post20221221
181 |       - gitdb==4.0.10
182 |       - gitpython==3.1.32
183 |       - gradio==3.35.2
184 |       - gradio-client==0.2.7
185 |       - grpcio==1.56.2
186 |       - h11==0.14.0
187 |       - hjson==3.1.0
188 |       - httpcore==0.18.0
189 |       - httpx==0.25.0
190 |       - huggingface-hub==0.20.2
191 |       - imageio==2.31.1
192 |       - importlib-metadata==6.7.0
193 |       - importlib-resources==5.12.0
194 |       - iopath==0.1.10
195 |       - ipaddress==1.0.23
196 |       - ipdb==0.13.13
197 |       - ipython==8.14.0
198 |       - jedi==0.18.2
199 |       - jieba==0.42.1
200 |       - jinja2==3.1.2
201 |       - jiwer==3.0.2
202 |       - joblib==1.3.2
203 |       - jsonargparse==4.14.1
204 |       - jsonschema==4.17.3
205 |       - kiwisolver==1.4.4
206 |       - langcodes==3.3.0
207 |       - latex2mathml==3.76.0
208 |       - lazy-loader==0.3
209 |       - librosa==0.10.1
210 |       - linkify-it-py==2.0.2
211 |       - llvmlite==0.41.0
212 |       - markdown==3.4.3
213 |       - markdown-it-py==2.2.0
214 |       - markupsafe==2.1.3
215 |       - matplotlib==3.7.1
216 |       - matplotlib-inline==0.1.6
217 |       - mdit-py-plugins==0.3.3
218 |       - mdtex2html==1.2.0
219 |       - mdurl==0.1.2
220 |       - more-itertools==10.1.0
221 |       - msgpack==1.0.5
222 |       - multidict==6.0.4
223 |       - multiprocess==0.70.15
224 |       - murmurhash==1.0.9
225 |       - networkx==3.1
226 |       - ninja==1.11.1
227 |       - nltk==3.8.1
228 |       - numba==0.58.0
229 |       - numpy==1.24.3
230 |       - nvidia-cublas-cu11==11.10.3.66
231 |       - nvidia-cuda-nvrtc-cu11==11.7.99
232 |       - nvidia-cuda-runtime-cu11==11.7.99
233 |       - nvidia-cudnn-cu11==8.5.0.96
234 |       - omegaconf==2.3.0
235 |       - opencv-python==4.8.1.78
236 |       - orjson==3.9.1
237 |       - packaging==22.0
238 |       - pandas==1.5.3
239 |       - parameterized==0.9.0
240 |       - parso==0.8.3
241 |       - pathtools==0.1.2
242 |       - pathy==0.10.2
243 |       - peft==0.3.0
244 |       - pexpect==4.8.0
245 |       - pickleshare==0.7.5
246 |       - pillow==9.5.0
247 |       - pip==23.3.1
248 |       - platformdirs==3.10.0
249 |       - pooch==1.7.0
250 |       - portalocker==2.7.0
251 |       - preshed==3.0.8
252 |       - promise==2.3
253 |       - prompt-toolkit==3.0.38
254 |       - protobuf==3.20.3
255 |       - psutil==5.9.5
256 |       - ptyprocess==0.7.0
257 |       - pure-eval==0.2.2
258 |       - py-cpuinfo==9.0.0
259 |       - pyarrow==14.0.2
260 |       - pyarrow-hotfix==0.6
261 |       - pydantic==1.9.0
262 |       - pydub==0.25.1
263 |       - pygments==2.15.1
264 |       - pyjwt==2.8.0
265 |       - pyopenssl==22.1.0
266 |       - pyparsing==3.1.0
267 |       - pyrsistent==0.19.3
268 |       - python-dateutil==2.8.2
269 |       - python-etcd==0.4.5
270 |       - python-multipart==0.0.6
271 |       - pytorchvideo==0.1.5
272 |       - pytz==2023.3
273 |       - pywavelets==1.4.1
274 |       - pyyaml==6.0
275 |       - rapidfuzz==2.13.7
276 |       - regex==2022.10.31
277 |       - responses==0.18.0
278 |       - rfc3986==1.5.0
279 |       - schedule==1.2.0
280 |       - scikit-image==0.21.0
281 |       - scikit-learn==1.3.1
282 |       - scipy==1.11.1
283 |       - semantic-version==2.10.0
284 |       - sentencepiece==0.1.99
285 |       - sentry-sdk==1.28.1
286 |       - setproctitle==1.3.2
287 |       - shortuuid==1.0.11
288 |       - six==1.16.0
289 |       - smart-open==6.3.0
290 |       - smmap==5.0.0
291 |       - sniffio==1.3.0
292 |       - soundfile==0.12.1
293 |       - soxr==0.3.6
294 |       - spacy==3.6.0
295 |       - spacy-legacy==3.0.12
296 |       - spacy-loggers==1.0.4
297 |       - srsly==2.4.6
298 |       - stack-data==0.6.2
299 |       - starlette==0.27.0
300 |       - tabulate==0.9.0
301 |       - termcolor==2.3.0
302 |       - thinc==8.1.10
303 |       - threadpoolctl==3.2.0
304 |       - tifffile==2023.7.18
305 |       - timm==0.6.7
306 |       - tokenizer==3.4.3
307 |       - tokenizers==0.13.3
308 |       - tomli==2.0.1
309 |       - toolz==0.12.0
310 |       - tqdm==4.64.1
311 |       - traitlets==5.9.0
312 |       - transformers==4.29.1
313 |       - typer==0.9.0
314 |       - tzdata==2023.3
315 |       - uc-micro-py==1.0.2
316 |       - urllib3==1.26.16
317 |       - uvicorn==0.22.0
318 |       - wasabi==1.1.2
319 |       - wcwidth==0.2.6
320 |       - webdataset==0.2.75
321 |       - websockets==11.0.3
322 |       - werpy==1.0.0
323 |       - wrapt==1.15.0
324 |       - xxhash==3.4.1
325 |       - yacs==0.1.8
326 |       - yarl==1.9.2
327 |       - youtube-dl==2021.12.17
328 |       - zipp==3.15.0
329 | 


--------------------------------------------------------------------------------
/header.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import datetime
 3 | import types
 4 | import deepspeed
 5 | from transformers.deepspeed import HfDeepSpeedConfig
 6 | import transformers
 7 | import numpy as np
 8 | from collections import OrderedDict
 9 | from torch.utils.data import Dataset, DataLoader
10 | from torch.nn.utils import clip_grad_norm_
11 | from torch.cuda.amp import autocast, GradScaler
12 | from torch.nn import DataParallel
13 | from torch.optim import lr_scheduler
14 | import torch.optim as optim
15 | import torch.nn as nn
16 | import torch.nn.functional as F
17 | from tqdm import tqdm
18 | import os
19 | import re
20 | import math
21 | import random
22 | import json
23 | import time
24 | import logging
25 | from copy import deepcopy
26 | import ipdb
27 | import argparse
28 | import data
29 | from transformers import LlamaTokenizer, LlamaForCausalLM, LlamaConfig
30 | from torch.nn.utils.rnn import pad_sequence
31 | from peft import LoraConfig, TaskType, get_peft_model
32 | 
# Raise the log threshold of the listed `transformers` loggers and set the
# TOKENIZERS_PARALLELISM environment variable to 'false' at import time.
for _logger_name, _level in (
    ("transformers", logging.WARNING),
    ("transformers.tokenization_utils", logging.ERROR),
):
    logging.getLogger(_logger_name).setLevel(_level)
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
36 | 


--------------------------------------------------------------------------------
/inference.py:
--------------------------------------------------------------------------------
  1 | from operator import truediv
  2 | from transformers import AutoModel, AutoTokenizer
  3 | from copy import deepcopy
  4 | import os
  5 | import time
  6 | import ipdb
  7 | import mdtex2html
  8 | from model.openllama import OpenLLAMAPEFTModel
  9 | import torch
 10 | import json
 11 | 
 12 | 
 13 | def parse_text(text):
 14 |     """copy from https://github.com/GaiZhenbiao/ChuanhuChatGPT/"""
 15 |     lines = text.split("\n")
 16 |     lines = [line for line in lines if line != ""]
 17 |     count = 0
 18 |     for i, line in enumerate(lines):
 19 |         if "```" in line:
 20 |             count += 1
 21 |             items = line.split('`')
 22 |             if count % 2 == 1:
 23 |                 lines[i] = f'<pre><code class="language-{items[-1]}">'
 24 |             else:
 25 |                 lines[i] = f'<br></code></pre>'
 26 |         else:
 27 |             if i > 0:
 28 |                 if count % 2 == 1:
 29 |                     line = line.replace("`", "\`")
 30 |                     line = line.replace("<", "&lt;")
 31 |                     line = line.replace(">", "&gt;")
 32 |                     line = line.replace(" ", "&nbsp;")
 33 |                     line = line.replace("*", "&ast;")
 34 |                     line = line.replace("_", "&lowbar;")
 35 |                     line = line.replace("-", "&#45;")
 36 |                     line = line.replace(".", "&#46;")
 37 |                     line = line.replace("!", "&#33;")
 38 |                     line = line.replace("(", "&#40;")
 39 |                     line = line.replace(")", "&#41;")
 40 |                     line = line.replace("$", "&#36;")
 41 |                 lines[i] = "<br>"+line
 42 |     text = "".join(lines)
 43 |     return text
 44 | 
 45 | 
 46 | def predict(
 47 |     input, 
 48 |     image_path, 
 49 |     audio_path, 
 50 |     video_path, 
 51 |     thermal_path, 
 52 |     max_length, 
 53 |     top_p, 
 54 |     temperature, 
 55 |     history, 
 56 |     modality_cache,
 57 | ):
 58 |     if image_path is None and audio_path is None and video_path is None and thermal_path is None:
 59 |         return [(input, "There is no input data provided! Please upload your data and start the conversation.")]
 60 | 
 61 |     # prepare the prompt
 62 |     prompt_text = ''
 63 |     for idx, (q, a) in enumerate(history):
 64 |         if idx == 0:
 65 |             prompt_text += f'{q}\nASSISTANT: {a}\n'
 66 |         else:
 67 |             prompt_text += f' USER: {q}\nASSISTANT: {a}\n'
 68 |     if len(history) == 0:
 69 |         prompt_text += f'{input}'
 70 |     else:
 71 |         prompt_text += f' USER: {input}'
 72 |     inputs = {
 73 |         'prompt': prompt_text,
 74 |         'image_paths': [image_path] if image_path else [],
 75 |         'audio_paths': [audio_path] if audio_path else [],
 76 |         'video_paths': [video_path] if video_path else [],
 77 |         'thermal_paths': [thermal_path] if thermal_path else [],
 78 |         'top_p': top_p,
 79 |         'temperature': temperature,
 80 |         'max_tgt_len': max_length,
 81 |         'dosample': True,
 82 |         'modality_embeds': modality_cache,
 83 |         'av_shift': False,
 84 |     }
 85 |     response = model.generate(inputs)
 86 |     history.append((input, response))
 87 |     return response, history, modality_cache
 88 | 
 89 | 
 90 | if __name__ == "__main__":
 91 |     # init the model
 92 |     expname = "audiovisual_vicuna13b_sepqformer_avsd_earlyalign_swqformer_causal_tune"
 93 |     # expname="audiovisual_vicuna7b_sepqformer_avsd_earlyalign_swqformer_causal_diversity"
 94 |     args = {
 95 |         'model': 'openllama_peft',
 96 |         'imagebind_ckpt_path': "",
 97 |         'vicuna_ckpt_path': "/home/gs534/rds/rds-t2-cs164-KQ4S3rlDzm8/gs534/llama/vicuna.13b/",
 98 |         'orig_delta_path': "", #'../pretrained_ckpt/pandagpt_ckpt/13b/pytorch_model.pt',
 99 |         'delta_ckpt_path': f"ckpt/{expname}/pytorch_model_1_101.pt",
100 |         'stage': 2,
101 |         'max_tgt_len': 256,
102 |         'lora_r': 32,
103 |         'lora_alpha': 32,
104 |         'lora_dropout': 0.1,
105 |         'use_lora': "true",
106 |         'qformer': "true",
107 |         'use_whisper': "true",
108 |         'use_blip': "true",
109 |         'instructblip': "true",
110 |         'proj_checkpoint': "",
111 |         'num_video_query': 32,
112 |         'num_speech_query': 32,
113 |         'instructblip_video': "false",
114 |         'video_window_size': 240,
115 |         'skip_vqformer': "false",
116 |         'speech_qformer': "false",
117 |         'early_align': "true",
118 |         'cascaded': "",
119 |         'causal': "false",
120 |         'diversity_loss': "false",
121 |         'causal_attention': "true",
122 |         'modalitymask': 'false',
123 |         'groupsize': 10,
124 |         'alignmode': 2,
125 |     }
126 |     model = OpenLLAMAPEFTModel(**args)
127 |     if args['orig_delta_path'] != '':
128 |         orig_ckpt = torch.load(args['orig_delta_path'], map_location=torch.device('cpu'))
129 |         model.load_state_dict(orig_ckpt, strict=False)
130 |     delta_ckpt = torch.load(args['delta_ckpt_path'], map_location=torch.device('cpu'))
131 |     model.load_state_dict(delta_ckpt, strict=False)
132 |     model = model.eval().half().cuda()
133 |     print(f'[!] init the 13b model over ...')
134 |     modality_cache = []
135 | 
136 |     testimage = False
137 |     withaudio = True
138 |     multiturn = False
139 |     video_path = "data/example_video.mp4"
140 |     audio_path = "data/example_video.wav"
141 |     user_input = "Explain in detail why this video together with the audio and what they say is romantic"
142 |     response, history, modality_cache = predict(user_input, None, audio_path, video_path, None, 512, 0.01, 1, [], [])
143 |     print(response)
144 | 


--------------------------------------------------------------------------------
/model/ImageBind/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
 1 | # Code of Conduct
 2 | 
 3 | ## Our Pledge
 4 | 
 5 | In the interest of fostering an open and welcoming environment, we as
 6 | contributors and maintainers pledge to make participation in our project and
 7 | our community a harassment-free experience for everyone, regardless of age, body
 8 | size, disability, ethnicity, sex characteristics, gender identity and expression,
 9 | level of experience, education, socio-economic status, nationality, personal
10 | appearance, race, religion, or sexual identity and orientation.
11 | 
12 | ## Our Standards
13 | 
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 | 
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 | 
23 | Examples of unacceptable behavior by participants include:
24 | 
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 | advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 | address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 | professional setting
33 | 
34 | ## Our Responsibilities
35 | 
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 | 
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 | 
46 | ## Scope
47 | 
48 | This Code of Conduct applies within all project spaces, and it also applies when
49 | an individual is representing the project or its community in public spaces.
50 | Examples of representing a project or community include using an official
51 | project e-mail address, posting via an official social media account, or acting
52 | as an appointed representative at an online or offline event. Representation of
53 | a project may be further defined and clarified by project maintainers.
54 | 
55 | This Code of Conduct also applies outside the project spaces when there is a
56 | reasonable belief that an individual's behavior may have a negative impact on
57 | the project or its community.
58 | 
59 | ## Enforcement
60 | 
61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
62 | reported by contacting the project team at <opensource-conduct@fb.com>. All
63 | complaints will be reviewed and investigated and will result in a response that
64 | is deemed necessary and appropriate to the circumstances. The project team is
65 | obligated to maintain confidentiality with regard to the reporter of an incident.
66 | Further details of specific enforcement policies may be posted separately.
67 | 
68 | Project maintainers who do not follow or enforce the Code of Conduct in good
69 | faith may face temporary or permanent repercussions as determined by other
70 | members of the project's leadership.
71 | 
72 | ## Attribution
73 | 
74 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
75 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
76 | 
77 | [homepage]: https://www.contributor-covenant.org
78 | 
79 | For answers to common questions about this code of conduct, see
80 | https://www.contributor-covenant.org/faq


--------------------------------------------------------------------------------
/model/ImageBind/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing to ImageBind
 2 | We want to make contributing to this project as easy and transparent as
 3 | possible.
 4 | 
 5 | ## Pull Requests
 6 | We actively welcome your pull requests.
 7 | 
 8 | 1. Fork the repo and create your branch from `main`.
 9 | 2. If you've added code that should be tested, add tests.
10 | 3. If you've changed APIs, update the documentation.
11 | 4. Ensure the test suite passes.
12 | 5. Make sure your code lints.
13 | 6. If you haven't already, complete the Contributor License Agreement ("CLA").
14 | 
15 | ## Contributor License Agreement ("CLA")
16 | In order to accept your pull request, we need you to submit a CLA. You only need
17 | to do this once to work on any of Meta's open source projects.
18 | 
19 | Complete your CLA here: <https://code.facebook.com/cla>
20 | 
21 | ## Issues
22 | We use GitHub issues to track public bugs. Please ensure your description is
23 | clear and has sufficient instructions to be able to reproduce the issue.
24 | 
25 | Meta has a [bounty program](https://www.facebook.com/whitehat/) for the safe
26 | disclosure of security bugs. In those cases, please go through the process
27 | outlined on that page and do not file a public issue.
28 | 
29 | ## License
 30 | By contributing to ImageBind, you agree that your contributions will be licensed
31 | under the [LICENSE](LICENSE) file in the root directory of this source tree.
32 | 


--------------------------------------------------------------------------------
/model/ImageBind/LICENSE:
--------------------------------------------------------------------------------
  1 | Attribution-NonCommercial-ShareAlike 4.0 International
  2 | 
  3 | =======================================================================
  4 | 
  5 | Creative Commons Corporation ("Creative Commons") is not a law firm and
  6 | does not provide legal services or legal advice. Distribution of
  7 | Creative Commons public licenses does not create a lawyer-client or
  8 | other relationship. Creative Commons makes its licenses and related
  9 | information available on an "as-is" basis. Creative Commons gives no
 10 | warranties regarding its licenses, any material licensed under their
 11 | terms and conditions, or any related information. Creative Commons
 12 | disclaims all liability for damages resulting from their use to the
 13 | fullest extent possible.
 14 | 
 15 | Using Creative Commons Public Licenses
 16 | 
 17 | Creative Commons public licenses provide a standard set of terms and
 18 | conditions that creators and other rights holders may use to share
 19 | original works of authorship and other material subject to copyright
 20 | and certain other rights specified in the public license below. The
 21 | following considerations are for informational purposes only, are not
 22 | exhaustive, and do not form part of our licenses.
 23 | 
 24 |      Considerations for licensors: Our public licenses are
 25 |      intended for use by those authorized to give the public
 26 |      permission to use material in ways otherwise restricted by
 27 |      copyright and certain other rights. Our licenses are
 28 |      irrevocable. Licensors should read and understand the terms
 29 |      and conditions of the license they choose before applying it.
 30 |      Licensors should also secure all rights necessary before
 31 |      applying our licenses so that the public can reuse the
 32 |      material as expected. Licensors should clearly mark any
 33 |      material not subject to the license. This includes other CC-
 34 |      licensed material, or material used under an exception or
 35 |      limitation to copyright. More considerations for licensors:
 36 |     wiki.creativecommons.org/Considerations_for_licensors
 37 | 
 38 |      Considerations for the public: By using one of our public
 39 |      licenses, a licensor grants the public permission to use the
 40 |      licensed material under specified terms and conditions. If
 41 |      the licensor's permission is not necessary for any reason--for
 42 |      example, because of any applicable exception or limitation to
 43 |      copyright--then that use is not regulated by the license. Our
 44 |      licenses grant only permissions under copyright and certain
 45 |      other rights that a licensor has authority to grant. Use of
 46 |      the licensed material may still be restricted for other
 47 |      reasons, including because others have copyright or other
 48 |      rights in the material. A licensor may make special requests,
 49 |      such as asking that all changes be marked or described.
 50 |      Although not required by our licenses, you are encouraged to
 51 |      respect those requests where reasonable. More considerations
 52 |      for the public:
 53 |     wiki.creativecommons.org/Considerations_for_licensees
 54 | 
 55 | =======================================================================
 56 | 
 57 | Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International
 58 | Public License
 59 | 
 60 | By exercising the Licensed Rights (defined below), You accept and agree
 61 | to be bound by the terms and conditions of this Creative Commons
 62 | Attribution-NonCommercial-ShareAlike 4.0 International Public License
 63 | ("Public License"). To the extent this Public License may be
 64 | interpreted as a contract, You are granted the Licensed Rights in
 65 | consideration of Your acceptance of these terms and conditions, and the
 66 | Licensor grants You such rights in consideration of benefits the
 67 | Licensor receives from making the Licensed Material available under
 68 | these terms and conditions.
 69 | 
 70 | 
 71 | Section 1 -- Definitions.
 72 | 
 73 |   a. Adapted Material means material subject to Copyright and Similar
 74 |      Rights that is derived from or based upon the Licensed Material
 75 |      and in which the Licensed Material is translated, altered,
 76 |      arranged, transformed, or otherwise modified in a manner requiring
 77 |      permission under the Copyright and Similar Rights held by the
 78 |      Licensor. For purposes of this Public License, where the Licensed
 79 |      Material is a musical work, performance, or sound recording,
 80 |      Adapted Material is always produced where the Licensed Material is
 81 |      synched in timed relation with a moving image.
 82 | 
 83 |   b. Adapter's License means the license You apply to Your Copyright
 84 |      and Similar Rights in Your contributions to Adapted Material in
 85 |      accordance with the terms and conditions of this Public License.
 86 | 
 87 |   c. BY-NC-SA Compatible License means a license listed at
 88 |      creativecommons.org/compatiblelicenses, approved by Creative
 89 |      Commons as essentially the equivalent of this Public License.
 90 | 
 91 |   d. Copyright and Similar Rights means copyright and/or similar rights
 92 |      closely related to copyright including, without limitation,
 93 |      performance, broadcast, sound recording, and Sui Generis Database
 94 |      Rights, without regard to how the rights are labeled or
 95 |      categorized. For purposes of this Public License, the rights
 96 |      specified in Section 2(b)(1)-(2) are not Copyright and Similar
 97 |      Rights.
 98 | 
 99 |   e. Effective Technological Measures means those measures that, in the
100 |      absence of proper authority, may not be circumvented under laws
101 |      fulfilling obligations under Article 11 of the WIPO Copyright
102 |      Treaty adopted on December 20, 1996, and/or similar international
103 |      agreements.
104 | 
105 |   f. Exceptions and Limitations means fair use, fair dealing, and/or
106 |      any other exception or limitation to Copyright and Similar Rights
107 |      that applies to Your use of the Licensed Material.
108 | 
109 |   g. License Elements means the license attributes listed in the name
110 |      of a Creative Commons Public License. The License Elements of this
111 |      Public License are Attribution, NonCommercial, and ShareAlike.
112 | 
113 |   h. Licensed Material means the artistic or literary work, database,
114 |      or other material to which the Licensor applied this Public
115 |      License.
116 | 
117 |   i. Licensed Rights means the rights granted to You subject to the
118 |      terms and conditions of this Public License, which are limited to
119 |      all Copyright and Similar Rights that apply to Your use of the
120 |      Licensed Material and that the Licensor has authority to license.
121 | 
122 |   j. Licensor means the individual(s) or entity(ies) granting rights
123 |      under this Public License.
124 | 
125 |   k. NonCommercial means not primarily intended for or directed towards
126 |      commercial advantage or monetary compensation. For purposes of
127 |      this Public License, the exchange of the Licensed Material for
128 |      other material subject to Copyright and Similar Rights by digital
129 |      file-sharing or similar means is NonCommercial provided there is
130 |      no payment of monetary compensation in connection with the
131 |      exchange.
132 | 
133 |   l. Share means to provide material to the public by any means or
134 |      process that requires permission under the Licensed Rights, such
135 |      as reproduction, public display, public performance, distribution,
136 |      dissemination, communication, or importation, and to make material
137 |      available to the public including in ways that members of the
138 |      public may access the material from a place and at a time
139 |      individually chosen by them.
140 | 
141 |   m. Sui Generis Database Rights means rights other than copyright
142 |      resulting from Directive 96/9/EC of the European Parliament and of
143 |      the Council of 11 March 1996 on the legal protection of databases,
144 |      as amended and/or succeeded, as well as other essentially
145 |      equivalent rights anywhere in the world.
146 | 
147 |   n. You means the individual or entity exercising the Licensed Rights
148 |      under this Public License. Your has a corresponding meaning.
149 | 
150 | 
151 | Section 2 -- Scope.
152 | 
153 |   a. License grant.
154 | 
155 |        1. Subject to the terms and conditions of this Public License,
156 |           the Licensor hereby grants You a worldwide, royalty-free,
157 |           non-sublicensable, non-exclusive, irrevocable license to
158 |           exercise the Licensed Rights in the Licensed Material to:
159 | 
160 |             a. reproduce and Share the Licensed Material, in whole or
161 |                in part, for NonCommercial purposes only; and
162 | 
163 |             b. produce, reproduce, and Share Adapted Material for
164 |                NonCommercial purposes only.
165 | 
166 |        2. Exceptions and Limitations. For the avoidance of doubt, where
167 |           Exceptions and Limitations apply to Your use, this Public
168 |           License does not apply, and You do not need to comply with
169 |           its terms and conditions.
170 | 
171 |        3. Term. The term of this Public License is specified in Section
172 |           6(a).
173 | 
174 |        4. Media and formats; technical modifications allowed. The
175 |           Licensor authorizes You to exercise the Licensed Rights in
176 |           all media and formats whether now known or hereafter created,
177 |           and to make technical modifications necessary to do so. The
178 |           Licensor waives and/or agrees not to assert any right or
179 |           authority to forbid You from making technical modifications
180 |           necessary to exercise the Licensed Rights, including
181 |           technical modifications necessary to circumvent Effective
182 |           Technological Measures. For purposes of this Public License,
183 |           simply making modifications authorized by this Section 2(a)
184 |           (4) never produces Adapted Material.
185 | 
186 |        5. Downstream recipients.
187 | 
188 |             a. Offer from the Licensor -- Licensed Material. Every
189 |                recipient of the Licensed Material automatically
190 |                receives an offer from the Licensor to exercise the
191 |                Licensed Rights under the terms and conditions of this
192 |                Public License.
193 | 
194 |             b. Additional offer from the Licensor -- Adapted Material.
195 |                Every recipient of Adapted Material from You
196 |                automatically receives an offer from the Licensor to
197 |                exercise the Licensed Rights in the Adapted Material
198 |                under the conditions of the Adapter's License You apply.
199 | 
200 |             c. No downstream restrictions. You may not offer or impose
201 |                any additional or different terms or conditions on, or
202 |                apply any Effective Technological Measures to, the
203 |                Licensed Material if doing so restricts exercise of the
204 |                Licensed Rights by any recipient of the Licensed
205 |                Material.
206 | 
207 |        6. No endorsement. Nothing in this Public License constitutes or
208 |           may be construed as permission to assert or imply that You
209 |           are, or that Your use of the Licensed Material is, connected
210 |           with, or sponsored, endorsed, or granted official status by,
211 |           the Licensor or others designated to receive attribution as
212 |           provided in Section 3(a)(1)(A)(i).
213 | 
214 |   b. Other rights.
215 | 
216 |        1. Moral rights, such as the right of integrity, are not
217 |           licensed under this Public License, nor are publicity,
218 |           privacy, and/or other similar personality rights; however, to
219 |           the extent possible, the Licensor waives and/or agrees not to
220 |           assert any such rights held by the Licensor to the limited
221 |           extent necessary to allow You to exercise the Licensed
222 |           Rights, but not otherwise.
223 | 
224 |        2. Patent and trademark rights are not licensed under this
225 |           Public License.
226 | 
227 |        3. To the extent possible, the Licensor waives any right to
228 |           collect royalties from You for the exercise of the Licensed
229 |           Rights, whether directly or through a collecting society
230 |           under any voluntary or waivable statutory or compulsory
231 |           licensing scheme. In all other cases the Licensor expressly
232 |           reserves any right to collect such royalties, including when
233 |           the Licensed Material is used other than for NonCommercial
234 |           purposes.
235 | 
236 | 
237 | Section 3 -- License Conditions.
238 | 
239 | Your exercise of the Licensed Rights is expressly made subject to the
240 | following conditions.
241 | 
242 |   a. Attribution.
243 | 
244 |        1. If You Share the Licensed Material (including in modified
245 |           form), You must:
246 | 
247 |             a. retain the following if it is supplied by the Licensor
248 |                with the Licensed Material:
249 | 
250 |                  i. identification of the creator(s) of the Licensed
251 |                     Material and any others designated to receive
252 |                     attribution, in any reasonable manner requested by
253 |                     the Licensor (including by pseudonym if
254 |                     designated);
255 | 
256 |                 ii. a copyright notice;
257 | 
258 |                iii. a notice that refers to this Public License;
259 | 
260 |                 iv. a notice that refers to the disclaimer of
261 |                     warranties;
262 | 
263 |                  v. a URI or hyperlink to the Licensed Material to the
264 |                     extent reasonably practicable;
265 | 
266 |             b. indicate if You modified the Licensed Material and
267 |                retain an indication of any previous modifications; and
268 | 
269 |             c. indicate the Licensed Material is licensed under this
270 |                Public License, and include the text of, or the URI or
271 |                hyperlink to, this Public License.
272 | 
273 |        2. You may satisfy the conditions in Section 3(a)(1) in any
274 |           reasonable manner based on the medium, means, and context in
275 |           which You Share the Licensed Material. For example, it may be
276 |           reasonable to satisfy the conditions by providing a URI or
277 |           hyperlink to a resource that includes the required
278 |           information.
279 |        3. If requested by the Licensor, You must remove any of the
280 |           information required by Section 3(a)(1)(A) to the extent
281 |           reasonably practicable.
282 | 
283 |   b. ShareAlike.
284 | 
285 |      In addition to the conditions in Section 3(a), if You Share
286 |      Adapted Material You produce, the following conditions also apply.
287 | 
288 |        1. The Adapter's License You apply must be a Creative Commons
289 |           license with the same License Elements, this version or
290 |           later, or a BY-NC-SA Compatible License.
291 | 
292 |        2. You must include the text of, or the URI or hyperlink to, the
293 |           Adapter's License You apply. You may satisfy this condition
294 |           in any reasonable manner based on the medium, means, and
295 |           context in which You Share Adapted Material.
296 | 
297 |        3. You may not offer or impose any additional or different terms
298 |           or conditions on, or apply any Effective Technological
299 |           Measures to, Adapted Material that restrict exercise of the
300 |           rights granted under the Adapter's License You apply.
301 | 
302 | 
303 | Section 4 -- Sui Generis Database Rights.
304 | 
305 | Where the Licensed Rights include Sui Generis Database Rights that
306 | apply to Your use of the Licensed Material:
307 | 
308 |   a. for the avoidance of doubt, Section 2(a)(1) grants You the right
309 |      to extract, reuse, reproduce, and Share all or a substantial
310 |      portion of the contents of the database for NonCommercial purposes
311 |      only;
312 | 
313 |   b. if You include all or a substantial portion of the database
314 |      contents in a database in which You have Sui Generis Database
315 |      Rights, then the database in which You have Sui Generis Database
316 |      Rights (but not its individual contents) is Adapted Material,
317 |      including for purposes of Section 3(b); and
318 | 
319 |   c. You must comply with the conditions in Section 3(a) if You Share
320 |      all or a substantial portion of the contents of the database.
321 | 
322 | For the avoidance of doubt, this Section 4 supplements and does not
323 | replace Your obligations under this Public License where the Licensed
324 | Rights include other Copyright and Similar Rights.
325 | 
326 | 
327 | Section 5 -- Disclaimer of Warranties and Limitation of Liability.
328 | 
329 |   a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
330 |      EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
331 |      AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
332 |      ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
333 |      IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
334 |      WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
335 |      PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
336 |      ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
337 |      KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
338 |      ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
339 | 
340 |   b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
341 |      TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
342 |      NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
343 |      INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
344 |      COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
345 |      USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
346 |      ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
347 |      DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
348 |      IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
349 | 
350 |   c. The disclaimer of warranties and limitation of liability provided
351 |      above shall be interpreted in a manner that, to the extent
352 |      possible, most closely approximates an absolute disclaimer and
353 |      waiver of all liability.
354 | 
355 | 
356 | Section 6 -- Term and Termination.
357 | 
358 |   a. This Public License applies for the term of the Copyright and
359 |      Similar Rights licensed here. However, if You fail to comply with
360 |      this Public License, then Your rights under this Public License
361 |      terminate automatically.
362 | 
363 |   b. Where Your right to use the Licensed Material has terminated under
364 |      Section 6(a), it reinstates:
365 | 
366 |        1. automatically as of the date the violation is cured, provided
367 |           it is cured within 30 days of Your discovery of the
368 |           violation; or
369 | 
370 |        2. upon express reinstatement by the Licensor.
371 | 
372 |      For the avoidance of doubt, this Section 6(b) does not affect any
373 |      right the Licensor may have to seek remedies for Your violations
374 |      of this Public License.
375 | 
376 |   c. For the avoidance of doubt, the Licensor may also offer the
377 |      Licensed Material under separate terms or conditions or stop
378 |      distributing the Licensed Material at any time; however, doing so
379 |      will not terminate this Public License.
380 | 
381 |   d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
382 |      License.
383 | 
384 | 
385 | Section 7 -- Other Terms and Conditions.
386 | 
387 |   a. The Licensor shall not be bound by any additional or different
388 |      terms or conditions communicated by You unless expressly agreed.
389 | 
390 |   b. Any arrangements, understandings, or agreements regarding the
391 |      Licensed Material not stated herein are separate from and
392 |      independent of the terms and conditions of this Public License.
393 | 
394 | 
395 | Section 8 -- Interpretation.
396 | 
397 |   a. For the avoidance of doubt, this Public License does not, and
398 |      shall not be interpreted to, reduce, limit, restrict, or impose
399 |      conditions on any use of the Licensed Material that could lawfully
400 |      be made without permission under this Public License.
401 | 
402 |   b. To the extent possible, if any provision of this Public License is
403 |      deemed unenforceable, it shall be automatically reformed to the
404 |      minimum extent necessary to make it enforceable. If the provision
405 |      cannot be reformed, it shall be severed from this Public License
406 |      without affecting the enforceability of the remaining terms and
407 |      conditions.
408 | 
409 |   c. No term or condition of this Public License will be waived and no
410 |      failure to comply consented to unless expressly agreed to by the
411 |      Licensor.
412 | 
413 |   d. Nothing in this Public License constitutes or may be interpreted
414 |      as a limitation upon, or waiver of, any privileges and immunities
415 |      that apply to the Licensor or You, including from the legal
416 |      processes of any jurisdiction or authority.
417 | 
418 | =======================================================================
419 | 
420 | Creative Commons is not a party to its public
421 | licenses. Notwithstanding, Creative Commons may elect to apply one of
422 | its public licenses to material it publishes and in those instances
423 | will be considered the “Licensor.” The text of the Creative Commons
424 | public licenses is dedicated to the public domain under the CC0 Public
425 | Domain Dedication. Except for the limited purpose of indicating that
426 | material is shared under a Creative Commons public license or as
427 | otherwise permitted by the Creative Commons policies published at
428 | creativecommons.org/policies, Creative Commons does not authorize the
429 | use of the trademark "Creative Commons" or any other trademark or logo
430 | of Creative Commons without its prior written consent including,
431 | without limitation, in connection with any unauthorized modifications
432 | to any of its public licenses or any other arrangements,
433 | understandings, or agreements concerning use of licensed material. For
434 | the avoidance of doubt, this paragraph does not form part of the
435 | public licenses.
436 | 
437 | Creative Commons may be contacted at creativecommons.org.


--------------------------------------------------------------------------------
/model/ImageBind/README.md:
--------------------------------------------------------------------------------
  1 | # ImageBind: One Embedding Space To Bind Them All
  2 | 
  3 | **[FAIR, Meta AI](https://ai.facebook.com/research/)** 
  4 | 
  5 | Rohit Girdhar*,
  6 | Alaaeldin El-Nouby*,
  7 | Zhuang Liu,
  8 | Mannat Singh,
  9 | Kalyan Vasudev Alwala,
 10 | Armand Joulin,
 11 | Ishan Misra*
 12 | 
 13 | To appear at CVPR 2023 (*Highlighted paper*)
 14 | 
 15 | [[`Paper`](https://facebookresearch.github.io/ImageBind/paper)] [[`Blog`](https://ai.facebook.com/blog/imagebind-six-modalities-binding-ai/)] [[`Demo`](https://imagebind.metademolab.com/)] [[`Supplementary Video`](https://dl.fbaipublicfiles.com/imagebind/imagebind_video.mp4)] [[`BibTex`](#citing-imagebind)]
 16 | 
 17 | PyTorch implementation and pretrained models for ImageBind. For details, see the paper: **[ImageBind: One Embedding Space To Bind Them All](https://facebookresearch.github.io/ImageBind/paper)**.
 18 | 
 19 | ImageBind learns a joint embedding across six different modalities - images, text, audio, depth, thermal, and IMU data. It enables novel emergent applications ‘out-of-the-box’ including cross-modal retrieval, composing modalities with arithmetic, cross-modal detection and generation.
 20 | 
 21 | 
 22 | 
 23 | ![ImageBind](https://user-images.githubusercontent.com/8495451/236859695-ffa13364-3e39-4d99-a8da-fbfab17f9a6b.gif)
 24 | 
 25 | ## ImageBind model
 26 | 
 27 | Emergent zero-shot classification performance.
 28 | 
 29 | <table style="margin: auto">
 30 |   <tr>
 31 |     <th>Model</th>
 32 |     <th><span style="color:blue">IN1k</span></th>
 33 |     <th><span style="color:purple">K400</span></th>
 34 |     <th><span style="color:green">NYU-D</span></th>
 35 |     <th><span style="color:LightBlue">ESC</span></th>
 36 |     <th><span style="color:orange">LLVIP</span></th>
 37 |     <th><span style="color:purple">Ego4D</span></th>
 38 |     <th>download</th>
 39 |   </tr>
 40 |   <tr>
 41 |     <td>imagebind_huge</td>
 42 |     <td align="right">77.7</td>
 43 |     <td align="right">50.0</td>
 44 |     <td align="right">54.0</td>
 45 |     <td align="right">66.9</td>
 46 |     <td align="right">63.4</td>
 47 |     <td align="right">25.0</td>
 48 |     <td><a href="https://dl.fbaipublicfiles.com/imagebind/imagebind_huge.pth">checkpoint</a></td>
 49 |   </tr>
 50 |   
 51 | </table>
 52 | 
 53 | ## Usage
 54 | 
 55 | Install PyTorch 1.13+ and the other third-party dependencies.
 56 | 
 57 | ```shell
 58 | conda create --name imagebind python=3.8 -y
 59 | conda activate imagebind
 60 | 
 61 | pip install -r requirements.txt
 62 | ```
 63 | 
 64 | For Windows users, you might need to install `soundfile` for reading/writing audio files. (Thanks @congyue1977)
 65 | 
 66 | ```
 67 | pip install soundfile
 68 | ```
 69 | 
 70 | 
 71 | Extract and compare features across modalities (e.g. Image, Text and Audio).
 72 | 
 73 | ```python
 74 | import data
 75 | import torch
 76 | from models import imagebind_model
 77 | from models.imagebind_model import ModalityType
 78 | 
 79 | text_list=["A dog.", "A car", "A bird"]
 80 | image_paths=[".assets/dog_image.jpg", ".assets/car_image.jpg", ".assets/bird_image.jpg"]
 81 | audio_paths=[".assets/dog_audio.wav", ".assets/car_audio.wav", ".assets/bird_audio.wav"]
 82 | 
 83 | device = "cuda:0" if torch.cuda.is_available() else "cpu"
 84 | 
 85 | # Instantiate model
 86 | model = imagebind_model.imagebind_huge(pretrained=True)
 87 | model.eval()
 88 | model.to(device)
 89 | 
 90 | # Load data
 91 | inputs = {
 92 |     ModalityType.TEXT: data.load_and_transform_text(text_list, device),
 93 |     ModalityType.VISION: data.load_and_transform_vision_data(image_paths, device),
 94 |     ModalityType.AUDIO: data.load_and_transform_audio_data(audio_paths, device),
 95 | }
 96 | 
 97 | with torch.no_grad():
 98 |     embeddings = model(inputs)
 99 | 
100 | print(
101 |     "Vision x Text: ",
102 |     torch.softmax(embeddings[ModalityType.VISION] @ embeddings[ModalityType.TEXT].T, dim=-1),
103 | )
104 | print(
105 |     "Audio x Text: ",
106 |     torch.softmax(embeddings[ModalityType.AUDIO] @ embeddings[ModalityType.TEXT].T, dim=-1),
107 | )
108 | print(
109 |     "Vision x Audio: ",
110 |     torch.softmax(embeddings[ModalityType.VISION] @ embeddings[ModalityType.AUDIO].T, dim=-1),
111 | )
112 | 
113 | # Expected output:
114 | #
115 | # Vision x Text:
116 | # tensor([[9.9761e-01, 2.3694e-03, 1.8612e-05],
117 | #         [3.3836e-05, 9.9994e-01, 2.4118e-05],
118 | #         [4.7997e-05, 1.3496e-02, 9.8646e-01]])
119 | #
120 | # Audio x Text:
121 | # tensor([[1., 0., 0.],
122 | #         [0., 1., 0.],
123 | #         [0., 0., 1.]])
124 | #
125 | # Vision x Audio:
126 | # tensor([[0.8070, 0.1088, 0.0842],
127 | #         [0.1036, 0.7884, 0.1079],
128 | #         [0.0018, 0.0022, 0.9960]])
129 | 
130 | ```
131 | 
132 | ## Model card
133 | Please see the [model card](model_card.md) for details.
134 | 
135 | ## License
136 | 
137 | ImageBind code and model weights are released under the CC-BY-NC 4.0 license. See [LICENSE](LICENSE) for additional details.
138 | 
139 | ## Contributing
140 | 
141 | See [contributing](CONTRIBUTING.md) and the [code of conduct](CODE_OF_CONDUCT.md).
142 | 
143 | ## Citing ImageBind
144 | 
145 | If you find this repository useful, please consider giving it a star :star: and citing it:
146 | 
147 | ```
148 | @inproceedings{girdhar2023imagebind,
149 |   title={ImageBind: One Embedding Space To Bind Them All},
150 |   author={Girdhar, Rohit and El-Nouby, Alaaeldin and Liu, Zhuang
151 | and Singh, Mannat and Alwala, Kalyan Vasudev and Joulin, Armand and Misra, Ishan},
152 |   booktitle={CVPR},
153 |   year={2023}
154 | }
155 | ```
156 | 


--------------------------------------------------------------------------------
/model/ImageBind/__init__.py:
--------------------------------------------------------------------------------
1 | from .models import imagebind_model
2 | from .models.imagebind_model import ModalityType
3 | 


--------------------------------------------------------------------------------
/model/ImageBind/__pycache__/__init__.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BriansIDP/AudioVisualLLM/52ab7d14b499e7c951df26c89ce2d3673976be9e/model/ImageBind/__pycache__/__init__.cpython-39.pyc


--------------------------------------------------------------------------------
/model/ImageBind/__pycache__/data.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BriansIDP/AudioVisualLLM/52ab7d14b499e7c951df26c89ce2d3673976be9e/model/ImageBind/__pycache__/data.cpython-39.pyc


--------------------------------------------------------------------------------
/model/ImageBind/bpe/bpe_simple_vocab_16e6.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BriansIDP/AudioVisualLLM/52ab7d14b499e7c951df26c89ce2d3673976be9e/model/ImageBind/bpe/bpe_simple_vocab_16e6.txt.gz


--------------------------------------------------------------------------------
/model/ImageBind/data.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # Portions Copyright (c) Meta Platforms, Inc. and affiliates.
  3 | # All rights reserved.
  4 | 
  5 | # This source code is licensed under the license found in the
  6 | # LICENSE file in the root directory of this source tree.
  7 | 
  8 | import math
  9 | 
 10 | import torch
 11 | import torch.nn as nn
 12 | import torchaudio
 13 | import logging
 14 | 
 15 | from .models.multimodal_preprocessors import SimpleTokenizer
 16 | from PIL import Image
 17 | from pytorchvideo import transforms as pv_transforms
 18 | from pytorchvideo.data.clip_sampling import ConstantClipsPerVideoSampler, UniformClipSampler
 19 | from pytorchvideo.data.encoded_video import EncodedVideo
 20 | 
 21 | from torchvision import transforms
 22 | from torchvision.transforms._transforms_video import NormalizeVideo
 23 | from torchvision.transforms.functional import InterpolationMode
 24 | 
# Frame shift handed to the kaldi fbank call in waveform2melspec.
DEFAULT_AUDIO_FRAME_SHIFT_MS = 10  # in milliseconds

# Vocabulary file passed to SimpleTokenizer in load_and_transform_text.
# NOTE(review): this is a relative path, so it presumably resolves against the
# current working directory rather than this module's directory -- confirm
# against how SimpleTokenizer opens it.
BPE_PATH = "bpe/bpe_simple_vocab_16e6.txt.gz"
 28 | 
 29 | 
 30 | def waveform2melspec(waveform, sample_rate, num_mel_bins, target_length):
 31 |     # Based on https://github.com/YuanGongND/ast/blob/d7d8b4b8e06cdaeb6c843cdb38794c1c7692234c/src/dataloader.py#L102
 32 |     waveform -= waveform.mean()
 33 |     fbank = torchaudio.compliance.kaldi.fbank(
 34 |         waveform,
 35 |         htk_compat=True,
 36 |         sample_frequency=sample_rate,
 37 |         use_energy=False,
 38 |         window_type="hanning",
 39 |         num_mel_bins=num_mel_bins,
 40 |         dither=0.0,
 41 |         frame_length=25,
 42 |         frame_shift=DEFAULT_AUDIO_FRAME_SHIFT_MS,
 43 |     )
 44 |     # Convert to [mel_bins, num_frames] shape
 45 |     fbank = fbank.transpose(0, 1)
 46 |     # Pad to target_length
 47 |     n_frames = fbank.size(1)
 48 |     p = target_length - n_frames
 49 |     # if p is too large (say >20%), flash a warning
 50 |     if abs(p) / n_frames > 0.2:
 51 |         logging.warning(
 52 |             "Large gap between audio n_frames(%d) and "
 53 |             "target_length (%d). Is the audio_target_length "
 54 |             "setting correct?",
 55 |             n_frames,
 56 |             target_length,
 57 |         )
 58 |     # cut and pad
 59 |     if p > 0:
 60 |         fbank = torch.nn.functional.pad(fbank, (0, p), mode="constant", value=0)
 61 |     elif p < 0:
 62 |         fbank = fbank[:, 0:target_length]
 63 |     # Convert to [1, mel_bins, num_frames] shape, essentially like a 1
 64 |     # channel image
 65 |     fbank = fbank.unsqueeze(0)
 66 |     return fbank
 67 | 
 68 | 
 69 | def get_clip_timepoints(clip_sampler, duration):
 70 |     # Read out all clips in this video
 71 |     all_clips_timepoints = []
 72 |     is_last_clip = False
 73 |     end = 0.0
 74 |     while not is_last_clip:
 75 |         start, end, _, _, is_last_clip = clip_sampler(end, duration, annotation=None)
 76 |         all_clips_timepoints.append((start, end))
 77 |     return all_clips_timepoints
 78 | 
 79 | 
 80 | def load_and_transform_vision_data(image_paths, device):
 81 |     if image_paths is None:
 82 |         return None
 83 | 
 84 |     image_ouputs = []
 85 |     for image_path in image_paths:
 86 |         data_transform = transforms.Compose(
 87 |             [
 88 |                 transforms.Resize(
 89 |                     224, interpolation=transforms.InterpolationMode.BICUBIC
 90 |                 ),
 91 |                 transforms.CenterCrop(224),
 92 |                 transforms.ToTensor(),
 93 |                 transforms.Normalize(
 94 |                     mean=(0.48145466, 0.4578275, 0.40821073),
 95 |                     std=(0.26862954, 0.26130258, 0.27577711),
 96 |                 ),
 97 |             ]
 98 |         )
 99 |         with open(image_path, "rb") as fopen:
100 |             image = Image.open(fopen).convert("RGB")
101 | 
102 |         image = data_transform(image).to(device)
103 |         image_ouputs.append(image)
104 |     return torch.stack(image_ouputs, dim=0)
105 | 
106 | 
def load_and_transform_vision_data_blip(image_paths, device, training=False):
    """Load images as a batch, with a random crop when ``training`` is set.

    With ``training`` truthy each image goes through a bicubic
    ``RandomResizedCrop`` to 224 (scale 0.5-1.0); otherwise it is resized
    straight to 224x224 (bicubic, no cropping). Both paths then convert to a
    tensor, normalize with fixed per-channel mean/std constants and move the
    result to ``device``.

    Args:
        image_paths: iterable of image file paths, or ``None``.
        device: target device for every image tensor.
        training: selects the random-crop pipeline when truthy.

    Returns:
        ``None`` when ``image_paths`` is ``None``; otherwise the images
        stacked along a new leading dimension.
    """
    if image_paths is None:
        return None

    # Only the first (spatial) step differs between the two modes.
    if training:
        spatial_step = transforms.RandomResizedCrop(
            224,
            scale=(0.5, 1.0),
            interpolation=InterpolationMode.BICUBIC,
        )
    else:
        spatial_step = transforms.Resize(
            (224, 224), interpolation=InterpolationMode.BICUBIC
        )

    preprocess = transforms.Compose(
        [
            spatial_step,
            transforms.ToTensor(),
            transforms.Normalize(
                mean=(0.48145466, 0.4578275, 0.40821073),
                std=(0.26862954, 0.26130258, 0.27577711),
            ),
        ]
    )

    batch = []
    for path in image_paths:
        with open(path, "rb") as handle:
            rgb = Image.open(handle).convert("RGB")
        batch.append(preprocess(rgb).to(device))
    return torch.stack(batch, dim=0)
148 | 
149 | 
def load_and_transform_thermal_data(thermal_paths, device):
    """Load thermal images from disk and return them as one batch.

    Each file is opened, converted to single-channel mode ``"L"``, resized
    to 224 (bicubic), center-cropped to 224, turned into a tensor and moved
    to ``device``. No normalization is applied.

    Args:
        thermal_paths: iterable of image file paths, or ``None``.
        device: target device for every image tensor.

    Returns:
        ``None`` when ``thermal_paths`` is ``None``; otherwise the images
        stacked along a new leading dimension.
    """
    if thermal_paths is None:
        return None

    # Identical for every image, so it is assembled a single time.
    preprocess = transforms.Compose(
        [
            transforms.Resize(
                224, interpolation=transforms.InterpolationMode.BICUBIC
            ),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
        ]
    )

    def _load_one(path):
        with open(path, "rb") as handle:
            gray = Image.open(handle).convert("L")
        return preprocess(gray).to(device)

    return torch.stack([_load_one(path) for path in thermal_paths], dim=0)
170 | 
171 | 
def load_and_transform_text(text, device):
    """Tokenize each string in ``text`` and concatenate the results.

    A ``SimpleTokenizer`` is built from ``BPE_PATH`` on every call. Each
    tokenized entry gets a leading dimension, is moved to ``device``, and
    all entries are concatenated along that dimension.

    Returns:
        ``None`` when ``text`` is ``None``; otherwise the concatenated
        token tensor.
    """
    if text is None:
        return None

    tokenizer = SimpleTokenizer(bpe_path=BPE_PATH)
    token_rows = []
    for sentence in text:
        row = tokenizer(sentence).unsqueeze(0)
        token_rows.append(row.to(device))
    return torch.cat(token_rows, dim=0)
179 | 
def load_and_transform_audio_data_fulllen(
    audio_paths,
    device,
    num_mel_bins=128,
    target_length=204,
    sample_rate=16000,
    clip_duration=2,
    mean=-4.268,
    std=9.138,
    maxlen=30,
):
    """Load audio files and cut each one into back-to-back mel-spectrogram clips.

    Every file is loaded, resampled to ``sample_rate`` when needed,
    zero-padded when shorter than ``maxlen`` seconds, capped at ``maxlen``
    seconds, and split into consecutive, non-overlapping windows of
    ``clip_duration`` seconds. A short final window is zero-padded. Each
    window is converted with ``waveform2melspec``, normalized with
    ``mean``/``std`` and moved to ``device``.

    Args:
        audio_paths: iterable of audio file paths, or ``None``.
        device: target device for the spectrogram tensors.
        num_mel_bins: forwarded to ``waveform2melspec``.
        target_length: forwarded to ``waveform2melspec``.
        sample_rate: sampling rate every waveform is brought to.
        clip_duration: window length in seconds.
        mean: normalization mean.
        std: normalization standard deviation.
        maxlen: maximum audio length in seconds.

    Returns:
        ``None`` when ``audio_paths`` is ``None``; otherwise the per-file
        clip stacks, stacked again along a new leading dimension.
    """
    if audio_paths is None:
        return None

    max_samples = maxlen * sample_rate
    samples_per_clip = clip_duration * sample_rate
    normalize = transforms.Normalize(mean=mean, std=std)

    batch = []
    for audio_path in audio_paths:
        waveform, native_rate = torchaudio.load(audio_path)
        if native_rate != sample_rate:
            waveform = torchaudio.functional.resample(
                waveform, orig_freq=native_rate, new_freq=sample_rate
            )

        n_samples = waveform.size(1)
        if n_samples < max_samples:
            # NOTE(review): pads to one sample short of max_samples; the
            # final window below is then topped up by one zero, so the
            # resulting clips come out the same length either way.
            tail = waveform.new_zeros(
                waveform.size(0), max_samples - n_samples - 1
            )
            waveform = torch.cat([waveform, tail], dim=-1)
        usable = min(waveform.size(1), max_samples)

        melspecs = []
        clip_start = 0
        while clip_start < usable:
            clip_end = min(clip_start + samples_per_clip, usable)
            lo, hi = int(clip_start), int(clip_end)
            clip = waveform[:, lo:hi]
            if hi - lo < samples_per_clip:
                # Last window ran past the audio: fill the rest with zeros.
                shortfall = samples_per_clip - hi + lo
                clip = torch.cat(
                    [clip, clip.new_zeros(clip.size(0), shortfall)], dim=-1
                )
            melspecs.append(
                waveform2melspec(clip, sample_rate, num_mel_bins, target_length)
            )
            clip_start = clip_start + samples_per_clip

        normalized = [normalize(spec).to(device) for spec in melspecs]
        batch.append(torch.stack(normalized, dim=0))

    return torch.stack(batch, dim=0)
240 | 
241 | 
def load_and_transform_audio_data(
    audio_paths,
    device,
    num_mel_bins=128,
    target_length=204,
    sample_rate=16000,
    clip_duration=2,
    clips_per_video=3,
    mean=-4.268,
    std=9.138,
):
    """Load audio files and sample a fixed number of mel-spectrogram clips each.

    Every file is loaded, resampled to ``sample_rate`` when needed, and cut
    at the time points produced by a ``ConstantClipsPerVideoSampler``
    configured with ``clip_duration`` and ``clips_per_video``. Each clip is
    converted with ``waveform2melspec``, normalized with ``mean``/``std``
    and moved to ``device``.

    Args:
        audio_paths: iterable of audio file paths, or ``None``.
        device: target device for the spectrogram tensors.
        num_mel_bins: forwarded to ``waveform2melspec``.
        target_length: forwarded to ``waveform2melspec``.
        sample_rate: sampling rate every waveform is brought to.
        clip_duration: clip length handed to the sampler.
        clips_per_video: clip count handed to the sampler.
        mean: normalization mean.
        std: normalization standard deviation.

    Returns:
        ``None`` when ``audio_paths`` is ``None``; otherwise the per-file
        clip stacks, stacked again along a new leading dimension.
    """
    if audio_paths is None:
        return None

    sampler = ConstantClipsPerVideoSampler(
        clip_duration=clip_duration, clips_per_video=clips_per_video
    )
    normalize = transforms.Normalize(mean=mean, std=std)

    batch = []
    for audio_path in audio_paths:
        waveform, native_rate = torchaudio.load(audio_path)
        if native_rate != sample_rate:
            waveform = torchaudio.functional.resample(
                waveform, orig_freq=native_rate, new_freq=sample_rate
            )

        duration_s = waveform.size(1) / sample_rate
        melspecs = []
        for clip_start_s, clip_end_s in get_clip_timepoints(sampler, duration_s):
            # Seconds -> sample indices.
            first = int(clip_start_s * sample_rate)
            last = int(clip_end_s * sample_rate)
            melspecs.append(
                waveform2melspec(
                    waveform[:, first:last],
                    sample_rate,
                    num_mel_bins,
                    target_length,
                )
            )

        normalized = [normalize(spec).to(device) for spec in melspecs]
        batch.append(torch.stack(normalized, dim=0))

    return torch.stack(batch, dim=0)
290 | 
291 | 
def get_clip_timepoints(clip_sampler, duration):
    """Return ``(start, end)`` for each clip produced by ``clip_sampler``.

    NOTE(review): a function with this name and the same body is defined
    earlier in this module; this later definition is the one that stays
    bound at import time.

    The sampler is invoked as ``clip_sampler(previous_end, duration,
    annotation=None)`` until the last item of its 5-item result is truthy.
    """
    collected = []
    previous_end = 0.0
    reached_last = False
    while not reached_last:
        sampled = clip_sampler(previous_end, duration, annotation=None)
        clip_start, previous_end, _, _, reached_last = sampled
        collected.append((clip_start, previous_end))
    return collected
301 | 
302 | 
def crop_boxes(boxes, x_offset, y_offset):
    """
    Shift bounding boxes into the coordinate frame of a crop.
    Args:
        boxes (ndarray): bounding boxes to shift. The dimension is
            `num boxes` x 4. The input array is left untouched.
        x_offset (int): cropping offset in the x axis, subtracted from
            columns 0 and 2.
        y_offset (int): cropping offset in the y axis, subtracted from
            columns 1 and 3.
    Returns:
        cropped_boxes (ndarray): a copy of `boxes` with the offsets
            subtracted, dimension `num boxes` x 4.
    """
    shifted = boxes.copy()
    # Columns alternate x, y, x, y.
    for column, offset in enumerate((x_offset, y_offset, x_offset, y_offset)):
        shifted[:, column] = boxes[:, column] - offset
    return shifted
320 | 
321 | 
def uniform_crop(images, size, spatial_idx, boxes=None, scale_size=None):
    """
    Take one of three evenly spaced square crops from the images (and
    shift the matching boxes accordingly).
    Args:
        images (tensor): images to crop. The dimension is
            `num frames` x `channel` x `height` x `width`; a 3-dimensional
            input is treated as a single frame and returned 3-dimensional.
        size (int): height and width of the crop.
        spatial_idx (int): 0, 1, or 2 for left, center, and right crop if
            width is larger than or equal to height. Or 0, 1, or 2 for top,
            center, and bottom crop if height is larger than width.
        boxes (ndarray or None): optional. Corresponding boxes to images.
            Dimension is `num boxes` x 4.
        scale_size (int): optional. If not None, the images are first
            resized (bilinear) so that their shorter side equals scale_size.
    Returns:
        cropped (tensor): images with dimension of
            `num frames` x `channel` x `size` x `size`.
        cropped_boxes (ndarray or None): the shifted boxes with dimension of
            `num boxes` x 4, or None when no boxes were given.
    """
    assert spatial_idx in [0, 1, 2]

    single_frame = len(images.shape) == 3
    if single_frame:
        images = images.unsqueeze(0)
    height, width = images.shape[2], images.shape[3]

    if scale_size is not None:
        # Scale the shorter side to scale_size, keeping the aspect ratio.
        if width <= height:
            height = int(height / width * scale_size)
            width = scale_size
        else:
            width = int(width / height * scale_size)
            height = scale_size
        images = torch.nn.functional.interpolate(
            images,
            size=(height, width),
            mode="bilinear",
            align_corners=False,
        )

    # Start from the centered crop, then slide along the longer axis.
    y_offset = int(math.ceil((height - size) / 2))
    x_offset = int(math.ceil((width - size) / 2))
    if spatial_idx != 1:
        at_far_edge = spatial_idx == 2
        if height > width:
            y_offset = height - size if at_far_edge else 0
        else:
            x_offset = width - size if at_far_edge else 0

    cropped = images[:, :, y_offset : y_offset + size, x_offset : x_offset + size]
    if boxes is None:
        cropped_boxes = None
    else:
        cropped_boxes = crop_boxes(boxes, x_offset, y_offset)

    if single_frame:
        cropped = cropped.squeeze(0)
    return cropped, cropped_boxes
379 | 
380 | 
class SpatialCrop(nn.Module):
    """
    Expand every video into one or three spatial crops. Must be used after
        the temporal crops, and should be used with -2 in the spatial crop
        at the slowfast augmentation stage (so full frames are passed in
        here). The returned list holds `num_crops` entries per input video.
    """

    def __init__(self, crop_size: int = 224, num_crops: int = 3):
        """
        Args:
            crop_size: side length passed to `uniform_crop`.
            num_crops: 3 for the three `uniform_crop` positions (0, 1, 2)
                or 1 for position 1 only.
        Raises:
            NotImplementedError: for any other `num_crops`.
        """
        super().__init__()
        self.crop_size = crop_size
        if num_crops not in (1, 3):
            raise NotImplementedError("Nothing else supported yet")
        self.crops_to_ext = [0, 1, 2] if num_crops == 3 else [1]
        # Flipped crops are never requested by the supported configurations.
        self.flipped_crops_to_ext = []

    def forward(self, videos):
        """
        Args:
            videos: A list of C, T, H, W videos.
        Returns:
            videos: A list with `num_crops` entries per input video. Each
                video converted to C, T, H', W' by spatial cropping.
        """
        assert isinstance(videos, list), "Must be a list of videos after temporal crops"
        wrong_rank = [video for video in videos if video.ndim != 4]
        assert not wrong_rank, "Must be (C,T,H,W)"

        crops = []
        for video in videos:
            crops.extend(
                uniform_crop(video, self.crop_size, position)[0]
                for position in self.crops_to_ext
            )
            if self.flipped_crops_to_ext:
                mirrored = transforms.functional.hflip(video)
                crops.extend(
                    uniform_crop(mirrored, self.crop_size, position)[0]
                    for position in self.flipped_crops_to_ext
                )
        return crops
422 | 
423 | 
class ToUint8(object):
    """Callable transform that casts a tensor to ``torch.uint8``."""

    def __call__(self, tensor):
        target_dtype = torch.uint8
        return tensor.to(target_dtype)

    def __repr__(self):
        return type(self).__name__
433 | 
434 | 
class ToTHWC(object):
    """
    Callable transform that moves the first dimension of a 4-D clip to the
    end; the dtype is not changed.
    Args:
        clip (torch.tensor): Size is (C, T, H, W)
    Return:
        clip (torch.tensor): Size is (T, H, W, C)
    """

    def __call__(self, tensor):
        new_order = (1, 2, 3, 0)
        return tensor.permute(*new_order)

    def __repr__(self):
        return type(self).__name__
451 | 
def resize(clip, target_size, interpolation_mode):
    """Resize ``clip`` to ``target_size`` with ``torch.nn.functional.interpolate``.

    Args:
        clip: tensor to resize.
        target_size: ``(height, width)`` pair; must have exactly two items.
        interpolation_mode: ``mode`` forwarded to ``interpolate`` (called
            with ``align_corners=False``).

    Raises:
        ValueError: if ``target_size`` does not have length 2.
    """
    if len(target_size) == 2:
        return torch.nn.functional.interpolate(
            clip,
            size=target_size,
            mode=interpolation_mode,
            align_corners=False,
        )
    raise ValueError(
        f"target size should be tuple (height, width), instead got {target_size}"
    )
460 | 
class ResizeVideo(object):
    """Callable transform that resizes a clip to a fixed ``target_size``."""

    def __init__(self, target_size, interpolation_mode="bilinear"):
        self.interpolation_mode = interpolation_mode
        self.target_size = target_size

    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor): Video clip to be resized. Size is
                (C, T, H, W) -- assumed from the callers, TODO confirm.
        Returns:
            torch.tensor: result of `resize` with the stored target size
            and interpolation mode.
        """
        size, mode = self.target_size, self.interpolation_mode
        return resize(clip, size, mode)

    def __repr__(self):
        suffix = "(resize_size={0})".format(self.target_size)
        return self.__class__.__name__ + suffix
478 | 
def load_and_transform_video_data_full(
    video_paths,
    device,
    clip_duration=1,
    sample_per_clip=2,
    sample_rate=16000,
):
    """Decode videos into per-clip frame tensors, padded to a common clip count.

    Each entry of ``video_paths`` is either a path, decoded with the
    ``decord`` backend and cut into uniform clips of ``clip_duration``
    seconds with ``sample_per_clip`` frames each, or a list that is used
    directly as the already-extracted clips. Every clip is resized to
    224x224 (bicubic) and normalized. Videos with fewer clips than the
    longest one are zero-padded along the clip dimension.

    Args:
        video_paths: iterable of paths and/or clip lists, or ``None``.
        device: target device for the returned tensors.
        clip_duration: clip length handed to the clip sampler.
        sample_per_clip: number of frames kept per clip.
        sample_rate: forwarded to ``EncodedVideo.from_path``.

    Returns:
        ``None`` when ``video_paths`` is ``None``; otherwise a pair
        ``(videos, mask)`` where ``mask`` holds 1 for real clips and 0 for
        padding.

    Raises:
        ValueError: if a clip cannot be read from a decoded video.
    """
    if video_paths is None:
        return None

    video_transform = transforms.Compose(
        [
            ResizeVideo((224, 224), interpolation_mode="bicubic"),
            NormalizeVideo(
                mean=(0.48145466, 0.4578275, 0.40821073),
                std=(0.26862954, 0.26130258, 0.27577711),
            ),
        ]
    )
    clip_sampler = UniformClipSampler(
        clip_duration=clip_duration, backpad_last=True
    )
    frame_sampler = pv_transforms.UniformTemporalSubsample(num_samples=sample_per_clip)

    per_video = []
    for video_path in video_paths:
        if isinstance(video_path, list):
            # Caller already supplied the clips.
            clips = video_path
        else:
            video = EncodedVideo.from_path(
                video_path,
                decoder="decord",
                decode_audio=False,
                sample_rate=sample_rate,
            )
            clips = []
            for clip_start, clip_end in get_clip_timepoints(
                clip_sampler, video.duration
            ):
                # Read the clip, get frames
                clip = video.get_clip(clip_start, clip_end)
                if clip is None:
                    raise ValueError("No clip found")
                frames = frame_sampler(clip["video"])
                clips.append(frames / 255.0)  # since this is float, need 0-1

        transformed = [video_transform(clip) for clip in clips]
        per_video.append(torch.stack(transformed, dim=0))

    longest = max((video.size(0) for video in per_video), default=0)

    padded_videos = []
    masks = []
    for video in per_video:
        n_clips = video.size(0)
        missing = longest - n_clips
        if missing > 0:
            filler = video.new_zeros(
                missing, video.size(1), video.size(2), video.size(3), video.size(4)
            )
            video = torch.cat([video, filler], dim=0)
        masks.append([1] * n_clips + [0] * missing)
        padded_videos.append(video)

    batch = torch.stack(padded_videos, dim=0).to(device)
    mask = torch.tensor(masks).to(device)
    return batch, mask
550 | 
def load_and_transform_video_data_blip(
    video_paths,
    device,
    clip_duration=1,
    sample_per_clip=2,
    sample_rate=16000,
):
    """Decode videos into flat frame sequences, padded to a common length.

    Each entry of ``video_paths`` is one of:
      * ``None`` -- replaced by an all-zero placeholder of
        ``int(30 / clip_duration * sample_per_clip)`` frames;
      * a list -- used directly as the already-extracted clips;
      * anything else -- treated as a path, decoded with the ``pyav``
        backend, cut into uniform clips and limited to the first 60 clips.
    Clips are concatenated along dimension 1, resized to 224x224 (bicubic),
    normalized and transposed so that frames come first. Shorter videos
    are zero-padded along the frame dimension.

    Args:
        video_paths: iterable of paths, clip lists and/or ``None``, or
            ``None`` itself.
        device: target device for the returned tensors.
        clip_duration: clip length handed to the clip sampler.
        sample_per_clip: number of frames kept per clip.
        sample_rate: accepted but not used by this function.

    Returns:
        ``None`` when ``video_paths`` is ``None``; otherwise a pair
        ``(videos, mask)`` where ``mask`` holds 1 for frames that were
        present before padding (placeholder frames included) and 0 for
        padding.

    Raises:
        ValueError: if a clip cannot be read from a decoded video.
    """
    if video_paths is None:
        return None

    video_transform = transforms.Compose(
        [
            ResizeVideo((224, 224), interpolation_mode="bicubic"),
            NormalizeVideo(
                mean=(0.48145466, 0.4578275, 0.40821073),
                std=(0.26862954, 0.26130258, 0.27577711),
            ),
        ]
    )
    clip_sampler = UniformClipSampler(
        clip_duration=clip_duration, backpad_last=True
    )
    frame_sampler = pv_transforms.UniformTemporalSubsample(num_samples=sample_per_clip)

    per_video = []
    for entry in video_paths:
        if entry is None:
            # Missing video: stand in with zeros.
            frames = torch.zeros(int(30/clip_duration*sample_per_clip), 3, 224, 224)
        else:
            if isinstance(entry, list):
                clips = entry
            else:
                video = EncodedVideo.from_path(
                    entry,
                    decoder="pyav",
                    decode_audio=False,
                )
                clips = []
                for clip_start, clip_end in get_clip_timepoints(
                    clip_sampler, video.duration
                ):
                    # Read the clip, get frames
                    clip = video.get_clip(clip_start, clip_end)
                    if clip is None:
                        raise ValueError("No clip found")
                    sampled = frame_sampler(clip["video"])
                    clips.append(sampled / 255.0)  # since this is float, need 0-1
                # Hard limit: keep at most the first 60 clips.
                clips = clips[:60]

            merged = torch.cat(clips, dim=1)
            frames = video_transform(merged).transpose(0, 1)  # C, T, H, W -> T, C, H, W

        per_video.append(frames)

    longest = max((frames.size(0) for frames in per_video), default=0)

    padded_videos = []
    masks = []
    for video in per_video:
        n_frames = video.size(0)
        missing = longest - n_frames
        if missing > 0:
            filler = video.new_zeros(
                missing, video.size(1), video.size(2), video.size(3)
            )
            video = torch.cat([video, filler], dim=0)
        masks.append([1] * n_frames + [0] * missing)
        padded_videos.append(video)

    batch = torch.stack(padded_videos, dim=0).to(device)
    mask = torch.tensor(masks).to(device)
    return batch, mask
626 | 
def load_and_transform_video_data(
    video_paths,
    device,
    clip_duration=2,
    clips_per_video=5,
    sample_rate=16000,
):
    """Decode videos into stacked, normalised and spatially cropped clips.

    For every path, ``clips_per_video`` clips of ``clip_duration`` seconds
    are sampled with the ``decord`` backend. Each clip is temporally
    subsampled, scaled to 0-1, short-side scaled to 224, normalised and
    passed through ``SpatialCrop(224, num_crops=3)``.

    Args:
        video_paths: Iterable of video paths, or ``None``.
        device: Device the stacked result is moved to.
        clip_duration: Clip length in seconds; also used as the number of
            frames kept per clip.
        clips_per_video: Number of clips sampled from each video.
        sample_rate: Forwarded to ``EncodedVideo.from_path``.

    Returns:
        ``None`` when ``video_paths`` is ``None``; otherwise the per-video
        clip stacks, stacked along a new leading dimension, on ``device``.

    Raises:
        ValueError: If a clip cannot be read from a video.
    """
    if video_paths is None:
        return None

    batch = []
    per_clip_transform = transforms.Compose(
        [
            pv_transforms.ShortSideScale(224),
            NormalizeVideo(
                mean=(0.48145466, 0.4578275, 0.40821073),
                std=(0.26862954, 0.26130258, 0.27577711),
            ),
        ]
    )
    sampler = ConstantClipsPerVideoSampler(
        clip_duration=clip_duration, clips_per_video=clips_per_video
    )
    subsample = pv_transforms.UniformTemporalSubsample(num_samples=clip_duration)

    for path in video_paths:
        encoded = EncodedVideo.from_path(
            path,
            decoder="decord",
            decode_audio=False,
            sample_rate=sample_rate,
        )

        raw_clips = []
        for start, end in (
            (span[0], span[1])
            for span in get_clip_timepoints(sampler, encoded.duration)
        ):
            segment = encoded.get_clip(start, end)
            if segment is None:
                raise ValueError("No clip found")
            # Frames are float; bring them into the 0-1 range.
            raw_clips.append(subsample(segment["video"]) / 255.0)

        transformed = [per_clip_transform(item) for item in raw_clips]
        cropped = SpatialCrop(224, num_crops=3)(transformed)
        batch.append(torch.stack(cropped, dim=0))

    return torch.stack(batch, dim=0).to(device)
681 | 


--------------------------------------------------------------------------------
/model/ImageBind/model_card.md:
--------------------------------------------------------------------------------
 1 | # Model Card for ImageBind
 2 | 
 3 | Multimodal joint embedding model for image/video, text, audio, depth, IMU, and thermal images.
 4 | Input any of the six modalities and get the same sized embedding that can be used for cross-modal and multimodal tasks.
 5 | 
 6 | # Model Details
 7 | 
 8 | ## Model Description
 9 | 
10 | <!-- Provide a longer summary of what this model is/does. -->
11 | Multimodal joint embedding model for image/video, text, audio, depth, IMU, and thermal images
12 | 
13 | - **Developed by:** Meta AI
14 | - **Model type:** Multimodal model
15 | - **Language(s) (NLP):** en
16 | - **License:** CC BY-NC-SA 4.0
17 | - **Resources for more information:**
18 |     - [GitHub Repo](https://github.com/facebookresearch/ImageBind)
19 | 
20 | 
21 | # Uses
22 | 
23 | <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
24 | This model is intended only for research purposes. It provides a joint embedding space for different modalities -- image/video, text, audio, depth, IMU and thermal images.
25 | We hope that these joint embeddings can be used for a variety of different cross-modal research, e.g., cross-modal retrieval and combining embeddings from different modalities.
26 | 
27 | ## Out-of-Scope Use
28 | 
29 | <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
30 | <!-- If the user enters content, print that. If not, but they enter a task in the list, use that. If neither, say "more info needed." -->
31 | 
32 | This model is *NOT* intended to be used in any real world application -- commercial or otherwise.
33 | It may produce harmful associations with different inputs.
34 | The model needs to be investigated and likely re-trained on specific data for any such application.
35 | The model is expected to work better on web-based visual data since it was trained on such data.
36 | The text encoder is likely to work only on English language text because of the underlying training datasets.
37 | 
38 | # Bias, Risks, and Limitations
39 | 
40 | <!-- This section is meant to convey both technical and sociotechnical limitations. -->
41 | Open-domain joint embedding models are prone to producing specific biases; see, e.g., the study from [CLIP](https://github.com/openai/CLIP/blob/main/model-card.md#bias-and-fairness).
42 | Since our model uses such models as initialization, it will exhibit such biases too.
43 | Moreover, for learning joint embeddings for other modalities such as audio, thermal, depth, and IMU we leverage datasets that are relatively small. These joint embeddings are thus limited to the concepts present in the datasets. For example, the thermal datasets we used are limited to outdoor street scenes, while the depth datasets are limited to indoor scenes.
44 | 
45 | 
46 | 
47 | # Training Details
48 | 
49 | ## Training Data
50 | 
51 | <!-- This should link to a Data Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
52 | 
53 | ImageBind uses image-paired data for training -- (image, X) where X is one of text, audio, depth, IMU or thermal data.
54 | In particular, we initialize and freeze the image and text encoders using an OpenCLIP ViT-H encoder.
55 | We train audio embeddings using Audioset, depth embeddings using the SUN RGB-D dataset, IMU using the Ego4D dataset and thermal embeddings using the LLVIP dataset.
56 | We provide the exact training data details in the paper.
57 | 
58 | 
59 | ## Training Procedure
60 | 
61 | <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
62 | Please refer to the research paper and github repo for exact details on this.
63 | 
64 | # Evaluation
65 | 
66 | ## Testing Data, Factors & Metrics
67 | 
68 | We evaluate the model on a variety of different classification benchmarks for each modality.
69 | The evaluation details are presented in the paper.
70 | The model's performance is measured using standard classification metrics such as accuracy and mAP.
71 | 
72 | # Citation
73 | 
74 | <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
75 | 
76 | **BibTeX:**
77 | ```
78 | @inproceedings{girdhar2023imagebind,
79 |   title={ImageBind: One Embedding Space To Bind Them All},
80 |   author={Girdhar, Rohit and El-Nouby, Alaaeldin and Liu, Zhuang
81 | and Singh, Mannat and Alwala, Kalyan Vasudev and Joulin, Armand and Misra, Ishan},
82 |   booktitle={CVPR},
83 |   year={2023}
84 | }
85 | ```
86 | 
87 | 
88 | # Model Card Contact
89 | 
90 | Please reach out to the authors at: rgirdhar@meta.com imisra@meta.com alaaelnouby@gmail.com
91 | 
92 | # How to Get Started with the Model
93 | 
94 | Our GitHub repo provides a simple example to extract embeddings from images, audio, etc.
95 | 


--------------------------------------------------------------------------------
/model/ImageBind/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BriansIDP/AudioVisualLLM/52ab7d14b499e7c951df26c89ce2d3673976be9e/model/ImageBind/models/__init__.py


--------------------------------------------------------------------------------
/model/ImageBind/models/__pycache__/__init__.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BriansIDP/AudioVisualLLM/52ab7d14b499e7c951df26c89ce2d3673976be9e/model/ImageBind/models/__pycache__/__init__.cpython-39.pyc


--------------------------------------------------------------------------------
/model/ImageBind/models/__pycache__/helpers.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BriansIDP/AudioVisualLLM/52ab7d14b499e7c951df26c89ce2d3673976be9e/model/ImageBind/models/__pycache__/helpers.cpython-39.pyc


--------------------------------------------------------------------------------
/model/ImageBind/models/__pycache__/imagebind_model.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BriansIDP/AudioVisualLLM/52ab7d14b499e7c951df26c89ce2d3673976be9e/model/ImageBind/models/__pycache__/imagebind_model.cpython-39.pyc


--------------------------------------------------------------------------------
/model/ImageBind/models/__pycache__/multimodal_preprocessors.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BriansIDP/AudioVisualLLM/52ab7d14b499e7c951df26c89ce2d3673976be9e/model/ImageBind/models/__pycache__/multimodal_preprocessors.cpython-39.pyc


--------------------------------------------------------------------------------
/model/ImageBind/models/__pycache__/transformer.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BriansIDP/AudioVisualLLM/52ab7d14b499e7c951df26c89ce2d3673976be9e/model/ImageBind/models/__pycache__/transformer.cpython-39.pyc


--------------------------------------------------------------------------------
/model/ImageBind/models/helpers.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # Portions Copyright (c) Meta Platforms, Inc. and affiliates.
  3 | # All rights reserved.
  4 | 
  5 | # This source code is licensed under the license found in the
  6 | # LICENSE file in the root directory of this source tree.
  7 | 
  8 | import math
  9 | 
 10 | import einops
 11 | import numpy as np
 12 | import torch
 13 | 
 14 | import torch.nn as nn
 15 | 
 16 | 
 17 | class Normalize(nn.Module):
 18 |     def __init__(self, dim: int) -> None:
 19 |         super().__init__()
 20 |         self.dim = dim
 21 | 
 22 |     def forward(self, x):
 23 |         return torch.nn.functional.normalize(x, dim=self.dim, p=2)
 24 | 
 25 | 
 26 | class LearnableLogitScaling(nn.Module):
 27 |     def __init__(
 28 |         self,
 29 |         logit_scale_init: float = 1 / 0.07,
 30 |         learnable: bool = True,
 31 |         max_logit_scale: float = 100,
 32 |     ) -> None:
 33 |         super().__init__()
 34 |         self.max_logit_scale = max_logit_scale
 35 |         self.logit_scale_init = logit_scale_init
 36 |         self.learnable = learnable
 37 |         log_logit_scale = torch.ones([]) * np.log(self.logit_scale_init)
 38 |         if learnable:
 39 |             self.log_logit_scale = nn.Parameter(log_logit_scale)
 40 |         else:
 41 |             self.register_buffer("log_logit_scale", log_logit_scale)
 42 | 
 43 |     def forward(self, x):
 44 |         return torch.clip(self.log_logit_scale.exp(), max=self.max_logit_scale) * x
 45 | 
 46 |     def extra_repr(self):
 47 |         st = f"logit_scale_init={self.logit_scale_init},learnable={self.learnable}, max_logit_scale={self.max_logit_scale}"
 48 |         return st
 49 | 
 50 | 
 51 | class EinOpsRearrange(nn.Module):
 52 |     def __init__(self, rearrange_expr: str, **kwargs) -> None:
 53 |         super().__init__()
 54 |         self.rearrange_expr = rearrange_expr
 55 |         self.kwargs = kwargs
 56 | 
 57 |     def forward(self, x):
 58 |         assert isinstance(x, torch.Tensor)
 59 |         return einops.rearrange(x, self.rearrange_expr, **self.kwargs)
 60 | 
 61 | 
 62 | class VerboseNNModule(nn.Module):
 63 |     """
 64 |     Wrapper around nn.Module that prints registered buffers and parameter names.
 65 |     """
 66 | 
 67 |     @staticmethod
 68 |     def get_readable_tensor_repr(name: str, tensor: torch.Tensor) -> str:
 69 |         st = (
 70 |             "("
 71 |             + name
 72 |             + "): "
 73 |             + "tensor("
 74 |             + str(tuple(tensor[1].shape))
 75 |             + ", requires_grad="
 76 |             + str(tensor[1].requires_grad)
 77 |             + ")\n"
 78 |         )
 79 |         return st
 80 | 
 81 |     def extra_repr(self) -> str:
 82 |         named_modules = set()
 83 |         for p in self.named_modules():
 84 |             named_modules.update([p[0]])
 85 |         named_modules = list(named_modules)
 86 | 
 87 |         string_repr = ""
 88 |         for p in self.named_parameters():
 89 |             name = p[0].split(".")[0]
 90 |             if name not in named_modules:
 91 |                 string_repr += self.get_readable_tensor_repr(name, p)
 92 | 
 93 |         for p in self.named_buffers():
 94 |             name = p[0].split(".")[0]
 95 |             string_repr += self.get_readable_tensor_repr(name, p)
 96 | 
 97 |         return string_repr
 98 | 
 99 | 
def cast_if_src_dtype(
    tensor: torch.Tensor, src_dtype: torch.dtype, tgt_dtype: torch.dtype
):
    """Cast ``tensor`` to ``tgt_dtype`` only if its dtype is ``src_dtype``.

    Returns:
        A ``(tensor, updated)`` pair; ``updated`` is ``True`` when a cast
        happened, otherwise the input tensor is returned with ``False``.
    """
    if tensor.dtype != src_dtype:
        return tensor, False
    return tensor.to(dtype=tgt_dtype), True
108 | 
109 | 
class QuickGELU(nn.Module):
    """Sigmoid-based GELU approximation: ``x * sigmoid(1.702 * x)``."""

    # From https://github.com/openai/CLIP/blob/d50d76daa670286dd6cacf3bcd80b5e4823fc8e1/clip/model.py#L166
    def forward(self, x: torch.Tensor):
        gate = torch.sigmoid(1.702 * x)
        return x * gate
114 | 
115 | 
class SelectElement(nn.Module):
    """Pick a fixed index along dimension 1 of the input."""

    def __init__(self, index) -> None:
        super().__init__()
        # Index applied to dim 1 in forward().
        self.index = index

    def forward(self, x):
        """Return ``x[:, index, ...]``; ``x`` needs at least 3 dimensions."""
        assert x.ndim >= 3
        picked = x[:, self.index, ...]
        return picked
124 | 
125 | 
class SelectEOSAndProject(nn.Module):
    """
    Text Pooling used in OpenCLIP

    Picks one position per sequence and feeds it through ``proj``.
    """

    def __init__(self, proj: nn.Module) -> None:
        super().__init__()
        self.proj = proj

    def forward(self, x, seq_len):
        """Gather ``x[b, seq_len[b]]`` for every batch row and project it.

        ``x`` must be 3-dimensional (B x L x D per the original comment);
        ``seq_len`` supplies the position taken from each row — presumably
        the end-of-text token position, to be confirmed against the caller.
        """
        assert x.ndim == 3
        batch_index = torch.arange(x.shape[0])
        pooled = x[batch_index, seq_len]
        return self.proj(pooled)
142 | 


--------------------------------------------------------------------------------
/model/ImageBind/models/imagebind_model.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # Portions Copyright (c) Meta Platforms, Inc. and affiliates.
  3 | # All rights reserved.
  4 | 
  5 | # This source code is licensed under the license found in the
  6 | # LICENSE file in the root directory of this source tree.
  7 | 
  8 | 
  9 | import os
 10 | import urllib
 11 | from functools import partial
 12 | from types import SimpleNamespace
 13 | 
 14 | import torch
 15 | import torch.nn as nn
 16 | 
 17 | from .helpers import (
 18 |     EinOpsRearrange,
 19 |     LearnableLogitScaling,
 20 |     Normalize,
 21 |     SelectElement,
 22 |     SelectEOSAndProject,
 23 | )
 24 | from .multimodal_preprocessors import (
 25 |     AudioPreprocessor,
 26 |     IMUPreprocessor,
 27 |     PadIm2Video,
 28 |     PatchEmbedGeneric,
 29 |     RGBDTPreprocessor,
 30 |     SpatioTemporalPosEmbeddingHelper,
 31 |     TextPreprocessor,
 32 |     ThermalPreprocessor,
 33 | )
 34 | 
 35 | from .transformer import MultiheadAttention, SimpleTransformer
 36 | 
 37 | 
 38 | ModalityType = SimpleNamespace(
 39 |     VISION="vision",
 40 |     TEXT="text",
 41 |     AUDIO="audio",
 42 |     THERMAL="thermal",
 43 |     DEPTH="depth",
 44 |     IMU="imu",
 45 | )
 46 | 
 47 | 
 48 | class ImageBindModel(nn.Module):
 49 |     def __init__(
 50 |         self,
 51 |         video_frames=2,
 52 |         kernel_size=(2, 14, 14),
 53 |         audio_kernel_size=16,
 54 |         audio_stride=10,
 55 |         out_embed_dim=768,
 56 |         vision_embed_dim=1024,
 57 |         vision_num_blocks=24,
 58 |         vision_num_heads=16,
 59 |         audio_embed_dim=768,
 60 |         audio_num_blocks=12,
 61 |         audio_num_heads=12,
 62 |         audio_num_mel_bins=128,
 63 |         audio_target_len=204,
 64 |         audio_drop_path=0.1,
 65 |         text_embed_dim=768,
 66 |         text_num_blocks=12,
 67 |         text_num_heads=12,
 68 |         depth_embed_dim=384,
 69 |         depth_kernel_size=16,
 70 |         depth_num_blocks=12,
 71 |         depth_num_heads=8,
 72 |         depth_drop_path=0.0,
 73 |         thermal_embed_dim=768,
 74 |         thermal_kernel_size=16,
 75 |         thermal_num_blocks=12,
 76 |         thermal_num_heads=12,
 77 |         thermal_drop_path=0.0,
 78 |         imu_embed_dim=512,
 79 |         imu_kernel_size=8,
 80 |         imu_num_blocks=6,
 81 |         imu_num_heads=8,
 82 |         imu_drop_path=0.7,
 83 |     ):
 84 |         super().__init__()
 85 | 
 86 |         self.modality_preprocessors = self._create_modality_preprocessors(
 87 |             video_frames,
 88 |             vision_embed_dim,
 89 |             kernel_size,
 90 |             text_embed_dim,
 91 |             audio_embed_dim,
 92 |             audio_kernel_size,
 93 |             audio_stride,
 94 |             audio_num_mel_bins,
 95 |             audio_target_len,
 96 |             depth_embed_dim,
 97 |             depth_kernel_size,
 98 |             thermal_embed_dim,
 99 |             thermal_kernel_size,
100 |             imu_embed_dim,
101 |         )
102 | 
103 |         self.modality_trunks = self._create_modality_trunks(
104 |             vision_embed_dim,
105 |             vision_num_blocks,
106 |             vision_num_heads,
107 |             text_embed_dim,
108 |             text_num_blocks,
109 |             text_num_heads,
110 |             audio_embed_dim,
111 |             audio_num_blocks,
112 |             audio_num_heads,
113 |             audio_drop_path,
114 |             depth_embed_dim,
115 |             depth_num_blocks,
116 |             depth_num_heads,
117 |             depth_drop_path,
118 |             thermal_embed_dim,
119 |             thermal_num_blocks,
120 |             thermal_num_heads,
121 |             thermal_drop_path,
122 |             imu_embed_dim,
123 |             imu_num_blocks,
124 |             imu_num_heads,
125 |             imu_drop_path,
126 |         )
127 | 
128 |         self.modality_heads = self._create_modality_heads(
129 |             out_embed_dim,
130 |             vision_embed_dim,
131 |             text_embed_dim,
132 |             audio_embed_dim,
133 |             depth_embed_dim,
134 |             thermal_embed_dim,
135 |             imu_embed_dim,
136 |         )
137 | 
138 |         self.modality_postprocessors = self._create_modality_postprocessors(
139 |             out_embed_dim
140 |         )
141 | 
142 |     def _create_modality_preprocessors(
143 |         self,
144 |         video_frames=2,
145 |         vision_embed_dim=1024,
146 |         kernel_size=(2, 14, 14),
147 |         text_embed_dim=768,
148 |         audio_embed_dim=768,
149 |         audio_kernel_size=16,
150 |         audio_stride=10,
151 |         audio_num_mel_bins=128,
152 |         audio_target_len=204,
153 |         depth_embed_dim=768,
154 |         depth_kernel_size=16,
155 |         thermal_embed_dim=768,
156 |         thermal_kernel_size=16,
157 |         imu_embed_dim=512,
158 |     ):
159 |         rgbt_stem = PatchEmbedGeneric(
160 |             proj_stem=[
161 |                 PadIm2Video(pad_type="repeat", ntimes=2),
162 |                 nn.Conv3d(
163 |                     in_channels=3,
164 |                     kernel_size=kernel_size,
165 |                     out_channels=vision_embed_dim,
166 |                     stride=kernel_size,
167 |                     bias=False,
168 |                 ),
169 |             ]
170 |         )
171 |         rgbt_preprocessor = RGBDTPreprocessor(
172 |             img_size=[3, video_frames, 224, 224],
173 |             num_cls_tokens=1,
174 |             pos_embed_fn=partial(SpatioTemporalPosEmbeddingHelper, learnable=True),
175 |             rgbt_stem=rgbt_stem,
176 |             depth_stem=None,
177 |         )
178 | 
179 |         text_preprocessor = TextPreprocessor(
180 |             context_length=77,
181 |             vocab_size=49408,
182 |             embed_dim=text_embed_dim,
183 |             causal_masking=True,
184 |         )
185 | 
186 |         audio_stem = PatchEmbedGeneric(
187 |             proj_stem=[
188 |                 nn.Conv2d(
189 |                     in_channels=1,
190 |                     kernel_size=audio_kernel_size,
191 |                     stride=audio_stride,
192 |                     out_channels=audio_embed_dim,
193 |                     bias=False,
194 |                 ),
195 |             ],
196 |             norm_layer=nn.LayerNorm(normalized_shape=audio_embed_dim),
197 |         )
198 |         audio_preprocessor = AudioPreprocessor(
199 |             img_size=[1, audio_num_mel_bins, audio_target_len],
200 |             num_cls_tokens=1,
201 |             pos_embed_fn=partial(SpatioTemporalPosEmbeddingHelper, learnable=True),
202 |             audio_stem=audio_stem,
203 |         )
204 | 
205 |         depth_stem = PatchEmbedGeneric(
206 |             [
207 |                 nn.Conv2d(
208 |                     kernel_size=depth_kernel_size,
209 |                     in_channels=1,
210 |                     out_channels=depth_embed_dim,
211 |                     stride=depth_kernel_size,
212 |                     bias=False,
213 |                 ),
214 |             ],
215 |             norm_layer=nn.LayerNorm(normalized_shape=depth_embed_dim),
216 |         )
217 | 
218 |         depth_preprocessor = RGBDTPreprocessor(
219 |             img_size=[1, 224, 224],
220 |             num_cls_tokens=1,
221 |             pos_embed_fn=partial(SpatioTemporalPosEmbeddingHelper, learnable=True),
222 |             rgbt_stem=None,
223 |             depth_stem=depth_stem,
224 |         )
225 | 
226 |         thermal_stem = PatchEmbedGeneric(
227 |             [
228 |                 nn.Conv2d(
229 |                     kernel_size=thermal_kernel_size,
230 |                     in_channels=1,
231 |                     out_channels=thermal_embed_dim,
232 |                     stride=thermal_kernel_size,
233 |                     bias=False,
234 |                 ),
235 |             ],
236 |             norm_layer=nn.LayerNorm(normalized_shape=thermal_embed_dim),
237 |         )
238 |         thermal_preprocessor = ThermalPreprocessor(
239 |             img_size=[1, 224, 224],
240 |             num_cls_tokens=1,
241 |             pos_embed_fn=partial(SpatioTemporalPosEmbeddingHelper, learnable=True),
242 |             thermal_stem=thermal_stem,
243 |         )
244 | 
245 |         imu_stem = PatchEmbedGeneric(
246 |             [
247 |                 nn.Linear(
248 |                     in_features=48,
249 |                     out_features=imu_embed_dim,
250 |                     bias=False,
251 |                 ),
252 |             ],
253 |             norm_layer=nn.LayerNorm(normalized_shape=imu_embed_dim),
254 |         )
255 | 
256 |         imu_preprocessor = IMUPreprocessor(
257 |             img_size=[6, 2000],
258 |             num_cls_tokens=1,
259 |             kernel_size=8,
260 |             embed_dim=imu_embed_dim,
261 |             pos_embed_fn=partial(SpatioTemporalPosEmbeddingHelper, learnable=True),
262 |             imu_stem=imu_stem,
263 |         )
264 | 
265 |         modality_preprocessors = {
266 |             ModalityType.VISION: rgbt_preprocessor,
267 |             ModalityType.TEXT: text_preprocessor,
268 |             ModalityType.AUDIO: audio_preprocessor,
269 |             ModalityType.DEPTH: depth_preprocessor,
270 |             ModalityType.THERMAL: thermal_preprocessor,
271 |             ModalityType.IMU: imu_preprocessor,
272 |         }
273 | 
274 |         return nn.ModuleDict(modality_preprocessors)
275 | 
276 |     def _create_modality_trunks(
277 |         self,
278 |         vision_embed_dim=1024,
279 |         vision_num_blocks=24,
280 |         vision_num_heads=16,
281 |         text_embed_dim=768,
282 |         text_num_blocks=12,
283 |         text_num_heads=12,
284 |         audio_embed_dim=768,
285 |         audio_num_blocks=12,
286 |         audio_num_heads=12,
287 |         audio_drop_path=0.0,
288 |         depth_embed_dim=768,
289 |         depth_num_blocks=12,
290 |         depth_num_heads=12,
291 |         depth_drop_path=0.0,
292 |         thermal_embed_dim=768,
293 |         thermal_num_blocks=12,
294 |         thermal_num_heads=12,
295 |         thermal_drop_path=0.0,
296 |         imu_embed_dim=512,
297 |         imu_num_blocks=6,
298 |         imu_num_heads=8,
299 |         imu_drop_path=0.7,
300 |     ):
301 |         def instantiate_trunk(
302 |             embed_dim, num_blocks, num_heads, pre_transformer_ln, add_bias_kv, drop_path
303 |         ):
304 |             return SimpleTransformer(
305 |                 embed_dim=embed_dim,
306 |                 num_blocks=num_blocks,
307 |                 ffn_dropout_rate=0.0,
308 |                 drop_path_rate=drop_path,
309 |                 attn_target=partial(
310 |                     MultiheadAttention,
311 |                     embed_dim=embed_dim,
312 |                     num_heads=num_heads,
313 |                     bias=True,
314 |                     add_bias_kv=add_bias_kv,
315 |                 ),
316 |                 pre_transformer_layer=nn.Sequential(
317 |                     nn.LayerNorm(embed_dim, eps=1e-6)
318 |                     if pre_transformer_ln
319 |                     else nn.Identity(),
320 |                     EinOpsRearrange("b l d -> l b d"),
321 |                 ),
322 |                 post_transformer_layer=EinOpsRearrange("l b d -> b l d"),
323 |             )
324 | 
325 |         modality_trunks = {}
326 |         modality_trunks[ModalityType.VISION] = instantiate_trunk(
327 |             vision_embed_dim,
328 |             vision_num_blocks,
329 |             vision_num_heads,
330 |             pre_transformer_ln=True,
331 |             add_bias_kv=False,
332 |             drop_path=0.0,
333 |         )
334 |         modality_trunks[ModalityType.TEXT] = instantiate_trunk(
335 |             text_embed_dim,
336 |             text_num_blocks,
337 |             text_num_heads,
338 |             pre_transformer_ln=False,
339 |             add_bias_kv=False,
340 |             drop_path=0.0,
341 |         )
342 |         modality_trunks[ModalityType.AUDIO] = instantiate_trunk(
343 |             audio_embed_dim,
344 |             audio_num_blocks,
345 |             audio_num_heads,
346 |             pre_transformer_ln=False,
347 |             add_bias_kv=True,
348 |             drop_path=audio_drop_path,
349 |         )
350 |         modality_trunks[ModalityType.DEPTH] = instantiate_trunk(
351 |             depth_embed_dim,
352 |             depth_num_blocks,
353 |             depth_num_heads,
354 |             pre_transformer_ln=False,
355 |             add_bias_kv=True,
356 |             drop_path=depth_drop_path,
357 |         )
358 |         modality_trunks[ModalityType.THERMAL] = instantiate_trunk(
359 |             thermal_embed_dim,
360 |             thermal_num_blocks,
361 |             thermal_num_heads,
362 |             pre_transformer_ln=False,
363 |             add_bias_kv=True,
364 |             drop_path=thermal_drop_path,
365 |         )
366 |         modality_trunks[ModalityType.IMU] = instantiate_trunk(
367 |             imu_embed_dim,
368 |             imu_num_blocks,
369 |             imu_num_heads,
370 |             pre_transformer_ln=False,
371 |             add_bias_kv=True,
372 |             drop_path=imu_drop_path,
373 |         )
374 | 
375 |         return nn.ModuleDict(modality_trunks)
376 | 
377 |     def _create_modality_heads(
378 |         self,
379 |         out_embed_dim,
380 |         vision_embed_dim,
381 |         text_embed_dim,
382 |         audio_embed_dim,
383 |         depth_embed_dim,
384 |         thermal_embed_dim,
385 |         imu_embed_dim,
386 |     ):
387 |         modality_heads = {}
388 | 
389 |         modality_heads[ModalityType.VISION] = nn.Sequential(
390 |             nn.LayerNorm(normalized_shape=vision_embed_dim, eps=1e-6),
391 |             SelectElement(index=0),
392 |             nn.Linear(vision_embed_dim, out_embed_dim, bias=False),
393 |         )
394 | 
395 |         modality_heads[ModalityType.TEXT] = SelectEOSAndProject(
396 |             proj=nn.Sequential(
397 |                 nn.LayerNorm(normalized_shape=text_embed_dim, eps=1e-6),
398 |                 nn.Linear(text_embed_dim, out_embed_dim, bias=False),
399 |             )
400 |         )
401 | 
402 |         modality_heads[ModalityType.AUDIO] = nn.Sequential(
403 |             nn.LayerNorm(normalized_shape=audio_embed_dim, eps=1e-6),
404 |             SelectElement(index=0),
405 |             nn.Linear(audio_embed_dim, out_embed_dim, bias=False),
406 |         )
407 | 
408 |         modality_heads[ModalityType.DEPTH] = nn.Sequential(
409 |             nn.LayerNorm(normalized_shape=depth_embed_dim, eps=1e-6),
410 |             SelectElement(index=0),
411 |             nn.Linear(depth_embed_dim, out_embed_dim, bias=False),
412 |         )
413 | 
414 |         modality_heads[ModalityType.THERMAL] = nn.Sequential(
415 |             nn.LayerNorm(normalized_shape=thermal_embed_dim, eps=1e-6),
416 |             SelectElement(index=0),
417 |             nn.Linear(thermal_embed_dim, out_embed_dim, bias=False),
418 |         )
419 | 
420 |         modality_heads[ModalityType.IMU] = nn.Sequential(
421 |             nn.LayerNorm(normalized_shape=imu_embed_dim, eps=1e-6),
422 |             SelectElement(index=0),
423 |             nn.Dropout(p=0.5),
424 |             nn.Linear(imu_embed_dim, out_embed_dim, bias=False),
425 |         )
426 | 
427 |         return nn.ModuleDict(modality_heads)
428 | 
429 |     def _create_modality_postprocessors(self, out_embed_dim):
430 |         modality_postprocessors = {}
431 | 
432 |         modality_postprocessors[ModalityType.VISION] = Normalize(dim=-1)
433 |         modality_postprocessors[ModalityType.TEXT] = nn.Sequential(
434 |             Normalize(dim=-1), LearnableLogitScaling(learnable=True)
435 |         )
436 |         modality_postprocessors[ModalityType.AUDIO] = nn.Sequential(
437 |             Normalize(dim=-1),
438 |             LearnableLogitScaling(logit_scale_init=20.0, learnable=False),
439 |         )
440 |         modality_postprocessors[ModalityType.DEPTH] = nn.Sequential(
441 |             Normalize(dim=-1),
442 |             LearnableLogitScaling(logit_scale_init=5.0, learnable=False),
443 |         )
444 |         modality_postprocessors[ModalityType.THERMAL] = nn.Sequential(
445 |             Normalize(dim=-1),
446 |             LearnableLogitScaling(logit_scale_init=10.0, learnable=False),
447 |         )
448 |         modality_postprocessors[ModalityType.IMU] = nn.Sequential(
449 |             Normalize(dim=-1),
450 |             LearnableLogitScaling(logit_scale_init=5.0, learnable=False),
451 |         )
452 |         return nn.ModuleDict(modality_postprocessors)
453 | 
454 |     def forward(self, inputs):
455 |         outputs = {}
456 |         for modality_key, modality_value in inputs.items():
457 |             reduce_list = (
458 |                 modality_value.ndim >= 5
459 |             )  # Audio and Video inputs consist of multiple clips
460 |             if reduce_list:
461 |                 B, S = modality_value.shape[:2]
462 |                 modality_value = modality_value.reshape(
463 |                     B * S, *modality_value.shape[2:]
464 |                 )
465 | 
466 |             if modality_value is not None:
467 |                 modality_value = self.modality_preprocessors[modality_key](
468 |                     **{modality_key: modality_value}
469 |                 )
470 |                 trunk_inputs = modality_value["trunk"]
471 |                 head_inputs = modality_value["head"]
472 |                 modality_value = self.modality_trunks[modality_key](**trunk_inputs)
473 |                 modality_value = self.modality_heads[modality_key](
474 |                     modality_value, **head_inputs
475 |                 )
476 |                 if modality_key in [ModalityType.AUDIO]:
477 |                     modality_value = self.modality_postprocessors[modality_key][0](
478 |                         modality_value
479 |                     )
480 |                 else:
481 |                     modality_value = self.modality_postprocessors[modality_key](
482 |                         modality_value
483 |                     )
484 | 
485 |                 if reduce_list:
486 |                     modality_value = modality_value.reshape(B, S, -1)
487 |                     # modality_value = modality_value.mean(dim=1)
488 | 
489 |                 outputs[modality_key] = modality_value
490 | 
491 |         return outputs
492 | 
493 | 
def imagebind_huge(pretrained=False, store_path=r'.checkpoints'):
    """Construct the "huge" ImageBind configuration.

    Args:
        pretrained: When true, load weights from
            ``{store_path}/imagebind_huge.pth``, downloading the file first
            if it does not exist yet.
        store_path: Directory holding (or receiving) the checkpoint.

    Returns:
        Tuple ``(model, 1024)``; 1024 matches the ``out_embed_dim`` the model
        is built with.
    """
    model = ImageBindModel(
        vision_embed_dim=1280,
        vision_num_blocks=32,
        vision_num_heads=16,
        text_embed_dim=1024,
        text_num_blocks=24,
        text_num_heads=16,
        out_embed_dim=1024,
        audio_drop_path=0.1,
        imu_drop_path=0.7,
    )

    if not pretrained:
        return model, 1024

    checkpoint_path = "{}/imagebind_huge.pth".format(store_path)
    if not os.path.exists(checkpoint_path):
        print(
            "Downloading imagebind weights to {}/imagebind_huge.pth ...".format(store_path)
        )
        os.makedirs(store_path, exist_ok=True)
        torch.hub.download_url_to_file(
            "https://dl.fbaipublicfiles.com/imagebind/imagebind_huge.pth",
            checkpoint_path,
            progress=True,
        )

    state_dict = torch.load(checkpoint_path)
    model.load_state_dict(state_dict)
    return model, 1024
522 | 


--------------------------------------------------------------------------------
/model/ImageBind/models/multimodal_preprocessors.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # Portions Copyright (c) Meta Platforms, Inc. and affiliates.
  3 | # All rights reserved.
  4 | 
  5 | # This source code is licensed under the license found in the
  6 | # LICENSE file in the root directory of this source tree.
  7 | 
  8 | import gzip
  9 | import html
 10 | import io
 11 | import math
 12 | from functools import lru_cache
 13 | from typing import Callable, List, Optional
 14 | 
 15 | import ftfy
 16 | 
 17 | import numpy as np
 18 | import regex as re
 19 | import torch
 20 | import torch.nn as nn
 21 | from iopath.common.file_io import g_pathmgr
 22 | from timm.models.layers import trunc_normal_
 23 | 
 24 | from .helpers import cast_if_src_dtype, VerboseNNModule
 25 | 
 26 | 
 27 | def get_sinusoid_encoding_table(n_position, d_hid):
 28 |     """Sinusoid position encoding table"""
 29 | 
 30 |     # TODO: make it with torch instead of numpy
 31 |     def get_position_angle_vec(position):
 32 |         return [
 33 |             position / np.power(10000, 2 * (hid_j // 2) / d_hid)
 34 |             for hid_j in range(d_hid)
 35 |         ]
 36 | 
 37 |     sinusoid_table = np.array(
 38 |         [get_position_angle_vec(pos_i) for pos_i in range(n_position)]
 39 |     )
 40 |     sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
 41 |     sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1
 42 | 
 43 |     return torch.FloatTensor(sinusoid_table).unsqueeze(0)
 44 | 
 45 | 
 46 | def interpolate_pos_encoding_2d(target_spatial_size, pos_embed):
 47 |     N = pos_embed.shape[1]
 48 |     if N == target_spatial_size:
 49 |         return pos_embed
 50 |     dim = pos_embed.shape[-1]
 51 |     # nn.functional.interpolate doesn't work with bfloat16 so we cast to float32
 52 |     pos_embed, updated = cast_if_src_dtype(pos_embed, torch.bfloat16, torch.float32)
 53 |     pos_embed = nn.functional.interpolate(
 54 |         pos_embed.reshape(1, int(math.sqrt(N)), int(math.sqrt(N)), dim).permute(
 55 |             0, 3, 1, 2
 56 |         ),
 57 |         scale_factor=math.sqrt(target_spatial_size / N),
 58 |         mode="bicubic",
 59 |     )
 60 |     if updated:
 61 |         pos_embed, _ = cast_if_src_dtype(pos_embed, torch.float32, torch.bfloat16)
 62 |     pos_embed = pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
 63 |     return pos_embed
 64 | 
 65 | 
 66 | def interpolate_pos_encoding(
 67 |     npatch_per_img,
 68 |     pos_embed,
 69 |     patches_layout,
 70 |     input_shape=None,
 71 |     first_patch_idx=1,
 72 | ):
 73 |     assert first_patch_idx == 0 or first_patch_idx == 1, "there is 1 CLS token or none"
 74 |     N = pos_embed.shape[1] - first_patch_idx  # since it's 1 if cls_token exists
 75 |     if npatch_per_img == N:
 76 |         return pos_embed
 77 | 
 78 |     assert (
 79 |         patches_layout[-1] == patches_layout[-2]
 80 |     ), "Interpolation of pos embed not supported for non-square layouts"
 81 | 
 82 |     class_emb = pos_embed[:, :first_patch_idx]
 83 |     pos_embed = pos_embed[:, first_patch_idx:]
 84 | 
 85 |     if input_shape is None or patches_layout[0] == 1:
 86 |         # simple 2D pos embedding, no temporal component
 87 |         pos_embed = interpolate_pos_encoding_2d(npatch_per_img, pos_embed)
 88 |     elif patches_layout[0] > 1:
 89 |         # pos embed has a temporal component
 90 |         assert len(input_shape) == 4, "temporal interpolation not supported"
 91 |         # we only support 2D interpolation in this case
 92 |         num_frames = patches_layout[0]
 93 |         num_spatial_tokens = patches_layout[1] * patches_layout[2]
 94 |         pos_embed = pos_embed.view(1, num_frames, num_spatial_tokens, -1)
 95 |         # interpolate embedding for zeroth frame
 96 |         pos_embed = interpolate_pos_encoding_2d(
 97 |             npatch_per_img, pos_embed[0, 0, ...].unsqueeze(0)
 98 |         )
 99 |     else:
100 |         raise ValueError("This type of interpolation isn't implemented")
101 | 
102 |     return torch.cat((class_emb, pos_embed), dim=1)
103 | 
104 | 
def _get_pos_embedding(
    npatch_per_img,
    pos_embed,
    patches_layout,
    input_shape,
    first_patch_idx=1,
):
    """Thin wrapper that forwards all arguments to ``interpolate_pos_encoding``."""
    return interpolate_pos_encoding(
        npatch_per_img,
        pos_embed,
        patches_layout,
        input_shape=input_shape,
        first_patch_idx=first_patch_idx,
    )
120 | 
121 | 
class PatchEmbedGeneric(nn.Module):
    """
    PatchEmbed from Hydra

    Wraps a projection stem that turns an input into a feature map, then
    flattens that map into a token sequence and optionally normalises it.
    """

    def __init__(self, proj_stem, norm_layer: Optional[nn.Module] = None):
        """
        Args:
            proj_stem: Sequence of modules forming the projection. A single
                module is stored directly (not wrapped in ``nn.Sequential``).
            norm_layer: Optional module applied to the flattened tokens.
        """
        super().__init__()

        if len(proj_stem) > 1:
            self.proj = nn.Sequential(*proj_stem)
        else:
            # Special case to be able to load pre-trained models that were
            # trained with a standard stem
            self.proj = proj_stem[0]
        self.norm_layer = norm_layer

    def get_patch_layout(self, img_size):
        """Probe the stem with a zero input to discover its output layout.

        Args:
            img_size: Input size without the batch dimension. Any sequence
                (list or tuple) is accepted.

        Returns:
            Tuple ``(patches_layout, num_patches, embed_dim)`` where
            ``patches_layout`` is the stem output shape after the channel
            dimension, ``num_patches`` is its product and ``embed_dim`` is the
            stem's output channel count.
        """
        with torch.no_grad():
            # Unpacking accepts tuples too; ``[1] + img_size`` raised
            # TypeError for tuple sizes.
            dummy_img = torch.zeros([1, *img_size])
            dummy_out = self.proj(dummy_img)
        embed_dim = dummy_out.shape[1]
        patches_layout = tuple(dummy_out.shape[2:])
        num_patches = np.prod(patches_layout)
        return patches_layout, num_patches, embed_dim

    def forward(self, x):
        """Project ``x`` and return tokens with the channel dimension last."""
        x = self.proj(x)
        # B C (T) H W -> B (T)HW C
        x = x.flatten(2).transpose(1, 2)
        if self.norm_layer is not None:
            x = self.norm_layer(x)
        return x
159 | 
160 | 
class SpatioTemporalPosEmbeddingHelper(VerboseNNModule):
    """Holds a position-embedding table and resizes it to the input on demand.

    The table has ``num_cls_tokens + num_patches`` rows. It is a trainable
    parameter (truncated-normal init, std 0.02) when ``learnable`` is true,
    otherwise a fixed sinusoidal buffer.
    """

    def __init__(
        self,
        patches_layout: List,
        num_patches: int,
        num_cls_tokens: int,
        embed_dim: int,
        learnable: bool,
    ) -> None:
        super().__init__()
        self.num_cls_tokens = num_cls_tokens
        self.patches_layout = patches_layout
        self.num_patches = num_patches
        self.num_tokens = num_cls_tokens + num_patches
        self.learnable = learnable

        if not self.learnable:
            # Fixed table: registered as a buffer so it follows the module
            # across devices without being trained.
            self.register_buffer(
                "pos_embed", get_sinusoid_encoding_table(self.num_tokens, embed_dim)
            )
            return

        self.pos_embed = nn.Parameter(torch.zeros(1, self.num_tokens, embed_dim))
        trunc_normal_(self.pos_embed, std=0.02)

    def get_pos_embedding(self, vision_input, all_vision_tokens):
        """Return position embeddings matching ``all_vision_tokens``.

        The patch count is the token count along dim 1 minus the CLS tokens;
        the stored table is interpolated when that differs from its own size.
        """
        patch_token_count = all_vision_tokens.size(1) - self.num_cls_tokens
        return _get_pos_embedding(
            patch_token_count,
            pos_embed=self.pos_embed,
            patches_layout=self.patches_layout,
            input_shape=vision_input.shape,
            first_patch_idx=self.num_cls_tokens,
        )
194 | 
195 | 
class RGBDTPreprocessor(VerboseNNModule):
    """Turn image-like inputs (RGB/thermal and/or depth) into trunk tokens.

    Each provided input is patchified by its stem, optionally prefixed with
    CLS tokens, and optionally summed with position and type embeddings. When
    both inputs are given their token sequences are added element-wise.
    """

    def __init__(
        self,
        rgbt_stem: PatchEmbedGeneric,
        depth_stem: PatchEmbedGeneric,
        img_size: List = (3, 224, 224),
        num_cls_tokens: int = 1,
        pos_embed_fn: Callable = None,
        use_type_embed: bool = False,
        init_param_style: str = "openclip",
    ) -> None:
        """
        Args:
            rgbt_stem: Stem for the ``vision`` input; may be ``None`` if
                ``depth_stem`` is given.
            depth_stem: Stem for the ``depth`` input; may be ``None``.
            img_size: Input size without the batch dimension.
            num_cls_tokens: Number of CLS tokens prepended to the sequence.
            pos_embed_fn: Factory for the position-embedding helper; position
                embeddings are disabled when ``None``.
            use_type_embed: Whether to add a learned type embedding.
            init_param_style: ``"openclip"`` or ``"vit"``.
        """
        super().__init__()
        stem = rgbt_stem if rgbt_stem is not None else depth_stem
        # The stem concatenates ``[1] + img_size``, which needs a list; the
        # default above is a tuple, so normalise here.
        (
            self.patches_layout,
            self.num_patches,
            self.embed_dim,
        ) = stem.get_patch_layout(list(img_size))
        self.rgbt_stem = rgbt_stem
        self.depth_stem = depth_stem
        self.use_pos_embed = pos_embed_fn is not None
        self.use_type_embed = use_type_embed
        self.num_cls_tokens = num_cls_tokens

        if self.use_pos_embed:
            self.pos_embedding_helper = pos_embed_fn(
                patches_layout=self.patches_layout,
                num_cls_tokens=num_cls_tokens,
                num_patches=self.num_patches,
                embed_dim=self.embed_dim,
            )
        if self.num_cls_tokens > 0:
            self.cls_token = nn.Parameter(
                torch.zeros(1, self.num_cls_tokens, self.embed_dim)
            )
        if self.use_type_embed:
            self.type_embed = nn.Parameter(torch.zeros(1, 1, self.embed_dim))

        self.init_parameters(init_param_style)

    @torch.no_grad()
    def init_parameters(self, init_param_style):
        """Initialise the embeddings in place.

        Raises:
            ValueError: for an unknown ``init_param_style``.
        """
        if init_param_style == "openclip":
            # OpenCLIP style initialization
            scale = self.embed_dim**-0.5
            if self.use_pos_embed:
                nn.init.normal_(self.pos_embedding_helper.pos_embed)
                self.pos_embedding_helper.pos_embed *= scale

            if self.num_cls_tokens > 0:
                nn.init.normal_(self.cls_token)
                self.cls_token *= scale
        elif init_param_style == "vit":
            # ``cls_token`` only exists when CLS tokens were requested.
            if self.num_cls_tokens > 0:
                self.cls_token.data.fill_(0)
        else:
            raise ValueError(f"Unknown init {init_param_style}")

        if self.use_type_embed:
            nn.init.normal_(self.type_embed)

    def tokenize_input_and_cls_pos(self, input, stem, mask):
        """Patchify ``input`` with ``stem`` and add CLS/position/type terms.

        ``mask`` is accepted for interface compatibility but is not used.
        """
        # tokens is of shape B x L x D
        tokens = stem(input)
        assert tokens.ndim == 3
        assert tokens.shape[2] == self.embed_dim
        B = tokens.shape[0]
        if self.num_cls_tokens > 0:
            class_tokens = self.cls_token.expand(
                B, -1, -1
            )  # stole class_tokens impl from Phil Wang, thanks
            tokens = torch.cat((class_tokens, tokens), dim=1)
        if self.use_pos_embed:
            pos_embed = self.pos_embedding_helper.get_pos_embedding(input, tokens)
            tokens = tokens + pos_embed
        if self.use_type_embed:
            tokens = tokens + self.type_embed.expand(B, -1, -1)
        return tokens

    def forward(self, vision=None, depth=None, patch_mask=None):
        """Tokenize the given inputs.

        Returns:
            ``{"trunk": {"tokens": ...}, "head": {}}``.

        Raises:
            NotImplementedError: if ``patch_mask`` is given.
            ValueError: if neither ``vision`` nor ``depth`` is given.
        """
        if patch_mask is not None:
            raise NotImplementedError()

        if vision is None and depth is None:
            # Previously this surfaced as an unbound-local error further down.
            raise ValueError("At least one of `vision` or `depth` must be provided")

        if vision is not None:
            vision_tokens = self.tokenize_input_and_cls_pos(
                vision, self.rgbt_stem, patch_mask
            )

        if depth is not None:
            depth_tokens = self.tokenize_input_and_cls_pos(
                depth, self.depth_stem, patch_mask
            )

        # aggregate tokens
        if vision is not None and depth is not None:
            final_tokens = vision_tokens + depth_tokens
        else:
            final_tokens = vision_tokens if vision is not None else depth_tokens
        return_dict = {
            "trunk": {
                "tokens": final_tokens,
            },
            "head": {},
        }
        return return_dict
300 | 
301 | 
class AudioPreprocessor(RGBDTPreprocessor):
    """``RGBDTPreprocessor`` specialised for audio: the audio stem takes the
    RGB(T) slot and no depth stem is used."""

    def __init__(self, audio_stem: PatchEmbedGeneric, **kwargs) -> None:
        super().__init__(audio_stem, None, **kwargs)

    def forward(self, audio=None):
        """Tokenize ``audio`` through the parent's ``vision`` path."""
        tokens_dict = super().forward(vision=audio)
        return tokens_dict
308 | 
309 | 
class ThermalPreprocessor(RGBDTPreprocessor):
    """``RGBDTPreprocessor`` specialised for thermal input: the thermal stem
    takes the RGB(T) slot and no depth stem is used."""

    def __init__(self, thermal_stem: PatchEmbedGeneric, **kwargs) -> None:
        super().__init__(thermal_stem, None, **kwargs)

    def forward(self, thermal=None):
        """Tokenize ``thermal`` through the parent's ``vision`` path."""
        tokens_dict = super().forward(vision=thermal)
        return tokens_dict
316 | 
317 | 
def build_causal_attention_mask(context_length):
    """Return an additive causal mask of shape (context_length, context_length).

    Entries strictly above the diagonal are ``-inf`` (future positions are
    blocked); the diagonal and everything below it are zero.
    """
    # pytorch uses additive attention mask; start from all -inf ...
    blocked = torch.full((context_length, context_length), float("-inf"))
    # ... then zero the diagonal and the lower triangle in place.
    return blocked.triu_(1)
325 | 
326 | 
class TextPreprocessor(VerboseNNModule):
    """Embed token ids and prepare trunk/head inputs for the text modality."""

    def __init__(
        self,
        vocab_size: int,
        context_length: int,
        embed_dim: int,
        causal_masking: bool,
        supply_seq_len_to_head: bool = True,
        num_cls_tokens: int = 0,
        init_param_style: str = "openclip",
    ) -> None:
        """
        Args:
            vocab_size: Size of the token embedding table.
            context_length: Number of token positions per sequence.
            embed_dim: Embedding width.
            causal_masking: Whether to pass a causal attention mask to the trunk.
            supply_seq_len_to_head: Whether to pass ``seq_len`` to the head.
            num_cls_tokens: Number of CLS tokens to prepend (incompatible
                with ``causal_masking``).
            init_param_style: ``"openclip"`` or ``"vit"``.
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.context_length = context_length
        self.token_embedding = nn.Embedding(vocab_size, embed_dim)
        self.pos_embed = nn.Parameter(
            torch.empty(1, self.context_length + num_cls_tokens, embed_dim)
        )
        self.causal_masking = causal_masking
        if self.causal_masking:
            mask = build_causal_attention_mask(self.context_length)
            # register the mask as a buffer so it can be moved to the right device
            self.register_buffer("mask", mask)

        self.supply_seq_len_to_head = supply_seq_len_to_head
        self.num_cls_tokens = num_cls_tokens
        self.embed_dim = embed_dim
        if num_cls_tokens > 0:
            assert self.causal_masking is False, "Masking + CLS token isn't implemented"
            self.cls_token = nn.Parameter(
                torch.zeros(1, self.num_cls_tokens, embed_dim)
            )

        self.init_parameters(init_param_style)

    @torch.no_grad()
    def init_parameters(self, init_param_style="openclip"):
        """Initialise token, position and CLS embeddings in place.

        Raises:
            ValueError: for an unknown ``init_param_style``.
        """
        # OpenCLIP style initialization
        nn.init.normal_(self.token_embedding.weight, std=0.02)
        nn.init.normal_(self.pos_embed, std=0.01)

        if init_param_style == "openclip":
            # OpenCLIP style initialization
            scale = self.embed_dim**-0.5
            if self.num_cls_tokens > 0:
                nn.init.normal_(self.cls_token)
                self.cls_token *= scale
        elif init_param_style == "vit":
            # ``cls_token`` only exists when CLS tokens were requested
            # (the default is zero), so guard the access.
            if self.num_cls_tokens > 0:
                self.cls_token.data.fill_(0)
        else:
            raise ValueError(f"Unknown init {init_param_style}")

    def forward(self, text):
        """Embed ``text`` token ids.

        Returns:
            Dict with ``"trunk"`` holding ``tokens`` (and ``attn_mask`` when
            causal masking is on) and ``"head"`` holding ``seq_len`` when
            ``supply_seq_len_to_head`` is set.
        """
        # text tokens are of shape B x L x D
        text_tokens = self.token_embedding(text)
        # concat CLS tokens if any
        if self.num_cls_tokens > 0:
            B = text_tokens.shape[0]
            class_tokens = self.cls_token.expand(
                B, -1, -1
            )  # stole class_tokens impl from Phil Wang, thanks
            text_tokens = torch.cat((class_tokens, text_tokens), dim=1)
        text_tokens = text_tokens + self.pos_embed
        return_dict = {
            "trunk": {
                "tokens": text_tokens,
            },
            "head": {},
        }
        if self.supply_seq_len_to_head:
            # Index of the largest token id in each sequence. With
            # SimpleTokenizer, <|endoftext|> has the largest id, so this is
            # the end-of-text position (assuming it was not truncated away).
            text_lengths = text.argmax(dim=-1)
            return_dict["head"] = {
                "seq_len": text_lengths,
            }
        if self.causal_masking:
            return_dict["trunk"].update({"attn_mask": self.mask})
        return return_dict
405 | 
406 | 
class Im2Video(nn.Module):
    """Convert an image into a trivial video."""

    def __init__(self, time_dim=2):
        """``time_dim`` is the axis at which the singleton time dim is inserted."""
        super().__init__()
        self.time_dim = time_dim

    def forward(self, x):
        """Return ``x`` as a 5-D tensor.

        5-D inputs pass through untouched; 4-D inputs gain a size-1 axis at
        ``time_dim``.

        Raises:
            ValueError: for any other number of dimensions.
        """
        rank = x.ndim
        if rank == 5:
            return x
        if rank == 4:
            # B, C, H, W -> B, C, T, H, W
            return x.unsqueeze(self.time_dim)
        raise ValueError(f"Dimension incorrect {x.shape}")
422 | 
423 | 
class PadIm2Video(Im2Video):
    """Convert an image to a video and extend its time axis to ``ntimes``.

    Padding is applied only when the time axis has length 1, either by
    repeating the frame (``"repeat"``) or by appending zero frames (``"zero"``).
    """

    def __init__(self, ntimes, pad_type, time_dim=2):
        super().__init__(time_dim=time_dim)
        assert ntimes > 0
        assert pad_type in ["zero", "repeat"]
        self.ntimes = ntimes
        self.pad_type = pad_type

    def forward(self, x):
        """Return ``x`` as a 5-D tensor whose time axis has ``ntimes`` entries
        when the input had a single frame; otherwise return it unchanged."""
        x = super().forward(x)
        if x.shape[self.time_dim] == 1:
            if self.pad_type == "repeat":
                new_shape = [1] * len(x.shape)
                new_shape[self.time_dim] = self.ntimes
                x = x.repeat(new_shape)
            elif self.pad_type == "zero":
                padarg = [0, 0] * len(x.shape)
                # F.pad takes (left, right) pairs starting from the LAST
                # dimension, so the pair for ``time_dim`` must be counted from
                # the end. The old index ``2 * time_dim + 1`` only matched
                # for time_dim == 2 on 5-D input.
                time_axis = self.time_dim % x.ndim
                right_pad_idx = 2 * (x.ndim - 1 - time_axis) + 1
                padarg[right_pad_idx] = self.ntimes - x.shape[self.time_dim]
                x = nn.functional.pad(x, padarg)
        return x
444 | 
445 | 
446 | # Modified from github.com/openai/CLIP
@lru_cache()
def bytes_to_unicode():
    """
    Return a dict mapping every byte value (0-255) to a unique unicode character.

    The reversible BPE codes operate on unicode strings. Bytes that are
    already printable map to themselves; every remaining byte (whitespace,
    control characters, ...) is mapped to a code point from 256 upwards, so
    the BPE code never has to handle whitespace/control characters directly.
    """
    printable = [
        *range(ord("!"), ord("~") + 1),
        *range(ord("¡"), ord("¬") + 1),
        *range(ord("®"), ord("ÿ") + 1),
    ]
    byte_values = list(printable)
    code_points = list(printable)

    shifted = 0
    for byte in range(2**8):
        if byte in printable:
            continue
        # Non-printable byte: assign the next unused code point above 255.
        byte_values.append(byte)
        code_points.append(2**8 + shifted)
        shifted += 1

    return {byte: chr(point) for byte, point in zip(byte_values, code_points)}
472 | 
473 | 
def get_pairs(word):
    """Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length
    strings). Words with fewer than two symbols, including an empty word,
    yield an empty set.
    """
    # Pair every symbol with its successor.
    return set(zip(word, word[1:]))
484 | 
485 | 
def basic_clean(text):
    """Run ``ftfy.fix_text``, HTML-unescape twice, and strip outer whitespace."""
    fixed = ftfy.fix_text(text)
    # Unescape twice to also undo doubly-escaped entities.
    return html.unescape(html.unescape(fixed)).strip()
490 | 
491 | 
def whitespace_clean(text):
    """Collapse each whitespace run into one space and strip both ends."""
    return re.sub(r"\s+", " ", text).strip()
496 | 
497 | 
class SimpleTokenizer(object):
    """Byte-level BPE tokenizer producing fixed-length id tensors."""

    def __init__(self, bpe_path: str, context_length=77):
        """
        Args:
            bpe_path: Path to the gzip-compressed BPE merges file.
            context_length: Default number of ids per encoded text.
        """
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}

        with g_pathmgr.open(bpe_path, "rb") as fh:
            bpe_bytes = io.BytesIO(fh.read())
        # Close the gzip reader deterministically once the text is read.
        with gzip.open(bpe_bytes) as gz:
            merges = gz.read().decode("utf-8").split("\n")
        # Skip the first line and keep a fixed number of merge rules.
        merges = merges[1 : 49152 - 256 - 2 + 1]
        merges = [tuple(merge.split()) for merge in merges]
        vocab = list(bytes_to_unicode().values())
        vocab = vocab + [v + "</w>" for v in vocab]
        for merge in merges:
            vocab.append("".join(merge))
        # Special tokens come last, so <|endoftext|> gets the largest id.
        vocab.extend(["<|startoftext|>", "<|endoftext|>"])
        self.encoder = dict(zip(vocab, range(len(vocab))))
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {
            "<|startoftext|>": "<|startoftext|>",
            "<|endoftext|>": "<|endoftext|>",
        }
        self.pat = re.compile(
            r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
            re.IGNORECASE,
        )
        self.context_length = context_length

    def bpe(self, token):
        """Apply the BPE merges to ``token``.

        Returns the resulting symbols joined by single spaces; results are
        memoised in ``self.cache``.
        """
        if token in self.cache:
            return self.cache[token]
        # Mark the end of the word on its last symbol.
        word = tuple(token[:-1]) + (token[-1] + "</w>",)
        pairs = get_pairs(word)

        if not pairs:
            return token + "</w>"

        while True:
            # Pick the adjacent pair with the lowest (earliest) merge rank.
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    # ``first`` does not occur again: copy the rest verbatim.
                    new_word.extend(word[i:])
                    break
                new_word.extend(word[i:j])
                i = j

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = " ".join(word)
        self.cache[token] = word
        return word

    def encode(self, text):
        """Clean, lower-case and tokenize ``text`` into a list of BPE ids."""
        bpe_tokens = []
        text = whitespace_clean(basic_clean(text)).lower()
        for token in re.findall(self.pat, text):
            # Map the token's UTF-8 bytes to their printable stand-ins.
            token = "".join(self.byte_encoder[b] for b in token.encode("utf-8"))
            bpe_tokens.extend(
                self.encoder[bpe_token] for bpe_token in self.bpe(token).split(" ")
            )
        return bpe_tokens

    def decode(self, tokens):
        """Turn ids back into text; end-of-word markers become spaces."""
        text = "".join([self.decoder[token] for token in tokens])
        text = (
            bytearray([self.byte_decoder[c] for c in text])
            .decode("utf-8", errors="replace")
            .replace("</w>", " ")
        )
        return text

    def __call__(self, texts, context_length=None):
        """Encode one text or a list of texts into a zero-padded long tensor.

        Returns a 1-D tensor of length ``context_length`` when exactly one
        text is encoded, otherwise a 2-D tensor with one row per text.
        """
        if not context_length:
            context_length = self.context_length

        if isinstance(texts, str):
            texts = [texts]

        sot_token = self.encoder["<|startoftext|>"]
        eot_token = self.encoder["<|endoftext|>"]
        all_tokens = [[sot_token] + self.encode(text) + [eot_token] for text in texts]
        result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)

        for i, tokens in enumerate(all_tokens):
            # NOTE(review): over-long inputs are silently truncated, which
            # drops the trailing <|endoftext|> id — confirm this is intended.
            tokens = tokens[:context_length]
            result[i, : len(tokens)] = torch.tensor(tokens)

        if len(result) == 1:
            return result[0]
        return result
605 | 
606 | 
class IMUPreprocessor(VerboseNNModule):
    """Patchify IMU signals along time and turn them into trunk tokens."""

    def __init__(
        self,
        kernel_size: int,
        imu_stem: PatchEmbedGeneric,
        embed_dim: int,
        img_size: List = (6, 2000),
        num_cls_tokens: int = 1,
        pos_embed_fn: Callable = None,
        init_param_style: str = "openclip",
    ) -> None:
        """
        Args:
            kernel_size: Length (and stride) of each window along the last axis.
            imu_stem: Stem whose ``proj`` and ``norm_layer`` embed each window.
            embed_dim: Token width.
            img_size: Input size without batch; ``img_size[1] // kernel_size``
                sets the number of position-embedding slots for windows.
            num_cls_tokens: Number of CLS tokens prepended to the sequence.
            pos_embed_fn: Only checked against ``None`` to decide whether the
                position embedding is added; it is never called here.
            init_param_style: ``"openclip"`` or ``"vit"``.
        """
        super().__init__()
        self.imu_stem = imu_stem
        self.embed_dim = embed_dim
        self.use_pos_embed = pos_embed_fn is not None
        self.num_cls_tokens = num_cls_tokens
        self.kernel_size = kernel_size
        self.pos_embed = nn.Parameter(
            torch.empty(1, (img_size[1] // kernel_size) + num_cls_tokens, embed_dim)
        )

        if self.num_cls_tokens > 0:
            self.cls_token = nn.Parameter(
                torch.zeros(1, self.num_cls_tokens, self.embed_dim)
            )

        self.init_parameters(init_param_style)

    @torch.no_grad()
    def init_parameters(self, init_param_style):
        """Initialise position and CLS embeddings in place.

        Raises:
            ValueError: for an unknown ``init_param_style``.
        """
        nn.init.normal_(self.pos_embed, std=0.01)

        if init_param_style == "openclip":
            # OpenCLIP style initialization
            scale = self.embed_dim**-0.5

            if self.num_cls_tokens > 0:
                nn.init.normal_(self.cls_token)
                self.cls_token *= scale
        elif init_param_style == "vit":
            # ``cls_token`` only exists when CLS tokens were requested.
            if self.num_cls_tokens > 0:
                self.cls_token.data.fill_(0)
        else:
            raise ValueError(f"Unknown init {init_param_style}")

    def tokenize_input_and_cls_pos(self, input, stem):
        """Embed the windows and add CLS tokens / position embeddings."""
        # The stem's own forward (flatten + transpose) is bypassed on purpose:
        # ``proj`` and ``norm_layer`` are applied directly to the windows.
        # tokens is of shape B x L x D
        tokens = stem.norm_layer(stem.proj(input))
        assert tokens.ndim == 3
        assert tokens.shape[2] == self.embed_dim
        B = tokens.shape[0]
        if self.num_cls_tokens > 0:
            class_tokens = self.cls_token.expand(
                B, -1, -1
            )  # stole class_tokens impl from Phil Wang, thanks
            tokens = torch.cat((class_tokens, tokens), dim=1)
        if self.use_pos_embed:
            tokens = tokens + self.pos_embed
        return tokens

    def forward(self, imu):
        """Tokenize ``imu``.

        Returns:
            ``{"trunk": {"tokens": ...}, "head": {}}``.
        """
        # Patchify: non-overlapping windows of ``kernel_size`` along the last
        # axis, then move the window index ahead of the channel axis.
        imu = imu.unfold(
            -1,
            self.kernel_size,
            self.kernel_size,
        ).permute(0, 2, 1, 3)
        # Flatten each window's (channel, sample) values into one vector.
        imu = imu.reshape(imu.size(0), imu.size(1), -1)

        imu_tokens = self.tokenize_input_and_cls_pos(
            imu,
            self.imu_stem,
        )

        return_dict = {
            "trunk": {
                "tokens": imu_tokens,
            },
            "head": {},
        }
        return return_dict
688 | 


--------------------------------------------------------------------------------
/model/ImageBind/models/transformer.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # Portions Copyright (c) Meta Platforms, Inc. and affiliates.
  3 | # All rights reserved.
  4 | 
  5 | # This source code is licensed under the license found in the
  6 | # LICENSE file in the root directory of this source tree.
  7 | 
  8 | # Code modified from
  9 | # https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py ;
 10 | # https://github.com/facebookresearch/deit/blob/main/models.py
 11 | # and https://github.com/facebookresearch/vissl/blob/main/vissl/models/trunks/vision_transformer.py
 12 | 
 13 | 
 14 | import copy
 15 | import fnmatch
 16 | import logging
 17 | from functools import partial
 18 | from typing import Callable, List
 19 | 
 20 | import torch
 21 | import torch.nn as nn
 22 | import torch.utils.checkpoint as checkpoint
 23 | 
 24 | from timm.models.layers import DropPath, trunc_normal_
 25 | 
 26 | 
 27 | class Attention(nn.Module):
 28 |     def __init__(
 29 |         self,
 30 |         dim,
 31 |         num_heads=8,
 32 |         qkv_bias=False,
 33 |         qk_scale=None,
 34 |         attn_drop=0.0,
 35 |         proj_drop=0.0,
 36 |     ):
 37 |         super().__init__()
 38 |         self.num_heads = num_heads
 39 |         head_dim = dim // num_heads
 40 |         # NOTE scale factor was wrong in my original version,
 41 |         # can set manually to be compat with prev weights
 42 |         self.scale = qk_scale or head_dim**-0.5
 43 | 
 44 |         self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
 45 |         self.attn_drop = nn.Dropout(attn_drop)
 46 |         self.proj = nn.Linear(dim, dim)
 47 |         self.proj_drop = nn.Dropout(proj_drop)
 48 | 
 49 |     def forward(self, x):
 50 |         B, N, C = x.shape
 51 |         qkv = (
 52 |             self.qkv(x)
 53 |             .reshape(B, N, 3, self.num_heads, C // self.num_heads)
 54 |             .permute(2, 0, 3, 1, 4)
 55 |         )
 56 |         q, k, v = (
 57 |             qkv[0],
 58 |             qkv[1],
 59 |             qkv[2],
 60 |         )  # make torchscript happy (cannot use tensor as tuple)
 61 | 
 62 |         attn = (q @ k.transpose(-2, -1)) * self.scale
 63 |         attn = attn.softmax(dim=-1)
 64 |         attn = self.attn_drop(attn)
 65 | 
 66 |         x = (attn @ v).transpose(1, 2).reshape(B, N, C)
 67 |         x = self.proj(x)
 68 |         x = self.proj_drop(x)
 69 |         return x
 70 | 
 71 | 
 72 | class Mlp(nn.Module):
 73 |     def __init__(
 74 |         self,
 75 |         in_features,
 76 |         hidden_features=None,
 77 |         out_features=None,
 78 |         act_layer=nn.GELU,
 79 |         drop=0.0,
 80 |     ):
 81 |         super().__init__()
 82 |         out_features = out_features or in_features
 83 |         hidden_features = hidden_features or in_features
 84 |         self.fc1 = nn.Linear(in_features, hidden_features)
 85 |         self.act = act_layer()
 86 |         self.fc2 = nn.Linear(hidden_features, out_features)
 87 |         self.drop = nn.Dropout(drop)
 88 | 
 89 |     def forward(self, x):
 90 |         x = self.fc1(x)
 91 |         x = self.act(x)
 92 |         x = self.drop(x)
 93 |         x = self.fc2(x)
 94 |         x = self.drop(x)
 95 |         return x
 96 | 
 97 | 
 98 | class MultiheadAttention(nn.MultiheadAttention):
 99 |     def forward(self, x: torch.Tensor, attn_mask: torch.Tensor):
100 |         return super().forward(x, x, x, need_weights=False, attn_mask=attn_mask)[0]
101 | 
102 | 
class ViTAttention(Attention):
    """``Attention`` exposed through the ``(x, attn_mask)`` call signature.

    Masking is not supported here: ``attn_mask`` must be ``None``.
    """

    def forward(self, x: torch.Tensor, attn_mask: torch.Tensor):
        assert attn_mask is None
        attended = super().forward(x)
        return attended
107 | 
108 | 
class BlockWithMasking(nn.Module):
    """Pre-norm transformer block: attention and MLP, each with a residual.

    Both residual branches go through DropPath (identity when ``drop_path``
    is 0) and, when ``layer_scale_type`` is set, are multiplied by a learnable
    gamma (one value per channel for ``"per_channel"``, a single value for
    ``"scalar"``) initialised to ``layer_scale_init_value``.
    """

    def __init__(
        self,
        dim: int,
        attn_target: Callable,
        mlp_ratio: int = 4,
        act_layer: Callable = nn.GELU,
        norm_layer: Callable = nn.LayerNorm,
        ffn_dropout_rate: float = 0.0,
        drop_path: float = 0.0,
        layer_scale_type: str = None,
        layer_scale_init_value: float = 1e-4,
    ):
        super().__init__()

        # attn_target must be a factory so every block builds its own module
        assert not isinstance(
            attn_target, nn.Module
        ), "attn_target should be a Callable. Otherwise attn_target is shared across blocks!"
        self.attn = attn_target()
        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        self.norm_1 = norm_layer(dim)
        self.mlp = Mlp(
            in_features=dim,
            hidden_features=int(mlp_ratio * dim),
            act_layer=act_layer,
            drop=ffn_dropout_rate,
        )
        self.norm_2 = norm_layer(dim)
        self.layer_scale_type = layer_scale_type
        if self.layer_scale_type is None:
            return

        gamma_shapes = {
            "per_channel": [1, 1, dim],  # one gamma value per channel
            "scalar": [1, 1, 1],  # single gamma value for all channels
        }
        assert (
            self.layer_scale_type in gamma_shapes
        ), f"Found Layer scale type {self.layer_scale_type}"
        gamma_shape = gamma_shapes[self.layer_scale_type]
        # two gammas: one for the attention branch, one for the MLP branch
        self.layer_scale_gamma1 = nn.Parameter(
            torch.ones(size=gamma_shape) * layer_scale_init_value,
            requires_grad=True,
        )
        self.layer_scale_gamma2 = nn.Parameter(
            torch.ones(size=gamma_shape) * layer_scale_init_value,
            requires_grad=True,
        )

    def forward(self, x: torch.Tensor, attn_mask: torch.Tensor):
        """Run the attention and MLP branches and add them onto ``x``."""
        use_layer_scale = self.layer_scale_type is not None

        attn_branch = self.drop_path(self.attn(self.norm_1(x), attn_mask))
        if use_layer_scale:
            attn_branch = attn_branch * self.layer_scale_gamma1
        x = x + attn_branch

        mlp_branch = self.drop_path(self.mlp(self.norm_2(x)))
        if use_layer_scale:
            mlp_branch = mlp_branch * self.layer_scale_gamma2
        return x + mlp_branch
175 | 
176 | 
# LayerNorm factory with eps fixed to 1e-6; used as the default ``norm_layer``
# of SimpleTransformer below.
_LAYER_NORM = partial(nn.LayerNorm, eps=1e-6)
178 | 
179 | 
class SimpleTransformer(nn.Module):
    """Stack of transformer blocks with optional pre/post layers."""

    def __init__(
        self,
        attn_target: Callable,
        embed_dim: int,
        num_blocks: int,
        block: Callable = BlockWithMasking,
        pre_transformer_layer: Callable = None,
        post_transformer_layer: Callable = None,
        drop_path_rate: float = 0.0,
        drop_path_type: str = "progressive",
        norm_layer: Callable = _LAYER_NORM,
        mlp_ratio: int = 4,
        ffn_dropout_rate: float = 0.0,
        layer_scale_type: str = None,  # from cait; possible values are None, "per_channel", "scalar"
        layer_scale_init_value: float = 1e-4,  # from cait; float
        weight_init_style: str = "jax",  # possible values jax or pytorch
    ):
        """
        Simple Transformer with the following features
        1. Supports masked attention
        2. Supports DropPath
        3. Supports LayerScale
        4. Supports Dropout in Attention and FFN
        5. Makes few assumptions about the input except that it is a Tensor

        ``drop_path_type`` selects the per-block drop-path rates:
        "progressive" ramps linearly from 0 to ``drop_path_rate`` across the
        blocks, "uniform" uses ``drop_path_rate`` for every block; any other
        value raises ``ValueError``.
        """
        super().__init__()
        self.pre_transformer_layer = pre_transformer_layer

        if drop_path_type == "progressive":
            drop_rates = torch.linspace(0, drop_path_rate, num_blocks).tolist()
        elif drop_path_type == "uniform":
            drop_rates = [drop_path_rate] * num_blocks
        else:
            raise ValueError(f"Unknown drop_path_type: {drop_path_type}")

        layers = [
            block(
                dim=embed_dim,
                attn_target=attn_target,
                mlp_ratio=mlp_ratio,
                ffn_dropout_rate=ffn_dropout_rate,
                drop_path=rate,
                norm_layer=norm_layer,
                layer_scale_type=layer_scale_type,
                layer_scale_init_value=layer_scale_init_value,
            )
            for rate in drop_rates
        ]
        self.blocks = nn.Sequential(*layers)
        self.post_transformer_layer = post_transformer_layer
        self.weight_init_style = weight_init_style
        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Initialise Linear and LayerNorm submodules; leave others untouched."""
        if isinstance(m, nn.Linear):
            style = self.weight_init_style
            if style == "jax":
                # Based on MAE and official Jax ViT implementation
                torch.nn.init.xavier_uniform_(m.weight)
            elif style == "pytorch":
                # PyTorch ViT uses trunc_normal_
                trunc_normal_(m.weight, std=0.02)
            # any other style keeps the weight exactly as nn.Linear created it

            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
            return

        if isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def forward(
        self,
        tokens: torch.Tensor,
        attn_mask: torch.Tensor = None,
        use_checkpoint: bool = False,
        checkpoint_every_n: int = 1,
        checkpoint_blk_ids: List[int] = None,
    ):
        """
        Inputs
        - tokens: data of shape N x L x D (or L x N x D depending on the attention implementation)
        - attn: mask of shape L x L
        - use_checkpoint: run the selected blocks through activation checkpointing
        - checkpoint_every_n: when no explicit ids are given, checkpoint every
          n-th block (block ids divisible by n)
        - checkpoint_blk_ids: explicit block ids to checkpoint; only consulted
          when use_checkpoint is True

        Output
        - x: data of shape N x L x D (or L x N x D depending on the attention implementation)
        """
        if self.pre_transformer_layer:
            tokens = self.pre_transformer_layer(tokens)

        if use_checkpoint and checkpoint_blk_ids is None:
            checkpoint_blk_ids = [
                idx
                for idx in range(len(self.blocks))
                if idx % checkpoint_every_n == 0
            ]
        if checkpoint_blk_ids:
            checkpoint_blk_ids = set(checkpoint_blk_ids)

        for blk_id, blk in enumerate(self.blocks):
            if not (use_checkpoint and blk_id in checkpoint_blk_ids):
                tokens = blk(tokens, attn_mask=attn_mask)
                continue
            tokens = checkpoint.checkpoint(
                blk, tokens, attn_mask, use_reentrant=False
            )

        if self.post_transformer_layer:
            tokens = self.post_transformer_layer(tokens)
        return tokens
285 | 


--------------------------------------------------------------------------------
/model/ImageBind/requirements.txt:
--------------------------------------------------------------------------------
 1 | --extra-index-url https://download.pytorch.org/whl/cu113
 2 | torchvision==0.14.0
 3 | torchaudio==0.13.0
 4 | pytorchvideo @ git+https://github.com/facebookresearch/pytorchvideo.git@28fe037d212663c6a24f373b94cc5d478c8c1a1d
 5 | timm==0.6.7
 6 | ftfy
 7 | regex
 8 | einops
 9 | fvcore
10 | decord==0.6.0
11 | 


--------------------------------------------------------------------------------
/model/__init__.py:
--------------------------------------------------------------------------------
 1 | from .agent import DeepSpeedAgent
 2 | from .openllama import OpenLLAMAPEFTModel
 3 | 
 4 | def load_model(args):
 5 |     agent_name = args['models'][args['model']]['agent_name']
 6 |     model_name = args['models'][args['model']]['model_name']
 7 |     model = globals()[model_name](**args)
 8 |     agent = globals()[agent_name](model, args)
 9 |     return agent
10 | 


--------------------------------------------------------------------------------
/model/__pycache__/Qformer.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BriansIDP/AudioVisualLLM/52ab7d14b499e7c951df26c89ce2d3673976be9e/model/__pycache__/Qformer.cpython-39.pyc


--------------------------------------------------------------------------------
/model/__pycache__/__init__.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BriansIDP/AudioVisualLLM/52ab7d14b499e7c951df26c89ce2d3673976be9e/model/__pycache__/__init__.cpython-39.pyc


--------------------------------------------------------------------------------
/model/__pycache__/agent.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BriansIDP/AudioVisualLLM/52ab7d14b499e7c951df26c89ce2d3673976be9e/model/__pycache__/agent.cpython-39.pyc


--------------------------------------------------------------------------------
/model/__pycache__/eva_vit.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BriansIDP/AudioVisualLLM/52ab7d14b499e7c951df26c89ce2d3673976be9e/model/__pycache__/eva_vit.cpython-39.pyc


--------------------------------------------------------------------------------
/model/__pycache__/modeling_llama.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BriansIDP/AudioVisualLLM/52ab7d14b499e7c951df26c89ce2d3673976be9e/model/__pycache__/modeling_llama.cpython-39.pyc


--------------------------------------------------------------------------------
/model/__pycache__/modeling_whisper.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BriansIDP/AudioVisualLLM/52ab7d14b499e7c951df26c89ce2d3673976be9e/model/__pycache__/modeling_whisper.cpython-39.pyc


--------------------------------------------------------------------------------
/model/__pycache__/openllama.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BriansIDP/AudioVisualLLM/52ab7d14b499e7c951df26c89ce2d3673976be9e/model/__pycache__/openllama.cpython-39.pyc


--------------------------------------------------------------------------------
/model/agent.py:
--------------------------------------------------------------------------------
  1 | from header import *
  2 | 
  3 | class DeepSpeedAgent:
  4 |     
  5 |     def __init__(self, model, args):
  6 |         super(DeepSpeedAgent, self).__init__()
  7 |         self.args = args
  8 |         self.model = model
  9 |         if args['stage'] == 2:
 10 |             self.load_stage_1_parameters(args["delta_ckpt_path"])
 11 |             print(f'[!] load stage 1 checkpoint from {args["delta_ckpt_path"]}')
 12 | 
 13 |         # load config parameters of deepspeed
 14 |         ds_params = json.load(open(self.args['ds_config_path']))
 15 |         ds_params['scheduler']['params']['total_num_steps'] = self.args['total_steps'] / ds_params['train_micro_batch_size_per_gpu']
 16 |         ds_params['scheduler']['params']['warmup_num_steps'] = max(10, int(self.args['total_steps'] * self.args['warmup_rate']) / 8)
 17 |         if self.args['world_size'] * ds_params['gradient_accumulation_steps'] * ds_params['train_micro_batch_size_per_gpu'] != ds_params['train_batch_size']:
 18 |             print("Force setting train batch size")
 19 |             ds_params['train_batch_size'] = self.args['world_size'] * ds_params['gradient_accumulation_steps'] * ds_params['train_micro_batch_size_per_gpu']
 20 |         self.ds_engine, self.optimizer, _ , _ = deepspeed.initialize(
 21 |             model=self.model,
 22 |             model_parameters=self.model.parameters(),
 23 |             config_params=ds_params,
 24 |             dist_init_required=True,
 25 |             args=types.SimpleNamespace(**args)
 26 |         )
 27 | 
 28 |     @torch.no_grad()
 29 |     def predict(self, batch):
 30 |         self.model.eval()
 31 |         string = self.model.generate_one_sample(batch)
 32 |         return string
 33 | 
 34 |     def train_model(self, batch, current_step=0, pbar=None):
 35 |         self.ds_engine.module.train()
 36 |         loss, mle_acc = self.ds_engine(batch)
 37 |         # print("Begin backward, {}".format(int(os.getenv('RANK', '0'))))
 38 |         self.ds_engine.backward(loss)
 39 |         # print("After backward, {}".format(int(os.getenv('RANK', '0'))))
 40 |         self.ds_engine.step()
 41 |         # print("After optimizer step, {}".format(int(os.getenv('RANK', '0'))))
 42 |         pbar.set_description(f'[!] loss: {round(loss.item(), 4)}; token_acc: {round(mle_acc*100, 2)}')
 43 |         pbar.update(1)
 44 |         if self.args['local_rank'] == 0 and self.args['log_path'] and current_step % self.args['logging_step'] == 0:
 45 |             elapsed = pbar.format_dict['elapsed']
 46 |             rate = pbar.format_dict['rate']
 47 |             remaining = (pbar.total - pbar.n) / rate if rate and pbar.total else 0
 48 |             remaining = str(datetime.timedelta(seconds=remaining))
 49 |             logging.info(f'[!] progress: {round(pbar.n/pbar.total, 5)}; remaining time: {remaining}; loss: {round(loss.item(), 4)}; token_acc: {round(mle_acc*100, 2)}')
 50 |             
 51 |         mle_acc *= 100
 52 |         return mle_acc
 53 | 
 54 |     @torch.no_grad()
 55 |     def valid_model(self, batch):
 56 |         self.model.eval()
 57 |         loss, mle_acc = self.ds_engine(batch)
 58 |         return loss, mle_acc
 59 | 
 60 |     def _zero3_consolidated_16bit_state_dict(self):
 61 |         """
 62 |         Get a full non-partitioned state_dict with fp16 weights on cpu.
 63 |         Important: this function must be called on all ranks and not just rank 0.
 64 |         This is similar to nn.Module.state_dict (modelled after _save_to_state_dict), but:
 65 |         1. consolidates the weights from different partitions on gpu0
 66 |         2. works on one layer at a time to require as little gpu0 memory as possible, by
 67 |         moving the already consolidated weights to cpu
 68 |         3. takes care to keep the shared params shared when gradually copying the params to cpu
 69 |         Returns:
 70 |             a consolidated fp16 ``state_dict`` on cpu on rank 0, ``None`` on other ranks
 71 |         """
 72 |         state_dict = OrderedDict()
 73 |         shared_params = {}
 74 | 
 75 |         def get_layer_state_dict(module, prefix=""):
 76 |             # gather one layer at a time to be memory-efficient
 77 |             # must use modifier_rank=0 to release GPU memory after each layer gathered
 78 |             #see_memory_usage("before GatheredParameters", force=True)
 79 |             with deepspeed.zero.GatheredParameters(list(module.parameters(recurse=False)), modifier_rank=0):
 80 |                 if int(os.getenv('RANK', '0')) == 0:
 81 |                     # handle params
 82 |                     for name, param in module.named_parameters(recurse=False):
 83 |                         if param is None:
 84 |                             continue
 85 |                         key = prefix + name
 86 |                         if param.requires_grad:
 87 |                             # can't rely on param.data_ptr() as it will be reused as weights gets
 88 |                             # gathered and reduced, but param.ds_id is unique across all zero weights
 89 |                             # (and shared params will have the same param.ds_id)
 90 |                             if param.ds_id in shared_params:
 91 |                                 # shared weights
 92 |                                 # print(f"`{key}` is shared with `{shared_params[param.ds_id]}`")
 93 |                                 state_dict[key] = state_dict[shared_params[param.ds_id]]
 94 |                             else:
 95 |                                 state_dict[key] = param.detach().cpu()
 96 |                                 shared_params[param.ds_id] = key
 97 |                         # print(f"param {param.ds_id} {param.shape} {key} ")
 98 | 
 99 |                     # now buffers - not sure if need to take care of potentially shared weights here
100 |                     for name, buf in module.named_buffers(recurse=False):
101 |                         if (buf is not None and name not in module._non_persistent_buffers_set):
102 |                             state_dict[prefix + name] = buf.detach().cpu()
103 |                     #see_memory_usage("after GatheredParameters", force=True)
104 | 
105 |             for name, child in module.named_children():
106 |                 if child is not None:
107 |                     get_layer_state_dict(child, prefix + name + ".")
108 | 
109 |         # Prepare for checkpoint save by ensuring all parameters are partitioned
110 |         self.optimizer.checkpoint_event_prologue()
111 | 
112 |         # see_memory_usage("before get_layer_state_dict", force=False)
113 |         get_layer_state_dict(self.ds_engine.module, prefix="")
114 |         # see_memory_usage("after get_layer_state_dict", force=False)
115 | 
116 |         # self.ds_engine.optimizer.checkpoint_event_epilogue()
117 | 
118 |         return state_dict
119 |     
120 |     def save_model(self, path, current_step):
121 |         # only save trainable model parameters
122 |         if self.ds_engine.zero_gather_16bit_weights_on_model_save():
123 |             # state_dict = self.ds_engine._zero3_consolidated_16bit_state_dict()
124 |             checkpoint = self._zero3_consolidated_16bit_state_dict()
125 |         else:
126 |             checkpoint = OrderedDict()
127 |             for k, v in self.ds_engine.module.named_parameters():
128 |                 if v.requires_grad:
129 |                     checkpoint[k] = v
130 |         if int(os.getenv('RANK', '0')) == 0:
131 |             torch.save(checkpoint, f'{path}/pytorch_model_{current_step}.pt')
132 |             # save tokenizer
133 |             self.model.llama_tokenizer.save_pretrained(path)
134 |             # save configuration
135 |             self.model.llama_model.config.save_pretrained(path)
136 |             print(f'[!] save model into {path}')
137 | 
138 |     def load_stage_1_parameters(self, path):
139 |         delta_ckpt = torch.load(path, map_location=torch.device('cpu'))
140 |         self.model.load_state_dict(delta_ckpt, strict=False)
141 | 


--------------------------------------------------------------------------------
/model/eva_vit.py:
--------------------------------------------------------------------------------
  1 | # Based on EVA, BEIT, timm and DeiT code bases
  2 | # https://github.com/baaivision/EVA
  3 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm
  4 | # https://github.com/microsoft/unilm/tree/master/beit
  5 | # https://github.com/facebookresearch/deit/
  6 | # https://github.com/facebookresearch/dino
  7 | # --------------------------------------------------------'
  8 | import math
  9 | from functools import partial
 10 | 
 11 | import torch
 12 | import torch.nn as nn
 13 | import torch.nn.functional as F
 14 | import torch.utils.checkpoint as checkpoint
 15 | from timm.models.layers import drop_path, to_2tuple, trunc_normal_
 16 | from timm.models.registry import register_model
 17 | 
 18 | 
 19 | def _cfg(url='', **kwargs):
 20 |     return {
 21 |         'url': url,
 22 |         'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None,
 23 |         'crop_pct': .9, 'interpolation': 'bicubic',
 24 |         'mean': (0.5, 0.5, 0.5), 'std': (0.5, 0.5, 0.5),
 25 |         **kwargs
 26 |     }
 27 | 
 28 | 
 29 | class DropPath(nn.Module):
 30 |     """Drop paths (Stochastic Depth) per sample  (when applied in main path of residual blocks).
 31 |     """
 32 |     def __init__(self, drop_prob=None):
 33 |         super(DropPath, self).__init__()
 34 |         self.drop_prob = drop_prob
 35 | 
 36 |     def forward(self, x):
 37 |         return drop_path(x, self.drop_prob, self.training)
 38 |     
 39 |     def extra_repr(self) -> str:
 40 |         return 'p={}'.format(self.drop_prob)
 41 | 
 42 | 
 43 | class Mlp(nn.Module):
 44 |     def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
 45 |         super().__init__()
 46 |         out_features = out_features or in_features
 47 |         hidden_features = hidden_features or in_features
 48 |         self.fc1 = nn.Linear(in_features, hidden_features)
 49 |         self.act = act_layer()
 50 |         self.fc2 = nn.Linear(hidden_features, out_features)
 51 |         self.drop = nn.Dropout(drop)
 52 | 
 53 |     def forward(self, x):
 54 |         x = self.fc1(x)
 55 |         x = self.act(x)
 56 |         # x = self.drop(x)
 57 |         # commit this for the orignal BERT implement 
 58 |         x = self.fc2(x)
 59 |         x = self.drop(x)
 60 |         return x
 61 | 
 62 | 
 63 | class Attention(nn.Module):
 64 |     def __init__(
 65 |             self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0.,
 66 |             proj_drop=0., window_size=None, attn_head_dim=None):
 67 |         super().__init__()
 68 |         self.num_heads = num_heads
 69 |         head_dim = dim // num_heads
 70 |         if attn_head_dim is not None:
 71 |             head_dim = attn_head_dim
 72 |         all_head_dim = head_dim * self.num_heads
 73 |         self.scale = qk_scale or head_dim ** -0.5
 74 | 
 75 |         self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False)
 76 |         if qkv_bias:
 77 |             self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
 78 |             self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
 79 |         else:
 80 |             self.q_bias = None
 81 |             self.v_bias = None
 82 | 
 83 |         if window_size:
 84 |             self.window_size = window_size
 85 |             self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3
 86 |             self.relative_position_bias_table = nn.Parameter(
 87 |                 torch.zeros(self.num_relative_distance, num_heads))  # 2*Wh-1 * 2*Ww-1, nH
 88 |             # cls to token & token 2 cls & cls to cls
 89 | 
 90 |             # get pair-wise relative position index for each token inside the window
 91 |             coords_h = torch.arange(window_size[0])
 92 |             coords_w = torch.arange(window_size[1])
 93 |             coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
 94 |             coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
 95 |             relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
 96 |             relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
 97 |             relative_coords[:, :, 0] += window_size[0] - 1  # shift to start from 0
 98 |             relative_coords[:, :, 1] += window_size[1] - 1
 99 |             relative_coords[:, :, 0] *= 2 * window_size[1] - 1
100 |             relative_position_index = \
101 |                 torch.zeros(size=(window_size[0] * window_size[1] + 1, ) * 2, dtype=relative_coords.dtype)
102 |             relative_position_index[1:, 1:] = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
103 |             relative_position_index[0, 0:] = self.num_relative_distance - 3
104 |             relative_position_index[0:, 0] = self.num_relative_distance - 2
105 |             relative_position_index[0, 0] = self.num_relative_distance - 1
106 | 
107 |             self.register_buffer("relative_position_index", relative_position_index)
108 |         else:
109 |             self.window_size = None
110 |             self.relative_position_bias_table = None
111 |             self.relative_position_index = None
112 | 
113 |         self.attn_drop = nn.Dropout(attn_drop)
114 |         self.proj = nn.Linear(all_head_dim, dim)
115 |         self.proj_drop = nn.Dropout(proj_drop)
116 | 
117 |     def forward(self, x, rel_pos_bias=None):
118 |         B, N, C = x.shape
119 |         qkv_bias = None
120 |         if self.q_bias is not None:
121 |             qkv_bias = torch.cat((self.q_bias, torch.zeros_like(self.v_bias, requires_grad=False), self.v_bias))
122 |         # qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
123 |         qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
124 |         qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
125 |         q, k, v = qkv[0], qkv[1], qkv[2]   # make torchscript happy (cannot use tensor as tuple)
126 | 
127 |         q = q * self.scale
128 |         attn = (q @ k.transpose(-2, -1))
129 | 
130 |         if self.relative_position_bias_table is not None:
131 |             relative_position_bias = \
132 |                 self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
133 |                     self.window_size[0] * self.window_size[1] + 1,
134 |                     self.window_size[0] * self.window_size[1] + 1, -1)  # Wh*Ww,Wh*Ww,nH
135 |             relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
136 |             attn = attn + relative_position_bias.unsqueeze(0)
137 | 
138 |         if rel_pos_bias is not None:
139 |             attn = attn + rel_pos_bias
140 |         
141 |         attn = attn.softmax(dim=-1)
142 |         attn = self.attn_drop(attn)
143 | 
144 |         x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
145 |         x = self.proj(x)
146 |         x = self.proj_drop(x)
147 |         return x
148 | 
149 | 
class Block(nn.Module):
    """Pre-norm transformer block: attention and MLP, each with a residual.

    When ``init_values`` is a positive number, each branch is multiplied by a
    learnable per-channel gamma initialised to that value before the
    drop-path / residual add; otherwise no scaling is applied.
    """

    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
                 drop_path=0., init_values=None, act_layer=nn.GELU, norm_layer=nn.LayerNorm,
                 window_size=None, attn_head_dim=None):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop,
            window_size=window_size,
            attn_head_dim=attn_head_dim,
        )
        # stochastic depth on the residual branches; identity when disabled
        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()
        self.norm2 = norm_layer(dim)
        self.mlp = Mlp(
            in_features=dim,
            hidden_features=int(dim * mlp_ratio),
            act_layer=act_layer,
            drop=drop,
        )

        use_layer_scale = init_values is not None and init_values > 0
        if use_layer_scale:
            self.gamma_1 = nn.Parameter(init_values * torch.ones((dim)), requires_grad=True)
            self.gamma_2 = nn.Parameter(init_values * torch.ones((dim)), requires_grad=True)
        else:
            self.gamma_1 = None
            self.gamma_2 = None

    def forward(self, x, rel_pos_bias=None):
        """Run both branches; ``rel_pos_bias`` is forwarded to the attention."""
        scaled = self.gamma_1 is not None

        attn_branch = self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias)
        if scaled:
            attn_branch = self.gamma_1 * attn_branch
        x = x + self.drop_path(attn_branch)

        mlp_branch = self.mlp(self.norm2(x))
        if scaled:
            mlp_branch = self.gamma_2 * mlp_branch
        return x + self.drop_path(mlp_branch)
180 | 
181 | 
class PatchEmbed(nn.Module):
    """Image to patch embedding.

    A strided convolution (kernel size == stride == patch size) projects each
    patch to ``embed_dim`` channels; the spatial grid is then flattened into a
    token sequence.
    """

    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
        super().__init__()
        self.img_size = to_2tuple(img_size)
        self.patch_size = to_2tuple(patch_size)

        # Number of whole patches that fit along each image axis.
        grid_rows = self.img_size[0] // self.patch_size[0]
        grid_cols = self.img_size[1] // self.patch_size[1]
        self.patch_shape = (grid_rows, grid_cols)
        self.num_patches = grid_cols * grid_rows

        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=self.patch_size, stride=self.patch_size)

    def forward(self, x, **kwargs):
        """Embed a 4-D input whose last two dims equal the configured image size.

        Returns the projected patches with the spatial dims flattened and moved
        in front of the channel dim. Extra keyword arguments are ignored.
        """
        _, _, H, W = x.shape
        # FIXME look at relaxing size constraints
        assert H == self.img_size[0] and W == self.img_size[1], \
            f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
        patches = self.proj(x)
        tokens = patches.flatten(2)
        return tokens.transpose(1, 2)
204 | 
205 | 
class RelativePositionBias(nn.Module):
    """Learnable relative position bias over a (Wh, Ww) token grid plus one extra leading token.

    The bias table has one row per distinct 2-D offset between grid tokens,
    i.e. (2*Wh-1) * (2*Ww-1) rows, followed by three extra rows used for
    "cls to token", "token to cls" and "cls to cls". A fixed integer index
    buffer maps every (query, key) pair to its table row.
    """

    def __init__(self, window_size, num_heads):
        super().__init__()
        self.window_size = window_size
        win_h, win_w = window_size[0], window_size[1]

        self.num_relative_distance = (2 * win_h - 1) * (2 * win_w - 1) + 3
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros(self.num_relative_distance, num_heads))

        # Coordinates of every grid token, flattened to shape (2, Wh*Ww).
        grid = torch.stack(torch.meshgrid([torch.arange(win_h), torch.arange(win_w)]))
        flat = torch.flatten(grid, 1)

        # Pairwise offsets between tokens, shape (Wh*Ww, Wh*Ww, 2).
        offsets = (flat[:, :, None] - flat[:, None, :]).permute(1, 2, 0).contiguous()

        # Shift both offsets to start from 0, then fold (dh, dw) into a single row id.
        row_part = (offsets[:, :, 0] + (win_h - 1)) * (2 * win_w - 1)
        col_part = offsets[:, :, 1] + (win_w - 1)

        tokens = win_h * win_w + 1
        index = torch.zeros(size=(tokens, tokens), dtype=offsets.dtype)
        index[1:, 1:] = row_part + col_part
        # The last three table rows serve the extra token; assignment order matters for [0, 0].
        index[0, 0:] = self.num_relative_distance - 3
        index[0:, 0] = self.num_relative_distance - 2
        index[0, 0] = self.num_relative_distance - 1

        self.register_buffer("relative_position_index", index)

    def forward(self):
        """Return the bias gathered from the table, shaped (num_heads, Wh*Ww+1, Wh*Ww+1)."""
        tokens = self.window_size[0] * self.window_size[1] + 1
        gathered = self.relative_position_bias_table[self.relative_position_index.view(-1)]
        bias = gathered.view(tokens, tokens, -1)
        return bias.permute(2, 0, 1).contiguous()
243 | 
244 | 
class VisionTransformer(nn.Module):
    """Vision Transformer backbone that returns per-token features.

    This variant has no final norm and no classification head: ``forward``
    returns the output of the last block for every token (class token first).
    ``num_classes``, ``use_mean_pooling`` and ``init_scale`` are accepted for
    signature compatibility; only ``num_classes`` is stored, the other two are
    unused.

    Args:
        img_size, patch_size, in_chans, embed_dim: forwarded to ``PatchEmbed``.
        depth: number of ``Block`` layers.
        num_heads, mlp_ratio, qkv_bias, qk_scale, norm_layer, init_values:
            forwarded to every ``Block``.
        drop_rate: dropout after the position embedding and inside blocks.
        attn_drop_rate: attention dropout inside blocks.
        drop_path_rate: maximum stochastic-depth rate; blocks get a linearly
            increasing rate from 0 to this value.
        use_abs_pos_emb: create a learnable absolute position embedding.
        use_rel_pos_bias: give each block its own relative position bias
            (passes the patch grid as ``window_size``).
        use_shared_rel_pos_bias: create one ``RelativePositionBias`` shared by
            all blocks.
        use_checkpoint: run blocks through ``checkpoint.checkpoint`` in
            ``forward_features``.
    """

    def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dim=768, depth=12,
                 num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
                 drop_path_rate=0., norm_layer=nn.LayerNorm, init_values=None,
                 use_abs_pos_emb=True, use_rel_pos_bias=False, use_shared_rel_pos_bias=False,
                 use_mean_pooling=True, init_scale=0.001, use_checkpoint=False):
        super().__init__()
        self.image_size = img_size
        self.num_classes = num_classes
        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models

        self.patch_embed = PatchEmbed(
            img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
        num_patches = self.patch_embed.num_patches

        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        if use_abs_pos_emb:
            # One position per patch plus one for the class token.
            self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
        else:
            self.pos_embed = None
        self.pos_drop = nn.Dropout(p=drop_rate)

        if use_shared_rel_pos_bias:
            self.rel_pos_bias = RelativePositionBias(window_size=self.patch_embed.patch_shape, num_heads=num_heads)
        else:
            self.rel_pos_bias = None
        self.use_checkpoint = use_checkpoint

        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
        self.use_rel_pos_bias = use_rel_pos_bias
        self.blocks = nn.ModuleList([
            Block(
                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer,
                init_values=init_values, window_size=self.patch_embed.patch_shape if use_rel_pos_bias else None)
            for i in range(depth)])
        # NOTE: the final norm / fc_norm / classification head are intentionally not created here.

        if self.pos_embed is not None:
            trunc_normal_(self.pos_embed, std=.02)
        trunc_normal_(self.cls_token, std=.02)
        self.apply(self._init_weights)
        self.fix_init_weight()

    def fix_init_weight(self):
        """Divide each block's output projections by sqrt(2 * layer_number), layers counted from 1."""
        def rescale(param, layer_id):
            param.div_(math.sqrt(2.0 * layer_id))

        for layer_id, layer in enumerate(self.blocks):
            rescale(layer.attn.proj.weight.data, layer_id + 1)
            rescale(layer.mlp.fc2.weight.data, layer_id + 1)

    def _init_weights(self, m):
        """Initialise linear layers (truncated normal weight, zero bias) and LayerNorms (weight 1, bias 0)."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def get_classifier(self):
        """Return the classification head, or ``None`` if none has been attached.

        ``__init__`` does not create a head; one exists only after
        ``reset_classifier`` has been called.
        """
        return getattr(self, "head", None)

    def reset_classifier(self, num_classes, global_pool=''):
        """Attach a fresh head: a linear layer for ``num_classes > 0``, otherwise an identity.

        ``global_pool`` is accepted but unused. Note that ``forward`` does not
        apply the head.
        """
        self.num_classes = num_classes
        self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()

    def _embed_tokens(self, x):
        """Patch-embed ``x``, prepend the class token, add position embeddings (if any) and apply dropout."""
        x = self.patch_embed(x)
        batch_size = x.size(0)

        cls_tokens = self.cls_token.expand(batch_size, -1, -1)  # stole cls_tokens impl from Phil Wang, thanks
        x = torch.cat((cls_tokens, x), dim=1)
        if self.pos_embed is not None:
            x = x + self.pos_embed
        return self.pos_drop(x)

    def forward_features(self, x):
        """Run all blocks and return the token features of the last one (class token first)."""
        x = self._embed_tokens(x)

        rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None
        for blk in self.blocks:
            if self.use_checkpoint:
                x = checkpoint.checkpoint(blk, x, rel_pos_bias)
            else:
                x = blk(x, rel_pos_bias)
        return x

    def forward(self, x):
        """Return ``forward_features(x)``; no head is applied."""
        return self.forward_features(x)

    def get_intermediate_layers(self, x):
        """Return a list with the output of every block, in order (checkpointing is not used here)."""
        x = self._embed_tokens(x)

        features = []
        rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None
        for blk in self.blocks:
            x = blk(x, rel_pos_bias)
            features.append(x)

        return features
370 |     
371 |     
def interpolate_pos_embed(model, checkpoint_model):
    """Resize ``checkpoint_model['pos_embed']`` in place to match ``model``'s patch grid.

    Args:
        model: object exposing ``patch_embed.num_patches`` and ``pos_embed``.
        checkpoint_model: state dict; its ``'pos_embed'`` entry is replaced
            when the checkpoint grid size differs from the model's.

    Nothing happens when the checkpoint has no ``'pos_embed'``, when the model
    has no absolute position embedding (``pos_embed`` is ``None``), or when
    both grids already have the same size. Both grids are treated as square.
    The leading extra tokens (e.g. the class token) are copied unchanged; only
    the patch positions are bicubically interpolated.
    """
    if 'pos_embed' not in checkpoint_model:
        return
    if getattr(model, 'pos_embed', None) is None:
        # Model was built without absolute position embeddings: there is no
        # target size to interpolate to, so leave the checkpoint untouched.
        return

    pos_embed_checkpoint = checkpoint_model['pos_embed'].float()
    embedding_size = pos_embed_checkpoint.shape[-1]
    num_patches = model.patch_embed.num_patches
    num_extra_tokens = model.pos_embed.shape[-2] - num_patches
    # height (== width) for the checkpoint position embedding
    orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
    # height (== width) for the new position embedding
    new_size = int(num_patches ** 0.5)
    if orig_size == new_size:
        return

    print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size))
    # class_token and dist_token are kept unchanged
    extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
    # only the position tokens are interpolated
    pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
    # (1, N, C) -> (1, C, H, W) so the 2-D interpolation acts on the grid.
    pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2)
    pos_tokens = torch.nn.functional.interpolate(
        pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False)
    # Back to (1, H*W, C).
    pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
    checkpoint_model['pos_embed'] = torch.cat((extra_tokens, pos_tokens), dim=1)
394 |             
395 |             
def convert_weights_to_fp16(model: nn.Module):
    """Cast the weight and bias of every Conv1d / Conv2d / Linear layer in ``model`` to fp16, in place.

    Other layer types are left untouched.
    """
    castable_layers = (nn.Conv1d, nn.Conv2d, nn.Linear)

    for layer in model.modules():
        if not isinstance(layer, castable_layers):
            continue
        layer.weight.data = layer.weight.data.half()
        if layer.bias is not None:
            layer.bias.data = layer.bias.data.half()
412 |     
413 |     
def create_eva_vit_g(img_size=224, drop_path_rate=0.4, use_checkpoint=False, precision="fp16", ckpt_path=None):
    """Build the EVA ViT-g backbone and load its pretrained weights.

    Args:
        img_size: input resolution passed to ``VisionTransformer``.
        drop_path_rate: maximum stochastic-depth rate.
        use_checkpoint: enable activation checkpointing in the blocks.
        precision: when equal to ``"fp16"``, conv/linear weights are cast to
            half precision after loading.
        ckpt_path: path of the ``eva_vit_g.pth`` checkpoint. When ``None`` the
            historical hard-coded location is used, so existing callers are
            unaffected.

    Returns:
        The ``VisionTransformer`` with weights loaded non-strictly; the
        missing / unexpected keys are printed.
    """
    model = VisionTransformer(
        img_size=img_size,
        patch_size=14,
        use_mean_pooling=False,
        embed_dim=1408,
        depth=39,
        num_heads=1408//88,
        mlp_ratio=4.3637,
        qkv_bias=True,
        drop_path_rate=drop_path_rate,
        norm_layer=partial(nn.LayerNorm, eps=1e-6),
        use_checkpoint=use_checkpoint,
    )
    if ckpt_path is None:
        # Legacy default: machine-specific path kept only for backward compatibility.
        ckpt_path = "/home/gs534/rds/rds-t2-cs164-KQ4S3rlDzm8/gs534/opensource/favor/pretrained_ckpt/eva_vit_g.pth"
    # NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
    state_dict = torch.load(ckpt_path, map_location="cpu")
    # Resize the checkpoint's position embedding if img_size differs from the pretraining resolution.
    interpolate_pos_embed(model, state_dict)

    incompatible_keys = model.load_state_dict(state_dict, strict=False)
    print(incompatible_keys)

    if precision == "fp16":
        convert_weights_to_fp16(model)
    return model
439 | 


--------------------------------------------------------------------------------
/model/modeling_llama.py:
--------------------------------------------------------------------------------
  1 | # This script is based on https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py
  2 | 
  3 | """ PyTorch LLaMA model."""
  4 | import math
  5 | from typing import List, Optional, Tuple, Union
  6 | 
  7 | import torch
  8 | import torch.utils.checkpoint
  9 | from torch import nn
 10 | from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 11 | 
 12 | from transformers.activations import ACT2FN
 13 | from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
 14 | from transformers.modeling_utils import PreTrainedModel
 15 | from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
 16 | from transformers.models.llama.configuration_llama import LlamaConfig
 17 | 
 18 | 
# Module-level logger obtained through the transformers logging utilities.
logger = logging.get_logger(__name__)

# Name of the config class as a string; presumably consumed by the docstring
# decorators imported above (not referenced in the visible part of this file).
_CONFIG_FOR_DOC = "LlamaConfig"
 22 | 
 23 | 
 24 | # Copied from transformers.models.bart.modeling_bart._make_causal_mask
 25 | def _make_causal_mask(
 26 |     input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
 27 | ):
 28 |     """
 29 |     Make causal mask used for bi-directional self-attention.
 30 |     """
 31 |     bsz, tgt_len = input_ids_shape
 32 |     mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min, device=device), device=device)
 33 |     mask_cond = torch.arange(mask.size(-1), device=device)
 34 |     mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
 35 |     mask = mask.to(dtype)
 36 | 
 37 |     if past_key_values_length > 0:
 38 |         mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
 39 |     return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
 40 | 
 41 | 
 42 | # Copied from transformers.models.bart.modeling_bart._expand_mask
 43 | def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
 44 |     """
 45 |     Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
 46 |     """
 47 |     bsz, src_len = mask.size()
 48 |     tgt_len = tgt_len if tgt_len is not None else src_len
 49 | 
 50 |     expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
 51 | 
 52 |     inverted_mask = 1.0 - expanded_mask
 53 | 
 54 |     return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
 55 | 
 56 | 
 57 | class LlamaRMSNorm(nn.Module):
 58 |     def __init__(self, hidden_size, eps=1e-6):
 59 |         """
 60 |         LlamaRMSNorm is equivalent to T5LayerNorm
 61 |         """
 62 |         super().__init__()
 63 |         self.weight = nn.Parameter(torch.ones(hidden_size))
 64 |         self.variance_epsilon = eps
 65 | 
 66 |     def forward(self, hidden_states):
 67 |         variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
 68 |         hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
 69 | 
 70 |         # convert into half-precision if necessary
 71 |         if self.weight.dtype in [torch.float16, torch.bfloat16]:
 72 |             hidden_states = hidden_states.to(self.weight.dtype)
 73 | 
 74 |         return self.weight * hidden_states
 75 | 
 76 | 
 77 | class LlamaRotaryEmbedding(torch.nn.Module):
 78 |     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
 79 |         super().__init__()
 80 |         inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim))
 81 |         self.register_buffer("inv_freq", inv_freq)
 82 | 
 83 |         # Build here to make `torch.jit.trace` work.
 84 |         self.max_seq_len_cached = max_position_embeddings
 85 |         t = torch.arange(self.max_seq_len_cached, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
 86 |         freqs = torch.einsum("i,j->ij", t, self.inv_freq)
 87 |         # Different from paper, but it uses a different permutation in order to obtain the same calculation
 88 |         emb = torch.cat((freqs, freqs), dim=-1)
 89 |         self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False)
 90 |         self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False)
 91 | 
 92 |     def forward(self, x, seq_len=None):
 93 |         # x: [bs, num_attention_heads, seq_len, head_size]
 94 |         # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case.
 95 |         if seq_len > self.max_seq_len_cached:
 96 |             self.max_seq_len_cached = seq_len
 97 |             t = torch.arange(self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype)
 98 |             freqs = torch.einsum("i,j->ij", t, self.inv_freq)
 99 |             # Different from paper, but it uses a different permutation in order to obtain the same calculation
100 |             emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
101 |             self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False)
102 |             self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False)
103 |         return (
104 |             self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
105 |             self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
106 |         )
107 | 
108 | 
def rotate_half(x):
    """Swap the two halves of the last dimension, negating the half that moves to the front."""
    midpoint = x.shape[-1] // 2
    front, back = x[..., :midpoint], x[..., midpoint:]
    return torch.cat((-back, front), dim=-1)
114 | 
115 | 
def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
    """Rotate ``q`` and ``k`` using the rows of ``cos`` / ``sin`` selected by ``position_ids``.

    ``position_ids`` is indexed as a 2-D tensor; its entries pick rows along
    dim 2 of ``cos`` and ``sin``. Returns the rotated ``(q, k)`` pair.
    """
    batch = position_ids.shape[0]
    # [bs, seq_len] -> [bs, 1, seq_len, 1], then tiled to cover dims 1 and 3 of the tables.
    row_index = position_ids[:, None, :, None].repeat(1, cos.shape[1], 1, cos.shape[3])

    def select_rows(table):
        # Tile the table over the batch so every sample can gather its own positions.
        return torch.gather(table.repeat(batch, 1, 1, 1), 2, row_index)

    cos_rows = select_rows(cos)
    sin_rows = select_rows(sin)

    q_embed = (q * cos_rows) + (rotate_half(q) * sin_rows)
    k_embed = (k * cos_rows) + (rotate_half(k) * sin_rows)
    return q_embed, k_embed
124 | 
125 | 
class LlamaMLP(nn.Module):
    """Gated feed-forward block: ``down_proj(act_fn(gate_proj(x)) * up_proj(x))``.

    All three projections are bias-free linear layers; the activation is
    looked up in ``ACT2FN`` by the name ``hidden_act``.
    """

    def __init__(
        self,
        hidden_size: int,
        intermediate_size: int,
        hidden_act: str,
    ):
        super().__init__()
        self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.act_fn = ACT2FN[hidden_act]

    def forward(self, x):
        """Apply the gated projection to ``x`` and project back to the hidden size."""
        gate = self.act_fn(self.gate_proj(x))
        gated = gate * self.up_proj(x)
        return self.down_proj(gated)
141 | 
142 | 
class LlamaAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper, with rotary position embeddings.

    Queries and keys are rotated by ``apply_rotary_pos_emb`` before the scaled
    dot product. All projections are bias-free.
    """

    def __init__(self, config: LlamaConfig):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.max_position_embeddings = config.max_position_embeddings

        if (self.head_dim * self.num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )
        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
        self.rotary_emb = LlamaRotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        """Reshape ``[bsz, seq_len, hidden]`` into a contiguous ``[bsz, num_heads, seq_len, head_dim]``."""
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """
        Args:
            hidden_states: input of shape `(bsz, q_len, hidden_size)`.
            attention_mask: optional additive mask of shape `(bsz, 1, q_len, kv_seq_len)`.
            position_ids: positions of the `q_len` new tokens, used to pick rotary cos/sin rows.
            past_key_value: optional `(key, value)` pair from earlier steps; concatenated in front of
                the new keys/values along the sequence dimension.
            output_attentions: return the attention probabilities as the second element.
            use_cache: return the updated `(key, value)` pair as the third element.

        Returns:
            `(attn_output, attn_weights or None, past_key_value or None)`.

        Raises:
            ValueError: if the attention weights, mask or output have an unexpected shape.
        """
        bsz, q_len, _ = hidden_states.size()

        # Project and split into heads: [bsz, num_heads, q_len, head_dim].
        query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)

        # Total key/value length includes any cached positions.
        kv_seq_len = key_states.shape[-2]
        if past_key_value is not None:
            kv_seq_len += past_key_value[0].shape[-2]
        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
        # [bsz, nh, t, hd]

        if past_key_value is not None:
            # reuse k, v, self_attention
            key_states = torch.cat([past_key_value[0], key_states], dim=2)
            value_states = torch.cat([past_key_value[1], value_states], dim=2)

        past_key_value = (key_states, value_states) if use_cache else None

        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)

        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights + attention_mask
            # Adding the mask can push scores below the dtype minimum; clamp them back.
            # The bound is created on the scores' device to avoid mixing devices.
            lowest = torch.tensor(torch.finfo(attn_weights.dtype).min, device=attn_weights.device)
            attn_weights = torch.max(attn_weights, lowest)

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        # Merge heads back: [bsz, q_len, hidden_size].
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value
233 | 
234 | 
class LlamaDecoderLayer(nn.Module):
    """One decoder layer: RMS-normed self-attention and RMS-normed MLP, each with a residual connection."""

    def __init__(self, config: LlamaConfig):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.self_attn = LlamaAttention(config=config)
        self.mlp = LlamaMLP(
            hidden_size=self.hidden_size,
            intermediate_size=config.intermediate_size,
            hidden_act=config.hidden_act,
        )
        self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            position_ids (`torch.LongTensor`, *optional*): forwarded unchanged to the self-attention module.
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).

        Returns:
            A tuple starting with the new hidden states, followed by the attention weights when
            `output_attentions` is set, followed by the present key/value pair when `use_cache` is set.
        """
        # Self-attention sub-layer (pre-norm) with residual connection.
        attn_input = self.input_layernorm(hidden_states)
        attn_output, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=attn_input,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
        )
        hidden_states = hidden_states + attn_output

        # Feed-forward sub-layer (pre-norm) with residual connection.
        mlp_output = self.mlp(self.post_attention_layernorm(hidden_states))
        hidden_states = hidden_states + mlp_output

        # Optional extras are appended in a fixed order: attentions first, cache second.
        outputs = (hidden_states,)
        if output_attentions:
            outputs = outputs + (self_attn_weights,)
        if use_cache:
            outputs = outputs + (present_key_value,)
        return outputs
301 | 
302 | 
# Shared docstring preamble; passed to `add_start_docstrings` when decorating the model classes below.
LLAMA_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`LlamaConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
318 | 
319 | 
# Base class for the LLaMA models in this file: binds the config class, declares loading /
# checkpointing metadata and provides weight initialisation. (Documented with comments rather
# than a class docstring so the docstring assembled by the decorator is left as it was.)
@add_start_docstrings(
    "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
    LLAMA_START_DOCSTRING,
)
class LlamaPreTrainedModel(PreTrainedModel):
    config_class = LlamaConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["LlamaDecoderLayer"]
    _keys_to_ignore_on_load_unexpected = [r"decoder\.version"]

    def _init_weights(self, module):
        # Linear and embedding weights are drawn from N(0, initializer_range); every other
        # module type is left alone.
        if not isinstance(module, (nn.Linear, nn.Embedding)):
            return

        spread = self.config.initializer_range
        module.weight.data.normal_(mean=0.0, std=spread)

        if isinstance(module, nn.Linear):
            # Linear biases, when present, start at zero.
            if module.bias is not None:
                module.bias.data.zero_()
        elif module.padding_idx is not None:
            # The embedding row of the padding token starts at zero.
            module.weight.data[module.padding_idx].zero_()

    def _set_gradient_checkpointing(self, module, value=False):
        # Only LlamaModel instances carry the gradient_checkpointing flag.
        if not isinstance(module, LlamaModel):
            return
        module.gradient_checkpointing = value
345 | 
346 | 
# Shared argument documentation. It is attached to the `forward` methods of
# `LlamaModel` and `LlamaForCausalLM` through
# `add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)`.
#
# NOTE(review): parts of this text mention `decoder_input_ids`, cross-attention
# blocks and head masking, which looks carried over from another model's
# documentation -- confirm before relying on it. The `query_embeds` and
# `modality_lengths` arguments accepted by both `forward` methods are not
# described here.
LLAMA_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.

            [What are position IDs?](../glossary#position-ids)
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
409 | 
410 | 
@add_start_docstrings(
    "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
    LLAMA_START_DOCSTRING,
)
class LlamaModel(LlamaPreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]

    Args:
        config: LlamaConfig
    """

    def __init__(self, config: LlamaConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList([LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the token embedding module."""
        return self.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the token embedding module with `value`."""
        self.embed_tokens = value

    # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
    def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
        """Combine the causal mask with the expanded padding mask.

        A causal mask is requested from `_make_causal_mask` only when the
        target length `input_shape[-1]` is greater than 1. When
        `attention_mask` is given it is expanded with `_expand_mask` and added
        to the causal mask (or used alone if there is none).

        Returns:
            The combined mask, or `None` when neither component was built.
        """
        # create causal mask
        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
        combined_attention_mask = None
        if input_shape[-1] > 1:
            combined_attention_mask = _make_causal_mask(
                input_shape,
                inputs_embeds.dtype,
                device=inputs_embeds.device,
                past_key_values_length=past_key_values_length,
            )

        if attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
                inputs_embeds.device
            )
            combined_attention_mask = (
                expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
            )

        return combined_attention_mask

    def _prepare_modality_attention_mask(
        self, attention_mask, batch_size, seq_length, inputs_embeds, modality_lengths
    ):
        """Build the additive mask used when a modality prefix is present.

        The first `modality_lengths` positions are treated as one block:

        * rows of the block may look at every column of the block, except the
          columns that `attention_mask` marks with 0;
        * rows of the block never look at the columns after the block;
        * the remaining rows use `_prepare_decoder_attention_mask` with the
          block length passed as `past_key_values_length`.

        Blocked entries built here are filled with `-1e9`.

        Args:
            attention_mask: 2D padding mask covering the full sequence
                (1/True = keep, 0/False = masked).
            batch_size: Batch dimension of the inputs.
            seq_length: Full sequence length, prefix included.
            inputs_embeds: Input embeddings; supply dtype and device.
            modality_lengths: Number of leading positions forming the block.

        Returns:
            Tensor of shape `(batch_size, 1, seq_length, seq_length)`.
        """
        prefix_padding = attention_mask[:, :modality_lengths]
        if prefix_padding.dtype == torch.bool:
            # `1 - mask` is not defined for bool tensors in PyTorch. The default
            # mask created in `forward` is bool, so convert before inverting.
            prefix_padding = prefix_padding.to(inputs_embeds.dtype)
        attn_mask_blk1 = (1 - prefix_padding).unsqueeze(1).unsqueeze(1).repeat(1, 1, modality_lengths, 1) * -1e9

        # Rows for the positions after the block.
        suffix_mask = self._prepare_decoder_attention_mask(
            attention_mask, (batch_size, seq_length - modality_lengths), inputs_embeds, modality_lengths
        )

        # Block rows may not attend to anything after the block.
        attn_mask_blk2 = suffix_mask.new_ones(batch_size, 1, modality_lengths, seq_length - modality_lengths) * -1e9

        attn_mask_blk = torch.cat([attn_mask_blk1, attn_mask_blk2], dim=-1)
        return torch.cat([attn_mask_blk, suffix_mask], dim=2)

    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        query_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        modality_lengths: int = 0,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        """
        Additional arguments:
            query_embeds (`torch.FloatTensor`, *optional*):
                Embeddings concatenated in front of the token embeddings along the sequence dimension (`dim=1`).
                The sequence length used for position ids and masks then includes them.
            modality_lengths (`int`, defaults to 0):
                When greater than 0, the first `modality_lengths` positions of the sequence are masked as a single
                block that attends within itself and not to later positions; later positions are masked with the
                block length passed as the past length. `past_key_values` length is not taken into account for the
                mask in this mode.

        Raises:
            ValueError: if both or neither of `input_ids` and `inputs_embeds` are given.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # retrieve input_ids and inputs_embeds
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
        elif input_ids is not None:
            batch_size, seq_length = input_ids.shape
        elif inputs_embeds is not None:
            batch_size, seq_length, _ = inputs_embeds.shape
        else:
            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)
        if query_embeds is not None:
            # Prepend the extra embeddings; the sequence length now includes them.
            inputs_embeds = torch.cat([query_embeds, inputs_embeds], dim=1)
            batch_size, seq_length, _ = inputs_embeds.shape

        seq_length_with_past = seq_length
        past_key_values_length = 0

        if past_key_values is not None:
            past_key_values_length = past_key_values[0][0].shape[2]
            seq_length_with_past = seq_length_with_past + past_key_values_length

        if position_ids is None:
            device = input_ids.device if input_ids is not None else inputs_embeds.device
            position_ids = torch.arange(
                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
            )
            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
        else:
            position_ids = position_ids.view(-1, seq_length).long()

        # Default padding mask: attend to every position (cached ones included).
        if attention_mask is None:
            attention_mask = torch.ones(
                (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
            )
        if modality_lengths > 0:
            attention_mask = self._prepare_modality_attention_mask(
                attention_mask, batch_size, seq_length, inputs_embeds, modality_lengths
            )
        else:
            attention_mask = self._prepare_decoder_attention_mask(
                attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
            )

        hidden_states = inputs_embeds

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = () if use_cache else None

        for idx, decoder_layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            past_key_value = past_key_values[idx] if past_key_values is not None else None

            if self.gradient_checkpointing and self.training:

                def create_custom_forward(module):
                    def custom_forward(*inputs):
                        # None for past_key_value
                        return module(*inputs, output_attentions, None)

                    return custom_forward

                layer_outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(decoder_layer),
                    hidden_states,
                    attention_mask,
                    position_ids,
                    None,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_value,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                # The cache entry follows the attention weights when those are returned.
                next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None
        if not return_dict:
            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )
608 | 
609 | 
class LlamaForCausalLM(LlamaPreTrainedModel):
    """`LlamaModel` followed by a bias-free linear head projecting hidden states to vocabulary logits."""

    def __init__(self, config):
        super().__init__(config)
        self.model = LlamaModel(config)

        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the decoder's token embedding module."""
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the decoder's token embedding module with `value`."""
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        """Return the language-modeling head."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Replace the language-modeling head with `new_embeddings`."""
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        """Replace the wrapped decoder model."""
        self.model = decoder

    def get_decoder(self):
        """Return the wrapped decoder model."""
        return self.model

    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        query_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        modality_lengths: int = 0,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
            query_embeds (`torch.FloatTensor`, *optional*):
                Forwarded unchanged to the decoder, which prepends them to the token embeddings.
            modality_lengths (`int`, defaults to 0):
                Forwarded unchanged to the decoder, where it selects the block-structured attention mask.

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, LlamaForCausalLM

        >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            query_embeds=query_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            modality_lengths=modality_lengths,
        )

        hidden_states = outputs[0]
        logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self, input_ids, query_embeds=None, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
    ):
        """Assemble the keyword arguments for one generation step.

        Args:
            input_ids: Token ids generated so far; only the last one is kept
                once a cache is available.
            query_embeds: Extra embeddings for the decoder to prepend. They are
                passed on only while there is no cache.
            past_key_values: Cached key/value states from earlier steps, if any.
            attention_mask: Optional 2D padding mask; also used to derive
                `position_ids` when none are supplied.
            inputs_embeds: Optional embeddings used instead of `input_ids` on
                the first step only.
            **kwargs: `position_ids` and `use_cache` are read from here.

        Returns:
            Dict with `input_ids` or `inputs_embeds`, plus `position_ids`,
            `query_embeds`, `past_key_values`, `use_cache` and `attention_mask`.
        """
        if past_key_values:
            input_ids = input_ids[:, -1:]
            # The decoder prepends `query_embeds` to its inputs, so they must be
            # sent only on the first step. Clear them whenever a cache exists,
            # regardless of whether an attention mask or position ids were given.
            query_embeds = None

        position_ids = kwargs.get("position_ids", None)
        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -1].unsqueeze(-1)

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs.update(
            {
                "position_ids": position_ids,
                "query_embeds": query_embeds,
                "past_key_values": past_key_values,
                "use_cache": kwargs.get("use_cache"),
                "attention_mask": attention_mask,
            }
        )
        return model_inputs

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        """Return a copy of the cache with every state re-indexed along dim 0 by `beam_idx`."""
        return tuple(
            tuple(past_state.index_select(0, beam_idx) for past_state in layer_past)
            for layer_past in past_key_values
        )
768 | 
769 | 


--------------------------------------------------------------------------------