├── .dockerignore ├── .env ├── .gitignore ├── Dockerfile └── Dockerfile-hpu ├── LICENSE ├── README.md ├── app ├── .env └── README.md ├── docker_build.sh ├── docker_run-hpu.sh ├── exp ├── .gitignore └── example │ ├── 0.png │ ├── 1.png │ └── example_descriptor.json ├── media ├── ISC.png ├── blog.png ├── library.jpg ├── livingRoom.jpg ├── pano.png ├── pipeline.png ├── snow.jpg └── underwater.jpeg ├── mm_pano ├── __init__.py ├── lib │ ├── Equirec2Perspec.py │ ├── Perspec2Equirec.py │ └── multi_Perspec2Equirec.py ├── mmpano.py ├── tgi_gaudi │ ├── README.md │ ├── run_tgi_gaudi.sh │ └── test_tgi.sh └── utils │ ├── common.py │ ├── image_utils.py │ ├── llm_engines.py │ └── model_utils.py ├── requirements-api.txt ├── requirements-hpu.txt └── requirements.txt /.dockerignore: -------------------------------------------------------------------------------- 1 | * 2 | !.env 3 | !Dockerfile 4 | !app 5 | !exp 6 | !mm_pano 7 | !requirements* 8 | 9 | mm_pano/tgi_gaudi/data/ 10 | -------------------------------------------------------------------------------- /.env: -------------------------------------------------------------------------------- 1 | IMAGE_NAME=mm_pano 2 | IMAGE_TAG=latest 3 | CONTAINER_NAME=mm_pano 4 | 5 | # Backend API 6 | API_HOST="127.0.0.1" 7 | API_PORT="8010" 8 | API_TIMEOUT=300 9 | API_RESPONSE_PANO_NAME="pano" 10 | API_RESPONSE_VIDEO_NAME="panorama_video" 11 | 12 | # Frontend UI 13 | WEBAPP_NAME="Language Model Assisted Generation of Images with Coherence" 14 | WEBAPP_HOST="127.0.0.1" 15 | WEBAPP_PORT="8100" 16 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | **/__pycache__ 2 | -------------------------------------------------------------------------------- /Dockerfile/Dockerfile-hpu: -------------------------------------------------------------------------------- 1 | # From vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest 2 | From vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest 3 | 4 | WORKDIR /app 5 | 6 | RUN apt-get update && \ 7 | apt-get upgrade -y && \ 8 | apt-get install -y \ 9 | tmux 10 | ##################### 11 | # Multimodal Panorama Generation source code 12 | ##################### 13 | # Common requirements 14 | COPY requirements.txt /app/requirements.txt 15 | RUN pip3 install --upgrade pip && \ 16 | pip3 install -r requirements.txt 17 | 18 | # Copy code 19 | COPY mm_pano /app/mm_pano 20 | COPY exp /app/exp 21 | 22 | ###################### 23 | # Application frontend 24 | ###################### 25 | COPY requirements-api.txt /app/requirements-api.txt 26 | RUN pip3 install -r requirements-api.txt 27 | COPY app /app/app 28 | COPY .env /app/.env 29 | 30 | ########################### 31 | # HPU specific requirements 32 | ########################### 33 | COPY requirements-hpu.txt /app 34 | RUN pip3 install -r requirements-hpu.txt 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # [CVPR 2024] Official implementation of the paper: "L-MAGIC: Language Model Assisted Generation of Images with Coherence" 2 | We present a novel method that can generate 360 degree panorama from different types of zero-shot inputs (e.g., a single image, text description, hand-drawing etc.). Our Huggingface space is now available. Feel free to try it out! 3 | 4 |
5 | 6 |
7 | 8 | - [Paper](https://arxiv.org/abs/2406.01843) 9 | - [Project Page](https://zhipengcai.github.io/MMPano/) 10 | - [YouTube Video](https://youtu.be/XDMNEzH4-Ec) 11 | - [Hugging Face demo (now available!)](https://huggingface.co/spaces/MMPano/MMPano) 12 | 13 | ## Industrial Impact 14 | 15 | - Our work has been selected as **one of the 5 Intel featured live demos** at [ISC HPC 2024](https://www.intel.com/content/www/us/en/events/supercomputing.html). 16 | - Our work has been featured on the [Intel Community Blog](https://community.intel.com/t5/Blogs/Tech-Innovation/Artificial-Intelligence-AI/Advancing-Gen-AI-on-Intel-Gaudi-AI-Accelerators-with-Multi-Modal/post/1603746)! 17 | - Our work has been featured on [Intel Labs' LinkedIn](https://www.linkedin.com/feed/update/urn:li:activity:7203797143831076864/)! 18 | 19 | 
20 | 21 |
22 |
23 | 24 |
25 | 26 | ## 📌 Reference 27 | 28 | ```bibtex 29 | @inproceedings{ 30 | zhipeng2024lmagic, 31 | title={L-MAGIC: Language Model Assisted Generation of Images with Coherence}, 32 | author={Zhipeng Cai and Matthias Müller and Reiner Birkl and Diana Wofk and Shao-Yen Tseng and JunDa Cheng and Gabriela Ben-Melech Stan and Vasudev Lal and Michael Paulitsch}, 33 | booktitle={The IEEE/CVF Conference on Computer Vision and Pattern Recognition}, 34 | year={2024} 35 | } 36 | ``` 37 | 38 | ## ⭐️ Show Your Support 39 | 40 | If you find this project helpful or interesting, please consider giving it a star! Your support is greatly appreciated and helps others discover the project. 41 | 42 | ## Environment 43 | 44 | This code has been tested on Linux with Python 3.9. It should also be compatible with other Python versions. 45 | 46 | 47 | ## Run on Intel Gaudi 48 | 49 | This codebase has been developed and deployed on Intel Gaudi on the Intel Developer Cloud: 50 | 51 | - [Intel Gaudi](https://habana.ai/) 52 | - [Intel Developer Cloud](https://www.intel.com/content/www/us/en/developer/tools/devcloud/overview.html) 53 | 54 | 55 | #### Setup Docker environment 56 | ```bash 57 | # Build the docker image 58 | ./docker_build.sh 59 | 60 | # Start the container. Following the instructions in the script, you may modify 61 | # `HABANA_VISIBLE_DEVICES` and `HABANA_VISIBLE_MODULES` to run on a different Gaudi device. 62 | ./docker_run-hpu.sh 63 | ``` 64 | 65 | 66 | ## Run on other devices 67 | 68 | You can also run the code on an NVIDIA GPU, after setting up a proper NVIDIA environment with PyTorch installed (e.g., via `conda`, `venv`, or `docker`). 69 | 70 | Install the necessary packages by running the following command: 71 | 72 | ```bash 73 | pip install -r requirements.txt 74 | ``` 75 | 76 | 77 | ## Run the code 78 | #### Note 79 | - If you are running on Gaudi, the first runs will be slower because Gaudi requires at least 2 warmup cycles. If you want to build your own application using this codebase, please warm up the Gaudi device at least 2 times. 80 | 81 | - The best performance is enabled by using ChatGPT as the LLM controller, which requires you to apply for an [OpenAI API key](https://platform.openai.com/docs/overview). 82 | 83 | - If you are in an area that cannot access the ChatGPT API, we also provide a way to use a free, open-source LLM controller (e.g., Llama3). Please see below for instructions on how to enable it. You may need to set `HF_TOKEN` or pass a Hugging Face token. Feel free to also contribute to the code and enable other LLMs. 84 | 85 | #### (Optional) Start a TGI LLM server 86 | 87 | If you want to use TGI for LLM serving, the code provides a script to pull the docker image and start a TGI LLM server on Gaudi. Once the TGI server is running, please make sure to pass `--llm_model_name tgi` when running the MM Pano command line in the next step. 88 | 89 | We have only validated the listed LLM models ("meta-llama/Meta-Llama-3-8B-Instruct", "mistralai/Mistral-7B-Instruct-v0.2"). We encourage users to try out new models and add them to the supported list. 90 | 91 | ```bash 92 | # Modify the model name and pass a Hugging Face token if needed. You can also change `num_shard` if you like. 93 | vi mm_pano/tgi_gaudi/run_tgi_gaudi.sh 94 | 95 | # Pull the image and start the TGI-Gaudi container 96 | (cd mm_pano/tgi_gaudi && ./run_tgi_gaudi.sh) 97 | ``` 98 | 99 | If you want to run TGI on another device, please make sure the server is reachable at the default TGI URL and port, `http://127.0.0.1:8080`. 
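Once the server is up, it can be sanity-checked before launching the pipeline. The request below is adapted from `mm_pano/tgi_gaudi/test_tgi.sh` and assumes the default port mapping (`-p 8080:80`) used by `run_tgi_gaudi.sh`:

```bash
# Quick health check of the local TGI endpoint (adapted from test_tgi.sh)
curl http://127.0.0.1:8080/generate \
    -X POST \
    -d '{"inputs":"What is deep learning?","parameters":{"max_new_tokens":64}}' \
    -H 'Content-Type: application/json'
```

A JSON response containing a `generated_text` field indicates the server is ready to be used with `--llm_model_name tgi`.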
100 | 101 | 102 | #### Command 103 | There are several options when running the code. A simple example for the following setup: 104 | 105 | - image-to-panorama task 106 | - ChatGPT LLM (GPT-4) 107 | - Gaudi accelerator as the hardware 108 | 109 | ```bash 110 | python3 mm_pano/mmpano.py \ 111 | --init_image exp/example/0.png \ 112 | --output_folder exp/outputs \ 113 | --dtype bfloat16 --device hpu \ 114 | --llm_model_name gpt-4 \ 115 | --api_key <your OpenAI API key> \ 116 | --save_pano_img \ 117 | --gen_video 118 | ``` 119 | 120 | Here `--save_pano_img` saves the generated panorama image and `--gen_video` additionally generates and saves the video. To change the setup, e.g.: 121 | - to perform "text-to-panorama", change `--init_image exp/example/0.png` to `--init_prompt 'maple autumn forest'`; `--init_prompt` can also be used together with `--init_image` to provide a user-specified scene description (see the sketch below). 122 | - to use other LLMs, change `--llm_model_name gpt-4` to `--llm_model_name [other LLM name]`. Currently the available choices are `"gpt-4", "gpt-3.5-turbo", "meta-llama/Meta-Llama-3-8B-Instruct", "mistralai/Mistral-7B-Instruct-v0.2", "tgi"`, 123 | where `tgi` can be a [TGI Gaudi](https://github.com/huggingface/tgi-gaudi) or [TGI](https://github.com/huggingface/text-generation-inference) server to run bigger models like Llama3-70B. Note that `--api_key` is only used for the GPT models. 124 | - to use CUDA, change `--device hpu` to `--device cuda`. 125 | - to specify the camera intrinsics for the input image, add `--intrinsic float,float,float,float`. 126 | 127 | ## Results (see more on our project page and paper) 128 | 129 | After running the code, the output folder (exp/outputs) will contain a panoramic image "pano.png" (see below for examples) and an immersive video "video.mp4". 130 | 131 | 
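For reference, a text-to-panorama run that populates this output folder could be launched with a command along the following lines. This is an illustrative sketch assembled from the options listed above — the prompt, the choice of the open-source Llama 3 controller, and the `HF_TOKEN` hint are examples, not a separately validated recipe:

```bash
# Gated Hugging Face models may require a token, e.g. export HF_TOKEN=...
python3 mm_pano/mmpano.py \
    --text2pano \
    --init_prompt 'maple autumn forest' \
    --output_folder exp/outputs \
    --dtype bfloat16 --device hpu \
    --llm_model_name meta-llama/Meta-Llama-3-8B-Instruct \
    --save_pano_img \
    --gen_video
```

Swapping `--llm_model_name meta-llama/Meta-Llama-3-8B-Instruct` for `--llm_model_name tgi` routes text generation through the TGI server from the previous section instead.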
132 | 133 |
134 | 135 |
136 | 137 |
138 | 139 |
140 | 141 |
142 | 143 |
144 | 145 |
146 | 147 |
148 | 149 |
150 | 151 | 152 | ## Contact 153 | 154 | Feel free to send an email to Zhipeng (czptc2h@gmail.com) or Joey (Tien Pei) Chou (joey.t.p.chou@gmail.com) if you have any questions and comments. 155 | 156 | ## 📈 Star History 157 | 158 | [![Star History Chart](https://api.star-history.com/svg?repos=IntelLabs/MMPano&type=Date)](https://star-history.com/#IntelLabs/MMPano) 159 | -------------------------------------------------------------------------------- /app/.env: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/MMPano/b176d26bc3311584e746fe88135e8d5e793d3107/app/.env -------------------------------------------------------------------------------- /app/README.md: -------------------------------------------------------------------------------- 1 | Placeholder for Multimodal Panorama API 2 | -------------------------------------------------------------------------------- /docker_build.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | while [ "$1" != "" ]; 3 | do 4 | case $1 in 5 | -d | --device) 6 | DEVICE=$2 7 | shift 8 | ;; 9 | -h | --help ) 10 | echo "Build the docker image for Multimodal Panorama Generation" 11 | echo "Usage: docker_build.sh [OPTIONS]" 12 | echo "OPTION includes:" 13 | echo " -d | --device - Supported device [hpu]" 14 | exit 15 | ;; 16 | * ) 17 | echo "Invalid option: $1" 18 | echo "Please do 'docker_build.sh -h'" 19 | exit 20 | ;; 21 | esac 22 | shift 23 | done 24 | 25 | CUR_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" 26 | source ${CUR_DIR}/.env 27 | 28 | DEVICE="${DEVICE:-hpu}" 29 | DOCKERFILE=${CUR_DIR}/Dockerfile/Dockerfile-${DEVICE} 30 | 31 | cmd="DOCKER_BUILDKIT=0 docker build . -f ${DOCKERFILE} -t ${IMAGE_NAME}_${DEVICE}:${IMAGE_TAG}" 32 | echo $cmd 33 | eval $cmd 34 | -------------------------------------------------------------------------------- /docker_run-hpu.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | # HABANA_VISIBLE_DEVICES, HABANA_VISIBLE_MODULES 3 | # 0, 2 4 | # 1, 6 5 | # 2, 0 6 | # 3, 7 7 | # 4, 1 8 | # 5, 4 9 | # 6, 3 10 | # 7, 5 11 | 12 | CUR_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" 13 | source ${CUR_DIR}/.env 14 | 15 | DEVICE_IDX=0 16 | MODULES_IDX=2 17 | IMAGE_NAME=${IMAGE_NAME}_hpu:${IMAGE_TAG} 18 | CONTAINER_NAME=${CONTAINER_NAME}_hpu 19 | 20 | OUTPUT_DIR_LOCAL=./exp 21 | OUTPUT_DIR_CONTAINER=/app/outputs 22 | docker run -it \ 23 | --expose=${API_PORT} \ 24 | --expose=${WEBAPP_PORT} \ 25 | -v ${OUTPUT_DIR_LOCAL}:${OUTPUT_DIR_CONTAINER} \ 26 | --env=DEVICE=hpu \ 27 | --env=HABANA_VISIBLE_DEVICES=all \ 28 | --env=OMPI_MCA_btl_vader_single_copy_mechanism=none \ 29 | --cap-add=sys_nice \ 30 | --network=host \ 31 | --restart=no \ 32 | --runtime=habana \ 33 | --shm-size=64g \ 34 | --name ${CONTAINER_NAME} \ 35 | -t ${IMAGE_NAME} 36 | -------------------------------------------------------------------------------- /exp/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | -------------------------------------------------------------------------------- /exp/example/0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/MMPano/b176d26bc3311584e746fe88135e8d5e793d3107/exp/example/0.png -------------------------------------------------------------------------------- /exp/example/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/MMPano/b176d26bc3311584e746fe88135e8d5e793d3107/exp/example/1.png -------------------------------------------------------------------------------- /exp/example/example_descriptor.json: -------------------------------------------------------------------------------- 1 | { 2 | "init_image": "exp/example/0.png", 3 | "init_prompt": "a forest", 4 | "generated_text_details": "a road in autumn", 5 | "message": "View 1: We see a narrow, pebbled forest path.\nView 2: We see a rusted, overgrown log bridge crossing a creek.\nView 3: We see an antique lamppost, partially hidden by leaves.\nView 4: We see an old wooden cabin, shrouded in shadows.\nView 5: We see a past-its-prime swing hanging from an oak.\nView 6: We see a moss-laden stone bench facing a pond.", 6 | "message_main_obj": "We see: fallen leaves.\nWe see: a path.", 7 | "message_topdown": "We see: Canopy\nWe see: Leaf-covered path", 8 | "question_for_llm_repeat": "Do we often see multiple a path in a scene with a forest? 
Just say 'yes' or 'no' with all lower case letters", 9 | "description_no_obj": "'a forest'", 10 | "major_obj_number": 2, 11 | "is_repeated": [true, true] 12 | } 13 | -------------------------------------------------------------------------------- /media/ISC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/MMPano/b176d26bc3311584e746fe88135e8d5e793d3107/media/ISC.png -------------------------------------------------------------------------------- /media/blog.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/MMPano/b176d26bc3311584e746fe88135e8d5e793d3107/media/blog.png -------------------------------------------------------------------------------- /media/library.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/MMPano/b176d26bc3311584e746fe88135e8d5e793d3107/media/library.jpg -------------------------------------------------------------------------------- /media/livingRoom.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/MMPano/b176d26bc3311584e746fe88135e8d5e793d3107/media/livingRoom.jpg -------------------------------------------------------------------------------- /media/pano.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/MMPano/b176d26bc3311584e746fe88135e8d5e793d3107/media/pano.png -------------------------------------------------------------------------------- /media/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/MMPano/b176d26bc3311584e746fe88135e8d5e793d3107/media/pipeline.png -------------------------------------------------------------------------------- /media/snow.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/MMPano/b176d26bc3311584e746fe88135e8d5e793d3107/media/snow.jpg -------------------------------------------------------------------------------- /media/underwater.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/MMPano/b176d26bc3311584e746fe88135e8d5e793d3107/media/underwater.jpeg -------------------------------------------------------------------------------- /mm_pano/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/MMPano/b176d26bc3311584e746fe88135e8d5e793d3107/mm_pano/__init__.py -------------------------------------------------------------------------------- /mm_pano/lib/Equirec2Perspec.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import cv2 4 | import numpy as np 5 | 6 | class Equirectangular: 7 | def __init__(self, img_name, text2light=False): 8 | if isinstance(img_name, str): 9 | self._img = cv2.imread(img_name, cv2.IMREAD_COLOR) 10 | else: 11 | self._img = img_name 12 | if text2light: 13 | self._img = np.roll(self._img, -60, axis=0) 14 | 15 | [self._height, self._width, _] = self._img.shape 16 | 17 | 18 | def GetPerspective(self, FOV, THETA, PHI, height, width): 19 | # 20 | # THETA is left/right angle, PHI is up/down angle, both in degree 21 | # 22 | 23 | 
equ_h = self._height 24 | equ_w = self._width 25 | equ_cx = (equ_w - 1) / 2.0 26 | equ_cy = (equ_h - 1) / 2.0 27 | 28 | wFOV = FOV 29 | hFOV = float(height) / width * wFOV 30 | 31 | w_len = np.tan(np.radians(wFOV / 2.0)) 32 | h_len = np.tan(np.radians(hFOV / 2.0)) 33 | 34 | 35 | x_map = np.ones([height, width], np.float32) 36 | y_map = np.tile(np.linspace(-w_len, w_len,width), [height,1]) 37 | z_map = -np.tile(np.linspace(-h_len, h_len,height), [width,1]).T 38 | 39 | D = np.sqrt(x_map**2 + y_map**2 + z_map**2) 40 | xyz = np.stack((x_map,y_map,z_map),axis=2)/np.repeat(D[:, :, np.newaxis], 3, axis=2) 41 | 42 | y_axis = np.array([0.0, 1.0, 0.0], np.float32) 43 | z_axis = np.array([0.0, 0.0, 1.0], np.float32) 44 | [R1, _] = cv2.Rodrigues(z_axis * np.radians(THETA)) 45 | [R2, _] = cv2.Rodrigues(np.dot(R1, y_axis) * np.radians(-PHI)) 46 | 47 | xyz = xyz.reshape([height * width, 3]).T 48 | xyz = np.dot(R1, xyz) 49 | xyz = np.dot(R2, xyz).T 50 | lat = np.arcsin(xyz[:, 2]) 51 | lon = np.arctan2(xyz[:, 1] , xyz[:, 0]) 52 | 53 | lon = lon.reshape([height, width]) / np.pi * 180 54 | lat = -lat.reshape([height, width]) / np.pi * 180 55 | 56 | lon = lon / 180 * equ_cx + equ_cx 57 | lat = lat / 90 * equ_cy + equ_cy 58 | 59 | 60 | 61 | persp = cv2.remap(self._img, lon.astype(np.float32), lat.astype(np.float32), cv2.INTER_CUBIC, borderMode=cv2.BORDER_WRAP) 62 | return persp 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /mm_pano/lib/Perspec2Equirec.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import cv2 4 | import numpy as np 5 | 6 | class Perspective: 7 | def __init__(self, img_name , FOV, THETA, PHI ): 8 | if isinstance(img_name, str): 9 | self._img = cv2.imread(img_name, cv2.IMREAD_COLOR) 10 | else: 11 | self._img = img_name 12 | [self._height, self._width, _] = self._img.shape 13 | self.wFOV = FOV 14 | self.THETA = THETA 15 | self.PHI = PHI 16 | self.hFOV = float(self._height) / self._width * FOV 17 | 18 | self.w_len = np.tan(np.radians(self.wFOV / 2.0)) 19 | self.h_len = np.tan(np.radians(self.hFOV / 2.0)) 20 | 21 | 22 | 23 | def GetEquirec(self,height,width): 24 | # 25 | # THETA is left/right angle, PHI is up/down angle, both in degree 26 | # 27 | 28 | x,y = np.meshgrid(np.linspace(-180, 180,width),np.linspace(90,-90,height)) 29 | 30 | x_map = np.cos(np.radians(x)) * np.cos(np.radians(y)) 31 | y_map = np.sin(np.radians(x)) * np.cos(np.radians(y)) 32 | z_map = np.sin(np.radians(y)) 33 | 34 | xyz = np.stack((x_map,y_map,z_map),axis=2) 35 | 36 | y_axis = np.array([0.0, 1.0, 0.0], np.float32) 37 | z_axis = np.array([0.0, 0.0, 1.0], np.float32) 38 | [R1, _] = cv2.Rodrigues(z_axis * np.radians(self.THETA)) 39 | [R2, _] = cv2.Rodrigues(np.dot(R1, y_axis) * np.radians(-self.PHI)) 40 | 41 | R1 = np.linalg.inv(R1) 42 | R2 = np.linalg.inv(R2) 43 | 44 | xyz = xyz.reshape([height * width, 3]).T 45 | xyz = np.dot(R2, xyz) 46 | xyz = np.dot(R1, xyz).T 47 | 48 | xyz = xyz.reshape([height , width, 3]) 49 | inverse_mask = np.where(xyz[:,:,0]>0,1,0) 50 | 51 | xyz[:,:] = xyz[:,:]/np.repeat(xyz[:,:,0][:, :, np.newaxis], 3, axis=2) 52 | 53 | 54 | lon_map = np.where((-self.w_len 0 and i < 5: 417 | prompt = "Question: is there a {} in this picture (just say yes or no)? 
Answer:".format(extract_words_after_we_see_withFailv2(lines_major_obj[obj_id])) 418 | inputs = processor(image_inpaint, text=prompt, return_tensors="pt").to(device, torch_dtype) 419 | generated_ids = img2text_pipe.generate(**inputs, max_new_tokens=15) 420 | generated_text_repeat = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip() 421 | print("repeated check = {}".format(generated_text_repeat)) 422 | if "yes" in generated_text_repeat: 423 | print(" we see {} in the inpainted view".format(extract_words_after_we_see_withFailv2(lines_major_obj[obj_id]))) 424 | pure_color_bg = True 425 | iter_count += (1.0/num_false) 426 | if not is_repeated_all and iter_count >= 20: 427 | is_repeated_all = True 428 | print("reaching maximum checking iterations, there is a conflict, setting is_repeated to true") 429 | inpainted_cv2 = pil_to_cv2(image_inpaint) 430 | 431 | # we do the same merging step as the 432 | # 1. compute the weight mask for the warped image 433 | dist2zero = distance_transform_edt(mask_accumulate) 434 | 435 | # 2. build weight map according to dist2zero 436 | weight_map_cinpaint = np.ones(mask_accumulate.shape).astype(np.float32) 437 | weight_map_cinpaint[dist2zero <= cinpaint_th] = dist2zero[dist2zero <= cinpaint_th] / cinpaint_th 438 | 439 | # Save image at each step 440 | if sr_pipe is not None: 441 | inpainted_cv2_merge = warped_image_SR * weight_map_cinpaint[:, :, np.newaxis] + inpainted_cv2 * (1 - weight_map_cinpaint)[:, :, np.newaxis] 442 | # filename = os.path.join(output_folder, f"inpaint_step_SR_{i}.png") 443 | else: 444 | inpainted_cv2_merge = warped_image * weight_map_cinpaint[:, :, np.newaxis] + inpainted_cv2 * (1 - weight_map_cinpaint)[:, :, np.newaxis] 445 | # filename = os.path.join(output_folder, f"inpaint_step_{i}.png") 446 | filename = os.path.join(output_folder, f"inpaint_step_{i}.png") 447 | cv2.imwrite(filename, inpainted_cv2_merge) 448 | 449 | # Perform super-resolution on the inpainted_cv2 (not on inpainted_cv2_SR to prevent noise amplification) 450 | if sr_pipe is not None: 451 | # image_inpaint_SR = cv2_to_pil(inpainted_cv2.astype(np.uint8)) 452 | image_inpaint_SR = cv2_to_pil(inpainted_cv2_merge.astype(np.uint8)) 453 | image_inpaint_SR = sr_pipe(prompt=orig_prompt, negative_prompt=orig_negative_prompt, image=image_inpaint_SR, num_inference_steps=sr_inf_step).images[0] 454 | image_inpaint_SR_cv2 = pil_to_cv2(image_inpaint_SR) 455 | filename = os.path.join(output_folder, f"inpaint_step_SR_{i}.png") 456 | cv2.imwrite(filename, image_inpaint_SR_cv2) 457 | 458 | image_list.append(inpainted_cv2) 459 | if sr_pipe is not None: 460 | image_SR_list.append(image_inpaint_SR_cv2) 461 | pose_list.append(pose) 462 | 463 | return 0 464 | 465 | 466 | def parse_args(): 467 | def list_of_num(arg): 468 | return list(map(float, arg.split(','))) 469 | 470 | parser = argparse.ArgumentParser(description='Multimodal Panorama Generation') 471 | parser.add_argument('--device', type=str, default="hpu", choices=["cpu", "cuda", "hpu"], help="Target HW device for Diffusion and BLIP models") 472 | parser.add_argument('--dtype', type=str, default="float32", choices=["float16", "float32", "bfloat16"], help="Datatype for model inference.") 473 | parser.add_argument('--init_prompt', type=str, help='Prompt which will be used for text to panorama generation.') 474 | parser.add_argument('--init_image', type=str, help='Path to a image which will be used for image to panorama generation.') 475 | parser.add_argument('--output_folder', type=str, default='./exp/output') 476 | 
parser.add_argument('--cpu_offload', action="store_true", help="Flag to offload the StableDiffusion pipeline to the CPU") 477 | 478 | parser.add_argument('--text2pano', action="store_true", help="Flag to do text-to-panorama. Otherwise image-to-panorama is performed.") 479 | parser.add_argument('--llm_model_name', type=str, default="mistralai/Mistral-7B-Instruct-v0.2", 480 | choices=_VALIDATED_MODELS, help='Name of the LLM model used for text generation.') 481 | parser.add_argument('--api_key', type=str, default="", help="Your OpenAI API key") 482 | parser.add_argument('--intrinsic', type=list_of_num, default=[1.11733848262, 1.11733848262, 0.5, 0.5], help="Camera intrinsics as four comma-separated floats.") 483 | parser.add_argument('--panorama_descriptor', type=str, help='Path to a descriptor JSON that will be used for panorama generation.') 484 | 485 | parser.add_argument('--do_upscale', action="store_true", help="Flag to use super resolution to upscale the generated images") 486 | parser.add_argument('--major_obj_number', type=int, default=2, choices=[1, 2], help="How many major objects do we want to consider so that they don't repeat?") 487 | parser.add_argument('--sr_inf_step', type=int, default=35, help='Number of inference steps for the super resolution model') 488 | 489 | parser.add_argument('--inpaint_model_name', type=str, default="stabilityai/stable-diffusion-2-inpainting", 490 | help="Diffusion inpainting model name") 491 | parser.add_argument('--blip_model_name', type=str, default="Salesforce/blip2-flan-t5-xl", 492 | help="BLIP model name") 493 | parser.add_argument('--upscaler_model_name', type=str, default="stabilityai/stable-diffusion-x4-upscaler", 494 | help="Super resolution upscaler model name") 495 | 496 | # Generate panorama and video 497 | parser.add_argument('--save_pano_img', action="store_true", help="Flag to save the panorama image.") 498 | parser.add_argument('--gen_video', action="store_true", help="Flag to generate and save a video of the panorama view.") 499 | parser.add_argument('--video_codec', type=str, default="MP4V", choices=["MP4V", "VP09"], 500 | help="Video codec used to generate the video") 501 | args = parser.parse_args() 502 | 503 | # Validate arguments 504 | if len(args.intrinsic) != 4: 505 | raise RuntimeError(f"--intrinsic has to be 4 floating point numbers. 
Got {args.intrinsic}") 506 | 507 | return args 508 | 509 | 510 | def gen_multiviews( 511 | device: str, 512 | dtype: str = "float32", 513 | output_folder: str = "./outputs", 514 | init_prompt: Optional[str] = None, 515 | init_image: Optional[Union[str, Image.Image]] = None, 516 | cpu_offload: bool = False, 517 | # Text generation 518 | text2pano: bool = False, 519 | llm_model_name: str = "mistralai/Mistral-7B-Instruct-v0.2", 520 | api_key: str = "", 521 | panorama_descriptor: Optional[Union[str, Dict[str, str]]] = None, # None, path to JSON, or a dictionary 522 | use_predefine_llm_descriptor: bool = False, 523 | llm_engine = None, 524 | # Panorama generation 525 | intrinsic: List[float] = [1.11733848262, 1.11733848262, 0.5, 0.5], 526 | do_upscale: bool = False, 527 | major_obj_number: int = 2, 528 | sr_inf_step: int = 35, 529 | inpaint_model_name: Optional[str] = "stabilityai/stable-diffusion-2-inpainting", 530 | blip_model_name: Optional[str] = "Salesforce/blip2-flan-t5-xl", 531 | upscaler_model_name: Optional[str] = "stabilityai/stable-diffusion-x4-upscaler", 532 | text2img_model_name: Optional[str] = "stabilityai/stable-diffusion-2-base", 533 | # Pre-loaded pipelines, if any 534 | inpaint_pipe: Optional = None, 535 | processor: Optional = None, 536 | img2text_pipe: Optional = None, 537 | sr_pipe: Optional = None, 538 | text2img_pipe: Optional = None, 539 | **kwargs, 540 | ): 541 | 542 | if is_on_hpu(device) and dtype == "float16": 543 | # Force dtype to be bfloat16 on HPU 544 | dtype = "bfloat16" 545 | 546 | print("===========================================================================") 547 | print(f"Running Multimodal Panorama Generation on {device} in {dtype}.") 548 | print("===========================================================================") 549 | 550 | ################## 551 | # Parse descriptor 552 | ################## 553 | # If given, get the pre-generated LLM descriptions 554 | if panorama_descriptor is not None and use_predefine_llm_descriptor: 555 | if isinstance(panorama_descriptor, dict): 556 | panorama_descriptor = Descriptor(**panorama_descriptor) 557 | elif isinstance(panorama_descriptor, str) and os.path.isfile(panorama_descriptor): 558 | panorama_descriptor = Descriptor.from_json(panorama_descriptor) 559 | elif not isinstance(panorama_descriptor, Descriptor): 560 | raise RuntimeError(f"panorama_descriptor should be a JSON file, Dictionary, or Descriptor type.") 561 | 562 | # If only init_prompt is given in the panorama_descriptor, do the text-to-panorama 563 | if not panorama_descriptor.init_image: 564 | assert panorama_descriptor.init_prompt, "At least one of [`init_prompt`, `init_image`] must be given" 565 | text2pano = True 566 | 567 | elif panorama_descriptor is None and use_predefine_llm_descriptor: 568 | raise RuntimeError(f"`panorama_descriptor` must be provided when setting `use_predefine_llm_descriptor=True`") 569 | 570 | ###################### 571 | # Create output folder 572 | ###################### 573 | if os.path.exists(output_folder): 574 | shutil.rmtree(output_folder) 575 | os.makedirs(output_folder, exist_ok = True) 576 | print(f"Save all outputs to {output_folder}") 577 | 578 | ############################# 579 | # Load pipelines if not given 580 | ############################# 581 | # Inpainting pipeline 582 | if inpaint_pipe is None: 583 | inpaint_pipe = load_diffusion_model(inpaint_model_name, device=device, dtype=dtype, cpu_offload=cpu_offload) 584 | 585 | # Image-to-text pipeline 586 | if processor is None and img2text_pipe is None: 
587 | processor, img2text_pipe = load_blip_model_and_processor(blip_model_name, device=device, dtype=dtype) 588 | elif (processor is not None and img2text_pipe is None) or (processor is None and img2text_pipe is not None): 589 | raise RuntimeError( 590 | "Processor and BLIP model has to be set or not set at the same time. " 591 | f"Got processor={processor}, img2text_pipe={img2text_pipe}." 592 | ) 593 | 594 | # Super resolution 595 | if sr_pipe is None and do_upscale: 596 | # NOTE: Skip upscaler in light version 597 | sr_pipe = load_upscaler_model(upscaler_model_name, device, dtype) 598 | 599 | # Text-to-image 600 | if text2pano and text2img_pipe is None: 601 | # Load Diffusion pipeline 602 | text2img_pipe = load_diffusion_model(text2img_model_name, device=device, dtype=dtype, cpu_offload=cpu_offload) 603 | 604 | # Text generation 605 | if llm_engine is None: 606 | llm_engine = get_llm_engine(llm_model_name, device=device, dtype=dtype, openai_key=api_key) 607 | 608 | ########################### 609 | # Text or Image to Panorama 610 | ########################### 611 | init_prompt = init_prompt if panorama_descriptor is None else panorama_descriptor.init_prompt 612 | init_image = init_image if panorama_descriptor is None else panorama_descriptor.init_image 613 | 614 | t_begin = time.time() 615 | # Use given init_image or generate an init_image from the init_prompt. 616 | # This will be used for generating panorama 617 | if text2pano: 618 | print(f"Generating init image with prompt={init_prompt} ...") 619 | init_image = text2img_pipe(init_prompt, num_inference_steps=25).images[0] 620 | init_image = cv2.cvtColor(np.array(init_image), cv2.COLOR_RGB2BGR) 621 | elif init_image is not None: 622 | if isinstance(init_image, str): 623 | # init_image is a path to a file 624 | print(f"Loading init image from {init_image}") 625 | init_image = cv2.imread(init_image, cv2.IMREAD_COLOR) 626 | elif isinstance(init_image, Image.Image): 627 | init_image = cv2.cvtColor(np.array(init_image), cv2.COLOR_RGB2BGR) 628 | elif isinstance(init_image, np.ndarray): 629 | pass 630 | else: 631 | # TODO(Joey Chou): Add error message 632 | raise RuntimeError("Please do text2pano with a given init_prompt, or pass a init_image to do image to pano") 633 | 634 | # check whether the intrinsic matrix exist 635 | with torch.inference_mode(): 636 | fail = True 637 | while fail: 638 | fail = create_panorama( 639 | init_image, intrinsic, output_folder, processor, img2text_pipe, inpaint_pipe, sr_pipe, device, 640 | sr_inf_step, init_prompt=init_prompt, major_obj_number=major_obj_number, 641 | panorama_descriptor=panorama_descriptor, llm_engine=llm_engine 642 | ) 643 | print(f"Total runtime: {time.time() - t_begin}") 644 | 645 | 646 | def _gen_pano_outputs(images: List[np.ndarray], 647 | out_dir: str, 648 | rotation_degrees: List[int], 649 | fov: float = 99.9169018, gen_video: bool = False, 650 | save_pano_img: bool = True, 651 | # Video related 652 | video_size: Tuple[int, int] = (512, 512), video_codec: str = "MP4V", 653 | new_pano: Optional = None): 654 | """ 655 | To make video works with gradio, please use the setup as below: 656 | * interval_deg = 1.0 657 | * fps: = 60 658 | * video_codec = "VP09" 659 | 660 | For other application that works with mp4v: 661 | * interval_deg = 0.5 662 | * fps = 60 663 | * video_codec = "MP4V" 664 | """ 665 | 666 | if new_pano is None: 667 | _output_image_name = "pano.png" 668 | 669 | ee = m_P2E.Perspective( 670 | images, 671 | [ 672 | [fov, rotation_degrees[0], 0], [fov, rotation_degrees[1], 0], 
[fov, rotation_degrees[2], 0], [fov, rotation_degrees[3], 0], 673 | [fov, rotation_degrees[4], 0], [fov, rotation_degrees[5], 0], [fov, rotation_degrees[6], 0] 674 | ] 675 | ) 676 | 677 | new_pano = ee.GetEquirec(2048, 4096) 678 | 679 | if save_pano_img: 680 | # Output panorama image 681 | cv2.imwrite(os.path.join(out_dir, _output_image_name), new_pano.astype(np.uint8)[540:-540]) 682 | 683 | if gen_video: 684 | if video_codec.upper() == "MP4V": 685 | codec_config = mp4vCodec() 686 | elif video_codec.upper() == "VP09": 687 | codec_config = vp90Codec() 688 | elif video_codec.upper() == "MP4": 689 | codec_config = mp4Codec() 690 | else: 691 | raise RuntimeError(f"Only support codec ['.MP4V', 'VP09']. Got {video_codec}") 692 | 693 | output_video_name = f"video{codec_config.video_format}" 694 | interval_deg = codec_config.interval_deg 695 | 696 | video_codec = codec_config.video_codec 697 | fps = codec_config.fps 698 | 699 | fov = 86 700 | num_frames = int(360 / interval_deg) 701 | 702 | equ = E2P.Equirectangular(new_pano) 703 | img = equ.GetPerspective(fov, 0, 0, *video_size) # Specify parameters(FOV, theta, phi, height, width) 704 | 705 | margin = 0 706 | if margin > 0: 707 | img = img[margin:-margin] 708 | size = (img.shape[1], img.shape[0]) 709 | 710 | save_video_path = os.path.join(out_dir, output_video_name) 711 | print("save_video_path = ", save_video_path, "; ", video_codec, ", ", fps, ", ", size, ", video_size = ", video_size) 712 | out = cv2.VideoWriter(save_video_path, cv2.VideoWriter_fourcc(*video_codec), fps, size) 713 | 714 | for i in tqdm(range(num_frames)): 715 | # Process image 716 | deg = i * interval_deg 717 | img = equ.GetPerspective(fov, deg, 0, *video_size) # Specify parameters(FOV, theta, phi, height, width) 718 | if margin > 0: 719 | img = img[margin:-margin] 720 | img = np.clip(img, 0, 255).astype(np.uint8) 721 | 722 | # Write to video 723 | out.write(img) 724 | out.release() 725 | 726 | # ffmpeg -y -i /root/app/rest_api/api_output/demo/video.mp4v /root/app/rest_api/api_output/demo/video.avc1 727 | return new_pano 728 | 729 | 730 | def gen_pano(images: Optional[List[np.ndarray]] = None, 731 | output_folder: Optional[str] = None, 732 | do_upscale: bool = False, 733 | save_pano_img: bool = True, 734 | gen_video: bool = True, 735 | video_codec: str = "MP4V", 736 | pano: Optional = None, 737 | **kwargs, 738 | ): 739 | # suffix = '_SR' if do_upscale else "" 740 | suffix = "" 741 | image_names = ["input_resized" + suffix + ".png"] 742 | for i in range(6): 743 | image_names.append("inpaint_step" + suffix + "_{}.png".format(i)) 744 | 745 | rotations = [create_rotation_matrix(0, 0, 0).T] 746 | rotation_degrees = [0] 747 | max_step = 6 748 | step_size = 41 749 | vortex_list = generate_left_right_fullPano_pattern(max_step=max_step, step_size=step_size, final_step=55) 750 | for i in range(6): 751 | rotations.append(create_rotation_matrix(vortex_list[i][0], vortex_list[i][1], vortex_list[i][2]).T) 752 | rotation_degrees.append(vortex_list[i][1]) 753 | 754 | LR_images = [] 755 | # read individual images out 756 | for image_name in tqdm(image_names): 757 | LR_images.append(cv2.imread(os.path.join(output_folder, image_name))) 758 | 759 | return _gen_pano_outputs(LR_images, output_folder, rotation_degrees, save_pano_img=save_pano_img, gen_video=gen_video, video_codec=video_codec, new_pano=pano) 760 | 761 | 762 | if __name__ == "__main__": 763 | args = parse_args() 764 | 765 | # Generate multiview scenes 766 | gen_multiviews(**args.__dict__) 767 | 768 | # Generate panorama view and 
optionally generate video 769 | gen_pano(**args.__dict__) 770 | -------------------------------------------------------------------------------- /mm_pano/tgi_gaudi/README.md: -------------------------------------------------------------------------------- 1 | ## Run TGI-Gaudi for LLM Serving 2 | 3 | This is a short instruction to run a TGI Gaudi server to do LLM serving. For more information, please check [TGI-Gaudi](https://github.com/huggingface/tgi-gaudi) 4 | 5 | #### Start a Llama3 TGI-Gaudi serving 6 | ```bash 7 | ./run_tgi_gaudi.sh 8 | ``` 9 | 10 | #### Quick test 11 | ```bash 12 | test_tgi.sh 13 | ``` 14 | -------------------------------------------------------------------------------- /mm_pano/tgi_gaudi/run_tgi_gaudi.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | model=meta-llama/Meta-Llama-3-8B-Instruct 3 | CONTAINER_NAME=tgi-gaudi 4 | volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run 5 | num_shard=2 6 | sharded=true 7 | max_input_length=2048 8 | max_total_tokens=4096 9 | 10 | # Usage: text-generation-launcher < 11 | # --model-id | 12 | # --revision | 13 | # --validation-workers | 14 | # --sharded | 15 | # --num-shard | 16 | # --quantize | 17 | # --speculate | 18 | # --dtype | 19 | # --trust-remote-code| 20 | # --max-concurrent-requests | 21 | # --max-best-of | 22 | # --max-stop-sequences | 23 | # --max-top-n-tokens | 24 | # --max-input-tokens | 25 | # --max-input-length | 26 | # --max-total-tokens | 27 | # --waiting-served-ratio | 28 | # --max-batch-prefill-tokens | 29 | # --max-batch-total-tokens | 30 | # --max-waiting-tokens | 31 | # --max-batch-size | 32 | # --cuda-graphs | 33 | # --hostname | 34 | # --port | 35 | # --shard-uds-path | 36 | # --master-addr | 37 | # --master-port | 38 | # --huggingface-hub-cache | 39 | # --weights-cache-override | 40 | # --disable-custom-kernels| 41 | # --cuda-memory-fraction | 42 | # --rope-scaling | 43 | # --rope-factor | 44 | # --json-output| 45 | # --otlp-endpoint | 46 | # --cors-allow-origin | 47 | # --watermark-gamma | 48 | # --watermark-delta | 49 | # --ngrok| 50 | # --ngrok-authtoken | 51 | # --ngrok-edge | 52 | # --tokenizer-config-path | 53 | # --disable-grammar-support 54 | # 55 | 56 | # -e HUGGING_FACE_HUB_TOKEN= \ 57 | docker run \ 58 | -p 8080:80 \ 59 | -v $volume:/data \ 60 | --runtime=habana \ 61 | -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true \ 62 | -e HABANA_VISIBLE_DEVICES=all \ 63 | -e OMPI_MCA_btl_vader_single_copy_mechanism=none \ 64 | --cap-add=sys_nice \ 65 | --ipc=host \ 66 | --name=${CONTAINER_NAME} \ 67 | ghcr.io/huggingface/tgi-gaudi:2.0.0 \ 68 | --model-id $model --sharded $sharded --num-shard $num_shard --max-input-length $max_input_length --max-total-tokens $max_total_tokens 69 | -------------------------------------------------------------------------------- /mm_pano/tgi_gaudi/test_tgi.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | curl 127.0.0.1:8080/generate \ 4 | -X POST \ 5 | -d '{"inputs":"What is deep learning?","parameters":{"max_new_tokens":2048,"temperature":0.7,"repetition_penalty":1}}' \ 6 | -H 'Content-Type: application/json' 7 | -------------------------------------------------------------------------------- /mm_pano/utils/common.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import json 4 | from dataclasses import dataclass, field 5 | from typing import Optional, List 6 | 7 | 8 | def extract_words_after_we_see_withFailv2(s): 9 | match = re.search('We .*?see: (.*)', s, re.IGNORECASE) 10 | if match: 11 | return match.group(1).replace('.', '').lower() 12 | print("No match found") 13 | return 14 | 15 | 16 | def extract_words_after_we_see_withFailv3(s): 17 | match = re.search('We .*?see(.*)', s, re.IGNORECASE) or re.search('View .*?:(.*)', s, re.IGNORECASE) 18 | if match: 19 | return match.group(1) 20 | print("No match found") 21 | return 22 | 23 | 24 | @dataclass 25 | class Descriptor: 26 | generated_text_details: Optional[str] = None 27 | message: Optional[str] = None 28 | message_main_obj: Optional[str] = None 29 | message_topdown: Optional[str] = None 30 | question_for_llm_repeat: Optional[str] = None 31 | description_no_obj: Optional[str] = None 32 | major_obj_number: int = 2 33 | is_repeated: List[bool] = field(default_factory=list) 34 | 35 | init_prompt: Optional[str] = None 36 | init_image: Optional[str] = None 37 | 38 | @classmethod 39 | def from_json(cls, json_path: str): 40 | assert isinstance(json_path, str) and os.path.isfile(json_path) 41 | with open(json_path, "r") as f: 42 | _dict = json.load(f) 43 | print(_dict) 44 | return cls(**_dict) 45 | 46 | def save_json(self, json_path: str): 47 | assert isinstance(json_path, str) 48 | with open(json_path, "w") as f: 49 | json.dump(self.__dict__, f, indent=4) 50 | 51 | def __post_init__(self): 52 | assert self.init_prompt is not None or self.init_image is not None, \ 53 | "When using Descriptor, either `init_prompt` or `init_image` has to be set. Got both None." 
54 | 55 | if self.init_prompt is not None and self.init_image is not None: 56 | print(f"Both `init_prompt` ({self.init_prompt}) and `init_image` ({self.init_image}) " 57 | " is given, using `init_image` and ignore `init_prompt`") 58 | self.init_prompt = None 59 | 60 | if self.init_image: 61 | assert os.path.isfile(self.init_image), f"The given `init_image` is not a valid file {self.init_image}" 62 | -------------------------------------------------------------------------------- /mm_pano/utils/image_utils.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np, math 3 | from dataclasses import dataclass 4 | 5 | from PIL import Image 6 | import torch 7 | import torchvision.transforms as transforms 8 | 9 | from typing import List, Union 10 | import torch.nn.functional as F 11 | from torchvision.utils import save_image 12 | import argparse 13 | 14 | import scipy.ndimage as ndimage 15 | import os 16 | import glob 17 | import io 18 | import json 19 | from scipy.ndimage import distance_transform_edt 20 | 21 | 22 | def cv2_to_pil(image): 23 | # Convert the cv2 image to RGB format 24 | image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) 25 | 26 | # Convert the cv2 image to a PIL image 27 | pil_image = Image.fromarray(image) 28 | 29 | return pil_image 30 | 31 | 32 | def pil_to_cv2(image): 33 | # Convert the PIL image to a numpy array 34 | np_image = np.array(image) 35 | 36 | # Convert the numpy array to a cv2 image 37 | cv2_image = cv2.cvtColor(np_image, cv2.COLOR_RGB2BGR) 38 | 39 | return cv2_image 40 | 41 | 42 | def pil_to_tensor(image): 43 | # Define a transformation pipeline 44 | transform = transforms.Compose([ 45 | transforms.ToTensor(), 46 | transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) 47 | ]) 48 | 49 | # Apply the transformation to the PIL image 50 | tensor_image = transform(image) 51 | 52 | # Add the batch dimension 53 | return tensor_image.unsqueeze(0) 54 | 55 | 56 | def pil_mask_to_tensor(pil_mask): 57 | # Define the transformation to convert the PIL image to a tensor 58 | transform = transforms.ToTensor() 59 | 60 | # Apply the transformation to the PIL image 61 | tensor_mask = transform(pil_mask) 62 | 63 | # Repeat the tensor along the channel dimension to create 3 channels 64 | tensor_mask = tensor_mask.repeat(3, 1, 1) 65 | 66 | # Add the batch dimension 67 | return tensor_mask.unsqueeze(0) 68 | 69 | 70 | def mask_to_pil(mask): 71 | # Multiply the mask by 255 to get values between 0 and 255 72 | mask = mask * 255 73 | 74 | # Convert the mask to an 8-bit integer numpy array 75 | mask = np.uint8(mask) 76 | 77 | # Create a black and white PIL image from the mask 78 | pil_image = Image.fromarray(mask, mode="L") 79 | 80 | return pil_image 81 | 82 | 83 | def cv2_to_tensor(image): 84 | # Convert the image from BGR to RGB format 85 | image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) 86 | 87 | # Normalize pixel values to range 0.0 to 1.0 88 | image_normalized = image_rgb.astype(np.float32) / 255.0 89 | 90 | # Transpose the image array to have the shape (3, h, w) 91 | image_transposed = np.transpose(image_normalized, (2, 0, 1)) 92 | 93 | # Convert the numpy array to a PyTorch tensor and add a batch dimension 94 | tensor = torch.from_numpy(image_transposed).unsqueeze(0) 95 | 96 | return tensor 97 | 98 | 99 | def tensor_to_cv2(tensor_image): 100 | # Convert tensor image to a numpy array 101 | image_np = (tensor_image * 255).numpy().astype(np.uint8) 102 | 103 | # Transpose the numpy array to have the shape (h, w, 3) 
104 | image_np_transposed = np.transpose(image_np, (0, 2, 3, 1)) 105 | 106 | # Remove the batch dimension 107 | image_cv2_float = np.squeeze(image_np_transposed, axis=0) 108 | 109 | # Convert the RGB order to BGR order 110 | return cv2.cvtColor(image_cv2_float, cv2.COLOR_RGB2BGR) 111 | 112 | 113 | def tensor_mask_to_numpy(mask): 114 | # Convert the mask to a numpy array 115 | mask_np = mask.numpy() 116 | 117 | # Remove the batch and channel dimensions 118 | return np.squeeze(mask_np, axis=(0, 3)) 119 | 120 | 121 | def save_mask_to_png(mask, filename): 122 | assert len(mask.shape) == 3 and mask.shape[-1] == 1, "Invalid mask shape. Expected (h, w, 1)." 123 | 124 | # Convert the mask to an integer array with values in the range [0, 255] 125 | mask_255 = (mask * 255).astype(np.uint8) 126 | 127 | # Repeat the single channel mask to create a 3 channel image 128 | mask_3_channels = np.repeat(mask_255, 3, axis=-1) 129 | 130 | # Save the image 131 | img = Image.fromarray(mask_3_channels) 132 | img.save(f"{filename}") 133 | 134 | 135 | 136 | def warp_image_v2(image, K_original, K_new, R, output_size): 137 | """ 138 | Warp an image to a new view given the original and new camera intrinsic matrices, relative rotation, 139 | and output image size. 140 | 141 | Parameters: 142 | image (numpy.ndarray): The original image. 143 | K_original (numpy.ndarray): The original camera's intrinsic matrix. 144 | K_new (numpy.ndarray): The new camera's intrinsic matrix. 145 | R (numpy.ndarray): The relative rotation matrix. 146 | output_size (tuple): The desired output image size (width, height). 147 | 148 | Returns: 149 | warped_image (numpy.ndarray): The warped image. 150 | mask (numpy.ndarray): Mask indicating if a pixel in the warped image has a corresponding pixel in the original image. 
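    Example (illustrative; `K` stands for an assumed 3x3 pinhole intrinsic matrix):
        R = create_rotation_matrix(0, 30, 0)  # rotate 30 degrees about the y axis
        warped, mask = warp_image_v2(image, K, K, R, (512, 512))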
151 | """ 152 | 153 | # Compute the transformation matrix using the scaled new camera intrinsic 154 | T = K_new.dot(R).dot(np.linalg.inv(K_original)) 155 | 156 | # Warp the image using the new transformation matrix to the specified output size 157 | warped_image = cv2.warpPerspective(image, T, output_size) 158 | 159 | # Create and warp the mask 160 | mask = np.ones((image.shape[0], image.shape[1]), dtype=np.uint8) * 255 161 | mask_warped = cv2.warpPerspective(mask, T, output_size, cv2.INTER_NEAREST, borderMode=cv2.BORDER_CONSTANT, borderValue=(0,0,0)) 162 | 163 | # Convert mask to binary (0 or 1) 164 | mask_binary = (mask_warped > 250).astype(np.uint8) 165 | 166 | return warped_image, mask_binary 167 | 168 | # compute the nearest unmasked region 169 | def mask_to_NN_v2(mask, invert = False): 170 | # print("mask = {}/{}".format(mask, mask.shape)) 171 | # Set a threshold value to create a binary mask 172 | threshold = 0.5 173 | binary_mask = mask > threshold 174 | 175 | if invert: 176 | # Invert the binary_mask to find the unmasked pixels 177 | binary_mask = 1 - binary_mask 178 | 179 | # Convert the inverted_mask to a NumPy array 180 | inverted_mask_np = binary_mask 181 | 182 | # Compute the distance transform on the inverted mask 183 | distance_transform = ndimage.distance_transform_edt(inverted_mask_np) 184 | 185 | # Convert the distance transform back to a PyTorch tensor 186 | return torch.tensor(distance_transform, dtype=torch.float32) 187 | 188 | 189 | def generate_left_right_fullPano_pattern(max_step = 8, step_size = 42, final_step = 42): 190 | pattern = [] 191 | 192 | start_step = 1 193 | angle_begin = step_size 194 | angle_end = (360 - step_size * (max_step // 2 - 1) + step_size * (max_step // 2)) / 2 195 | step_mid = angle_end - step_size * (max_step // 2) 196 | for step in range(start_step, max_step+1): 197 | if step <= max_step // 2: 198 | pattern.append((0, angle_begin, 0)) 199 | angle_begin += step_size 200 | else: 201 | pattern.append((0, angle_end, 0)) 202 | if step != (max_step // 2 + 1): 203 | angle_end += step_size 204 | else: 205 | angle_end += step_mid 206 | 207 | print(f"pattern = {pattern}") 208 | return pattern 209 | 210 | 211 | def create_rotation_matrix(x_angle_degrees, y_angle_degrees, z_angle_degrees): 212 | x_angle_radians = np.radians(x_angle_degrees) 213 | y_angle_radians = np.radians(y_angle_degrees) 214 | z_angle_radians = np.radians(z_angle_degrees) 215 | 216 | cos_x, sin_x = np.cos(x_angle_radians), np.sin(x_angle_radians) 217 | cos_y, sin_y = np.cos(y_angle_radians), np.sin(y_angle_radians) 218 | cos_z, sin_z = np.cos(z_angle_radians), np.sin(z_angle_radians) 219 | 220 | R_x = np.array([[1, 0, 0], 221 | [0, cos_x, -sin_x], 222 | [0, sin_x, cos_x]]) 223 | 224 | R_y = np.array([[cos_y, 0, sin_y], 225 | [0, 1, 0], 226 | [-sin_y, 0, cos_y]]) 227 | 228 | R_z = np.array([[cos_z, -sin_z, 0], 229 | [sin_z, cos_z, 0], 230 | [0, 0, 1]]) 231 | 232 | R = R_y @ R_x @ R_z 233 | 234 | return R 235 | 236 | def read_file_into_list(file_path): 237 | # Initialize an empty list to hold the lines 238 | lines_list = [] 239 | 240 | # Open the file in read mode ('r') 241 | with io.open(file_path, 'r', encoding='utf8') as file: 242 | # Read each line in the file 243 | for line in file: 244 | # Add the line to the list (removing any trailing whitespace characters) 245 | lines_list.append(line.rstrip()) 246 | 247 | # Return the list of lines 248 | return lines_list 249 | 250 | def save_dict_to_file(dict_obj, file_name): 251 | with open(file_name, 'w') as file: 252 | 
json.dump(dict_obj, file) 253 | 254 | 255 | def load_dict_from_file(file_name): 256 | with open(file_name, 'r') as file: 257 | return json.load(file) 258 | 259 | 260 | 261 | def check_fov_overlap_simplified(rotation_matrix, fov1): 262 | """ 263 | Simplified check if there is an overlap in the field of view of two images based on rotation angle. 264 | 265 | Parameters: 266 | rotation_matrix (numpy.ndarray): 3x3 rotation matrix from image1 to image2 267 | fov1 (tuple): Field of view of image1 (horizontal_angle, vertical_angle) in degrees 268 | 269 | Returns: 270 | bool: True if there is an overlap, False otherwise 271 | """ 272 | 273 | # Calculate the rotation angle from the rotation matrix 274 | rotation_angle_rad = np.arccos((np.trace(rotation_matrix) - 1) / 2) 275 | rotation_angle_deg = np.degrees(rotation_angle_rad) 276 | 277 | # # Compare with the FOV (considering the larger of the horizontal or vertical FOV) 278 | return rotation_angle_deg <= fov1 279 | 280 | 281 | 282 | 283 | @dataclass 284 | class vp90Codec: 285 | interval_deg: float = 1.0 286 | fps: float = 60 287 | video_codec = "VP09" 288 | video_format = ".webm" 289 | 290 | 291 | @dataclass 292 | class mp4vCodec: 293 | interval_deg: float = 0.5 294 | fps: float = 60 295 | video_codec = "mp4v" 296 | video_format = ".mp4" 297 | 298 | 299 | @dataclass 300 | class mp4Codec: 301 | interval_deg: float = 0.5 302 | fps: float = 60 303 | video_codec = "h264" 304 | video_format = ".mp4" 305 | -------------------------------------------------------------------------------- /mm_pano/utils/llm_engines.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from typing import Optional, Dict 3 | from abc import ABC, abstractmethod 4 | 5 | from text_generation import Client 6 | from utils.model_utils import load_llm 7 | 8 | _VALIDATED_MODELS = [ 9 | "gpt-4", "gpt-3.5-turbo", 10 | "meta-llama/Meta-Llama-3-8B-Instruct", 11 | "mistralai/Mistral-7B-Instruct-v0.2", 12 | "tgi", 13 | ] 14 | 15 | 16 | class BaseLLMEngine(ABC): 17 | @abstractmethod 18 | def chat(self, user_content:str, system_content: str, history: Optional[str] = None): 19 | pass 20 | 21 | def extract_output(self, output: str) -> str: 22 | return output 23 | 24 | 25 | class OpenAILLMEngine(BaseLLMEngine): 26 | def __init__(self, 27 | model_engine: str = None, 28 | openai=None, 29 | openai_key: str = None): 30 | self.model_engine = model_engine 31 | self.openai = openai 32 | self.openai.api_key = openai_key 33 | print(f"Using model engine {self.model_engine} to generate text") 34 | 35 | def chat(self, 36 | user_content: str, 37 | system_content: str = "You are a helpful assistant.", 38 | history: str = None) -> str: 39 | 40 | message = self.openai.ChatCompletion.create( 41 | model=self.model_engine, 42 | messages=[ 43 | {"role": "system", "content": system_content}, 44 | {"role": "user", "content": user_content}, 45 | ]).choices[0]['message']['content'] 46 | 47 | # For now the history always = None 48 | return message, None 49 | 50 | 51 | class QwenLLMEngine(BaseLLMEngine): 52 | def __init__(self, 53 | tokenizer, 54 | model): 55 | self.tokenizer = tokenizer 56 | self.model = model 57 | 58 | def chat(self, 59 | user_content: str, 60 | system_content: str = "You are a helpful assistant.", 61 | history: str = None): 62 | message, history = self.model.chat(self.tokenizer, user_content, history=history) 63 | return message, history 64 | 65 | 66 | class MistralLLMEngine(BaseLLMEngine): 67 | def __init__(self, 68 | tokenizer, 69 | model, 70 | 
default_generate_kwargs: Optional[Dict] = None): 71 | self.tokenizer = tokenizer 72 | self.model = model 73 | self.default_generate_kwargs = {} if default_generate_kwargs is None else default_generate_kwargs 74 | 75 | def chat(self, 76 | user_content: str, 77 | system_content: str = "You are a helpful assistant.", 78 | history: str = None, 79 | generate_kwargs: Optional[Dict] = None): 80 | 81 | _generate_kwargs = copy.deepcopy(self.default_generate_kwargs) 82 | if generate_kwargs is not None: 83 | _generate_kwargs.update(generate_kwargs) 84 | 85 | messages = [ 86 | {"role": "user", "content": user_content}, 87 | ] 88 | model_inputs = self.tokenizer.apply_chat_template(messages, return_tensors="pt").to(self.model.device) 89 | generated_ids = self.model.generate(model_inputs, **_generate_kwargs) 90 | decoded = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True) 91 | 92 | # TODO(Tien Pei Chou): Find a better way to only output the new tokens. 93 | return decoded[0][decoded[0].rfind("[/INST]") + len("[/INST] "):], None 94 | 95 | 96 | class Llama3LLMEngine(BaseLLMEngine): 97 | def __init__(self, 98 | tokenizer, 99 | model, 100 | default_generate_kwargs: Optional[Dict] = None): 101 | self.tokenizer = tokenizer 102 | self.model = model 103 | self.default_generate_kwargs = {} if default_generate_kwargs is None else default_generate_kwargs 104 | 105 | def extract_output(self, output: str) -> str: 106 | return output[output.rfind("assistant\n\n") + len("assistant\n\n"):] 107 | 108 | def chat(self, 109 | user_content: str, 110 | system_content: str = "You are a helpful assistant.", 111 | history: str = None, 112 | generate_kwargs: Optional[Dict] = None): 113 | 114 | _generate_kwargs = copy.deepcopy(self.default_generate_kwargs) 115 | if generate_kwargs is not None: 116 | _generate_kwargs.update(generate_kwargs) 117 | 118 | messages = [ 119 | {"role": "system", "content": system_content}, 120 | {"role": "user", "content": user_content}, 121 | ] 122 | 123 | model_inputs = self.tokenizer.apply_chat_template(messages, add_generation_prompt=False, return_tensors="pt").to(self.model.device) 124 | generated_ids = self.model.generate(model_inputs, **_generate_kwargs) 125 | decoded = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True) 126 | 127 | # TODO(Tien Pei Chou): Find a better way to only output the new tokens. 
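        # `batch_decode` returns the full rendered conversation (system and user
        # turns plus the completion), so `extract_output` keeps only the text after
        # the last "assistant\n\n" header emitted by the Llama-3 chat template.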
128 |         return self.extract_output(decoded[0]), None
129 | 
130 | 
131 | class TGILLMEngine(BaseLLMEngine):
132 |     def __init__(self,
133 |                  tgi_url: Optional[str] = "http://127.0.0.1:8080",
134 |                  default_generate_kwargs: Optional[Dict] = None):
135 |         self.client = Client(tgi_url)
136 |         self.default_generate_kwargs = {} if default_generate_kwargs is None else default_generate_kwargs
137 | 
138 |     def chat(self,
139 |              user_content: str,
140 |              system_content: str = "You are a helpful assistant.",
141 |              history: str = None,
142 |              generate_kwargs: Optional[Dict] = None):
143 | 
144 |         _generate_kwargs = copy.deepcopy(self.default_generate_kwargs)
145 |         if generate_kwargs is not None:
146 |             _generate_kwargs.update(generate_kwargs)
147 | 
148 |         response = self.client.generate(user_content, **_generate_kwargs, return_full_text=False)
149 | 
150 |         return self.extract_output(response.generated_text), None
151 | 
152 | 
153 | def get_llm_engine(model_name: str,
154 |                    dtype: Optional[str] = "float32",
155 |                    device: Optional[str] = "hpu",
156 |                    openai_key: Optional[str] = None,
157 |                    hf_token: Optional[str] = None,
158 |                    tgi_url: Optional[str] = "http://127.0.0.1:8080"):
159 |     if model_name in ["gpt-4", "gpt-3.5-turbo"]:
160 |         import openai
161 |         assert openai_key is not None, "Please set the `openai_key` when using OpenAI API"
162 |         print(f"Using OpenAI {model_name} API for text generation ...")
163 |         return OpenAILLMEngine(model_engine=model_name, openai=openai, openai_key=openai_key)
164 |     elif model_name == "mistralai/Mistral-7B-Instruct-v0.2":
165 |         tokenizer, model = load_llm(
166 |             model_name=model_name,
167 |             device=device,
168 |             dtype=dtype,
169 |             trust_remote_code=True,
170 |             hf_token=hf_token)
171 |         default_generate_kwargs = {
172 |             "do_sample": True,
173 |             "temperature": 0.7,
174 |             "max_new_tokens": 256
175 |         }
176 |         print(f"Using {model_name} for text generation ...")
177 |         return MistralLLMEngine(tokenizer, model, default_generate_kwargs=default_generate_kwargs)
178 |     elif "Llama" in model_name: # Ex: "meta-llama/Meta-Llama-3-8B-Instruct"
179 |         tokenizer, model = load_llm(
180 |             model_name=model_name,
181 |             device=device,
182 |             dtype=dtype,
183 |             trust_remote_code=True,
184 |             hf_token=hf_token)
185 |         default_generate_kwargs = {
186 |             "do_sample": True,
187 |             "temperature": 0.6,
188 |             "max_new_tokens": 256
189 |         }
190 |         print(f"Using {model_name} for text generation ...")
191 |         return Llama3LLMEngine(tokenizer, model, default_generate_kwargs=default_generate_kwargs)
192 |     elif "tgi" in model_name:
193 |         assert tgi_url is not None, "Must pass a URL to the client when using TGI-Gaudi"
194 |         default_generate_kwargs = {
195 |             "do_sample": True,
196 |             "temperature": 0.6,
197 |             "max_new_tokens": 256
198 |         }
199 |         return TGILLMEngine(tgi_url, default_generate_kwargs)
200 |     else:
201 |         raise NotImplementedError(f"Got unsupported model {model_name}")
202 | 
--------------------------------------------------------------------------------
/mm_pano/utils/model_utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | from typing import Union, Optional
3 | 
4 | import torch
5 | from transformers import BlipProcessor, BlipForConditionalGeneration
6 | from transformers import Blip2Processor, Blip2ForConditionalGeneration
7 | from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler, StableDiffusionUpscalePipeline
8 | 
9 | 
10 | def is_on_hpu(device: str) -> bool:
11 |     """ Return True if the device is a Gaudi/HPU device.
12 | """ 13 | return "hpu" in device 14 | 15 | 16 | def get_datatype(data_type: Union[str, torch.dtype]): 17 | if isinstance(data_type, torch.dtype): 18 | return data_type 19 | if data_type in ["fp32", "float32"]: 20 | return torch.float 21 | elif data_type in ["fp16", "float16"]: 22 | return torch.float16 23 | elif data_type in ["bfloat16", "bf16"]: 24 | return torch.bfloat16 25 | else: 26 | raise RuntimeError(f"Got unknown dtype {data_type}") 27 | 28 | 29 | def optimize_stable_diffusion_pipeline(pipeline, device, datatype, cpu_offload: bool = False, enable_xformers: bool = False): 30 | pipeline.to(device) 31 | if is_on_hpu(device): 32 | assert datatype in ["bfloat16", "float32"] or datatype in [torch.bfloat16, torch.float32] 33 | pass 34 | else: 35 | # Cuda 36 | # TODO(Joey): Check if there is an Intel version of xformers 37 | # pipeline.unet = torch.compile(pipeline.unet) 38 | if enable_xformers: 39 | pipeline.set_use_memory_efficient_attention_xformers(enable_xformers) 40 | 41 | if cpu_offload: 42 | pipeline.enable_sequential_cpu_offload() 43 | pipeline.enable_model_cpu_offload() 44 | return pipeline.to(device) 45 | 46 | 47 | def optimize_blip(model, device, datatype): 48 | model.to(device) 49 | return model 50 | 51 | 52 | def load_diffusion_model(model_name: str = "stabilityai/stable-diffusion-2-inpainting", 53 | device: str = "cuda", 54 | dtype: Union[str, torch.dtype] = "float16", 55 | cpu_offload: bool = False): 56 | """ Load diffusion or diffusion inpainting model for text-to-image. 57 | """ 58 | print(f"Loading text-to-image model {model_name} ...") 59 | 60 | torch_dtype = get_datatype(dtype) 61 | 62 | if is_on_hpu(device=device): 63 | assert dtype in ["bfloat16", "float32"] or dtype in [torch.bfloat16, torch.float32] 64 | from optimum.habana.diffusers import GaudiDDIMScheduler 65 | if "inpaint" in model_name: 66 | from optimum.habana.diffusers import GaudiStableDiffusionInpaintPipeline as DiffusionPipelineClass 67 | else: 68 | from optimum.habana.diffusers import GaudiStableDiffusionPipeline as DiffusionPipelineClass 69 | 70 | # Load model and scheduler on Gaudi/HPU 71 | scheduler = GaudiDDIMScheduler.from_pretrained(model_name, subfolder="scheduler") 72 | kwargs = { 73 | "scheduler": scheduler, 74 | "use_habana": True, 75 | "use_hpu_graphs": True, 76 | "gaudi_config": "Habana/stable-diffusion" 77 | } 78 | pipe = DiffusionPipelineClass.from_pretrained(model_name, **kwargs).to(torch_dtype) 79 | else: 80 | # Load model and scheduler 81 | pipe = DiffusionPipeline.from_pretrained(model_name, torch_dtype=torch_dtype) 82 | pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) 83 | # TODO(Joey): Check why enable xformers + torch.compile for inpainting model gets error 84 | pipe = optimize_stable_diffusion_pipeline(pipe, device, torch_dtype, cpu_offload, enable_xformers=False) 85 | return pipe 86 | 87 | 88 | def load_blip_model_and_processor(model_name: str = "Salesforce/blip2-flan-t5-xl", # "Salesforce/blip2-opt-2.7b" 89 | device: str = "cuda", 90 | dtype: Union[str, torch.dtype] = "float16"): 91 | """ Load BLIP model for image-to-text. 
92 |     """
93 |     print(f"Loading image-to-text model {model_name} ...")
94 | 
95 |     torch_dtype = get_datatype(dtype)
96 | 
97 |     if "blip2" in model_name:
98 |         processor_class = Blip2Processor
99 |         model_class = Blip2ForConditionalGeneration
100 |     else:
101 |         # Blip
102 |         assert "blip" in model_name
103 |         processor_class = BlipProcessor
104 |         model_class = BlipForConditionalGeneration
105 | 
106 |     if is_on_hpu(device=device):
107 |         assert dtype in ["bfloat16", "float32"] or dtype in [torch.bfloat16, torch.float32]
108 |         from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
109 | 
110 |         # TODO(Joey): Check optimum-habana once it has Blip2 support
111 |         adapt_transformers_to_gaudi()
112 |     else:
113 |         # Non-HPU (e.g. CUDA) path: keep the BLIP/BLIP-2 classes selected above
114 |         pass
115 | 
116 |     # Load the BLIP processor and model
117 |     processor = processor_class.from_pretrained(model_name)
118 |     model = model_class.from_pretrained(model_name, torch_dtype=torch_dtype)
119 |     model = optimize_blip(model, device, torch_dtype)
120 | 
121 |     return processor, model
122 | 
123 | 
124 | def load_upscaler_model(model_name: str = "stabilityai/stable-diffusion-x4-upscaler",
125 |                         device: str = "cuda",
126 |                         dtype: Union[str, torch.dtype] = "float16",
127 |                         cpu_offload: bool = False):
128 |     """ Load super resolution model for upscaling.
129 |     """
130 |     print(f"Loading super resolution model {model_name} ...")
131 | 
132 |     torch_dtype = get_datatype(dtype)
133 | 
134 |     if is_on_hpu(device=device):
135 |         assert dtype in ["bfloat16", "float32"] or dtype in [torch.bfloat16, torch.float32]
136 |         from optimum.habana.diffusers import GaudiDDIMScheduler, GaudiStableDiffusionUpscalePipeline
137 | 
138 |         # Load model and scheduler on Gaudi/HPU
139 |         scheduler = GaudiDDIMScheduler.from_pretrained(model_name, subfolder="scheduler")
140 |         kwargs = {
141 |             "scheduler": scheduler,
142 |             "use_habana": True,
143 |             "use_hpu_graphs": True,
144 |             "gaudi_config": "Habana/stable-diffusion"
145 |         }
146 |         pipe = GaudiStableDiffusionUpscalePipeline.from_pretrained(model_name, **kwargs).to(torch_dtype)
147 |     else:
148 |         # Load model and scheduler
149 |         pipe = StableDiffusionUpscalePipeline.from_pretrained(model_name, torch_dtype=torch_dtype)
150 |         # TODO(Joey): Check why enable xformers + torch.compile for inpainting model gets error
151 |         pipe = optimize_stable_diffusion_pipeline(pipe, device, torch_dtype, cpu_offload, enable_xformers=False)
152 | 
153 |     return pipe
154 | 
155 | 
156 | def load_llm(model_name: str = "mistralai/Mistral-7B-Instruct-v0.2",
157 |              device: str = "cuda",
158 |              dtype: Optional[Union[str, torch.dtype]] = "float16",
159 |              trust_remote_code: bool = False,
160 |              hf_token: Optional[str] = None):
161 |     """ Load LLM model.
162 | """ 163 | print(f"Loading LLM {model_name} ...") 164 | 165 | if is_on_hpu(device=device): 166 | assert dtype in ["bfloat16", "float32"] or dtype in [torch.bfloat16, torch.float32] 167 | from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi 168 | adapt_transformers_to_gaudi() 169 | 170 | from transformers import AutoModelForCausalLM, AutoTokenizer 171 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=trust_remote_code, token=hf_token) 172 | model = AutoModelForCausalLM.from_pretrained( 173 | model_name, trust_remote_code=trust_remote_code, token=hf_token).eval().to(device) 174 | 175 | tokenizer.pad_token_id = tokenizer.eos_token_id 176 | 177 | return tokenizer, model 178 | 179 | 180 | def release_memory(model, tokenizer, device: str = "cuda"): 181 | import gc 182 | del tokenizer 183 | del model 184 | 185 | if device == "cuda": 186 | torch.cuda.empty_cache() 187 | else: 188 | # TODO(Tien Pei Chou): Add Gaudi and XPU 189 | raise NotImplementedError() 190 | # accelerator.free_memory() 191 | gc.collect() 192 | -------------------------------------------------------------------------------- /requirements-api.txt: -------------------------------------------------------------------------------- 1 | # fastAPI and gradio 2 | fastapi 3 | uvicorn 4 | gradio 5 | envbash 6 | -------------------------------------------------------------------------------- /requirements-hpu.txt: -------------------------------------------------------------------------------- 1 | # Fix to the main branch once the PR (https://github.com/huggingface/optimum-habana/pull/869/commits) gets merged 2 | git+https://github.com/huggingface/optimum-habana.git@8893d602289226eda82cf19c79951fa12d15e1b9 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | wheel 2 | diffusers 3 | transformers 4 | scipy 5 | openai==0.27.2 6 | matplotlib 7 | imageio[ffmpeg] 8 | opencv-python 9 | accelerate 10 | einops 11 | auto-gptq 12 | tiktoken 13 | transformers_stream_generator 14 | text-generation # TGI and TGI Gaudi 15 | --------------------------------------------------------------------------------
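A minimal usage sketch for the `Descriptor` dataclass defined in `mm_pano/utils/common.py`. The prompt text and the output path below are illustrative, and the import assumes the interpreter runs from `mm_pano/` so that `utils` is importable (the same convention the package's own modules use):

from utils.common import Descriptor

# Build a descriptor from an initial text prompt; `init_image` works the same
# way but must then point at an existing image file.
desc = Descriptor(init_prompt="a cozy living room with large windows")
desc.save_json("exp/my_descriptor.json")

# Reload it later; `from_json` checks that the path exists and restores every field.
restored = Descriptor.from_json("exp/my_descriptor.json")
print(restored.init_prompt, restored.major_obj_number)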
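Similarly, a minimal sketch of the `get_llm_engine` factory in `mm_pano/utils/llm_engines.py`, mirroring the request issued by `mm_pano/tgi_gaudi/test_tgi.sh`. It assumes a TGI or TGI-Gaudi server is already listening on the default URL and that `utils` is importable as above:

from utils.llm_engines import get_llm_engine

# Route text generation through a running TGI / TGI-Gaudi server.
engine = get_llm_engine("tgi", tgi_url="http://127.0.0.1:8080")
message, _ = engine.chat("Describe what we might see to the left of a snowy cabin.")
print(message)

# Local checkpoints use the same interface, e.g. (gated models need an HF token):
# engine = get_llm_engine("meta-llama/Meta-Llama-3-8B-Instruct",
#                         dtype="bfloat16", device="hpu", hf_token="<your token>")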