├── .dockerignore
├── .env
├── .gitignore
├── Dockerfile
│   └── Dockerfile-hpu
├── LICENSE
├── README.md
├── app
│   ├── .env
│   └── README.md
├── docker_build.sh
├── docker_run-hpu.sh
├── exp
│   ├── .gitignore
│   └── example
│       ├── 0.png
│       ├── 1.png
│       └── example_descriptor.json
├── media
│   ├── ISC.png
│   ├── blog.png
│   ├── library.jpg
│   ├── livingRoom.jpg
│   ├── pano.png
│   ├── pipeline.png
│   ├── snow.jpg
│   └── underwater.jpeg
├── mm_pano
│   ├── __init__.py
│   ├── lib
│   │   ├── Equirec2Perspec.py
│   │   ├── Perspec2Equirec.py
│   │   └── multi_Perspec2Equirec.py
│   ├── mmpano.py
│   ├── tgi_gaudi
│   │   ├── README.md
│   │   ├── run_tgi_gaudi.sh
│   │   └── test_tgi.sh
│   └── utils
│       ├── common.py
│       ├── image_utils.py
│       ├── llm_engines.py
│       └── model_utils.py
├── requirements-api.txt
├── requirements-hpu.txt
└── requirements.txt
/.dockerignore:
--------------------------------------------------------------------------------
1 | *
2 | !.env
3 | !Dockerfile
4 | !app
5 | !exp
6 | !mm_pano
7 | !requirements*
8 |
9 | mm_pano/tgi_gaudi/data/
10 |
--------------------------------------------------------------------------------
/.env:
--------------------------------------------------------------------------------
1 | IMAGE_NAME=mm_pano
2 | IMAGE_TAG=latest
3 | CONTAINER_NAME=mm_pano
4 |
5 | # Backend API
6 | API_HOST="127.0.0.1"
7 | API_PORT="8010"
8 | API_TIMEOUT=300
9 | API_RESPONSE_PANO_NAME="pano"
10 | API_RESPONSE_VIDEO_NAME="panorama_video"
11 |
12 | # Frontend UI
13 | WEBAPP_NAME="Language Model Assisted Generation of Images with Coherence"
14 | WEBAPP_HOST="127.0.0.1"
15 | WEBAPP_PORT="8100"
16 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | **/__pycache__
2 |
--------------------------------------------------------------------------------
/Dockerfile/Dockerfile-hpu:
--------------------------------------------------------------------------------
1 | # From vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest
2 | FROM vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest
3 |
4 | WORKDIR /app
5 |
6 | RUN apt-get update && \
7 | apt-get upgrade -y && \
8 | apt-get install -y \
9 | tmux
10 | #####################
11 | # Multimodal Panorama Generation source code
12 | #####################
13 | # Common requirements
14 | COPY requirements.txt /app/requirements.txt
15 | RUN pip3 install --upgrade pip && \
16 | pip3 install -r requirements.txt
17 |
18 | # Copy code
19 | COPY mm_pano /app/mm_pano
20 | COPY exp /app/exp
21 |
22 | ######################
23 | # Application frontend
24 | ######################
25 | COPY requirements-api.txt /app/requirements-api.txt
26 | RUN pip3 install -r requirements-api.txt
27 | COPY app /app/app
28 | COPY .env /app/.env
29 |
30 | ###########################
31 | # HPU specific requirements
32 | ###########################
33 | COPY requirements-hpu.txt /app
34 | RUN pip3 install -r requirements-hpu.txt
35 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # [CVPR 2024] Official implementation of the paper: "L-MAGIC: Language Model Assisted Generation of Images with Coherence"
2 | We present a novel method that can generate a 360-degree panorama from different types of zero-shot inputs (e.g., a single image, a text description, a hand-drawing, etc.). Our Huggingface space is now available. Feel free to try it out!
3 |
4 |
5 |

6 |
7 |
8 | - [Paper](https://arxiv.org/abs/2406.01843)
9 | - [Project Page](https://zhipengcai.github.io/MMPano/)
10 | - [Youtube Video](https://youtu.be/XDMNEzH4-Ec)
11 | - [Huggingface demo (now available!)](https://huggingface.co/spaces/MMPano/MMPano)
12 |
13 | ## Industrial Impact
14 |
15 | - Our work has been selected as **one of the 5 Intel featured live demos** at [ISC HPC 2024](https://www.intel.com/content/www/us/en/events/supercomputing.html).
16 | - Our work has been featured by [Intel Community Blog](https://community.intel.com/t5/Blogs/Tech-Innovation/Artificial-Intelligence-AI/Advancing-Gen-AI-on-Intel-Gaudi-AI-Accelerators-with-Multi-Modal/post/1603746)!
17 | - Our work has been featured by [Intel Labs Linkedin](https://www.linkedin.com/feed/update/urn:li:activity:7203797143831076864/)!
18 |
19 |
20 |

21 |
22 |
23 |

24 |
25 |
26 | ## 📌 Reference
27 |
28 | ```bibtex
29 | @inproceedings{
30 | zhipeng2024lmagic,
31 | title={L-MAGIC: Language Model Assisted Generation of Images with Coherence},
32 | author={Zhipeng Cai and Matthias Müller and Reiner Birkl and Diana Wofk and Shao-Yen Tseng and JunDa Cheng and Gabriela Ben-Melech Stan and Vasudev Lal and Michael Paulitsch},
33 | booktitle={The IEEE/CVF Conference on Computer Vision and Pattern Recognition},
34 | year={2024}
35 | }
36 | ```
37 |
38 | ## ⭐️ Show Your Support
39 |
40 | If you find this project helpful or interesting, please consider giving it a star! Your support is greatly appreciated and helps others discover the project.
41 |
42 | ## Environment
43 |
44 | This code has been tested on Linux with Python 3.9. It should also be compatible with other Python versions.
45 |
46 |
47 | ## Run on Intel Gaudi
48 |
49 | This codebase has been developed and deployed on Intel Gaudi on the Intel Developer Cloud.
50 |
51 | - [Intel Gaudi](https://habana.ai/)
52 | - [Intel Developer Cloud](https://www.intel.com/content/www/us/en/developer/tools/devcloud/overview.html)
53 |
54 |
55 | #### Setup Docker environment
56 | ```bash
57 | # Build docker image
58 | ./docker_build.sh
59 |
60 | # Start the container. Following the instructions in the script, you may modify
61 | # `HABANA_VISIBLE_DEVICES` and `HABANA_VISIBLE_MODULES` to run on a different Gaudi device.
62 | ./docker_run-hpu.sh
63 | ```
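The run script drops you into an interactive shell inside the container. If you later need an additional shell in the same container (for example, to run the TGI server and the generation command side by side), you can attach to it. This is a minimal sketch, assuming the default names from `.env`, which resolve to a container called `mm_pano_hpu`:

```bash
# Open another shell in the already running container
# (container name = ${CONTAINER_NAME}_hpu from .env, i.e. mm_pano_hpu by default)
docker exec -it mm_pano_hpu bash
```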
64 |
65 |
66 | ## Run on other device
67 |
68 | You can also run this codebase on an NVIDIA GPU, after setting up a proper NVIDIA environment with PyTorch installed (e.g., `conda`, `venv`, `docker`, etc.).
69 |
70 | Install the necessary packages by running the following command:
71 |
72 | ```bash
73 | pip install -r requirements.txt
74 | ```
75 |
76 |
77 | ## Run the code
78 | #### Note
79 | - If you are running on Gaudi, the first runs will be slower because Gaudi requires at least 2 warmup cycles. If you want to build your own application using this codebase, please warm up the Gaudi device at least 2 times.
80 |
81 | - The best performance is enabled by using ChatGPT as the LLM controller, which requires you to apply for an [OpenAI API key](https://platform.openai.com/docs/overview).
82 |
83 | - If you are in a region that cannot access the ChatGPT API, we also provide a way to use a free, open-source LLM controller (e.g., Llama3). Please see below for instructions on how to enable it. You may need to set `HF_TOKEN` or pass a Huggingface token. Feel free to also contribute to the code and enable other LLMs.
84 |
85 | #### (Optional) Start a TGI LLM server
86 |
87 | If you want to use TGI for LLM serving, the code provides a script that pulls the Docker image and starts a TGI LLM server on Gaudi. Once TGI is running, please make sure to pass `--llm_model_name tgi` when running the MM Pano command line in the next step.
88 |
89 | We have only validated the listed LLM models (`meta-llama/Meta-Llama-3-8B-Instruct`, `mistralai/Mistral-7B-Instruct-v0.2`). We encourage users to try out new models and add them to the supported list.
90 |
91 | ```bash
92 | # Modify the model name and pass a Huggingface token if needed. You can also change `num_shard` if you like.
93 | vi mm_pano/tgi_gaudi/run_tgi_gaudi.sh
94 |
95 | # Pull and start the TGI-Gaudi in the container
96 | (cd mm_pano/tgi_gaudi && ./run_tgi_gaudi.sh)
97 | ```
98 |
99 | If you want to run TGI on another device, please make sure it is reachable at the default TGI address the code expects, `http://127.0.0.1:8080`.
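Whichever host runs the server, a quick sanity check before starting the pipeline is to send a single generate request to the endpoint, mirroring `mm_pano/tgi_gaudi/test_tgi.sh` (adjust the address if your server does not listen on the default `127.0.0.1:8080`):

```bash
curl http://127.0.0.1:8080/generate \
    -X POST \
    -H 'Content-Type: application/json' \
    -d '{"inputs":"What is deep learning?","parameters":{"max_new_tokens":32}}'
```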
100 |
101 |
102 | #### Command
103 | There are several options when running the code. Below is a simple example for:
104 |
105 | - image-to-panorama task
106 | - ChatGPT LLM (GPT4)
107 | - Gaudi accelerator as the hardware
108 |
109 | ```bash
110 | python3 mm_pano/mmpano.py \
111 | --init_image exp/example/0.png \
112 | --output_folder exp/outputs \
113 | --dtype bfloat16 --device hpu \
114 | --llm_model_name gpt-4 \
115 |     --api_key <your_openai_api_key> \
116 |     --save_pano_img \
117 |     --gen_video  # --save_pano_img saves the generated panorama image; --gen_video also generates and saves the video
118 | ```
119 |
120 | To change the setup (a combined example follows this list):
121 | - perform text-to-panorama: add `--text2pano` and change `--init_image exp/example/0.png` to `--init_prompt 'maple autumn forest'`. `--init_prompt` can also be used together with `--init_image` to provide a user-specified scene description.
122 | - use other LLMs: change `--llm_model_name gpt-4` to `--llm_model_name [other LLM name]`. Currently the available choices are `"gpt-4", "gpt-3.5-turbo", "meta-llama/Meta-Llama-3-8B-Instruct", "mistralai/Mistral-7B-Instruct-v0.2", "tgi"`,
123 | where `tgi` can be a [TGI Gaudi](https://github.com/huggingface/tgi-gaudi) or [TGI](https://github.com/huggingface/text-generation-inference) server to run bigger models such as Llama3-70B. Note that `--api_key` is only used for the GPT models.
124 | - use CUDA: change `--device hpu` to `--device cuda`
125 | - specify the camera intrinsics for the input image: add `--intrinsic float,float,float,float` (four comma-separated values, no spaces)
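For example, a text-to-panorama run on an NVIDIA GPU that uses a TGI server as the LLM controller could look like the sketch below (assuming the TGI server from the previous section is already running on its default port):

```bash
python3 mm_pano/mmpano.py \
    --text2pano \
    --init_prompt 'maple autumn forest' \
    --output_folder exp/outputs \
    --dtype float16 --device cuda \
    --llm_model_name tgi \
    --save_pano_img \
    --gen_video
```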
126 |
127 | ## Results (see more on our project page and paper)
128 |
129 | After running the code, you will find in the output folder (`exp/outputs`) a panoramic image "pano.png" (see below for examples) and an immersive video "video.mp4".
130 |
131 |
132 |

133 |
134 |
135 |
136 |

137 |
138 |
139 |
140 |

141 |
142 |
143 |
144 |

145 |
146 |
147 |
148 |

149 |
150 |
151 |
152 | ## Contact
153 |
154 | Feel free to send an email to Zhipeng (czptc2h@gmail.com) or Joey (Tien Pei) Chou (joey.t.p.chou@gmail.com) if you have any questions and comments.
155 |
156 | ## 📈 Star History
157 |
158 | [Star History Chart](https://star-history.com/#IntelLabs/MMPano)
159 |
--------------------------------------------------------------------------------
/app/.env:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IntelLabs/MMPano/b176d26bc3311584e746fe88135e8d5e793d3107/app/.env
--------------------------------------------------------------------------------
/app/README.md:
--------------------------------------------------------------------------------
1 | Placeholder for Multimodal Panorama API
2 |
--------------------------------------------------------------------------------
/docker_build.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 | while [ "$1" != "" ];
3 | do
4 | case $1 in
5 | -d | --device)
6 | DEVICE=$2
7 | shift
8 | ;;
9 | -h | --help )
10 | echo "Build the docker image for Multimodal Panorama Generation"
11 | echo "Usage: docker_build.sh [OPTIONS]"
12 | echo "OPTION includes:"
13 | echo " -d | --device - Supported device [hpu]"
14 | exit
15 | ;;
16 | * )
17 | echo "Invalid option: $1"
18 | echo "Please do 'docker_build.sh -h'"
19 | exit
20 | ;;
21 | esac
22 | shift
23 | done
24 |
25 | CUR_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
26 | source ${CUR_DIR}/.env
27 |
28 | DEVICE="${DEVICE:-hpu}"
29 | DOCKERFILE=${CUR_DIR}/Dockerfile/Dockerfile-${DEVICE}
30 |
31 | cmd="DOCKER_BUILDKIT=0 docker build . -f ${DOCKERFILE} -t ${IMAGE_NAME}_${DEVICE}:${IMAGE_TAG}"
32 | echo $cmd
33 | eval $cmd
34 |
--------------------------------------------------------------------------------
/docker_run-hpu.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 | # HABANA_VISIBLE_DEVICES, HABANA_VISIBLE_MODULES
3 | # 0, 2
4 | # 1, 6
5 | # 2, 0
6 | # 3, 7
7 | # 4, 1
8 | # 5, 4
9 | # 6, 3
10 | # 7, 5
11 |
12 | CUR_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
13 | source ${CUR_DIR}/.env
14 |
15 | DEVICE_IDX=0
16 | MODULES_IDX=2
17 | IMAGE_NAME=${IMAGE_NAME}_hpu:${IMAGE_TAG}
18 | CONTAINER_NAME=${CONTAINER_NAME}_hpu
19 |
20 | OUTPUT_DIR_LOCAL=./exp
21 | OUTPUT_DIR_CONTAINER=/app/outputs
22 | docker run -it \
23 | --expose=${API_PORT} \
24 | --expose=${WEBAPP_PORT} \
25 | -v ${OUTPUT_DIR_LOCAL}:${OUTPUT_DIR_CONTAINER} \
26 | --env=DEVICE=hpu \
27 | --env=HABANA_VISIBLE_DEVICES=all \
28 | --env=OMPI_MCA_btl_vader_single_copy_mechanism=none \
29 | --cap-add=sys_nice \
30 | --network=host \
31 | --restart=no \
32 | --runtime=habana \
33 | --shm-size=64g \
34 | --name ${CONTAINER_NAME} \
35 | -t ${IMAGE_NAME}
36 |
--------------------------------------------------------------------------------
/exp/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 |
--------------------------------------------------------------------------------
/exp/example/0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IntelLabs/MMPano/b176d26bc3311584e746fe88135e8d5e793d3107/exp/example/0.png
--------------------------------------------------------------------------------
/exp/example/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IntelLabs/MMPano/b176d26bc3311584e746fe88135e8d5e793d3107/exp/example/1.png
--------------------------------------------------------------------------------
/exp/example/example_descriptor.json:
--------------------------------------------------------------------------------
1 | {
2 | "init_image": "exp/example/0.png",
3 | "init_prompt": "a forest",
4 | "generated_text_details": "a road in autumn",
5 | "message": "View 1: We see a narrow, pebbled forest path.\nView 2: We see a rusted, overgrown log bridge crossing a creek.\nView 3: We see an antique lamppost, partially hidden by leaves.\nView 4: We see an old wooden cabin, shrouded in shadows.\nView 5: We see a past-its-prime swing hanging from an oak.\nView 6: We see a moss-laden stone bench facing a pond.",
6 | "message_main_obj": "We see: fallen leaves.\nWe see: a path.",
7 | "message_topdown": "We see: Canopy\nWe see: Leaf-covered path",
8 | "question_for_llm_repeat": "Do we often see multiple a path in a scene with a forest? Just say 'yes' or 'no' with all lower case letters",
9 | "description_no_obj": "'a forest'",
10 | "major_obj_number": 2,
11 | "is_repeated": [true, true]
12 | }
13 |
--------------------------------------------------------------------------------
/media/ISC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IntelLabs/MMPano/b176d26bc3311584e746fe88135e8d5e793d3107/media/ISC.png
--------------------------------------------------------------------------------
/media/blog.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IntelLabs/MMPano/b176d26bc3311584e746fe88135e8d5e793d3107/media/blog.png
--------------------------------------------------------------------------------
/media/library.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IntelLabs/MMPano/b176d26bc3311584e746fe88135e8d5e793d3107/media/library.jpg
--------------------------------------------------------------------------------
/media/livingRoom.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IntelLabs/MMPano/b176d26bc3311584e746fe88135e8d5e793d3107/media/livingRoom.jpg
--------------------------------------------------------------------------------
/media/pano.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IntelLabs/MMPano/b176d26bc3311584e746fe88135e8d5e793d3107/media/pano.png
--------------------------------------------------------------------------------
/media/pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IntelLabs/MMPano/b176d26bc3311584e746fe88135e8d5e793d3107/media/pipeline.png
--------------------------------------------------------------------------------
/media/snow.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IntelLabs/MMPano/b176d26bc3311584e746fe88135e8d5e793d3107/media/snow.jpg
--------------------------------------------------------------------------------
/media/underwater.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IntelLabs/MMPano/b176d26bc3311584e746fe88135e8d5e793d3107/media/underwater.jpeg
--------------------------------------------------------------------------------
/mm_pano/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IntelLabs/MMPano/b176d26bc3311584e746fe88135e8d5e793d3107/mm_pano/__init__.py
--------------------------------------------------------------------------------
/mm_pano/lib/Equirec2Perspec.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import cv2
4 | import numpy as np
5 |
6 | class Equirectangular:
7 | def __init__(self, img_name, text2light=False):
8 | if isinstance(img_name, str):
9 | self._img = cv2.imread(img_name, cv2.IMREAD_COLOR)
10 | else:
11 | self._img = img_name
12 | if text2light:
13 | self._img = np.roll(self._img, -60, axis=0)
14 |
15 | [self._height, self._width, _] = self._img.shape
16 |
17 |
18 | def GetPerspective(self, FOV, THETA, PHI, height, width):
19 | #
20 | # THETA is left/right angle, PHI is up/down angle, both in degree
21 | #
22 |
23 | equ_h = self._height
24 | equ_w = self._width
25 | equ_cx = (equ_w - 1) / 2.0
26 | equ_cy = (equ_h - 1) / 2.0
27 |
28 | wFOV = FOV
29 | hFOV = float(height) / width * wFOV
30 |
31 | w_len = np.tan(np.radians(wFOV / 2.0))
32 | h_len = np.tan(np.radians(hFOV / 2.0))
33 |
34 |
35 | x_map = np.ones([height, width], np.float32)
36 | y_map = np.tile(np.linspace(-w_len, w_len,width), [height,1])
37 | z_map = -np.tile(np.linspace(-h_len, h_len,height), [width,1]).T
38 |
39 | D = np.sqrt(x_map**2 + y_map**2 + z_map**2)
40 | xyz = np.stack((x_map,y_map,z_map),axis=2)/np.repeat(D[:, :, np.newaxis], 3, axis=2)
41 |
42 | y_axis = np.array([0.0, 1.0, 0.0], np.float32)
43 | z_axis = np.array([0.0, 0.0, 1.0], np.float32)
44 | [R1, _] = cv2.Rodrigues(z_axis * np.radians(THETA))
45 | [R2, _] = cv2.Rodrigues(np.dot(R1, y_axis) * np.radians(-PHI))
46 |
47 | xyz = xyz.reshape([height * width, 3]).T
48 | xyz = np.dot(R1, xyz)
49 | xyz = np.dot(R2, xyz).T
50 | lat = np.arcsin(xyz[:, 2])
51 | lon = np.arctan2(xyz[:, 1] , xyz[:, 0])
52 |
53 | lon = lon.reshape([height, width]) / np.pi * 180
54 | lat = -lat.reshape([height, width]) / np.pi * 180
55 |
56 | lon = lon / 180 * equ_cx + equ_cx
57 | lat = lat / 90 * equ_cy + equ_cy
58 |
59 |
60 |
61 | persp = cv2.remap(self._img, lon.astype(np.float32), lat.astype(np.float32), cv2.INTER_CUBIC, borderMode=cv2.BORDER_WRAP)
62 | return persp
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
--------------------------------------------------------------------------------
/mm_pano/lib/Perspec2Equirec.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import cv2
4 | import numpy as np
5 |
6 | class Perspective:
7 | def __init__(self, img_name , FOV, THETA, PHI ):
8 | if isinstance(img_name, str):
9 | self._img = cv2.imread(img_name, cv2.IMREAD_COLOR)
10 | else:
11 | self._img = img_name
12 | [self._height, self._width, _] = self._img.shape
13 | self.wFOV = FOV
14 | self.THETA = THETA
15 | self.PHI = PHI
16 | self.hFOV = float(self._height) / self._width * FOV
17 |
18 | self.w_len = np.tan(np.radians(self.wFOV / 2.0))
19 | self.h_len = np.tan(np.radians(self.hFOV / 2.0))
20 |
21 |
22 |
23 | def GetEquirec(self,height,width):
24 | #
25 | # THETA is left/right angle, PHI is up/down angle, both in degree
26 | #
27 |
28 | x,y = np.meshgrid(np.linspace(-180, 180,width),np.linspace(90,-90,height))
29 |
30 | x_map = np.cos(np.radians(x)) * np.cos(np.radians(y))
31 | y_map = np.sin(np.radians(x)) * np.cos(np.radians(y))
32 | z_map = np.sin(np.radians(y))
33 |
34 | xyz = np.stack((x_map,y_map,z_map),axis=2)
35 |
36 | y_axis = np.array([0.0, 1.0, 0.0], np.float32)
37 | z_axis = np.array([0.0, 0.0, 1.0], np.float32)
38 | [R1, _] = cv2.Rodrigues(z_axis * np.radians(self.THETA))
39 | [R2, _] = cv2.Rodrigues(np.dot(R1, y_axis) * np.radians(-self.PHI))
40 |
41 | R1 = np.linalg.inv(R1)
42 | R2 = np.linalg.inv(R2)
43 |
44 | xyz = xyz.reshape([height * width, 3]).T
45 | xyz = np.dot(R2, xyz)
46 | xyz = np.dot(R1, xyz).T
47 |
48 | xyz = xyz.reshape([height , width, 3])
49 | inverse_mask = np.where(xyz[:,:,0]>0,1,0)
50 |
51 | xyz[:,:] = xyz[:,:]/np.repeat(xyz[:,:,0][:, :, np.newaxis], 3, axis=2)
52 |
53 |
54 | lon_map = np.where((-self.w_len 0 and i < 5:
417 | prompt = "Question: is there a {} in this picture (just say yes or no)? Answer:".format(extract_words_after_we_see_withFailv2(lines_major_obj[obj_id]))
418 | inputs = processor(image_inpaint, text=prompt, return_tensors="pt").to(device, torch_dtype)
419 | generated_ids = img2text_pipe.generate(**inputs, max_new_tokens=15)
420 | generated_text_repeat = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
421 | print("repeated check = {}".format(generated_text_repeat))
422 | if "yes" in generated_text_repeat:
423 | print(" we see {} in the inpainted view".format(extract_words_after_we_see_withFailv2(lines_major_obj[obj_id])))
424 | pure_color_bg = True
425 | iter_count += (1.0/num_false)
426 | if not is_repeated_all and iter_count >= 20:
427 | is_repeated_all = True
428 | print("reaching maximum checking iterations, there is a conflict, setting is_repeated to true")
429 | inpainted_cv2 = pil_to_cv2(image_inpaint)
430 |
431 | # we do the same merging step as the
432 | # 1. compute the weight mask for the warped image
433 | dist2zero = distance_transform_edt(mask_accumulate)
434 |
435 | # 2. build weight map according to dist2zero
436 | weight_map_cinpaint = np.ones(mask_accumulate.shape).astype(np.float32)
437 | weight_map_cinpaint[dist2zero <= cinpaint_th] = dist2zero[dist2zero <= cinpaint_th] / cinpaint_th
438 |
439 | # Save image at each step
440 | if sr_pipe is not None:
441 | inpainted_cv2_merge = warped_image_SR * weight_map_cinpaint[:, :, np.newaxis] + inpainted_cv2 * (1 - weight_map_cinpaint)[:, :, np.newaxis]
442 | # filename = os.path.join(output_folder, f"inpaint_step_SR_{i}.png")
443 | else:
444 | inpainted_cv2_merge = warped_image * weight_map_cinpaint[:, :, np.newaxis] + inpainted_cv2 * (1 - weight_map_cinpaint)[:, :, np.newaxis]
445 | # filename = os.path.join(output_folder, f"inpaint_step_{i}.png")
446 | filename = os.path.join(output_folder, f"inpaint_step_{i}.png")
447 | cv2.imwrite(filename, inpainted_cv2_merge)
448 |
449 | # Perform super-resolution on the inpainted_cv2 (not on inpainted_cv2_SR to prevent noise amplification)
450 | if sr_pipe is not None:
451 | # image_inpaint_SR = cv2_to_pil(inpainted_cv2.astype(np.uint8))
452 | image_inpaint_SR = cv2_to_pil(inpainted_cv2_merge.astype(np.uint8))
453 | image_inpaint_SR = sr_pipe(prompt=orig_prompt, negative_prompt=orig_negative_prompt, image=image_inpaint_SR, num_inference_steps=sr_inf_step).images[0]
454 | image_inpaint_SR_cv2 = pil_to_cv2(image_inpaint_SR)
455 | filename = os.path.join(output_folder, f"inpaint_step_SR_{i}.png")
456 | cv2.imwrite(filename, image_inpaint_SR_cv2)
457 |
458 | image_list.append(inpainted_cv2)
459 | if sr_pipe is not None:
460 | image_SR_list.append(image_inpaint_SR_cv2)
461 | pose_list.append(pose)
462 |
463 | return 0
464 |
465 |
466 | def parse_args():
467 | def list_of_num(arg):
468 | return list(map(float, arg.split(',')))
469 |
470 | parser = argparse.ArgumentParser(description='Multimodal Panorama Generation')
471 | parser.add_argument('--device', type=str, default="hpu", choices=["cpu", "cuda", "hpu"], help="Target HW device for Diffusion and BLIP models")
472 | parser.add_argument('--dtype', type=str, default="float32", choices=["float16", "float32", "bfloat16"], help="Datatype for model inference.")
473 | parser.add_argument('--init_prompt', type=str, help='Prompt which will be used for text to panorama generation.')
474 | parser.add_argument('--init_image', type=str, help='Path to a image which will be used for image to panorama generation.')
475 | parser.add_argument('--output_folder', type=str, default='./exp/output')
476 | parser.add_argument('--cpu_offload', action="store_true", help="Flag if user want to offload StableDiffusion pipeline to CPU")
477 |
478 | parser.add_argument('--text2pano', action="store_true", help="Flag if user want to do text-to-panorama. Else will do image-to-panorama.")
479 | parser.add_argument('--llm_model_name', type=str, default="mistralai/Mistral-7B-Instruct-v0.2",
480 | choices=_VALIDATED_MODELS, help='Name of LLM model for text generation.')
481 | parser.add_argument('--api_key', type=str, default="", help="your OpenAI API key")
482 | parser.add_argument('--intrinsic', type=list_of_num, default=[1.11733848262, 1.11733848262, 0.5, 0.5], help="Intrinsic.")
483 | parser.add_argument('--panorama_descriptor', type=str, help='Path to a descriptor JSON that will be used for panorama generation.')
484 |
485 | parser.add_argument('--do_upscale', action="store_true", help="Flag if user want to use super resolution to upscale the generated images")
486 |     parser.add_argument('--major_obj_number', type=int, default=2, choices=[1, 2], help="How many major objects do we want to consider so that they don't repeat?")
487 | parser.add_argument('--sr_inf_step', type=int, default=35, help='number of inference steps for the super resolution model')
488 |
489 | parser.add_argument('--inpaint_model_name', type=str, default="stabilityai/stable-diffusion-2-inpainting",
490 | help="Diffusion model name")
491 | parser.add_argument('--blip_model_name', type=str, default="Salesforce/blip2-flan-t5-xl",
492 | help="BLIP model name")
493 | parser.add_argument('--upscaler_model_name', type=str, default="stabilityai/stable-diffusion-x4-upscaler",
494 | help="Super resolution upscaler model name")
495 |
496 | # Generate panorama and video
497 | parser.add_argument('--save_pano_img', action="store_true", help="Flag if user want to save the panorama image.")
498 | parser.add_argument('--gen_video', action="store_true", help="Flag if user want to generate and save a video of panorama view.")
499 | parser.add_argument('--video_codec', type=str, default="MP4V", choices=["MP4V", "VP09"],
500 | help="Video codec used to generate the video")
501 | args = parser.parse_args()
502 |
503 | # Validate arguments
504 | if len(args.intrinsic) != 4:
505 | raise RuntimeError(f"--intrinsic has to be 4 floating point number. Got {args.intrinsic}")
506 |
507 | return args
508 |
509 |
510 | def gen_multiviews(
511 | device: str,
512 | dtype: str = "float32",
513 | output_folder: str = "./outputs",
514 | init_prompt: Optional[str] = None,
515 | init_image: Optional[Union[str, Image.Image]] = None,
516 | cpu_offload: bool = False,
517 | # Text generation
518 | text2pano: bool = False,
519 | llm_model_name: str = "mistralai/Mistral-7B-Instruct-v0.2",
520 | api_key: str = "",
521 | panorama_descriptor: Optional[Union[str, Dict[str, str]]] = None, # None, path to JSON, or a dictionary
522 | use_predefine_llm_descriptor: bool = False,
523 | llm_engine = None,
524 | # Panorama generation
525 | intrinsic: List[float] = [1.11733848262, 1.11733848262, 0.5, 0.5],
526 | do_upscale: bool = False,
527 | major_obj_number: int = 2,
528 | sr_inf_step: int = 35,
529 | inpaint_model_name: Optional[str] = "stabilityai/stable-diffusion-2-inpainting",
530 | blip_model_name: Optional[str] = "Salesforce/blip2-flan-t5-xl",
531 | upscaler_model_name: Optional[str] = "stabilityai/stable-diffusion-x4-upscaler",
532 | text2img_model_name: Optional[str] = "stabilityai/stable-diffusion-2-base",
533 | # Pre-loaded pipelines, if any
534 | inpaint_pipe: Optional = None,
535 | processor: Optional = None,
536 | img2text_pipe: Optional = None,
537 | sr_pipe: Optional = None,
538 | text2img_pipe: Optional = None,
539 | **kwargs,
540 | ):
541 |
542 | if is_on_hpu(device) and dtype == "float16":
543 | # Force dtype to be bfloat16 on HPU
544 | dtype = "bfloat16"
545 |
546 | print("===========================================================================")
547 | print(f"Running Multimodal Panorama Generation on {device} in {dtype}.")
548 | print("===========================================================================")
549 |
550 | ##################
551 | # Parse descriptor
552 | ##################
553 | # If given, get the pre-generated LLM descriptions
554 | if panorama_descriptor is not None and use_predefine_llm_descriptor:
555 | if isinstance(panorama_descriptor, dict):
556 | panorama_descriptor = Descriptor(**panorama_descriptor)
557 | elif isinstance(panorama_descriptor, str) and os.path.isfile(panorama_descriptor):
558 | panorama_descriptor = Descriptor.from_json(panorama_descriptor)
559 | elif not isinstance(panorama_descriptor, Descriptor):
560 | raise RuntimeError(f"panorama_descriptor should be a JSON file, Dictionary, or Descriptor type.")
561 |
562 | # If only init_prompt is given in the panorama_descriptor, do the text-to-panorama
563 | if not panorama_descriptor.init_image:
564 | assert panorama_descriptor.init_prompt, "At least one of [`init_prompt`, `init_image`] must be given"
565 | text2pano = True
566 |
567 | elif panorama_descriptor is None and use_predefine_llm_descriptor:
568 | raise RuntimeError(f"`panorama_descriptor` must be provided when setting `use_predefine_llm_descriptor=True`")
569 |
570 | ######################
571 | # Create output folder
572 | ######################
573 | if os.path.exists(output_folder):
574 | shutil.rmtree(output_folder)
575 | os.makedirs(output_folder, exist_ok = True)
576 | print(f"Save all outputs to {output_folder}")
577 |
578 | #############################
579 | # Load pipelines if not given
580 | #############################
581 | # Inpainting pipeline
582 | if inpaint_pipe is None:
583 | inpaint_pipe = load_diffusion_model(inpaint_model_name, device=device, dtype=dtype, cpu_offload=cpu_offload)
584 |
585 | # Image-to-text pipeline
586 | if processor is None and img2text_pipe is None:
587 | processor, img2text_pipe = load_blip_model_and_processor(blip_model_name, device=device, dtype=dtype)
588 | elif (processor is not None and img2text_pipe is None) or (processor is None and img2text_pipe is not None):
589 | raise RuntimeError(
590 | "Processor and BLIP model has to be set or not set at the same time. "
591 | f"Got processor={processor}, img2text_pipe={img2text_pipe}."
592 | )
593 |
594 | # Super resolution
595 | if sr_pipe is None and do_upscale:
596 | # NOTE: Skip upscaler in light version
597 | sr_pipe = load_upscaler_model(upscaler_model_name, device, dtype)
598 |
599 | # Text-to-image
600 | if text2pano and text2img_pipe is None:
601 | # Load Diffusion pipeline
602 | text2img_pipe = load_diffusion_model(text2img_model_name, device=device, dtype=dtype, cpu_offload=cpu_offload)
603 |
604 | # Text generation
605 | if llm_engine is None:
606 | llm_engine = get_llm_engine(llm_model_name, device=device, dtype=dtype, openai_key=api_key)
607 |
608 | ###########################
609 | # Text or Image to Panorama
610 | ###########################
611 | init_prompt = init_prompt if panorama_descriptor is None else panorama_descriptor.init_prompt
612 | init_image = init_image if panorama_descriptor is None else panorama_descriptor.init_image
613 |
614 | t_begin = time.time()
615 | # Use given init_image or generate an init_image from the init_prompt.
616 | # This will be used for generating panorama
617 | if text2pano:
618 | print(f"Generating init image with prompt={init_prompt} ...")
619 | init_image = text2img_pipe(init_prompt, num_inference_steps=25).images[0]
620 | init_image = cv2.cvtColor(np.array(init_image), cv2.COLOR_RGB2BGR)
621 | elif init_image is not None:
622 | if isinstance(init_image, str):
623 | # init_image is a path to a file
624 | print(f"Loading init image from {init_image}")
625 | init_image = cv2.imread(init_image, cv2.IMREAD_COLOR)
626 | elif isinstance(init_image, Image.Image):
627 | init_image = cv2.cvtColor(np.array(init_image), cv2.COLOR_RGB2BGR)
628 | elif isinstance(init_image, np.ndarray):
629 | pass
630 | else:
631 | # TODO(Joey Chou): Add error message
632 | raise RuntimeError("Please do text2pano with a given init_prompt, or pass a init_image to do image to pano")
633 |
634 | # check whether the intrinsic matrix exist
635 | with torch.inference_mode():
636 | fail = True
637 | while fail:
638 | fail = create_panorama(
639 | init_image, intrinsic, output_folder, processor, img2text_pipe, inpaint_pipe, sr_pipe, device,
640 | sr_inf_step, init_prompt=init_prompt, major_obj_number=major_obj_number,
641 | panorama_descriptor=panorama_descriptor, llm_engine=llm_engine
642 | )
643 | print(f"Total runtime: {time.time() - t_begin}")
644 |
645 |
646 | def _gen_pano_outputs(images: List[np.ndarray],
647 | out_dir: str,
648 | rotation_degrees: List[int],
649 | fov: float = 99.9169018, gen_video: bool = False,
650 | save_pano_img: bool = True,
651 | # Video related
652 | video_size: Tuple[int, int] = (512, 512), video_codec: str = "MP4V",
653 | new_pano: Optional = None):
654 | """
655 | To make video works with gradio, please use the setup as below:
656 | * interval_deg = 1.0
657 | * fps: = 60
658 | * video_codec = "VP09"
659 |
660 | For other application that works with mp4v:
661 | * interval_deg = 0.5
662 | * fps = 60
663 | * video_codec = "MP4V"
664 | """
665 |
666 | if new_pano is None:
667 | _output_image_name = "pano.png"
668 |
669 | ee = m_P2E.Perspective(
670 | images,
671 | [
672 | [fov, rotation_degrees[0], 0], [fov, rotation_degrees[1], 0], [fov, rotation_degrees[2], 0], [fov, rotation_degrees[3], 0],
673 | [fov, rotation_degrees[4], 0], [fov, rotation_degrees[5], 0], [fov, rotation_degrees[6], 0]
674 | ]
675 | )
676 |
677 | new_pano = ee.GetEquirec(2048, 4096)
678 |
679 | if save_pano_img:
680 | # Output panorama image
681 | cv2.imwrite(os.path.join(out_dir, _output_image_name), new_pano.astype(np.uint8)[540:-540])
682 |
683 | if gen_video:
684 | if video_codec.upper() == "MP4V":
685 | codec_config = mp4vCodec()
686 | elif video_codec.upper() == "VP09":
687 | codec_config = vp90Codec()
688 | elif video_codec.upper() == "MP4":
689 | codec_config = mp4Codec()
690 | else:
691 | raise RuntimeError(f"Only support codec ['.MP4V', 'VP09']. Got {video_codec}")
692 |
693 | output_video_name = f"video{codec_config.video_format}"
694 | interval_deg = codec_config.interval_deg
695 |
696 | video_codec = codec_config.video_codec
697 | fps = codec_config.fps
698 |
699 | fov = 86
700 | num_frames = int(360 / interval_deg)
701 |
702 | equ = E2P.Equirectangular(new_pano)
703 | img = equ.GetPerspective(fov, 0, 0, *video_size) # Specify parameters(FOV, theta, phi, height, width)
704 |
705 | margin = 0
706 | if margin > 0:
707 | img = img[margin:-margin]
708 | size = (img.shape[1], img.shape[0])
709 |
710 | save_video_path = os.path.join(out_dir, output_video_name)
711 | print("save_video_path = ", save_video_path, "; ", video_codec, ", ", fps, ", ", size, ", video_size = ", video_size)
712 | out = cv2.VideoWriter(save_video_path, cv2.VideoWriter_fourcc(*video_codec), fps, size)
713 |
714 | for i in tqdm(range(num_frames)):
715 | # Process image
716 | deg = i * interval_deg
717 | img = equ.GetPerspective(fov, deg, 0, *video_size) # Specify parameters(FOV, theta, phi, height, width)
718 | if margin > 0:
719 | img = img[margin:-margin]
720 | img = np.clip(img, 0, 255).astype(np.uint8)
721 |
722 | # Write to video
723 | out.write(img)
724 | out.release()
725 |
726 | # ffmpeg -y -i /root/app/rest_api/api_output/demo/video.mp4v /root/app/rest_api/api_output/demo/video.avc1
727 | return new_pano
728 |
729 |
730 | def gen_pano(images: Optional[List[np.ndarray]] = None,
731 | output_folder: Optional[str] = None,
732 | do_upscale: bool = False,
733 | save_pano_img: bool = True,
734 | gen_video: bool = True,
735 | video_codec: str = "MP4V",
736 | pano: Optional = None,
737 | **kwargs,
738 | ):
739 | # suffix = '_SR' if do_upscale else ""
740 | suffix = ""
741 | image_names = ["input_resized" + suffix + ".png"]
742 | for i in range(6):
743 | image_names.append("inpaint_step" + suffix + "_{}.png".format(i))
744 |
745 | rotations = [create_rotation_matrix(0, 0, 0).T]
746 | rotation_degrees = [0]
747 | max_step = 6
748 | step_size = 41
749 | vortex_list = generate_left_right_fullPano_pattern(max_step=max_step, step_size=step_size, final_step=55)
750 | for i in range(6):
751 | rotations.append(create_rotation_matrix(vortex_list[i][0], vortex_list[i][1], vortex_list[i][2]).T)
752 | rotation_degrees.append(vortex_list[i][1])
753 |
754 | LR_images = []
755 | # read individual images out
756 | for image_name in tqdm(image_names):
757 | LR_images.append(cv2.imread(os.path.join(output_folder, image_name)))
758 |
759 | return _gen_pano_outputs(LR_images, output_folder, rotation_degrees, save_pano_img=save_pano_img, gen_video=gen_video, video_codec=video_codec, new_pano=pano)
760 |
761 |
762 | if __name__ == "__main__":
763 | args = parse_args()
764 |
765 | # Generate multiview scenes
766 | gen_multiviews(**args.__dict__)
767 |
768 | # Generate panorama view and optionally generate video
769 | gen_pano(**args.__dict__)
770 |
--------------------------------------------------------------------------------
/mm_pano/tgi_gaudi/README.md:
--------------------------------------------------------------------------------
1 | ## Run TGI-Gaudi for LLM Serving
2 |
3 | This is a short instruction for running a TGI-Gaudi server for LLM serving. For more information, please check [TGI-Gaudi](https://github.com/huggingface/tgi-gaudi).
4 |
5 | #### Start a Llama3 TGI-Gaudi serving
6 | ```bash
7 | ./run_tgi_gaudi.sh
8 | ```
9 |
10 | #### Quick test
11 | ```bash
12 | ./test_tgi.sh
13 | ```
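The launch script names the serving container `tgi-gaudi`, so once you are done you can stop and remove it. A minimal cleanup sketch, assuming the default container name:

```bash
# Stop and remove the TGI-Gaudi serving container
docker stop tgi-gaudi && docker rm tgi-gaudi
```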
14 |
--------------------------------------------------------------------------------
/mm_pano/tgi_gaudi/run_tgi_gaudi.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 | model=meta-llama/Meta-Llama-3-8B-Instruct
3 | CONTAINER_NAME=tgi-gaudi
4 | volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
5 | num_shard=2
6 | sharded=true
7 | max_input_length=2048
8 | max_total_tokens=4096
9 |
10 | # Usage: text-generation-launcher <
11 | #    --model-id <MODEL_ID> |
12 | #    --revision <REVISION> |
13 | #    --validation-workers <VALIDATION_WORKERS> |
14 | #    --sharded <SHARDED> |
15 | #    --num-shard <NUM_SHARD> |
16 | #    --quantize <QUANTIZE> |
17 | #    --speculate <SPECULATE> |
18 | #    --dtype <DTYPE> |
19 | #    --trust-remote-code|
20 | #    --max-concurrent-requests <MAX_CONCURRENT_REQUESTS> |
21 | #    --max-best-of <MAX_BEST_OF> |
22 | #    --max-stop-sequences <MAX_STOP_SEQUENCES> |
23 | #    --max-top-n-tokens <MAX_TOP_N_TOKENS> |
24 | #    --max-input-tokens <MAX_INPUT_TOKENS> |
25 | #    --max-input-length <MAX_INPUT_LENGTH> |
26 | #    --max-total-tokens <MAX_TOTAL_TOKENS> |
27 | #    --waiting-served-ratio <WAITING_SERVED_RATIO> |
28 | #    --max-batch-prefill-tokens <MAX_BATCH_PREFILL_TOKENS> |
29 | #    --max-batch-total-tokens <MAX_BATCH_TOTAL_TOKENS> |
30 | #    --max-waiting-tokens <MAX_WAITING_TOKENS> |
31 | #    --max-batch-size <MAX_BATCH_SIZE> |
32 | #    --cuda-graphs <CUDA_GRAPHS> |
33 | #    --hostname <HOSTNAME> |
34 | #    --port <PORT> |
35 | #    --shard-uds-path <SHARD_UDS_PATH> |
36 | #    --master-addr <MASTER_ADDR> |
37 | #    --master-port <MASTER_PORT> |
38 | #    --huggingface-hub-cache <HUGGINGFACE_HUB_CACHE> |
39 | #    --weights-cache-override <WEIGHTS_CACHE_OVERRIDE> |
40 | #    --disable-custom-kernels|
41 | #    --cuda-memory-fraction <CUDA_MEMORY_FRACTION> |
42 | #    --rope-scaling <ROPE_SCALING> |
43 | #    --rope-factor <ROPE_FACTOR> |
44 | #    --json-output|
45 | #    --otlp-endpoint <OTLP_ENDPOINT> |
46 | #    --cors-allow-origin <CORS_ALLOW_ORIGIN> |
47 | #    --watermark-gamma <WATERMARK_GAMMA> |
48 | #    --watermark-delta <WATERMARK_DELTA> |
49 | #    --ngrok|
50 | #    --ngrok-authtoken <NGROK_AUTHTOKEN> |
51 | #    --ngrok-edge <NGROK_EDGE> |
52 | #    --tokenizer-config-path <TOKENIZER_CONFIG_PATH> |
53 | # --disable-grammar-support
54 | #
55 |
56 | # -e HUGGING_FACE_HUB_TOKEN= \
57 | docker run \
58 | -p 8080:80 \
59 | -v $volume:/data \
60 | --runtime=habana \
61 | -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true \
62 | -e HABANA_VISIBLE_DEVICES=all \
63 | -e OMPI_MCA_btl_vader_single_copy_mechanism=none \
64 | --cap-add=sys_nice \
65 | --ipc=host \
66 | --name=${CONTAINER_NAME} \
67 | ghcr.io/huggingface/tgi-gaudi:2.0.0 \
68 | --model-id $model --sharded $sharded --num-shard $num_shard --max-input-length $max_input_length --max-total-tokens $max_total_tokens
69 |
--------------------------------------------------------------------------------
/mm_pano/tgi_gaudi/test_tgi.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 |
3 | curl 127.0.0.1:8080/generate \
4 | -X POST \
5 | -d '{"inputs":"What is deep learning?","parameters":{"max_new_tokens":2048,"temperature":0.7,"repetition_penalty":1}}' \
6 | -H 'Content-Type: application/json'
7 |
--------------------------------------------------------------------------------
/mm_pano/utils/common.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import json
4 | from dataclasses import dataclass, field
5 | from typing import Optional, List
6 |
7 |
8 | def extract_words_after_we_see_withFailv2(s):
9 | match = re.search('We .*?see: (.*)', s, re.IGNORECASE)
10 | if match:
11 | return match.group(1).replace('.', '').lower()
12 | print("No match found")
13 | return
14 |
15 |
16 | def extract_words_after_we_see_withFailv3(s):
17 | match = re.search('We .*?see(.*)', s, re.IGNORECASE) or re.search('View .*?:(.*)', s, re.IGNORECASE)
18 | if match:
19 | return match.group(1)
20 | print("No match found")
21 | return
22 |
23 |
24 | @dataclass
25 | class Descriptor:
26 | generated_text_details: Optional[str] = None
27 | message: Optional[str] = None
28 | message_main_obj: Optional[str] = None
29 | message_topdown: Optional[str] = None
30 | question_for_llm_repeat: Optional[str] = None
31 | description_no_obj: Optional[str] = None
32 | major_obj_number: int = 2
33 | is_repeated: List[bool] = field(default_factory=list)
34 |
35 | init_prompt: Optional[str] = None
36 | init_image: Optional[str] = None
37 |
38 | @classmethod
39 | def from_json(cls, json_path: str):
40 | assert isinstance(json_path, str) and os.path.isfile(json_path)
41 | with open(json_path, "r") as f:
42 | _dict = json.load(f)
43 | print(_dict)
44 | return cls(**_dict)
45 |
46 | def save_json(self, json_path: str):
47 | assert isinstance(json_path, str)
48 | with open(json_path, "w") as f:
49 | json.dump(self.__dict__, f, indent=4)
50 |
51 | def __post_init__(self):
52 | assert self.init_prompt is not None or self.init_image is not None, \
53 | "When using Descriptor, either `init_prompt` or `init_image` has to be set. Got both None."
54 |
55 | if self.init_prompt is not None and self.init_image is not None:
56 |             print(f"Both `init_prompt` ({self.init_prompt}) and `init_image` ({self.init_image}) "
57 |                   "are given; using `init_image` and ignoring `init_prompt`.")
58 | self.init_prompt = None
59 |
60 | if self.init_image:
61 | assert os.path.isfile(self.init_image), f"The given `init_image` is not a valid file {self.init_image}"
62 |
--------------------------------------------------------------------------------
/mm_pano/utils/image_utils.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | import numpy as np, math
3 | from dataclasses import dataclass
4 |
5 | from PIL import Image
6 | import torch
7 | import torchvision.transforms as transforms
8 |
9 | from typing import List, Union
10 | import torch.nn.functional as F
11 | from torchvision.utils import save_image
12 | import argparse
13 |
14 | import scipy.ndimage as ndimage
15 | import os
16 | import glob
17 | import io
18 | import json
19 | from scipy.ndimage import distance_transform_edt
20 |
21 |
22 | def cv2_to_pil(image):
23 | # Convert the cv2 image to RGB format
24 | image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
25 |
26 | # Convert the cv2 image to a PIL image
27 | pil_image = Image.fromarray(image)
28 |
29 | return pil_image
30 |
31 |
32 | def pil_to_cv2(image):
33 | # Convert the PIL image to a numpy array
34 | np_image = np.array(image)
35 |
36 | # Convert the numpy array to a cv2 image
37 | cv2_image = cv2.cvtColor(np_image, cv2.COLOR_RGB2BGR)
38 |
39 | return cv2_image
40 |
41 |
42 | def pil_to_tensor(image):
43 | # Define a transformation pipeline
44 | transform = transforms.Compose([
45 | transforms.ToTensor(),
46 | transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
47 | ])
48 |
49 | # Apply the transformation to the PIL image
50 | tensor_image = transform(image)
51 |
52 | # Add the batch dimension
53 | return tensor_image.unsqueeze(0)
54 |
55 |
56 | def pil_mask_to_tensor(pil_mask):
57 | # Define the transformation to convert the PIL image to a tensor
58 | transform = transforms.ToTensor()
59 |
60 | # Apply the transformation to the PIL image
61 | tensor_mask = transform(pil_mask)
62 |
63 | # Repeat the tensor along the channel dimension to create 3 channels
64 | tensor_mask = tensor_mask.repeat(3, 1, 1)
65 |
66 | # Add the batch dimension
67 | return tensor_mask.unsqueeze(0)
68 |
69 |
70 | def mask_to_pil(mask):
71 | # Multiply the mask by 255 to get values between 0 and 255
72 | mask = mask * 255
73 |
74 | # Convert the mask to an 8-bit integer numpy array
75 | mask = np.uint8(mask)
76 |
77 | # Create a black and white PIL image from the mask
78 | pil_image = Image.fromarray(mask, mode="L")
79 |
80 | return pil_image
81 |
82 |
83 | def cv2_to_tensor(image):
84 | # Convert the image from BGR to RGB format
85 | image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
86 |
87 | # Normalize pixel values to range 0.0 to 1.0
88 | image_normalized = image_rgb.astype(np.float32) / 255.0
89 |
90 | # Transpose the image array to have the shape (3, h, w)
91 | image_transposed = np.transpose(image_normalized, (2, 0, 1))
92 |
93 | # Convert the numpy array to a PyTorch tensor and add a batch dimension
94 | tensor = torch.from_numpy(image_transposed).unsqueeze(0)
95 |
96 | return tensor
97 |
98 |
99 | def tensor_to_cv2(tensor_image):
100 | # Convert tensor image to a numpy array
101 | image_np = (tensor_image * 255).numpy().astype(np.uint8)
102 |
103 | # Transpose the numpy array to have the shape (h, w, 3)
104 | image_np_transposed = np.transpose(image_np, (0, 2, 3, 1))
105 |
106 | # Remove the batch dimension
107 | image_cv2_float = np.squeeze(image_np_transposed, axis=0)
108 |
109 | # Convert the RGB order to BGR order
110 | return cv2.cvtColor(image_cv2_float, cv2.COLOR_RGB2BGR)
111 |
112 |
113 | def tensor_mask_to_numpy(mask):
114 | # Convert the mask to a numpy array
115 | mask_np = mask.numpy()
116 |
117 | # Remove the batch and channel dimensions
118 | return np.squeeze(mask_np, axis=(0, 3))
119 |
120 |
121 | def save_mask_to_png(mask, filename):
122 | assert len(mask.shape) == 3 and mask.shape[-1] == 1, "Invalid mask shape. Expected (h, w, 1)."
123 |
124 | # Convert the mask to an integer array with values in the range [0, 255]
125 | mask_255 = (mask * 255).astype(np.uint8)
126 |
127 | # Repeat the single channel mask to create a 3 channel image
128 | mask_3_channels = np.repeat(mask_255, 3, axis=-1)
129 |
130 | # Save the image
131 | img = Image.fromarray(mask_3_channels)
132 | img.save(f"{filename}")
133 |
134 |
135 |
136 | def warp_image_v2(image, K_original, K_new, R, output_size):
137 | """
138 | Warp an image to a new view given the original and new camera intrinsic matrices, relative rotation,
139 | and output image size.
140 |
141 | Parameters:
142 | image (numpy.ndarray): The original image.
143 | K_original (numpy.ndarray): The original camera's intrinsic matrix.
144 | K_new (numpy.ndarray): The new camera's intrinsic matrix.
145 | R (numpy.ndarray): The relative rotation matrix.
146 | output_size (tuple): The desired output image size (width, height).
147 |
148 | Returns:
149 | warped_image (numpy.ndarray): The warped image.
150 | mask (numpy.ndarray): Mask indicating if a pixel in the warped image has a corresponding pixel in the original image.
151 | """
152 |
153 | # Compute the transformation matrix using the scaled new camera intrinsic
154 | T = K_new.dot(R).dot(np.linalg.inv(K_original))
155 |
156 | # Warp the image using the new transformation matrix to the specified output size
157 | warped_image = cv2.warpPerspective(image, T, output_size)
158 |
159 | # Create and warp the mask
160 | mask = np.ones((image.shape[0], image.shape[1]), dtype=np.uint8) * 255
161 |     mask_warped = cv2.warpPerspective(mask, T, output_size, flags=cv2.INTER_NEAREST, borderMode=cv2.BORDER_CONSTANT, borderValue=(0, 0, 0))
162 |
163 | # Convert mask to binary (0 or 1)
164 | mask_binary = (mask_warped > 250).astype(np.uint8)
165 |
166 | return warped_image, mask_binary
167 |
168 | # compute the nearest unmasked region
169 | def mask_to_NN_v2(mask, invert = False):
170 | # print("mask = {}/{}".format(mask, mask.shape))
171 | # Set a threshold value to create a binary mask
172 | threshold = 0.5
173 | binary_mask = mask > threshold
174 |
175 | if invert:
176 | # Invert the binary_mask to find the unmasked pixels
177 | binary_mask = 1 - binary_mask
178 |
179 |     # The (possibly inverted) binary mask is already a NumPy array
180 | inverted_mask_np = binary_mask
181 |
182 | # Compute the distance transform on the inverted mask
183 | distance_transform = ndimage.distance_transform_edt(inverted_mask_np)
184 |
185 | # Convert the distance transform back to a PyTorch tensor
186 | return torch.tensor(distance_transform, dtype=torch.float32)
187 |
188 |
189 | def generate_left_right_fullPano_pattern(max_step = 8, step_size = 42, final_step = 42):
190 | pattern = []
191 |
192 | start_step = 1
193 | angle_begin = step_size
194 | angle_end = (360 - step_size * (max_step // 2 - 1) + step_size * (max_step // 2)) / 2
195 | step_mid = angle_end - step_size * (max_step // 2)
196 | for step in range(start_step, max_step+1):
197 | if step <= max_step // 2:
198 | pattern.append((0, angle_begin, 0))
199 | angle_begin += step_size
200 | else:
201 | pattern.append((0, angle_end, 0))
202 | if step != (max_step // 2 + 1):
203 | angle_end += step_size
204 | else:
205 | angle_end += step_mid
206 |
207 | print(f"pattern = {pattern}")
208 | return pattern
209 |
210 |
211 | def create_rotation_matrix(x_angle_degrees, y_angle_degrees, z_angle_degrees):
212 | x_angle_radians = np.radians(x_angle_degrees)
213 | y_angle_radians = np.radians(y_angle_degrees)
214 | z_angle_radians = np.radians(z_angle_degrees)
215 |
216 | cos_x, sin_x = np.cos(x_angle_radians), np.sin(x_angle_radians)
217 | cos_y, sin_y = np.cos(y_angle_radians), np.sin(y_angle_radians)
218 | cos_z, sin_z = np.cos(z_angle_radians), np.sin(z_angle_radians)
219 |
220 | R_x = np.array([[1, 0, 0],
221 | [0, cos_x, -sin_x],
222 | [0, sin_x, cos_x]])
223 |
224 | R_y = np.array([[cos_y, 0, sin_y],
225 | [0, 1, 0],
226 | [-sin_y, 0, cos_y]])
227 |
228 | R_z = np.array([[cos_z, -sin_z, 0],
229 | [sin_z, cos_z, 0],
230 | [0, 0, 1]])
231 |
232 | R = R_y @ R_x @ R_z
233 |
234 | return R
235 |
236 | def read_file_into_list(file_path):
237 | # Initialize an empty list to hold the lines
238 | lines_list = []
239 |
240 | # Open the file in read mode ('r')
241 | with io.open(file_path, 'r', encoding='utf8') as file:
242 | # Read each line in the file
243 | for line in file:
244 | # Add the line to the list (removing any trailing whitespace characters)
245 | lines_list.append(line.rstrip())
246 |
247 | # Return the list of lines
248 | return lines_list
249 |
250 | def save_dict_to_file(dict_obj, file_name):
251 | with open(file_name, 'w') as file:
252 | json.dump(dict_obj, file)
253 |
254 |
255 | def load_dict_from_file(file_name):
256 | with open(file_name, 'r') as file:
257 | return json.load(file)
258 |
259 |
260 |
261 | def check_fov_overlap_simplified(rotation_matrix, fov1):
262 | """
263 | Simplified check if there is an overlap in the field of view of two images based on rotation angle.
264 |
265 | Parameters:
266 | rotation_matrix (numpy.ndarray): 3x3 rotation matrix from image1 to image2
267 |     fov1 (float): Field of view of image1 in degrees (use the larger of the horizontal and vertical FOV)
268 |
269 | Returns:
270 | bool: True if there is an overlap, False otherwise
271 | """
272 |
273 | # Calculate the rotation angle from the rotation matrix
274 | rotation_angle_rad = np.arccos((np.trace(rotation_matrix) - 1) / 2)
275 | rotation_angle_deg = np.degrees(rotation_angle_rad)
276 |
277 |     # Compare the rotation angle with the FOV
278 | return rotation_angle_deg <= fov1
279 |
280 |
281 |
282 |
283 | @dataclass
284 | class vp90Codec:
285 | interval_deg: float = 1.0
286 | fps: float = 60
287 | video_codec = "VP09"
288 | video_format = ".webm"
289 |
290 |
291 | @dataclass
292 | class mp4vCodec:
293 | interval_deg: float = 0.5
294 | fps: float = 60
295 | video_codec = "mp4v"
296 | video_format = ".mp4"
297 |
298 |
299 | @dataclass
300 | class mp4Codec:
301 | interval_deg: float = 0.5
302 | fps: float = 60
303 | video_codec = "h264"
304 | video_format = ".mp4"
305 |
--------------------------------------------------------------------------------
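A short, hedged sketch of how the geometric helpers above fit together: build a rotation for the next view, check that it still overlaps the current field of view, and warp the image plus its validity mask. The intrinsics and angles are made up for illustration, and the import path assumes the code runs from inside `mm_pano/`, as the other modules under `mm_pano/utils` do.

```python
import numpy as np

# Import path is an assumption based on the repo layout (run from inside mm_pano/).
from utils.image_utils import (
    create_rotation_matrix, warp_image_v2, check_fov_overlap_simplified, mp4Codec)

h, w = 512, 512
image = np.zeros((h, w, 3), dtype=np.uint8)      # placeholder view
K = np.array([[w / 2, 0, w / 2],                  # simple pinhole intrinsics (made up)
              [0, h / 2, h / 2],
              [0, 0, 1]], dtype=np.float64)

# Rotate the virtual camera 42 degrees around the vertical axis, matching the
# default step size in generate_left_right_fullPano_pattern.
R = create_rotation_matrix(0, 42, 0)

# Only warp when the rotated view can still overlap a 90-degree FOV.
if check_fov_overlap_simplified(R, 90):
    warped, mask = warp_image_v2(image, K, K, R, (w, h))
    print(warped.shape, int(mask.sum()))

# The codec dataclasses bundle the settings presumably used when rendering the panorama video.
codec = mp4Codec()
print(codec.video_codec, codec.video_format, codec.fps, codec.interval_deg)
```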
/mm_pano/utils/llm_engines.py:
--------------------------------------------------------------------------------
1 | import copy
2 | from typing import Optional, Dict
3 | from abc import ABC, abstractmethod
4 |
5 | from text_generation import Client
6 | from utils.model_utils import load_llm
7 |
8 | _VALIDATED_MODELS = [
9 | "gpt-4", "gpt-3.5-turbo",
10 | "meta-llama/Meta-Llama-3-8B-Instruct",
11 | "mistralai/Mistral-7B-Instruct-v0.2",
12 | "tgi",
13 | ]
14 |
15 |
16 | class BaseLLMEngine(ABC):
17 | @abstractmethod
18 | def chat(self, user_content:str, system_content: str, history: Optional[str] = None):
19 | pass
20 |
21 | def extract_output(self, output: str) -> str:
22 | return output
23 |
24 |
25 | class OpenAILLMEngine(BaseLLMEngine):
26 | def __init__(self,
27 | model_engine: str = None,
28 | openai=None,
29 | openai_key: str = None):
30 | self.model_engine = model_engine
31 | self.openai = openai
32 | self.openai.api_key = openai_key
33 | print(f"Using model engine {self.model_engine} to generate text")
34 |
35 | def chat(self,
36 | user_content: str,
37 | system_content: str = "You are a helpful assistant.",
38 | history: str = None) -> str:
39 |
40 | message = self.openai.ChatCompletion.create(
41 | model=self.model_engine,
42 | messages=[
43 | {"role": "system", "content": system_content},
44 | {"role": "user", "content": user_content},
45 | ]).choices[0]['message']['content']
46 |
47 |         # For now, history is always None
48 | return message, None
49 |
50 |
51 | class QwenLLMEngine(BaseLLMEngine):
52 | def __init__(self,
53 | tokenizer,
54 | model):
55 | self.tokenizer = tokenizer
56 | self.model = model
57 |
58 | def chat(self,
59 | user_content: str,
60 | system_content: str = "You are a helpful assistant.",
61 | history: str = None):
62 | message, history = self.model.chat(self.tokenizer, user_content, history=history)
63 | return message, history
64 |
65 |
66 | class MistralLLMEngine(BaseLLMEngine):
67 | def __init__(self,
68 | tokenizer,
69 | model,
70 | default_generate_kwargs: Optional[Dict] = None):
71 | self.tokenizer = tokenizer
72 | self.model = model
73 | self.default_generate_kwargs = {} if default_generate_kwargs is None else default_generate_kwargs
74 |
75 | def chat(self,
76 | user_content: str,
77 | system_content: str = "You are a helpful assistant.",
78 | history: str = None,
79 | generate_kwargs: Optional[Dict] = None):
80 |
81 | _generate_kwargs = copy.deepcopy(self.default_generate_kwargs)
82 | if generate_kwargs is not None:
83 | _generate_kwargs.update(generate_kwargs)
84 |
85 | messages = [
86 | {"role": "user", "content": user_content},
87 | ]
88 | model_inputs = self.tokenizer.apply_chat_template(messages, return_tensors="pt").to(self.model.device)
89 | generated_ids = self.model.generate(model_inputs, **_generate_kwargs)
90 | decoded = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
91 |
92 | # TODO(Tien Pei Chou): Find a better way to only output the new tokens.
93 | return decoded[0][decoded[0].rfind("[/INST]") + len("[/INST] "):], None
94 |
95 |
96 | class Llama3LLMEngine(BaseLLMEngine):
97 | def __init__(self,
98 | tokenizer,
99 | model,
100 | default_generate_kwargs: Optional[Dict] = None):
101 | self.tokenizer = tokenizer
102 | self.model = model
103 | self.default_generate_kwargs = {} if default_generate_kwargs is None else default_generate_kwargs
104 |
105 | def extract_output(self, output: str) -> str:
106 | return output[output.rfind("assistant\n\n") + len("assistant\n\n"):]
107 |
108 | def chat(self,
109 | user_content: str,
110 | system_content: str = "You are a helpful assistant.",
111 | history: str = None,
112 | generate_kwargs: Optional[Dict] = None):
113 |
114 | _generate_kwargs = copy.deepcopy(self.default_generate_kwargs)
115 | if generate_kwargs is not None:
116 | _generate_kwargs.update(generate_kwargs)
117 |
118 | messages = [
119 | {"role": "system", "content": system_content},
120 | {"role": "user", "content": user_content},
121 | ]
122 |
123 | model_inputs = self.tokenizer.apply_chat_template(messages, add_generation_prompt=False, return_tensors="pt").to(self.model.device)
124 | generated_ids = self.model.generate(model_inputs, **_generate_kwargs)
125 | decoded = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
126 |
127 | # TODO(Tien Pei Chou): Find a better way to only output the new tokens.
128 | return self.extract_output(decoded[0]), None
129 |
130 |
131 | class TGILLMEngine(BaseLLMEngine):
132 | def __init__(self,
133 | tgi_url: Optional[str] = "http://127.0.0.1:8080",
134 | default_generate_kwargs: Optional[Dict] = None):
135 | self.client = Client(tgi_url)
136 | self.default_generate_kwargs = {} if default_generate_kwargs is None else default_generate_kwargs
137 |
138 | def chat(self,
139 | user_content: str,
140 | system_content: str = "You are a helpful assistant.",
141 | history: str = None,
142 | generate_kwargs: Optional[Dict] = None):
143 |
144 | _generate_kwargs = copy.deepcopy(self.default_generate_kwargs)
145 | if generate_kwargs is not None:
146 | _generate_kwargs.update(generate_kwargs)
147 |
148 | response = self.client.generate(user_content, **_generate_kwargs, return_full_text=False)
149 |
150 | return self.extract_output(response.generated_text), None
151 |
152 |
153 | def get_llm_engine(model_name: str,
154 | dtype: Optional[str] = "float32",
155 | device: Optional[str] = "hpu",
156 | openai_key: Optional[str] = None,
157 | hf_token: Optional[str] = None,
158 | tgi_url: Optional[str] = "http://127.0.0.1:8080"):
159 | if model_name in ["gpt-4", "gpt-3.5-turbo"]:
160 | import openai
161 | assert openai_key is not None, "Please set the `openai_key` when using OpenAI API"
162 | print(f"Using OpenAI {model_name} API for text generaton ...")
163 | return OpenAILLMEngine(model_engine=model_name, openai=openai, openai_key=openai_key)
164 | elif model_name == "mistralai/Mistral-7B-Instruct-v0.2":
165 | tokenizer, model = load_llm(
166 | model_name=model_name,
167 | device=device,
168 | dtype=dtype,
169 | trust_remote_code=True,
170 | hf_token=hf_token)
171 | default_generate_kwargs = {
172 | "do_sample": True,
173 | "temperature": 0.7,
174 | "max_new_tokens": 256
175 | }
176 | print(f"Using {model_name} for text generaton ...")
177 | return MistralLLMEngine(tokenizer, model, default_generate_kwargs=default_generate_kwargs)
178 | elif "Llama" in model_name: # Ex: "meta-llama/Meta-Llama-3-8B-Instruct"
179 | tokenizer, model = load_llm(
180 | model_name=model_name,
181 | device=device,
182 | dtype=dtype,
183 | trust_remote_code=True,
184 | hf_token=hf_token)
185 | default_generate_kwargs = {
186 | "do_sample": True,
187 | "temperature": 0.6,
188 | "max_new_tokens": 256
189 | }
190 | print(f"Using {model_name} for text generaton ...")
191 | return Llama3LLMEngine(tokenizer, model, default_generate_kwargs=default_generate_kwargs)
192 | elif "tgi" in model_name:
193 | assert tgi_url is not None, "Must pass a url to the client when using TGI-Gaudi"
194 | default_generate_kwargs = {
195 | "do_sample": True,
196 | "temperature": 0.6,
197 | "max_new_tokens": 256
198 | }
199 | return TGILLMEngine(tgi_url, default_generate_kwargs)
200 | else:
201 | raise NotImplementedError(f"Got unsupported model {model_name}")
202 |
--------------------------------------------------------------------------------
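A hedged sketch of driving the engine factory above. The model name, token, and TGI URL are placeholders; the import path assumes the same layout as the module's own `from utils.model_utils import load_llm`.

```python
from utils.llm_engines import get_llm_engine

# Local Hugging Face model on Gaudi/HPU (use device="cuda" on a GPU box).
engine = get_llm_engine(
    model_name="meta-llama/Meta-Llama-3-8B-Instruct",
    dtype="bfloat16",
    device="hpu",
    hf_token="<your-hf-token>",   # placeholder
)

# Every engine exposes the same chat() contract and returns (message, history).
message, history = engine.chat(
    user_content="Describe what might appear to the left of a fireplace in a living room.",
    system_content="You are a helpful assistant.",
)
print(message)

# Alternatively, point at a running TGI / TGI-Gaudi server:
# engine = get_llm_engine("tgi", tgi_url="http://127.0.0.1:8080")
```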
/mm_pano/utils/model_utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | from typing import Union, Optional
3 |
4 | import torch
5 | from transformers import BlipProcessor, BlipForConditionalGeneration
6 | from transformers import Blip2Processor, Blip2ForConditionalGeneration
7 | from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler, StableDiffusionUpscalePipeline
8 |
9 |
10 | def is_on_hpu(device: str) -> bool:
11 | """ Return True is the device is a Gaudi/HPU device.
12 | """
13 | return "hpu" in device
14 |
15 |
16 | def get_datatype(data_type: Union[str, torch.dtype]):
17 | if isinstance(data_type, torch.dtype):
18 | return data_type
19 | if data_type in ["fp32", "float32"]:
20 | return torch.float
21 | elif data_type in ["fp16", "float16"]:
22 | return torch.float16
23 | elif data_type in ["bfloat16", "bf16"]:
24 | return torch.bfloat16
25 | else:
26 | raise RuntimeError(f"Got unknown dtype {data_type}")
27 |
28 |
29 | def optimize_stable_diffusion_pipeline(pipeline, device, datatype, cpu_offload: bool = False, enable_xformers: bool = False):
30 | pipeline.to(device)
31 | if is_on_hpu(device):
32 | assert datatype in ["bfloat16", "float32"] or datatype in [torch.bfloat16, torch.float32]
33 | pass
34 | else:
35 | # Cuda
36 | # TODO(Joey): Check if there is an Intel version of xformers
37 | # pipeline.unet = torch.compile(pipeline.unet)
38 | if enable_xformers:
39 | pipeline.set_use_memory_efficient_attention_xformers(enable_xformers)
40 |
41 | if cpu_offload:
42 | pipeline.enable_sequential_cpu_offload()
43 | pipeline.enable_model_cpu_offload()
44 | return pipeline.to(device)
45 |
46 |
47 | def optimize_blip(model, device, datatype):
48 | model.to(device)
49 | return model
50 |
51 |
52 | def load_diffusion_model(model_name: str = "stabilityai/stable-diffusion-2-inpainting",
53 | device: str = "cuda",
54 | dtype: Union[str, torch.dtype] = "float16",
55 | cpu_offload: bool = False):
56 | """ Load diffusion or diffusion inpainting model for text-to-image.
57 | """
58 | print(f"Loading text-to-image model {model_name} ...")
59 |
60 | torch_dtype = get_datatype(dtype)
61 |
62 | if is_on_hpu(device=device):
63 | assert dtype in ["bfloat16", "float32"] or dtype in [torch.bfloat16, torch.float32]
64 | from optimum.habana.diffusers import GaudiDDIMScheduler
65 | if "inpaint" in model_name:
66 | from optimum.habana.diffusers import GaudiStableDiffusionInpaintPipeline as DiffusionPipelineClass
67 | else:
68 | from optimum.habana.diffusers import GaudiStableDiffusionPipeline as DiffusionPipelineClass
69 |
70 | # Load model and scheduler on Gaudi/HPU
71 | scheduler = GaudiDDIMScheduler.from_pretrained(model_name, subfolder="scheduler")
72 | kwargs = {
73 | "scheduler": scheduler,
74 | "use_habana": True,
75 | "use_hpu_graphs": True,
76 | "gaudi_config": "Habana/stable-diffusion"
77 | }
78 | pipe = DiffusionPipelineClass.from_pretrained(model_name, **kwargs).to(torch_dtype)
79 | else:
80 | # Load model and scheduler
81 | pipe = DiffusionPipeline.from_pretrained(model_name, torch_dtype=torch_dtype)
82 | pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
83 |         # TODO(Joey): Check why enabling xformers + torch.compile for the inpainting model raises an error
84 | pipe = optimize_stable_diffusion_pipeline(pipe, device, torch_dtype, cpu_offload, enable_xformers=False)
85 | return pipe
86 |
87 |
88 | def load_blip_model_and_processor(model_name: str = "Salesforce/blip2-flan-t5-xl", # "Salesforce/blip2-opt-2.7b"
89 | device: str = "cuda",
90 | dtype: Union[str, torch.dtype] = "float16"):
91 | """ Load BLIP model for image-to-text.
92 | """
93 | print(f"Loading image-to-text model {model_name} ...")
94 |
95 | torch_dtype = get_datatype(dtype)
96 |
97 | if "blip2" in model_name:
98 | processor_class = Blip2Processor
99 | model_class = Blip2ForConditionalGeneration
100 | else:
101 | # Blip
102 | assert "blip" in model_name
103 | processor_class = BlipProcessor
104 | model_class = BlipForConditionalGeneration
105 |
106 | if is_on_hpu(device=device):
107 | assert dtype in ["bfloat16", "float32"] or dtype in [torch.bfloat16, torch.float32]
108 | from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
109 |
110 | # TODO(Joey): Check optimum-habana once it has Blip2 support
111 | adapt_transformers_to_gaudi()
112 | else:
113 | processor_class = Blip2Processor
114 | model_class = Blip2ForConditionalGeneration
115 |
116 | # Get a blip description
117 | processor = processor_class.from_pretrained(model_name)
118 | model = model_class.from_pretrained(model_name, torch_dtype=torch_dtype)
119 | model = optimize_blip(model, device, torch_dtype)
120 |
121 | return processor, model
122 |
123 |
124 | def load_upscaler_model(model_name: str = "stabilityai/stable-diffusion-x4-upscaler",
125 | device: str = "cuda",
126 | dtype: Union[str, torch.dtype] = "float16",
127 | cpu_offload: bool = False):
128 | """ Load super resolution model for upscaling.
129 | """
130 | print(f"Loading super resolution model {model_name} ...")
131 |
132 | torch_dtype = get_datatype(dtype)
133 |
134 | if is_on_hpu(device=device):
135 | assert dtype in ["bfloat16", "float32"] or dtype in [torch.bfloat16, torch.float32]
136 | from optimum.habana.diffusers import GaudiDDIMScheduler, GaudiStableDiffusionUpscalePipeline
137 |
138 | # Load model and scheduler on Gaudi/HPU
139 | scheduler = GaudiDDIMScheduler.from_pretrained(model_name, subfolder="scheduler")
140 | kwargs = {
141 | "scheduler": scheduler,
142 | "use_habana": True,
143 | "use_hpu_graphs": True,
144 | "gaudi_config": "Habana/stable-diffusion"
145 | }
146 | pipe = GaudiStableDiffusionUpscalePipeline.from_pretrained(model_name, **kwargs).to(torch_dtype)
147 | else:
148 | # Load model and scheduler
149 | pipe = StableDiffusionUpscalePipeline.from_pretrained(model_name, torch_dtype=torch_dtype)
150 |         # TODO(Joey): Check why enabling xformers + torch.compile for the inpainting model raises an error
151 | pipe = optimize_stable_diffusion_pipeline(pipe, device, torch_dtype, cpu_offload, enable_xformers=False)
152 |
153 | return pipe
154 |
155 |
156 | def load_llm(model_name: str = "mistralai/Mistral-7B-Instruct-v0.2",
157 | device: str = "cuda",
158 | dtype: Optional[Union[str, torch.dtype]] = "float16",
159 | trust_remote_code: bool = False,
160 | hf_token: Optional[str] = None):
161 | """ Load LLM model.
162 | """
163 | print(f"Loading LLM {model_name} ...")
164 |
165 | if is_on_hpu(device=device):
166 | assert dtype in ["bfloat16", "float32"] or dtype in [torch.bfloat16, torch.float32]
167 | from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
168 | adapt_transformers_to_gaudi()
169 |
170 | from transformers import AutoModelForCausalLM, AutoTokenizer
171 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=trust_remote_code, token=hf_token)
172 | model = AutoModelForCausalLM.from_pretrained(
173 | model_name, trust_remote_code=trust_remote_code, token=hf_token).eval().to(device)
174 |
175 | tokenizer.pad_token_id = tokenizer.eos_token_id
176 |
177 | return tokenizer, model
178 |
179 |
180 | def release_memory(model, tokenizer, device: str = "cuda"):
181 | import gc
182 | del tokenizer
183 | del model
184 |
185 | if device == "cuda":
186 | torch.cuda.empty_cache()
187 | else:
188 | # TODO(Tien Pei Chou): Add Gaudi and XPU
189 | raise NotImplementedError()
190 | # accelerator.free_memory()
191 | gc.collect()
192 |
--------------------------------------------------------------------------------
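A loader sketch tying the helpers above together. The model ids mirror the defaults in the signatures; the device, dtype, and import path are assumptions (the code is expected to run from inside `mm_pano/`, where the other modules import `utils.*`).

```python
from utils.model_utils import (
    load_diffusion_model, load_blip_model_and_processor, load_upscaler_model)

device = "hpu"       # or "cuda"; on HPU the loaders assert bfloat16/float32
dtype = "bfloat16"

# Inpainting pipeline for extending views, BLIP-2 captioner, and the x4 upscaler.
inpaint_pipe = load_diffusion_model(
    "stabilityai/stable-diffusion-2-inpainting", device=device, dtype=dtype)
blip_processor, blip_model = load_blip_model_and_processor(
    "Salesforce/blip2-flan-t5-xl", device=device, dtype=dtype)
upscaler = load_upscaler_model(
    "stabilityai/stable-diffusion-x4-upscaler", device=device, dtype=dtype)
```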
/requirements-api.txt:
--------------------------------------------------------------------------------
1 | # fastAPI and gradio
2 | fastapi
3 | uvicorn
4 | gradio
5 | envbash
6 |
--------------------------------------------------------------------------------
/requirements-hpu.txt:
--------------------------------------------------------------------------------
1 | # Pinned to a specific commit; switch back to the main branch once the PR (https://github.com/huggingface/optimum-habana/pull/869/commits) is merged
2 | git+https://github.com/huggingface/optimum-habana.git@8893d602289226eda82cf19c79951fa12d15e1b9
3 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | wheel
2 | diffusers
3 | transformers
4 | scipy
5 | openai==0.27.2
6 | matplotlib
7 | imageio[ffmpeg]
8 | opencv-python
9 | accelerate
10 | einops
11 | auto-gptq
12 | tiktoken
13 | transformers_stream_generator
14 | text-generation # TGI and TGI Gaudi
15 |
--------------------------------------------------------------------------------