├── .dockerignore ├── .env ├── .gitignore ├── Dockerfile └── Dockerfile-hpu ├── LICENSE ├── README.md ├── app ├── .env └── README.md ├── docker_build.sh ├── docker_run-hpu.sh ├── exp ├── .gitignore └── example │ ├── 0.png │ ├── 1.png │ └── example_descriptor.json ├── media ├── ISC.png ├── blog.png ├── library.jpg ├── livingRoom.jpg ├── pano.png ├── pipeline.png ├── snow.jpg └── underwater.jpeg ├── mm_pano ├── __init__.py ├── lib │ ├── Equirec2Perspec.py │ ├── Perspec2Equirec.py │ └── multi_Perspec2Equirec.py ├── mmpano.py ├── tgi_gaudi │ ├── README.md │ ├── run_tgi_gaudi.sh │ └── test_tgi.sh └── utils │ ├── common.py │ ├── image_utils.py │ ├── llm_engines.py │ └── model_utils.py ├── requirements-api.txt ├── requirements-hpu.txt └── requirements.txt /.dockerignore: -------------------------------------------------------------------------------- 1 | * 2 | !.env 3 | !Dockerfile 4 | !app 5 | !exp 6 | !mm_pano 7 | !requirements* 8 | 9 | mm_pano/tgi_gaudi/data/ 10 | -------------------------------------------------------------------------------- /.env: -------------------------------------------------------------------------------- 1 | IMAGE_NAME=mm_pano 2 | IMAGE_TAG=latest 3 | CONTAINER_NAME=mm_pano 4 | 5 | # Backend API 6 | API_HOST="127.0.0.1" 7 | API_PORT="8010" 8 | API_TIMEOUT=300 9 | API_RESPONSE_PANO_NAME="pano" 10 | API_RESPONSE_VIDEO_NAME="panorama_video" 11 | 12 | # Frontend UI 13 | WEBAPP_NAME="Language Model Assisted Generation of Images with Coherence" 14 | WEBAPP_HOST="127.0.0.1" 15 | WEBAPP_PORT="8100" 16 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | **/__pycache__ 2 | -------------------------------------------------------------------------------- /Dockerfile/Dockerfile-hpu: -------------------------------------------------------------------------------- 1 | # From vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest 2 | From vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest 3 | 4 | WORKDIR /app 5 | 6 | RUN apt-get update && \ 7 | apt-get upgrade -y && \ 8 | apt-get install -y \ 9 | tmux 10 | ##################### 11 | # Multimodal Panorama Generation source code 12 | ##################### 13 | # Common requirements 14 | COPY requirements.txt /app/requirements.txt 15 | RUN pip3 install --upgrade pip && \ 16 | pip3 install -r requirements.txt 17 | 18 | # Copy code 19 | COPY mm_pano /app/mm_pano 20 | COPY exp /app/exp 21 | 22 | ###################### 23 | # Application frontend 24 | ###################### 25 | COPY requirements-api.txt /app/requirements-api.txt 26 | RUN pip3 install -r requirements-api.txt 27 | COPY app /app/app 28 | COPY .env /app/.env 29 | 30 | ########################### 31 | # HPU specific requirements 32 | ########################### 33 | COPY requirements-hpu.txt /app 34 | RUN pip3 install -r requirements-hpu.txt 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # [CVPR 2024] Official implementation of the paper: "L-MAGIC: Language Model Assisted Generation of Images with Coherence" 2 | We present a novel method that can generate 360 degree panorama from different types of zero-shot inputs (e.g., a single image, text description, hand-drawing etc.). Our Huggingface space is now available. Feel free to try it out! 3 | 4 |
5 | 6 |
7 | 8 | - [Paper](https://arxiv.org/abs/2406.01843) 9 | - [Project Page](https://zhipengcai.github.io/MMPano/) 10 | - [YouTube Video](https://youtu.be/XDMNEzH4-Ec) 11 | - [Hugging Face demo (now available!)](https://huggingface.co/spaces/MMPano/MMPano) 12 | 13 | ## Industrial Impact 14 | 15 | - Our work has been selected as **one of the 5 Intel featured live demos** at [ISC HPC 2024](https://www.intel.com/content/www/us/en/events/supercomputing.html). 16 | - Our work has been featured on the [Intel Community Blog](https://community.intel.com/t5/Blogs/Tech-Innovation/Artificial-Intelligence-AI/Advancing-Gen-AI-on-Intel-Gaudi-AI-Accelerators-with-Multi-Modal/post/1603746)! 17 | - Our work has been featured on [Intel Labs' LinkedIn](https://www.linkedin.com/feed/update/urn:li:activity:7203797143831076864/)! 18 | 19 | 
20 | 21 |
22 |
23 | 24 |
25 | 26 | ## 📌 Reference 27 | 28 | ```bibtex 29 | @inproceedings{ 30 | zhipeng2024lmagic, 31 | title={L-MAGIC: Language Model Assisted Generation of Images with Coherence}, 32 | author={Zhipeng Cai and Matthias Müller and Reiner Birkl and Diana Wofk and Shao-Yen Tseng and JunDa Cheng and Gabriela Ben-Melech Stan and Vasudev Lal and Michael Paulitsch}, 33 | booktitle={The IEEE/CVF Conference on Computer Vision and Pattern Recognition}, 34 | year={2024} 35 | } 36 | ``` 37 | 38 | ## ⭐️ Show Your Support 39 | 40 | If you find this project helpful or interesting, please consider giving it a star! Your support is greatly appreciated and helps others discover the project. 41 | 42 | ## Environment 43 | 44 | This code has been tested on Linux with Python 3.9. It should also be compatible with other Python versions. 45 | 46 | 47 | ## Run on Intel Gaudi 48 | 49 | This codebase has been developed and deployed on Intel Gaudi on the Intel Developer Cloud: 50 | 51 | - [Intel Gaudi](https://habana.ai/) 52 | - [Intel Developer Cloud](https://www.intel.com/content/www/us/en/developer/tools/devcloud/overview.html) 53 | 54 | 55 | #### Setup Docker environment 56 | ```bash 57 | # Build the docker image 58 | ./docker_build.sh 59 | 60 | # Start the container. Following the instructions in the script, you may modify 61 | # `HABANA_VISIBLE_DEVICES` and `HABANA_VISIBLE_MODULES` to run on a different Gaudi device. 62 | ./docker_run-hpu.sh 63 | ``` 64 | 65 | 66 | ## Run on other devices 67 | 68 | You can also run the code on an NVIDIA GPU, after setting up a proper NVIDIA environment with PyTorch installed (e.g., via `conda`, `venv`, or `docker`). 69 | 70 | Install the necessary packages by running the following command: 71 | 72 | ```bash 73 | pip install -r requirements.txt 74 | ``` 75 | 76 | 77 | ## Run the code 78 | #### Note 79 | - If you are running on Gaudi, the first runs will be slower because Gaudi requires at least 2 warmup cycles. If you want to build your own application using this codebase, please warm up the Gaudi device at least 2 times. 80 | 81 | - The best performance is enabled by using ChatGPT as the LLM controller, which requires you to apply for an [OpenAI API key](https://platform.openai.com/docs/overview). 82 | 83 | - If you are in an area that cannot access the ChatGPT API, we also provide a way to use a free, open-source LLM controller (e.g., Llama3). Please see below for instructions on how to enable it. You may need to set `HF_TOKEN` or pass a Hugging Face token. Feel free to also contribute to the code and enable other LLMs. 84 | 85 | #### (Optional) Start a TGI LLM server 86 | 87 | If you want to use TGI for LLM serving, the code provides a script to pull the docker image and start a TGI LLM server on Gaudi. Once the TGI server is running, please make sure to pass `--llm_model_name tgi` when running the MM Pano command line in the next step. 88 | 89 | We have only validated the listed LLM models ("meta-llama/Meta-Llama-3-8B-Instruct", "mistralai/Mistral-7B-Instruct-v0.2"). We encourage users to try out new models and add them to the supported list. 90 | 91 | ```bash 92 | # Modify the model name and pass a Hugging Face token if needed. You can also change `num_shard` if you like. 93 | vi mm_pano/tgi_gaudi/run_tgi_gaudi.sh 94 | 95 | # Pull the image and start the TGI-Gaudi container 96 | (cd mm_pano/tgi_gaudi && ./run_tgi_gaudi.sh) 97 | ``` 98 | 99 | If you want to run TGI on another device, please make sure the server is reachable at the default TGI URL and port, `http://127.0.0.1:8080`. 
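Once the server is up, it can be sanity-checked before launching the pipeline. The request below is adapted from `mm_pano/tgi_gaudi/test_tgi.sh` and assumes the default port mapping (`-p 8080:80`) used by `run_tgi_gaudi.sh`:

```bash
# Quick health check of the local TGI endpoint (adapted from test_tgi.sh)
curl http://127.0.0.1:8080/generate \
    -X POST \
    -d '{"inputs":"What is deep learning?","parameters":{"max_new_tokens":64}}' \
    -H 'Content-Type: application/json'
```

A JSON response containing a `generated_text` field indicates the server is ready to be used with `--llm_model_name tgi`.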
100 | 101 | 102 | #### Command 103 | There are several options when running the code. A simple example for the following setup: 104 | 105 | - image-to-panorama task 106 | - ChatGPT LLM (GPT-4) 107 | - Gaudi accelerator as the hardware 108 | 109 | ```bash 110 | python3 mm_pano/mmpano.py \ 111 | --init_image exp/example/0.png \ 112 | --output_folder exp/outputs \ 113 | --dtype bfloat16 --device hpu \ 114 | --llm_model_name gpt-4 \ 115 | --api_key <your OpenAI API key> \ 116 | --save_pano_img \ 117 | --gen_video 118 | ``` 119 | 120 | Here `--save_pano_img` saves the generated panorama image and `--gen_video` additionally generates and saves the video. To change the setup, e.g.: 121 | - to perform "text-to-panorama", change `--init_image exp/example/0.png` to `--init_prompt 'maple autumn forest'`; `--init_prompt` can also be used together with `--init_image` to provide a user-specified scene description (see the sketch below). 122 | - to use other LLMs, change `--llm_model_name gpt-4` to `--llm_model_name [other LLM name]`. Currently the available choices are `"gpt-4", "gpt-3.5-turbo", "meta-llama/Meta-Llama-3-8B-Instruct", "mistralai/Mistral-7B-Instruct-v0.2", "tgi"`, 123 | where `tgi` can be a [TGI Gaudi](https://github.com/huggingface/tgi-gaudi) or [TGI](https://github.com/huggingface/text-generation-inference) server to run bigger models like Llama3-70B. Note that `--api_key` is only used for the GPT models. 124 | - to use CUDA, change `--device hpu` to `--device cuda`. 125 | - to specify the camera intrinsics for the input image, add `--intrinsic float,float,float,float`. 126 | 127 | ## Results (see more on our project page and paper) 128 | 129 | After running the code, the output folder (exp/outputs) will contain a panoramic image "pano.png" (see below for examples) and an immersive video "video.mp4". 130 | 131 | 
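For reference, a text-to-panorama run that populates this output folder could be launched with a command along the following lines. This is an illustrative sketch assembled from the options listed above — the prompt, the choice of the open-source Llama 3 controller, and the `HF_TOKEN` hint are examples, not a separately validated recipe:

```bash
# Gated Hugging Face models may require a token, e.g. export HF_TOKEN=...
python3 mm_pano/mmpano.py \
    --text2pano \
    --init_prompt 'maple autumn forest' \
    --output_folder exp/outputs \
    --dtype bfloat16 --device hpu \
    --llm_model_name meta-llama/Meta-Llama-3-8B-Instruct \
    --save_pano_img \
    --gen_video
```

Swapping `--llm_model_name meta-llama/Meta-Llama-3-8B-Instruct` for `--llm_model_name tgi` routes text generation through the TGI server from the previous section instead.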
132 | 133 |
134 | 135 |
136 | 137 |
138 | 139 |
140 | 141 |
142 | 143 |
144 | 145 |
146 | 147 |
148 | 149 |
150 | 151 | 152 | ## Contact 153 | 154 | Feel free to send an email to Zhipeng (czptc2h@gmail.com) or Joey (Tien Pei) Chou (joey.t.p.chou@gmail.com) if you have any questions and comments. 155 | 156 | ## 📈 Star History 157 | 158 | [![Star History Chart](https://api.star-history.com/svg?repos=IntelLabs/MMPano&type=Date)](https://star-history.com/#IntelLabs/MMPano) 159 | -------------------------------------------------------------------------------- /app/.env: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/MMPano/b176d26bc3311584e746fe88135e8d5e793d3107/app/.env -------------------------------------------------------------------------------- /app/README.md: -------------------------------------------------------------------------------- 1 | Placeholder for Multimodal Panorama API 2 | -------------------------------------------------------------------------------- /docker_build.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | while [ "$1" != "" ]; 3 | do 4 | case $1 in 5 | -d | --device) 6 | DEVICE=$2 7 | shift 8 | ;; 9 | -h | --help ) 10 | echo "Build the docker image for Multimodal Panorama Generation" 11 | echo "Usage: docker_build.sh [OPTIONS]" 12 | echo "OPTION includes:" 13 | echo " -d | --device - Supported device [hpu]" 14 | exit 15 | ;; 16 | * ) 17 | echo "Invalid option: $1" 18 | echo "Please do 'docker_build.sh -h'" 19 | exit 20 | ;; 21 | esac 22 | shift 23 | done 24 | 25 | CUR_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" 26 | source ${CUR_DIR}/.env 27 | 28 | DEVICE="${DEVICE:-hpu}" 29 | DOCKERFILE=${CUR_DIR}/Dockerfile/Dockerfile-${DEVICE} 30 | 31 | cmd="DOCKER_BUILDKIT=0 docker build . -f ${DOCKERFILE} -t ${IMAGE_NAME}_${DEVICE}:${IMAGE_TAG}" 32 | echo $cmd 33 | eval $cmd 34 | -------------------------------------------------------------------------------- /docker_run-hpu.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | # HABANA_VISIBLE_DEVICES, HABANA_VISIBLE_MODULES 3 | # 0, 2 4 | # 1, 6 5 | # 2, 0 6 | # 3, 7 7 | # 4, 1 8 | # 5, 4 9 | # 6, 3 10 | # 7, 5 11 | 12 | CUR_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" 13 | source ${CUR_DIR}/.env 14 | 15 | DEVICE_IDX=0 16 | MODULES_IDX=2 17 | IMAGE_NAME=${IMAGE_NAME}_hpu:${IMAGE_TAG} 18 | CONTAINER_NAME=${CONTAINER_NAME}_hpu 19 | 20 | OUTPUT_DIR_LOCAL=./exp 21 | OUTPUT_DIR_CONTAINER=/app/outputs 22 | docker run -it \ 23 | --expose=${API_PORT} \ 24 | --expose=${WEBAPP_PORT} \ 25 | -v ${OUTPUT_DIR_LOCAL}:${OUTPUT_DIR_CONTAINER} \ 26 | --env=DEVICE=hpu \ 27 | --env=HABANA_VISIBLE_DEVICES=all \ 28 | --env=OMPI_MCA_btl_vader_single_copy_mechanism=none \ 29 | --cap-add=sys_nice \ 30 | --network=host \ 31 | --restart=no \ 32 | --runtime=habana \ 33 | --shm-size=64g \ 34 | --name ${CONTAINER_NAME} \ 35 | -t ${IMAGE_NAME} 36 | -------------------------------------------------------------------------------- /exp/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | -------------------------------------------------------------------------------- /exp/example/0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/MMPano/b176d26bc3311584e746fe88135e8d5e793d3107/exp/example/0.png -------------------------------------------------------------------------------- /exp/example/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/MMPano/b176d26bc3311584e746fe88135e8d5e793d3107/exp/example/1.png -------------------------------------------------------------------------------- /exp/example/example_descriptor.json: -------------------------------------------------------------------------------- 1 | { 2 | "init_image": "exp/example/0.png", 3 | "init_prompt": "a forest", 4 | "generated_text_details": "a road in autumn", 5 | "message": "View 1: We see a narrow, pebbled forest path.\nView 2: We see a rusted, overgrown log bridge crossing a creek.\nView 3: We see an antique lamppost, partially hidden by leaves.\nView 4: We see an old wooden cabin, shrouded in shadows.\nView 5: We see a past-its-prime swing hanging from an oak.\nView 6: We see a moss-laden stone bench facing a pond.", 6 | "message_main_obj": "We see: fallen leaves.\nWe see: a path.", 7 | "message_topdown": "We see: Canopy\nWe see: Leaf-covered path", 8 | "question_for_llm_repeat": "Do we often see multiple a path in a scene with a forest? 
Just say 'yes' or 'no' with all lower case letters", 9 | "description_no_obj": "'a forest'", 10 | "major_obj_number": 2, 11 | "is_repeated": [true, true] 12 | } 13 | -------------------------------------------------------------------------------- /media/ISC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/MMPano/b176d26bc3311584e746fe88135e8d5e793d3107/media/ISC.png -------------------------------------------------------------------------------- /media/blog.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/MMPano/b176d26bc3311584e746fe88135e8d5e793d3107/media/blog.png -------------------------------------------------------------------------------- /media/library.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/MMPano/b176d26bc3311584e746fe88135e8d5e793d3107/media/library.jpg -------------------------------------------------------------------------------- /media/livingRoom.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/MMPano/b176d26bc3311584e746fe88135e8d5e793d3107/media/livingRoom.jpg -------------------------------------------------------------------------------- /media/pano.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/MMPano/b176d26bc3311584e746fe88135e8d5e793d3107/media/pano.png -------------------------------------------------------------------------------- /media/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/MMPano/b176d26bc3311584e746fe88135e8d5e793d3107/media/pipeline.png -------------------------------------------------------------------------------- /media/snow.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/MMPano/b176d26bc3311584e746fe88135e8d5e793d3107/media/snow.jpg -------------------------------------------------------------------------------- /media/underwater.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/MMPano/b176d26bc3311584e746fe88135e8d5e793d3107/media/underwater.jpeg -------------------------------------------------------------------------------- /mm_pano/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/MMPano/b176d26bc3311584e746fe88135e8d5e793d3107/mm_pano/__init__.py -------------------------------------------------------------------------------- /mm_pano/lib/Equirec2Perspec.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import cv2 4 | import numpy as np 5 | 6 | class Equirectangular: 7 | def __init__(self, img_name, text2light=False): 8 | if isinstance(img_name, str): 9 | self._img = cv2.imread(img_name, cv2.IMREAD_COLOR) 10 | else: 11 | self._img = img_name 12 | if text2light: 13 | self._img = np.roll(self._img, -60, axis=0) 14 | 15 | [self._height, self._width, _] = self._img.shape 16 | 17 | 18 | def GetPerspective(self, FOV, THETA, PHI, height, width): 19 | # 20 | # THETA is left/right angle, PHI is up/down angle, both in degree 21 | # 22 | 23 | 
equ_h = self._height 24 | equ_w = self._width 25 | equ_cx = (equ_w - 1) / 2.0 26 | equ_cy = (equ_h - 1) / 2.0 27 | 28 | wFOV = FOV 29 | hFOV = float(height) / width * wFOV 30 | 31 | w_len = np.tan(np.radians(wFOV / 2.0)) 32 | h_len = np.tan(np.radians(hFOV / 2.0)) 33 | 34 | 35 | x_map = np.ones([height, width], np.float32) 36 | y_map = np.tile(np.linspace(-w_len, w_len,width), [height,1]) 37 | z_map = -np.tile(np.linspace(-h_len, h_len,height), [width,1]).T 38 | 39 | D = np.sqrt(x_map**2 + y_map**2 + z_map**2) 40 | xyz = np.stack((x_map,y_map,z_map),axis=2)/np.repeat(D[:, :, np.newaxis], 3, axis=2) 41 | 42 | y_axis = np.array([0.0, 1.0, 0.0], np.float32) 43 | z_axis = np.array([0.0, 0.0, 1.0], np.float32) 44 | [R1, _] = cv2.Rodrigues(z_axis * np.radians(THETA)) 45 | [R2, _] = cv2.Rodrigues(np.dot(R1, y_axis) * np.radians(-PHI)) 46 | 47 | xyz = xyz.reshape([height * width, 3]).T 48 | xyz = np.dot(R1, xyz) 49 | xyz = np.dot(R2, xyz).T 50 | lat = np.arcsin(xyz[:, 2]) 51 | lon = np.arctan2(xyz[:, 1] , xyz[:, 0]) 52 | 53 | lon = lon.reshape([height, width]) / np.pi * 180 54 | lat = -lat.reshape([height, width]) / np.pi * 180 55 | 56 | lon = lon / 180 * equ_cx + equ_cx 57 | lat = lat / 90 * equ_cy + equ_cy 58 | 59 | 60 | 61 | persp = cv2.remap(self._img, lon.astype(np.float32), lat.astype(np.float32), cv2.INTER_CUBIC, borderMode=cv2.BORDER_WRAP) 62 | return persp 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /mm_pano/lib/Perspec2Equirec.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import cv2 4 | import numpy as np 5 | 6 | class Perspective: 7 | def __init__(self, img_name , FOV, THETA, PHI ): 8 | if isinstance(img_name, str): 9 | self._img = cv2.imread(img_name, cv2.IMREAD_COLOR) 10 | else: 11 | self._img = img_name 12 | [self._height, self._width, _] = self._img.shape 13 | self.wFOV = FOV 14 | self.THETA = THETA 15 | self.PHI = PHI 16 | self.hFOV = float(self._height) / self._width * FOV 17 | 18 | self.w_len = np.tan(np.radians(self.wFOV / 2.0)) 19 | self.h_len = np.tan(np.radians(self.hFOV / 2.0)) 20 | 21 | 22 | 23 | def GetEquirec(self,height,width): 24 | # 25 | # THETA is left/right angle, PHI is up/down angle, both in degree 26 | # 27 | 28 | x,y = np.meshgrid(np.linspace(-180, 180,width),np.linspace(90,-90,height)) 29 | 30 | x_map = np.cos(np.radians(x)) * np.cos(np.radians(y)) 31 | y_map = np.sin(np.radians(x)) * np.cos(np.radians(y)) 32 | z_map = np.sin(np.radians(y)) 33 | 34 | xyz = np.stack((x_map,y_map,z_map),axis=2) 35 | 36 | y_axis = np.array([0.0, 1.0, 0.0], np.float32) 37 | z_axis = np.array([0.0, 0.0, 1.0], np.float32) 38 | [R1, _] = cv2.Rodrigues(z_axis * np.radians(self.THETA)) 39 | [R2, _] = cv2.Rodrigues(np.dot(R1, y_axis) * np.radians(-self.PHI)) 40 | 41 | R1 = np.linalg.inv(R1) 42 | R2 = np.linalg.inv(R2) 43 | 44 | xyz = xyz.reshape([height * width, 3]).T 45 | xyz = np.dot(R2, xyz) 46 | xyz = np.dot(R1, xyz).T 47 | 48 | xyz = xyz.reshape([height , width, 3]) 49 | inverse_mask = np.where(xyz[:,:,0]>0,1,0) 50 | 51 | xyz[:,:] = xyz[:,:]/np.repeat(xyz[:,:,0][:, :, np.newaxis], 3, axis=2) 52 | 53 | 54 | lon_map = np.where((-self.w_len 0 and i < 5: 417 | prompt = "Question: is there a {} in this picture (just say yes or no)? 
Answer:".format(extract_words_after_we_see_withFailv2(lines_major_obj[obj_id])) 418 | inputs = processor(image_inpaint, text=prompt, return_tensors="pt").to(device, torch_dtype) 419 | generated_ids = img2text_pipe.generate(**inputs, max_new_tokens=15) 420 | generated_text_repeat = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip() 421 | print("repeated check = {}".format(generated_text_repeat)) 422 | if "yes" in generated_text_repeat: 423 | print(" we see {} in the inpainted view".format(extract_words_after_we_see_withFailv2(lines_major_obj[obj_id]))) 424 | pure_color_bg = True 425 | iter_count += (1.0/num_false) 426 | if not is_repeated_all and iter_count >= 20: 427 | is_repeated_all = True 428 | print("reaching maximum checking iterations, there is a conflict, setting is_repeated to true") 429 | inpainted_cv2 = pil_to_cv2(image_inpaint) 430 | 431 | # we do the same merging step as the 432 | # 1. compute the weight mask for the warped image 433 | dist2zero = distance_transform_edt(mask_accumulate) 434 | 435 | # 2. build weight map according to dist2zero 436 | weight_map_cinpaint = np.ones(mask_accumulate.shape).astype(np.float32) 437 | weight_map_cinpaint[dist2zero <= cinpaint_th] = dist2zero[dist2zero <= cinpaint_th] / cinpaint_th 438 | 439 | # Save image at each step 440 | if sr_pipe is not None: 441 | inpainted_cv2_merge = warped_image_SR * weight_map_cinpaint[:, :, np.newaxis] + inpainted_cv2 * (1 - weight_map_cinpaint)[:, :, np.newaxis] 442 | # filename = os.path.join(output_folder, f"inpaint_step_SR_{i}.png") 443 | else: 444 | inpainted_cv2_merge = warped_image * weight_map_cinpaint[:, :, np.newaxis] + inpainted_cv2 * (1 - weight_map_cinpaint)[:, :, np.newaxis] 445 | # filename = os.path.join(output_folder, f"inpaint_step_{i}.png") 446 | filename = os.path.join(output_folder, f"inpaint_step_{i}.png") 447 | cv2.imwrite(filename, inpainted_cv2_merge) 448 | 449 | # Perform super-resolution on the inpainted_cv2 (not on inpainted_cv2_SR to prevent noise amplification) 450 | if sr_pipe is not None: 451 | # image_inpaint_SR = cv2_to_pil(inpainted_cv2.astype(np.uint8)) 452 | image_inpaint_SR = cv2_to_pil(inpainted_cv2_merge.astype(np.uint8)) 453 | image_inpaint_SR = sr_pipe(prompt=orig_prompt, negative_prompt=orig_negative_prompt, image=image_inpaint_SR, num_inference_steps=sr_inf_step).images[0] 454 | image_inpaint_SR_cv2 = pil_to_cv2(image_inpaint_SR) 455 | filename = os.path.join(output_folder, f"inpaint_step_SR_{i}.png") 456 | cv2.imwrite(filename, image_inpaint_SR_cv2) 457 | 458 | image_list.append(inpainted_cv2) 459 | if sr_pipe is not None: 460 | image_SR_list.append(image_inpaint_SR_cv2) 461 | pose_list.append(pose) 462 | 463 | return 0 464 | 465 | 466 | def parse_args(): 467 | def list_of_num(arg): 468 | return list(map(float, arg.split(','))) 469 | 470 | parser = argparse.ArgumentParser(description='Multimodal Panorama Generation') 471 | parser.add_argument('--device', type=str, default="hpu", choices=["cpu", "cuda", "hpu"], help="Target HW device for Diffusion and BLIP models") 472 | parser.add_argument('--dtype', type=str, default="float32", choices=["float16", "float32", "bfloat16"], help="Datatype for model inference.") 473 | parser.add_argument('--init_prompt', type=str, help='Prompt which will be used for text to panorama generation.') 474 | parser.add_argument('--init_image', type=str, help='Path to a image which will be used for image to panorama generation.') 475 | parser.add_argument('--output_folder', type=str, default='./exp/output') 476 | 
parser.add_argument('--cpu_offload', action="store_true", help="Flag to offload the StableDiffusion pipeline to the CPU") 477 | 478 | parser.add_argument('--text2pano', action="store_true", help="Flag to do text-to-panorama. Otherwise image-to-panorama is performed.") 479 | parser.add_argument('--llm_model_name', type=str, default="mistralai/Mistral-7B-Instruct-v0.2", 480 | choices=_VALIDATED_MODELS, help='Name of the LLM model used for text generation.') 481 | parser.add_argument('--api_key', type=str, default="", help="Your OpenAI API key") 482 | parser.add_argument('--intrinsic', type=list_of_num, default=[1.11733848262, 1.11733848262, 0.5, 0.5], help="Camera intrinsics as four comma-separated floats.") 483 | parser.add_argument('--panorama_descriptor', type=str, help='Path to a descriptor JSON that will be used for panorama generation.') 484 | 485 | parser.add_argument('--do_upscale', action="store_true", help="Flag to use super resolution to upscale the generated images") 486 | parser.add_argument('--major_obj_number', type=int, default=2, choices=[1, 2], help="How many major objects do we want to consider so that they don't repeat?") 487 | parser.add_argument('--sr_inf_step', type=int, default=35, help='Number of inference steps for the super resolution model') 488 | 489 | parser.add_argument('--inpaint_model_name', type=str, default="stabilityai/stable-diffusion-2-inpainting", 490 | help="Diffusion inpainting model name") 491 | parser.add_argument('--blip_model_name', type=str, default="Salesforce/blip2-flan-t5-xl", 492 | help="BLIP model name") 493 | parser.add_argument('--upscaler_model_name', type=str, default="stabilityai/stable-diffusion-x4-upscaler", 494 | help="Super resolution upscaler model name") 495 | 496 | # Generate panorama and video 497 | parser.add_argument('--save_pano_img', action="store_true", help="Flag to save the panorama image.") 498 | parser.add_argument('--gen_video', action="store_true", help="Flag to generate and save a video of the panorama view.") 499 | parser.add_argument('--video_codec', type=str, default="MP4V", choices=["MP4V", "VP09"], 500 | help="Video codec used to generate the video") 501 | args = parser.parse_args() 502 | 503 | # Validate arguments 504 | if len(args.intrinsic) != 4: 505 | raise RuntimeError(f"--intrinsic has to be 4 floating point numbers. 
Got {args.intrinsic}") 506 | 507 | return args 508 | 509 | 510 | def gen_multiviews( 511 | device: str, 512 | dtype: str = "float32", 513 | output_folder: str = "./outputs", 514 | init_prompt: Optional[str] = None, 515 | init_image: Optional[Union[str, Image.Image]] = None, 516 | cpu_offload: bool = False, 517 | # Text generation 518 | text2pano: bool = False, 519 | llm_model_name: str = "mistralai/Mistral-7B-Instruct-v0.2", 520 | api_key: str = "", 521 | panorama_descriptor: Optional[Union[str, Dict[str, str]]] = None, # None, path to JSON, or a dictionary 522 | use_predefine_llm_descriptor: bool = False, 523 | llm_engine = None, 524 | # Panorama generation 525 | intrinsic: List[float] = [1.11733848262, 1.11733848262, 0.5, 0.5], 526 | do_upscale: bool = False, 527 | major_obj_number: int = 2, 528 | sr_inf_step: int = 35, 529 | inpaint_model_name: Optional[str] = "stabilityai/stable-diffusion-2-inpainting", 530 | blip_model_name: Optional[str] = "Salesforce/blip2-flan-t5-xl", 531 | upscaler_model_name: Optional[str] = "stabilityai/stable-diffusion-x4-upscaler", 532 | text2img_model_name: Optional[str] = "stabilityai/stable-diffusion-2-base", 533 | # Pre-loaded pipelines, if any 534 | inpaint_pipe: Optional = None, 535 | processor: Optional = None, 536 | img2text_pipe: Optional = None, 537 | sr_pipe: Optional = None, 538 | text2img_pipe: Optional = None, 539 | **kwargs, 540 | ): 541 | 542 | if is_on_hpu(device) and dtype == "float16": 543 | # Force dtype to be bfloat16 on HPU 544 | dtype = "bfloat16" 545 | 546 | print("===========================================================================") 547 | print(f"Running Multimodal Panorama Generation on {device} in {dtype}.") 548 | print("===========================================================================") 549 | 550 | ################## 551 | # Parse descriptor 552 | ################## 553 | # If given, get the pre-generated LLM descriptions 554 | if panorama_descriptor is not None and use_predefine_llm_descriptor: 555 | if isinstance(panorama_descriptor, dict): 556 | panorama_descriptor = Descriptor(**panorama_descriptor) 557 | elif isinstance(panorama_descriptor, str) and os.path.isfile(panorama_descriptor): 558 | panorama_descriptor = Descriptor.from_json(panorama_descriptor) 559 | elif not isinstance(panorama_descriptor, Descriptor): 560 | raise RuntimeError(f"panorama_descriptor should be a JSON file, Dictionary, or Descriptor type.") 561 | 562 | # If only init_prompt is given in the panorama_descriptor, do the text-to-panorama 563 | if not panorama_descriptor.init_image: 564 | assert panorama_descriptor.init_prompt, "At least one of [`init_prompt`, `init_image`] must be given" 565 | text2pano = True 566 | 567 | elif panorama_descriptor is None and use_predefine_llm_descriptor: 568 | raise RuntimeError(f"`panorama_descriptor` must be provided when setting `use_predefine_llm_descriptor=True`") 569 | 570 | ###################### 571 | # Create output folder 572 | ###################### 573 | if os.path.exists(output_folder): 574 | shutil.rmtree(output_folder) 575 | os.makedirs(output_folder, exist_ok = True) 576 | print(f"Save all outputs to {output_folder}") 577 | 578 | ############################# 579 | # Load pipelines if not given 580 | ############################# 581 | # Inpainting pipeline 582 | if inpaint_pipe is None: 583 | inpaint_pipe = load_diffusion_model(inpaint_model_name, device=device, dtype=dtype, cpu_offload=cpu_offload) 584 | 585 | # Image-to-text pipeline 586 | if processor is None and img2text_pipe is None: 
587 | processor, img2text_pipe = load_blip_model_and_processor(blip_model_name, device=device, dtype=dtype) 588 | elif (processor is not None and img2text_pipe is None) or (processor is None and img2text_pipe is not None): 589 | raise RuntimeError( 590 | "Processor and BLIP model has to be set or not set at the same time. " 591 | f"Got processor={processor}, img2text_pipe={img2text_pipe}." 592 | ) 593 | 594 | # Super resolution 595 | if sr_pipe is None and do_upscale: 596 | # NOTE: Skip upscaler in light version 597 | sr_pipe = load_upscaler_model(upscaler_model_name, device, dtype) 598 | 599 | # Text-to-image 600 | if text2pano and text2img_pipe is None: 601 | # Load Diffusion pipeline 602 | text2img_pipe = load_diffusion_model(text2img_model_name, device=device, dtype=dtype, cpu_offload=cpu_offload) 603 | 604 | # Text generation 605 | if llm_engine is None: 606 | llm_engine = get_llm_engine(llm_model_name, device=device, dtype=dtype, openai_key=api_key) 607 | 608 | ########################### 609 | # Text or Image to Panorama 610 | ########################### 611 | init_prompt = init_prompt if panorama_descriptor is None else panorama_descriptor.init_prompt 612 | init_image = init_image if panorama_descriptor is None else panorama_descriptor.init_image 613 | 614 | t_begin = time.time() 615 | # Use given init_image or generate an init_image from the init_prompt. 616 | # This will be used for generating panorama 617 | if text2pano: 618 | print(f"Generating init image with prompt={init_prompt} ...") 619 | init_image = text2img_pipe(init_prompt, num_inference_steps=25).images[0] 620 | init_image = cv2.cvtColor(np.array(init_image), cv2.COLOR_RGB2BGR) 621 | elif init_image is not None: 622 | if isinstance(init_image, str): 623 | # init_image is a path to a file 624 | print(f"Loading init image from {init_image}") 625 | init_image = cv2.imread(init_image, cv2.IMREAD_COLOR) 626 | elif isinstance(init_image, Image.Image): 627 | init_image = cv2.cvtColor(np.array(init_image), cv2.COLOR_RGB2BGR) 628 | elif isinstance(init_image, np.ndarray): 629 | pass 630 | else: 631 | # TODO(Joey Chou): Add error message 632 | raise RuntimeError("Please do text2pano with a given init_prompt, or pass a init_image to do image to pano") 633 | 634 | # check whether the intrinsic matrix exist 635 | with torch.inference_mode(): 636 | fail = True 637 | while fail: 638 | fail = create_panorama( 639 | init_image, intrinsic, output_folder, processor, img2text_pipe, inpaint_pipe, sr_pipe, device, 640 | sr_inf_step, init_prompt=init_prompt, major_obj_number=major_obj_number, 641 | panorama_descriptor=panorama_descriptor, llm_engine=llm_engine 642 | ) 643 | print(f"Total runtime: {time.time() - t_begin}") 644 | 645 | 646 | def _gen_pano_outputs(images: List[np.ndarray], 647 | out_dir: str, 648 | rotation_degrees: List[int], 649 | fov: float = 99.9169018, gen_video: bool = False, 650 | save_pano_img: bool = True, 651 | # Video related 652 | video_size: Tuple[int, int] = (512, 512), video_codec: str = "MP4V", 653 | new_pano: Optional = None): 654 | """ 655 | To make video works with gradio, please use the setup as below: 656 | * interval_deg = 1.0 657 | * fps: = 60 658 | * video_codec = "VP09" 659 | 660 | For other application that works with mp4v: 661 | * interval_deg = 0.5 662 | * fps = 60 663 | * video_codec = "MP4V" 664 | """ 665 | 666 | if new_pano is None: 667 | _output_image_name = "pano.png" 668 | 669 | ee = m_P2E.Perspective( 670 | images, 671 | [ 672 | [fov, rotation_degrees[0], 0], [fov, rotation_degrees[1], 0], 
[fov, rotation_degrees[2], 0], [fov, rotation_degrees[3], 0], 673 | [fov, rotation_degrees[4], 0], [fov, rotation_degrees[5], 0], [fov, rotation_degrees[6], 0] 674 | ] 675 | ) 676 | 677 | new_pano = ee.GetEquirec(2048, 4096) 678 | 679 | if save_pano_img: 680 | # Output panorama image 681 | cv2.imwrite(os.path.join(out_dir, _output_image_name), new_pano.astype(np.uint8)[540:-540]) 682 | 683 | if gen_video: 684 | if video_codec.upper() == "MP4V": 685 | codec_config = mp4vCodec() 686 | elif video_codec.upper() == "VP09": 687 | codec_config = vp90Codec() 688 | elif video_codec.upper() == "MP4": 689 | codec_config = mp4Codec() 690 | else: 691 | raise RuntimeError(f"Only support codec ['.MP4V', 'VP09']. Got {video_codec}") 692 | 693 | output_video_name = f"video{codec_config.video_format}" 694 | interval_deg = codec_config.interval_deg 695 | 696 | video_codec = codec_config.video_codec 697 | fps = codec_config.fps 698 | 699 | fov = 86 700 | num_frames = int(360 / interval_deg) 701 | 702 | equ = E2P.Equirectangular(new_pano) 703 | img = equ.GetPerspective(fov, 0, 0, *video_size) # Specify parameters(FOV, theta, phi, height, width) 704 | 705 | margin = 0 706 | if margin > 0: 707 | img = img[margin:-margin] 708 | size = (img.shape[1], img.shape[0]) 709 | 710 | save_video_path = os.path.join(out_dir, output_video_name) 711 | print("save_video_path = ", save_video_path, "; ", video_codec, ", ", fps, ", ", size, ", video_size = ", video_size) 712 | out = cv2.VideoWriter(save_video_path, cv2.VideoWriter_fourcc(*video_codec), fps, size) 713 | 714 | for i in tqdm(range(num_frames)): 715 | # Process image 716 | deg = i * interval_deg 717 | img = equ.GetPerspective(fov, deg, 0, *video_size) # Specify parameters(FOV, theta, phi, height, width) 718 | if margin > 0: 719 | img = img[margin:-margin] 720 | img = np.clip(img, 0, 255).astype(np.uint8) 721 | 722 | # Write to video 723 | out.write(img) 724 | out.release() 725 | 726 | # ffmpeg -y -i /root/app/rest_api/api_output/demo/video.mp4v /root/app/rest_api/api_output/demo/video.avc1 727 | return new_pano 728 | 729 | 730 | def gen_pano(images: Optional[List[np.ndarray]] = None, 731 | output_folder: Optional[str] = None, 732 | do_upscale: bool = False, 733 | save_pano_img: bool = True, 734 | gen_video: bool = True, 735 | video_codec: str = "MP4V", 736 | pano: Optional = None, 737 | **kwargs, 738 | ): 739 | # suffix = '_SR' if do_upscale else "" 740 | suffix = "" 741 | image_names = ["input_resized" + suffix + ".png"] 742 | for i in range(6): 743 | image_names.append("inpaint_step" + suffix + "_{}.png".format(i)) 744 | 745 | rotations = [create_rotation_matrix(0, 0, 0).T] 746 | rotation_degrees = [0] 747 | max_step = 6 748 | step_size = 41 749 | vortex_list = generate_left_right_fullPano_pattern(max_step=max_step, step_size=step_size, final_step=55) 750 | for i in range(6): 751 | rotations.append(create_rotation_matrix(vortex_list[i][0], vortex_list[i][1], vortex_list[i][2]).T) 752 | rotation_degrees.append(vortex_list[i][1]) 753 | 754 | LR_images = [] 755 | # read individual images out 756 | for image_name in tqdm(image_names): 757 | LR_images.append(cv2.imread(os.path.join(output_folder, image_name))) 758 | 759 | return _gen_pano_outputs(LR_images, output_folder, rotation_degrees, save_pano_img=save_pano_img, gen_video=gen_video, video_codec=video_codec, new_pano=pano) 760 | 761 | 762 | if __name__ == "__main__": 763 | args = parse_args() 764 | 765 | # Generate multiview scenes 766 | gen_multiviews(**args.__dict__) 767 | 768 | # Generate panorama view and 
optionally generate video 769 | gen_pano(**args.__dict__) 770 | -------------------------------------------------------------------------------- /mm_pano/tgi_gaudi/README.md: -------------------------------------------------------------------------------- 1 | ## Run TGI-Gaudi for LLM Serving 2 | 3 | This is a short instruction to run a TGI Gaudi server to do LLM serving. For more information, please check [TGI-Gaudi](https://github.com/huggingface/tgi-gaudi) 4 | 5 | #### Start a Llama3 TGI-Gaudi serving 6 | ```bash 7 | ./run_tgi_gaudi.sh 8 | ``` 9 | 10 | #### Quick test 11 | ```bash 12 | test_tgi.sh 13 | ``` 14 | -------------------------------------------------------------------------------- /mm_pano/tgi_gaudi/run_tgi_gaudi.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | model=meta-llama/Meta-Llama-3-8B-Instruct 3 | CONTAINER_NAME=tgi-gaudi 4 | volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run 5 | num_shard=2 6 | sharded=true 7 | max_input_length=2048 8 | max_total_tokens=4096 9 | 10 | # Usage: text-generation-launcher < 11 | # --model-id | 12 | # --revision | 13 | # --validation-workers | 14 | # --sharded | 15 | # --num-shard | 16 | # --quantize | 17 | # --speculate | 18 | # --dtype | 19 | # --trust-remote-code| 20 | # --max-concurrent-requests | 21 | # --max-best-of | 22 | # --max-stop-sequences | 23 | # --max-top-n-tokens | 24 | # --max-input-tokens | 25 | # --max-input-length | 26 | # --max-total-tokens | 27 | # --waiting-served-ratio | 28 | # --max-batch-prefill-tokens | 29 | # --max-batch-total-tokens | 30 | # --max-waiting-tokens | 31 | # --max-batch-size | 32 | # --cuda-graphs | 33 | # --hostname | 34 | # --port | 35 | # --shard-uds-path | 36 | # --master-addr | 37 | # --master-port | 38 | # --huggingface-hub-cache | 39 | # --weights-cache-override | 40 | # --disable-custom-kernels| 41 | # --cuda-memory-fraction | 42 | # --rope-scaling | 43 | # --rope-factor | 44 | # --json-output| 45 | # --otlp-endpoint | 46 | # --cors-allow-origin | 47 | # --watermark-gamma | 48 | # --watermark-delta | 49 | # --ngrok| 50 | # --ngrok-authtoken | 51 | # --ngrok-edge | 52 | # --tokenizer-config-path | 53 | # --disable-grammar-support 54 | # 55 | 56 | # -e HUGGING_FACE_HUB_TOKEN= \ 57 | docker run \ 58 | -p 8080:80 \ 59 | -v $volume:/data \ 60 | --runtime=habana \ 61 | -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true \ 62 | -e HABANA_VISIBLE_DEVICES=all \ 63 | -e OMPI_MCA_btl_vader_single_copy_mechanism=none \ 64 | --cap-add=sys_nice \ 65 | --ipc=host \ 66 | --name=${CONTAINER_NAME} \ 67 | ghcr.io/huggingface/tgi-gaudi:2.0.0 \ 68 | --model-id $model --sharded $sharded --num-shard $num_shard --max-input-length $max_input_length --max-total-tokens $max_total_tokens 69 | -------------------------------------------------------------------------------- /mm_pano/tgi_gaudi/test_tgi.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | curl 127.0.0.1:8080/generate \ 4 | -X POST \ 5 | -d '{"inputs":"What is deep learning?","parameters":{"max_new_tokens":2048,"temperature":0.7,"repetition_penalty":1}}' \ 6 | -H 'Content-Type: application/json' 7 | -------------------------------------------------------------------------------- /mm_pano/utils/common.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import json 4 | from dataclasses import dataclass, field 5 | from typing import Optional, List 6 | 7 | 8 | def extract_words_after_we_see_withFailv2(s): 9 | match = re.search('We .*?see: (.*)', s, re.IGNORECASE) 10 | if match: 11 | return match.group(1).replace('.', '').lower() 12 | print("No match found") 13 | return 14 | 15 | 16 | def extract_words_after_we_see_withFailv3(s): 17 | match = re.search('We .*?see(.*)', s, re.IGNORECASE) or re.search('View .*?:(.*)', s, re.IGNORECASE) 18 | if match: 19 | return match.group(1) 20 | print("No match found") 21 | return 22 | 23 | 24 | @dataclass 25 | class Descriptor: 26 | generated_text_details: Optional[str] = None 27 | message: Optional[str] = None 28 | message_main_obj: Optional[str] = None 29 | message_topdown: Optional[str] = None 30 | question_for_llm_repeat: Optional[str] = None 31 | description_no_obj: Optional[str] = None 32 | major_obj_number: int = 2 33 | is_repeated: List[bool] = field(default_factory=list) 34 | 35 | init_prompt: Optional[str] = None 36 | init_image: Optional[str] = None 37 | 38 | @classmethod 39 | def from_json(cls, json_path: str): 40 | assert isinstance(json_path, str) and os.path.isfile(json_path) 41 | with open(json_path, "r") as f: 42 | _dict = json.load(f) 43 | print(_dict) 44 | return cls(**_dict) 45 | 46 | def save_json(self, json_path: str): 47 | assert isinstance(json_path, str) 48 | with open(json_path, "w") as f: 49 | json.dump(self.__dict__, f, indent=4) 50 | 51 | def __post_init__(self): 52 | assert self.init_prompt is not None or self.init_image is not None, \ 53 | "When using Descriptor, either `init_prompt` or `init_image` has to be set. Got both None." 
54 | 55 | if self.init_prompt is not None and self.init_image is not None: 56 | print(f"Both `init_prompt` ({self.init_prompt}) and `init_image` ({self.init_image}) " 57 | " is given, using `init_image` and ignore `init_prompt`") 58 | self.init_prompt = None 59 | 60 | if self.init_image: 61 | assert os.path.isfile(self.init_image), f"The given `init_image` is not a valid file {self.init_image}" 62 | -------------------------------------------------------------------------------- /mm_pano/utils/image_utils.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np, math 3 | from dataclasses import dataclass 4 | 5 | from PIL import Image 6 | import torch 7 | import torchvision.transforms as transforms 8 | 9 | from typing import List, Union 10 | import torch.nn.functional as F 11 | from torchvision.utils import save_image 12 | import argparse 13 | 14 | import scipy.ndimage as ndimage 15 | import os 16 | import glob 17 | import io 18 | import json 19 | from scipy.ndimage import distance_transform_edt 20 | 21 | 22 | def cv2_to_pil(image): 23 | # Convert the cv2 image to RGB format 24 | image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) 25 | 26 | # Convert the cv2 image to a PIL image 27 | pil_image = Image.fromarray(image) 28 | 29 | return pil_image 30 | 31 | 32 | def pil_to_cv2(image): 33 | # Convert the PIL image to a numpy array 34 | np_image = np.array(image) 35 | 36 | # Convert the numpy array to a cv2 image 37 | cv2_image = cv2.cvtColor(np_image, cv2.COLOR_RGB2BGR) 38 | 39 | return cv2_image 40 | 41 | 42 | def pil_to_tensor(image): 43 | # Define a transformation pipeline 44 | transform = transforms.Compose([ 45 | transforms.ToTensor(), 46 | transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) 47 | ]) 48 | 49 | # Apply the transformation to the PIL image 50 | tensor_image = transform(image) 51 | 52 | # Add the batch dimension 53 | return tensor_image.unsqueeze(0) 54 | 55 | 56 | def pil_mask_to_tensor(pil_mask): 57 | # Define the transformation to convert the PIL image to a tensor 58 | transform = transforms.ToTensor() 59 | 60 | # Apply the transformation to the PIL image 61 | tensor_mask = transform(pil_mask) 62 | 63 | # Repeat the tensor along the channel dimension to create 3 channels 64 | tensor_mask = tensor_mask.repeat(3, 1, 1) 65 | 66 | # Add the batch dimension 67 | return tensor_mask.unsqueeze(0) 68 | 69 | 70 | def mask_to_pil(mask): 71 | # Multiply the mask by 255 to get values between 0 and 255 72 | mask = mask * 255 73 | 74 | # Convert the mask to an 8-bit integer numpy array 75 | mask = np.uint8(mask) 76 | 77 | # Create a black and white PIL image from the mask 78 | pil_image = Image.fromarray(mask, mode="L") 79 | 80 | return pil_image 81 | 82 | 83 | def cv2_to_tensor(image): 84 | # Convert the image from BGR to RGB format 85 | image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) 86 | 87 | # Normalize pixel values to range 0.0 to 1.0 88 | image_normalized = image_rgb.astype(np.float32) / 255.0 89 | 90 | # Transpose the image array to have the shape (3, h, w) 91 | image_transposed = np.transpose(image_normalized, (2, 0, 1)) 92 | 93 | # Convert the numpy array to a PyTorch tensor and add a batch dimension 94 | tensor = torch.from_numpy(image_transposed).unsqueeze(0) 95 | 96 | return tensor 97 | 98 | 99 | def tensor_to_cv2(tensor_image): 100 | # Convert tensor image to a numpy array 101 | image_np = (tensor_image * 255).numpy().astype(np.uint8) 102 | 103 | # Transpose the numpy array to have the shape (h, w, 3) 
104 | image_np_transposed = np.transpose(image_np, (0, 2, 3, 1)) 105 | 106 | # Remove the batch dimension 107 | image_cv2_float = np.squeeze(image_np_transposed, axis=0) 108 | 109 | # Convert the RGB order to BGR order 110 | return cv2.cvtColor(image_cv2_float, cv2.COLOR_RGB2BGR) 111 | 112 | 113 | def tensor_mask_to_numpy(mask): 114 | # Convert the mask to a numpy array 115 | mask_np = mask.numpy() 116 | 117 | # Remove the batch and channel dimensions 118 | return np.squeeze(mask_np, axis=(0, 3)) 119 | 120 | 121 | def save_mask_to_png(mask, filename): 122 | assert len(mask.shape) == 3 and mask.shape[-1] == 1, "Invalid mask shape. Expected (h, w, 1)." 123 | 124 | # Convert the mask to an integer array with values in the range [0, 255] 125 | mask_255 = (mask * 255).astype(np.uint8) 126 | 127 | # Repeat the single channel mask to create a 3 channel image 128 | mask_3_channels = np.repeat(mask_255, 3, axis=-1) 129 | 130 | # Save the image 131 | img = Image.fromarray(mask_3_channels) 132 | img.save(f"{filename}") 133 | 134 | 135 | 136 | def warp_image_v2(image, K_original, K_new, R, output_size): 137 | """ 138 | Warp an image to a new view given the original and new camera intrinsic matrices, relative rotation, 139 | and output image size. 140 | 141 | Parameters: 142 | image (numpy.ndarray): The original image. 143 | K_original (numpy.ndarray): The original camera's intrinsic matrix. 144 | K_new (numpy.ndarray): The new camera's intrinsic matrix. 145 | R (numpy.ndarray): The relative rotation matrix. 146 | output_size (tuple): The desired output image size (width, height). 147 | 148 | Returns: 149 | warped_image (numpy.ndarray): The warped image. 150 | mask (numpy.ndarray): Mask indicating if a pixel in the warped image has a corresponding pixel in the original image. 
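    Example (illustrative; `K` stands for an assumed 3x3 pinhole intrinsic matrix):
        R = create_rotation_matrix(0, 30, 0)  # rotate 30 degrees about the y axis
        warped, mask = warp_image_v2(image, K, K, R, (512, 512))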
151 | """ 152 | 153 | # Compute the transformation matrix using the scaled new camera intrinsic 154 | T = K_new.dot(R).dot(np.linalg.inv(K_original)) 155 | 156 | # Warp the image using the new transformation matrix to the specified output size 157 | warped_image = cv2.warpPerspective(image, T, output_size) 158 | 159 | # Create and warp the mask 160 | mask = np.ones((image.shape[0], image.shape[1]), dtype=np.uint8) * 255 161 | mask_warped = cv2.warpPerspective(mask, T, output_size, cv2.INTER_NEAREST, borderMode=cv2.BORDER_CONSTANT, borderValue=(0,0,0)) 162 | 163 | # Convert mask to binary (0 or 1) 164 | mask_binary = (mask_warped > 250).astype(np.uint8) 165 | 166 | return warped_image, mask_binary 167 | 168 | # compute the nearest unmasked region 169 | def mask_to_NN_v2(mask, invert = False): 170 | # print("mask = {}/{}".format(mask, mask.shape)) 171 | # Set a threshold value to create a binary mask 172 | threshold = 0.5 173 | binary_mask = mask > threshold 174 | 175 | if invert: 176 | # Invert the binary_mask to find the unmasked pixels 177 | binary_mask = 1 - binary_mask 178 | 179 | # Convert the inverted_mask to a NumPy array 180 | inverted_mask_np = binary_mask 181 | 182 | # Compute the distance transform on the inverted mask 183 | distance_transform = ndimage.distance_transform_edt(inverted_mask_np) 184 | 185 | # Convert the distance transform back to a PyTorch tensor 186 | return torch.tensor(distance_transform, dtype=torch.float32) 187 | 188 | 189 | def generate_left_right_fullPano_pattern(max_step = 8, step_size = 42, final_step = 42): 190 | pattern = [] 191 | 192 | start_step = 1 193 | angle_begin = step_size 194 | angle_end = (360 - step_size * (max_step // 2 - 1) + step_size * (max_step // 2)) / 2 195 | step_mid = angle_end - step_size * (max_step // 2) 196 | for step in range(start_step, max_step+1): 197 | if step <= max_step // 2: 198 | pattern.append((0, angle_begin, 0)) 199 | angle_begin += step_size 200 | else: 201 | pattern.append((0, angle_end, 0)) 202 | if step != (max_step // 2 + 1): 203 | angle_end += step_size 204 | else: 205 | angle_end += step_mid 206 | 207 | print(f"pattern = {pattern}") 208 | return pattern 209 | 210 | 211 | def create_rotation_matrix(x_angle_degrees, y_angle_degrees, z_angle_degrees): 212 | x_angle_radians = np.radians(x_angle_degrees) 213 | y_angle_radians = np.radians(y_angle_degrees) 214 | z_angle_radians = np.radians(z_angle_degrees) 215 | 216 | cos_x, sin_x = np.cos(x_angle_radians), np.sin(x_angle_radians) 217 | cos_y, sin_y = np.cos(y_angle_radians), np.sin(y_angle_radians) 218 | cos_z, sin_z = np.cos(z_angle_radians), np.sin(z_angle_radians) 219 | 220 | R_x = np.array([[1, 0, 0], 221 | [0, cos_x, -sin_x], 222 | [0, sin_x, cos_x]]) 223 | 224 | R_y = np.array([[cos_y, 0, sin_y], 225 | [0, 1, 0], 226 | [-sin_y, 0, cos_y]]) 227 | 228 | R_z = np.array([[cos_z, -sin_z, 0], 229 | [sin_z, cos_z, 0], 230 | [0, 0, 1]]) 231 | 232 | R = R_y @ R_x @ R_z 233 | 234 | return R 235 | 236 | def read_file_into_list(file_path): 237 | # Initialize an empty list to hold the lines 238 | lines_list = [] 239 | 240 | # Open the file in read mode ('r') 241 | with io.open(file_path, 'r', encoding='utf8') as file: 242 | # Read each line in the file 243 | for line in file: 244 | # Add the line to the list (removing any trailing whitespace characters) 245 | lines_list.append(line.rstrip()) 246 | 247 | # Return the list of lines 248 | return lines_list 249 | 250 | def save_dict_to_file(dict_obj, file_name): 251 | with open(file_name, 'w') as file: 252 | 
json.dump(dict_obj, file) 253 | 254 | 255 | def load_dict_from_file(file_name): 256 | with open(file_name, 'r') as file: 257 | return json.load(file) 258 | 259 | 260 | 261 | def check_fov_overlap_simplified(rotation_matrix, fov1): 262 | """ 263 | Simplified check if there is an overlap in the field of view of two images based on rotation angle. 264 | 265 | Parameters: 266 | rotation_matrix (numpy.ndarray): 3x3 rotation matrix from image1 to image2 267 | fov1 (tuple): Field of view of image1 (horizontal_angle, vertical_angle) in degrees 268 | 269 | Returns: 270 | bool: True if there is an overlap, False otherwise 271 | """ 272 | 273 | # Calculate the rotation angle from the rotation matrix 274 | rotation_angle_rad = np.arccos((np.trace(rotation_matrix) - 1) / 2) 275 | rotation_angle_deg = np.degrees(rotation_angle_rad) 276 | 277 | # # Compare with the FOV (considering the larger of the horizontal or vertical FOV) 278 | return rotation_angle_deg <= fov1 279 | 280 | 281 | 282 | 283 | @dataclass 284 | class vp90Codec: 285 | interval_deg: float = 1.0 286 | fps: float = 60 287 | video_codec = "VP09" 288 | video_format = ".webm" 289 | 290 | 291 | @dataclass 292 | class mp4vCodec: 293 | interval_deg: float = 0.5 294 | fps: float = 60 295 | video_codec = "mp4v" 296 | video_format = ".mp4" 297 | 298 | 299 | @dataclass 300 | class mp4Codec: 301 | interval_deg: float = 0.5 302 | fps: float = 60 303 | video_codec = "h264" 304 | video_format = ".mp4" 305 | -------------------------------------------------------------------------------- /mm_pano/utils/llm_engines.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from typing import Optional, Dict 3 | from abc import ABC, abstractmethod 4 | 5 | from text_generation import Client 6 | from utils.model_utils import load_llm 7 | 8 | _VALIDATED_MODELS = [ 9 | "gpt-4", "gpt-3.5-turbo", 10 | "meta-llama/Meta-Llama-3-8B-Instruct", 11 | "mistralai/Mistral-7B-Instruct-v0.2", 12 | "tgi", 13 | ] 14 | 15 | 16 | class BaseLLMEngine(ABC): 17 | @abstractmethod 18 | def chat(self, user_content:str, system_content: str, history: Optional[str] = None): 19 | pass 20 | 21 | def extract_output(self, output: str) -> str: 22 | return output 23 | 24 | 25 | class OpenAILLMEngine(BaseLLMEngine): 26 | def __init__(self, 27 | model_engine: str = None, 28 | openai=None, 29 | openai_key: str = None): 30 | self.model_engine = model_engine 31 | self.openai = openai 32 | self.openai.api_key = openai_key 33 | print(f"Using model engine {self.model_engine} to generate text") 34 | 35 | def chat(self, 36 | user_content: str, 37 | system_content: str = "You are a helpful assistant.", 38 | history: str = None) -> str: 39 | 40 | message = self.openai.ChatCompletion.create( 41 | model=self.model_engine, 42 | messages=[ 43 | {"role": "system", "content": system_content}, 44 | {"role": "user", "content": user_content}, 45 | ]).choices[0]['message']['content'] 46 | 47 | # For now the history always = None 48 | return message, None 49 | 50 | 51 | class QwenLLMEngine(BaseLLMEngine): 52 | def __init__(self, 53 | tokenizer, 54 | model): 55 | self.tokenizer = tokenizer 56 | self.model = model 57 | 58 | def chat(self, 59 | user_content: str, 60 | system_content: str = "You are a helpful assistant.", 61 | history: str = None): 62 | message, history = self.model.chat(self.tokenizer, user_content, history=history) 63 | return message, history 64 | 65 | 66 | class MistralLLMEngine(BaseLLMEngine): 67 | def __init__(self, 68 | tokenizer, 69 | model, 70 | 
default_generate_kwargs: Optional[Dict] = None): 71 | self.tokenizer = tokenizer 72 | self.model = model 73 | self.default_generate_kwargs = {} if default_generate_kwargs is None else default_generate_kwargs 74 | 75 | def chat(self, 76 | user_content: str, 77 | system_content: str = "You are a helpful assistant.", 78 | history: str = None, 79 | generate_kwargs: Optional[Dict] = None): 80 | 81 | _generate_kwargs = copy.deepcopy(self.default_generate_kwargs) 82 | if generate_kwargs is not None: 83 | _generate_kwargs.update(generate_kwargs) 84 | 85 | messages = [ 86 | {"role": "user", "content": user_content}, 87 | ] 88 | model_inputs = self.tokenizer.apply_chat_template(messages, return_tensors="pt").to(self.model.device) 89 | generated_ids = self.model.generate(model_inputs, **_generate_kwargs) 90 | decoded = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True) 91 | 92 | # TODO(Tien Pei Chou): Find a better way to only output the new tokens. 93 | return decoded[0][decoded[0].rfind("[/INST]") + len("[/INST] "):], None 94 | 95 | 96 | class Llama3LLMEngine(BaseLLMEngine): 97 | def __init__(self, 98 | tokenizer, 99 | model, 100 | default_generate_kwargs: Optional[Dict] = None): 101 | self.tokenizer = tokenizer 102 | self.model = model 103 | self.default_generate_kwargs = {} if default_generate_kwargs is None else default_generate_kwargs 104 | 105 | def extract_output(self, output: str) -> str: 106 | return output[output.rfind("assistant\n\n") + len("assistant\n\n"):] 107 | 108 | def chat(self, 109 | user_content: str, 110 | system_content: str = "You are a helpful assistant.", 111 | history: str = None, 112 | generate_kwargs: Optional[Dict] = None): 113 | 114 | _generate_kwargs = copy.deepcopy(self.default_generate_kwargs) 115 | if generate_kwargs is not None: 116 | _generate_kwargs.update(generate_kwargs) 117 | 118 | messages = [ 119 | {"role": "system", "content": system_content}, 120 | {"role": "user", "content": user_content}, 121 | ] 122 | 123 | model_inputs = self.tokenizer.apply_chat_template(messages, add_generation_prompt=False, return_tensors="pt").to(self.model.device) 124 | generated_ids = self.model.generate(model_inputs, **_generate_kwargs) 125 | decoded = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True) 126 | 127 | # TODO(Tien Pei Chou): Find a better way to only output the new tokens. 
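        # `batch_decode` returns the full rendered conversation (system and user
        # turns plus the completion), so `extract_output` keeps only the text after
        # the last "assistant\n\n" header emitted by the Llama-3 chat template.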
128 |         return self.extract_output(decoded[0]), None
129 | 
130 | 
131 | class TGILLMEngine(BaseLLMEngine):
132 |     def __init__(self,
133 |                  tgi_url: Optional[str] = "http://127.0.0.1:8080",
134 |                  default_generate_kwargs: Optional[Dict] = None):
135 |         self.client = Client(tgi_url)
136 |         self.default_generate_kwargs = {} if default_generate_kwargs is None else default_generate_kwargs
137 | 
138 |     def chat(self,
139 |              user_content: str,
140 |              system_content: str = "You are a helpful assistant.",
141 |              history: str = None,
142 |              generate_kwargs: Optional[Dict] = None):
143 | 
144 |         _generate_kwargs = copy.deepcopy(self.default_generate_kwargs)
145 |         if generate_kwargs is not None:
146 |             _generate_kwargs.update(generate_kwargs)
147 | 
148 |         response = self.client.generate(user_content, **_generate_kwargs, return_full_text=False)
149 | 
150 |         return self.extract_output(response.generated_text), None
151 | 
152 | 
153 | def get_llm_engine(model_name: str,
154 |                    dtype: Optional[str] = "float32",
155 |                    device: Optional[str] = "hpu",
156 |                    openai_key: Optional[str] = None,
157 |                    hf_token: Optional[str] = None,
158 |                    tgi_url: Optional[str] = "http://127.0.0.1:8080"):
159 |     if model_name in ["gpt-4", "gpt-3.5-turbo"]:
160 |         import openai
161 |         assert openai_key is not None, "Please set the `openai_key` when using OpenAI API"
162 |         print(f"Using OpenAI {model_name} API for text generation ...")
163 |         return OpenAILLMEngine(model_engine=model_name, openai=openai, openai_key=openai_key)
164 |     elif model_name == "mistralai/Mistral-7B-Instruct-v0.2":
165 |         tokenizer, model = load_llm(
166 |             model_name=model_name,
167 |             device=device,
168 |             dtype=dtype,
169 |             trust_remote_code=True,
170 |             hf_token=hf_token)
171 |         default_generate_kwargs = {
172 |             "do_sample": True,
173 |             "temperature": 0.7,
174 |             "max_new_tokens": 256
175 |         }
176 |         print(f"Using {model_name} for text generation ...")
177 |         return MistralLLMEngine(tokenizer, model, default_generate_kwargs=default_generate_kwargs)
178 |     elif "Llama" in model_name: # Ex: "meta-llama/Meta-Llama-3-8B-Instruct"
179 |         tokenizer, model = load_llm(
180 |             model_name=model_name,
181 |             device=device,
182 |             dtype=dtype,
183 |             trust_remote_code=True,
184 |             hf_token=hf_token)
185 |         default_generate_kwargs = {
186 |             "do_sample": True,
187 |             "temperature": 0.6,
188 |             "max_new_tokens": 256
189 |         }
190 |         print(f"Using {model_name} for text generation ...")
191 |         return Llama3LLMEngine(tokenizer, model, default_generate_kwargs=default_generate_kwargs)
192 |     elif "tgi" in model_name:
193 |         assert tgi_url is not None, "Must pass a URL to the client when using TGI-Gaudi"
194 |         default_generate_kwargs = {
195 |             "do_sample": True,
196 |             "temperature": 0.6,
197 |             "max_new_tokens": 256
198 |         }
199 |         return TGILLMEngine(tgi_url, default_generate_kwargs)
200 |     else:
201 |         raise NotImplementedError(f"Got unsupported model {model_name}")
202 | 
--------------------------------------------------------------------------------
/mm_pano/utils/model_utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | from typing import Union, Optional
3 | 
4 | import torch
5 | from transformers import BlipProcessor, BlipForConditionalGeneration
6 | from transformers import Blip2Processor, Blip2ForConditionalGeneration
7 | from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler, StableDiffusionUpscalePipeline
8 | 
9 | 
10 | def is_on_hpu(device: str) -> bool:
11 |     """ Return True if the device is a Gaudi/HPU device.
12 | """ 13 | return "hpu" in device 14 | 15 | 16 | def get_datatype(data_type: Union[str, torch.dtype]): 17 | if isinstance(data_type, torch.dtype): 18 | return data_type 19 | if data_type in ["fp32", "float32"]: 20 | return torch.float 21 | elif data_type in ["fp16", "float16"]: 22 | return torch.float16 23 | elif data_type in ["bfloat16", "bf16"]: 24 | return torch.bfloat16 25 | else: 26 | raise RuntimeError(f"Got unknown dtype {data_type}") 27 | 28 | 29 | def optimize_stable_diffusion_pipeline(pipeline, device, datatype, cpu_offload: bool = False, enable_xformers: bool = False): 30 | pipeline.to(device) 31 | if is_on_hpu(device): 32 | assert datatype in ["bfloat16", "float32"] or datatype in [torch.bfloat16, torch.float32] 33 | pass 34 | else: 35 | # Cuda 36 | # TODO(Joey): Check if there is an Intel version of xformers 37 | # pipeline.unet = torch.compile(pipeline.unet) 38 | if enable_xformers: 39 | pipeline.set_use_memory_efficient_attention_xformers(enable_xformers) 40 | 41 | if cpu_offload: 42 | pipeline.enable_sequential_cpu_offload() 43 | pipeline.enable_model_cpu_offload() 44 | return pipeline.to(device) 45 | 46 | 47 | def optimize_blip(model, device, datatype): 48 | model.to(device) 49 | return model 50 | 51 | 52 | def load_diffusion_model(model_name: str = "stabilityai/stable-diffusion-2-inpainting", 53 | device: str = "cuda", 54 | dtype: Union[str, torch.dtype] = "float16", 55 | cpu_offload: bool = False): 56 | """ Load diffusion or diffusion inpainting model for text-to-image. 57 | """ 58 | print(f"Loading text-to-image model {model_name} ...") 59 | 60 | torch_dtype = get_datatype(dtype) 61 | 62 | if is_on_hpu(device=device): 63 | assert dtype in ["bfloat16", "float32"] or dtype in [torch.bfloat16, torch.float32] 64 | from optimum.habana.diffusers import GaudiDDIMScheduler 65 | if "inpaint" in model_name: 66 | from optimum.habana.diffusers import GaudiStableDiffusionInpaintPipeline as DiffusionPipelineClass 67 | else: 68 | from optimum.habana.diffusers import GaudiStableDiffusionPipeline as DiffusionPipelineClass 69 | 70 | # Load model and scheduler on Gaudi/HPU 71 | scheduler = GaudiDDIMScheduler.from_pretrained(model_name, subfolder="scheduler") 72 | kwargs = { 73 | "scheduler": scheduler, 74 | "use_habana": True, 75 | "use_hpu_graphs": True, 76 | "gaudi_config": "Habana/stable-diffusion" 77 | } 78 | pipe = DiffusionPipelineClass.from_pretrained(model_name, **kwargs).to(torch_dtype) 79 | else: 80 | # Load model and scheduler 81 | pipe = DiffusionPipeline.from_pretrained(model_name, torch_dtype=torch_dtype) 82 | pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) 83 | # TODO(Joey): Check why enable xformers + torch.compile for inpainting model gets error 84 | pipe = optimize_stable_diffusion_pipeline(pipe, device, torch_dtype, cpu_offload, enable_xformers=False) 85 | return pipe 86 | 87 | 88 | def load_blip_model_and_processor(model_name: str = "Salesforce/blip2-flan-t5-xl", # "Salesforce/blip2-opt-2.7b" 89 | device: str = "cuda", 90 | dtype: Union[str, torch.dtype] = "float16"): 91 | """ Load BLIP model for image-to-text. 
92 |     """
93 |     print(f"Loading image-to-text model {model_name} ...")
94 | 
95 |     torch_dtype = get_datatype(dtype)
96 | 
97 |     if "blip2" in model_name:
98 |         processor_class = Blip2Processor
99 |         model_class = Blip2ForConditionalGeneration
100 |     else:
101 |         # Blip
102 |         assert "blip" in model_name
103 |         processor_class = BlipProcessor
104 |         model_class = BlipForConditionalGeneration
105 | 
106 |     if is_on_hpu(device=device):
107 |         assert dtype in ["bfloat16", "float32"] or dtype in [torch.bfloat16, torch.float32]
108 |         from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
109 | 
110 |         # TODO(Joey): Check optimum-habana once it has Blip2 support
111 |         adapt_transformers_to_gaudi()
112 |     else:
113 |         # Non-HPU (e.g. CUDA) path: keep the BLIP/BLIP-2 classes selected above
114 |         pass
115 | 
116 |     # Load the BLIP processor and model
117 |     processor = processor_class.from_pretrained(model_name)
118 |     model = model_class.from_pretrained(model_name, torch_dtype=torch_dtype)
119 |     model = optimize_blip(model, device, torch_dtype)
120 | 
121 |     return processor, model
122 | 
123 | 
124 | def load_upscaler_model(model_name: str = "stabilityai/stable-diffusion-x4-upscaler",
125 |                         device: str = "cuda",
126 |                         dtype: Union[str, torch.dtype] = "float16",
127 |                         cpu_offload: bool = False):
128 |     """ Load super resolution model for upscaling.
129 |     """
130 |     print(f"Loading super resolution model {model_name} ...")
131 | 
132 |     torch_dtype = get_datatype(dtype)
133 | 
134 |     if is_on_hpu(device=device):
135 |         assert dtype in ["bfloat16", "float32"] or dtype in [torch.bfloat16, torch.float32]
136 |         from optimum.habana.diffusers import GaudiDDIMScheduler, GaudiStableDiffusionUpscalePipeline
137 | 
138 |         # Load model and scheduler on Gaudi/HPU
139 |         scheduler = GaudiDDIMScheduler.from_pretrained(model_name, subfolder="scheduler")
140 |         kwargs = {
141 |             "scheduler": scheduler,
142 |             "use_habana": True,
143 |             "use_hpu_graphs": True,
144 |             "gaudi_config": "Habana/stable-diffusion"
145 |         }
146 |         pipe = GaudiStableDiffusionUpscalePipeline.from_pretrained(model_name, **kwargs).to(torch_dtype)
147 |     else:
148 |         # Load model and scheduler
149 |         pipe = StableDiffusionUpscalePipeline.from_pretrained(model_name, torch_dtype=torch_dtype)
150 |         # TODO(Joey): Check why enable xformers + torch.compile for inpainting model gets error
151 |         pipe = optimize_stable_diffusion_pipeline(pipe, device, torch_dtype, cpu_offload, enable_xformers=False)
152 | 
153 |     return pipe
154 | 
155 | 
156 | def load_llm(model_name: str = "mistralai/Mistral-7B-Instruct-v0.2",
157 |              device: str = "cuda",
158 |              dtype: Optional[Union[str, torch.dtype]] = "float16",
159 |              trust_remote_code: bool = False,
160 |              hf_token: Optional[str] = None):
161 |     """ Load LLM model.
162 | """ 163 | print(f"Loading LLM {model_name} ...") 164 | 165 | if is_on_hpu(device=device): 166 | assert dtype in ["bfloat16", "float32"] or dtype in [torch.bfloat16, torch.float32] 167 | from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi 168 | adapt_transformers_to_gaudi() 169 | 170 | from transformers import AutoModelForCausalLM, AutoTokenizer 171 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=trust_remote_code, token=hf_token) 172 | model = AutoModelForCausalLM.from_pretrained( 173 | model_name, trust_remote_code=trust_remote_code, token=hf_token).eval().to(device) 174 | 175 | tokenizer.pad_token_id = tokenizer.eos_token_id 176 | 177 | return tokenizer, model 178 | 179 | 180 | def release_memory(model, tokenizer, device: str = "cuda"): 181 | import gc 182 | del tokenizer 183 | del model 184 | 185 | if device == "cuda": 186 | torch.cuda.empty_cache() 187 | else: 188 | # TODO(Tien Pei Chou): Add Gaudi and XPU 189 | raise NotImplementedError() 190 | # accelerator.free_memory() 191 | gc.collect() 192 | -------------------------------------------------------------------------------- /requirements-api.txt: -------------------------------------------------------------------------------- 1 | # fastAPI and gradio 2 | fastapi 3 | uvicorn 4 | gradio 5 | envbash 6 | -------------------------------------------------------------------------------- /requirements-hpu.txt: -------------------------------------------------------------------------------- 1 | # Fix to the main branch once the PR (https://github.com/huggingface/optimum-habana/pull/869/commits) gets merged 2 | git+https://github.com/huggingface/optimum-habana.git@8893d602289226eda82cf19c79951fa12d15e1b9 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | wheel 2 | diffusers 3 | transformers 4 | scipy 5 | openai==0.27.2 6 | matplotlib 7 | imageio[ffmpeg] 8 | opencv-python 9 | accelerate 10 | einops 11 | auto-gptq 12 | tiktoken 13 | transformers_stream_generator 14 | text-generation # TGI and TGI Gaudi 15 | --------------------------------------------------------------------------------
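A minimal usage sketch for the `Descriptor` dataclass defined in `mm_pano/utils/common.py`. The prompt text and the output path below are illustrative, and the import assumes the interpreter runs from `mm_pano/` so that `utils` is importable (the same convention the package's own modules use):

from utils.common import Descriptor

# Build a descriptor from an initial text prompt; `init_image` works the same
# way but must then point at an existing image file.
desc = Descriptor(init_prompt="a cozy living room with large windows")
desc.save_json("exp/my_descriptor.json")

# Reload it later; `from_json` checks that the path exists and restores every field.
restored = Descriptor.from_json("exp/my_descriptor.json")
print(restored.init_prompt, restored.major_obj_number)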
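Similarly, a minimal sketch of the `get_llm_engine` factory in `mm_pano/utils/llm_engines.py`, mirroring the request issued by `mm_pano/tgi_gaudi/test_tgi.sh`. It assumes a TGI or TGI-Gaudi server is already listening on the default URL and that `utils` is importable as above:

from utils.llm_engines import get_llm_engine

# Route text generation through a running TGI / TGI-Gaudi server.
engine = get_llm_engine("tgi", tgi_url="http://127.0.0.1:8080")
message, _ = engine.chat("Describe what we might see to the left of a snowy cabin.")
print(message)

# Local checkpoints use the same interface, e.g. (gated models need an HF token):
# engine = get_llm_engine("meta-llama/Meta-Llama-3-8B-Instruct",
#                         dtype="bfloat16", device="hpu", hf_token="<your token>")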