├── .gitignore
├── LICENSE
├── README.md
├── app.py
├── data
    ├── demo.gif
    ├── drive_images
    │   ├── hhh.jpg
    │   ├── hrwh.jpg
    │   ├── jue.jpg
    │   ├── yao.jpg
    │   └── ysll.jpg
    ├── drive_videos
    │   ├── amns.mp4
    │   ├── jgz.mp4
    │   ├── jue.mp4
    │   ├── nhs.mp4
    │   ├── nice.mp4
    │   ├── qie.mp4
    │   ├── tbh.mp4
    │   ├── tiktok.mp4
    │   ├── tndtc.mp4
    │   ├── xzy1.mp4
    │   ├── xzy2.mp4
    │   ├── xzy3.mp4
    │   └── ysll.mp4
    ├── harris_yao.jpg
    ├── harris_yao_toon.jpg
    ├── jue.gif
    ├── reference_images
    │   ├── BeautyFool.jpg
    │   ├── chillout.jpg
    │   ├── civitai1.jpg
    │   ├── firefly.jpg
    │   ├── harris.jpg
    │   ├── kjl.jpg
    │   ├── majicmix1.jpg
    │   ├── majicmix2.jpg
    │   ├── majicmix3.jpg
    │   ├── majicmix8.jpg
    │   ├── mimic1.jpg
    │   ├── show1.jpg
    │   ├── show4.jpg
    │   ├── show6.jpg
    │   ├── toon.png
    │   ├── trump.jpg
    │   ├── wukong1.jpg
    │   └── zzj.jpg
    ├── trump_jue-toon.gif
    └── trump_jue.gif
├── environment.yml
├── generator.py
├── hellomeme
    ├── __init__.py
    ├── model_config.json
    ├── models
    │   ├── __init__.py
    │   ├── hm3_denoising_3d.py
    │   ├── hm3_denoising_motion.py
    │   ├── hm_adapters.py
    │   ├── hm_blocks.py
    │   ├── hm_control.py
    │   ├── hm_denoising_3d.py
    │   └── hm_denoising_motion.py
    ├── pipelines
    │   ├── __init__.py
    │   ├── pipline_hm3_image.py
    │   ├── pipline_hm3_video.py
    │   ├── pipline_hm5_image.py
    │   ├── pipline_hm5_video.py
    │   ├── pipline_hm_image.py
    │   └── pipline_hm_video.py
    ├── tools
    │   ├── __init__.py
    │   ├── hello_3dmm.py
    │   ├── hello_arkit.py
    │   ├── hello_camera_demo.py
    │   ├── hello_face_alignment.py
    │   ├── hello_face_det.py
    │   ├── pdf.py
    │   ├── sr.py
    │   └── utils.py
    └── utils.py
├── inference_image.py
└── inference_video.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | *.pyc
 2 | *.pyd
 3 | *.pth
 4 | *.pkl
 5 | *.mp4
 6 | *.jpg
 7 | *.png
 8 | *_fps15.mp4
 9 | .idea/
10 | .gradio/
11 | data/results
12 | pretrained_models/
13 | __pycache__/
14 | *~
15 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 HelloVision
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <h1 align='center'>HelloMeme: Integrating Spatial Knitting Attentions to Embed High-Level and Fidelity-Rich Conditions in Diffusion Models</h1>
  2 | 
  3 | <div align='center'>
  4 |     <a href='https://github.com/songkey' target='_blank'>Shengkai Zhang</a>, <a href='https://github.com/RhythmJnh' target='_blank'>Nianhong Jiao</a>, <a href='https://github.com/Shelton0215' target='_blank'>Tian Li</a>, <a href='https://github.com/chaojie12131243' target='_blank'>Chaojie Yang</a>, <a href='https://github.com/xchgit' target='_blank'>Chenhui Xue</a><sup>*</sup>, <a href='https://github.com/boya34' target='_blank'>Boya Niu</a><sup>*</sup>, <a href='https://github.com/HelloVision/HelloMeme' target='_blank'>Jun Gao</a> 
  5 | </div>
  6 | 
  7 | <div align='center'>
  8 |     HelloVision | HelloGroup Inc.
  9 | </div>
 10 | 
 11 | <div align='center'>
 12 |     <small><sup>*</sup> Intern</small>
 13 | </div>
 14 | 
 15 | <br>
 16 | <div align='center'>
 17 |     <a href='https://github.com/HelloVision/HelloMeme'><img src='https://img.shields.io/github/stars/HelloVision/HelloMeme'></a>
 18 |     <a href='https://songkey.github.io/hellomeme/'><img src='https://img.shields.io/badge/Project-HomePage-Green'></a>
 19 |     <a href='https://arxiv.org/pdf/2410.22901'><img src='https://img.shields.io/badge/Paper-Arxiv-red'></a>
 20 |     <a href='https://huggingface.co/songkey'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Model-yellow'></a>
 21 |     <a href='https://github.com/HelloVision/ComfyUI_HelloMeme'><img src='https://img.shields.io/badge/ComfyUI-UI-blue'></a>
 22 |     <a href='https://www.modelscope.cn/studios/songkey/HelloMeme'><img src='https://img.shields.io/badge/modelscope-Demo-red'></a>
 23 | </div>
 24 | 
 25 | <p align="center">
 26 |   <img src="data/demo.gif" alt="showcase">
 27 | </p>
 28 | 
 29 | ## 🔆 New Features/Updates
 30 | 
 31 | - ☐ [`ExperimentsOnSKAttentions`](https://github.com/HelloVision/ExperimentsOnSKAttentions) for ablation experiments.
 32 | - ☐ SDXL version.
 33 | - ✅ `02/09/2025` **HelloMemeV3** is now available.
 34 | [YouTube Demo](https://www.youtube.com/watch?v=DAUA0EYjsZA)
 35 | 
 36 | - ✅ `12/17/2024` Added modelscope [Demo](https://www.modelscope.cn/studios/songkey/HelloMeme).
 37 | - ✅ `12/13/2024` Rewrote the code for the Gradio app.
 38 | - ✅ `12/12/2024` Added HelloMeme V2 (synchronize code from the [`ComfyUI`](https://github.com/HelloVision/ComfyUI_HelloMeme) repo).
 39 | - ✅ `11/14/2024` Added the `HMControlNet2` module
 40 | - ✅ `11/12/2024` Added a newly fine-tuned version of [`Animatediff`](https://huggingface.co/songkey/hm_animatediff_frame12) with a patch size of 12, which uses less VRAM (Tested on 2080Ti).
 41 | - ✅ `11/5/2024`  [`ComfyUI`](https://github.com/HelloVision/ComfyUI_HelloMeme) interface for HelloMeme.
 42 | - ✅ `11/1/2024` Released the code for the core functionalities.
 43 | 
 44 | ## Introduction
 45 | This repository contains the official code implementation of the paper [`HelloMeme`](https://arxiv.org/pdf/2410.22901). Any updates related to the code or models from the paper will be posted here. The code for the ablation experiments discussed in the paper will be added to the [`ExperimentsOnSKAttentions`](https://github.com/HelloVision/ExperimentsOnSKAttentions) section. Additionally, we plan to release a `ComfyUI` interface for HelloMeme, with updates posted here as well.
 46 | 
 47 | ## Getting Started
 48 | 
 49 | ### 1. Create a Conda Environment
 50 | 
 51 | ```bash
 52 | conda create -n hellomeme python=3.10.11
 53 | conda activate hellomeme
 54 | ```
 55 | 
 56 | ### 2. Install PyTorch and FFmpeg
 57 | To install the latest version of PyTorch, please refer to the official [PyTorch](https://pytorch.org/get-started/locally/) website for detailed installation instructions. Additionally, the code will invoke the system's ffmpeg command for video and audio editing, so the runtime environment must have ffmpeg pre-installed. For installation guidance, please refer to the official [FFmpeg](https://ffmpeg.org/) website.
 58 | 
 59 | ### 3. Install dependencies
 60 | 
 61 | ```bash
 62 | pip install diffusers transformers einops scipy opencv-python tqdm pillow onnxruntime-gpu onnx safetensors accelerate peft imageio imageio[ffmpeg] torchvision 
 63 | ```
 64 | 
 65 | > [!IMPORTANT]  
 66 | > 
 67 | > Note the version of diffusers required: frequent updates to diffusers may lead to dependency conflicts. We will periodically check the repo’s compatibility with the latest diffusers version. The currently tested and supported version is **diffusers==0.33.1**.
 68 | 
 69 | ### 4. Clone the repository
 70 | 
 71 | ```bash
 72 | git clone https://github.com/HelloVision/HelloMeme
 73 | cd HelloMeme
 74 | ```
 75 | 
 76 | ### 5. Run the code
 77 | ```bash
 78 | python inference_image.py # for image generation
 79 | python inference_video.py # for video generation
 80 | ```
 81 | 
 82 | ### 6. Install for Gradio App
 83 | 
 84 | We recommend setting up the environment with conda.
 85 | 
 86 | ```bash
 87 | pip install gradio
 88 | python app.py
 89 | ```
 90 | 
 91 | After running the app, all models will be downloaded.
 92 | 
 93 | ## Examples
 94 | 
 95 | ### Image Generation
 96 | 
 97 | The input for the image generation script `inference_image.py` consists of a reference image and a drive image, as shown in the figure below:
 98 | 
 99 | <table>
100 |     <tr>
101 |         <td><img src="./data/reference_images/harris.jpg" width="256" height="256"> <br> Reference Image</td>
102 |         <td ><img src="./data/drive_images/yao.jpg" width="192" height="256"> <br> Drive Image </td>
103 |     </tr>
104 | </table>
105 | 
106 | The output of the image generation script is shown below:
107 | 
108 | <table>
109 |     <tr>
110 |         <td><img src="./data/harris_yao.jpg" width="256" height="256"> <br> Based on SD1.5 </td>
111 |         <td ><img src="./data/harris_yao_toon.jpg" width="256" height="256"> <br> Based on <a href="https://civitai.com/models/75650/disney-pixar-cartoon-type-b">disneyPixarCartoon</a>  </td>
112 |     </tr>
113 | </table>
114 | 
115 | ### Video Generation
116 | 
117 | The input for the video generation script `inference_video.py` consists of a reference image and a drive video, as shown in the figure below:
118 | 
119 | <table>
120 |     <tr>
121 |         <td><img src="./data/reference_images/trump.jpg" width="256" height="256"> <br> Reference Image</td>
122 |         <td ><img src="./data/jue.gif" width="256" height="256"> <br> Drive Video  </td>
123 |     </tr>
124 | </table>
125 | 
126 | The output of the video generation script is shown below:
127 | 
128 | <table>
129 |     <tr>
130 |         <td><img src="./data/trump_jue.gif" width="256" height="256"> <br> Based on <a href="https://civitai.com/models/25694/epicrealism">epicrealism</a> </td>
131 |         <td ><img src="./data/trump_jue-toon.gif" width="256" height="256"> <br> Based on <a href="https://civitai.com/models/75650/disney-pixar-cartoon-type-b">disneyPixarCartoon</a> </td>
132 |     </tr>
133 | </table>
134 | 
135 | > [!Note]
136 | > 
137 | > If the face in the driving video has significant movement (such as evident camera motion), it is recommended to set the `trans_ratio` parameter to 0 to prevent distorted outputs.
138 | > 
139 | >`inference_video(engines, ref_img_path, drive_video_path, save_path, trans_ratio=0.0)`
140 | 
141 | ## Pretrained Models
142 | 
143 | Our models are all hosted on [🤗](https://huggingface.co/songkey), and the startup script will download them automatically. The specific model information is as follows:
144 | 
145 | | model | size  | url  | Info                                                  |
146 | |-------|-------|------|-------------------------------------------------------|
147 | | songkey/hm_reference  | 312M  | <a href='https://huggingface.co/songkey/hm_reference'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Model-yellow'></a> | The weights of the ReferenceAdapter module            |
148 | | songkey/hm_control  | 149M  | <a href='https://huggingface.co/songkey/hm_control'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Model-yellow'></a> | The weights of the HMControlNet module                |
149 | | songkey/hm_animatediff  | 835M  | <a href='https://huggingface.co/songkey/hm_animatediff'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Model-yellow'></a> | The weights of the Tuned Animatediff (patch size 16)  |
150 | | songkey/hm_animatediff_frame12 | 835M  | <a href='https://huggingface.co/songkey/hm_animatediff_frame12'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Model-yellow'></a> | The weights of the Tuned Animatediff (patch size 12)  |
151 | | hello_3dmm.onnx  | 311M  | <a href='https://huggingface.co/songkey/hello_group_facemodel'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Model-yellow'></a> | For face RT Extractor                                 |
152 | | hello_arkit_blendshape.onnx | 9.11M | <a href='https://huggingface.co/songkey/hello_group_facemodel'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Model-yellow'></a> | Extract ARKit blendshape parameters                   |
153 | | hello_face_det.onnx | 317K  | <a href='https://huggingface.co/songkey/hello_group_facemodel'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Model-yellow'></a> | Face Detector                                         |
154 | | hello_face_landmark.onnx | 2.87M | <a href='https://huggingface.co/songkey/hello_group_facemodel'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Model-yellow'></a> | Face Landmarks (222 points)                           |
155 | 
156 | Our pipeline also supports loading stylized base models (safetensors). For video generation tasks, using some customized models for portrait generation, such as [**Realistic Vision V6.0 B1**](https://civitai.com/models/4201/realistic-vision-v60-b1), can produce better results. You can download checkpoints and loras into the directories `pretrained_models/` and `pretrained_models/loras/`, respectively.
157 | 
158 | ## Acknowledgements
159 | 
160 | Thanks to 🤗 for providing [diffusers](https://huggingface.co/docs/diffusers), which has greatly enhanced development efficiency in diffusion-related work. We also drew considerable inspiration from [MagicAnimate](https://github.com/magic-research/magic-animate) and [EMO](https://github.com/HumanAIGC/EMO), and [Animatediff](https://github.com/guoyww/AnimateDiff) allowed us to implement the video version at a very low cost. Finally, we thank our colleagues **Shengjie Wu** and **Zemin An**, whose foundational modules played a significant role in this work.
161 | 
162 | ## Citation
163 | 
164 | ```bibtex
165 | @misc{zhang2024hellomemeintegratingspatialknitting,
166 |         title={HelloMeme: Integrating Spatial Knitting Attentions to Embed High-Level and Fidelity-Rich Conditions in Diffusion Models}, 
167 |         author={Shengkai Zhang and Nianhong Jiao and Tian Li and Chaojie Yang and Chenhui Xue and Boya Niu and Jun Gao},
168 |         year={2024},
169 |         eprint={2410.22901},
170 |         archivePrefix={arXiv},
171 |         primaryClass={cs.CV},
172 |         url={https://arxiv.org/abs/2410.22901}, 
173 |   }
174 | ```
175 | 
176 | ## Contact
177 | **Shengkai Zhang** (songkey@pku.edu.cn)
178 | 


--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
  1 | # coding: utf-8
  2 | 
  3 | """
  4 | @File   : app.py
  5 | @Author : Songkey
  6 | @Email  : songkey@pku.edu.cn
  7 | @Date   : 12/12/2024
  8 | @Desc   : 
  9 | """
 10 | import os
 11 | 
 12 | import gradio as gr
 13 | from generator import Generator, DEFAULT_PROMPT, MODEL_CONFIG
 14 | import torch
 15 | import importlib.metadata
 16 | 
 17 | 
 18 | installed_packages = [package.name for package in importlib.metadata.distributions()]
 19 | 
 20 | REQUIRED = {
 21 |     'diffusers':'0.33.1', 'transformers':'4.46.3', 'einops':'0.8.0', 'opencv-python':'4.10.0.84', 'tqdm':'4.67.0',
 22 |     'pillow':'10.2.0', 'onnxruntime-gpu':'1.18.1', 'onnx':'1.17.0', 'safetensors':'0.4.5',
 23 |     'accelerate':'1.1.1', 'peft':'0.13.2'
 24 | }
 25 | 
 26 | missing = [name for name in REQUIRED.keys() if name not in installed_packages]
 27 | missing_params = ' '.join([f'{k}=={REQUIRED[k]}' for k in missing])
 28 | print("missing pkgs", missing_params)
 29 | 
 30 | # if missing:
 31 | #     os.system(f'{sys.executable} -m pip install {missing_params}')
 32 | 
 33 | modelscope = False
 34 | 
 35 | VERSION_DICT = dict(
 36 |     HelloMemeV1='v1',
 37 |     HelloMemeV2='v2',
 38 |     HelloMemeV3='v3',
 39 |     HelloMemeV4='v4',
 40 |     HelloMemeV5='v5',
 41 | )
 42 | 
 43 | with gr.Blocks(theme=gr.themes.Soft()) as app:
 44 |     gr.Markdown('''
 45 |         <div style="display: flex; justify-content: center; align-items: center; text-align: center;">
 46 |             <div>
 47 |                 <h1>HelloMeme: Integrating Spatial Knitting Attentions to Embed High-Level and Fidelity-Rich Conditions in Diffusion Models</h1>
 48 |                 <div style="display: flex; justify-content: center; align-items: center; text-align: center;">
 49 |                     <a href='https://songkey.github.io/hellomeme/'><img src='https://img.shields.io/badge/Project-HomePage-Green'></a>  &nbsp;\
 50 |                     <a href='https://github.com/HelloVision/HelloMeme'><img src='https://img.shields.io/badge/GitHub-Code-blue'></a>  &nbsp;\
 51 |                     <a href='https://arxiv.org/pdf/2410.22901'><img src='https://img.shields.io/badge/Paper-Arxiv-red'></a>  &nbsp;\
 52 |                     <a href='https://github.com/HelloVision/ComfyUI_HelloMeme'><img src='https://img.shields.io/badge/ComfyUI-UI-blue'></a>  &nbsp;\
 53 |                     <a href='https://github.com/HelloVision/HelloMeme'><img src='https://img.shields.io/github/stars/HelloVision/HelloMeme'></a>
 54 |                 </div>
 55 |             </div>
 56 |         </div>
 57 |     ''')
 58 | 
 59 |     gen = Generator(gpu_id=0, dtype=torch.float16, sr=True, pipeline_dict_len=2, modelscope=modelscope)
 60 | 
 61 |     with gr.Tab("Image Generation"):
 62 |         with gr.Row():
 63 |             ref_img = gr.Image(type="pil", label="Reference Image")
 64 |             drive_img = gr.Image(type="pil", label="Drive Image")
 65 |             result_img = gr.Image(type="pil", label="Generated Image")
 66 |         exec_btn = gr.Button("Run")
 67 |         with gr.Column():
 68 |             with gr.Row():
 69 |                 checkpoint = gr.Dropdown(choices=list(MODEL_CONFIG['sd15']['checkpoints'].keys()),
 70 |                                          value=list(MODEL_CONFIG['sd15']['checkpoints'].keys())[1], label="Checkpoint")
 71 |                 lora = gr.Dropdown(choices=['None'] + list(MODEL_CONFIG['sd15']['loras'].keys()),
 72 |                                    value="None", label="LoRA")
 73 |             with gr.Row():
 74 |                 lora_scale = gr.Slider(0.0, 10.0, 1.0, step=0.1, label="Lora Scale", interactive=True)
 75 |                 version = gr.Dropdown(choices=['HelloMemeV1', 'HelloMemeV2', 'HelloMemeV3', 'HelloMemeV4', 'HelloMemeV5'], value="HelloMemeV5", label="Version")
 76 |                 cntrl_version = gr.Dropdown(choices=['HMControlNet1', 'HMControlNet2'], value="HMControlNet2", label="Control Version")
 77 |                 stylize = gr.Dropdown(choices=['x1', 'x2'], value="x1", label="Stylize")
 78 |         with gr.Accordion("Advanced Options", open=False):
 79 |             with gr.Row():
 80 |                 num_steps = gr.Slider(1, 50, 25, step=1, label="Steps")
 81 |                 guidance = gr.Slider(1.0, 10.0, 1.5, step=0.1, label="Guidance", interactive=True)
 82 |             with gr.Row():
 83 |                 seed = gr.Number(value=-1, label="Seed (-1 for random)")
 84 |                 trans_ratio = gr.Slider(0.0, 1.0, 0.0, step=0.01, label="Trans Ratio", interactive=True)
 85 |                 crop_reference = gr.Checkbox(label="Crop Reference", value=True)
 86 | 
 87 |         def img_gen_fnc(ref_img, drive_img, num_steps, guidance, seed,
 88 |                         trans_ratio, crop_reference, cntrl_version, version, stylize, checkpoint, lora, lora_scale):
 89 | 
 90 |             if lora != 'None':
 91 |                 tmp_lora_info = MODEL_CONFIG['sd15']['loras'][lora]
 92 |             else:
 93 |                 lora_path = None
 94 | 
 95 |             if modelscope:
 96 |                 from modelscope import snapshot_download
 97 |                 checkpoint_path = snapshot_download(MODEL_CONFIG['sd15']['checkpoints'][checkpoint])
 98 |                 if lora != 'None':
 99 |                     lora_path = os.path.join(snapshot_download(tmp_lora_info[0]), tmp_lora_info[1])
100 |             else:
101 |                 from huggingface_hub import hf_hub_download
102 |                 checkpoint_path = MODEL_CONFIG['sd15']['checkpoints'][checkpoint]
103 |                 if lora != 'None':
104 |                     lora_path = hf_hub_download(tmp_lora_info[0], filename=tmp_lora_info[1])
105 | 
106 |             res = None
107 |             try:
108 |                 token = gen.load_pipeline("image", checkpoint_path=checkpoint_path, lora_path=lora_path, lora_scale=lora_scale,
109 |                                         stylize=stylize, version=VERSION_DICT[version])
110 |                 res = gen.image_generate(token,
111 |                                          ref_img,
112 |                                          drive_img,
113 |                                          num_steps,
114 |                                          guidance,
115 |                                          seed,
116 |                                          DEFAULT_PROMPT,
117 |                                          '',
118 |                                          trans_ratio,
119 |                                          crop_reference,
120 |                                          'cntrl1' if cntrl_version == 'HMControlNet1' else 'cntrl2',
121 |                                         )
122 |             except Exception as e:
123 |                 print(e)
124 |             return res
125 | 
126 |         exec_btn.click(fn=img_gen_fnc,
127 |                        inputs=[ref_img, drive_img, num_steps, guidance, seed,
128 |                                trans_ratio, crop_reference, cntrl_version, version, stylize, checkpoint,
129 |                                lora, lora_scale],
130 |                        outputs=result_img,
131 |                        api_name="Image Generation")
132 |         gr.Examples(
133 |             examples=[
134 |                 ['data/reference_images/chillout.jpg', 'data/drive_images/yao.jpg', 25, 1.5, 1024,
135 |                  0.0, False, 'HMControlNet2', 'HelloMemeV5', 'x1',
136 |                  list(MODEL_CONFIG['sd15']['checkpoints'].keys())[2], list(MODEL_CONFIG['sd15']['loras'].keys())[1], 1.5],
137 |                 ['data/reference_images/firefly.jpg', 'data/drive_images/ysll.jpg', 25, 1.5, 1024,
138 |                  0.0, False, 'HMControlNet2', 'HelloMemeV5', 'x1',
139 |                  list(MODEL_CONFIG['sd15']['checkpoints'].keys())[1], "None", 1.5],
140 |                 ['data/reference_images/majicmix8.jpg', 'data/drive_images/hrwh.jpg', 25, 1.5, 1024,
141 |                  0.0, False, 'HMControlNet2', 'HelloMemeV5', 'x1',
142 |                  list(MODEL_CONFIG['sd15']['checkpoints'].keys())[1], "None", 1.5],
143 |                 ['data/reference_images/show1.jpg', 'data/drive_images/jue.jpg', 25, 1.5, 1080,
144 |                  0.0, False, 'HMControlNet2', 'HelloMemeV5', 'x1',
145 |                  list(MODEL_CONFIG['sd15']['checkpoints'].keys())[12], "None", 1.5],
146 |                 ['data/reference_images/show4.jpg', 'data/drive_images/hhh.jpg', 25, 1.5, 768,
147 |                  0.0, False, 'HMControlNet2', 'HelloMemeV5', 'x1',
148 |                  list(MODEL_CONFIG['sd15']['checkpoints'].keys())[8], "None", 1.5],
149 |                 ['data/reference_images/show6.jpg', 'data/drive_images/hrwh.jpg', 25, 1.5, 4096,
150 |                  0.0, False, 'HMControlNet2', 'HelloMemeV5', 'x1',
151 |                  list(MODEL_CONFIG['sd15']['checkpoints'].keys())[9], "None", 1.5],
152 |             ],
153 |             fn=img_gen_fnc,
154 |             inputs=[ref_img, drive_img, num_steps, guidance, seed, trans_ratio,
155 |                     crop_reference, cntrl_version, version, stylize, checkpoint, lora, lora_scale],
156 |             outputs=result_img,
157 |             cache_examples=False,
158 |         )
159 | 
160 |     with gr.Tab("Video Generation"):
161 |         with gr.Row():
162 |             ref_img = gr.Image(type="pil", label="Reference Image")
163 |             drive_video = gr.Video(label="Drive Video")
164 |             result_video = gr.Video(autoplay=True, loop=True, label="Generated Video")
165 |         exec_btn = gr.Button("Run")
166 |         with gr.Column():
167 |             with gr.Row():
168 |                 checkpoint = gr.Dropdown(choices=list(MODEL_CONFIG['sd15']['checkpoints'].keys()),
169 |                                          value=list(MODEL_CONFIG['sd15']['checkpoints'].keys())[1], label="Checkpoint")
170 |                 lora = gr.Dropdown(choices=['None'] + list(MODEL_CONFIG['sd15']['loras'].keys()),
171 |                                    value="None", label="LoRA")
172 |             with gr.Row():
173 |                 lora_scale = gr.Slider(0.0, 10.0, 1.0, step=0.1, label="Lora Scale", interactive=True)
174 |                 version = gr.Dropdown(choices=['HelloMemeV1', 'HelloMemeV2', 'HelloMemeV3', 'HelloMemeV4', 'HelloMemeV5'], value="HelloMemeV2", label="Version")
175 |                 cntrl_version = gr.Dropdown(choices=['HMControlNet1', 'HMControlNet2'], value="HMControlNet2", label="Control Version")
176 |                 stylize = gr.Dropdown(choices=['x1', 'x2'], value="x1", label="Stylize")
177 |         with gr.Accordion("Advanced Options", open=False):
178 |             with gr.Row():
179 |                 num_steps = gr.Slider(1, 50, 25, step=1, label="Steps", interactive=True)
180 |                 guidance = gr.Slider(1.0, 10.0, 1.5, step=0.1, label="Guidance", interactive=True)
181 |                 patch_overlap = gr.Slider(1, 5, 4, step=1, label="Patch Overlap", interactive=True)
182 |             with gr.Row():
183 |                 seed = gr.Number(value=-1, label="Seed (-1 for random)")
184 |                 trans_ratio = gr.Slider(0.0, 1.0, 0.0, step=0.01, label="Trans Ratio", interactive=True)
185 |                 with gr.Column():
186 |                     crop_reference = gr.Checkbox(label="Crop Reference", value=True)
187 |                     fps8 = gr.Checkbox(label="Use fps8", value=True)
188 |         def video_gen_fnc(ref_img, drive_video, num_steps, guidance, seed,
189 |                         trans_ratio, crop_reference, cntrl_version, version, stylize, patch_overlap,
190 |                         checkpoint, lora, lora_scale, fps8):
191 |             if lora != 'None':
192 |                 tmp_lora_info = MODEL_CONFIG['sd15']['loras'][lora]
193 |             else:
194 |                 lora_path = None
195 | 
196 |             if modelscope:
197 |                 from modelscope import snapshot_download
198 |                 checkpoint_path = snapshot_download(MODEL_CONFIG['sd15']['checkpoints'][checkpoint])
199 |                 if lora != 'None':
200 |                     lora_path = os.path.join(snapshot_download(tmp_lora_info[0]), tmp_lora_info[1])
201 |             else:
202 |                 from huggingface_hub import hf_hub_download
203 |                 checkpoint_path = MODEL_CONFIG['sd15']['checkpoints'][checkpoint]
204 |                 if lora != 'None':
205 |                     lora_path = hf_hub_download(tmp_lora_info[0], filename=tmp_lora_info[1])
206 | 
207 |             res = None
208 |             try:
209 |                 token = gen.load_pipeline("video", checkpoint_path=checkpoint_path, lora_path=lora_path, lora_scale=lora_scale,
210 |                                            stylize=stylize, version=VERSION_DICT[version])
211 | 
212 |                 res = gen.video_generate(token,
213 |                                          ref_img,
214 |                                          drive_video,
215 |                                          num_steps,
216 |                                          guidance,
217 |                                          seed,
218 |                                          DEFAULT_PROMPT,
219 |                                          '',
220 |                                          trans_ratio,
221 |                                          crop_reference,
222 |                                          patch_overlap,
223 |                                          'cntrl1' if cntrl_version == 'HMControlNet1' else 'cntrl2',
224 |                                          fps8
225 |                                         )
226 |             except Exception as e:
227 |                 print(e)
228 |             return res
229 |         exec_btn.click(fn=video_gen_fnc,
230 |                        inputs=[ref_img, drive_video, num_steps, guidance, seed, trans_ratio,
231 |                                crop_reference, cntrl_version, version, stylize, patch_overlap, checkpoint, lora,
232 |                                lora_scale, fps8],
233 |                        outputs=result_video,
234 |                        api_name="Video Generation")
235 |         gr.Examples(
236 |             examples=[
237 |                 ['data/reference_images/chillout.jpg', 'data/drive_videos/nice.mp4', 25, 1.5, 1024, 0.2,
238 |                  True, 'HMControlNet2', 'HelloMemeV5', 'x1', 4, list(MODEL_CONFIG['sd15']['checkpoints'].keys())[2],
239 |                  list(MODEL_CONFIG['sd15']['loras'].keys())[1], 1.5, True],
240 |                 ['data/reference_images/zzj.jpg', 'data/drive_videos/jue.mp4', 25, 1.5, 1024, 0.0,
241 |                  True, 'HMControlNet2', 'HelloMemeV5', 'x1', 4, list(MODEL_CONFIG['sd15']['checkpoints'].keys())[1],
242 |                  "None", 1.5, True],
243 |             ],
244 |             fn=video_gen_fnc,
245 |             inputs=[ref_img, drive_video, num_steps, guidance, seed, trans_ratio,
246 |                     crop_reference, cntrl_version, version, stylize, patch_overlap, checkpoint,
247 |                     lora, lora_scale, fps8],
248 |             outputs=result_video,
249 |             cache_examples=False,
250 |         )
251 | 
252 | app.launch(inbrowser=True)


--------------------------------------------------------------------------------
/data/demo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelloVision/HelloMeme/98f4155d0148f5818023a798d4a100ccb8db4b91/data/demo.gif


--------------------------------------------------------------------------------
/data/drive_images/hhh.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelloVision/HelloMeme/98f4155d0148f5818023a798d4a100ccb8db4b91/data/drive_images/hhh.jpg


--------------------------------------------------------------------------------
/data/drive_images/hrwh.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelloVision/HelloMeme/98f4155d0148f5818023a798d4a100ccb8db4b91/data/drive_images/hrwh.jpg


--------------------------------------------------------------------------------
/data/drive_images/jue.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelloVision/HelloMeme/98f4155d0148f5818023a798d4a100ccb8db4b91/data/drive_images/jue.jpg


--------------------------------------------------------------------------------
/data/drive_images/yao.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelloVision/HelloMeme/98f4155d0148f5818023a798d4a100ccb8db4b91/data/drive_images/yao.jpg


--------------------------------------------------------------------------------
/data/drive_images/ysll.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelloVision/HelloMeme/98f4155d0148f5818023a798d4a100ccb8db4b91/data/drive_images/ysll.jpg


--------------------------------------------------------------------------------
/data/drive_videos/amns.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelloVision/HelloMeme/98f4155d0148f5818023a798d4a100ccb8db4b91/data/drive_videos/amns.mp4


--------------------------------------------------------------------------------
/data/drive_videos/jgz.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelloVision/HelloMeme/98f4155d0148f5818023a798d4a100ccb8db4b91/data/drive_videos/jgz.mp4


--------------------------------------------------------------------------------
/data/drive_videos/jue.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelloVision/HelloMeme/98f4155d0148f5818023a798d4a100ccb8db4b91/data/drive_videos/jue.mp4


--------------------------------------------------------------------------------
/data/drive_videos/nhs.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelloVision/HelloMeme/98f4155d0148f5818023a798d4a100ccb8db4b91/data/drive_videos/nhs.mp4


--------------------------------------------------------------------------------
/data/drive_videos/nice.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelloVision/HelloMeme/98f4155d0148f5818023a798d4a100ccb8db4b91/data/drive_videos/nice.mp4


--------------------------------------------------------------------------------
/data/drive_videos/qie.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelloVision/HelloMeme/98f4155d0148f5818023a798d4a100ccb8db4b91/data/drive_videos/qie.mp4


--------------------------------------------------------------------------------
/data/drive_videos/tbh.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelloVision/HelloMeme/98f4155d0148f5818023a798d4a100ccb8db4b91/data/drive_videos/tbh.mp4


--------------------------------------------------------------------------------
/data/drive_videos/tiktok.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelloVision/HelloMeme/98f4155d0148f5818023a798d4a100ccb8db4b91/data/drive_videos/tiktok.mp4


--------------------------------------------------------------------------------
/data/drive_videos/tndtc.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelloVision/HelloMeme/98f4155d0148f5818023a798d4a100ccb8db4b91/data/drive_videos/tndtc.mp4


--------------------------------------------------------------------------------
/data/drive_videos/xzy1.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelloVision/HelloMeme/98f4155d0148f5818023a798d4a100ccb8db4b91/data/drive_videos/xzy1.mp4


--------------------------------------------------------------------------------
/data/drive_videos/xzy2.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelloVision/HelloMeme/98f4155d0148f5818023a798d4a100ccb8db4b91/data/drive_videos/xzy2.mp4


--------------------------------------------------------------------------------
/data/drive_videos/xzy3.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelloVision/HelloMeme/98f4155d0148f5818023a798d4a100ccb8db4b91/data/drive_videos/xzy3.mp4


--------------------------------------------------------------------------------
/data/drive_videos/ysll.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelloVision/HelloMeme/98f4155d0148f5818023a798d4a100ccb8db4b91/data/drive_videos/ysll.mp4


--------------------------------------------------------------------------------
/data/harris_yao.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelloVision/HelloMeme/98f4155d0148f5818023a798d4a100ccb8db4b91/data/harris_yao.jpg


--------------------------------------------------------------------------------
/data/harris_yao_toon.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelloVision/HelloMeme/98f4155d0148f5818023a798d4a100ccb8db4b91/data/harris_yao_toon.jpg


--------------------------------------------------------------------------------
/data/jue.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelloVision/HelloMeme/98f4155d0148f5818023a798d4a100ccb8db4b91/data/jue.gif


--------------------------------------------------------------------------------
/data/reference_images/BeautyFool.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelloVision/HelloMeme/98f4155d0148f5818023a798d4a100ccb8db4b91/data/reference_images/BeautyFool.jpg


--------------------------------------------------------------------------------
/data/reference_images/chillout.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelloVision/HelloMeme/98f4155d0148f5818023a798d4a100ccb8db4b91/data/reference_images/chillout.jpg


--------------------------------------------------------------------------------
/data/reference_images/civitai1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelloVision/HelloMeme/98f4155d0148f5818023a798d4a100ccb8db4b91/data/reference_images/civitai1.jpg


--------------------------------------------------------------------------------
/data/reference_images/firefly.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelloVision/HelloMeme/98f4155d0148f5818023a798d4a100ccb8db4b91/data/reference_images/firefly.jpg


--------------------------------------------------------------------------------
/data/reference_images/harris.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelloVision/HelloMeme/98f4155d0148f5818023a798d4a100ccb8db4b91/data/reference_images/harris.jpg


--------------------------------------------------------------------------------
/data/reference_images/kjl.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelloVision/HelloMeme/98f4155d0148f5818023a798d4a100ccb8db4b91/data/reference_images/kjl.jpg


--------------------------------------------------------------------------------
/data/reference_images/majicmix1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelloVision/HelloMeme/98f4155d0148f5818023a798d4a100ccb8db4b91/data/reference_images/majicmix1.jpg


--------------------------------------------------------------------------------
/data/reference_images/majicmix2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelloVision/HelloMeme/98f4155d0148f5818023a798d4a100ccb8db4b91/data/reference_images/majicmix2.jpg


--------------------------------------------------------------------------------
/data/reference_images/majicmix3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelloVision/HelloMeme/98f4155d0148f5818023a798d4a100ccb8db4b91/data/reference_images/majicmix3.jpg


--------------------------------------------------------------------------------
/data/reference_images/majicmix8.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelloVision/HelloMeme/98f4155d0148f5818023a798d4a100ccb8db4b91/data/reference_images/majicmix8.jpg


--------------------------------------------------------------------------------
/data/reference_images/mimic1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelloVision/HelloMeme/98f4155d0148f5818023a798d4a100ccb8db4b91/data/reference_images/mimic1.jpg


--------------------------------------------------------------------------------
/data/reference_images/show1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelloVision/HelloMeme/98f4155d0148f5818023a798d4a100ccb8db4b91/data/reference_images/show1.jpg


--------------------------------------------------------------------------------
/data/reference_images/show4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelloVision/HelloMeme/98f4155d0148f5818023a798d4a100ccb8db4b91/data/reference_images/show4.jpg


--------------------------------------------------------------------------------
/data/reference_images/show6.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelloVision/HelloMeme/98f4155d0148f5818023a798d4a100ccb8db4b91/data/reference_images/show6.jpg


--------------------------------------------------------------------------------
/data/reference_images/toon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelloVision/HelloMeme/98f4155d0148f5818023a798d4a100ccb8db4b91/data/reference_images/toon.png


--------------------------------------------------------------------------------
/data/reference_images/trump.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelloVision/HelloMeme/98f4155d0148f5818023a798d4a100ccb8db4b91/data/reference_images/trump.jpg


--------------------------------------------------------------------------------
/data/reference_images/wukong1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelloVision/HelloMeme/98f4155d0148f5818023a798d4a100ccb8db4b91/data/reference_images/wukong1.jpg


--------------------------------------------------------------------------------
/data/reference_images/zzj.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelloVision/HelloMeme/98f4155d0148f5818023a798d4a100ccb8db4b91/data/reference_images/zzj.jpg


--------------------------------------------------------------------------------
/data/trump_jue-toon.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelloVision/HelloMeme/98f4155d0148f5818023a798d4a100ccb8db4b91/data/trump_jue-toon.gif


--------------------------------------------------------------------------------
/data/trump_jue.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelloVision/HelloMeme/98f4155d0148f5818023a798d4a100ccb8db4b91/data/trump_jue.gif


--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelloVision/HelloMeme/98f4155d0148f5818023a798d4a100ccb8db4b91/environment.yml


--------------------------------------------------------------------------------
/generator.py:
--------------------------------------------------------------------------------
  1 | # coding: utf-8
  2 | 
  3 | """
  4 | @File   : generator.py
  5 | @Author : Songkey
  6 | @Email  : songkey@pku.edu.cn
  7 | @Date   : 12/12/2024
  8 | @Desc   : 
  9 | """
 10 | 
 11 | import random
 12 | import os
 13 | import json
 14 | import os.path as osp
 15 | import shutil
 16 | import torch
 17 | import numpy as np
 18 | import cv2
 19 | import imageio
 20 | from PIL import Image
 21 | from collections import OrderedDict
 22 | 
 23 | from hellomeme.utils import (get_drive_pose,
 24 |                              get_drive_expression,
 25 |                              get_drive_expression_pd_fgc,
 26 |                              det_landmarks,
 27 |                              gen_control_heatmaps,
 28 |                              generate_random_string,
 29 |                              ff_cat_video_and_audio,
 30 |                              ff_change_fps,
 31 |                              load_face_toolkits,
 32 |                              append_pipline_weights)
 33 | from hellomeme.pipelines import (HMVideoPipeline, HMImagePipeline,
 34 |                                  HM3VideoPipeline, HM3ImagePipeline,
 35 |                                  HM5VideoPipeline, HM5ImagePipeline)
 36 | 
 37 | from hellomeme.tools.sr import RealESRGANer
 38 | 
 39 | cur_dir = osp.dirname(osp.abspath(__file__))
 40 | 
 41 | config_path = osp.join(cur_dir, 'hellomeme', 'model_config.json')
 42 | with open(config_path, 'r') as f:
 43 |     MODEL_CONFIG = json.load(f)
 44 | 
 45 | DEFAULT_PROMPT = MODEL_CONFIG['prompt']
 46 | 
 47 | class Generator(object):
 48 |     def __init__(self, gpu_id=0, dtype=torch.float16, pipeline_dict_len=10, sr=True, modelscope=False):
 49 |         self.modelscope = modelscope
 50 |         self.gpu_id = gpu_id
 51 |         self.dtype = dtype
 52 |         self.toolkits = load_face_toolkits(gpu_id=gpu_id, dtype=dtype, modelscope=modelscope)
 53 |         self.pipeline_dict = OrderedDict()
 54 |         self.pipeline_counter = OrderedDict()
 55 |         self.pipeline_dict_len = pipeline_dict_len
 56 |         if sr:
 57 |             self.upsampler = RealESRGANer(scale=2, half=True, gpu_id=gpu_id, modelscope=modelscope)
 58 | 
 59 |     @torch.no_grad()
 60 |     def load_pipeline(self, type, checkpoint_path, vae_path=None, lora_path=None, lora_scale=1.0, stylize='x1', version='v2'):
 61 |         new_token = f"{type}__{osp.basename(checkpoint_path)}__{'none' if lora_path is None else osp.basename(lora_path)}__{lora_scale}__{stylize}__{version}"
 62 |         if new_token in self.pipeline_dict:
 63 |             self.pipeline_counter[new_token] += 1
 64 |             print(f"@@ Pipeline {new_token}({self.pipeline_counter[new_token]}) already exists, reuse it.")
 65 |             return new_token
 66 | 
 67 |         if self.modelscope:
 68 |             from modelscope import snapshot_download
 69 |             sd1_5_dir = snapshot_download('songkey/stable-diffusion-v1-5')
 70 |         else:
 71 |             sd1_5_dir = 'songkey/stable-diffusion-v1-5'
 72 | 
 73 |         if version == 'v3' or version == 'v4':
 74 |             if type == 'image':
 75 |                 tmp_pipeline = HM3ImagePipeline.from_pretrained(sd1_5_dir)
 76 |             else:
 77 |                 tmp_pipeline = HM3VideoPipeline.from_pretrained(sd1_5_dir)
 78 |         elif version == 'v5':
 79 |             if type == 'image':
 80 |                 tmp_pipeline = HM5ImagePipeline.from_pretrained(sd1_5_dir)
 81 |             else:
 82 |                 tmp_pipeline = HM5VideoPipeline.from_pretrained(sd1_5_dir)
 83 |         else:
 84 |             if type == 'image':
 85 |                 tmp_pipeline = HMImagePipeline.from_pretrained(sd1_5_dir)
 86 |             else:
 87 |                 tmp_pipeline = HMVideoPipeline.from_pretrained(sd1_5_dir)
 88 | 
 89 |         tmp_pipeline.to(dtype=self.dtype)
 90 |         tmp_pipeline.caryomitosis(version=version, modelscope=self.modelscope)
 91 |         append_pipline_weights(tmp_pipeline, checkpoint_path, lora_path, vae_path,
 92 |                                stylize=stylize, lora_scale=lora_scale)
 93 |         tmp_pipeline.insert_hm_modules(dtype=self.dtype, version=version, modelscope=self.modelscope)
 94 | 
 95 |         if len(self.pipeline_dict) >= self.pipeline_dict_len:
 96 |             min_key = min(self.pipeline_counter, key=self.pipeline_counter.get)
 97 |             print(f"@@ Pipeline {min_key}({self.pipeline_counter[min_key]}) removed.")
 98 |             del self.pipeline_dict[min_key]
 99 |             del self.pipeline_counter[min_key]
100 |         self.pipeline_dict[new_token] = tmp_pipeline
101 |         self.pipeline_counter[new_token] = 1
102 | 
103 |         print(f"@@ Pipeline {new_token} created.")
104 |         return new_token
105 | 
106 |     def image_preprocess(self, images, crop=False):
107 |         _, drive_landmarks = det_landmarks(self.toolkits['face_aligner'], images)
108 |         drive_frames, drive_landmarks, drive_rot, drive_trans = get_drive_pose(self.toolkits,
109 |                                                                                images,
110 |                                                                                drive_landmarks,
111 |                                                                                crop=crop)
112 |         return drive_frames, drive_landmarks, drive_rot, drive_trans
113 | 
114 |     @torch.no_grad()
115 |     def image_generate(self,
116 |                        pipeline_token,
117 |                        ref_image,
118 |                        drive_image,
119 |                        steps,
120 |                        guidance,
121 |                        seed,
122 |                        prompt,
123 |                        negative_prompt,
124 |                        trans_ratio,
125 |                        crop_reference,
126 |                        cntrl_version='cntrl2'
127 |                        ):
128 | 
129 |         save_size = 512
130 |         dtype = self.toolkits['dtype']
131 |         device = self.toolkits['device']
132 | 
133 |         ref_image_input_np = cv2.cvtColor(np.array(ref_image.convert('RGB')), cv2.COLOR_RGB2BGR)
134 |         ref_frames, ref_landmarks, ref_rot, ref_trans = self.image_preprocess([ref_image_input_np], crop=crop_reference)
135 |         assert len(ref_frames) == 1
136 | 
137 |         input_ref_pil = Image.fromarray(cv2.cvtColor(ref_frames[0], cv2.COLOR_BGR2RGB))
138 | 
139 |         drive_image_input_np = cv2.cvtColor(np.array(drive_image.convert('RGB')), cv2.COLOR_RGB2BGR)
140 |         drive_frames, drive_landmarks, drive_rot, drive_trans = self.image_preprocess([drive_image_input_np], crop=True)
141 |         assert len(drive_frames) == 1
142 | 
143 |         if cntrl_version == 'cntrl1':
144 |             drive_params = get_drive_expression(self.toolkits, drive_frames, drive_landmarks)
145 |         else:
146 |             # for HMControlNet2
147 |             drive_params = get_drive_expression_pd_fgc(self.toolkits, drive_frames, drive_landmarks)
148 | 
149 |         control_heatmaps = gen_control_heatmaps(drive_rot,
150 |                                                 drive_trans,
151 |                                                 ref_trans[0],
152 |                                                 save_size=save_size,
153 |                                                 trans_ratio=trans_ratio)
154 | 
155 |         drive_params['condition'] = control_heatmaps.unsqueeze(0).to(dtype=dtype, device='cpu')
156 | 
157 |         generator = torch.Generator().manual_seed(seed if seed >= 0 else random.randint(0, 2**32-1))
158 | 
159 |         result_img, latents = self.pipeline_dict[pipeline_token](
160 |             prompt=[prompt],
161 |             strength=1.0,
162 |             image=input_ref_pil,
163 |             drive_params=drive_params,
164 |             num_inference_steps=steps,
165 |             negative_prompt=[negative_prompt],
166 |             guidance_scale=guidance,
167 |             generator=generator,
168 |             output_type='np',
169 |             device=device
170 |         )
171 | 
172 |         res_image_np = np.clip(result_img[0][0] * 255, 0, 255).astype(np.uint8)
173 |         if hasattr(self, 'upsampler'):
174 |             res_image_np = cv2.cvtColor(res_image_np, cv2.COLOR_RGB2BGR)
175 |             res_image_np, _ = self.upsampler.enhance(res_image_np, outscale=2)
176 |             res_image_np = cv2.cvtColor(res_image_np, cv2.COLOR_RGB2BGR)
177 | 
178 |         return Image.fromarray(res_image_np)
179 | 
180 |     @torch.no_grad()
181 |     def video_generate(self,
182 |                        pipeline_token,
183 |                        ref_image,
184 |                        drive_video_path,
185 |                        num_steps,
186 |                        guidance,
187 |                        seed,
188 |                        prompt,
189 |                        negative_prompt,
190 |                        trans_ratio,
191 |                        crop_reference,
192 |                        patch_overlap,
193 |                        cntrl_version,
194 |                        fps8):
195 | 
196 |         dtype = self.toolkits['dtype']
197 |         device = self.toolkits['device']
198 |         save_size = 512
199 | 
200 |         rand_token = generate_random_string(8)
201 |         drive_video_path_fps8 = osp.splitext(drive_video_path)[0] + f'_{rand_token}_proced.mp4'
202 |         save_video_path = osp.splitext(drive_video_path)[0] + f'_{rand_token}_save.mp4'
203 | 
204 |         if osp.exists(drive_video_path_fps8): os.remove(drive_video_path_fps8)
205 |         if fps8:
206 |             ff_change_fps(drive_video_path, drive_video_path_fps8, 8)
207 |             fps = 8
208 |         else:
209 |             shutil.copy(drive_video_path, drive_video_path_fps8)
210 | 
211 |         cap = cv2.VideoCapture(drive_video_path_fps8)
212 |         if not fps8:
213 |             fps = cap.get(cv2.CAP_PROP_FPS)
214 | 
215 |         frame_list = []
216 |         ret, frame = cap.read()
217 |         while ret:
218 |             frame_list.append(frame.copy())
219 |             ret, frame = cap.read()
220 |         cap.release()
221 | 
222 |         ref_image_input_np = cv2.cvtColor(np.array(ref_image.convert('RGB')), cv2.COLOR_RGB2BGR)
223 |         ref_frames, ref_landmarks, ref_rot, ref_trans = self.image_preprocess([ref_image_input_np], crop=crop_reference)
224 |         assert len(ref_frames) == 1
225 | 
226 |         input_ref_pil = Image.fromarray(cv2.cvtColor(ref_frames[0], cv2.COLOR_BGR2RGB))
227 | 
228 |         drive_frames, drive_landmarks, drive_rot, drive_trans = self.image_preprocess(frame_list, crop=True)
229 | 
230 |         if cntrl_version == 'cntrl1':
231 |             drive_params = get_drive_expression(self.toolkits, drive_frames, drive_landmarks)
232 |         else:
233 |             # for HMControlNet2
234 |             drive_params = get_drive_expression_pd_fgc(self.toolkits, drive_frames, drive_landmarks)
235 | 
236 |         control_heatmaps = gen_control_heatmaps(drive_rot, drive_trans, ref_trans[0], save_size=save_size,
237 |                                                 trans_ratio=trans_ratio)
238 |         drive_params['condition'] = control_heatmaps.unsqueeze(0).to(dtype=dtype, device='cpu')
239 | 
240 |         generator = torch.Generator().manual_seed(seed if seed >= 0 else random.randint(0, 2**32-1))
241 |         res_frames, latents = self.pipeline_dict[pipeline_token](
242 |             prompt=[prompt],
243 |             strength=1.0,
244 |             image=input_ref_pil,
245 |             patch_overlap=patch_overlap,
246 |             drive_params=drive_params,
247 |             num_inference_steps=num_steps,
248 |             negative_prompt=[negative_prompt],
249 |             guidance_scale=guidance,
250 |             generator=generator,
251 |             output_type='np',
252 |             device=device
253 |         )
254 |         res_frames_np = [np.clip(x[0] * 255, 0, 255).astype(np.uint8) for x in res_frames]
255 | 
256 |         if hasattr(self, 'upsampler'):
257 |             res_frames_np = [cv2.cvtColor(x, cv2.COLOR_RGB2BGR) for x in res_frames_np]
258 |             res_frames_np = [self.upsampler.enhance(x, outscale=2)[0] for x in res_frames_np]
259 |             res_frames_np = [cv2.cvtColor(x, cv2.COLOR_RGB2BGR) for x in res_frames_np]
260 | 
261 |         if osp.exists(save_video_path): os.remove(save_video_path)
262 |         imageio.mimsave(save_video_path, res_frames_np, fps=fps)
263 | 
264 |         save_video_audio_path = osp.splitext(drive_video_path)[0] + f'_{rand_token}_audio.mp4'
265 |         if osp.exists(save_video_audio_path): os.remove(save_video_audio_path)
266 |         ff_cat_video_and_audio(save_video_path, drive_video_path_fps8, save_video_audio_path)
267 |         if osp.exists(drive_video_path_fps8): os.remove(drive_video_path_fps8)
268 | 
269 |         if not osp.exists(save_video_audio_path):
270 |             save_video_audio_path = save_video_path
271 |         else:
272 |             os.remove(save_video_path)
273 | 
274 |         return save_video_audio_path
275 | 


--------------------------------------------------------------------------------
/hellomeme/__init__.py:
--------------------------------------------------------------------------------
 1 | # coding: utf-8
 2 | 
 3 | """
 4 | @File   : __init__.py
 5 | @Author : Songkey
 6 | @Email  : songkey@pku.edu.cn
 7 | @Date   : 8/28/2024
 8 | @Desc   : 
 9 | """
10 | 
11 | from .pipelines import (HMImagePipeline, HMVideoPipeline,
12 |                         HM3ImagePipeline, HM3VideoPipeline,
13 |                         HM5ImagePipeline, HM5VideoPipeline)


--------------------------------------------------------------------------------
/hellomeme/model_config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "sd15": {
 3 |     "checkpoints": {
 4 |       "SD1.5": "songkey/stable-diffusion-v1-5",
 5 |       "[preset]RealisticVisionV60B1": "songkey/realisticVisionV60B1_v51VAE",
 6 |       "[preset]DisneyPixarCartoonB": "songkey/disney-pixar-cartoon-b",
 7 |       "[preset]toonyou_beta6": "songkey/toonyou_beta6",
 8 |       "[preset]LZ_2DCartoon_V2": "songkey/LZ_2DCartoon_V2",
 9 |       "[preset]meinamix_v12Final": "songkey/meinamix_v12Final",
10 |       "[preset]animedark_v10": "songkey/animedark_v10",
11 |       "[preset]absolutereality_v181": "songkey/absolutereality_v181",
12 |       "[preset]dreamshaper_8": "songkey/dreamshaper_8",
13 |       "[preset]epicphotogasm_ultimateFidelity": "songkey/epicphotogasm_ultimateFidelity",
14 |       "[preset]epicrealism_naturalSinRC1VAE": "songkey/epicrealism_naturalSinRC1VAE",
15 |       "[preset]xxmix9realistic_v40": "songkey/xxmix9realistic_v40",
16 |       "[preset]cyberrealistic_v80": "songkey/cyberrealistic_v80"
17 |     },
18 |     "loras": {
19 |       "[preset]BabyFaceV1": ["songkey/loras_sd_1_5", "baby_face_v1.safetensors"],
20 |       "[preset]MoreDetails": ["songkey/loras_sd_1_5", "more_details.safetensors"],
21 |       "[preset]PixelPortraitV1": ["songkey/loras_sd_1_5", "pixel-portrait-v1.safetensors"],
22 |       "[preset]Drawing": ["songkey/loras_sd_1_5", "Drawing.safetensors"]
23 |     }
24 |   },
25 |   "prompt": "(best quality), highly detailed, ultra-detailed, headshot, person, well-placed five sense organs, looking at the viewer, centered composition, sharp focus, realistic skin texture"
26 | }


--------------------------------------------------------------------------------
/hellomeme/models/__init__.py:
--------------------------------------------------------------------------------
 1 | # coding: utf-8
 2 | 
 3 | """
 4 | @File   : __init__.py
 5 | @Author : Songkey
 6 | @Email  : songkey@pku.edu.cn
 7 | @Date   : 8/14/2024
 8 | @Desc   : 
 9 | """
10 | 
11 | from .hm_denoising_motion import HMDenoisingMotion
12 | from .hm_control import (HMControlNet, HMControlNet2, HMV2ControlNet, HMV2ControlNet2,
13 |                          HMV3ControlNet, HMControlNetBase, HM5ControlNetBase,
14 |                          HM4SD15ControlProj, HM5SD15ControlProj)
15 | from .hm_adapters import (HMReferenceAdapter, HM3ReferenceAdapter, HM5ReferenceAdapter,
16 |                           HM3MotionAdapter, HM5MotionAdapter, HMPipeline)
17 | from .hm_denoising_3d import HMDenoising3D
18 | from .hm3_denoising_3d import HM3Denoising3D
19 | from .hm3_denoising_motion import HM3DenoisingMotion
20 | 


--------------------------------------------------------------------------------
/hellomeme/models/hm3_denoising_3d.py:
--------------------------------------------------------------------------------
  1 | # coding: utf-8
  2 | 
  3 | """
  4 | @File   : models6/hm_denoising_3d.py
  5 | @Author : Songkey
  6 | @Email  : songkey@pku.edu.cn
  7 | @Date   : 1/3/2025
  8 | @Desc   :
  9 | adapted from: https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/unets/unet_2d_condition.py
 10 | """
 11 | 
 12 | import torch
 13 | import torch.utils.checkpoint
 14 | from typing import Any, Dict, Optional, Tuple, Union
 15 | 
 16 | from einops import rearrange
 17 | 
 18 | from diffusers.utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers
 19 | from diffusers.models.unets.unet_2d_condition import UNet2DConditionModel, UNet2DConditionOutput
 20 | from .hm_adapters import CopyWeights, InsertReferenceAdapter
 21 | 
 22 | logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 23 | 
 24 | class HM3Denoising3D(UNet2DConditionModel, CopyWeights, InsertReferenceAdapter):
 25 |     def forward(
 26 |         self,
 27 |         sample: torch.Tensor,
 28 |         timestep: Union[torch.Tensor, float, int],
 29 |         encoder_hidden_states: torch.Tensor,
 30 |         reference_hidden_states: Optional[dict] = None,
 31 |         control_hidden_states: Optional[dict] = None,
 32 |         motion_pad_hidden_states: Optional[dict] = None,
 33 |         use_motion: bool = False,
 34 |         class_labels: Optional[torch.Tensor] = None,
 35 |         timestep_cond: Optional[torch.Tensor] = None,
 36 |         attention_mask: Optional[torch.Tensor] = None,
 37 |         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
 38 |         added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
 39 |         down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
 40 |         mid_block_additional_residual: Optional[torch.Tensor] = None,
 41 |         down_intrablock_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
 42 |         encoder_attention_mask: Optional[torch.Tensor] = None,
 43 |         return_dict: bool = True,
 44 |     ) -> Union[UNet2DConditionOutput, Tuple]:
 45 |         # By default samples have to be AT least a multiple of the overall upsampling factor.
 46 |         # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
 47 |         # However, the upsampling interpolation output size can be forced to fit any upsampling size
 48 |         # on the fly if necessary.
 49 |         default_overall_up_factor = 2**self.num_upsamplers
 50 | 
 51 |         # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
 52 |         forward_upsample_size = False
 53 |         upsample_size = None
 54 | 
 55 |         for dim in sample.shape[-2:]:
 56 |             if dim % default_overall_up_factor != 0:
 57 |                 # Forward upsample size to force interpolation output size.
 58 |                 forward_upsample_size = True
 59 |                 break
 60 | 
 61 |         # ensure attention_mask is a bias, and give it a singleton query_tokens dimension
 62 |         # expects mask of shape:
 63 |         #   [batch, key_tokens]
 64 |         # adds singleton query_tokens dimension:
 65 |         #   [batch,                    1, key_tokens]
 66 |         # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
 67 |         #   [batch,  heads, query_tokens, key_tokens] (e.g. torch sdp attn)
 68 |         #   [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
 69 |         if attention_mask is not None:
 70 |             # assume that mask is expressed as:
 71 |             #   (1 = keep,      0 = discard)
 72 |             # convert mask into a bias that can be added to attention scores:
 73 |             #       (keep = +0,     discard = -10000.0)
 74 |             attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
 75 |             attention_mask = attention_mask.unsqueeze(1)
 76 | 
 77 |         # convert encoder_attention_mask to a bias the same way we do for attention_mask
 78 |         if encoder_attention_mask is not None:
 79 |             encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0
 80 |             encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
 81 | 
 82 |         # 0. center input if necessary
 83 |         if self.config.center_input_sample:
 84 |             sample = 2 * sample - 1.0
 85 | 
 86 |         # 1. time
 87 |         t_emb = self.get_time_embed(sample=sample, timestep=timestep)
 88 |         emb = self.time_embedding(t_emb, timestep_cond)
 89 | 
 90 |         class_emb = self.get_class_embed(sample=sample, class_labels=class_labels)
 91 |         if class_emb is not None:
 92 |             if self.config.class_embeddings_concat:
 93 |                 emb = torch.cat([emb, class_emb], dim=-1)
 94 |             else:
 95 |                 emb = emb + class_emb
 96 | 
 97 |         aug_emb = self.get_aug_embed(
 98 |             emb=emb, encoder_hidden_states=encoder_hidden_states, added_cond_kwargs=added_cond_kwargs
 99 |         )
100 |         if self.config.addition_embed_type == "image_hint":
101 |             aug_emb, hint = aug_emb
102 |             sample = torch.cat([sample, hint], dim=1)
103 | 
104 |         emb = emb + aug_emb if aug_emb is not None else emb
105 | 
106 |         if self.time_embed_act is not None:
107 |             emb = self.time_embed_act(emb)
108 | 
109 |         num_frames = sample.shape[2]
110 |         emb = emb.repeat_interleave(repeats=num_frames, dim=0)
111 | 
112 |         if not added_cond_kwargs is None:
113 |             if 'image_embeds' in added_cond_kwargs:
114 |                 if isinstance(added_cond_kwargs['image_embeds'], torch.Tensor):
115 |                     added_cond_kwargs['image_embeds'] = added_cond_kwargs['image_embeds'].repeat_interleave(repeats=num_frames, dim=0)
116 |                 else:
117 |                     added_cond_kwargs['image_embeds'] = [x.repeat_interleave(repeats=num_frames, dim=0) for x in added_cond_kwargs['image_embeds']]
118 | 
119 |         if len(encoder_hidden_states.shape) == 3:
120 |             encoder_hidden_states = encoder_hidden_states.repeat_interleave(repeats=num_frames, dim=0)
121 |         elif len(encoder_hidden_states.shape) == 4:
122 |             encoder_hidden_states = rearrange(encoder_hidden_states, "b f l d -> (b f) l d")
123 | 
124 |         encoder_hidden_states = self.process_encoder_hidden_states(
125 |             encoder_hidden_states=encoder_hidden_states, added_cond_kwargs=added_cond_kwargs
126 |         )
127 | 
128 |         # 2. pre-process
129 |         sample = rearrange(sample, "b c f h w -> (b f) c h w")
130 |         sample = self.conv_in(sample)
131 | 
132 |         # 2.5 GLIGEN position net
133 |         if cross_attention_kwargs is not None and cross_attention_kwargs.get("gligen", None) is not None:
134 |             cross_attention_kwargs = cross_attention_kwargs.copy()
135 |             gligen_args = cross_attention_kwargs.pop("gligen")
136 |             cross_attention_kwargs["gligen"] = {"objs": self.position_net(**gligen_args)}
137 | 
138 |         # 3. down
139 |         # we're popping the `scale` instead of getting it because otherwise `scale` will be propagated
140 |         # to the internal blocks and will raise deprecation warnings. this will be confusing for our users.
141 |         if cross_attention_kwargs is not None:
142 |             cross_attention_kwargs = cross_attention_kwargs.copy()
143 |             lora_scale = cross_attention_kwargs.pop("scale", 1.0)
144 |         else:
145 |             lora_scale = 1.0
146 | 
147 |         if USE_PEFT_BACKEND:
148 |             # weight the lora layers by setting `lora_scale` for each PEFT layer
149 |             scale_lora_layers(self, lora_scale)
150 | 
151 |         is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None
152 |         # using new arg down_intrablock_additional_residuals for T2I-Adapters, to distinguish from controlnets
153 |         is_adapter = down_intrablock_additional_residuals is not None
154 |         # maintain backward compatibility for legacy usage, where
155 |         #       T2I-Adapter and ControlNet both use down_block_additional_residuals arg
156 |         #       but can only use one or the other
157 |         if not is_adapter and mid_block_additional_residual is None and down_block_additional_residuals is not None:
158 |             deprecate(
159 |                 "T2I should not use down_block_additional_residuals",
160 |                 "1.3.0",
161 |                 "Passing intrablock residual connections with `down_block_additional_residuals` is deprecated \
162 |                        and will be removed in diffusers 1.3.0.  `down_block_additional_residuals` should only be used \
163 |                        for ControlNet. Please make sure use `down_intrablock_additional_residuals` instead. ",
164 |                 standard_warn=False,
165 |             )
166 |             down_intrablock_additional_residuals = down_block_additional_residuals
167 |             is_adapter = True
168 | 
169 |         res_cache = dict()
170 |         down_block_res_samples = (sample,)
171 |         for idx, downsample_block in enumerate(self.down_blocks):
172 |             if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
173 |                 # For t2i-adapter CrossAttnDownBlock2D
174 |                 additional_residuals = {}
175 |                 if is_adapter and len(down_intrablock_additional_residuals) > 0:
176 |                     additional_residuals["additional_residuals"] = down_intrablock_additional_residuals.pop(0)
177 | 
178 |                 sample, res_samples = downsample_block(
179 |                     hidden_states=sample,
180 |                     temb=emb,
181 |                     encoder_hidden_states=encoder_hidden_states,
182 |                     attention_mask=attention_mask,
183 |                     cross_attention_kwargs=cross_attention_kwargs,
184 |                     encoder_attention_mask=encoder_attention_mask,
185 |                     **additional_residuals,
186 |                 )
187 |             else:
188 |                 sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
189 |                 if is_adapter and len(down_intrablock_additional_residuals) > 0:
190 |                     sample += down_intrablock_additional_residuals.pop(0)
191 | 
192 |             res_cache[f"down_{idx}"] = sample.clone()
193 |             if not control_hidden_states is None and f'down3_{idx}' in control_hidden_states:
194 |                 sample += rearrange(control_hidden_states[f'down3_{idx}'], "b c f h w -> (b f) c h w")
195 |             if hasattr(self, 'motion_down') and use_motion:
196 |                 sample = self.motion_down[idx](sample,
197 |                               None if motion_pad_hidden_states is None else motion_pad_hidden_states[f'down_{idx}'],
198 |                               emb, num_frames)
199 | 
200 |             down_block_res_samples += res_samples
201 | 
202 |         if is_controlnet:
203 |             new_down_block_res_samples = ()
204 | 
205 |             for down_block_res_sample, down_block_additional_residual in zip(
206 |                 down_block_res_samples, down_block_additional_residuals
207 |             ):
208 |                 down_block_res_sample = down_block_res_sample + down_block_additional_residual
209 |                 new_down_block_res_samples = new_down_block_res_samples + (down_block_res_sample,)
210 | 
211 |             down_block_res_samples = new_down_block_res_samples
212 | 
213 |         # 4. mid
214 |         if self.mid_block is not None:
215 |             if hasattr(self.mid_block, "has_cross_attention") and self.mid_block.has_cross_attention:
216 |                 sample = self.mid_block(
217 |                     sample,
218 |                     emb,
219 |                     encoder_hidden_states=encoder_hidden_states,
220 |                     attention_mask=attention_mask,
221 |                     cross_attention_kwargs=cross_attention_kwargs,
222 |                     encoder_attention_mask=encoder_attention_mask,
223 |                 )
224 |             else:
225 |                 sample = self.mid_block(sample, emb)
226 | 
227 |             # To support T2I-Adapter-XL
228 |             if (
229 |                 is_adapter
230 |                 and len(down_intrablock_additional_residuals) > 0
231 |                 and sample.shape == down_intrablock_additional_residuals[0].shape
232 |             ):
233 |                 sample += down_intrablock_additional_residuals.pop(0)
234 | 
235 |         if is_controlnet:
236 |             sample = sample + mid_block_additional_residual
237 | 
238 |         # 5. up
239 |         for i, upsample_block in enumerate(self.up_blocks):
240 |             is_final_block = i == len(self.up_blocks) - 1
241 | 
242 |             res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
243 |             down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
244 | 
245 |             # if we have not reached the final block and need to forward the
246 |             # upsample size, we do it here
247 |             if not is_final_block and forward_upsample_size:
248 |                 upsample_size = down_block_res_samples[-1].shape[2:]
249 | 
250 |             res_cache[f"up_{i}"] = sample.clone()
251 |             if not control_hidden_states is None and f'up3_{i}' in control_hidden_states:
252 |                 sample += rearrange(control_hidden_states[f'up3_{i}'], "b c f h w -> (b f) c h w")
253 |             if hasattr(self, "reference_modules_up") and not reference_hidden_states is None and f'up_{i}' in reference_hidden_states:
254 |                 sample = self.reference_modules_up[i](sample, reference_hidden_states[f'up_{i}'], num_frames)
255 |             if hasattr(self, 'motion_up') and use_motion:
256 |                 sample = self.motion_up[i](sample,
257 |                               None if motion_pad_hidden_states is None else motion_pad_hidden_states[f'up_{i}'],
258 |                               emb, num_frames)
259 | 
260 |             if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
261 |                 sample = upsample_block(
262 |                     hidden_states=sample,
263 |                     temb=emb,
264 |                     res_hidden_states_tuple=res_samples,
265 |                     encoder_hidden_states=encoder_hidden_states,
266 |                     cross_attention_kwargs=cross_attention_kwargs,
267 |                     upsample_size=upsample_size,
268 |                     attention_mask=attention_mask,
269 |                     encoder_attention_mask=encoder_attention_mask,
270 |                 )
271 |             else:
272 |                 sample = upsample_block(
273 |                     hidden_states=sample,
274 |                     temb=emb,
275 |                     res_hidden_states_tuple=res_samples,
276 |                     upsample_size=upsample_size,
277 |                 )
278 | 
279 |         # 6. post-process
280 |         if self.conv_norm_out:
281 |             sample = self.conv_norm_out(sample)
282 |             sample = self.conv_act(sample)
283 |         sample = self.conv_out(sample)
284 | 
285 |         if USE_PEFT_BACKEND:
286 |             # remove `lora_scale` from each PEFT layer
287 |             unscale_lora_layers(self, lora_scale)
288 | 
289 |         # reshape to (batch, channel, framerate, width, height)
290 |         sample = rearrange(sample, "(b f) c h w -> b c f h w", f=num_frames)
291 | 
292 |         if not return_dict:
293 |             return (sample, res_cache)
294 | 
295 |         return (UNet2DConditionOutput(sample=sample), res_cache)
296 | 


--------------------------------------------------------------------------------
/hellomeme/models/hm3_denoising_motion.py:
--------------------------------------------------------------------------------
  1 | # coding: utf-8
  2 | 
  3 | """
  4 | @File   : models6/hm_denoising_motion.py
  5 | @Author : Songkey
  6 | @Email  : songkey@pku.edu.cn
  7 | @Date   : 1/3/2025
  8 | @Desc   :
  9 | adapted from: https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/unets/unet_motion_model.py
 10 | """
 11 | 
 12 | import torch
 13 | import torch.utils.checkpoint
 14 | from typing import Any, Dict, Optional, Tuple, Union
 15 | 
 16 | from einops import rearrange
 17 | 
 18 | from diffusers.utils import logging
 19 | from diffusers.models.unets.unet_motion_model import UNetMotionModel, UNetMotionOutput
 20 | from .hm_adapters import InsertReferenceAdapter
 21 | 
 22 | logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 23 | 
 24 | 
 25 | class HM3DenoisingMotion(UNetMotionModel, InsertReferenceAdapter):
 26 |     def forward(
 27 |             self,
 28 |             sample: torch.Tensor,
 29 |             timestep: Union[torch.Tensor, float, int],
 30 |             encoder_hidden_states: torch.Tensor,
 31 |             reference_hidden_states: Optional[dict] = None,
 32 |             control_hidden_states: Optional[torch.Tensor] = None,
 33 |             motion_pad_hidden_states: Optional[dict] = None,
 34 |             use_motion: bool = False,
 35 |             timestep_cond: Optional[torch.Tensor] = None,
 36 |             attention_mask: Optional[torch.Tensor] = None,
 37 |             cross_attention_kwargs: Optional[Dict[str, Any]] = None,
 38 |             added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
 39 |             down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
 40 |             mid_block_additional_residual: Optional[torch.Tensor] = None,
 41 |             return_dict: bool = True,
 42 |     ) -> Union[UNetMotionOutput, Tuple[torch.Tensor]]:
 43 | 
 44 |         # By default samples have to be AT least a multiple of the overall upsampling factor.
 45 |         # The overall upsampling factor is equal to 2 ** (# num of upsampling layears).
 46 |         # However, the upsampling interpolation output size can be forced to fit any upsampling size
 47 |         # on the fly if necessary.
 48 |         default_overall_up_factor = 2 ** self.num_upsamplers
 49 | 
 50 |         # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
 51 |         forward_upsample_size = False
 52 |         upsample_size = None
 53 | 
 54 |         if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
 55 |             logger.info("Forward upsample size to force interpolation output size.")
 56 |             forward_upsample_size = True
 57 | 
 58 |         # prepare attention_mask
 59 |         if attention_mask is not None:
 60 |             attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
 61 |             attention_mask = attention_mask.unsqueeze(1)
 62 | 
 63 |         # 1. time
 64 |         timesteps = timestep
 65 |         if not torch.is_tensor(timesteps):
 66 |             # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
 67 |             # This would be a good case for the `match` statement (Python 3.10+)
 68 |             is_mps = sample.device.type == "mps"
 69 |             if isinstance(timestep, float):
 70 |                 dtype = torch.float32 if is_mps else torch.float64
 71 |             else:
 72 |                 dtype = torch.int32 if is_mps else torch.int64
 73 |             timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
 74 |         elif len(timesteps.shape) == 0:
 75 |             timesteps = timesteps[None].to(sample.device)
 76 | 
 77 |         # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
 78 |         num_frames = sample.shape[2]
 79 |         timesteps = timesteps.expand(sample.shape[0])
 80 | 
 81 |         t_emb = self.time_proj(timesteps)
 82 | 
 83 |         # timesteps does not contain any weights and will always return f32 tensors
 84 |         # but time_embedding might actually be running in fp16. so we need to cast here.
 85 |         # there might be better ways to encapsulate this.
 86 |         t_emb = t_emb.to(dtype=self.dtype)
 87 | 
 88 |         emb = self.time_embedding(t_emb, timestep_cond)
 89 |         aug_emb = None
 90 | 
 91 |         if self.config.addition_embed_type == "text_time":
 92 |             if "text_embeds" not in added_cond_kwargs:
 93 |                 raise ValueError(
 94 |                     f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
 95 |                 )
 96 | 
 97 |             text_embeds = added_cond_kwargs.get("text_embeds")
 98 |             if "time_ids" not in added_cond_kwargs:
 99 |                 raise ValueError(
100 |                     f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
101 |                 )
102 |             time_ids = added_cond_kwargs.get("time_ids")
103 |             time_embeds = self.add_time_proj(time_ids.flatten())
104 |             time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))
105 | 
106 |             add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
107 |             add_embeds = add_embeds.to(emb.dtype)
108 |             aug_emb = self.add_embedding(add_embeds)
109 | 
110 |         emb = emb if aug_emb is None else emb + aug_emb
111 |         emb = emb.repeat_interleave(repeats=num_frames, dim=0)
112 | 
113 |         if len(encoder_hidden_states.shape) == 3:
114 |             encoder_hidden_states = encoder_hidden_states.repeat_interleave(repeats=num_frames, dim=0)
115 |         elif len(encoder_hidden_states.shape) == 4:
116 |             encoder_hidden_states = rearrange(encoder_hidden_states, "b f l d -> (b f) l d")
117 | 
118 |         if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "ip_image_proj":
119 |             if "image_embeds" not in added_cond_kwargs:
120 |                 raise ValueError(
121 |                     f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'ip_image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`"
122 |                 )
123 |             image_embeds = added_cond_kwargs.get("image_embeds")
124 |             image_embeds = self.encoder_hid_proj(image_embeds)
125 |             image_embeds = [image_embed.repeat_interleave(repeats=num_frames, dim=0) for image_embed in image_embeds]
126 |             encoder_hidden_states = (encoder_hidden_states, image_embeds)
127 | 
128 |         # 2. pre-process
129 |         sample = sample.permute(0, 2, 1, 3, 4).reshape((sample.shape[0] * num_frames, -1) + sample.shape[3:])
130 |         sample = self.conv_in(sample)
131 | 
132 |         # 3. down
133 |         down_block_res_samples = (sample,)
134 |         for idx, downsample_block in enumerate(self.down_blocks):
135 |             if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
136 |                 sample, res_samples = downsample_block(
137 |                     hidden_states=sample,
138 |                     temb=emb,
139 |                     encoder_hidden_states=encoder_hidden_states,
140 |                     attention_mask=attention_mask,
141 |                     num_frames=num_frames,
142 |                     cross_attention_kwargs=cross_attention_kwargs,
143 |                 )
144 |             else:
145 |                 sample, res_samples = downsample_block(hidden_states=sample, temb=emb, num_frames=num_frames)
146 | 
147 |             if not control_hidden_states is None and f'down3_{idx}' in control_hidden_states:
148 |                 sample += rearrange(control_hidden_states[f'down3_{idx}'], "b c f h w -> (b f) c h w")
149 |             if hasattr(self, 'motion_down') and use_motion:
150 |                 sample = self.motion_down[idx](sample, motion_pad_hidden_states[f'down_{idx}'], emb, num_frames)
151 | 
152 |             down_block_res_samples += res_samples
153 | 
154 |         if down_block_additional_residuals is not None:
155 |             new_down_block_res_samples = ()
156 | 
157 |             for down_block_res_sample, down_block_additional_residual in zip(
158 |                     down_block_res_samples, down_block_additional_residuals
159 |             ):
160 |                 down_block_res_sample = down_block_res_sample + down_block_additional_residual
161 |                 new_down_block_res_samples += (down_block_res_sample,)
162 | 
163 |             down_block_res_samples = new_down_block_res_samples
164 | 
165 |         # 4. mid
166 |         if self.mid_block is not None:
167 |             # To support older versions of motion modules that don't have a mid_block
168 |             if hasattr(self.mid_block, "motion_modules"):
169 |                 sample = self.mid_block(
170 |                     sample,
171 |                     emb,
172 |                     encoder_hidden_states=encoder_hidden_states,
173 |                     attention_mask=attention_mask,
174 |                     num_frames=num_frames,
175 |                     cross_attention_kwargs=cross_attention_kwargs,
176 |                 )
177 |             else:
178 |                 sample = self.mid_block(
179 |                     sample,
180 |                     emb,
181 |                     encoder_hidden_states=encoder_hidden_states,
182 |                     attention_mask=attention_mask,
183 |                     cross_attention_kwargs=cross_attention_kwargs,
184 |                 )
185 | 
186 |         if mid_block_additional_residual is not None:
187 |             sample = sample + mid_block_additional_residual
188 | 
189 |         # 5. up
190 |         for i, upsample_block in enumerate(self.up_blocks):
191 |             is_final_block = i == len(self.up_blocks) - 1
192 | 
193 |             res_samples = down_block_res_samples[-len(upsample_block.resnets):]
194 |             down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
195 | 
196 |             # if we have not reached the final block and need to forward the
197 |             # upsample size, we do it here
198 |             if not is_final_block and forward_upsample_size:
199 |                 upsample_size = down_block_res_samples[-1].shape[2:]
200 | 
201 |             if not control_hidden_states is None and f'up3_{i}' in control_hidden_states:
202 |                 sample += rearrange(control_hidden_states[f'up3_{i}'], "b c f h w -> (b f) c h w")
203 |             if hasattr(self, "reference_modules_up") and not reference_hidden_states is None and f'up_{i}' in reference_hidden_states:
204 |                 sample = self.reference_modules_up[i](sample, reference_hidden_states[f'up_{i}'], num_frames)
205 |             if hasattr(self, 'motion_up') and use_motion:
206 |                 sample = self.motion_up[i](sample, motion_pad_hidden_states[f'up_{i}'], emb, num_frames)
207 | 
208 |             if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
209 |                 sample = upsample_block(
210 |                     hidden_states=sample,
211 |                     temb=emb,
212 |                     res_hidden_states_tuple=res_samples,
213 |                     encoder_hidden_states=encoder_hidden_states,
214 |                     upsample_size=upsample_size,
215 |                     attention_mask=attention_mask,
216 |                     num_frames=num_frames,
217 |                     cross_attention_kwargs=cross_attention_kwargs,
218 |                 )
219 |             else:
220 |                 sample = upsample_block(
221 |                     hidden_states=sample,
222 |                     temb=emb,
223 |                     res_hidden_states_tuple=res_samples,
224 |                     upsample_size=upsample_size,
225 |                     num_frames=num_frames,
226 |                 )
227 | 
228 |         # 6. post-process
229 |         if self.conv_norm_out:
230 |             sample = self.conv_norm_out(sample)
231 |             sample = self.conv_act(sample)
232 | 
233 |         sample = self.conv_out(sample)
234 | 
235 |         # reshape to (batch, channel, framerate, width, height)
236 |         sample = sample[None, :].reshape((-1, num_frames) + sample.shape[1:]).permute(0, 2, 1, 3, 4)
237 | 
238 |         if not return_dict:
239 |             return (sample,)
240 | 
241 |         return UNetMotionOutput(sample=sample)


--------------------------------------------------------------------------------
/hellomeme/models/hm_denoising_3d.py:
--------------------------------------------------------------------------------
  1 | # coding: utf-8
  2 | 
  3 | """
  4 | @File   : models6/hm_denoising_3d.py
  5 | @Author : Songkey
  6 | @Email  : songkey@pku.edu.cn
  7 | @Date   : 8/14/2024
   8 | @Desc   : Removed experimental code and simplified the structure
  9 | adapted from: https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/unets/unet_2d_condition.py
 10 | """
 11 | 
 12 | import torch
 13 | import torch.utils.checkpoint
 14 | from typing import Any, Dict, Optional, Tuple, Union
 15 | 
 16 | from einops import rearrange
 17 | 
 18 | from diffusers.utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers
 19 | from diffusers.models.unets.unet_2d_condition import UNet2DConditionModel, UNet2DConditionOutput
 20 | from .hm_adapters import CopyWeights, InsertReferenceAdapter
 21 | 
 22 | logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 23 | 
 24 | 
 25 | class HMDenoising3D(UNet2DConditionModel, CopyWeights, InsertReferenceAdapter):
 26 |     def forward(
 27 |         self,
 28 |         sample: torch.Tensor,
 29 |         timestep: Union[torch.Tensor, float, int],
 30 |         encoder_hidden_states: torch.Tensor,
 31 |         reference_hidden_states: Optional[dict] = None,
 32 |         control_hidden_states: Optional[torch.Tensor] = None,
 33 |         class_labels: Optional[torch.Tensor] = None,
 34 |         timestep_cond: Optional[torch.Tensor] = None,
 35 |         attention_mask: Optional[torch.Tensor] = None,
 36 |         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
 37 |         added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
 38 |         down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
 39 |         mid_block_additional_residual: Optional[torch.Tensor] = None,
 40 |         down_intrablock_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
 41 |         encoder_attention_mask: Optional[torch.Tensor] = None,
 42 |         return_dict: bool = True,
 43 |     ) -> Union[UNet2DConditionOutput, Tuple]:
 44 |         # By default samples have to be AT least a multiple of the overall upsampling factor.
 45 |         # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
 46 |         # However, the upsampling interpolation output size can be forced to fit any upsampling size
 47 |         # on the fly if necessary.
 48 |         default_overall_up_factor = 2**self.num_upsamplers
 49 | 
 50 |         # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
 51 |         forward_upsample_size = False
 52 |         upsample_size = None
 53 | 
 54 |         for dim in sample.shape[-2:]:
 55 |             if dim % default_overall_up_factor != 0:
 56 |                 # Forward upsample size to force interpolation output size.
 57 |                 forward_upsample_size = True
 58 |                 break
 59 | 
 60 |         # ensure attention_mask is a bias, and give it a singleton query_tokens dimension
 61 |         # expects mask of shape:
 62 |         #   [batch, key_tokens]
 63 |         # adds singleton query_tokens dimension:
 64 |         #   [batch,                    1, key_tokens]
 65 |         # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
 66 |         #   [batch,  heads, query_tokens, key_tokens] (e.g. torch sdp attn)
 67 |         #   [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
 68 |         if attention_mask is not None:
 69 |             # assume that mask is expressed as:
 70 |             #   (1 = keep,      0 = discard)
 71 |             # convert mask into a bias that can be added to attention scores:
 72 |             #       (keep = +0,     discard = -10000.0)
 73 |             attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
 74 |             attention_mask = attention_mask.unsqueeze(1)
 75 | 
 76 |         # convert encoder_attention_mask to a bias the same way we do for attention_mask
 77 |         if encoder_attention_mask is not None:
 78 |             encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0
 79 |             encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
 80 | 
 81 |         # 0. center input if necessary
 82 |         if self.config.center_input_sample:
 83 |             sample = 2 * sample - 1.0
 84 | 
 85 |         # 1. time
 86 |         t_emb = self.get_time_embed(sample=sample, timestep=timestep)
 87 |         emb = self.time_embedding(t_emb, timestep_cond)
 88 |         aug_emb = None
 89 | 
 90 |         class_emb = self.get_class_embed(sample=sample, class_labels=class_labels)
 91 |         if class_emb is not None:
 92 |             if self.config.class_embeddings_concat:
 93 |                 emb = torch.cat([emb, class_emb], dim=-1)
 94 |             else:
 95 |                 emb = emb + class_emb
 96 | 
 97 |         aug_emb = self.get_aug_embed(
 98 |             emb=emb, encoder_hidden_states=encoder_hidden_states, added_cond_kwargs=added_cond_kwargs
 99 |         )
100 |         if self.config.addition_embed_type == "image_hint":
101 |             aug_emb, hint = aug_emb
102 |             sample = torch.cat([sample, hint], dim=1)
103 | 
104 |         emb = emb + aug_emb if aug_emb is not None else emb
105 | 
106 |         if self.time_embed_act is not None:
107 |             emb = self.time_embed_act(emb)
108 | 
109 |         num_frames = sample.shape[2]
110 |         emb = emb.repeat_interleave(repeats=num_frames, dim=0)
111 | 
112 |         if len(encoder_hidden_states.shape) == 3:
113 |             encoder_hidden_states = encoder_hidden_states.repeat_interleave(repeats=num_frames, dim=0)
114 |         elif len(encoder_hidden_states.shape) == 4:
115 |             encoder_hidden_states = rearrange(encoder_hidden_states, "b f l d -> (b f) l d")
116 | 
117 |         if not added_cond_kwargs is None and 'image_embeds' in added_cond_kwargs:
118 |             if isinstance(added_cond_kwargs['image_embeds'], torch.Tensor):
119 |                 added_cond_kwargs['image_embeds'] = added_cond_kwargs['image_embeds'].repeat_interleave(repeats=num_frames, dim=0)
120 |             else:
121 |                 added_cond_kwargs['image_embeds'] = [x.repeat_interleave(repeats=num_frames, dim=0) for x in added_cond_kwargs['image_embeds']]
122 | 
123 |         encoder_hidden_states = self.process_encoder_hidden_states(
124 |             encoder_hidden_states=encoder_hidden_states, added_cond_kwargs=added_cond_kwargs
125 |         )
126 | 
127 |         # 2. pre-process
128 |         sample = rearrange(sample, "b c f h w -> (b f) c h w")
129 |         sample = self.conv_in(sample)
130 | 
131 |         # 2.5 GLIGEN position net
132 |         if cross_attention_kwargs is not None and cross_attention_kwargs.get("gligen", None) is not None:
133 |             cross_attention_kwargs = cross_attention_kwargs.copy()
134 |             gligen_args = cross_attention_kwargs.pop("gligen")
135 |             cross_attention_kwargs["gligen"] = {"objs": self.position_net(**gligen_args)}
136 | 
137 |         # 3. down
138 |         # we're popping the `scale` instead of getting it because otherwise `scale` will be propagated
139 |         # to the internal blocks and will raise deprecation warnings. this will be confusing for our users.
140 |         if cross_attention_kwargs is not None:
141 |             cross_attention_kwargs = cross_attention_kwargs.copy()
142 |             lora_scale = cross_attention_kwargs.pop("scale", 1.0)
143 |         else:
144 |             lora_scale = 1.0
145 | 
146 |         if USE_PEFT_BACKEND:
147 |             # weight the lora layers by setting `lora_scale` for each PEFT layer
148 |             scale_lora_layers(self, lora_scale)
149 | 
150 |         is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None
151 |         # using new arg down_intrablock_additional_residuals for T2I-Adapters, to distinguish from controlnets
152 |         is_adapter = down_intrablock_additional_residuals is not None
153 |         # maintain backward compatibility for legacy usage, where
154 |         #       T2I-Adapter and ControlNet both use down_block_additional_residuals arg
155 |         #       but can only use one or the other
156 |         if not is_adapter and mid_block_additional_residual is None and down_block_additional_residuals is not None:
157 |             deprecate(
158 |                 "T2I should not use down_block_additional_residuals",
159 |                 "1.3.0",
160 |                 "Passing intrablock residual connections with `down_block_additional_residuals` is deprecated \
161 |                        and will be removed in diffusers 1.3.0.  `down_block_additional_residuals` should only be used \
162 |                        for ControlNet. Please make sure use `down_intrablock_additional_residuals` instead. ",
163 |                 standard_warn=False,
164 |             )
165 |             down_intrablock_additional_residuals = down_block_additional_residuals
166 |             is_adapter = True
167 | 
168 |         res_cache = dict()
169 |         down_block_res_samples = (sample,)
170 |         for idx, downsample_block in enumerate(self.down_blocks):
171 |             if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
172 |                 # For t2i-adapter CrossAttnDownBlock2D
173 |                 additional_residuals = {}
174 |                 if is_adapter and len(down_intrablock_additional_residuals) > 0:
175 |                     additional_residuals["additional_residuals"] = down_intrablock_additional_residuals.pop(0)
176 | 
177 |                 sample, res_samples = downsample_block(
178 |                     hidden_states=sample,
179 |                     temb=emb,
180 |                     encoder_hidden_states=encoder_hidden_states,
181 |                     attention_mask=attention_mask,
182 |                     cross_attention_kwargs=cross_attention_kwargs,
183 |                     encoder_attention_mask=encoder_attention_mask,
184 |                     **additional_residuals,
185 |                 )
186 |                 res_cache[f"down_{idx}"] = sample.clone()
187 |             else:
188 |                 sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
189 |                 if is_adapter and len(down_intrablock_additional_residuals) > 0:
190 |                     sample += down_intrablock_additional_residuals.pop(0)
191 | 
192 |             if not control_hidden_states is None and f'down_{idx}' in control_hidden_states:
193 |                 sample += rearrange(control_hidden_states[f'down_{idx}'], "b c f h w -> (b f) c h w")
194 |             if not control_hidden_states is None and f'down2_{idx}' in control_hidden_states:
195 |                 sample += rearrange(control_hidden_states[f'down2_{idx}'], "b c f h w -> (b f) c h w")
196 |             if hasattr(self, 'reference_modules_down') and not reference_hidden_states is None and f'down_{idx}' in reference_hidden_states:
197 |                 sample = self.reference_modules_down[idx](sample, reference_hidden_states[f'down_{idx}'], num_frames)
198 | 
199 |             down_block_res_samples += res_samples
200 | 
201 |         if is_controlnet:
202 |             new_down_block_res_samples = ()
203 | 
204 |             for down_block_res_sample, down_block_additional_residual in zip(
205 |                 down_block_res_samples, down_block_additional_residuals
206 |             ):
207 |                 down_block_res_sample = down_block_res_sample + down_block_additional_residual
208 |                 new_down_block_res_samples = new_down_block_res_samples + (down_block_res_sample,)
209 | 
210 |             down_block_res_samples = new_down_block_res_samples
211 | 
212 |         # 4. mid
213 |         if self.mid_block is not None:
214 |             if hasattr(self.mid_block, "has_cross_attention") and self.mid_block.has_cross_attention:
215 |                 sample = self.mid_block(
216 |                     sample,
217 |                     emb,
218 |                     encoder_hidden_states=encoder_hidden_states,
219 |                     attention_mask=attention_mask,
220 |                     cross_attention_kwargs=cross_attention_kwargs,
221 |                     encoder_attention_mask=encoder_attention_mask,
222 |                 )
223 |             else:
224 |                 sample = self.mid_block(sample, emb)
225 |             if hasattr(self, 'reference_modules_mid') and not reference_hidden_states is None and f'mid' in reference_hidden_states:
226 |                 sample = self.reference_modules_mid(sample, reference_hidden_states[f'mid'], num_frames)
227 | 
228 |             # To support T2I-Adapter-XL
229 |             if (
230 |                 is_adapter
231 |                 and len(down_intrablock_additional_residuals) > 0
232 |                 and sample.shape == down_intrablock_additional_residuals[0].shape
233 |             ):
234 |                 sample += down_intrablock_additional_residuals.pop(0)
235 |             res_cache[f"mid"] = sample.clone()
236 | 
237 |         if is_controlnet:
238 |             sample = sample + mid_block_additional_residual
239 | 
240 |         # 5. up
241 |         for i, upsample_block in enumerate(self.up_blocks):
242 |             is_final_block = i == len(self.up_blocks) - 1
243 | 
244 |             res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
245 |             down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
246 | 
247 |             # if we have not reached the final block and need to forward the
248 |             # upsample size, we do it here
249 |             if not is_final_block and forward_upsample_size:
250 |                 upsample_size = down_block_res_samples[-1].shape[2:]
251 | 
252 |             if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
253 |                 res_cache[f"up_{i}"] = sample.clone()
254 |                 if not control_hidden_states is None and f'up_v2_{i}' in control_hidden_states:
255 |                     sample += rearrange(control_hidden_states[f'up_v2_{i}'], "b c f h w -> (b f) c h w")
256 |                 if not control_hidden_states is None and f'up2_v2_{i}' in control_hidden_states:
257 |                     sample += rearrange(control_hidden_states[f'up2_v2_{i}'], "b c f h w -> (b f) c h w")
258 |                 if hasattr(self, "reference_modules_up") and not reference_hidden_states is None and f'up_{i}' in reference_hidden_states:
259 |                     sample = self.reference_modules_up[i-1](sample, reference_hidden_states[f'up_{i}'], num_frames)
260 | 
261 |                 sample = upsample_block(
262 |                     hidden_states=sample,
263 |                     temb=emb,
264 |                     res_hidden_states_tuple=res_samples,
265 |                     encoder_hidden_states=encoder_hidden_states,
266 |                     cross_attention_kwargs=cross_attention_kwargs,
267 |                     upsample_size=upsample_size,
268 |                     attention_mask=attention_mask,
269 |                     encoder_attention_mask=encoder_attention_mask,
270 |                 )
271 |             else:
272 |                 if not control_hidden_states is None and f'up_v2_{i}' in control_hidden_states:
273 |                     sample += rearrange(control_hidden_states[f'up_v2_{i}'], "b c f h w -> (b f) c h w")
274 |                 if not control_hidden_states is None and f'up2_v2_{i}' in control_hidden_states:
275 |                     sample += rearrange(control_hidden_states[f'up2_v2_{i}'], "b c f h w -> (b f) c h w")
276 |                 sample = upsample_block(
277 |                     hidden_states=sample,
278 |                     temb=emb,
279 |                     res_hidden_states_tuple=res_samples,
280 |                     upsample_size=upsample_size,
281 |                 )
282 | 
283 |         # 6. post-process
284 |         if self.conv_norm_out:
285 |             sample = self.conv_norm_out(sample)
286 |             sample = self.conv_act(sample)
287 |         sample = self.conv_out(sample)
288 | 
289 |         if USE_PEFT_BACKEND:
290 |             # remove `lora_scale` from each PEFT layer
291 |             unscale_lora_layers(self, lora_scale)
292 | 
293 |         # reshape to (batch, channel, framerate, width, height)
294 |         sample = rearrange(sample, "(b f) c h w -> b c f h w", f=num_frames)
295 | 
296 |         if not return_dict:
297 |             return (sample, res_cache)
298 | 
299 |         return (UNet2DConditionOutput(sample=sample), res_cache)
300 | 


--------------------------------------------------------------------------------
/hellomeme/models/hm_denoising_motion.py:
--------------------------------------------------------------------------------
  1 | # coding: utf-8
  2 | 
  3 | """
  4 | @File   : models6/hm_denoising_motion.py
  5 | @Author : Songkey
  6 | @Email  : songkey@pku.edu.cn
  7 | @Date   : 9/9/2024
  8 | @Desc   :
  9 | adapted from: https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/unets/unet_motion_model.py
 10 | """
 11 | 
 12 | import torch
 13 | import torch.utils.checkpoint
 14 | from typing import Any, Dict, Optional, Tuple, Union
 15 | 
 16 | from einops import rearrange
 17 | 
 18 | from diffusers.utils import logging
 19 | from diffusers.models.unets.unet_motion_model import UNetMotionModel, UNetMotionOutput
 20 | from .hm_adapters import InsertReferenceAdapter
 21 | 
 22 | logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 23 | 
 24 | 
 25 | class HMDenoisingMotion(UNetMotionModel, InsertReferenceAdapter):
 26 |     def forward(
 27 |             self,
 28 |             sample: torch.Tensor,
 29 |             timestep: Union[torch.Tensor, float, int],
 30 |             encoder_hidden_states: torch.Tensor,
 31 |             reference_hidden_states: Optional[dict] = None,
 32 |             control_hidden_states: Optional[torch.Tensor] = None,
 33 |             timestep_cond: Optional[torch.Tensor] = None,
 34 |             attention_mask: Optional[torch.Tensor] = None,
 35 |             cross_attention_kwargs: Optional[Dict[str, Any]] = None,
 36 |             added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
 37 |             down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
 38 |             mid_block_additional_residual: Optional[torch.Tensor] = None,
 39 |             return_dict: bool = True,
 40 |     ) -> Union[UNetMotionOutput, Tuple[torch.Tensor]]:
 41 | 
 42 |         # By default samples have to be AT least a multiple of the overall upsampling factor.
 43 |         # The overall upsampling factor is equal to 2 ** (# num of upsampling layears).
 44 |         # However, the upsampling interpolation output size can be forced to fit any upsampling size
 45 |         # on the fly if necessary.
 46 |         default_overall_up_factor = 2 ** self.num_upsamplers
 47 | 
 48 |         # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
 49 |         forward_upsample_size = False
 50 |         upsample_size = None
 51 | 
 52 |         if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
 53 |             logger.info("Forward upsample size to force interpolation output size.")
 54 |             forward_upsample_size = True
 55 | 
 56 |         # prepare attention_mask
 57 |         if attention_mask is not None:
 58 |             attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
 59 |             attention_mask = attention_mask.unsqueeze(1)
 60 | 
 61 |         # 1. time
 62 |         timesteps = timestep
 63 |         if not torch.is_tensor(timesteps):
 64 |             # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
 65 |             # This would be a good case for the `match` statement (Python 3.10+)
 66 |             is_mps = sample.device.type == "mps"
 67 |             if isinstance(timestep, float):
 68 |                 dtype = torch.float32 if is_mps else torch.float64
 69 |             else:
 70 |                 dtype = torch.int32 if is_mps else torch.int64
 71 |             timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
 72 |         elif len(timesteps.shape) == 0:
 73 |             timesteps = timesteps[None].to(sample.device)
 74 | 
 75 |         # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
 76 |         num_frames = sample.shape[2]
 77 |         timesteps = timesteps.expand(sample.shape[0])
 78 | 
 79 |         t_emb = self.time_proj(timesteps)
 80 | 
 81 |         # timesteps does not contain any weights and will always return f32 tensors
 82 |         # but time_embedding might actually be running in fp16. so we need to cast here.
 83 |         # there might be better ways to encapsulate this.
 84 |         t_emb = t_emb.to(dtype=self.dtype)
 85 | 
 86 |         emb = self.time_embedding(t_emb, timestep_cond)
 87 |         aug_emb = None
 88 | 
 89 |         if self.config.addition_embed_type == "text_time":
 90 |             if "text_embeds" not in added_cond_kwargs:
 91 |                 raise ValueError(
 92 |                     f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
 93 |                 )
 94 | 
 95 |             text_embeds = added_cond_kwargs.get("text_embeds")
 96 |             if "time_ids" not in added_cond_kwargs:
 97 |                 raise ValueError(
 98 |                     f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
 99 |                 )
100 |             time_ids = added_cond_kwargs.get("time_ids")
101 |             time_embeds = self.add_time_proj(time_ids.flatten())
102 |             time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))
103 | 
104 |             add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
105 |             add_embeds = add_embeds.to(emb.dtype)
106 |             aug_emb = self.add_embedding(add_embeds)
107 | 
108 |         emb = emb if aug_emb is None else emb + aug_emb
109 |         emb = emb.repeat_interleave(repeats=num_frames, dim=0)
110 |         if len(encoder_hidden_states.shape) == 3:
111 |             encoder_hidden_states = encoder_hidden_states.repeat_interleave(repeats=num_frames, dim=0)
112 |         elif len(encoder_hidden_states.shape) == 4:
113 |             encoder_hidden_states = rearrange(encoder_hidden_states, "b f l d -> (b f) l d")
114 | 
115 |         if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "ip_image_proj":
116 |             if "image_embeds" not in added_cond_kwargs:
117 |                 raise ValueError(
118 |                     f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'ip_image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`"
119 |                 )
120 |             image_embeds = added_cond_kwargs.get("image_embeds")
121 |             image_embeds = self.encoder_hid_proj(image_embeds)
122 |             image_embeds = [image_embed.repeat_interleave(repeats=num_frames, dim=0) for image_embed in image_embeds]
123 |             encoder_hidden_states = (encoder_hidden_states, image_embeds)
124 | 
125 |         # 2. pre-process
126 |         sample = sample.permute(0, 2, 1, 3, 4).reshape((sample.shape[0] * num_frames, -1) + sample.shape[3:])
127 |         sample = self.conv_in(sample)
128 | 
129 |         # 3. down
130 |         down_block_res_samples = (sample,)
131 |         for idx, downsample_block in enumerate(self.down_blocks):
132 |             if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
133 |                 sample, res_samples = downsample_block(
134 |                     hidden_states=sample,
135 |                     temb=emb,
136 |                     encoder_hidden_states=encoder_hidden_states,
137 |                     attention_mask=attention_mask,
138 |                     num_frames=num_frames,
139 |                     cross_attention_kwargs=cross_attention_kwargs,
140 |                 )
141 |             else:
142 |                 sample, res_samples = downsample_block(hidden_states=sample, temb=emb, num_frames=num_frames)
143 | 
144 |             if not control_hidden_states is None and f'down_{idx}' in control_hidden_states:
145 |                 sample += rearrange(control_hidden_states[f'down_{idx}'], "b c f h w -> (b f) c h w")
146 |             if not control_hidden_states is None and f'down2_{idx}' in control_hidden_states:
147 |                 sample += rearrange(control_hidden_states[f'down2_{idx}'], "b c f h w -> (b f) c h w")
148 | 
149 |             if hasattr(self, 'reference_modules_down') and not reference_hidden_states is None and f'down_{idx}' in reference_hidden_states:
150 |                 sample = self.reference_modules_down[idx](sample, reference_hidden_states[f'down_{idx}'], num_frames)
151 | 
152 |             down_block_res_samples += res_samples
153 | 
154 |         if down_block_additional_residuals is not None:
155 |             new_down_block_res_samples = ()
156 | 
157 |             for down_block_res_sample, down_block_additional_residual in zip(
158 |                     down_block_res_samples, down_block_additional_residuals
159 |             ):
160 |                 down_block_res_sample = down_block_res_sample + down_block_additional_residual
161 |                 new_down_block_res_samples += (down_block_res_sample,)
162 | 
163 |             down_block_res_samples = new_down_block_res_samples
164 | 
165 |         # 4. mid
166 |         if self.mid_block is not None:
167 |             # To support older versions of motion modules that don't have a mid_block
168 |             if hasattr(self.mid_block, "motion_modules"):
169 |                 sample = self.mid_block(
170 |                     sample,
171 |                     emb,
172 |                     encoder_hidden_states=encoder_hidden_states,
173 |                     attention_mask=attention_mask,
174 |                     num_frames=num_frames,
175 |                     cross_attention_kwargs=cross_attention_kwargs,
176 |                 )
177 |             else:
178 |                 sample = self.mid_block(
179 |                     sample,
180 |                     emb,
181 |                     encoder_hidden_states=encoder_hidden_states,
182 |                     attention_mask=attention_mask,
183 |                     cross_attention_kwargs=cross_attention_kwargs,
184 |                 )
185 |             if hasattr(self, 'reference_modules_mid') and not reference_hidden_states is None and f'mid' in reference_hidden_states:
186 |                 sample = self.reference_modules_mid(sample, reference_hidden_states[f'mid'], num_frames)
187 | 
188 |         if mid_block_additional_residual is not None:
189 |             sample = sample + mid_block_additional_residual
190 | 
191 |         # 5. up
192 |         for i, upsample_block in enumerate(self.up_blocks):
193 |             is_final_block = i == len(self.up_blocks) - 1
194 | 
195 |             res_samples = down_block_res_samples[-len(upsample_block.resnets):]
196 |             down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
197 | 
198 |             # if we have not reached the final block and need to forward the
199 |             # upsample size, we do it here
200 |             if not is_final_block and forward_upsample_size:
201 |                 upsample_size = down_block_res_samples[-1].shape[2:]
202 | 
203 |             if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
204 |                 if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
205 |                     if not control_hidden_states is None and f'up_v2_{i}' in control_hidden_states:
206 |                         sample += rearrange(control_hidden_states[f'up_v2_{i}'], "b c f h w -> (b f) c h w")
207 |                     if not control_hidden_states is None and f'up2_v2_{i}' in control_hidden_states:
208 |                         sample += rearrange(control_hidden_states[f'up2_v2_{i}'], "b c f h w -> (b f) c h w")
209 |                     if hasattr(self,
210 |                                "reference_modules_up") and not reference_hidden_states is None and f'up_{i}' in reference_hidden_states:
211 |                         sample = self.reference_modules_up[i - 1](sample, reference_hidden_states[f'up_{i}'],
212 |                                                                   num_frames)
213 | 
214 |                 sample = upsample_block(
215 |                     hidden_states=sample,
216 |                     temb=emb,
217 |                     res_hidden_states_tuple=res_samples,
218 |                     encoder_hidden_states=encoder_hidden_states,
219 |                     upsample_size=upsample_size,
220 |                     attention_mask=attention_mask,
221 |                     num_frames=num_frames,
222 |                     cross_attention_kwargs=cross_attention_kwargs,
223 |                 )
224 |             else:
225 |                 if not control_hidden_states is None and f'up_v2_{i}' in control_hidden_states:
226 |                     sample += rearrange(control_hidden_states[f'up_v2_{i}'], "b c f h w -> (b f) c h w")
227 |                 if not control_hidden_states is None and f'up2_v2_{i}' in control_hidden_states:
228 |                     sample += rearrange(control_hidden_states[f'up2_v2_{i}'], "b c f h w -> (b f) c h w")
229 |                 sample = upsample_block(
230 |                     hidden_states=sample,
231 |                     temb=emb,
232 |                     res_hidden_states_tuple=res_samples,
233 |                     upsample_size=upsample_size,
234 |                     num_frames=num_frames,
235 |                 )
236 | 
237 |         # 6. post-process
238 |         if self.conv_norm_out:
239 |             sample = self.conv_norm_out(sample)
240 |             sample = self.conv_act(sample)
241 | 
242 |         sample = self.conv_out(sample)
243 | 
244 |         # reshape to (batch, channel, framerate, width, height)
245 |         sample = sample[None, :].reshape((-1, num_frames) + sample.shape[1:]).permute(0, 2, 1, 3, 4)
246 | 
247 |         if not return_dict:
248 |             return (sample,)
249 | 
250 |         return UNetMotionOutput(sample=sample)


--------------------------------------------------------------------------------
/hellomeme/pipelines/__init__.py:
--------------------------------------------------------------------------------
 1 | # coding: utf-8
 2 | 
 3 | """
 4 | @File   : __init__.py.py
 5 | @Author : Songkey
 6 | @Email  : songkey@pku.edu.cn
 7 | @Date   : 8/29/2024
 8 | @Desc   : 
 9 | """
10 | 
11 | from .pipline_hm_image import HMImagePipeline
12 | from .pipline_hm_video import HMVideoPipeline
13 | from .pipline_hm3_image import HM3ImagePipeline
14 | from .pipline_hm3_video import HM3VideoPipeline
15 | from .pipline_hm5_image import HM5ImagePipeline
16 | from .pipline_hm5_video import HM5VideoPipeline


--------------------------------------------------------------------------------
/hellomeme/pipelines/pipline_hm3_image.py:
--------------------------------------------------------------------------------
  1 | # coding: utf-8
  2 | 
  3 | """
  4 | @File   : hm_pipline_image.py
  5 | @Author : Songkey
  6 | @Email  : songkey@pku.edu.cn
  7 | @Date   : 1/3/2025
  8 | @Desc   :
  9 | adapted from: https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
 10 | """
 11 | 
 12 | import copy
 13 | from typing import Any, Callable, Dict, List, Optional, Union
 14 | import torch
 15 | 
 16 | from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
 17 | from diffusers.image_processor import PipelineImageInput
 18 | from diffusers.utils import deprecate
 19 | from diffusers.utils.torch_utils import randn_tensor
 20 | from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
 21 | from diffusers import DPMSolverMultistepScheduler
 22 | from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img import retrieve_timesteps, retrieve_latents
 23 | from ..models import HM3Denoising3D, HMV3ControlNet, HMPipeline, HM3ReferenceAdapter, HMControlNetBase, HM4SD15ControlProj
 24 | 
 25 | class HM3ImagePipeline(HMPipeline):
 26 |     def caryomitosis(self, **kwargs):
 27 |         if hasattr(self, "unet_ref"):
 28 |             del self.unet_ref
 29 |         self.unet_ref = HM3Denoising3D.from_unet2d(self.unet)
 30 |         self.unet_ref.cpu()
 31 | 
 32 |         if not isinstance(self.unet, HM3Denoising3D):
 33 |             unet = HM3Denoising3D.from_unet2d(unet=self.unet)
 34 |             # todo: 不够优雅
 35 |             del self.unet
 36 |             self.unet = unet
 37 |             self.unet.cpu()
 38 | 
 39 |         self.vae.cpu()
 40 |         self.vae_decode = copy.deepcopy(self.vae)
 41 |         self.text_encoder.cpu()
 42 |         self.text_encoder_ref = copy.deepcopy(self.text_encoder)
 43 |         self.safety_checker.cpu()
 44 | 
 45 |     def insert_hm_modules(self, version='v3', dtype=torch.float16, modelscope=False):
 46 |         self.version = version
 47 |         if modelscope:
 48 |             from modelscope import snapshot_download
 49 |             if version == 'v3':
 50 |                 hm_reference_dir = snapshot_download('songkey/hm3_reference')
 51 |                 hm_control_dir = snapshot_download('songkey/hm3_control_mix')
 52 |             else:
 53 |                 hm_reference_dir = snapshot_download('songkey/hm4_reference')
 54 |                 hm_control_dir = snapshot_download('songkey/hm_control_base')
 55 |                 hm_control_proj_dir = snapshot_download('songkey/hm4_control_proj')
 56 |         else:
 57 |             if version == 'v3':
 58 |                 hm_reference_dir = 'songkey/hm3_reference'
 59 |                 hm_control_dir = 'songkey/hm3_control_mix'
 60 |             else:
 61 |                 hm_reference_dir = 'songkey/hm4_reference'
 62 |                 hm_control_dir = 'songkey/hm_control_base'
 63 |                 hm_control_proj_dir = 'songkey/hm4_control_proj'
 64 | 
 65 |         if isinstance(self.unet, HM3Denoising3D):
 66 |             hm_adapter = HM3ReferenceAdapter.from_pretrained(hm_reference_dir)
 67 |             self.unet.insert_reference_adapter(hm_adapter)
 68 |             self.unet.to(device='cpu', dtype=dtype).eval()
 69 | 
 70 |         if hasattr(self, "unet_ref"):
 71 |             self.unet_ref.to(device='cpu', dtype=dtype).eval()
 72 | 
 73 |         if hasattr(self, "mp_control"):
 74 |             del self.mp_control
 75 | 
 76 |         if hasattr(self, "mp_control_proj"):
 77 |             del self.mp_control_proj
 78 | 
 79 |         if version == 'v3':
 80 |             self.mp_control = HMV3ControlNet.from_pretrained(hm_control_dir)
 81 |         else:
 82 |             self.mp_control = HMControlNetBase.from_pretrained(hm_control_dir)
 83 |             self.mp_control_proj = HM4SD15ControlProj.from_pretrained(hm_control_proj_dir)
 84 | 
 85 |             self.mp_control_proj.to(device='cpu', dtype=dtype).eval()
 86 | 
 87 |         self.mp_control.to(device='cpu', dtype=dtype).eval()
 88 | 
 89 |         self.vae.to(device='cpu', dtype=dtype).eval()
 90 |         self.vae_decode.to(device='cpu', dtype=dtype).eval()
 91 |         self.text_encoder.to(device='cpu', dtype=dtype).eval()
 92 | 
 93 |     @torch.no_grad()
 94 |     def __call__(
 95 |             self,
 96 |             prompt: Union[str, List[str]] = None,
 97 |             image: PipelineImageInput = None,
 98 |             drive_params: Dict[str, Any] = None,
 99 |             strength: float = 0.8,
100 |             num_inference_steps: Optional[int] = 50,
101 |             timesteps: List[int] = None,
102 |             sigmas: List[float] = None,
103 |             guidance_scale: Optional[float] = 7.5,
104 |             negative_prompt: Optional[Union[str, List[str]]] = None,
105 |             eta: Optional[float] = 0.0,
106 |             generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
107 |             prompt_embeds: Optional[torch.Tensor] = None,
108 |             negative_prompt_embeds: Optional[torch.Tensor] = None,
109 |             ip_adapter_image: Optional[PipelineImageInput] = None,
110 |             ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
111 |             output_type: Optional[str] = "pil",
112 |             device: Optional[str] = "cpu",
113 |             return_dict: bool = True,
114 |             cross_attention_kwargs: Optional[Dict[str, Any]] = None,
115 |             clip_skip: int = None,
116 |             callback_on_step_end: Optional[
117 |                 Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
118 |             ] = None,
119 |             callback_on_step_end_tensor_inputs: List[str] = ["latents"],
120 |             **kwargs,
121 |     ):
122 |         callback = kwargs.pop("callback", None)
123 |         callback_steps = kwargs.pop("callback_steps", None)
124 |         num_images_per_prompt = 1
125 | 
126 |         if callback is not None:
127 |             deprecate(
128 |                 "callback",
129 |                 "1.0.0",
130 |                 "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
131 |             )
132 |         if callback_steps is not None:
133 |             deprecate(
134 |                 "callback_steps",
135 |                 "1.0.0",
136 |                 "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
137 |             )
138 | 
139 |         if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
140 |             callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
141 | 
142 |         # 1. Check inputs. Raise error if not correct
143 |         self.check_inputs(
144 |             prompt,
145 |             strength,
146 |             callback_steps,
147 |             negative_prompt,
148 |             prompt_embeds,
149 |             negative_prompt_embeds,
150 |             ip_adapter_image,
151 |             ip_adapter_image_embeds,
152 |             callback_on_step_end_tensor_inputs,
153 |         )
154 | 
155 |         self._guidance_scale = guidance_scale
156 |         self._clip_skip = clip_skip
157 |         self._cross_attention_kwargs = cross_attention_kwargs
158 |         self._interrupt = False
159 | 
160 |         # 2. Define call parameters
161 |         if prompt is not None and isinstance(prompt, str):
162 |             batch_size = 1
163 |         elif prompt is not None and isinstance(prompt, list):
164 |             batch_size = len(prompt)
165 |         else:
166 |             batch_size = prompt_embeds.shape[0]
167 | 
168 |         # device = self.device
169 | 
170 |         # 3. Encode input prompt
171 |         text_encoder_lora_scale = (
172 |             self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
173 |         )
174 | 
175 |         self.text_encoder_ref.to(device=device)
176 |         prompt_embeds_ref, negative_prompt_embeds_ref = self.encode_prompt_sk(
177 |             self.text_encoder_ref,
178 |             prompt,
179 |             device,
180 |             num_images_per_prompt,
181 |             self.do_classifier_free_guidance,
182 |             negative_prompt,
183 |             prompt_embeds=prompt_embeds,
184 |             negative_prompt_embeds=negative_prompt_embeds,
185 |             lora_scale=text_encoder_lora_scale,
186 |             clip_skip=self.clip_skip,
187 |         )
188 |         self.text_encoder_ref.cpu()
189 | 
190 |         self.text_encoder.to(device=device)
191 |         prompt_embeds, negative_prompt_embeds = self.encode_prompt_sk(
192 |             self.text_encoder,
193 |             prompt,
194 |             device,
195 |             num_images_per_prompt,
196 |             self.do_classifier_free_guidance,
197 |             negative_prompt,
198 |             prompt_embeds=prompt_embeds,
199 |             negative_prompt_embeds=negative_prompt_embeds,
200 |             lora_scale=text_encoder_lora_scale,
201 |             clip_skip=self.clip_skip,
202 |         )
203 |         self.text_encoder.cpu()
204 | 
205 |         # For classifier free guidance, we need to do two forward passes.
206 |         # Here we concatenate the unconditional and text embeddings into a single batch
207 |         # to avoid doing two forward passes
208 |         if self.do_classifier_free_guidance:
209 |             prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
210 |             prompt_embeds_ref = torch.cat([negative_prompt_embeds_ref, prompt_embeds_ref])
211 | 
212 |         if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
213 |             image_embeds = self.prepare_ip_adapter_image_embeds(
214 |                 ip_adapter_image,
215 |                 ip_adapter_image_embeds,
216 |                 device,
217 |                 batch_size * num_images_per_prompt,
218 |                 self.do_classifier_free_guidance,
219 |             )
220 | 
221 |         # 4. Preprocess
222 |         image = self.image_processor.preprocess(image).to(device=device, dtype=prompt_embeds.dtype)
223 | 
224 |         scheduler = DPMSolverMultistepScheduler(
225 |             num_train_timesteps=1000,
226 |             beta_start=0.00085,
227 |             beta_end=0.012,
228 |             beta_schedule="scaled_linear",
229 |             # use_karras_sigmas=True,
230 |             algorithm_type="sde-dpmsolver++",
231 |         )
232 | 
233 |         # 5. set timesteps
234 |         timesteps, num_inference_steps = retrieve_timesteps(scheduler, num_inference_steps, device, timesteps, sigmas)
235 | 
236 |         # 6. Prepare reference latents
237 |         self.vae.to(device=device)
238 |         ref_latents = [
239 |             retrieve_latents(self.vae.encode(image[i: i + 1].to(device=device)), generator=generator)
240 |             for i in range(batch_size)
241 |         ]
242 |         self.vae.cpu()
243 | 
244 |         ref_latents = torch.cat(ref_latents, dim=0)
245 |         ref_latents = self.vae.config.scaling_factor * ref_latents
246 |         c, h, w = ref_latents.shape[1:]
247 | 
248 |         condition = drive_params['condition'].clone().to(device=device)
249 |         if self.do_classifier_free_guidance:
250 |             condition = torch.cat([torch.ones_like(condition) * -1, condition], dim=0)
251 | 
252 |         control_latents = {}
253 |         self.mp_control.to(device=device)
254 |         if hasattr(self, 'mp_control_proj') and self.version == 'v4':
255 |             self.mp_control_proj.to(device=device)
256 |         if 'drive_coeff' in drive_params:
257 |             drive_coeff = drive_params['drive_coeff'].clone().to(device=device)
258 |             face_parts = drive_params['face_parts'].clone().to(device=device)
259 |             if self.do_classifier_free_guidance:
260 |                 drive_coeff = torch.cat([torch.zeros_like(drive_coeff), drive_coeff], dim=0)
261 |                 face_parts = torch.cat([torch.zeros_like(face_parts), face_parts], dim=0)
262 |             control_latents1 = self.mp_control(condition=condition, drive_coeff=drive_coeff, face_parts=face_parts)
263 |             if self.version == 'v4':
264 |                 control_latents1 = self.mp_control_proj(control_latents1)
265 |             control_latents.update(control_latents1)
266 |         elif 'pd_fpg' in drive_params:
267 |             pd_fpg = drive_params['pd_fpg'].clone().to(device=device)
268 |             if self.do_classifier_free_guidance:
269 |                 pd_fpg = torch.cat([torch.zeros_like(pd_fpg), pd_fpg], dim=0)
270 |             control_latents2 = self.mp_control(condition=condition, emo_embedding=pd_fpg)
271 |             if self.version == 'v4':
272 |                 control_latents2 = self.mp_control_proj(control_latents2)
273 |             control_latents.update(control_latents2)
274 |         self.mp_control.cpu()
275 |         if self.version == 'v4':
276 |             self.mp_control_proj.cpu()
277 | 
278 |         # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
279 |         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
280 | 
281 |         # 7.1 Add image embeds for IP-Adapter
282 |         added_cond_kwargs = (
283 |             {"image_embeds": image_embeds}
284 |             if ip_adapter_image is not None or ip_adapter_image_embeds is not None
285 |             else None
286 |         )
287 |         base_noise = randn_tensor([batch_size, c, h, w], dtype=prompt_embeds.dtype, generator=generator).to(device=device)
288 | 
289 |         latent_model_input = torch.cat([torch.zeros_like(ref_latents), ref_latents]) if (
290 |             self.do_classifier_free_guidance) else ref_latents
291 |         # latent_model_input = torch.cat([ref_latents_neg, ref_latents], dim=0)
292 |         self.unet_ref.to(device=device)
293 |         cached_res = self.unet_ref(
294 |             latent_model_input.unsqueeze(2),
295 |             0,
296 |             encoder_hidden_states=prompt_embeds_ref,
297 |             return_dict=False,
298 |         )[1]
299 |         self.unet_ref.cpu()
300 | 
301 |         # 7.2 Optionally get Guidance Scale Embedding
302 |         timestep_cond = None
303 |         if self.unet.config.time_cond_proj_dim is not None:
304 |             guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
305 |             timestep_cond = self.get_guidance_scale_embedding(
306 |                 guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
307 |             ).to(device=device, dtype=prompt_embeds.dtype)
308 | 
309 |         latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
310 |         # base_noise = randn_tensor([batch_size, c, h, w], dtype=prompt_embeds.dtype, generator=generator).to(device=device)
311 |         latents = base_noise * scheduler.init_noise_sigma
312 |         # 8. Denoising loop
313 |         num_warmup_steps = len(timesteps) - num_inference_steps * scheduler.order
314 |         self._num_timesteps = len(timesteps)
315 |         self.unet.to(device=device)
316 |         with self.progress_bar(total=num_inference_steps) as progress_bar:
317 |             for i, t in enumerate(timesteps):
318 |                 if self.interrupt:
319 |                     continue
320 | 
321 |                 # expand the latents if we are doing classifier free guidance
322 |                 latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
323 |                 latent_model_input = scheduler.scale_model_input(latent_model_input, t)
324 | 
325 |                 # predict the noise residual
326 |                 noise_pred = self.unet(
327 |                     latent_model_input.unsqueeze(2),
328 |                     t,
329 |                     encoder_hidden_states=prompt_embeds,
330 |                     reference_hidden_states=cached_res,
331 |                     control_hidden_states=control_latents,
332 |                     timestep_cond=timestep_cond,
333 |                     cross_attention_kwargs=self.cross_attention_kwargs,
334 |                     added_cond_kwargs=added_cond_kwargs,
335 |                     return_dict=False,
336 |                 )[0][:,:,0,:,:]
337 | 
338 |                 # perform guidance
339 |                 if self.do_classifier_free_guidance:
340 |                     noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
341 |                     noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
342 | 
343 |                 # compute the previous noisy sample x_t -> x_t-1
344 |                 latents = scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
345 | 
346 |                 if callback_on_step_end is not None:
347 |                     callback_kwargs = {}
348 |                     for k in callback_on_step_end_tensor_inputs:
349 |                         callback_kwargs[k] = locals()[k]
350 |                     callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
351 | 
352 |                     latents = callback_outputs.pop("latents", latents)
353 |                     prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
354 |                     negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
355 | 
356 |                 # call the callback, if provided
357 |                 if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % scheduler.order == 0):
358 |                     progress_bar.update()
359 |                     if callback is not None and i % callback_steps == 0:
360 |                         step_idx = i // getattr(scheduler, "order", 1)
361 |                         callback(step_idx, t, latents)
362 | 
363 |         self.unet.cpu()
364 | 
365 |         self.vae_decode.to(device=device)
366 |         if not output_type == "latent":
367 |             image = self.vae_decode.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
368 |                 0
369 |             ]
370 |         else:
371 |             image = latents
372 |         self.vae_decode.cpu()
373 | 
374 |         do_denormalize = [True] * image.shape[0]
375 | 
376 |         image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
377 | 
378 |         # Offload all models
379 |         self.maybe_free_model_hooks()
380 | 
381 |         return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=None), latents.detach().cpu() / self.vae.config.scaling_factor
382 | 


--------------------------------------------------------------------------------
/hellomeme/pipelines/pipline_hm5_image.py:
--------------------------------------------------------------------------------
  1 | # coding: utf-8
  2 | 
  3 | """
  4 | @File   : hm_pipline_image.py
  5 | @Author : Songkey
  6 | @Email  : songkey@pku.edu.cn
  7 | @Date   : 1/3/2025
  8 | @Desc   :
  9 | adapted from: https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
 10 | """
 11 | 
 12 | import copy
 13 | from typing import Any, Callable, Dict, List, Optional, Union
 14 | import torch
 15 | 
 16 | from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
 17 | from diffusers.image_processor import PipelineImageInput
 18 | from diffusers.utils import deprecate
 19 | from diffusers.utils.torch_utils import randn_tensor
 20 | from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
 21 | from diffusers import DPMSolverMultistepScheduler
 22 | from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img import retrieve_timesteps, retrieve_latents
 23 | from ..models import (HM3Denoising3D,
 24 |                      HMPipeline, HM5ReferenceAdapter,
 25 |                      HM5ControlNetBase,
 26 |                      HM5SD15ControlProj)
 27 | 
 28 | class HM5ImagePipeline(HMPipeline):
 29 |     def caryomitosis(self, **kwargs):
 30 |         if hasattr(self, "unet_ref"):
 31 |             del self.unet_ref
 32 |         self.unet_ref = HM3Denoising3D.from_unet2d(self.unet)
 33 |         self.unet_ref.cpu()
 34 | 
 35 |         if not isinstance(self.unet, HM3Denoising3D):
 36 |             unet = HM3Denoising3D.from_unet2d(unet=self.unet)
 37 |             # todo: 不够优雅
 38 |             del self.unet
 39 |             self.unet = unet
 40 |             self.unet.cpu()
 41 | 
 42 |         self.vae.cpu()
 43 |         self.vae_decode = copy.deepcopy(self.vae)
 44 |         self.text_encoder.cpu()
 45 |         self.text_encoder_ref = copy.deepcopy(self.text_encoder)
 46 |         self.safety_checker.cpu()
 47 | 
 48 |     def insert_hm_modules(self, version='v5', dtype=torch.float16, modelscope=False):
 49 | 
 50 |         self.version = version
 51 |         if modelscope:
 52 |             from modelscope import snapshot_download
 53 |             hm_reference_dir = snapshot_download('songkey/hm5_reference')
 54 |             hm_control_dir = snapshot_download('songkey/hm5_control_base')
 55 |             hm_control_proj_dir = snapshot_download('songkey/hm5_control_proj')
 56 |         else:
 57 |             hm_reference_dir = 'songkey/hm5_reference'
 58 |             hm_control_dir = 'songkey/hm5_control_base'
 59 |             hm_control_proj_dir = 'songkey/hm5_control_proj'
 60 | 
 61 |         if isinstance(self.unet, HM3Denoising3D):
 62 |             hm_adapter = HM5ReferenceAdapter.from_pretrained(hm_reference_dir)
 63 | 
 64 |             self.unet.insert_reference_adapter(hm_adapter)
 65 |             self.unet.to(device='cpu', dtype=dtype).eval()
 66 | 
 67 |         if hasattr(self, "unet_ref"):
 68 |             self.unet_ref.to(device='cpu', dtype=dtype).eval()
 69 | 
 70 |         if hasattr(self, "mp_control"):
 71 |             del self.mp_control
 72 | 
 73 |         if hasattr(self, "mp_control_proj"):
 74 |             del self.mp_control_proj
 75 | 
 76 |         self.mp_control = HM5ControlNetBase.from_pretrained(hm_control_dir)
 77 |         self.mp_control_proj = HM5SD15ControlProj.from_pretrained(hm_control_proj_dir)
 78 | 
 79 |         self.mp_control.to(device='cpu', dtype=dtype).eval()
 80 |         self.mp_control_proj.to(device='cpu', dtype=dtype).eval()
 81 | 
 82 |         self.vae.to(device='cpu', dtype=dtype).eval()
 83 |         self.vae_decode.to(device='cpu', dtype=dtype).eval()
 84 |         self.text_encoder.to(device='cpu', dtype=dtype).eval()
 85 | 
 86 |     @torch.no_grad()
 87 |     def __call__(
 88 |             self,
 89 |             prompt: Union[str, List[str]] = None,
 90 |             image: PipelineImageInput = None,
 91 |             drive_params: Dict[str, Any] = None,
 92 |             strength: float = 0.8,
 93 |             num_inference_steps: Optional[int] = 50,
 94 |             timesteps: List[int] = None,
 95 |             sigmas: List[float] = None,
 96 |             guidance_scale: Optional[float] = 7.5,
 97 |             negative_prompt: Optional[Union[str, List[str]]] = None,
 98 |             eta: Optional[float] = 0.0,
 99 |             generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
100 |             prompt_embeds: Optional[torch.Tensor] = None,
101 |             negative_prompt_embeds: Optional[torch.Tensor] = None,
102 |             ip_adapter_image: Optional[PipelineImageInput] = None,
103 |             ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
104 |             output_type: Optional[str] = "pil",
105 |             device: Optional[str] = "cpu",
106 |             return_dict: bool = True,
107 |             cross_attention_kwargs: Optional[Dict[str, Any]] = None,
108 |             clip_skip: int = None,
109 |             callback_on_step_end: Optional[
110 |                 Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
111 |             ] = None,
112 |             callback_on_step_end_tensor_inputs: List[str] = ["latents"],
113 |             **kwargs,
114 |     ):
115 |         callback = kwargs.pop("callback", None)
116 |         callback_steps = kwargs.pop("callback_steps", None)
117 |         num_images_per_prompt = 1
118 | 
119 |         if callback is not None:
120 |             deprecate(
121 |                 "callback",
122 |                 "1.0.0",
123 |                 "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
124 |             )
125 |         if callback_steps is not None:
126 |             deprecate(
127 |                 "callback_steps",
128 |                 "1.0.0",
129 |                 "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
130 |             )
131 | 
132 |         if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
133 |             callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
134 | 
135 |         # 1. Check inputs. Raise error if not correct
136 |         self.check_inputs(
137 |             prompt,
138 |             strength,
139 |             callback_steps,
140 |             negative_prompt,
141 |             prompt_embeds,
142 |             negative_prompt_embeds,
143 |             ip_adapter_image,
144 |             ip_adapter_image_embeds,
145 |             callback_on_step_end_tensor_inputs,
146 |         )
147 | 
148 |         self._guidance_scale = guidance_scale
149 |         self._clip_skip = clip_skip
150 |         self._cross_attention_kwargs = cross_attention_kwargs
151 |         self._interrupt = False
152 | 
153 |         # 2. Define call parameters
154 |         if prompt is not None and isinstance(prompt, str):
155 |             batch_size = 1
156 |         elif prompt is not None and isinstance(prompt, list):
157 |             batch_size = len(prompt)
158 |         else:
159 |             batch_size = prompt_embeds.shape[0]
160 | 
161 |         # device = self.device
162 | 
163 |         # 3. Encode input prompt
164 |         text_encoder_lora_scale = (
165 |             self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
166 |         )
167 | 
168 |         self.text_encoder_ref.to(device=device)
169 |         prompt_embeds_ref, negative_prompt_embeds_ref = self.encode_prompt_sk(
170 |             self.text_encoder_ref,
171 |             prompt,
172 |             device,
173 |             num_images_per_prompt,
174 |             self.do_classifier_free_guidance,
175 |             negative_prompt,
176 |             prompt_embeds=prompt_embeds,
177 |             negative_prompt_embeds=negative_prompt_embeds,
178 |             lora_scale=text_encoder_lora_scale,
179 |             clip_skip=self.clip_skip,
180 |         )
181 |         self.text_encoder_ref.cpu()
182 | 
183 |         self.text_encoder.to(device=device)
184 |         prompt_embeds, negative_prompt_embeds = self.encode_prompt_sk(
185 |             self.text_encoder,
186 |             prompt,
187 |             device,
188 |             num_images_per_prompt,
189 |             self.do_classifier_free_guidance,
190 |             negative_prompt,
191 |             prompt_embeds=prompt_embeds,
192 |             negative_prompt_embeds=negative_prompt_embeds,
193 |             lora_scale=text_encoder_lora_scale,
194 |             clip_skip=self.clip_skip,
195 |         )
196 |         self.text_encoder.cpu()
197 | 
198 |         # For classifier free guidance, we need to do two forward passes.
199 |         # Here we concatenate the unconditional and text embeddings into a single batch
200 |         # to avoid doing two forward passes
201 |         if self.do_classifier_free_guidance:
202 |             prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
203 |             prompt_embeds_ref = torch.cat([negative_prompt_embeds_ref, prompt_embeds_ref])
204 | 
205 |         if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
206 |             image_embeds = self.prepare_ip_adapter_image_embeds(
207 |                 ip_adapter_image,
208 |                 ip_adapter_image_embeds,
209 |                 device,
210 |                 batch_size * num_images_per_prompt,
211 |                 self.do_classifier_free_guidance,
212 |             )
213 | 
214 |         # 4. Preprocess
215 |         image = self.image_processor.preprocess(image).to(device=device, dtype=prompt_embeds.dtype)
216 | 
217 |         scheduler = DPMSolverMultistepScheduler(
218 |             num_train_timesteps=1000,
219 |             beta_start=0.00085,
220 |             beta_end=0.012,
221 |             beta_schedule="scaled_linear",
222 |             # use_karras_sigmas=True,
223 |             algorithm_type="sde-dpmsolver++",
224 |         )
225 | 
226 |         # 5. set timesteps
227 |         timesteps, num_inference_steps = retrieve_timesteps(scheduler, num_inference_steps, device, timesteps, sigmas)
228 | 
229 |         # 6. Prepare reference latents
230 |         self.vae.to(device=device)
231 |         ref_latents = [
232 |             retrieve_latents(self.vae.encode(image[i: i + 1].to(device=device)), generator=generator)
233 |             for i in range(batch_size)
234 |         ]
235 |         self.vae.cpu()
236 | 
237 |         ref_latents = torch.cat(ref_latents, dim=0)
238 |         ref_latents = self.vae.config.scaling_factor * ref_latents
239 |         c, h, w = ref_latents.shape[1:]
240 | 
241 |         condition = drive_params['condition'].clone().to(device=device)
242 |         if self.do_classifier_free_guidance:
243 |             condition = torch.cat([torch.ones_like(condition) * -1, condition], dim=0)
244 | 
245 |         control_latents = {}
246 |         self.mp_control.to(device=device)
247 |         self.mp_control_proj.to(device=device)
248 |         if 'drive_coeff' in drive_params:
249 |             drive_coeff = drive_params['drive_coeff'].clone().to(device=device)
250 |             face_parts = drive_params['face_parts'].clone().to(device=device)
251 |             if self.do_classifier_free_guidance:
252 |                 drive_coeff = torch.cat([torch.zeros_like(drive_coeff), drive_coeff], dim=0)
253 |                 face_parts = torch.cat([torch.zeros_like(face_parts), face_parts], dim=0)
254 |             control_latents1 = self.mp_control(condition=condition, drive_coeff=drive_coeff, face_parts=face_parts)
255 |             control_latents1 = self.mp_control_proj(control_latents1)
256 |             control_latents.update(control_latents1)
257 |         elif 'pd_fpg' in drive_params:
258 |             pd_fpg = drive_params['pd_fpg'].clone().to(device=device)
259 |             if self.do_classifier_free_guidance:
260 |                 pd_fpg = torch.cat([torch.zeros_like(pd_fpg), pd_fpg], dim=0)
261 |             control_latents2 = self.mp_control(condition=condition, emo_embedding=pd_fpg)
262 |             control_latents2 = self.mp_control_proj(control_latents2)
263 |             control_latents.update(control_latents2)
264 |         self.mp_control.cpu()
265 |         self.mp_control_proj.cpu()
266 | 
267 |         # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
268 |         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
269 | 
270 |         # 7.1 Add image embeds for IP-Adapter
271 |         added_cond_kwargs = (
272 |             {"image_embeds": image_embeds}
273 |             if ip_adapter_image is not None or ip_adapter_image_embeds is not None
274 |             else None
275 |         )
276 |         base_noise = randn_tensor([batch_size, c, h, w], dtype=prompt_embeds.dtype, generator=generator).to(device=device)
277 | 
278 |         latent_model_input = torch.cat([torch.zeros_like(ref_latents), ref_latents]) if (
279 |             self.do_classifier_free_guidance) else ref_latents
280 |         # latent_model_input = torch.cat([ref_latents_neg, ref_latents], dim=0)
281 |         self.unet_ref.to(device=device)
282 |         cached_res = self.unet_ref(
283 |             latent_model_input.unsqueeze(2),
284 |             0,
285 |             encoder_hidden_states=prompt_embeds_ref,
286 |             return_dict=False,
287 |         )[1]
288 |         self.unet_ref.cpu()
289 | 
290 |         # 7.2 Optionally get Guidance Scale Embedding
291 |         timestep_cond = None
292 |         if self.unet.config.time_cond_proj_dim is not None:
293 |             guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
294 |             timestep_cond = self.get_guidance_scale_embedding(
295 |                 guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
296 |             ).to(device=device, dtype=prompt_embeds.dtype)
297 | 
298 |         latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
299 | 
300 |         latents = base_noise * scheduler.init_noise_sigma
301 |         # 8. Denoising loop
302 |         num_warmup_steps = len(timesteps) - num_inference_steps * scheduler.order
303 |         self._num_timesteps = len(timesteps)
304 |         self.unet.to(device=device)
305 |         with self.progress_bar(total=num_inference_steps) as progress_bar:
306 |             for i, t in enumerate(timesteps):
307 |                 if self.interrupt:
308 |                     continue
309 | 
310 |                 # expand the latents if we are doing classifier free guidance
311 |                 latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
312 |                 latent_model_input = scheduler.scale_model_input(latent_model_input, t)
313 | 
314 |                 # predict the noise residual
315 |                 noise_pred = self.unet(
316 |                     latent_model_input.unsqueeze(2),
317 |                     t,
318 |                     encoder_hidden_states=prompt_embeds,
319 |                     reference_hidden_states=cached_res,
320 |                     control_hidden_states=control_latents,
321 |                     timestep_cond=timestep_cond,
322 |                     cross_attention_kwargs=self.cross_attention_kwargs,
323 |                     added_cond_kwargs=added_cond_kwargs,
324 |                     return_dict=False,
325 |                 )[0][:,:,0,:,:]
326 | 
327 |                 # perform guidance
328 |                 if self.do_classifier_free_guidance:
329 |                     noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
330 |                     noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
331 | 
332 |                 # compute the previous noisy sample x_t -> x_t-1
333 |                 latents = scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
334 | 
335 |                 if callback_on_step_end is not None:
336 |                     callback_kwargs = {}
337 |                     for k in callback_on_step_end_tensor_inputs:
338 |                         callback_kwargs[k] = locals()[k]
339 |                     callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
340 | 
341 |                     latents = callback_outputs.pop("latents", latents)
342 |                     prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
343 |                     negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
344 | 
345 |                 # call the callback, if provided
346 |                 if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % scheduler.order == 0):
347 |                     progress_bar.update()
348 |                     if callback is not None and i % callback_steps == 0:
349 |                         step_idx = i // getattr(scheduler, "order", 1)
350 |                         callback(step_idx, t, latents)
351 | 
352 |         self.unet.cpu()
353 | 
354 |         self.vae_decode.to(device=device)
355 |         if not output_type == "latent":
356 |             image = self.vae_decode.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
357 |                 0
358 |             ]
359 |         else:
360 |             image = latents
361 |         self.vae_decode.cpu()
362 | 
363 |         do_denormalize = [True] * image.shape[0]
364 | 
365 |         image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
366 | 
367 |         # Offload all models
368 |         self.maybe_free_model_hooks()
369 | 
370 |         return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=None), latents.detach().cpu() / self.vae.config.scaling_factor
371 | 


--------------------------------------------------------------------------------
/hellomeme/pipelines/pipline_hm_image.py:
--------------------------------------------------------------------------------
  1 | # coding: utf-8
  2 | 
  3 | """
@File   : pipline_hm_image.py
  5 | @Author : Songkey
  6 | @Email  : songkey@pku.edu.cn
  7 | @Date   : 8/29/2024
  8 | @Desc   :
  9 | adapted from: https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
 10 | """
 11 | 
 12 | import copy
 13 | from typing import Any, Callable, Dict, List, Optional, Union
 14 | import torch
 15 | 
 16 | from diffusers import EulerDiscreteScheduler
 17 | from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
 18 | from diffusers.image_processor import PipelineImageInput
 19 | from diffusers.utils import deprecate
 20 | from diffusers.utils.torch_utils import randn_tensor
 21 | from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
 22 | from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img import retrieve_timesteps, retrieve_latents
 23 | 
 24 | from ..models import HMDenoising3D, HMControlNet, HMControlNet2, HMV2ControlNet, HMV2ControlNet2, HMPipeline
 25 | from ..models import HMReferenceAdapter
 26 | 
 27 | class HMImagePipeline(HMPipeline):
 28 |     def caryomitosis(self, **kwargs):
 29 |         if hasattr(self, "unet_ref"):
 30 |             del self.unet_ref
 31 |         self.unet_ref = HMDenoising3D.from_unet2d(self.unet)
 32 |         self.unet_ref.cpu()
 33 | 
 34 |         if not isinstance(self.unet, HMDenoising3D):
 35 |             unet = HMDenoising3D.from_unet2d(unet=self.unet)
 36 |             # todo: 不够优雅
 37 |             del self.unet
 38 |             self.unet = unet
 39 |             self.unet.cpu()
 40 | 
 41 |         self.vae.cpu()
 42 |         self.vae_decode = copy.deepcopy(self.vae)
 43 |         self.text_encoder.cpu()
 44 |         self.text_encoder_ref = copy.deepcopy(self.text_encoder)
 45 |         self.safety_checker.cpu()
 46 | 
 47 |     def insert_hm_modules(self, version, dtype, modelscope=False):
 48 |         if modelscope:
 49 |             from modelscope import snapshot_download
 50 |             hm_reference_dir = snapshot_download('songkey/hm_reference')
 51 |             hm2_reference_dir = snapshot_download('songkey/hm2_reference')
 52 |             hm_control_dir = snapshot_download('songkey/hm_control')
 53 |             hm_control2_dir = snapshot_download('songkey/hm_control2')
 54 |             hm2_control_dir = snapshot_download('songkey/hm2_control')
 55 |             hm2_control2_dir = snapshot_download('songkey/hm2_control2')
 56 |         else:
 57 |             hm_reference_dir = 'songkey/hm_reference'
 58 |             hm2_reference_dir = 'songkey/hm2_reference'
 59 |             hm_control_dir = 'songkey/hm_control'
 60 |             hm_control2_dir = 'songkey/hm_control2'
 61 |             hm2_control_dir = 'songkey/hm2_control'
 62 |             hm2_control2_dir = 'songkey/hm2_control2'
 63 | 
 64 |         if isinstance(self.unet, HMDenoising3D):
 65 |             if version == 'v1':
 66 |                 hm_adapter = HMReferenceAdapter.from_pretrained(hm_reference_dir)
 67 |             else:
 68 |                 hm_adapter = HMReferenceAdapter.from_pretrained(hm2_reference_dir)
 69 |             self.unet.insert_reference_adapter(hm_adapter)
 70 |             self.unet.to(device='cpu', dtype=dtype).eval()
 71 | 
 72 |         if hasattr(self, "unet_ref"):
 73 |             self.unet_ref.to(device='cpu', dtype=dtype).eval()
 74 | 
 75 |         if hasattr(self, "mp_control"):
 76 |             del self.mp_control
 77 |         if version == 'v1':
 78 |             self.mp_control = HMControlNet.from_pretrained(hm_control_dir)
 79 |         else:
 80 |             self.mp_control = HMV2ControlNet.from_pretrained(hm2_control_dir)
 81 |         self.mp_control.to(device='cpu', dtype=dtype).eval()
 82 | 
 83 |         if hasattr(self, "mp_control2"):
 84 |             del self.mp_control2
 85 |         if version == 'v1':
 86 |             self.mp_control2 = HMControlNet2.from_pretrained(hm_control2_dir)
 87 |         else:
 88 |             self.mp_control2 = HMV2ControlNet2.from_pretrained(hm2_control2_dir)
 89 |         self.mp_control2.to(device='cpu', dtype=dtype).eval()
 90 | 
 91 |         self.vae.to(device='cpu', dtype=dtype).eval()
 92 |         self.vae_decode.to(device='cpu', dtype=dtype).eval()
 93 |         self.text_encoder.to(device='cpu', dtype=dtype).eval()
 94 | 
 95 |     @torch.no_grad()
 96 |     def __call__(
 97 |             self,
 98 |             prompt: Union[str, List[str]] = None,
 99 |             image: PipelineImageInput = None,
100 |             drive_params: Dict[str, Any] = None,
101 |             strength: float = 0.8,
102 |             num_inference_steps: Optional[int] = 50,
103 |             timesteps: List[int] = None,
104 |             sigmas: List[float] = None,
105 |             guidance_scale: Optional[float] = 7.5,
106 |             negative_prompt: Optional[Union[str, List[str]]] = None,
107 |             eta: Optional[float] = 0.0,
108 |             generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
109 |             prompt_embeds: Optional[torch.Tensor] = None,
110 |             negative_prompt_embeds: Optional[torch.Tensor] = None,
111 |             ip_adapter_image: Optional[PipelineImageInput] = None,
112 |             ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
113 |             output_type: Optional[str] = "pil",
114 |             device: Optional[str] = "cpu",
115 |             return_dict: bool = True,
116 |             cross_attention_kwargs: Optional[Dict[str, Any]] = None,
117 |             clip_skip: int = None,
118 |             callback_on_step_end: Optional[
119 |                 Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
120 |             ] = None,
121 |             callback_on_step_end_tensor_inputs: List[str] = ["latents"],
122 |             **kwargs,
123 |     ):
124 |         callback = kwargs.pop("callback", None)
125 |         callback_steps = kwargs.pop("callback_steps", None)
126 |         num_images_per_prompt = 1
127 | 
128 |         if callback is not None:
129 |             deprecate(
130 |                 "callback",
131 |                 "1.0.0",
132 |                 "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
133 |             )
134 |         if callback_steps is not None:
135 |             deprecate(
136 |                 "callback_steps",
137 |                 "1.0.0",
138 |                 "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
139 |             )
140 | 
141 |         if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
142 |             callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
143 | 
144 |         # 1. Check inputs. Raise error if not correct
145 |         self.check_inputs(
146 |             prompt,
147 |             strength,
148 |             callback_steps,
149 |             negative_prompt,
150 |             prompt_embeds,
151 |             negative_prompt_embeds,
152 |             ip_adapter_image,
153 |             ip_adapter_image_embeds,
154 |             callback_on_step_end_tensor_inputs,
155 |         )
156 | 
157 |         self._guidance_scale = guidance_scale
158 |         self._clip_skip = clip_skip
159 |         self._cross_attention_kwargs = cross_attention_kwargs
160 |         self._interrupt = False
161 | 
162 |         # 2. Define call parameters
163 |         if prompt is not None and isinstance(prompt, str):
164 |             batch_size = 1
165 |         elif prompt is not None and isinstance(prompt, list):
166 |             batch_size = len(prompt)
167 |         else:
168 |             batch_size = prompt_embeds.shape[0]
169 | 
170 |         # device = self.device
171 | 
172 |         # 3. Encode input prompt
173 |         text_encoder_lora_scale = (
174 |             self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
175 |         )
176 | 
177 |         self.text_encoder_ref.to(device=device)
178 |         prompt_embeds_ref, negative_prompt_embeds_ref = self.encode_prompt_sk(
179 |             self.text_encoder_ref,
180 |             prompt,
181 |             device,
182 |             num_images_per_prompt,
183 |             self.do_classifier_free_guidance,
184 |             negative_prompt,
185 |             prompt_embeds=prompt_embeds,
186 |             negative_prompt_embeds=negative_prompt_embeds,
187 |             lora_scale=text_encoder_lora_scale,
188 |             clip_skip=self.clip_skip,
189 |         )
190 |         self.text_encoder_ref.cpu()
191 | 
192 |         self.text_encoder.to(device=device)
193 |         prompt_embeds, negative_prompt_embeds = self.encode_prompt_sk(
194 |             self.text_encoder,
195 |             prompt,
196 |             device,
197 |             num_images_per_prompt,
198 |             self.do_classifier_free_guidance,
199 |             negative_prompt,
200 |             prompt_embeds=prompt_embeds,
201 |             negative_prompt_embeds=negative_prompt_embeds,
202 |             lora_scale=text_encoder_lora_scale,
203 |             clip_skip=self.clip_skip,
204 |         )
205 |         self.text_encoder.cpu()
206 | 
207 |         # For classifier free guidance, we need to do two forward passes.
208 |         # Here we concatenate the unconditional and text embeddings into a single batch
209 |         # to avoid doing two forward passes
210 |         if self.do_classifier_free_guidance:
211 |             prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
212 |             prompt_embeds_ref = torch.cat([negative_prompt_embeds_ref, prompt_embeds_ref])
213 | 
214 |         if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
215 |             image_embeds = self.prepare_ip_adapter_image_embeds(
216 |                 ip_adapter_image,
217 |                 ip_adapter_image_embeds,
218 |                 device,
219 |                 batch_size * num_images_per_prompt,
220 |                 self.do_classifier_free_guidance,
221 |             )
222 | 
223 |         # 4. Preprocess
224 |         image = self.image_processor.preprocess(image).to(device=device, dtype=prompt_embeds.dtype)
225 | 
226 |         scheduler = EulerDiscreteScheduler(
227 |             num_train_timesteps=1000,
228 |             beta_start=0.00085,
229 |             beta_end=0.012,
230 |             beta_schedule="scaled_linear",
231 |         )
232 | 
233 |         # 5. set timesteps
234 |         timesteps, num_inference_steps = retrieve_timesteps(
235 |             scheduler, num_inference_steps, device, timesteps, sigmas
236 |         )
237 |         # timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
238 |         latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
239 | 
240 |         # 6. Prepare reference latents
241 |         self.vae.to(device=device)
242 |         ref_latents = [
243 |             retrieve_latents(self.vae.encode(image[i: i + 1].to(device=device)), generator=generator)
244 |             for i in range(batch_size)
245 |         ]
246 |         self.vae.cpu()
247 | 
248 |         ref_latents = torch.cat(ref_latents, dim=0)
249 |         ref_latents = self.vae.config.scaling_factor * ref_latents
250 |         c, h, w = ref_latents.shape[1:]
251 | 
252 |         condition = drive_params['condition'].clone().to(device=device)
253 |         if self.do_classifier_free_guidance:
254 |             condition = torch.cat([torch.ones_like(condition) * -1, condition], dim=0)
255 | 
256 |         control_latents = {}
257 |         if 'drive_coeff' in drive_params:
258 |             self.mp_control.to(device=device)
259 |             drive_coeff = drive_params['drive_coeff'].clone().to(device=device)
260 |             face_parts = drive_params['face_parts'].clone().to(device=device)
261 |             if self.do_classifier_free_guidance:
262 |                 drive_coeff = torch.cat([torch.zeros_like(drive_coeff), drive_coeff], dim=0)
263 |                 face_parts = torch.cat([torch.zeros_like(face_parts), face_parts], dim=0)
264 |             control_latents1 = self.mp_control(condition=condition, drive_coeff=drive_coeff, face_parts=face_parts)
265 |             control_latents.update(control_latents1)
266 |             self.mp_control.cpu()
267 | 
268 |         if 'pd_fpg' in drive_params:
269 |             self.mp_control2.to(device=device)
270 |             pd_fpg = drive_params['pd_fpg'].clone().to(device=device)
271 |             if self.do_classifier_free_guidance:
272 |                 neg_pd_fpg = drive_params['neg_pd_fpg'].clone().to(device=device)
273 |                 neg_pd_fpg.repeat_interleave(pd_fpg.size(1), dim=1)
274 |                 pd_fpg = torch.cat([neg_pd_fpg, pd_fpg], dim=0)
275 |             control_latents2 = self.mp_control2(condition=condition, emo_embedding=pd_fpg)
276 |             control_latents.update(control_latents2)
277 |             self.mp_control2.cpu()
278 | 
279 |         # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
280 |         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
281 | 
282 |         # 7.1 Add image embeds for IP-Adapter
283 |         added_cond_kwargs = (
284 |             {"image_embeds": image_embeds}
285 |             if ip_adapter_image is not None or ip_adapter_image_embeds is not None
286 |             else None
287 |         )
288 | 
289 |         latent_model_input = torch.cat([torch.zeros_like(ref_latents), ref_latents]) if self.do_classifier_free_guidance else ref_latents
290 |         self.unet_ref.to(device=device)
291 |         cached_res = self.unet_ref(
292 |             latent_model_input.unsqueeze(2),
293 |             0,
294 |             encoder_hidden_states=prompt_embeds_ref,
295 |             return_dict=False,
296 |         )[1]
297 |         self.unet_ref.cpu()
298 | 
299 |         # 7.2 Optionally get Guidance Scale Embedding
300 |         timestep_cond = None
301 |         if self.unet.config.time_cond_proj_dim is not None:
302 |             guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
303 |             timestep_cond = self.get_guidance_scale_embedding(
304 |                 guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
305 |             ).to(device=device, dtype=prompt_embeds.dtype)
306 | 
307 |         latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
308 |         base_noise = randn_tensor([batch_size, c, h, w], dtype=prompt_embeds.dtype, generator=generator).to(device=device)
309 |         latents = base_noise * scheduler.init_noise_sigma
310 |         # 8. Denoising loop
311 |         num_warmup_steps = len(timesteps) - num_inference_steps * scheduler.order
312 |         self._num_timesteps = len(timesteps)
313 |         self.unet.to(device=device)
314 |         with self.progress_bar(total=num_inference_steps) as progress_bar:
315 |             for i, t in enumerate(timesteps):
316 |                 if self.interrupt:
317 |                     continue
318 | 
319 |                 # expand the latents if we are doing classifier free guidance
320 |                 latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
321 |                 latent_model_input = scheduler.scale_model_input(latent_model_input, t)
322 | 
323 |                 # predict the noise residual
324 |                 noise_pred = self.unet(
325 |                     latent_model_input.unsqueeze(2),
326 |                     t,
327 |                     encoder_hidden_states=prompt_embeds,
328 |                     reference_hidden_states=cached_res,
329 |                     control_hidden_states=control_latents,
330 |                     timestep_cond=timestep_cond,
331 |                     cross_attention_kwargs=self.cross_attention_kwargs,
332 |                     added_cond_kwargs=added_cond_kwargs,
333 |                     return_dict=False,
334 |                 )[0][:,:,0,:,:]
335 | 
336 |                 # perform guidance
337 |                 if self.do_classifier_free_guidance:
338 |                     noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
339 |                     noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
340 | 
341 |                 # compute the previous noisy sample x_t -> x_t-1
342 |                 latents = scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
343 | 
344 |                 if callback_on_step_end is not None:
345 |                     callback_kwargs = {}
346 |                     for k in callback_on_step_end_tensor_inputs:
347 |                         callback_kwargs[k] = locals()[k]
348 |                     callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
349 | 
350 |                     latents = callback_outputs.pop("latents", latents)
351 |                     prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
352 |                     negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
353 | 
354 |                 # call the callback, if provided
355 |                 if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % scheduler.order == 0):
356 |                     progress_bar.update()
357 |                     if callback is not None and i % callback_steps == 0:
358 |                         step_idx = i // getattr(scheduler, "order", 1)
359 |                         callback(step_idx, t, latents)
360 | 
361 |         self.unet.cpu()
362 | 
363 |         self.vae_decode.to(device=device)
364 |         if not output_type == "latent":
365 |             image = self.vae_decode.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
366 |                 0
367 |             ]
368 |         else:
369 |             image = latents
370 |         self.vae_decode.cpu()
371 | 
372 |         do_denormalize = [True] * image.shape[0]
373 | 
374 |         image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
375 | 
376 |         # Offload all models
377 |         self.maybe_free_model_hooks()
378 | 
379 |         return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=None), latents.detach().cpu() / self.vae.config.scaling_factor
380 | 


--------------------------------------------------------------------------------
/hellomeme/tools/__init__.py:
--------------------------------------------------------------------------------
 1 | # coding: utf-8
 2 | 
 3 | # @File   : __init__.py
 4 | # @Author : Songkey
 5 | # @Email  : songkey@pku.edu.cn
 6 | # @Date   : 8/28/2024
 7 | # @Desc   :
 8 | 
 9 | from .hello_arkit import HelloARKitBSPred
10 | from .hello_face_det import HelloFaceDet
11 | from .hello_camera_demo import HelloCameraDemo
12 | from .hello_3dmm import Hello3DMMPred
13 | from .hello_face_alignment import HelloFaceAlignment
14 | from .pdf import FanEncoder
15 | 


--------------------------------------------------------------------------------
/hellomeme/tools/hello_3dmm.py:
--------------------------------------------------------------------------------
  1 | # coding: utf-8
  2 | 
  3 | """
@File   : hello_3dmm.py
  5 | @Author : Songkey
  6 | @Email  : songkey@pku.edu.cn
  7 | @Date   : 11/1/2024
  8 | @Desc   : Created by Shengjie Wu (wu.shengjie@immomo.com)
This may be a very powerful model.
 10 | """
 11 | 
 12 | import numpy as np
 13 | import cv2
 14 | import os.path as osp
 15 | 
 16 | from .utils import get_warp_mat_bbox_by_gt_pts_float, create_onnx_session
 17 | 
 18 | def crop_transl_to_full_transl(crop_trans, crop_center, scale, full_center, focal_length):
 19 |     """
 20 |     :param crop_trans: (3), float
 21 |     :param crop_center: (2), float
 22 |     :param scale: (1), float
 23 |     :param full_center: (2), float
 24 |     :param focal_length: (1), float
 25 |     :return:
 26 |     """
 27 |     crop_c_x, crop_c_y = crop_center
 28 |     full_c_x, full_c_y = full_center
 29 |     bs = 2 * focal_length / scale / crop_trans[2]
 30 |     full_x = crop_trans[0] - 2 * (crop_c_x - full_c_x) / bs
 31 |     full_y = crop_trans[1] + 2 * (crop_c_y - full_c_y) / bs
 32 |     full_z = crop_trans[2] * scale
 33 | 
 34 |     full_trans = np.array([full_x, full_y, full_z], dtype=np.float32)
 35 | 
 36 |     return full_trans
 37 | 
 38 | class Hello3DMMPred(object):
 39 |     def __init__(self, gpu_id=None, modelscope=False):
 40 |         if modelscope:
 41 |             from modelscope import snapshot_download
 42 |             model_path = osp.join(snapshot_download('songkey/hello_group_facemodel'), 'hello_3dmm.onnx')
 43 |         else:
 44 |             from huggingface_hub import hf_hub_download
 45 |             model_path = hf_hub_download('songkey/hello_group_facemodel', filename='hello_3dmm.onnx')
 46 |         self.deep3d_pred_net = create_onnx_session(model_path, gpu_id=gpu_id)
 47 |         self.deep3d_pred_net_input_name = self.deep3d_pred_net.get_inputs()[0].name
 48 |         self.deep3d_pred_net_output_name = [output.name for output in self.deep3d_pred_net.get_outputs()]
 49 | 
 50 |         self.image_size = 224
 51 |         self.camera_init_z = -0.4
 52 |         self.camera_init_focal_len = 386.2879122887948
 53 |         self.used_focal_len = -5.0 / self.camera_init_z * self.camera_init_focal_len
 54 |         self.id_dims = 526
 55 |         self.exp_dims = 203
 56 |         self.tex_dims = 439
 57 | 
 58 |     def forward_params(self, src_image, src_pt):
 59 |         align_mat_info = get_warp_mat_bbox_by_gt_pts_float(src_pt, base_angle=0, dst_size=self.image_size, expand_ratio=0.35, return_info=True)
 60 |         align_mat = align_mat_info["M"]
 61 | 
 62 |         align_image_rgb_uint8 = cv2.cvtColor(cv2.warpAffine(src_image, align_mat, (self.image_size, self.image_size)), cv2.COLOR_BGR2RGB)
 63 | 
 64 |         # cv2.imshow('align_image_rgb_uint8', align_image_rgb_uint8)
 65 | 
 66 |         align_image_rgb_fp32 = align_image_rgb_uint8.astype(np.float32) / 255.0
 67 |         align_image_rgb_fp32_onnx_input = align_image_rgb_fp32.copy().transpose((2, 0, 1))[np.newaxis, ...]
 68 |         pred_coeffs = self.deep3d_pred_net.run(self.deep3d_pred_net_output_name,
 69 |                                                {self.deep3d_pred_net_input_name: align_image_rgb_fp32_onnx_input})[0]
 70 | 
 71 |         angles = pred_coeffs[:, self.id_dims + self.exp_dims + self.tex_dims:self.id_dims + self.exp_dims + self.tex_dims + 3]
 72 |         translations = pred_coeffs[:, self.id_dims + self.exp_dims + self.tex_dims + 3 + 27:]
 73 | 
 74 |         crop_global_transl = crop_transl_to_full_transl(translations[0],
 75 |                                                         crop_center=[align_mat_info["center_x"],
 76 |                                                                      align_mat_info["center_y"]],
 77 |                                                         scale=align_mat_info["scale"],
 78 |                                                         full_center=[src_image.shape[1] * 0.5, src_image.shape[0] * 0.5],
 79 |                                                         focal_length=self.used_focal_len)
 80 |         return angles, crop_global_transl[np.newaxis, :]
 81 | 
 82 | def compute_rotation_matrix(angles):
 83 |     n_b = angles.shape[0]
 84 |     sinx = np.sin(angles[:, 0])
 85 |     siny = np.sin(angles[:, 1])
 86 |     sinz = np.sin(angles[:, 2])
 87 |     cosx = np.cos(angles[:, 0])
 88 |     cosy = np.cos(angles[:, 1])
 89 |     cosz = np.cos(angles[:, 2])
 90 |     rotXYZ = np.eye(3).reshape(1, 3, 3).repeat(n_b*3, 0).reshape(3, n_b, 3, 3)
 91 |     rotXYZ[0, :, 1, 1] = cosx
 92 |     rotXYZ[0, :, 1, 2] = -sinx
 93 |     rotXYZ[0, :, 2, 1] = sinx
 94 |     rotXYZ[0, :, 2, 2] = cosx
 95 |     rotXYZ[1, :, 0, 0] = cosy
 96 |     rotXYZ[1, :, 0, 2] = siny
 97 |     rotXYZ[1, :, 2, 0] = -siny
 98 |     rotXYZ[1, :, 2, 2] = cosy
 99 |     rotXYZ[2, :, 0, 0] = cosz
100 |     rotXYZ[2, :, 0, 1] = -sinz
101 |     rotXYZ[2, :, 1, 0] = sinz
102 |     rotXYZ[2, :, 1, 1] = cosz
103 |     rotation = np.matmul(np.matmul(rotXYZ[2], rotXYZ[1]), rotXYZ[0])
104 |     return rotation.transpose(0, 2, 1)
105 | 
def rigid_transform(vs, rot, trans):
    """Rotate vertices by right-multiplying with ``rot``, then translate them.

    ``trans`` is reshaped to ``(-1, 1, 3)`` so that it broadcasts over the
    vertex axis of ``vs @ rot``.
    """
    offset = trans.reshape(-1, 1, 3)
    return np.matmul(vs, rot) + offset
110 | 
def perspective_projection_points(points, image_w, image_h, used_focal_len):
    """Project batched 3D points to 2D using a pinhole camera.

    Args:
        points: Array of shape ``(batch, n, 3)``.
        image_w: Image width; the principal point x is ``image_w * 0.5``.
        image_h: Image height; the principal point y is ``image_h * 0.5``.
        used_focal_len: Focal length used for both axes.

    Returns:
        Array of shape ``(batch, n, 2)`` with the projected coordinates.
    """
    n_batch = points.shape[0]

    # One intrinsic matrix per batch element.
    intrinsics = np.zeros([n_batch, 3, 3])
    intrinsics[:, 0, 0] = used_focal_len
    intrinsics[:, 1, 1] = used_focal_len
    intrinsics[:, 0, 2] = image_w * 0.5
    intrinsics[:, 1, 2] = image_h * 0.5
    intrinsics[:, 2, 2] = 1.

    # Negate z before projecting.
    flip_z = np.repeat(np.diag([1, 1, -1])[np.newaxis, :, :], n_batch, axis=0)

    flipped = np.matmul(points, flip_z)
    homogeneous = np.matmul(flipped, np.transpose(intrinsics, (0, 2, 1)))

    # Perspective divide by the third (depth) component.
    return homogeneous[:, :, :2] / homogeneous[:, :, 2:]
129 | 
def get_project_points_rect(angle, trans, image_w, image_h, used_focal_len=4828.598903609935):
    """Project the four corners of a small square posed by ``angle``/``trans``.

    A square with corners at (+-1, +-1, 0) scaled by 0.05 is rotated, translated
    (with the z translation of the first sample also scaled by 0.05) and
    projected with the focal length scaled by 0.05. The y coordinate is flipped
    against ``image_h``.

    Returns:
        Array of shape ``(4, 2)``: projected corners of the first batch element.
    """
    corners = np.array(
        [[-1, -1, 0], [-1, 1, 0], [1, 1, 0], [1, -1, 0]],
    ) * 0.05
    corners = corners[np.newaxis, :, :]

    # Work on a copy so the caller's translation is not modified.
    shifted = trans.copy()
    shifted[0, 2] *= 0.05

    posed = rigid_transform(corners, compute_rotation_matrix(angle), shifted)
    projected = perspective_projection_points(posed, image_w, image_h, used_focal_len*0.05)

    xs = projected[:, :, 0]
    ys_flipped = image_h - projected[:, :, 1]
    return np.stack([xs, ys_flipped], axis=2)[0]
146 | 
147 | 


--------------------------------------------------------------------------------
/hellomeme/tools/hello_arkit.py:
--------------------------------------------------------------------------------
 1 | """
@File   : hello_arkit.py
 3 | @Author : Songkey
 4 | @Email  : songkey@pku.edu.cn
 5 | @Date   : 11/1/2024
 6 | @Desc   : Created by Shengjie Wu (wu.shengjie@immomo.com)
 7 | """
 8 | 
 9 | import numpy as np
10 | import cv2
11 | import os.path as osp
12 | from .utils import create_onnx_session, get_warp_mat_bbox_by_gt_pts_float
13 | 
class HelloARKitBSPred(object):
    """ONNX-backed predictor that runs ``hello_arkit_blendshape.onnx`` on an aligned face crop."""

    def __init__(self, gpu_id=0, modelscope=False):
        """Fetch the blendshape model and create the ONNX session.

        Args:
            gpu_id: Forwarded to ``create_onnx_session``.
            modelscope: Download from ModelScope when true, otherwise from the
                Hugging Face Hub.
        """
        if not modelscope:
            from huggingface_hub import hf_hub_download
            onnx_path = hf_hub_download('songkey/hello_group_facemodel', filename='hello_arkit_blendshape.onnx')
        else:
            from modelscope import snapshot_download
            onnx_path = osp.join(snapshot_download('songkey/hello_group_facemodel'), 'hello_arkit_blendshape.onnx')

        session = create_onnx_session(onnx_path, gpu_id=gpu_id)
        self.face_rig_net = session
        self.onnx_input_name = session.get_inputs()[0].name
        self.onnx_output_name = [out.name for out in session.get_outputs()]
        # Side length of the square crop and the margin ratio used when aligning.
        self.image_size = 224
        self.expand_ratio = 0.15

    def forward(self, src_image, src_pt):
        """Align the face by its in-plane rotation and run the network.

        Args:
            src_image: Source image to crop from.
            src_pt: Landmarks; entries 74 and 96 are treated as the left and
                right eye corners when estimating the in-plane rotation.

        Returns:
            The first row of the network's first output.
        """
        eye_l, eye_r = src_pt[74], src_pt[96]
        # The tiny epsilon keeps the x-difference from being exactly zero.
        roll_rad = np.arctan2(eye_r[1] - eye_l[1], eye_r[0] - eye_l[0] + 0.00000001)
        roll_deg = np.rad2deg(roll_rad)

        side = self.image_size
        warp_mat = get_warp_mat_bbox_by_gt_pts_float(
            src_pt, base_angle=roll_deg, dst_size=side, expand_ratio=self.expand_ratio)
        face_crop = cv2.warpAffine(src_image, warp_mat, (side, side))

        # HWC -> 1xCxHxW float32 in [0, 1].
        chw = face_crop.transpose((2, 0, 1)).astype(np.float32)
        net_input = chw[np.newaxis, :, :, :] / 255.0

        net_outputs = self.face_rig_net.run(self.onnx_output_name, {self.onnx_input_name: net_input})
        return net_outputs[0][0]
43 | 


--------------------------------------------------------------------------------
/hellomeme/tools/hello_face_alignment.py:
--------------------------------------------------------------------------------
  1 | """
@File   : hello_face_alignment.py
  3 | @Author : Songkey
  4 | @Email  : songkey@pku.edu.cn
  5 | @Date   : 11/1/2024
  6 | @Desc   : Created by Shengjie Wu (wu.shengjie@immomo.com)
  7 | """
  8 | 
  9 | import cv2
 10 | import os.path as osp
 11 | import numpy as np
 12 | from .hello_face_det import HelloFaceDet
 13 | from .utils import get_warp_mat_bbox, get_warp_mat_bbox_by_gt_pts_float, transform_points
 14 | from .utils import create_onnx_session
 15 | 
 16 | class HelloFaceAlignment(object):
 17 |     def __init__(self, gpu_id=None, modelscope=False):
 18 |         expand_ratio = 0.15
 19 | 
 20 |         if modelscope:
 21 |             from modelscope import snapshot_download
 22 |             alignment_model_path = osp.join(snapshot_download('songkey/hello_group_facemodel'), 'hello_face_landmark.onnx')
 23 |             det_model_path = osp.join(snapshot_download('songkey/hello_group_facemodel'), 'hello_face_det.onnx')
 24 |         else:
 25 |             from huggingface_hub import hf_hub_download
 26 |             alignment_model_path = hf_hub_download('songkey/hello_group_facemodel', filename='hello_face_landmark.onnx')
 27 |             det_model_path = hf_hub_download('songkey/hello_group_facemodel', filename='hello_face_det.onnx')
 28 |         self.face_alignment_net_222 = (
 29 |             create_onnx_session(alignment_model_path, gpu_id=gpu_id))
 30 |         self.onnx_input_name_222 = self.face_alignment_net_222.get_inputs()[0].name
 31 |         self.onnx_output_name_222 = [output.name for output in self.face_alignment_net_222.get_outputs()]
 32 |         self.face_image_size = 128
 33 | 
 34 |         self.face_detector = HelloFaceDet(det_model_path, gpu_id=gpu_id)
 35 |         self.expand_ratio = expand_ratio
 36 | 
 37 |     def onnx_infer(self, input_uint8):
 38 |         assert input_uint8.shape[0] == input_uint8.shape[1] == self.face_image_size
 39 |         onnx_input = input_uint8.transpose((2, 0, 1)).astype(np.float32)[np.newaxis, :, :, :] / 255.0
 40 |         landmark, euler, prob = self.face_alignment_net_222.run(self.onnx_output_name_222,
 41 |                                                                 {self.onnx_input_name_222: onnx_input})
 42 | 
 43 |         landmark = np.reshape(landmark[0], (2, -1)).transpose((1, 0)) * self.face_image_size
 44 |         left_eye_corner = landmark[74]
 45 |         right_eye_corner = landmark[96]
 46 |         radian = np.arctan2(right_eye_corner[1] - left_eye_corner[1],
 47 |                             right_eye_corner[0] - left_eye_corner[0] + 0.00000001)
 48 |         euler_rad = np.array([euler[0, 0], euler[0, 1], radian], dtype=np.float32)
 49 |         prob = prob[0]
 50 | 
 51 |         return landmark, euler_rad, prob
 52 | 
 53 |     def forward(self, src_image, face_box=None, pre_pts=None, iterations=3):
 54 |         if pre_pts is None:
 55 |             if face_box is None:
 56 |                 # Detect max size face
 57 |                 bounding_boxes, _, score = self.face_detector.detect(src_image)
 58 |                 print("facedet score", score)
 59 |                 if len(bounding_boxes) == 0:
 60 |                     return None
 61 |                 bbox = np.zeros(4, dtype=np.float32)
 62 |                 if len(bounding_boxes) >= 1:
 63 |                     max_area = 0.0
 64 |                     for each_bbox in bounding_boxes:
 65 |                         area = (each_bbox[2] - each_bbox[0]) * (each_bbox[3] - each_bbox[1])
 66 |                         if area > max_area:
 67 |                             bbox[:4] = each_bbox[:4]
 68 |                         max_area = area
 69 |                 else:
 70 |                     bbox = bounding_boxes[0, :4]
 71 |             else:
 72 |                 bbox = face_box.copy()
 73 |             M_Face = get_warp_mat_bbox(bbox, 0, self.face_image_size, expand_ratio=self.expand_ratio)
 74 |         else:
 75 |             left_eye_corner = pre_pts[74]
 76 |             right_eye_corner = pre_pts[96]
 77 | 
 78 |             radian = np.arctan2(right_eye_corner[1] - left_eye_corner[1],
 79 |                                 right_eye_corner[0] - left_eye_corner[0] + 0.00000001)
 80 |             M_Face = get_warp_mat_bbox_by_gt_pts_float(pre_pts, np.rad2deg(radian), self.face_image_size,
 81 |                                                        expand_ratio=self.expand_ratio)
 82 | 
 83 |         face_input = cv2.warpAffine(src_image, M_Face, (self.face_image_size, self.face_image_size))
 84 |         landmarks, euler, prob = self.onnx_infer(face_input)
 85 |         landmarks = transform_points(landmarks, M_Face, invert=True)
 86 | 
 87 |         # Repeat
 88 |         for i in range(iterations - 1):
 89 |             M_Face = get_warp_mat_bbox_by_gt_pts_float(landmarks, np.rad2deg(euler[2]), self.face_image_size,
 90 |                                                        expand_ratio=self.expand_ratio)
 91 |             face_input = cv2.warpAffine(src_image, M_Face, (self.face_image_size, self.face_image_size))
 92 |             landmarks, euler, prob = self.onnx_infer(face_input)
 93 |             landmarks = transform_points(landmarks, M_Face, invert=True)
 94 | 
 95 |         return_dict = {
 96 |             "pt222": landmarks,
 97 |             "euler_rad": euler,
 98 |             "prob": prob,
 99 |             "M_Face": M_Face,
100 |             "face_input": face_input
101 |         }
102 | 
103 |         return return_dict
104 | 


--------------------------------------------------------------------------------
/hellomeme/tools/hello_face_det.py:
--------------------------------------------------------------------------------
  1 | """
  2 | @File   : test.py
  3 | @Author : Songkey
  4 | @Email  : songkey@pku.edu.cn
  5 | @Date   : 11/1/2024
  6 | @Desc   : Created by Zemin An (an.zemin@hellogroup.com)
  7 | """
  8 | 
  9 | from abc import ABCMeta, abstractmethod
 10 | import cv2
 11 | import numpy as np
 12 | from scipy.special import softmax
 13 | import os.path as osp
 14 | from .utils import create_onnx_session
 15 | 
 16 | songkey_weights_dir = 'pretrained_models'
 17 | 
 18 | _COLORS = (
 19 |     np.array(
 20 |         [
 21 |             0.000,
 22 |             0.447,
 23 |             0.741,
 24 |         ]
 25 |     )
 26 |         .astype(np.float32)
 27 |         .reshape(-1, 3)
 28 | )
 29 | 
 30 | def get_resize_matrix(raw_shape, dst_shape, keep_ratio):
 31 |     """
 32 |     Get resize matrix for resizing raw img to input size
 33 |     :param raw_shape: (width, height) of raw image
 34 |     :param dst_shape: (width, height) of input image
 35 |     :param keep_ratio: whether keep original ratio
 36 |     :return: 3x3 Matrix
 37 |     """
 38 |     r_w, r_h = raw_shape
 39 |     d_w, d_h = dst_shape
 40 |     Rs = np.eye(3)
 41 |     if keep_ratio:
 42 |         C = np.eye(3)
 43 |         C[0, 2] = -r_w / 2
 44 |         C[1, 2] = -r_h / 2
 45 | 
 46 |         if r_w / r_h < d_w / d_h:
 47 |             ratio = d_h / r_h
 48 |         else:
 49 |             ratio = d_w / r_w
 50 |         Rs[0, 0] *= ratio
 51 |         Rs[1, 1] *= ratio
 52 | 
 53 |         T = np.eye(3)
 54 |         T[0, 2] = 0.5 * d_w
 55 |         T[1, 2] = 0.5 * d_h
 56 |         return T @ Rs @ C
 57 |     else:
 58 |         Rs[0, 0] *= d_w / r_w
 59 |         Rs[1, 1] *= d_h / r_h
 60 |         return Rs
 61 | 
 62 | def warp_boxes(boxes, M, width, height):
 63 |     """Apply transform to boxes
 64 |     Copy from nanodet/data/transform/warp.py
 65 |     """
 66 |     n = len(boxes)
 67 |     if n:
 68 |         # warp points
 69 |         xy = np.ones((n * 4, 3))
 70 |         xy[:, :2] = boxes[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(
 71 |             n * 4, 2
 72 |         )  # x1y1, x2y2, x1y2, x2y1
 73 |         xy = xy @ M.T  # transform
 74 |         xy = (xy[:, :2] / xy[:, 2:3]).reshape(n, 8)  # rescale
 75 |         # create new boxes
 76 |         x = xy[:, [0, 2, 4, 6]]
 77 |         y = xy[:, [1, 3, 5, 7]]
 78 |         xy = np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T
 79 |         # clip boxes
 80 |         xy[:, [0, 2]] = xy[:, [0, 2]].clip(0, width)
 81 |         xy[:, [1, 3]] = xy[:, [1, 3]].clip(0, height)
 82 |         return xy.astype(np.float32)
 83 |     else:
 84 |         return boxes
 85 | 
 86 | def overlay_bbox_cv(img, all_box, class_names):
 87 |     """Draw result boxes
 88 |     Copy from nanodet/util/visualization.py
 89 |     """
 90 |     # all_box array of [label, x0, y0, x1, y1, score]
 91 |     all_box.sort(key=lambda v: v[5])
 92 |     for box in all_box:
 93 |         label, x0, y0, x1, y1, score = box
 94 |         # color = self.cmap(i)[:3]
 95 |         color = (_COLORS[label] * 255).astype(np.uint8).tolist()
 96 |         text = "{}:{:.1f}%".format(class_names[label], score * 100)
 97 |         txt_color = (0, 0, 0) if np.mean(_COLORS[label]) > 0.5 else (255, 255, 255)
 98 |         font = cv2.FONT_HERSHEY_SIMPLEX
 99 |         txt_size = cv2.getTextSize(text, font, 0.5, 2)[0]
100 |         cv2.rectangle(img, (x0, y0), (x1, y1), color, 2)
101 | 
102 |         cv2.rectangle(
103 |             img,
104 |             (x0, y0 - txt_size[1] - 1),
105 |             (x0 + txt_size[0] + txt_size[1], y0 - 1),
106 |             color,
107 |             -1,
108 |         )
109 |         cv2.putText(img, text, (x0, y0 - 1), font, 0.5, txt_color, thickness=1)
110 |     return img
111 | 
def hard_nms(box_scores, iou_threshold, top_k=-1, candidate_size=200):
    """Greedy non-maximum suppression.

    Args:
        box_scores (N, 5): corner-form boxes with the score in the last column.
        iou_threshold: candidates whose IoU with a kept box exceeds this
            value are discarded.
        top_k: stop after keeping this many boxes; values <= 0 keep all.
        candidate_size: only the highest-scoring ``candidate_size`` rows are
            considered.
    Returns:
        The kept rows of ``box_scores``, highest score first.
    """
    scores = box_scores[:, -1]
    boxes = box_scores[:, :-1]

    # Best-first ordering of the top candidates.
    order = np.argsort(scores)[-candidate_size:][::-1]
    keep = []
    while len(order) > 0:
        best = order[0]
        keep.append(best)
        reached_top_k = top_k > 0 and len(keep) == top_k
        if reached_top_k or len(order) == 1:
            break
        best_box = boxes[best, :]
        remaining = order[1:]
        overlap = iou_of(
            boxes[remaining, :],
            np.expand_dims(best_box, axis=0),
        )
        # Drop everything that overlaps the kept box too much.
        order = remaining[overlap <= iou_threshold]

    return box_scores[keep, :]
147 | 
148 | 
def iou_of(boxes0, boxes1, eps=1e-5):
    """Intersection-over-union (Jaccard index) of corner-form boxes.

    Args:
        boxes0 (N, 4): first set of boxes.
        boxes1 (N or 1, 4): second set of boxes, broadcast against ``boxes0``.
        eps: small constant added to the denominator to avoid division by 0.
    Returns:
        iou (N): IoU values.
    """
    inter_top_left = np.maximum(boxes0[..., :2], boxes1[..., :2])
    inter_bottom_right = np.minimum(boxes0[..., 2:], boxes1[..., 2:])
    intersection = area_of(inter_top_left, inter_bottom_right)

    first_area = area_of(boxes0[..., :2], boxes0[..., 2:])
    second_area = area_of(boxes1[..., :2], boxes1[..., 2:])
    union = first_area + second_area - intersection
    return intersection / (union + eps)
166 | 
167 | 
def area_of(left_top, right_bottom):
    """Areas of rectangles described by two opposite corners.

    Args:
        left_top (N, 2): left top corner.
        right_bottom (N, 2): right bottom corner.

    Returns:
        area (N): rectangle areas; a negative extent on either axis counts
        as zero, so inverted rectangles have zero area.
    """
    extent = np.maximum(right_bottom - left_top, 0.0)
    first_side = extent[..., 0]
    second_side = extent[..., 1]
    return first_side * second_side
180 | 
181 | 
class NanoDetABC(metaclass=ABCMeta):
    """Backend-independent pre- and post-processing for a NanoDet-style detector.

    Subclasses provide :meth:`infer_image`, which runs the network on the
    pre-processed tensor; :meth:`detect` ties the three stages together.
    """

    def __init__(
            self,
            input_shape=None,
            reg_max=7,
            strides=None,
            prob_threshold=0.4,
            iou_threshold=0.3,
            num_candidate=1000,
            top_k=-1,
            class_names=None
    ):
        """Store the detector configuration.

        Args:
            input_shape: ``[height, width]`` of the network input;
                defaults to ``[272, 160]``.
            reg_max: largest bin index of the per-side distance distribution
                (each side has ``reg_max + 1`` bins).
            strides: feature-map strides, one per output level; defaults to
                ``[8, 16, 32]``.
            prob_threshold: minimum class score for a box to be kept.
            iou_threshold: IoU threshold passed to ``hard_nms``.
            num_candidate: per-level cap on boxes kept before NMS.
            top_k: ``top_k`` passed to ``hard_nms``.
            class_names: list of class names; defaults to ``["face"]``.
        """
        # ``None`` sentinels give every instance its own default lists
        # instead of sharing one mutable default object across instances.
        self.strides = [8, 16, 32] if strides is None else strides
        self.input_shape = [272, 160] if input_shape is None else input_shape
        self.reg_max = reg_max
        self.prob_threshold = prob_threshold
        self.iou_threshold = iou_threshold
        self.num_candidate = num_candidate
        self.top_k = top_k
        self.img_mean = [103.53, 116.28, 123.675]
        self.img_std = [57.375, 57.12, 58.395]
        # (width, height) ordering, as used for cv2 ``dsize``.
        self.input_size = (self.input_shape[1], self.input_shape[0])
        self.class_names = ["face"] if class_names is None else class_names
        self.num_classes = len(self.class_names)

    def preprocess(self, img):
        """Resize and normalise ``img`` into a batched channel-first tensor.

        Returns:
            ``(img_input, ResizeM)``: the ``(1, C, H, W)`` network input and
            the 3x3 matrix that maps raw-image to input coordinates.
        """
        # Aspect-preserving resize into the network input size.
        ResizeM = get_resize_matrix((img.shape[1], img.shape[0]), self.input_size, True)
        img_resize = cv2.warpPerspective(img, ResizeM, dsize=self.input_size)

        # Normalise with the per-channel mean/std (both expressed in 0..255).
        img_input = img_resize.astype(np.float32) / 255
        img_mean = np.array(self.img_mean, dtype=np.float32).reshape(1, 1, 3) / 255
        img_std = np.array(self.img_std, dtype=np.float32).reshape(1, 1, 3) / 255
        img_input = (img_input - img_mean) / img_std

        # HWC -> CHW, then add the batch axis.
        img_input = np.transpose(img_input, [2, 0, 1])
        img_input = np.expand_dims(img_input, axis=0)
        return img_input, ResizeM

    def postprocess(self, scores, raw_boxes, ResizeM, raw_shape):
        """Decode network outputs into boxes in raw-image coordinates.

        Args:
            scores: per-level score arrays of shape ``(cells, num_classes)``.
            raw_boxes: per-level distance distributions, reshaped here to
                rows of ``reg_max + 1`` bins.
            ResizeM: matrix returned by :meth:`preprocess`; its inverse maps
                boxes back to the raw image.
            raw_shape: ``img.shape`` of the raw image (height first).

        Returns:
            ``(boxes, labels, scores)``: int32 boxes of shape ``(n, 4)``,
            class indices and scores; three empty arrays when nothing passes
            the score threshold.
        """
        decode_boxes = []
        select_scores = []
        reg_range = np.arange(self.reg_max + 1)
        for stride, box_distribute, score in zip(self.strides, raw_boxes, scores):
            # Grid of cell origins for this level. Rounding up matches what
            # np.arange yields for a fractional size such as 272 / 32.
            fm_h = int(np.ceil(self.input_shape[0] / stride))
            fm_w = int(np.ceil(self.input_shape[1] / stride))
            ww, hh = np.meshgrid(np.arange(fm_w), np.arange(fm_h))

            ct_row = hh.flatten() * stride
            ct_col = ww.flatten() * stride
            center = np.stack((ct_col, ct_row, ct_col, ct_row), axis=1)

            # Expected value of each side's bin distribution, in pixels.
            box_distance = box_distribute.reshape((-1, self.reg_max + 1))
            box_distance = softmax(box_distance, axis=1)
            box_distance = box_distance * np.expand_dims(reg_range, axis=0)
            box_distance = np.sum(box_distance, axis=1).reshape((-1, 4))
            box_distance = box_distance * stride

            # Keep the best-scoring cells of this level.
            topk_idx = np.argsort(score.max(axis=1))[::-1]
            topk_idx = topk_idx[: self.num_candidate]
            center = center[topk_idx]
            score = score[topk_idx]
            box_distance = box_distance[topk_idx]

            # Distances are (left, top, right, bottom) offsets from the cell.
            decode_box = center + np.array([-1, -1, 1, 1]) * box_distance

            select_scores.append(score)
            decode_boxes.append(decode_box)

        # Per-class thresholding followed by NMS.
        bboxes = np.concatenate(decode_boxes, axis=0)
        confidences = np.concatenate(select_scores, axis=0)
        picked_box_probs = []
        picked_labels = []
        for class_index in range(0, confidences.shape[1]):
            probs = confidences[:, class_index]
            mask = probs > self.prob_threshold
            probs = probs[mask]
            if probs.shape[0] == 0:
                continue
            subset_boxes = bboxes[mask, :]
            box_probs = np.concatenate([subset_boxes, probs.reshape(-1, 1)], axis=1)
            box_probs = hard_nms(
                box_probs,
                iou_threshold=self.iou_threshold,
                top_k=self.top_k,
            )
            picked_box_probs.append(box_probs)
            picked_labels.extend([class_index] * box_probs.shape[0])
        if not picked_box_probs:
            return np.array([]), np.array([]), np.array([])
        picked_box_probs = np.concatenate(picked_box_probs)

        # Map boxes from network-input coordinates back to the raw image.
        picked_box_probs[:, :4] = warp_boxes(
            picked_box_probs[:, :4], np.linalg.inv(ResizeM), raw_shape[1], raw_shape[0]
        )
        return (
            picked_box_probs[:, :4].astype(np.int32),
            np.array(picked_labels),
            picked_box_probs[:, 4],
        )

    @abstractmethod
    def infer_image(self, img_input):
        """Run the network; must return ``(scores, raw_boxes)`` per level."""
        pass

    def detect(self, img):
        """Detect objects in ``img``; returns ``(bbox, label, score)``."""
        raw_shape = img.shape
        img_input, ResizeM = self.preprocess(img)
        scores, raw_boxes = self.infer_image(img_input)
        if scores[0].ndim == 1:  # handling num_classes=1 case
            scores = [x[:, None] for x in scores]
        bbox, label, score = self.postprocess(scores, raw_boxes, ResizeM, raw_shape)

        return bbox, label, score
309 | 
class HelloFaceDet(NanoDetABC):
    """NanoDetABC implementation that runs the network through an ONNX session."""

    def __init__(self, model_path=osp.join(songkey_weights_dir, 'face/nanodet_humandet_320-192_220302_model_20220315_test3.onnx'), gpu_id=None, *args, **kwargs):
        """Create the ONNX session for ``model_path``.

        Extra positional and keyword arguments are forwarded to
        ``NanoDetABC.__init__``; ``gpu_id`` is forwarded to
        ``create_onnx_session``.
        """
        super().__init__(*args, **kwargs)

        self.model_path = model_path
        session = create_onnx_session(model_path, gpu_id=gpu_id)
        self.ort_session = session
        # The session is fed through its first declared input.
        self.input_name = session.get_inputs()[0].name

    def infer_image(self, img_input):
        """Run the session and split its outputs into scores and raw boxes.

        The first three outputs are returned as scores and all remaining
        outputs as raw boxes; size-1 axes are squeezed from every output.
        """
        outputs = self.ort_session.run(None, {self.input_name: img_input})
        squeezed = [np.squeeze(out) for out in outputs]
        return squeezed[:3], squeezed[3:]
327 | 


--------------------------------------------------------------------------------
/hellomeme/tools/pdf.py:
--------------------------------------------------------------------------------
  1 | # coding: utf-8
  2 | 
  3 | """
  4 | @File   : pdf.py
  5 | @Author : Songkey
  6 | @Email  : songkey@pku.edu.cn
  7 | @Date   : 11/7/2024
  8 | @Desc   : Adapted from: https://github.com/Dorniwang/PD-FGC-inference/blob/main/lib/models/networks/encoder.py
  9 | """
 10 | 
 11 | import torch
 12 | import torch.nn as nn
 13 | import torch.nn.functional as F
 14 | 
 15 | from diffusers.models.modeling_utils import ModelMixin
 16 | from diffusers.configuration_utils import ConfigMixin, register_to_config
 17 | 
 18 | def conv3x3(in_planes, out_planes, strd=1, padding=1, bias=False):
 19 |     "3x3 convolution with padding"
 20 |     return nn.Conv2d(in_planes, out_planes, kernel_size=3,
 21 |                      stride=strd, padding=padding, bias=bias)
 22 | 
 23 | class ConvBlock(nn.Module):
 24 |     def __init__(self, in_planes, out_planes):
 25 |         super(ConvBlock, self).__init__()
 26 |         self.bn1 = nn.BatchNorm2d(in_planes)
 27 |         self.conv1 = conv3x3(in_planes, int(out_planes / 2))
 28 |         self.bn2 = nn.BatchNorm2d(int(out_planes / 2))
 29 |         self.conv2 = conv3x3(int(out_planes / 2), int(out_planes / 4))
 30 |         self.bn3 = nn.BatchNorm2d(int(out_planes / 4))
 31 |         self.conv3 = conv3x3(int(out_planes / 4), int(out_planes / 4))
 32 | 
 33 |         if in_planes != out_planes:
 34 |             self.downsample = nn.Sequential(
 35 |                 nn.BatchNorm2d(in_planes),
 36 |                 nn.ReLU(True),
 37 |                 nn.Conv2d(in_planes, out_planes,
 38 |                           kernel_size=1, stride=1, bias=False),
 39 |             )
 40 |         else:
 41 |             self.downsample = None
 42 | 
 43 |     def forward(self, x):
 44 |         residual = x
 45 | 
 46 |         out1 = self.bn1(x)
 47 |         out1 = F.relu(out1, True)
 48 |         out1 = self.conv1(out1)
 49 | 
 50 |         out2 = self.bn2(out1)
 51 |         out2 = F.relu(out2, True)
 52 |         out2 = self.conv2(out2)
 53 | 
 54 |         out3 = self.bn3(out2)
 55 |         out3 = F.relu(out3, True)
 56 |         out3 = self.conv3(out3)
 57 | 
 58 |         out3 = torch.cat((out1, out2, out3), 1)
 59 | 
 60 |         if self.downsample is not None:
 61 |             residual = self.downsample(residual)
 62 | 
 63 |         out3 += residual
 64 | 
 65 |         return out3
 66 | 
 67 | 
 68 | class HourGlass(nn.Module):
 69 |     def __init__(self, num_modules, depth, num_features):
 70 |         super(HourGlass, self).__init__()
 71 |         self.num_modules = num_modules
 72 |         self.depth = depth
 73 |         self.features = num_features
 74 |         self.dropout = nn.Dropout(0.5)
 75 | 
 76 |         self._generate_network(self.depth)
 77 | 
 78 |     def _generate_network(self, level):
 79 |         self.add_module('b1_' + str(level), ConvBlock(256, 256))
 80 | 
 81 |         self.add_module('b2_' + str(level), ConvBlock(256, 256))
 82 | 
 83 |         if level > 1:
 84 |             self._generate_network(level - 1)
 85 |         else:
 86 |             self.add_module('b2_plus_' + str(level), ConvBlock(256, 256))
 87 | 
 88 |         self.add_module('b3_' + str(level), ConvBlock(256, 256))
 89 | 
 90 |     def _forward(self, level, inp):
 91 |         # Upper branch
 92 |         up1 = inp
 93 |         up1 = self._modules['b1_' + str(level)](up1)
 94 |         up1 = self.dropout(up1)
 95 |         # Lower branch
 96 |         low1 = F.max_pool2d(inp, 2, stride=2)
 97 |         low1 = self._modules['b2_' + str(level)](low1)
 98 | 
 99 |         if level > 1:
100 |             low2 = self._forward(level - 1, low1)
101 |         else:
102 |             low2 = low1
103 |             low2 = self._modules['b2_plus_' + str(level)](low2)
104 | 
105 |         low3 = low2
106 |         low3 = self._modules['b3_' + str(level)](low3)
107 |         up1size = up1.size()
108 |         rescale_size = (up1size[2], up1size[3])
109 |         up2 = F.interpolate(low3, size=rescale_size, mode='bilinear')
110 | 
111 |         return up1 + up2
112 | 
113 |     def forward(self, x):
114 |         return self._forward(self.depth, x)
115 | 
class FAN_use(nn.Module):
    """Single-stack FAN backbone reduced to a 512-d feature vector.

    A convolutional stem feeds one ``HourGlass``; the resulting 68-channel
    map is squeezed to one channel by a stride-2 convolution, flattened and
    projected by ``fc``. ``fc`` expects 28 * 28 inputs, which fixes the
    supported input resolution (presumably 224x224 -- TODO confirm).
    """

    def __init__(self):
        super().__init__()
        self.num_modules = 1

        # Stem
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3)
        self.bn1 = nn.BatchNorm2d(64)
        self.conv2 = ConvBlock(64, 128)
        self.conv3 = ConvBlock(128, 128)
        self.conv4 = ConvBlock(128, 256)

        # Hourglass stack (a single stage, index 0). Module names carry the
        # stage index as a suffix.
        stage = 0
        suffix = str(stage)
        self.add_module('m' + suffix, HourGlass(1, 4, 256))
        self.add_module('top_m_' + suffix, ConvBlock(256, 256))
        self.add_module('conv_last' + suffix,
                        nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0))
        self.add_module('l' + suffix,
                        nn.Conv2d(256, 68, kernel_size=1, stride=1, padding=0))
        self.add_module('bn_end' + suffix, nn.BatchNorm2d(256))

        # Inter-stage connections; only created when another stage follows.
        if stage < self.num_modules - 1:
            self.add_module('bl' + suffix,
                            nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0))
            self.add_module('al' + suffix,
                            nn.Conv2d(68, 256, kernel_size=1, stride=1, padding=0))

        # Head. Note that ``avgpool`` is a max-pool and is not used in forward.
        self.avgpool = nn.MaxPool2d((2, 2), 2)
        self.conv6 = nn.Conv2d(68, 1, 3, 2, 1)
        self.fc = nn.Linear(28 * 28, 512)
        self.bn5 = nn.BatchNorm2d(68)
        self.relu = nn.ReLU(True)

    def forward(self, x):
        # Stem
        x = F.relu(self.bn1(self.conv1(x)), True)
        x = F.max_pool2d(self.conv2(x), 2)
        x = self.conv4(self.conv3(x))

        # Hourglass stage 0
        suffix = str(0)
        feat = self._modules['m' + suffix](x)
        feat = self._modules['top_m_' + suffix](feat)
        feat = self._modules['conv_last' + suffix](feat)
        feat = self._modules['bn_end' + suffix](feat)
        out68 = self._modules['l' + suffix](F.relu(feat))

        # Head: 68 channels -> 1 channel -> flat vector -> 512 features.
        net = self.relu(self.bn5(out68))
        net = self.conv6(net)
        net = net.view(-1, net.shape[-2] * net.shape[-1])
        net = self.relu(net)
        return self.fc(net)
173 | 
class FanEncoder(ModelMixin, ConfigMixin):
    """FAN backbone followed by mouth, eye and emotion heads.

    ``forward`` returns the eye embedding (``eye_dim``), the emotion
    embedding (30) and the 512-d mouth feature concatenated along dim 1.
    ``mouth_embed`` is constructed but not applied in ``forward``; the
    head-pose branch is disabled.
    """

    @register_to_config
    def __init__(self, pose_dim=6, eye_dim=6):
        super().__init__()

        def make_branch():
            # Shared layout of the per-attribute feature branches.
            return nn.Sequential(nn.Linear(512, 512), nn.ReLU(), nn.BatchNorm1d(512), nn.Linear(512, 512))

        self.model = FAN_use()

        self.to_mouth = make_branch()
        self.mouth_embed = nn.Sequential(nn.ReLU(), nn.Linear(512, 512 - pose_dim - eye_dim))

        # Head-pose branch intentionally left out (to_headpose / headpose_embed).

        self.to_eye = make_branch()
        self.eye_embed = nn.Sequential(nn.ReLU(), nn.Linear(512, eye_dim))

        self.to_emo = make_branch()
        self.emo_embed = nn.Sequential(nn.ReLU(), nn.Linear(512, 30))

    def forward_feature(self, x):
        """Return the raw backbone feature for ``x``."""
        return self.model(x)

    def forward(self, x):
        """Return ``[eye_embed, emo_embed, mouth_feat]`` concatenated on dim 1."""
        feature = self.model(x)
        mouth_feat = self.to_mouth(feature)
        eye_embed = self.eye_embed(self.to_eye(feature))
        emo_embed = self.emo_embed(self.to_emo(feature))

        return torch.cat([eye_embed, emo_embed, mouth_feat], dim=1)
209 | 


--------------------------------------------------------------------------------
/hellomeme/tools/sr.py:
--------------------------------------------------------------------------------
  1 | # coding: utf-8
  2 | 
  3 | """
  4 | @File   : sr.py
  5 | @Author : Songkey
  6 | @Email  : songkey@pku.edu.cn
  7 | @Date   : 5/30/2025
  8 | @Desc   : adapted from: https://github.com/xinntao/Real-ESRGAN
  9 | """
 10 | 
 11 | import torch
 12 | from torch import nn as nn
 13 | from torch.nn import functional as F
 14 | import cv2
 15 | import numpy as np
 16 | import math
 17 | import os.path as osp
 18 | 
 19 | def pixel_unshuffle(x, scale):
 20 |     """ Pixel unshuffle.
 21 | 
 22 |     Args:
 23 |         x (Tensor): Input feature with shape (b, c, hh, hw).
 24 |         scale (int): Downsample ratio.
 25 | 
 26 |     Returns:
 27 |         Tensor: the pixel unshuffled feature.
 28 |     """
 29 |     b, c, hh, hw = x.size()
 30 |     out_channel = c * (scale**2)
 31 |     assert hh % scale == 0 and hw % scale == 0
 32 |     h = hh // scale
 33 |     w = hw // scale
 34 |     x_view = x.view(b, c, h, scale, w, scale)
 35 |     return x_view.permute(0, 1, 3, 5, 2, 4).reshape(b, out_channel, h, w)
 36 | 
 37 | def make_layer(basic_block, num_basic_block, **kwarg):
 38 |     """Make layers by stacking the same blocks.
 39 | 
 40 |     Args:
 41 |         basic_block (nn.module): nn.module class for basic block.
 42 |         num_basic_block (int): number of blocks.
 43 | 
 44 |     Returns:
 45 |         nn.Sequential: Stacked blocks in nn.Sequential.
 46 |     """
 47 |     layers = []
 48 |     for _ in range(num_basic_block):
 49 |         layers.append(basic_block(**kwarg))
 50 |     return nn.Sequential(*layers)
 51 | 
 52 | class ResidualDenseBlock(nn.Module):
 53 |     """Residual Dense Block.
 54 | 
 55 |     Used in RRDB block in ESRGAN.
 56 | 
 57 |     Args:
 58 |         num_feat (int): Channel number of intermediate features.
 59 |         num_grow_ch (int): Channels for each growth.
 60 |     """
 61 | 
 62 |     def __init__(self, num_feat=64, num_grow_ch=32):
 63 |         super(ResidualDenseBlock, self).__init__()
 64 |         self.conv1 = nn.Conv2d(num_feat, num_grow_ch, 3, 1, 1)
 65 |         self.conv2 = nn.Conv2d(num_feat + num_grow_ch, num_grow_ch, 3, 1, 1)
 66 |         self.conv3 = nn.Conv2d(num_feat + 2 * num_grow_ch, num_grow_ch, 3, 1, 1)
 67 |         self.conv4 = nn.Conv2d(num_feat + 3 * num_grow_ch, num_grow_ch, 3, 1, 1)
 68 |         self.conv5 = nn.Conv2d(num_feat + 4 * num_grow_ch, num_feat, 3, 1, 1)
 69 | 
 70 |         self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)
 71 | 
 72 |         # initialization
 73 |         # default_init_weights([self.conv1, self.conv2, self.conv3, self.conv4, self.conv5], 0.1)
 74 | 
 75 |     def forward(self, x):
 76 |         x1 = self.lrelu(self.conv1(x))
 77 |         x2 = self.lrelu(self.conv2(torch.cat((x, x1), 1)))
 78 |         x3 = self.lrelu(self.conv3(torch.cat((x, x1, x2), 1)))
 79 |         x4 = self.lrelu(self.conv4(torch.cat((x, x1, x2, x3), 1)))
 80 |         x5 = self.conv5(torch.cat((x, x1, x2, x3, x4), 1))
 81 |         # Empirically, we use 0.2 to scale the residual for better performance
 82 |         return x5 * 0.2 + x
 83 | 
 84 | 
 85 | class RRDB(nn.Module):
 86 |     """Residual in Residual Dense Block.
 87 | 
 88 |     Used in RRDB-Net in ESRGAN.
 89 | 
 90 |     Args:
 91 |         num_feat (int): Channel number of intermediate features.
 92 |         num_grow_ch (int): Channels for each growth.
 93 |     """
 94 | 
 95 |     def __init__(self, num_feat, num_grow_ch=32):
 96 |         super(RRDB, self).__init__()
 97 |         self.rdb1 = ResidualDenseBlock(num_feat, num_grow_ch)
 98 |         self.rdb2 = ResidualDenseBlock(num_feat, num_grow_ch)
 99 |         self.rdb3 = ResidualDenseBlock(num_feat, num_grow_ch)
100 | 
101 |     def forward(self, x):
102 |         out = self.rdb1(x)
103 |         out = self.rdb2(out)
104 |         out = self.rdb3(out)
105 |         # Empirically, we use 0.2 to scale the residual for better performance
106 |         return out * 0.2 + x
107 | 
class RRDBNet(nn.Module):
    """Network of Residual in Residual Dense Blocks, as used in ESRGAN.

    ESRGAN: Enhanced Super-Resolution Generative Adversarial Networks.

    The trunk always upsamples by 4 (two nearest-neighbour x2 steps). To
    support scale x2 and scale x1, the input is first pixel-unshuffled (the
    inverse of pixel-shuffle) by 2 or 4 respectively, which reduces the
    spatial size and enlarges the channel count before the main network.

    Args:
        num_in_ch (int): Channel number of inputs.
        num_out_ch (int): Channel number of outputs.
        scale (int): Overall upscaling factor; 2 and 1 enable the
            pixel-unshuffle front end, any other value feeds the input
            directly. Default: 4.
        num_feat (int): Channel number of intermediate features.
            Default: 64
        num_block (int): Block number in the trunk network. Defaults: 23
        num_grow_ch (int): Channels for each growth. Default: 32.
    """

    def __init__(self, num_in_ch, num_out_ch, scale=4, num_feat=64, num_block=23, num_grow_ch=32):
        super().__init__()
        self.scale = scale
        # Pixel-unshuffle multiplies the channel count by factor ** 2.
        channel_multiplier = {2: 4, 1: 16}.get(scale, 1)
        first_in_ch = num_in_ch * channel_multiplier

        self.conv_first = nn.Conv2d(first_in_ch, num_feat, 3, 1, 1)
        self.body = make_layer(RRDB, num_block, num_feat=num_feat, num_grow_ch=num_grow_ch)
        self.conv_body = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
        # upsample
        self.conv_up1 = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
        self.conv_up2 = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
        self.conv_hr = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
        self.conv_last = nn.Conv2d(num_feat, num_out_ch, 3, 1, 1)

        self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)

    def forward(self, x):
        unshuffle_factor = {2: 2, 1: 4}.get(self.scale)
        if unshuffle_factor is None:
            feat = x
        else:
            feat = pixel_unshuffle(x, scale=unshuffle_factor)

        feat = self.conv_first(feat)
        trunk = self.conv_body(self.body(feat))
        feat = feat + trunk

        # Two nearest-neighbour x2 upsampling stages.
        for up_conv in (self.conv_up1, self.conv_up2):
            feat = self.lrelu(up_conv(F.interpolate(feat, scale_factor=2, mode='nearest')))

        return self.conv_last(self.lrelu(self.conv_hr(feat)))
161 | 
class RealESRGANer():
    """Image upsampler built around the ``RealESRGAN_x2plus`` RRDBNet weights.

    The weights are downloaded from the ``songkey/ESRGAN`` repository (through
    ModelScope or the Hugging Face hub) and loaded into an ``RRDBNet`` that is
    always constructed with ``scale=2``.

    Args:
        scale: Upsampling factor used for padding and output-size bookkeeping.
            NOTE(review): the network itself is hard-wired to ``scale=2``, so
            other values presumably only make sense together with ``outscale``
            in :meth:`enhance` -- confirm with callers.
        tile: Tile edge length in pixels; ``0`` disables tiling.
        tile_pad: Extra context (pixels) taken around every tile.
        pre_pad: Reflect padding added to the right/bottom before inference.
        half: Run the model and the inputs in fp16.
        device: Explicit torch device; overrides ``gpu_id`` when given.
        gpu_id: CUDA device index used when ``device`` is ``None``.
        modelscope: Download the weights from ModelScope instead of Hugging Face.
    """

    def __init__(self,
                 scale,
                 tile=0,
                 tile_pad=10,
                 pre_pad=10,
                 half=True,
                 device=None,
                 gpu_id=None,
                 modelscope=False):
        self.scale = scale
        self.tile_size = tile
        self.tile_pad = tile_pad
        self.pre_pad = pre_pad
        self.mod_scale = None
        self.half = half

        # initialize model
        if device is not None:
            self.device = device
        elif not torch.cuda.is_available():
            self.device = torch.device('cpu')
        elif gpu_id is not None:
            # `is not None` (rather than truthiness) so that gpu_id=0 is
            # honoured as an explicit device index.
            self.device = torch.device(f'cuda:{gpu_id}')
        else:
            self.device = torch.device('cuda')

        if modelscope:
            from modelscope import snapshot_download
            model_path = osp.join(snapshot_download('songkey/ESRGAN'), 'RealESRGAN_x2plus.pth')
        else:
            from huggingface_hub import hf_hub_download
            model_path = hf_hub_download('songkey/ESRGAN', filename='RealESRGAN_x2plus.pth')

        loadnet = torch.load(model_path, map_location=torch.device('cpu'))

        # prefer to use params_ema
        if 'params_ema' in loadnet:
            keyname = 'params_ema'
        else:
            keyname = 'params'
        self.model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=2)
        self.model.load_state_dict(loadnet[keyname], strict=True)

        self.model.eval()
        self.model = self.model.to(self.device)
        if self.half:
            self.model = self.model.half()

    def dni(self, net_a, net_b, dni_weight, key='params', loc='cpu'):
        """Deep network interpolation.

        ``Paper: Deep Network Interpolation for Continuous Imagery Effect Transition``

        Loads the two checkpoints at paths ``net_a`` and ``net_b`` and blends
        every entry of ``[key]`` as
        ``dni_weight[0] * a + dni_weight[1] * b``. Returns the (modified)
        checkpoint loaded from ``net_a``.
        """
        net_a = torch.load(net_a, map_location=torch.device(loc))
        net_b = torch.load(net_b, map_location=torch.device(loc))
        for k, v_a in net_a[key].items():
            net_a[key][k] = dni_weight[0] * v_a + dni_weight[1] * net_b[key][k]
        return net_a

    def pre_process(self, img):
        """Pre-process, such as pre-pad and mod pad, so that the images can be divisible

        ``img`` is an HWC numpy array; it is stored on ``self.img`` as a
        1xCxHxW tensor on ``self.device`` (fp16 when ``self.half``).
        """
        img = torch.from_numpy(np.transpose(img, (2, 0, 1))).float()
        self.img = img.unsqueeze(0).to(self.device)
        if self.half:
            self.img = self.img.half()

        # pre_pad
        if self.pre_pad != 0:
            self.img = F.pad(self.img, (0, self.pre_pad, 0, self.pre_pad), 'reflect')
        # mod pad for divisible borders
        if self.scale == 2:
            self.mod_scale = 2
        elif self.scale == 1:
            self.mod_scale = 4
        if self.mod_scale is not None:
            self.mod_pad_h, self.mod_pad_w = 0, 0
            _, _, h, w = self.img.size()
            if (h % self.mod_scale != 0):
                self.mod_pad_h = (self.mod_scale - h % self.mod_scale)
            if (w % self.mod_scale != 0):
                self.mod_pad_w = (self.mod_scale - w % self.mod_scale)
            self.img = F.pad(self.img, (0, self.mod_pad_w, 0, self.mod_pad_h), 'reflect')

    def process(self):
        """Run the model on the whole pre-processed image in one pass."""
        # model inference
        self.output = self.model(self.img)

    def tile_process(self):
        """It will first crop input images to tiles, and then process each tile.
        Finally, all the processed tiles are merged into one images.

        Modified from: https://github.com/ata4/esrgan-launcher

        Raises:
            RuntimeError: re-raised from the model call (e.g. out of memory).
        """
        batch, channel, height, width = self.img.shape
        output_height = height * self.scale
        output_width = width * self.scale
        output_shape = (batch, channel, output_height, output_width)

        # start with black image
        self.output = self.img.new_zeros(output_shape)
        tiles_x = math.ceil(width / self.tile_size)
        tiles_y = math.ceil(height / self.tile_size)

        # loop over all tiles
        for y in range(tiles_y):
            for x in range(tiles_x):
                # extract tile from input image
                ofs_x = x * self.tile_size
                ofs_y = y * self.tile_size
                # input tile area on total image
                input_start_x = ofs_x
                input_end_x = min(ofs_x + self.tile_size, width)
                input_start_y = ofs_y
                input_end_y = min(ofs_y + self.tile_size, height)

                # input tile area on total image with padding
                input_start_x_pad = max(input_start_x - self.tile_pad, 0)
                input_end_x_pad = min(input_end_x + self.tile_pad, width)
                input_start_y_pad = max(input_start_y - self.tile_pad, 0)
                input_end_y_pad = min(input_end_y + self.tile_pad, height)

                # input tile dimensions
                input_tile_width = input_end_x - input_start_x
                input_tile_height = input_end_y - input_start_y
                tile_idx = y * tiles_x + x + 1
                input_tile = self.img[:, :, input_start_y_pad:input_end_y_pad, input_start_x_pad:input_end_x_pad]

                # upscale tile
                try:
                    with torch.no_grad():
                        output_tile = self.model(input_tile)
                except RuntimeError as error:
                    print('Error', error)
                    # Do not continue: `output_tile` would be undefined (first
                    # tile) or stale (later tiles) and corrupt the result.
                    raise
                print(f'\tTile {tile_idx}/{tiles_x * tiles_y}')

                # output tile area on total image
                output_start_x = input_start_x * self.scale
                output_end_x = input_end_x * self.scale
                output_start_y = input_start_y * self.scale
                output_end_y = input_end_y * self.scale

                # output tile area without padding
                output_start_x_tile = (input_start_x - input_start_x_pad) * self.scale
                output_end_x_tile = output_start_x_tile + input_tile_width * self.scale
                output_start_y_tile = (input_start_y - input_start_y_pad) * self.scale
                output_end_y_tile = output_start_y_tile + input_tile_height * self.scale

                # put tile into output image
                self.output[:, :, output_start_y:output_end_y,
                            output_start_x:output_end_x] = output_tile[:, :, output_start_y_tile:output_end_y_tile,
                                                                       output_start_x_tile:output_end_x_tile]

    def post_process(self):
        """Crop away the (upscaled) mod-pad and pre-pad borders and return ``self.output``."""
        # remove extra pad
        if self.mod_scale is not None:
            _, _, h, w = self.output.size()
            self.output = self.output[:, :, 0:h - self.mod_pad_h * self.scale, 0:w - self.mod_pad_w * self.scale]
        # remove prepad
        if self.pre_pad != 0:
            _, _, h, w = self.output.size()
            self.output = self.output[:, :, 0:h - self.pre_pad * self.scale, 0:w - self.pre_pad * self.scale]
        return self.output

    @torch.no_grad()
    def enhance(self, img, outscale=None, alpha_upsampler='realesrgan'):
        """Upsample a numpy image and return ``(output, img_mode)``.

        Args:
            img: 2-D (gray), HxWx3 or HxWx4 numpy array. Values above 256 make
                the input be treated as 16-bit. Colour inputs are channel-swapped
                with ``COLOR_BGR2RGB`` before inference and swapped back after.
            outscale: Optional final scale relative to the input size; when it
                differs from ``self.scale`` the result is resized with Lanczos.
            alpha_upsampler: ``'realesrgan'`` runs the alpha channel through the
                model as well; any other value resizes it with ``cv2.resize``.

        Returns:
            Tuple of the upsampled ``uint8``/``uint16`` array and the detected
            mode string (``'L'``, ``'RGB'`` or ``'RGBA'``).
        """
        h_input, w_input = img.shape[0:2]
        # img: numpy
        img = img.astype(np.float32)
        if np.max(img) > 256:  # 16-bit image
            max_range = 65535
            print('\tInput is a 16-bit image')
        else:
            max_range = 255
        img = img / max_range
        if len(img.shape) == 2:  # gray image
            img_mode = 'L'
            img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
        elif img.shape[2] == 4:  # RGBA image with alpha channel
            img_mode = 'RGBA'
            alpha = img[:, :, 3]
            img = img[:, :, 0:3]
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            if alpha_upsampler == 'realesrgan':
                alpha = cv2.cvtColor(alpha, cv2.COLOR_GRAY2RGB)
        else:
            img_mode = 'RGB'
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # ------------------- process image (without the alpha channel) ------------------- #
        self.pre_process(img)
        if self.tile_size > 0:
            self.tile_process()
        else:
            self.process()
        output_img = self.post_process()
        output_img = output_img.data.squeeze().float().cpu().clamp_(0, 1).numpy()
        output_img = np.transpose(output_img[[2, 1, 0], :, :], (1, 2, 0))
        if img_mode == 'L':
            output_img = cv2.cvtColor(output_img, cv2.COLOR_BGR2GRAY)

        # ------------------- process the alpha channel if necessary ------------------- #
        if img_mode == 'RGBA':
            if alpha_upsampler == 'realesrgan':
                self.pre_process(alpha)
                if self.tile_size > 0:
                    self.tile_process()
                else:
                    self.process()
                output_alpha = self.post_process()
                output_alpha = output_alpha.data.squeeze().float().cpu().clamp_(0, 1).numpy()
                output_alpha = np.transpose(output_alpha[[2, 1, 0], :, :], (1, 2, 0))
                output_alpha = cv2.cvtColor(output_alpha, cv2.COLOR_BGR2GRAY)
            else:  # use the cv2 resize for alpha channel
                h, w = alpha.shape[0:2]
                output_alpha = cv2.resize(alpha, (w * self.scale, h * self.scale), interpolation=cv2.INTER_LINEAR)

            # merge the alpha channel
            output_img = cv2.cvtColor(output_img, cv2.COLOR_BGR2BGRA)
            output_img[:, :, 3] = output_alpha

        # ------------------------------ return ------------------------------ #
        if max_range == 65535:  # 16-bit image
            output = (output_img * 65535.0).round().astype(np.uint16)
        else:
            output = (output_img * 255.0).round().astype(np.uint8)

        if outscale is not None and outscale != float(self.scale):
            output = cv2.resize(
                output, (
                    int(w_input * outscale),
                    int(h_input * outscale),
                ), interpolation=cv2.INTER_LANCZOS4)

        return output, img_mode


--------------------------------------------------------------------------------
/hellomeme/tools/utils.py:
--------------------------------------------------------------------------------
  1 | # coding: utf-8
  2 | 
  3 | # @File   : utils.py
  4 | # @Author : Songkey
  5 | # @Email  : songkey@pku.edu.cn
  6 | # @Date   : 8/18/2024
  7 | # @Desc   :
  8 | 
  9 | import onnx, onnxruntime
 10 | import time
 11 | import cv2
 12 | import numpy as np
 13 | import math
 14 | 
 15 | def create_onnx_session(onnx_path, gpu_id=None)->onnxruntime.InferenceSession:
 16 |     start = time.perf_counter()
 17 |     onnx_model = onnx.load(onnx_path)
 18 |     onnx.checker.check_model(onnx_model)
 19 |     providers = [
 20 |         ('CUDAExecutionProvider', {
 21 |             'device_id': int(gpu_id),
 22 |             'arena_extend_strategy': 'kNextPowerOfTwo',
 23 |             #'cuda_mem_limit': 5 * 1024 * 1024 * 1024,
 24 |             'cudnn_conv_algo_search': 'EXHAUSTIVE',
 25 |             'do_copy_in_default_stream': True,
 26 |         }),
 27 |         'CPUExecutionProvider',
 28 |     ] if (gpu_id is not None and gpu_id >= 0) else ['CPUExecutionProvider']
 29 | 
 30 |     sess = onnxruntime.InferenceSession(onnx_path, providers=providers)
 31 |     print('create onnx session cost: {:.3f}s. {}'.format(time.perf_counter() - start, onnx_path))
 32 |     return sess
 33 | 
 34 | def smoothing_factor(t_e, cutoff):
 35 |     r = 2 * math.pi * cutoff * t_e
 36 |     return r / (r + 1)
 37 | 
 38 | def exponential_smoothing(a, x, x_prev):
 39 |     return a * x + (1 - a) * x_prev
 40 | 
 41 | class OneEuroFilter:
 42 |     def __init__(self, dx0=0.0, d_cutoff=1.0):
 43 |         """Initialize the one euro filter."""
 44 |         # self.min_cutoff = float(min_cutoff)
 45 |         # self.beta = float(beta)
 46 |         self.d_cutoff = float(d_cutoff)
 47 |         self.dx_prev = float(dx0)
 48 |         # self.t_e = fcmin
 49 | 
 50 |     def __call__(self, x, x_prev, fcmin=1.0, min_cutoff=1.0, beta=0.0):
 51 |         if x_prev is None:
 52 |             return x
 53 |         # t_e = 1
 54 |         a_d = smoothing_factor(fcmin, self.d_cutoff)
 55 |         dx = (x - x_prev) / fcmin
 56 |         dx_hat = exponential_smoothing(a_d, dx, self.dx_prev)
 57 |         cutoff = min_cutoff + beta * abs(dx_hat)
 58 |         a = smoothing_factor(fcmin, cutoff)
 59 |         x_hat = exponential_smoothing(a, x, x_prev)
 60 |         self.dx_prev = dx_hat
 61 |         return x_hat
 62 | 
 63 | def get_warp_mat_bbox(face_bbox, base_angle, dst_size=128, expand_ratio=0.15, aug_angle=0.0, aug_scale=1.0):
 64 |     face_x_min, face_y_min, face_x_max, face_y_max = face_bbox
 65 |     face_x_center = (face_x_min + face_x_max) / 2
 66 |     face_y_center = (face_y_min + face_y_max) / 2
 67 |     face_width = face_x_max - face_x_min
 68 |     face_height = face_y_max - face_y_min
 69 |     scale = dst_size / max(face_width, face_height) * (1 - expand_ratio) * aug_scale
 70 |     M = cv2.getRotationMatrix2D((face_x_center, face_y_center), angle=base_angle + aug_angle, scale=scale)
 71 |     offset = [dst_size / 2 - face_x_center, dst_size / 2 - face_y_center]
 72 |     M[:, 2] += offset
 73 |     return M
 74 | 
 75 | def transform_points(points, mat, invert=False):
 76 |     if invert:
 77 |         mat = cv2.invertAffineTransform(mat)
 78 |     points = np.expand_dims(points, axis=1)
 79 |     points = cv2.transform(points, mat, points.shape)
 80 |     points = np.squeeze(points)
 81 |     return points
 82 | 
 83 | def get_warp_mat_bbox_by_gt_pts_float(gt_pts, base_angle=0.0, dst_size=128, expand_ratio=0.15, return_info=False):
 84 |     # step 1
 85 |     face_x_min, face_x_max = np.min(gt_pts[:, 0]), np.max(gt_pts[:, 0])
 86 |     face_y_min, face_y_max = np.min(gt_pts[:, 1]), np.max(gt_pts[:, 1])
 87 |     face_x_center = (face_x_min + face_x_max) / 2
 88 |     face_y_center = (face_y_min + face_y_max) / 2
 89 |     M_step_1 = cv2.getRotationMatrix2D((face_x_center, face_y_center), angle=base_angle, scale=1.0)
 90 |     pts_step_1 = transform_points(gt_pts, M_step_1)
 91 |     face_x_min_step_1, face_x_max_step_1 = np.min(pts_step_1[:, 0]), np.max(pts_step_1[:, 0])
 92 |     face_y_min_step_1, face_y_max_step_1 = np.min(pts_step_1[:, 1]), np.max(pts_step_1[:, 1])
 93 |     # step 2
 94 |     face_width = face_x_max_step_1 - face_x_min_step_1
 95 |     face_height = face_y_max_step_1 - face_y_min_step_1
 96 |     scale = dst_size / max(face_width, face_height) * (1 - expand_ratio)
 97 |     M_step_2 = cv2.getRotationMatrix2D((face_x_center, face_y_center), angle=base_angle, scale=scale)
 98 |     pts_step_2 = transform_points(gt_pts, M_step_2)
 99 |     face_x_min_step_2, face_x_max_step_2 = np.min(pts_step_2[:, 0]), np.max(pts_step_2[:, 0])
100 |     face_y_min_step_2, face_y_max_step_2 = np.min(pts_step_2[:, 1]), np.max(pts_step_2[:, 1])
101 |     face_x_center_step_2 = (face_x_min_step_2 + face_x_max_step_2) / 2
102 |     face_y_center_step_2 = (face_y_min_step_2 + face_y_max_step_2) / 2
103 | 
104 |     M = cv2.getRotationMatrix2D((face_x_center, face_y_center), angle=base_angle, scale=scale)
105 |     offset = [dst_size / 2 - face_x_center_step_2, dst_size / 2 - face_y_center_step_2]
106 |     M[:, 2] += offset
107 | 
108 |     if not return_info:
109 |         return M
110 |     else:
111 |         transform_info = {
112 |             "M": M,
113 |             "center_x": face_x_center,
114 |             "center_y": face_y_center,
115 |             "rotate_angle": base_angle,
116 |             "scale": scale
117 |         }
118 |         return transform_info
119 | 


--------------------------------------------------------------------------------
/inference_image.py:
--------------------------------------------------------------------------------
 1 | # coding: utf-8
 2 | 
 3 | """
 4 | @File   : inference_image.py
 5 | @Author : Songkey
 6 | @Email  : songkey@pku.edu.cn
 7 | @Date   : 8/29/2024
 8 | @Desc   : 
 9 | """
10 | 
11 | import os
12 | from generator import Generator, DEFAULT_PROMPT, MODEL_CONFIG
13 | 
14 | from PIL import Image
15 | 
# Selectable LoRA names (None = no LoRA) and base checkpoints from the model config.
lora_names = [None] + list(MODEL_CONFIG['sd15']['loras'].keys())
checkpoint_names = list(MODEL_CONFIG['sd15']['checkpoints'].keys())

print("Available lora models: ", lora_names)
print("Available checkpoints: ", checkpoint_names)

# Set to True to fetch weights through ModelScope instead of Hugging Face.
modelscope = False

if __name__ == '__main__':
    ref_img_path = r"data/reference_images/chillout.jpg"
    drive_img_path = r"data/drive_images/yao.jpg"

    lora = lora_names[2]
    checkpoint = checkpoint_names[1]

    # Only look the LoRA up when one is selected: `None` is a valid choice
    # (lora_names[0]) but is not a key of the config and would raise KeyError.
    # Each entry is indexed as (repository id, file name) below.
    tmp_lora_info = None if lora is None else MODEL_CONFIG['sd15']['loras'][lora]
    if modelscope:
        from modelscope import snapshot_download
        checkpoint_path = snapshot_download(MODEL_CONFIG['sd15']['checkpoints'][checkpoint])
        if tmp_lora_info is None:
            lora_path = None
        else:
            lora_path = os.path.join(snapshot_download(tmp_lora_info[0]), tmp_lora_info[1])
    else:
        # The configured checkpoint identifier is passed on unchanged.
        checkpoint_path = MODEL_CONFIG['sd15']['checkpoints'][checkpoint]
        if tmp_lora_info is None:
            lora_path = None
        else:
            from huggingface_hub import hf_hub_download
            lora_path = hf_hub_download(tmp_lora_info[0], filename=tmp_lora_info[1])

    vae_path = "same as checkpoint"

    gpu_id = 0
    generator = Generator(gpu_id=gpu_id, modelscope=modelscope)
    ref_image = Image.open(ref_img_path)
    drive_image = Image.open(drive_img_path)
    token = generator.load_pipeline('image', checkpoint_path, vae_path, lora_path, stylize='x1', version='v5')
    # NOTE(review): positional arguments presumably follow the same order as
    # the keywords used for video_generate (num_steps, guidance, seed, prompt,
    # negative_prompt, trans_ratio, crop_reference, cntrl_version) -- confirm
    # against Generator.image_generate.
    result = generator.image_generate(token,
                                      ref_image,
                                      drive_image,
                                      25,
                                      1.5,
                                      1,
                                      DEFAULT_PROMPT,
                                      '',
                                      0.5,
                                      False,
                                      'cntrl2')
    result.show()


--------------------------------------------------------------------------------
/inference_video.py:
--------------------------------------------------------------------------------
 1 | # coding: utf-8
 2 | 
 3 | """
  4 | @File   : inference_video.py
 5 | @Author : Songkey
 6 | @Email  : songkey@pku.edu.cn
 7 | @Date   : 8/29/2024
 8 | @Desc   : 
 9 | """
10 | 
11 | import os
12 | from generator import Generator, DEFAULT_PROMPT, MODEL_CONFIG
13 | 
14 | from PIL import Image
15 | 
# Selectable LoRA names (None = no LoRA) and base checkpoints from the model config.
lora_names = [None] + list(MODEL_CONFIG['sd15']['loras'].keys())
checkpoint_names = list(MODEL_CONFIG['sd15']['checkpoints'].keys())

print("Available lora models: ", lora_names)
print("Available checkpoints: ", checkpoint_names)

# Set to True to fetch weights through ModelScope instead of Hugging Face.
modelscope = False

if __name__ == '__main__':
    ref_img_path = r"data/reference_images/trump.jpg"
    drive_video_path = r"data/drive_videos/jue.mp4"

    lora = lora_names[2]
    # Only look the LoRA up when one is selected: `None` is a valid choice
    # (lora_names[0]) but is not a key of the config and would raise KeyError.
    # Each entry is indexed as (repository id, file name) below.
    tmp_lora_info = None if lora is None else MODEL_CONFIG['sd15']['loras'][lora]
    checkpoint = checkpoint_names[1]

    print("lora: ", lora, "checkpoint: ", checkpoint)
    if modelscope:
        from modelscope import snapshot_download
        checkpoint_path = snapshot_download(MODEL_CONFIG['sd15']['checkpoints'][checkpoint])
        if tmp_lora_info is None:
            lora_path = None
        else:
            lora_path = os.path.join(snapshot_download(tmp_lora_info[0]), tmp_lora_info[1])
    else:
        # The configured checkpoint identifier is passed on unchanged.
        checkpoint_path = MODEL_CONFIG['sd15']['checkpoints'][checkpoint]
        if tmp_lora_info is None:
            lora_path = None
        else:
            from huggingface_hub import hf_hub_download
            lora_path = hf_hub_download(tmp_lora_info[0], filename=tmp_lora_info[1])
    vae_path = "same as checkpoint"

    gpu_id = 0
    # Forward the module-level flag so the generator uses the same download
    # source as the checkpoint/LoRA resolution above.
    generator = Generator(gpu_id=gpu_id, modelscope=modelscope)
    ref_image = Image.open(ref_img_path)
    token = generator.load_pipeline('video', checkpoint_path, vae_path, lora_path, stylize='x1', version='v5')

    save_path = generator.video_generate(token,
                                        ref_image=ref_image,
                                        drive_video_path=drive_video_path,
                                        num_steps=25,
                                        guidance=1.5,
                                        seed=-1,
                                        prompt=DEFAULT_PROMPT,
                                        negative_prompt='',
                                        trans_ratio=0.5,
                                        crop_reference=True,
                                        patch_overlap=4,
                                        cntrl_version='cntrl2',
                                        fps8=True)
    print(f"Save path: {save_path}")


--------------------------------------------------------------------------------