├── .github └── workflows │ └── publish.yml ├── ReadMe.md ├── __init__.py ├── musetalk_global_data.py ├── musetalk_postprocess.py ├── musetalk_preprocess.py ├── musetalk_train.py ├── musetalk_train_preprocess.py ├── musetalk_utils.py ├── pyproject.toml ├── unet.py ├── vae.py └── workflow ├── musetalk flow.json ├── musetalk flow.png ├── musetalk train flow.json ├── sampleimage.png ├── train.png └── trainsample.png /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish to Comfy registry 2 | on: 3 | workflow_dispatch: 4 | push: 5 | branches: 6 | - main 7 | - master 8 | paths: 9 | - "pyproject.toml" 10 | 11 | permissions: 12 | issues: write 13 | 14 | jobs: 15 | publish-node: 16 | name: Publish Custom Node to registry 17 | runs-on: ubuntu-latest 18 | if: ${{ github.repository_owner == 'xuhongming251' }} 19 | steps: 20 | - name: Check out code 21 | uses: actions/checkout@v4 22 | - name: Publish Custom Node 23 | uses: Comfy-Org/publish-node-action@v1 24 | with: 25 | ## Add your own personal access token to your Github Repository secrets and reference it here. 26 | personal_access_token: ${{ secrets.REGISTRY_ACCESS_TOKEN }} 27 | -------------------------------------------------------------------------------- /ReadMe.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # MuseTalk ComfyUI Preprocess and Postprocess Nodes 4 | 5 | # Preprocess Node 6 | 1. rotated image、crop face images 7 | 8 | # Postprocess Node 9 | 1. uncrop faces and rotated images 10 | 11 | 12 | # MuseTalk Work Flow 13 | 1. open musetalk flow and upload video(or image) 14 | 2. set audio path(wav or mp3 ...) 15 | 3. run the flow 16 | 4. video tutorial: https://www.bilibili.com/video/BV1ni421X7ok/?share_source=copy_web&vd_source=43ee8c0ef3a0b12097f69db4423c1332 17 | 18 | 5. GPU < 5G can run 19 | 20 | ![image](./workflow/sampleimage.png) 21 | 22 | # MuseTalk Train Work Flow 23 | 1. 
open the train flow and upload a video
2. run the train flow
3. `epoch_0.pth`, `epoch_1.pth`, `epoch_2.pth` ... will be generated into the `models\musetalk\musetalk` folder
4. watch the loss value in the cmd terminal, and manually stop the process once the training loss has decreased to 0.005 or lower
5. select the musetalk model by `epoch_x.pth` in the musetalk flow.
6. run the musetalk flow to test.
7. the train flow is just a demo for testing.
8. a 16G GPU can run it.

![image](./workflow/train.png)
![image](./workflow/trainsample.png)


Original repo:
https://github.com/TMElyralab/MuseTalk
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------

from .musetalk_preprocess import *
from .musetalk_postprocess import *
from .musetalk_train_preprocess import *
from .musetalk_train import *


NODE_CLASS_MAPPINGS = {

    # "MuseTalkUncropMask": MuseTalkUncropMask,

    "MuseTalkPreprocess": MuseTalkPreprocess,
    "MuseTalkPostprocess": MuseTalkPostprocess,

    "MuseTalkTrainPreprocess": MuseTalkTrainPreprocess,
    "MuseTalkTrain": MuseTalkTrain,

}


NODE_DISPLAY_NAME_MAPPINGS = {

    # "MuseTalkUncropMask": "MuseTalkUncropMask",

    "MuseTalkPreprocess": "MuseTalkPreprocess",
    "MuseTalkPostprocess": "MuseTalkPostprocess",

    "MuseTalkTrainPreprocess": "MuseTalkTrainPreprocess",
    "MuseTalkTrain": "MuseTalkTrain",

}
--------------------------------------------------------------------------------
/musetalk_global_data.py:
--------------------------------------------------------------------------------


# for debug
rotated_faces_with_landmarks = []
# origin_face_bboxs = []
# origin_face_landmarks = []


# generated in the preprocess node, used in the postprocess node
rotated_faces = []
rotated_bboxs = []
| rotated_images = [] 13 | 14 | face_center_points = [] 15 | rotated_angles = [] 16 | origin_face_masks = [] 17 | 18 | rotated_resized_half_face_masks = [] 19 | 20 | 21 | # for train 22 | faces_latent_list = [] 23 | resized_cv2_frame_list = [] 24 | -------------------------------------------------------------------------------- /musetalk_postprocess.py: -------------------------------------------------------------------------------- 1 | 2 | import cv2 3 | import numpy as np 4 | import torch 5 | import torch.nn.functional as F 6 | from PIL import Image 7 | import comfy 8 | import time 9 | 10 | from . import musetalk_utils 11 | from . import musetalk_global_data 12 | 13 | 14 | # def create_uncrop_mask(width, height, center, v_axes, h_axes): 15 | class MuseTalkUncropMask: 16 | def __init__(self): 17 | pass 18 | 19 | @classmethod 20 | def INPUT_TYPES(s): 21 | return { 22 | "required": { 23 | "width": ("INT", {"default": 256, "min": -9999, "max": 9999, "step": 1}), 24 | "height": ("INT", {"default": 256, "min": -9999, "max": 9999, "step": 1}), 25 | "ellipse_center_x": ("INT", {"default": 128, "min": -9999, "max": 9999, "step": 1}), 26 | "ellipse_center_y": ("INT", {"default": 192, "min": -9999, "max": 9999, "step": 1}), 27 | "ellipse_center_v_axes": ("INT", {"default": 128, "min": -9999, "max": 9999, "step": 1}), 28 | "ellipse_center_h_axes": ("INT", {"default": 64, "min": -9999, "max": 9999, "step": 1}), 29 | }, 30 | } 31 | # top_reserve, bottom_reserve, left_reserve, right_reserve 32 | RETURN_TYPES = ("IMAGE",) 33 | RETURN_NAMES = ( 34 | "images", 35 | ) 36 | 37 | FUNCTION = "run" 38 | CATEGORY = "MuseTalkUtils" 39 | 40 | def run(self, width, height, ellipse_center_x, ellipse_center_y, ellipse_center_v_axes, ellipse_center_h_axes): 41 | pil_image_mask = musetalk_utils.create_uncrop_mask(width, height, (ellipse_center_x, ellipse_center_y), ellipse_center_v_axes, ellipse_center_h_axes) 42 | image = pil_image_mask.convert("RGB") 43 | image = 
class MuseTalkUncropMask:
    """Build an elliptical feathered mask used when pasting the generated
    face back onto the rotated frame (debug / manual-override node)."""

    def __init__(self):
        pass

    @classmethod
    def INPUT_TYPES(s):
        return {
            "required": {
                "width": ("INT", {"default": 256, "min": -9999, "max": 9999, "step": 1}),
                "height": ("INT", {"default": 256, "min": -9999, "max": 9999, "step": 1}),
                "ellipse_center_x": ("INT", {"default": 128, "min": -9999, "max": 9999, "step": 1}),
                "ellipse_center_y": ("INT", {"default": 192, "min": -9999, "max": 9999, "step": 1}),
                "ellipse_center_v_axes": ("INT", {"default": 128, "min": -9999, "max": 9999, "step": 1}),
                "ellipse_center_h_axes": ("INT", {"default": 64, "min": -9999, "max": 9999, "step": 1}),
            },
        }

    RETURN_TYPES = ("IMAGE",)
    RETURN_NAMES = ("images",)

    FUNCTION = "run"
    CATEGORY = "MuseTalkUtils"

    def run(self, width, height, ellipse_center_x, ellipse_center_y, ellipse_center_v_axes, ellipse_center_h_axes):
        """Return the elliptical mask as a (1, H, W, 3) float tensor in [0, 1]."""
        pil_image_mask = musetalk_utils.create_uncrop_mask(
            width, height, (ellipse_center_x, ellipse_center_y),
            ellipse_center_v_axes, ellipse_center_h_axes)
        image = pil_image_mask.convert("RGB")
        image = np.array(image).astype(np.float32) / 255.0
        image = torch.from_numpy(image)[None,]
        return (image, )


class MuseTalkPostprocess:
    """Paste MuseTalk-generated face crops back into the original frames.

    Consumes the per-frame geometry (rotated bboxes/images, rotation angles,
    face masks, ...) stashed in `musetalk_global_data` by MuseTalkPreprocess,
    so this node must run in the same process after that node.
    """

    def __init__(self):
        pass

    @classmethod
    def INPUT_TYPES(s):
        return {
            "required": {
                "origin_images": ("IMAGE",),
                "musetalk_faces": ("IMAGE",),
                "extend": ("INT", {"default": 0, "min": -9999, "max": 9999, "step": 1}),
                "blur_radius": ("INT", {"default": 0, "min": -9999, "max": 9999, "step": 1}),
                "extend1": ("INT", {"default": -5, "min": -9999, "max": 9999, "step": 1}),
                "blur_radius1": ("INT", {"default": 5, "min": -9999, "max": 9999, "step": 1}),
            },
            "optional": {
                "uncrop_mask": ("IMAGE",),
            }
        }

    RETURN_TYPES = ("IMAGE",)
    RETURN_NAMES = ("images",)

    FUNCTION = "postprocess"
    CATEGORY = "MuseTalkUtils"

    def getRealIndex(self, index, origin_img_len):
        """Map an output index onto the source frames with ping-pong
        (forward-then-backward) looping, so more generated faces than
        source frames still find a matching frame."""
        if index >= origin_img_len:
            return (origin_img_len * 2 - index - 1)
        else:
            return index

    def postprocess(self, origin_images, musetalk_faces,
                    extend, blur_radius, extend1, blur_radius1,
                    uncrop_mask=None):
        """Uncrop each generated face into its rotated frame, rotate back,
        then blend into the original frame.  Returns a single IMAGE batch.

        Raises nothing; on length mismatch between the cached preprocess
        data and the inputs it logs and returns (None,).
        """
        if uncrop_mask is not None:
            # Batched IMAGE input: only the first mask is meaningful.
            uncrop_mask = uncrop_mask[0]

        print(f"MuseTalkPreprocess postprocess, len(origin_images): {len(origin_images)}, len(musetalk_faces): {len(musetalk_faces)}")

        origin_img_len = len(origin_images)
        rotated_bboxs_len = len(musetalk_global_data.rotated_bboxs)
        rotated_images_len = len(musetalk_global_data.rotated_images)
        face_center_points_len = len(musetalk_global_data.face_center_points)
        rotated_angles_len = len(musetalk_global_data.rotated_angles)
        origin_face_bboxs_len = len(musetalk_global_data.origin_face_bboxs)
        origin_face_masks_len = len(musetalk_global_data.origin_face_masks)

        # Ping-pong period: forward over the frames, then backward.
        MAX_LEN = origin_img_len * 2

        print(f"origin_img_len: {origin_img_len}, rotated_bboxs_len: {rotated_bboxs_len}, rotated_images_len: {rotated_images_len}, face_center_points_len: {face_center_points_len}, rotated_angles_len: {rotated_angles_len},origin_face_bboxs_len: {origin_face_bboxs_len}, origin_face_masks_len: {origin_face_masks_len}")

        # All cached per-frame lists must be aligned with origin_images.
        # Fix: bail out as a 1-tuple (the node declares one output) instead
        # of the bare `return (None)` of the original.
        if not (origin_img_len == rotated_bboxs_len == rotated_images_len ==
                face_center_points_len == rotated_angles_len ==
                origin_face_bboxs_len == origin_face_masks_len):
            print("the len is not same")
            return (None,)

        result_images = []
        idx = 0

        pbar = comfy.utils.ProgressBar(len(musetalk_faces))

        for musetalk_face in musetalk_faces:
            start_time0 = time.time()

            real_index = self.getRealIndex(idx, origin_img_len)

            origin_image = origin_images[real_index]
            rotated_bbox = musetalk_global_data.rotated_bboxs[real_index]
            rotated_image = musetalk_global_data.rotated_images[real_index]
            face_center_point = musetalk_global_data.face_center_points[real_index]
            rotate_angle = musetalk_global_data.rotated_angles[real_index]
            origin_face_mask = musetalk_global_data.origin_face_masks[real_index]
            rotated_face = musetalk_global_data.rotated_faces[real_index]

            origin_image_height, origin_image_width = musetalk_utils.tensorimg_to_cv2img(origin_image).shape[:2]

            # Fix: `is None` — the original compared a torch tensor with `== None`.
            if uncrop_mask is None:
                # Use the per-frame half-face mask computed by the preprocess node.
                pil_uncrop_mask_image = musetalk_global_data.rotated_resized_half_face_masks[real_index]

                start_time1 = time.time()
                musetalk_rotated_image, pil_uncrop_mask_image = musetalk_utils.uncrop_to_rotated_image(
                    musetalk_utils.tensorimg_to_pilimg(rotated_face),
                    musetalk_utils.tensorimg_to_pilimg(musetalk_face),
                    rotated_bbox,
                    musetalk_utils.tensorimg_to_pilimg(rotated_image),
                    pil_uncrop_mask_image, extend, blur_radius)
                print(f"frame index: {idx}, real_index: {real_index}, uncrop one frame, use: {((time.time() - start_time1)*1000):.2f} ms")
            else:
                # A user-supplied mask: no extra extend/blur is applied.
                musetalk_rotated_image, _ = musetalk_utils.uncrop_to_rotated_image(
                    musetalk_utils.tensorimg_to_pilimg(rotated_face),
                    musetalk_utils.tensorimg_to_pilimg(musetalk_face),
                    rotated_bbox,
                    musetalk_utils.tensorimg_to_pilimg(rotated_image),
                    musetalk_utils.tensorimg_to_pilimg(uncrop_mask), 0, 0)

            musetalk_rotated_image_tensor = musetalk_utils.pilimg_to_tensorimg(musetalk_rotated_image)

            # Undo the rotation applied during preprocessing.
            musetalk_origin_image = musetalk_utils.unrotated_image(
                musetalk_utils.tensorimg_to_cv2img(musetalk_rotated_image_tensor),
                face_center_point, rotate_angle,
                origin_image_width, origin_image_height)
            musetalk_origin_image_tensor = musetalk_utils.cv2img_to_tensorimg(musetalk_origin_image)

            start_time1 = time.time()
            result_image, face_mask = musetalk_utils.blend_to_origin_image(
                musetalk_utils.tensorimg_to_pilimg(origin_image),
                musetalk_utils.tensorimg_to_pilimg(musetalk_origin_image_tensor),
                musetalk_utils.tensorimg_to_pilimg(origin_face_mask),
                extend1, blur_radius1)
            print(f"frame index: {idx}, real_index: {real_index}, blend one frame, use: {((time.time() - start_time1)*1000):.2f} ms")

            result_images.append(musetalk_utils.pilimg_to_tensorimg(result_image))

            pbar.update(1)
            print(f"frame index: {idx}, real_index: {real_index}, processed one frame, total use: {((time.time() - start_time0)*1000):.2f} ms")

            idx = (idx + 1) % MAX_LEN

        return (
            torch.stack(result_images, dim=0),
        )
238 | 239 | 240 | 241 | if __name__ == "__main__": 242 | 243 | # def postprocess(self, origin_images, 244 | # musetalk_faces, 245 | # rotated_bboxs, 246 | # rotated_images, 247 | # face_center_points, 248 | # rotated_angles, 249 | # origin_face_bboxs, 250 | # origin_face_masks, 251 | # landmarks): 252 | 253 | ori_img = Image.open("./ori.png") 254 | ori_img_cv2 = cv2.cvtColor(np.array(ori_img), cv2.COLOR_RGB2BGR) 255 | ori_img_tensor = musetalk_utils.cv2img_to_tensorimg(ori_img_cv2) 256 | 257 | musetalk_face = Image.open("./failed_image_musetalk_face36.jpg") 258 | musetalk_face_cv2 = cv2.cvtColor(np.array(musetalk_face), cv2.COLOR_RGB2BGR) 259 | musetalk_face_tensor = musetalk_utils.cv2img_to_tensorimg(musetalk_face_cv2) 260 | 261 | rotated_bbox = (150, 201, 721, 729) 262 | 263 | rotated_image = Image.open("./failed_image_rotated_image36.jpg") 264 | rotated_image_cv2 = cv2.cvtColor(np.array(rotated_image), cv2.COLOR_RGB2BGR) 265 | rotated_image_tensor = musetalk_utils.cv2img_to_tensorimg(rotated_image_cv2) 266 | 267 | face_center_point = [(50,50)] 268 | 269 | rotated_angle = [20] 270 | 271 | origin_face_bbox = [(0,0,500,500)] 272 | 273 | origin_face_mask = [None] 274 | 275 | isok, musetalk_rotated_image = musetalk_utils.uncrop_to_rotated_image(musetalk_utils.tensorimg_to_cv2img(musetalk_face_tensor), 276 | rotated_bbox, 277 | musetalk_utils.tensorimg_to_cv2img(rotated_image_tensor)) 278 | print(isok) 279 | 280 | 281 | 282 | # test = MuseTalkPostprocess() 283 | # test.postprocess() 284 | 285 | -------------------------------------------------------------------------------- /musetalk_preprocess.py: -------------------------------------------------------------------------------- 1 | 2 | import cv2 3 | from einops import rearrange 4 | import torch 5 | 6 | from . import musetalk_utils 7 | from . 
import musetalk_global_data 8 | 9 | import comfy 10 | 11 | class MuseTalkPreprocess: 12 | def __init__(self): 13 | pass 14 | 15 | @classmethod 16 | def INPUT_TYPES(s): 17 | return { 18 | "required": { 19 | "origin_images": ("IMAGE",), 20 | "pose_kps": ("POSE_KEYPOINT",), 21 | "crop_type": (["full", "middle-min", "middle-max"],), 22 | "top_reserve": ("INT", {"default": 0, "min": -9999, "max": 9999, "step": 1}), 23 | "bottom_reserve": ("INT", {"default": 0, "min": -9999, "max": 9999, "step": 1}), 24 | "left_reserve": ("INT", {"default": 0, "min": -9999, "max": 9999, "step": 1}), 25 | "right_reserve": ("INT", {"default": 0, "min": -9999, "max": 9999, "step": 1}), 26 | }, 27 | } 28 | 29 | RETURN_TYPES = ("IMAGE", 30 | # "FACE_BBOX", "IMAGE", "FACE_CENTER_POINT", "ROTATE_ANGLE", "FACE_BBOX", "IMAGE", "LANDMARK", 31 | "IMAGE", ) 32 | RETURN_NAMES = ( 33 | "rotated_faces", 34 | # "rotated_bboxs", 35 | # "rotated_images", 36 | # "face_center_points", 37 | # "rotated_angles", 38 | # "origin_face_bboxs", 39 | # "origin_face_masks", 40 | # "landmarks", 41 | "rotated_faces_with_landmarks" 42 | ) 43 | 44 | FUNCTION = "preprocess" 45 | CATEGORY = "MuseTalkUtils" 46 | 47 | def preprocess(self, origin_images, pose_kps, crop_type, top_reserve, bottom_reserve, left_reserve, right_reserve): 48 | 49 | print(f"MuseTalkPreprocess preprocess, len(origin_images): {len(origin_images)}") 50 | 51 | global rotated_faces 52 | global rotated_faces_with_landmarks 53 | 54 | global rotated_bboxs 55 | global rotated_images 56 | global face_center_points 57 | global rotated_angles 58 | global origin_face_bboxs 59 | global origin_face_masks 60 | global origin_face_landmarks 61 | 62 | 63 | musetalk_global_data.rotated_faces = [] 64 | musetalk_global_data.rotated_faces_with_landmarks = [] 65 | 66 | musetalk_global_data.rotated_bboxs = [] 67 | musetalk_global_data.rotated_images = [] 68 | 69 | musetalk_global_data.face_center_points = [] 70 | musetalk_global_data.rotated_angles = [] 71 | 
musetalk_global_data.origin_face_bboxs = [] 72 | musetalk_global_data.origin_face_masks = [] 73 | musetalk_global_data.rotated_resized_half_face_masks = [] 74 | 75 | 76 | if len(origin_images) != len(pose_kps): 77 | print("origin_images is not same with pose_kps by len") 78 | return None 79 | 80 | musetalk_global_data.origin_face_landmarks = musetalk_utils.get_landmards_by_posekey(pose_kps) 81 | 82 | idx = -1 83 | 84 | pbar = comfy.utils.ProgressBar(len(origin_images)) 85 | 86 | for image, landmark in zip(origin_images, musetalk_global_data.origin_face_landmarks): 87 | 88 | idx = idx + 1 89 | 90 | # print("landmark len: ", len(landmark)) 91 | # print("landmark: ", landmark) 92 | 93 | if len(landmark) == 0: 94 | if len(musetalk_global_data.rotated_faces) > 0: 95 | cur_index = len(musetalk_global_data.rotated_faces)-1 96 | musetalk_global_data.rotated_faces.append(musetalk_global_data.rotated_faces[cur_index]) 97 | musetalk_global_data.rotated_faces_with_landmarks.append(musetalk_global_data.rotated_faces_with_landmarks[cur_index]) 98 | 99 | musetalk_global_data.rotated_bboxs.append(musetalk_global_data.rotated_bboxs[cur_index]) 100 | musetalk_global_data.rotated_images.append(musetalk_global_data.rotated_images[cur_index]) 101 | musetalk_global_data.face_center_points.append(musetalk_global_data.face_center_points[cur_index]) 102 | musetalk_global_data.rotated_angles.append(musetalk_global_data.rotated_angles[cur_index]) 103 | musetalk_global_data.origin_face_bboxs.append(musetalk_global_data.origin_face_bboxs[cur_index]) 104 | musetalk_global_data.origin_face_masks.append(musetalk_global_data.origin_face_masks[cur_index]) 105 | musetalk_global_data.origin_face_landmarks.append(musetalk_global_data.origin_face_landmarks[cur_index]) 106 | print(f"not found face, image index: {idx}") 107 | continue 108 | else: 109 | # TODO: process no face in first frame 110 | continue 111 | 112 | landmark = landmark[0] 113 | 114 | origin_image = 
musetalk_utils.tensorimg_to_cv2img(image) 115 | origin_height, origin_width = image.shape[:2] 116 | # print("origin_image shape: ", image.shape) 117 | 118 | origin_face_bbox = musetalk_utils.get_image_face_bbox(landmark) 119 | musetalk_global_data.origin_face_bboxs.append(origin_face_bbox) 120 | # print("origin_face_bbox: ", origin_face_bbox) 121 | 122 | origin_face_mask = musetalk_utils.get_half_face_mask(landmark, origin_width, origin_height) 123 | musetalk_global_data.origin_face_masks.append(musetalk_utils.pilimg_to_tensorimg(origin_face_mask)) 124 | # print("origin_face_mask: ", origin_face_mask.size) 125 | 126 | face_center_point, rotate_angle = musetalk_utils.get_face_center_point_and_rotate_angles(landmark) 127 | musetalk_global_data.face_center_points.append(face_center_point) 128 | musetalk_global_data.rotated_angles.append(rotate_angle) 129 | 130 | # print("face_center_point: ", face_center_point) 131 | # print("rotate_angle: ", rotate_angle) 132 | 133 | rotated_image = musetalk_utils.get_rotated_image(origin_image, face_center_point, rotate_angle) 134 | musetalk_global_data.rotated_images.append(musetalk_utils.cv2img_to_tensorimg(rotated_image)) 135 | # print("rotated_image: ", rotated_image) 136 | 137 | rotated_landmark = musetalk_utils.get_rotatedimage_landmarks(landmark, face_center_point, rotate_angle) 138 | 139 | # print("rotated_landmark:",rotated_landmark) 140 | 141 | rotated_face, resized_rotated_face, rotated_face_bbox = musetalk_utils.get_face_img_and_face_bbox(rotated_image, rotated_landmark, crop_type, top_reserve, bottom_reserve, left_reserve, right_reserve) 142 | 143 | rotated_face_landmark = musetalk_utils.adjust_landmarks_to_crop(rotated_landmark, rotated_face_bbox) 144 | 145 | left, top, right, bottom = rotated_face_bbox 146 | 147 | # print(rotated_face_bbox, right - left, top - bottom) 148 | 149 | rotated_resized_half_face_mask = musetalk_utils.get_half_face_mask(rotated_face_landmark, right - left, bottom - top) 150 | 151 | 
rotated_resized_half_face_mask = rotated_resized_half_face_mask.resize((256, 256)) 152 | 153 | musetalk_global_data.rotated_resized_half_face_masks.append(rotated_resized_half_face_mask) 154 | 155 | rotated_face_with_landmark = musetalk_utils.draw_landmarks(rotated_face, rotated_face_landmark) 156 | 157 | rotated_face_with_landmark = cv2.resize(rotated_face_with_landmark, (256, 256)) 158 | 159 | width = rotated_face_with_landmark.shape[1] 160 | height = rotated_face_with_landmark.shape[0] 161 | 162 | cv2.line(rotated_face_with_landmark, (width//2, 0), (width//2, height-1), (255, 0, 0), 1) # v-center-line, blue 163 | cv2.line(rotated_face_with_landmark, (0, height//2), (width-1, height//2), (255, 0, 0), 1) # h-center-line, blue 164 | 165 | musetalk_global_data.rotated_faces_with_landmarks.append(musetalk_utils.cv2img_to_tensorimg(rotated_face_with_landmark)) 166 | 167 | musetalk_global_data.rotated_bboxs.append(rotated_face_bbox) 168 | musetalk_global_data.rotated_faces.append(musetalk_utils.cv2img_to_tensorimg(resized_rotated_face)) 169 | 170 | pbar.update(1) 171 | 172 | return ( 173 | torch.stack(musetalk_global_data.rotated_faces, dim=0), 174 | # rotated_bboxs, 175 | # torch.stack(rotated_images, dim=0), 176 | # face_center_points, 177 | # rotated_angles, 178 | # origin_face_bboxs, 179 | # torch.stack(origin_face_masks, dim=0), 180 | # origin_face_landmarks, 181 | torch.stack(musetalk_global_data.rotated_faces_with_landmarks, dim=0), 182 | ) 183 | 184 | 185 | 186 | if __name__ == "__main__": 187 | musetalk_utils.get_landmards_by_posekey(None) 188 | print("hello") 189 | -------------------------------------------------------------------------------- /musetalk_train.py: -------------------------------------------------------------------------------- 1 | 2 | import cv2 3 | import os 4 | import numpy as np 5 | import torch 6 | from torch import nn 7 | import torchvision.transforms as transforms 8 | import tqdm 9 | 10 | from torch.utils.data import DataLoader 11 | 
from torch.utils.data import Dataset 12 | 13 | import pickle 14 | import glob 15 | 16 | from .vae import VAE 17 | from .unet import UNet 18 | from .import musetalk_global_data 19 | 20 | import folder_paths 21 | 22 | image_transform = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) 23 | 24 | def preprocess_img(cv2_img_frame, image_size=256, device="cuda"): 25 | window = [] 26 | if isinstance(cv2_img_frame, str): 27 | window_fnames = [cv2_img_frame] 28 | for fname in window_fnames: 29 | img = cv2.imread(fname) 30 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 31 | img = cv2.resize(img, (image_size, image_size), 32 | interpolation=cv2.INTER_LANCZOS4) 33 | window.append(img) 34 | else: 35 | img = cv2.cvtColor(cv2_img_frame, cv2.COLOR_BGR2RGB) 36 | window.append(img) 37 | x = np.asarray(window) / 255. 38 | x = np.transpose(x, (3, 0, 1, 2)) 39 | x = torch.squeeze(torch.FloatTensor(x)) 40 | x = image_transform(x) 41 | # x = x.unsqueeze(0) # [1, 3, 256, 256] torch tensor 42 | x = x.to(device) 43 | return x 44 | 45 | 46 | class FaceDataset(Dataset): 47 | def __init__(self, face_latents, audio_features, cv2_frames): 48 | 49 | super(FaceDataset, self).__init__() 50 | 51 | self.face_latents = face_latents 52 | self.audio_features = audio_features 53 | self.cv2_frames = cv2_frames 54 | 55 | print('FaceDataset', len(self.face_latents)) 56 | 57 | def __getitem__(self, item): 58 | 59 | frame_tensor = preprocess_img(self.cv2_frames[item]) 60 | latent = self.face_latents[item].squeeze(0) 61 | audio_feature = self.audio_features[item] 62 | audio_feature = torch.tensor(audio_feature).cuda() 63 | 64 | # print(f"frame_tensor: {frame_tensor.shape}, latent: {latent.shape}, audio_feature: {audio_feature.shape}") 65 | return frame_tensor, latent, audio_feature 66 | 67 | def __len__(self): 68 | return len(self.face_latents) 69 | 70 | # only for debug 71 | class FaceDataset2(Dataset): 72 | def __init__(self, dataset_root): 73 | super(FaceDataset2, self).__init__() 74 | 
self.dataset_root = dataset_root 75 | self.frame_root = os.path.join(self.dataset_root, "frame") 76 | with open(os.path.join(self.dataset_root, "face_latent.pkl"), 'rb') as f: 77 | self.face_latents = pickle.load(f) 78 | with open(os.path.join(self.dataset_root, "whisper_chunks.pkl"), 'rb') as f: 79 | self.audio_features = pickle.load(f) 80 | self.frames_im_path_list = list(sorted(glob.glob(os.path.join(self.frame_root, "*.png")))) 81 | 82 | def __getitem__(self, item): 83 | frame = cv2.imread(self.frames_im_path_list[item]) 84 | frame_tensor = preprocess_img(frame) 85 | latent = self.face_latents[item].squeeze(0) 86 | audio_feature = self.audio_features[item] 87 | audio_feature = torch.tensor(audio_feature).cuda() 88 | 89 | # print(f"frame_tensor: {frame_tensor.shape}, latent: {latent.shape}, audio_feature: {audio_feature.shape}") 90 | return frame_tensor, latent, audio_feature 91 | 92 | def __len__(self): 93 | return len(self.frames_im_path_list) 94 | 95 | 96 | class MuseTalkTrain: 97 | def __init__(self): 98 | pass 99 | 100 | @classmethod 101 | def INPUT_TYPES(s): 102 | return { 103 | "required": { 104 | "images": ("IMAGE",), 105 | "whisper_features" : ("WHISPERFEAT",), 106 | "batch_size": ("INT", {"default": 4, "min": 1, "max": 4096, "step": 1}), 107 | }, 108 | } 109 | 110 | RETURN_TYPES = ("IMAGE", ) 111 | RETURN_NAMES = ("images", ) 112 | 113 | FUNCTION = "train" 114 | CATEGORY = "MuseTalkUtils" 115 | 116 | # TODO, images 117 | def train(self, images, whisper_features, batch_size): 118 | 119 | with torch.inference_mode(False): 120 | 121 | model_path_base = os.path.join(folder_paths.models_dir,'musetalk') 122 | model_config_path = os.path.join(model_path_base, "musetalk", "musetalk.json") 123 | model_bin_path = os.path.join(model_path_base, "musetalk", "pytorch_model.bin")# TODO, name 124 | vae_path = os.path.join(model_path_base, "sd-vae-ft-mse") 125 | 126 | # model_config_path = "F:/MuseTalk/talk/models/musetalk/musetalk.json" 127 | # model_bin_path = 
"F:/MuseTalk/talk/models/musetalk/pytorch_model.bin" 128 | # vae_path = "F:/MuseTalk/talk/models/sd-vae-ft-mse/" 129 | 130 | unet = UNet(unet_config = model_config_path, model_path = model_bin_path) 131 | vae = VAE(model_path = vae_path) 132 | 133 | # global unet 134 | global resized_cv2_frame_list 135 | global faces_latent_list 136 | 137 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 138 | 139 | vae.vae.eval() 140 | unet.model.train() 141 | 142 | timesteps = torch.tensor([0], device=device) 143 | lr = 1e-4 144 | # lr = 5e-5 145 | criterion = nn.HuberLoss() 146 | optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, unet.model.parameters()), lr=lr) 147 | # optimizer = torch.optim.Adamax(filter(lambda p: p.requires_grad, unet.model.parameters()), lr=lr) 148 | 149 | # 150 | save_ckpt_dir = os.path.join(model_path_base, "musetalk") 151 | 152 | print("len", len(musetalk_global_data.faces_latent_list), len(whisper_features), len(musetalk_global_data.resized_cv2_frame_list)) 153 | 154 | face_dataset = FaceDataset(musetalk_global_data.faces_latent_list, whisper_features, musetalk_global_data.resized_cv2_frame_list) 155 | 156 | # face_dataset = FaceDataset2("F:/MuseTalk/talk/data/train_dataset/v2") 157 | 158 | face_dataloader = DataLoader(face_dataset, batch_size = batch_size, shuffle=True, num_workers=0) 159 | 160 | # TODO param 161 | for epoch in range(0, 100): 162 | pbar = tqdm.tqdm(enumerate(face_dataloader), total=len(face_dataloader)) 163 | loss_log = [] 164 | for i, (face_tensor, latent_tensor, audio_feat) in pbar: 165 | 166 | audio_feat = audio_feat.to(torch.float32) 167 | 168 | pred_latents = unet.model(latent_tensor, timesteps, encoder_hidden_states=audio_feat).sample 169 | 170 | # print(f"pred_latents: {pred_latents.requires_grad}") 171 | recon = vae.just_decode_latents(pred_latents) 172 | 173 | gt_latent = vae.encode_latents(face_tensor) 174 | loss = 0.2 * criterion(pred_latents, gt_latent) + 0.8 * criterion(recon, face_tensor) 
175 | loss.backward() 176 | loss_log.append(loss.item()) 177 | optimizer.step() 178 | optimizer.zero_grad() 179 | pbar.set_description("(Epoch {}) TRAIN LOSS:{:.8f}".format((epoch + 1), np.mean(loss_log))) 180 | 181 | torch.save(unet.model.state_dict(), os.path.join(save_ckpt_dir, "epoch_{}.pth".format(epoch))) 182 | 183 | return (images,) 184 | 185 | if __name__ == "__main__": 186 | 187 | train = MuseTalkTrain() 188 | train.train(None, [], 4) 189 | 190 | 191 | # print("dgdg") 192 | 193 | # print("hehehh ") 194 | # train = MuseTalkTrain() 195 | # train.train(None, [], 4) 196 | -------------------------------------------------------------------------------- /musetalk_train_preprocess.py: -------------------------------------------------------------------------------- 1 | 2 | import cv2 3 | import os 4 | import random 5 | import torch 6 | 7 | from . import musetalk_utils 8 | from . import vae 9 | from . import musetalk_global_data 10 | 11 | import comfy 12 | import folder_paths 13 | 14 | model_path = os.path.join(folder_paths.models_dir,'musetalk') 15 | 16 | vae_module = vae.VAE(model_path = os.path.join(model_path, "sd-vae-ft-mse")) 17 | 18 | class MuseTalkTrainPreprocess: 19 | def __init__(self): 20 | pass 21 | 22 | @classmethod 23 | def INPUT_TYPES(s): 24 | return { 25 | "required": { 26 | "origin_images": ("IMAGE",), 27 | "pose_kps": ("POSE_KEYPOINT",), 28 | "crop_type": (["full", "middle-min", "middle-max"],), 29 | "top_reserve": ("INT", {"default": 0, "min": -9999, "max": 9999, "step": 1}), 30 | "bottom_reserve": ("INT", {"default": 0, "min": -9999, "max": 9999, "step": 1}), 31 | "left_reserve": ("INT", {"default": 0, "min": -9999, "max": 9999, "step": 1}), 32 | "right_reserve": ("INT", {"default": 0, "min": -9999, "max": 9999, "step": 1}), 33 | }, 34 | } 35 | 36 | RETURN_TYPES = ("IMAGE", 37 | # "FACE_BBOX", "IMAGE", "FACE_CENTER_POINT", "ROTATE_ANGLE", "FACE_BBOX", "IMAGE", "LANDMARK", 38 | "IMAGE", ) 39 | RETURN_NAMES = ( 40 | "rotated_faces", 41 | # 
"rotated_bboxs", 42 | # "rotated_images", 43 | # "face_center_points", 44 | # "rotated_angles", 45 | # "origin_face_bboxs", 46 | # "origin_face_masks", 47 | # "landmarks", 48 | "rotated_faces_with_landmarks" 49 | ) 50 | 51 | FUNCTION = "preprocess" 52 | CATEGORY = "MuseTalkUtils" 53 | 54 | def preprocess(self, origin_images, pose_kps, crop_type, top_reserve, bottom_reserve, left_reserve, right_reserve): 55 | 56 | print(f"MuseTalkPreprocess preprocess, len(origin_images): {len(origin_images)}") 57 | 58 | global rotated_faces 59 | global rotated_faces_with_landmarks 60 | 61 | global rotated_bboxs 62 | global rotated_images 63 | global face_center_points 64 | global rotated_angles 65 | global origin_face_bboxs 66 | global origin_face_masks 67 | global origin_face_landmarks 68 | global faces_latent_list 69 | global resized_cv2_frame_list 70 | 71 | 72 | musetalk_global_data.rotated_faces = [] 73 | musetalk_global_data.rotated_faces_with_landmarks = [] 74 | 75 | musetalk_global_data.rotated_bboxs = [] 76 | musetalk_global_data.rotated_images = [] 77 | 78 | musetalk_global_data.face_center_points = [] 79 | musetalk_global_data.rotated_angles = [] 80 | musetalk_global_data.origin_face_bboxs = [] 81 | musetalk_global_data.origin_face_masks = [] 82 | musetalk_global_data.rotated_resized_half_face_masks = [] 83 | 84 | musetalk_global_data.faces_latent_list = [] 85 | musetalk_global_data.resized_cv2_frame_list = [] 86 | 87 | 88 | if len(origin_images) != len(pose_kps): 89 | print("origin_images is not same with pose_kps by len") 90 | return None 91 | 92 | musetalk_global_data.origin_face_landmarks = musetalk_utils.get_landmards_by_posekey(pose_kps) 93 | 94 | idx = -1 95 | 96 | # pbar = comfy.utils.ProgressBar(len(origin_images)) 97 | 98 | for image, landmark in zip(origin_images, musetalk_global_data.origin_face_landmarks): 99 | 100 | idx = idx + 1 101 | 102 | # print("landmark len: ", len(landmark)) 103 | # print("landmark: ", landmark) 104 | 105 | if len(landmark) == 0: 106 
| if len(musetalk_global_data.rotated_faces) > 0: 107 | cur_index = len(musetalk_global_data.rotated_faces)-1 108 | musetalk_global_data.rotated_faces.append(musetalk_global_data.rotated_faces[cur_index]) 109 | musetalk_global_data.rotated_faces_with_landmarks.append(musetalk_global_data.rotated_faces_with_landmarks[cur_index]) 110 | 111 | musetalk_global_data.rotated_bboxs.append(musetalk_global_data.rotated_bboxs[cur_index]) 112 | musetalk_global_data.rotated_images.append(musetalk_global_data.rotated_images[cur_index]) 113 | musetalk_global_data.face_center_points.append(musetalk_global_data.face_center_points[cur_index]) 114 | musetalk_global_data.rotated_angles.append(musetalk_global_data.rotated_angles[cur_index]) 115 | musetalk_global_data.origin_face_bboxs.append(musetalk_global_data.origin_face_bboxs[cur_index]) 116 | musetalk_global_data.origin_face_masks.append(musetalk_global_data.origin_face_masks[cur_index]) 117 | musetalk_global_data.origin_face_landmarks.append(musetalk_global_data.origin_face_landmarks[cur_index]) 118 | print(f"not found face, image index: {idx}") 119 | continue 120 | else: 121 | # TODO: process no face first frame 122 | continue 123 | 124 | landmark = landmark[0] 125 | 126 | origin_image = musetalk_utils.tensorimg_to_cv2img(image) 127 | origin_height, origin_width = image.shape[:2] 128 | # print("origin_image shape: ", image.shape) 129 | 130 | origin_face_bbox = musetalk_utils.get_image_face_bbox(landmark) 131 | musetalk_global_data.origin_face_bboxs.append(origin_face_bbox) 132 | # print("origin_face_bbox: ", origin_face_bbox) 133 | 134 | origin_face_mask = musetalk_utils.get_half_face_mask(landmark, origin_width, origin_height) 135 | musetalk_global_data.origin_face_masks.append(musetalk_utils.pilimg_to_tensorimg(origin_face_mask)) 136 | # print("origin_face_mask: ", origin_face_mask.size) 137 | 138 | face_center_point, rotate_angle = musetalk_utils.get_face_center_point_and_rotate_angles(landmark) 139 | 
musetalk_global_data.face_center_points.append(face_center_point) 140 | musetalk_global_data.rotated_angles.append(rotate_angle) 141 | 142 | # print("face_center_point: ", face_center_point) 143 | # print("rotate_angle: ", rotate_angle) 144 | 145 | rotated_image = musetalk_utils.get_rotated_image(origin_image, face_center_point, rotate_angle) 146 | musetalk_global_data.rotated_images.append(musetalk_utils.cv2img_to_tensorimg(rotated_image)) 147 | # print("rotated_image: ", rotated_image) 148 | 149 | rotated_landmark = musetalk_utils.get_rotatedimage_landmarks(landmark, face_center_point, rotate_angle) 150 | 151 | # print("rotated_landmark:",rotated_landmark) 152 | 153 | rotated_face, resized_rotated_face, rotated_face_bbox = musetalk_utils.get_face_img_and_face_bbox(rotated_image, rotated_landmark, crop_type, top_reserve, bottom_reserve, left_reserve, right_reserve) 154 | 155 | musetalk_global_data.resized_cv2_frame_list.append(resized_rotated_face) 156 | 157 | rotated_face_landmark = musetalk_utils.adjust_landmarks_to_crop(rotated_landmark, rotated_face_bbox) 158 | 159 | left, top, right, bottom = rotated_face_bbox 160 | 161 | # print(rotated_face_bbox, right - left, top - bottom) 162 | 163 | rotated_resized_half_face_mask = musetalk_utils.get_half_face_mask(rotated_face_landmark, right - left, bottom - top) 164 | 165 | rotated_resized_half_face_mask = rotated_resized_half_face_mask.resize((256, 256)) 166 | 167 | musetalk_global_data.rotated_resized_half_face_masks.append(rotated_resized_half_face_mask) 168 | 169 | rotated_face_with_landmark = musetalk_utils.draw_landmarks(rotated_face, rotated_face_landmark) 170 | 171 | rotated_face_with_landmark = cv2.resize(rotated_face_with_landmark, (256, 256)) 172 | 173 | # draw debug center line 174 | width = rotated_face_with_landmark.shape[1] 175 | height = rotated_face_with_landmark.shape[0] 176 | cv2.line(rotated_face_with_landmark, (width//2, 0), (width//2, height-1), (255, 0, 0), 1) # v line, blue 177 | 
cv2.line(rotated_face_with_landmark, (0, height//2), (width-1, height//2), (255, 0, 0), 1) # h line, blue 178 | 179 | musetalk_global_data.rotated_faces_with_landmarks.append(musetalk_utils.cv2img_to_tensorimg(rotated_face_with_landmark)) 180 | 181 | musetalk_global_data.rotated_bboxs.append(rotated_face_bbox) 182 | musetalk_global_data.rotated_faces.append(musetalk_utils.cv2img_to_tensorimg(resized_rotated_face)) 183 | # pbar.update(1) 184 | 185 | 186 | frame_count = len(musetalk_global_data.resized_cv2_frame_list) 187 | pbar = comfy.utils.ProgressBar(frame_count) 188 | 189 | print("frame_count", frame_count) 190 | 191 | for fid in range(frame_count): 192 | gt_face = musetalk_global_data.resized_cv2_frame_list[fid] 193 | rand_num = random.randint(0, frame_count-1) 194 | ref_face = musetalk_global_data.resized_cv2_frame_list[rand_num] 195 | latents = vae_module.get_train_latents_for_unet(gt_face, ref_face) 196 | musetalk_global_data.faces_latent_list.append(latents) 197 | 198 | pbar.update(1) 199 | 200 | return ( 201 | torch.stack(musetalk_global_data.rotated_faces, dim=0), 202 | # rotated_bboxs, 203 | # torch.stack(rotated_images, dim=0), 204 | # face_center_points, 205 | # rotated_angles, 206 | # origin_face_bboxs, 207 | # torch.stack(origin_face_masks, dim=0), 208 | # origin_face_landmarks, 209 | torch.stack(musetalk_global_data.rotated_faces_with_landmarks, dim=0), 210 | ) 211 | 212 | 213 | -------------------------------------------------------------------------------- /musetalk_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import torch 4 | import numpy as np 5 | from einops import rearrange 6 | from PIL import Image, ImageDraw,ImageFilter 7 | import scipy.ndimage 8 | 9 | def pilimg_to_cv2img(pil_img): 10 | 11 | numpy_image = np.array(pil_img) 12 | 13 | # to 3 channels 14 | if numpy_image.ndim == 2: 15 | numpy_image = np.repeat(numpy_image[:, :, np.newaxis], 3, axis=2) 16 | 17 | # 
remove Alpha 18 | if numpy_image.shape[2] == 4: 19 | numpy_image = numpy_image[:, :, :3] 20 | 21 | # to BRG 22 | bgr_image = cv2.cvtColor(numpy_image, cv2.COLOR_RGB2BGR) 23 | 24 | return bgr_image 25 | 26 | def tensorimg_to_cv2img(tensor_img): 27 | numpy_image = tensor_img.numpy() 28 | numpy_image = numpy_image * 255.0 29 | numpy_image = numpy_image.astype('uint8') 30 | rgb_image = cv2.cvtColor(numpy_image, cv2.COLOR_BGR2RGB) 31 | return rgb_image 32 | 33 | def cv2img_to_tensorimg(img): 34 | img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 35 | numpy_image = np.array(img_rgb) 36 | numpy_image = numpy_image / 255.0 37 | tensor_img = torch.from_numpy(numpy_image) 38 | return tensor_img 39 | 40 | def pilimg_to_tensorimg(pil_img): 41 | numpy_image = np.array(pil_img) 42 | tensor_img = torch.tensor(numpy_image, dtype=torch.float32) / 255.0 43 | return tensor_img 44 | 45 | def tensorimg_to_pilimg(tensor_img): 46 | numpy_image = (tensor_img * 255).byte().numpy() 47 | 48 | numpy_image = np.clip(numpy_image, 0, 255).astype(np.uint8) 49 | 50 | pil_img = Image.fromarray(numpy_image) 51 | 52 | return pil_img 53 | 54 | def is_normalized(keypoints) -> bool: 55 | point_normalized = [ 56 | 0 <= np.abs(k[0]) <= 1 and 0 <= np.abs(k[1]) <= 1 57 | for k in keypoints 58 | if k is not None 59 | ] 60 | if not point_normalized: 61 | return False 62 | return np.all(point_normalized) 63 | 64 | 65 | def get_half_face_mask(landmark, width, height): 66 | 67 | mask = Image.new("RGB", (width, height), (0,0,0)) 68 | 69 | # https://www.researchgate.net/profile/Fabrizio-Falchi/publication/338048224/figure/fig1/AS:837860722741255@1576772971540/68-facial-landmarks.jpg 70 | points = landmark[0:17] 71 | # points.append(landmark[30]) 72 | 73 | draw = ImageDraw.Draw(mask) 74 | draw.polygon(points, fill=(255,255,255)) 75 | 76 | return mask 77 | 78 | def draw_landmarks(img, landmarks): 79 | 80 | img_copy = img.copy() 81 | for i, (x, y) in enumerate(landmarks): 82 | # # 
https://www.researchgate.net/profile/Fabrizio-Falchi/publication/338048224/figure/fig1/AS:837860722741255@1576772971540/68-facial-landmarks.jpg 83 | if i == 29 or i==48 or i == 54: 84 | # center nose , left mouth, right mouth 85 | cv2.circle(img_copy, (x, y), 2, (0, 0, 255), -1) # red 86 | else: 87 | cv2.circle(img_copy, (x, y), 2, (0, 255, 0), -1) # green 88 | 89 | return img_copy 90 | 91 | 92 | def get_landmards_by_posekey(pose_kps): 93 | # print("in get_landmards_by_posekey len(pose_kps)", len(pose_kps)) 94 | land_marks = [] 95 | for pose_frame in pose_kps: 96 | width, height = pose_frame["canvas_width"], pose_frame["canvas_height"] 97 | person_landmark = [] 98 | for person in pose_frame["people"]: 99 | 100 | if "face_keypoints_2d" in person and person["face_keypoints_2d"] is not None: 101 | 102 | n = len(person["face_keypoints_2d"]) // 3 103 | 104 | facial_kps = rearrange(np.array(person["face_keypoints_2d"]), "(n c) -> n c", n=n, c=3)[:, :2] 105 | 106 | if is_normalized(facial_kps): 107 | facial_kps *= (width, height) 108 | 109 | facial_kps = facial_kps.astype(np.int32) 110 | 111 | one_person_land_marks = [(x, y) for x, y in facial_kps] 112 | 113 | person_landmark.append(one_person_land_marks) 114 | else: 115 | print("not found face!!!") 116 | 117 | land_marks.append(person_landmark) 118 | 119 | return land_marks 120 | 121 | def get_mouth_center_point_by_landmark(landmark): 122 | mouth_center_x = (landmark[51][0] + landmark[57][0]) // 2 123 | mouth_center_y = (landmark[51][1] + landmark[57][1]) // 2 124 | return (mouth_center_x, mouth_center_y) 125 | 126 | def get_mouth_width_by_landmark(landmark): 127 | left_mouth_x = landmark[48][0] # left mouth 128 | right_mouth_x = landmark[54][0] # right mouth 129 | return right_mouth_x - left_mouth_x 130 | 131 | 132 | def get_image_face_bbox(landmark): 133 | 134 | # face bbox 135 | left = min(landmark[i][0] for i in range(0, 17)) 136 | right = max(landmark[i][0] for i in range(0, 17)) 137 | bottom = max(landmark[i][1] 
for i in range(0, 27)) 138 | 139 | # 51 top mouth 140 | # 57 bottom mouth 141 | # mouth_center_x = (landmark[51][0] + landmark[57][0]) // 2 142 | mouth_center_y = (landmark[51][1] + landmark[57][1]) // 2 143 | 144 | 145 | # left_mouth_x = landmark[48][0] 146 | # right_mouth_x = landmark[54][0] 147 | 148 | # mouth_width = right_mouth_x - left_mouth_x 149 | 150 | # harf_x_left = mouth_center_x - left 151 | # harf_x_right = right - mouth_center_x 152 | # harf_x = max(harf_x_left, harf_x_right) 153 | 154 | # print("left:", bottom) 155 | # print("harf_x:", harf_x) 156 | 157 | # left = mouth_center_x - harf_x 158 | # right = mouth_center_x + harf_x 159 | 160 | one_fourth_y = bottom - mouth_center_y 161 | top = bottom - one_fourth_y*4 162 | 163 | # middle_y = bottom - landmark[29][1] 164 | # top = bottom - middle_y * 2 165 | 166 | 167 | # TODO,out-of-bounds process 168 | # top = top + top_reserve 169 | # bottom = bottom + bottom_reserve 170 | # left = left + left_reserve 171 | # right = right + right_reserve 172 | 173 | face_bbox = left, top, right, bottom 174 | 175 | return face_bbox 176 | 177 | def get_face_center_point_and_rotate_angles(landmarks): 178 | 179 | landmarks = np.array(landmarks) 180 | 181 | # face center point 182 | center_point = np.mean(landmarks, axis=0) 183 | 184 | # left eye and right eye 185 | # left_point = landmarks[36] 186 | # right_point = landmarks[45] 187 | 188 | # left mouth and right mouth 189 | left_point = landmarks[48] 190 | right_point = landmarks[54] 191 | 192 | # cal angle 193 | angle = np.arctan2(right_point[1] - left_point[1], right_point[0] - left_point[0]) * 180 / np.pi 194 | 195 | return center_point, angle 196 | 197 | def get_rotated_image(origin_image, face_center_point, rotate_angle): 198 | 199 | rotation_matrix = cv2.getRotationMatrix2D(tuple(face_center_point), rotate_angle, 1) 200 | rotated_image = cv2.warpAffine(origin_image, rotation_matrix, (origin_image.shape[1], origin_image.shape[0]), flags=cv2.INTER_NEAREST) 201 | 202 
| return rotated_image 203 | 204 | 205 | def get_rotatedimage_landmarks(landmark, face_center_point, rotate_angle): 206 | 207 | landmark = np.array(landmark) 208 | 209 | rotation_matrix = cv2.getRotationMatrix2D(tuple(face_center_point), rotate_angle, 1) 210 | 211 | adjusted_landmarks = landmark - face_center_point 212 | rotated_landmark = np.dot(rotation_matrix[:, :2], adjusted_landmarks.T).T + face_center_point 213 | 214 | converted_landmarks = [(int(point[0]), int(point[1])) for point in rotated_landmark] 215 | 216 | return converted_landmarks 217 | 218 | def adjust_landmarks_to_crop(landmarks, bbox): 219 | 220 | left, top, right, bottom = bbox 221 | width = right - left 222 | height = bottom - top 223 | 224 | offset_x = left 225 | offset_y = top 226 | 227 | adjusted_landmarks = [(x - offset_x, y - offset_y) for x, y in landmarks] 228 | 229 | return adjusted_landmarks 230 | 231 | def get_face_img_and_face_bbox(image, landmark, crop_type, top_reserve, bottom_reserve, left_reserve, right_reserve): 232 | 233 | # face bbox 234 | left = min(landmark[i][0] for i in range(0, 17)) 235 | right = max(landmark[i][0] for i in range(0, 17)) 236 | bottom = max(landmark[i][1] for i in range(0, 27)) 237 | 238 | # modify top last 239 | bottom = bottom + bottom_reserve 240 | left = left - left_reserve 241 | right = right + right_reserve 242 | 243 | # mouth up center: 51 244 | # mouth down center: 57 245 | mouth_center_x = (landmark[51][0] + landmark[57][0]) // 2 246 | mouth_center_y = (landmark[51][1] + landmark[57][1]) // 2 247 | 248 | left_mouth_x = landmark[48][0] 249 | right_mouth_x = landmark[54][0] 250 | 251 | mouth_width = right_mouth_x - left_mouth_x 252 | 253 | harf_x_left = mouth_center_x - left 254 | harf_x_right = right - mouth_center_x 255 | 256 | if crop_type == "middle-min": 257 | harf_x = min(harf_x_left, harf_x_right) 258 | 259 | # print("left:", bottom) 260 | # print("harf_x:", harf_x) 261 | 262 | left = mouth_center_x - harf_x 263 | right = mouth_center_x + 
harf_x 264 | elif crop_type == "middle-max": 265 | harf_x = max(harf_x_left, harf_x_right) 266 | 267 | # print("left:", bottom) 268 | # print("harf_x:", harf_x) 269 | 270 | left = mouth_center_x - harf_x 271 | right = mouth_center_x + harf_x 272 | elif crop_type == "full": 273 | pass 274 | 275 | # left = int(mouth_center_x - mouth_width) 276 | # right = int(mouth_center_x + mouth_width) 277 | 278 | # one_fourth_height = bottom - mouth_center_y 279 | # half_height = one_fourth_height * 2 280 | 281 | # middle_y = bottom - half_height 282 | 283 | # if middle_y < landmark[28][1]: 284 | # middle_y = landmark[28][1] 285 | # if middle_y > landmark[30][1]: 286 | # middle_y = landmark[30][1] 287 | 288 | # half_height = bottom - middle_y 289 | # top = bottom - half_height * 2 290 | 291 | # landmark29 in v-center 292 | middle_y = bottom - landmark[29][1] 293 | top = bottom - middle_y * 2 294 | 295 | top = top - top_reserve 296 | 297 | # out of bounds 298 | left = max(0, left) 299 | top = max(0, top) 300 | right = min(image.shape[1], right) 301 | bottom = min(image.shape[0], bottom) 302 | 303 | # print(f"left: {left}, top: {top}, right: {right}, bottom: {bottom}") 304 | 305 | face_image = image[top:bottom, left:right] 306 | 307 | resized_face_image = cv2.resize(face_image,(256,256)) 308 | 309 | face_bbox = left, top, right, bottom 310 | 311 | return face_image, resized_face_image, face_bbox 312 | 313 | 314 | def create_uncrop_mask(width, height, center, v_axes, h_axes): 315 | 316 | 317 | mask = np.zeros((height, width), dtype=np.uint8) 318 | 319 | axes = (h_axes, v_axes) 320 | angle = 90 321 | color = 255 322 | 323 | cv2.ellipse(mask, center, axes, angle, 0, 360, color, thickness=-1) 324 | 325 | pil_image = Image.fromarray(mask) 326 | 327 | return pil_image 328 | 329 | 330 | 331 | def uncrop_to_rotated_image(rotated_face, musetalk_face, rotated_bbox, rotated_image, uncrop_mask, extend, radius): 332 | 333 | mask = uncrop_mask.copy() 334 | 335 | # TODO,optimize 336 | mask = 
mask.convert('L') 337 | 338 | rotated_face_copy = rotated_face.copy() 339 | 340 | rotated_face_copy.paste(musetalk_face, (0, 0), mask) 341 | 342 | x_min, y_min, x_max, y_max = rotated_bbox 343 | 344 | origin_width, origin_height = rotated_image.size 345 | 346 | x_min = max(0, x_min) 347 | y_min = max(0, y_min) 348 | 349 | x_max = min(x_max, origin_width) 350 | y_max = min(y_max, origin_height) 351 | 352 | width = x_max - x_min 353 | height = y_max - y_min 354 | 355 | rotated_face_copy = rotated_face_copy.resize((width, height)) 356 | 357 | # print("width:", width) 358 | # print("Height:", height) 359 | 360 | # musetalk_face = musetalk_face.resize((width, height)) 361 | 362 | # mask = uncrop_mask 363 | 364 | # mask = mask.convert('L') 365 | 366 | # mask = mask.resize((width, height)) 367 | 368 | if extend != 0: 369 | mask = expand_mask(mask, extend, True) 370 | 371 | if radius != 0: 372 | mask = mask.filter(ImageFilter.GaussianBlur(radius=radius)) 373 | 374 | # print(f"musetalk_face mode:{musetalk_face.mode} {musetalk_face.size}, rotated_image mode: {rotated_image.mode}, {rotated_image.size}, {mask.size}") 375 | 376 | rotated_image.paste(rotated_face_copy, (x_min, y_min)) 377 | 378 | mask = mask.convert('RGB') 379 | 380 | return rotated_image, mask 381 | 382 | def unrotated_image(musetalk_rotated_image, face_center_point, rotate_angle, width, height): 383 | rotation_matrix = cv2.getRotationMatrix2D(face_center_point, -rotate_angle, 1) 384 | musetalk_origin_image = cv2.warpAffine(musetalk_rotated_image, rotation_matrix, (width, height)) 385 | 386 | return musetalk_origin_image 387 | 388 | def expand_mask(mask, expand, tapered_corners): 389 | 390 | mask = np.array(mask) 391 | c = 0 if tapered_corners else 1 392 | 393 | kernel = np.array([[c, 1, c], 394 | [1, 1, 1], 395 | [c, 1, c]]) 396 | 397 | iterations = abs(expand) 398 | 399 | operation = scipy.ndimage.morphology.binary_erosion if expand < 0 else scipy.ndimage.morphology.binary_dilation 400 | 401 | mask = 
operation(mask, structure=kernel, iterations=iterations) 402 | 403 | return Image.fromarray(mask.astype(np.uint8) * 255) 404 | 405 | 406 | def blend_to_origin_image(origin_image, musetalk_origin_image, origin_face_mask, extend, radius): 407 | 408 | origin_face_mask = origin_face_mask.convert('L') 409 | 410 | origin_face_mask = expand_mask(origin_face_mask, extend, True) 411 | 412 | origin_face_mask = origin_face_mask.resize(musetalk_origin_image.size) 413 | 414 | origin_face_mask = origin_face_mask.filter(ImageFilter.BoxBlur(radius=radius)) 415 | origin_face_mask = origin_face_mask.filter(ImageFilter.GaussianBlur(radius=radius)) 416 | 417 | origin_image.paste(musetalk_origin_image, (0, 0), origin_face_mask) 418 | 419 | return origin_image, origin_face_mask -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "comfyui-musetalkutils" 3 | description = "MuseTalk ComfyUI Preprocess and Postprocess Nodes" 4 | version = "1.0.0" 5 | license = "LICENSE" 6 | 7 | [project.urls] 8 | Repository = "https://github.com/xuhongming251/ComfyUI-MuseTalkUtils" 9 | # Used by Comfy Registry https://comfyregistry.org 10 | 11 | [tool.comfy] 12 | PublisherId = "" 13 | DisplayName = "ComfyUI-MuseTalkUtils" 14 | Icon = "" 15 | -------------------------------------------------------------------------------- /unet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import math 4 | import json 5 | 6 | from diffusers import UNet2DConditionModel 7 | import sys 8 | import time 9 | import numpy as np 10 | import os 11 | 12 | class PositionalEncoding(nn.Module): 13 | def __init__(self, d_model=384, max_len=5000): 14 | super(PositionalEncoding, self).__init__() 15 | pe = torch.zeros(max_len, d_model) 16 | position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) 
17 | div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) 18 | pe[:, 0::2] = torch.sin(position * div_term) 19 | pe[:, 1::2] = torch.cos(position * div_term) 20 | pe = pe.unsqueeze(0) 21 | self.register_buffer('pe', pe) 22 | 23 | def forward(self, x): 24 | b, seq_len, d_model = x.size() 25 | pe = self.pe[:, :seq_len, :] 26 | x = x + pe.to(x.device) 27 | return x 28 | 29 | class UNet(): 30 | def __init__(self, 31 | unet_config, 32 | model_path, 33 | use_float16=False, 34 | ): 35 | with open(unet_config, 'r') as f: 36 | unet_config = json.load(f) 37 | self.model = UNet2DConditionModel(**unet_config) 38 | self.pe = PositionalEncoding(d_model=384) 39 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 40 | weights = torch.load(model_path) if torch.cuda.is_available() else torch.load(model_path, map_location=self.device) 41 | self.model.load_state_dict(weights) 42 | if use_float16: 43 | self.model = self.model.half() 44 | self.model.to(self.device) 45 | 46 | if __name__ == "__main__": 47 | unet = UNet() 48 | -------------------------------------------------------------------------------- /vae.py: -------------------------------------------------------------------------------- 1 | from diffusers import AutoencoderKL 2 | import torch 3 | import torchvision.transforms as transforms 4 | import torch.nn.functional as F 5 | import cv2 6 | import numpy as np 7 | from PIL import Image 8 | import os 9 | 10 | class VAE(): 11 | """ 12 | VAE (Variational Autoencoder) class for image processing. 13 | """ 14 | 15 | def __init__(self, model_path="./models/sd-vae-ft-mse/", resized_img=256, use_float16=False): 16 | """ 17 | Initialize the VAE instance. 18 | 19 | :param model_path: Path to the trained model. 20 | :param resized_img: The size to which images are resized. 21 | :param use_float16: Whether to use float16 precision. 
22 | """ 23 | self.model_path = model_path 24 | self.vae = AutoencoderKL.from_pretrained(self.model_path) 25 | 26 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 27 | self.vae.to(self.device) 28 | 29 | if use_float16: 30 | self.vae = self.vae.half() 31 | self._use_float16 = True 32 | else: 33 | self._use_float16 = False 34 | 35 | self.scaling_factor = self.vae.config.scaling_factor 36 | self.transform = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) 37 | self._resized_img = resized_img 38 | self._mask_tensor = self.get_mask_tensor() 39 | 40 | def get_mask_tensor(self): 41 | """ 42 | Creates a mask tensor for image processing. 43 | :return: A mask tensor. 44 | """ 45 | mask_tensor = torch.zeros((self._resized_img,self._resized_img)) 46 | mask_tensor[:self._resized_img//2,:] = 1 47 | mask_tensor[mask_tensor< 0.5] = 0 48 | mask_tensor[mask_tensor>= 0.5] = 1 49 | return mask_tensor 50 | 51 | def preprocess_img(self,img_name,half_mask=False): 52 | """ 53 | Preprocess an image for the VAE. 54 | 55 | :param img_name: The image file path or a list of image file paths. 56 | :param half_mask: Whether to apply a half mask to the image. 57 | :return: A preprocessed image tensor. 58 | """ 59 | window = [] 60 | if isinstance(img_name, str): 61 | window_fnames = [img_name] 62 | for fname in window_fnames: 63 | img = cv2.imread(fname) 64 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 65 | img = cv2.resize(img, (self._resized_img, self._resized_img), 66 | interpolation=cv2.INTER_LANCZOS4) 67 | window.append(img) 68 | else: 69 | img = cv2.cvtColor(img_name, cv2.COLOR_BGR2RGB) 70 | window.append(img) 71 | 72 | x = np.asarray(window) / 255. 
73 | # print("0x shape:", x.shape) 74 | x = np.transpose(x, (3, 0, 1, 2)) 75 | # print("1x shape:", x.shape) 76 | # print("self._mask_tensor shape", self._mask_tensor.shape) 77 | x = torch.squeeze(torch.FloatTensor(x)) 78 | if half_mask: 79 | # print("_mask_tensor:", self._mask_tensor) 80 | # print("x:", x) 81 | 82 | x = x * (self._mask_tensor>0.5) 83 | x = self.transform(x) 84 | 85 | x = x.unsqueeze(0) # [1, 3, 256, 256] torch tensor 86 | x = x.to(self.vae.device) 87 | 88 | return x 89 | 90 | def encode_latents(self,image): 91 | """ 92 | Encode an image into latent variables. 93 | 94 | :param image: The image tensor to encode. 95 | :return: The encoded latent variables. 96 | """ 97 | with torch.no_grad(): 98 | init_latent_dist = self.vae.encode(image.to(self.vae.dtype)).latent_dist 99 | init_latents = self.scaling_factor * init_latent_dist.sample() 100 | return init_latents 101 | 102 | def decode_latents(self, latents): 103 | """ 104 | Decode latent variables back into an image. 105 | :param latents: The latent variables to decode. 106 | :return: A NumPy array representing the decoded image. 107 | """ 108 | latents = (1/ self.scaling_factor) * latents 109 | image = self.vae.decode(latents.to(self.vae.dtype)).sample 110 | image = (image / 2 + 0.5).clamp(0, 1) 111 | image = image.detach().cpu().permute(0, 2, 3, 1).float().numpy() 112 | image = (image * 255).round().astype("uint8") 113 | image = image[...,::-1] # RGB to BGR 114 | return image 115 | 116 | def just_decode_latents(self, latents): 117 | latents = (1 / self.scaling_factor) * latents 118 | image = self.vae.decode(latents.to(self.vae.dtype)).sample 119 | return image 120 | 121 | def get_latents_for_unet(self,img): 122 | """ 123 | Prepare latent variables for a U-Net model. 124 | :param img: The image to process. 125 | :return: A concatenated tensor of latents for U-Net input. 
126 | """ 127 | 128 | ref_image = self.preprocess_img(img,half_mask=True) # [1, 3, 256, 256] RGB, torch tensor 129 | masked_latents = self.encode_latents(ref_image) # [1, 4, 32, 32], torch tensor 130 | ref_image = self.preprocess_img(img,half_mask=False) # [1, 3, 256, 256] RGB, torch tensor 131 | ref_latents = self.encode_latents(ref_image) # [1, 4, 32, 32], torch tensor 132 | latent_model_input = torch.cat([masked_latents, ref_latents], dim=1) 133 | return latent_model_input 134 | 135 | def get_train_latents_for_unet(self, hal_face, ref_face): 136 | ref_image = self.preprocess_img(hal_face,half_mask=True) # [1, 3, 256, 256] RGB, torch tensor 137 | masked_latents = self.encode_latents(ref_image) # [1, 4, 32, 32], torch tensor 138 | ref_image = self.preprocess_img(ref_face,half_mask=False) # [1, 3, 256, 256] RGB, torch tensor 139 | ref_latents = self.encode_latents(ref_image) # [1, 4, 32, 32], torch tensor 140 | latent_model_input = torch.cat([masked_latents, ref_latents], dim=1) 141 | return latent_model_input 142 | 143 | 144 | 145 | if __name__ == "__main__": 146 | window = [] 147 | img = cv2.imread("d:/11.png") 148 | # print(img) 149 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 150 | img = cv2.resize(img, (256, 256), interpolation=cv2.INTER_LANCZOS4) 151 | window.append(img) 152 | x = np.asarray(window) / 255. 
153 | print("0x shape:", x.shape) 154 | x = np.transpose(x, (3, 0, 1, 2)) 155 | print("1x shape:", x.shape) 156 | # vv = VAE() 157 | # _mask_tensor = vv.get_mask_tensor() 158 | # print("self._mask_tensor shape", _mask_tensor.shape) 159 | # x = torch.squeeze(torch.FloatTensor(x)) 160 | 161 | # print("_mask_tensor:", _mask_tensor) 162 | # print("x:", x) 163 | 164 | 165 | # x = x * (_mask_tensor>0.5) 166 | # x = selftransform(x) 167 | 168 | # x = x.unsqueeze(0) # [1, 3, 256, 256] torch tensor 169 | 170 | # print(x.shape) 171 | 172 | # vae_mode_path = "./models/sd-vae-ft-mse/" 173 | # vae = VAE(model_path = vae_mode_path,use_float16=False) 174 | # img_path = "./results/sun001_crop/00000.png" 175 | 176 | # crop_imgs_path = "./results/sun001_crop/" 177 | # latents_out_path = "./results/latents/" 178 | # if not os.path.exists(latents_out_path): 179 | # os.mkdir(latents_out_path) 180 | 181 | # files = os.listdir(crop_imgs_path) 182 | # files.sort() 183 | # files = [file for file in files if file.split(".")[-1] == "png"] 184 | 185 | # for file in files: 186 | # index = file.split(".")[0] 187 | # img_path = crop_imgs_path + file 188 | # latents = vae.get_latents_for_unet(img_path) 189 | # print(img_path,"latents",latents.size()) 190 | # #torch.save(latents,os.path.join(latents_out_path,index+".pt")) 191 | # #reload_tensor = torch.load('tensor.pt') 192 | # #print(reload_tensor.size()) 193 | 194 | 195 | -------------------------------------------------------------------------------- /workflow/musetalk flow.json: -------------------------------------------------------------------------------- 1 | { 2 | "last_node_id": 1116, 3 | "last_link_id": 1849, 4 | "nodes": [ 5 | { 6 | "id": 529, 7 | "type": "GetImageSize+", 8 | "pos": [ 9 | -6837.223017960602, 10 | 1181.0887697221267 11 | ], 12 | "size": { 13 | "0": 210, 14 | "1": 46 15 | }, 16 | "flags": {}, 17 | "order": 21, 18 | "mode": 0, 19 | "inputs": [ 20 | { 21 | "name": "image", 22 | "type": "IMAGE", 23 | "link": 1530 24 | } 25 | 
], 26 | "outputs": [ 27 | { 28 | "name": "width", 29 | "type": "INT", 30 | "links": [ 31 | 871 32 | ], 33 | "shape": 3, 34 | "slot_index": 0 35 | }, 36 | { 37 | "name": "height", 38 | "type": "INT", 39 | "links": [ 40 | 872 41 | ], 42 | "shape": 3, 43 | "slot_index": 1 44 | } 45 | ], 46 | "properties": { 47 | "Node name for S&R": "GetImageSize+" 48 | } 49 | }, 50 | { 51 | "id": 27, 52 | "type": "vhs_audio_to_audio_tensor", 53 | "pos": [ 54 | -7600.1983555191255, 55 | -491.89309959092043 56 | ], 57 | "size": { 58 | "0": 315, 59 | "1": 102 60 | }, 61 | "flags": {}, 62 | "order": 14, 63 | "mode": 0, 64 | "inputs": [ 65 | { 66 | "name": "vhs_audio", 67 | "type": "VHS_AUDIO", 68 | "link": 45, 69 | "slot_index": 0, 70 | "label": "vhs_audio" 71 | } 72 | ], 73 | "outputs": [ 74 | { 75 | "name": "audio_tensor", 76 | "type": "VCAUDIOTENSOR", 77 | "links": [ 78 | 67 79 | ], 80 | "shape": 3, 81 | "slot_index": 0, 82 | "label": "audio_tensor" 83 | }, 84 | { 85 | "name": "audio_dur", 86 | "type": "INT", 87 | "links": null, 88 | "shape": 3, 89 | "label": "audio_dur" 90 | } 91 | ], 92 | "properties": { 93 | "Node name for S&R": "vhs_audio_to_audio_tensor" 94 | }, 95 | "widgets_values": [ 96 | 16000, 97 | 1 98 | ] 99 | }, 100 | { 101 | "id": 223, 102 | "type": "SetNode", 103 | "pos": [ 104 | -6827.198355519126, 105 | -760.8930995909205 106 | ], 107 | "size": { 108 | "0": 235.1999969482422, 109 | "1": 58 110 | }, 111 | "flags": {}, 112 | "order": 26, 113 | "mode": 0, 114 | "inputs": [ 115 | { 116 | "name": "INT", 117 | "type": "INT", 118 | "link": 422 119 | } 120 | ], 121 | "outputs": [ 122 | { 123 | "name": "*", 124 | "type": "*", 125 | "links": null 126 | } 127 | ], 128 | "title": "Set_output_video_frame_count", 129 | "properties": { 130 | "previousName": "output_video_frame_count" 131 | }, 132 | "widgets_values": [ 133 | "output_video_frame_count" 134 | ] 135 | }, 136 | { 137 | "id": 125, 138 | "type": "GetNode", 139 | "pos": [ 140 | -4474.410906003187, 141 | -641.4087271741254 
142 | ], 143 | "size": { 144 | "0": 210, 145 | "1": 58 146 | }, 147 | "flags": { 148 | "collapsed": false 149 | }, 150 | "order": 0, 151 | "mode": 0, 152 | "outputs": [ 153 | { 154 | "name": "VHS_AUDIO", 155 | "type": "VHS_AUDIO", 156 | "links": [ 157 | 1380 158 | ], 159 | "slot_index": 0, 160 | "label": "VHS_AUDIO" 161 | } 162 | ], 163 | "title": "Get_audio", 164 | "properties": {}, 165 | "widgets_values": [ 166 | "audio" 167 | ] 168 | }, 169 | { 170 | "id": 224, 171 | "type": "GetNode", 172 | "pos": [ 173 | -8757, 174 | 874 175 | ], 176 | "size": { 177 | "0": 285.89874267578125, 178 | "1": 97.85186767578125 179 | }, 180 | "flags": {}, 181 | "order": 1, 182 | "mode": 0, 183 | "outputs": [ 184 | { 185 | "name": "INT", 186 | "type": "INT", 187 | "links": [ 188 | 1761 189 | ], 190 | "slot_index": 0 191 | } 192 | ], 193 | "title": "Get_output_video_frame_count", 194 | "properties": {}, 195 | "widgets_values": [ 196 | "output_video_frame_count" 197 | ] 198 | }, 199 | { 200 | "id": 527, 201 | "type": "PixelPerfectResolution", 202 | "pos": [ 203 | -6777.911213012693, 204 | 1385.9297019486887 205 | ], 206 | "size": { 207 | "0": 393, 208 | "1": 106 209 | }, 210 | "flags": {}, 211 | "order": 27, 212 | "mode": 0, 213 | "inputs": [ 214 | { 215 | "name": "original_image", 216 | "type": "IMAGE", 217 | "link": 1531, 218 | "label": "original_image" 219 | }, 220 | { 221 | "name": "image_gen_width", 222 | "type": "INT", 223 | "link": 871, 224 | "widget": { 225 | "name": "image_gen_width" 226 | }, 227 | "slot_index": 1 228 | }, 229 | { 230 | "name": "image_gen_height", 231 | "type": "INT", 232 | "link": 872, 233 | "widget": { 234 | "name": "image_gen_height" 235 | } 236 | } 237 | ], 238 | "outputs": [ 239 | { 240 | "name": "RESOLUTION (INT)", 241 | "type": "INT", 242 | "links": [ 243 | 873 244 | ], 245 | "shape": 3, 246 | "label": "RESOLUTION (INT)", 247 | "slot_index": 0 248 | } 249 | ], 250 | "properties": { 251 | "Node name for S&R": "PixelPerfectResolution" 252 | }, 253 | 
"widgets_values": [ 254 | 800, 255 | 536, 256 | "Just Resize" 257 | ] 258 | }, 259 | { 260 | "id": 36, 261 | "type": "whisper_to_features", 262 | "pos": [ 263 | -7224.752500484394, 264 | -626.0040445507748 265 | ], 266 | "size": { 267 | "0": 342.5999755859375, 268 | "1": 78 269 | }, 270 | "flags": {}, 271 | "order": 20, 272 | "mode": 0, 273 | "inputs": [ 274 | { 275 | "name": "audio_tensor", 276 | "type": "VCAUDIOTENSOR", 277 | "link": 67, 278 | "slot_index": 0, 279 | "label": "audio_tensor" 280 | } 281 | ], 282 | "outputs": [ 283 | { 284 | "name": "whisper_chunks", 285 | "type": "WHISPERFEAT", 286 | "links": [ 287 | 281 288 | ], 289 | "shape": 3, 290 | "slot_index": 0, 291 | "label": "whisper_chunks" 292 | }, 293 | { 294 | "name": "frame_count", 295 | "type": "INT", 296 | "links": [ 297 | 297, 298 | 422 299 | ], 300 | "shape": 3, 301 | "slot_index": 1, 302 | "label": "frame_count" 303 | } 304 | ], 305 | "properties": { 306 | "Node name for S&R": "whisper_to_features" 307 | }, 308 | "widgets_values": [ 309 | 25 310 | ] 311 | }, 312 | { 313 | "id": 526, 314 | "type": "DWPreprocessor", 315 | "pos": [ 316 | -6370.911213012693, 317 | 1083.9297019486887 318 | ], 319 | "size": { 320 | "0": 315, 321 | "1": 198 322 | }, 323 | "flags": {}, 324 | "order": 30, 325 | "mode": 0, 326 | "inputs": [ 327 | { 328 | "name": "image", 329 | "type": "IMAGE", 330 | "link": 1529 331 | }, 332 | { 333 | "name": "resolution", 334 | "type": "INT", 335 | "link": 873, 336 | "widget": { 337 | "name": "resolution" 338 | } 339 | } 340 | ], 341 | "outputs": [ 342 | { 343 | "name": "IMAGE", 344 | "type": "IMAGE", 345 | "links": [], 346 | "shape": 3, 347 | "slot_index": 0 348 | }, 349 | { 350 | "name": "POSE_KEYPOINT", 351 | "type": "POSE_KEYPOINT", 352 | "links": [ 353 | 1776 354 | ], 355 | "shape": 3, 356 | "slot_index": 1 357 | } 358 | ], 359 | "properties": { 360 | "Node name for S&R": "DWPreprocessor" 361 | }, 362 | "widgets_values": [ 363 | "disable", 364 | "disable", 365 | "enable", 366 | 512, 
367 | "yolox_l.torchscript.pt", 368 | "dw-ll_ucoco_384_bs5.torchscript.pt" 369 | ] 370 | }, 371 | { 372 | "id": 1083, 373 | "type": "Reroute", 374 | "pos": [ 375 | -6032.854807912913, 376 | -332.51994679218205 377 | ], 378 | "size": [ 379 | 75, 380 | 26 381 | ], 382 | "flags": {}, 383 | "order": 38, 384 | "mode": 0, 385 | "inputs": [ 386 | { 387 | "name": "", 388 | "type": "*", 389 | "link": 1790 390 | } 391 | ], 392 | "outputs": [ 393 | { 394 | "name": "", 395 | "type": "IMAGE", 396 | "links": [ 397 | 1786 398 | ], 399 | "slot_index": 0 400 | } 401 | ], 402 | "properties": { 403 | "showOutputText": false, 404 | "horizontal": false 405 | } 406 | }, 407 | { 408 | "id": 16, 409 | "type": "ImageCompositeMasked", 410 | "pos": [ 411 | -5506.854807912913, 412 | -418.519946792182 413 | ], 414 | "size": { 415 | "0": 291.8965759277344, 416 | "1": 146 417 | }, 418 | "flags": {}, 419 | "order": 39, 420 | "mode": 0, 421 | "inputs": [ 422 | { 423 | "name": "destination", 424 | "type": "IMAGE", 425 | "link": 1786, 426 | "label": "destination" 427 | }, 428 | { 429 | "name": "source", 430 | "type": "IMAGE", 431 | "link": 626, 432 | "label": "source" 433 | }, 434 | { 435 | "name": "mask", 436 | "type": "MASK", 437 | "link": null, 438 | "label": "mask" 439 | } 440 | ], 441 | "outputs": [ 442 | { 443 | "name": "IMAGE", 444 | "type": "IMAGE", 445 | "links": [ 446 | 1512 447 | ], 448 | "shape": 3, 449 | "slot_index": 0, 450 | "label": "IMAGE" 451 | } 452 | ], 453 | "properties": { 454 | "Node name for S&R": "ImageCompositeMasked" 455 | }, 456 | "widgets_values": [ 457 | 0, 458 | 128, 459 | false 460 | ] 461 | }, 462 | { 463 | "id": 980, 464 | "type": "ReActorRestoreFace", 465 | "pos": [ 466 | -5975.633963343581, 467 | 385.2136881125497 468 | ], 469 | "size": { 470 | "0": 315, 471 | "1": 130 472 | }, 473 | "flags": {}, 474 | "order": 2, 475 | "mode": 4, 476 | "inputs": [ 477 | { 478 | "name": "image", 479 | "type": "IMAGE", 480 | "link": null 481 | } 482 | ], 483 | "outputs": [ 484 | { 
485 | "name": "IMAGE", 486 | "type": "IMAGE", 487 | "links": [], 488 | "shape": 3, 489 | "slot_index": 0 490 | } 491 | ], 492 | "properties": { 493 | "Node name for S&R": "ReActorRestoreFace" 494 | }, 495 | "widgets_values": [ 496 | "retinaface_resnet50", 497 | "GFPGANv1.4.pth", 498 | 1, 499 | 0.5 500 | ] 501 | }, 502 | { 503 | "id": 1055, 504 | "type": "easy imageToMask", 505 | "pos": [ 506 | -3226, 507 | -140 508 | ], 509 | "size": { 510 | "0": 213.45140075683594, 511 | "1": 58 512 | }, 513 | "flags": {}, 514 | "order": 13, 515 | "mode": 0, 516 | "inputs": [ 517 | { 518 | "name": "image", 519 | "type": "IMAGE", 520 | "link": 1704 521 | } 522 | ], 523 | "outputs": [ 524 | { 525 | "name": "MASK", 526 | "type": "MASK", 527 | "links": [ 528 | 1705 529 | ], 530 | "shape": 3, 531 | "slot_index": 0 532 | } 533 | ], 534 | "properties": { 535 | "Node name for S&R": "easy imageToMask" 536 | }, 537 | "widgets_values": [ 538 | "red" 539 | ] 540 | }, 541 | { 542 | "id": 15, 543 | "type": "EmptyImage", 544 | "pos": [ 545 | -5951.311109737003, 546 | -171.4442407796418 547 | ], 548 | "size": { 549 | "0": 315, 550 | "1": 130 551 | }, 552 | "flags": { 553 | "collapsed": false 554 | }, 555 | "order": 3, 556 | "mode": 0, 557 | "outputs": [ 558 | { 559 | "name": "IMAGE", 560 | "type": "IMAGE", 561 | "links": [ 562 | 626 563 | ], 564 | "shape": 3, 565 | "slot_index": 0, 566 | "label": "IMAGE" 567 | } 568 | ], 569 | "properties": { 570 | "Node name for S&R": "EmptyImage" 571 | }, 572 | "widgets_values": [ 573 | 256, 574 | 256, 575 | 1, 576 | 0 577 | ] 578 | }, 579 | { 580 | "id": 124, 581 | "type": "SetNode", 582 | "pos": [ 583 | -7534.198355519126, 584 | -717.8930995909205 585 | ], 586 | "size": { 587 | "0": 210, 588 | "1": 58 589 | }, 590 | "flags": { 591 | "collapsed": false 592 | }, 593 | "order": 15, 594 | "mode": 0, 595 | "inputs": [ 596 | { 597 | "name": "VHS_AUDIO", 598 | "type": "VHS_AUDIO", 599 | "link": 287, 600 | "label": "VHS_AUDIO" 601 | } 602 | ], 603 | "outputs": [ 604 
| { 605 | "name": "*", 606 | "type": "*", 607 | "links": null, 608 | "label": "*" 609 | } 610 | ], 611 | "title": "Set_audio", 612 | "properties": { 613 | "previousName": "audio" 614 | }, 615 | "widgets_values": [ 616 | "audio" 617 | ] 618 | }, 619 | { 620 | "id": 1056, 621 | "type": "MaskToImage", 622 | "pos": [ 623 | -2952, 624 | 272 625 | ], 626 | "size": { 627 | "0": 210, 628 | "1": 26 629 | }, 630 | "flags": {}, 631 | "order": 24, 632 | "mode": 0, 633 | "inputs": [ 634 | { 635 | "name": "mask", 636 | "type": "MASK", 637 | "link": 1706 638 | } 639 | ], 640 | "outputs": [ 641 | { 642 | "name": "IMAGE", 643 | "type": "IMAGE", 644 | "links": [ 645 | 1708, 646 | 1787 647 | ], 648 | "shape": 3, 649 | "slot_index": 0 650 | } 651 | ], 652 | "properties": { 653 | "Node name for S&R": "MaskToImage" 654 | } 655 | }, 656 | { 657 | "id": 1048, 658 | "type": "CR Draw Shape", 659 | "pos": [ 660 | -3597, 661 | -170 662 | ], 663 | "size": { 664 | "0": 315, 665 | "1": 318 666 | }, 667 | "flags": {}, 668 | "order": 4, 669 | "mode": 0, 670 | "outputs": [ 671 | { 672 | "name": "IMAGE", 673 | "type": "IMAGE", 674 | "links": [ 675 | 1704 676 | ], 677 | "shape": 3, 678 | "slot_index": 0 679 | }, 680 | { 681 | "name": "show_help", 682 | "type": "STRING", 683 | "links": null, 684 | "shape": 3 685 | } 686 | ], 687 | "properties": { 688 | "Node name for S&R": "CR Draw Shape" 689 | }, 690 | "widgets_values": [ 691 | 256, 692 | 256, 693 | "half circle", 694 | "white", 695 | "custom", 696 | 0, 697 | 0, 698 | 0.98, 699 | 0, 700 | "#000000", 701 | "#000000" 702 | ] 703 | }, 704 | { 705 | "id": 129, 706 | "type": "Display Any (rgthree)", 707 | "pos": [ 708 | -6840.198355519126, 709 | -480.89309959092043 710 | ], 711 | "size": { 712 | "0": 226.42002868652344, 713 | "1": 116.54998779296875 714 | }, 715 | "flags": {}, 716 | "order": 25, 717 | "mode": 0, 718 | "inputs": [ 719 | { 720 | "name": "source", 721 | "type": "*", 722 | "link": 297, 723 | "dir": 3, 724 | "label": "source" 725 | } 726 | ], 
727 | "properties": { 728 | "Node name for S&R": "Display Any (rgthree)" 729 | }, 730 | "widgets_values": [ 731 | "" 732 | ] 733 | }, 734 | { 735 | "id": 1092, 736 | "type": "ImageConcanate", 737 | "pos": [ 738 | -1414.0301884973185, 739 | 783.7210591809077 740 | ], 741 | "size": { 742 | "0": 315, 743 | "1": 102 744 | }, 745 | "flags": {}, 746 | "order": 49, 747 | "mode": 0, 748 | "inputs": [ 749 | { 750 | "name": "image1", 751 | "type": "IMAGE", 752 | "link": 1826 753 | }, 754 | { 755 | "name": "image2", 756 | "type": "IMAGE", 757 | "link": 1799 758 | } 759 | ], 760 | "outputs": [ 761 | { 762 | "name": "IMAGE", 763 | "type": "IMAGE", 764 | "links": [ 765 | 1806 766 | ], 767 | "shape": 3, 768 | "slot_index": 0 769 | } 770 | ], 771 | "properties": { 772 | "Node name for S&R": "ImageConcanate" 773 | }, 774 | "widgets_values": [ 775 | "right", 776 | false 777 | ] 778 | }, 779 | { 780 | "id": 1094, 781 | "type": "GetNode", 782 | "pos": [ 783 | -1384.0301884973185, 784 | 1034.7210591809078 785 | ], 786 | "size": { 787 | "0": 210, 788 | "1": 58 789 | }, 790 | "flags": { 791 | "collapsed": false 792 | }, 793 | "order": 5, 794 | "mode": 0, 795 | "outputs": [ 796 | { 797 | "name": "VHS_AUDIO", 798 | "type": "VHS_AUDIO", 799 | "links": [ 800 | 1801 801 | ], 802 | "slot_index": 0, 803 | "label": "VHS_AUDIO" 804 | } 805 | ], 806 | "title": "Get_audio", 807 | "properties": {}, 808 | "widgets_values": [ 809 | "audio" 810 | ] 811 | }, 812 | { 813 | "id": 1108, 814 | "type": "MuseTalkPostprocess", 815 | "pos": [ 816 | -2876, 817 | 788 818 | ], 819 | "size": { 820 | "0": 315, 821 | "1": 170 822 | }, 823 | "flags": {}, 824 | "order": 43, 825 | "mode": 0, 826 | "inputs": [ 827 | { 828 | "name": "origin_images", 829 | "type": "IMAGE", 830 | "link": 1823 831 | }, 832 | { 833 | "name": "musetalk_faces", 834 | "type": "IMAGE", 835 | "link": 1824 836 | }, 837 | { 838 | "name": "uncrop_mask", 839 | "type": "IMAGE", 840 | "link": 1825 841 | } 842 | ], 843 | "outputs": [ 844 | { 845 | 
"name": "images", 846 | "type": "IMAGE", 847 | "links": [ 848 | 1826, 849 | 1827 850 | ], 851 | "shape": 3, 852 | "slot_index": 0 853 | } 854 | ], 855 | "properties": { 856 | "Node name for S&R": "MuseTalkPostprocess" 857 | }, 858 | "widgets_values": [ 859 | 0, 860 | 0, 861 | -5, 862 | 5 863 | ] 864 | }, 865 | { 866 | "id": 1051, 867 | "type": "PreviewImage", 868 | "pos": [ 869 | -2469, 870 | -178 871 | ], 872 | "size": { 873 | "0": 210, 874 | "1": 246 875 | }, 876 | "flags": {}, 877 | "order": 28, 878 | "mode": 0, 879 | "inputs": [ 880 | { 881 | "name": "images", 882 | "type": "IMAGE", 883 | "link": 1708 884 | } 885 | ], 886 | "properties": { 887 | "Node name for S&R": "PreviewImage" 888 | } 889 | }, 890 | { 891 | "id": 122, 892 | "type": "muse_talk_sampler", 893 | "pos": [ 894 | -4927.311109737003, 895 | -911.4442407796415 896 | ], 897 | "size": { 898 | "0": 315, 899 | "1": 162 900 | }, 901 | "flags": {}, 902 | "order": 40, 903 | "mode": 0, 904 | "inputs": [ 905 | { 906 | "name": "model", 907 | "type": "MODEL", 908 | "link": 280, 909 | "label": "model" 910 | }, 911 | { 912 | "name": "vae", 913 | "type": "VAE", 914 | "link": 279, 915 | "slot_index": 1, 916 | "label": "vae" 917 | }, 918 | { 919 | "name": "whisper_features", 920 | "type": "WHISPERFEAT", 921 | "link": 281, 922 | "slot_index": 2, 923 | "label": "whisper_features" 924 | }, 925 | { 926 | "name": "images", 927 | "type": "IMAGE", 928 | "link": 1559, 929 | "slot_index": 3, 930 | "label": "images" 931 | }, 932 | { 933 | "name": "masked_images", 934 | "type": "IMAGE", 935 | "link": 1512, 936 | "slot_index": 4, 937 | "label": "masked_images" 938 | } 939 | ], 940 | "outputs": [ 941 | { 942 | "name": "image", 943 | "type": "IMAGE", 944 | "links": [ 945 | 1379, 946 | 1575, 947 | 1824 948 | ], 949 | "shape": 3, 950 | "slot_index": 0, 951 | "label": "image" 952 | } 953 | ], 954 | "properties": { 955 | "Node name for S&R": "muse_talk_sampler" 956 | }, 957 | "widgets_values": [ 958 | 1, 959 | 0 960 | ] 961 | }, 962 
| { 963 | "id": 735, 964 | "type": "Display Any (rgthree)", 965 | "pos": [ 966 | -7238, 967 | 1339 968 | ], 969 | "size": { 970 | "0": 226.42002868652344, 971 | "1": 116.54998779296875 972 | }, 973 | "flags": {}, 974 | "order": 23, 975 | "mode": 0, 976 | "inputs": [ 977 | { 978 | "name": "source", 979 | "type": "*", 980 | "link": 1165, 981 | "dir": 3, 982 | "label": "source" 983 | } 984 | ], 985 | "properties": { 986 | "Node name for S&R": "Display Any (rgthree)" 987 | }, 988 | "widgets_values": [ 989 | "" 990 | ] 991 | }, 992 | { 993 | "id": 1091, 994 | "type": "MuseTalkPostprocess", 995 | "pos": [ 996 | -3011, 997 | 1271 998 | ], 999 | "size": { 1000 | "0": 380.4000244140625, 1001 | "1": 190 1002 | }, 1003 | "flags": {}, 1004 | "order": 47, 1005 | "mode": 0, 1006 | "inputs": [ 1007 | { 1008 | "name": "origin_images", 1009 | "type": "IMAGE", 1010 | "link": 1822 1011 | }, 1012 | { 1013 | "name": "musetalk_faces", 1014 | "type": "IMAGE", 1015 | "link": 1848 1016 | }, 1017 | { 1018 | "name": "uncrop_mask", 1019 | "type": "IMAGE", 1020 | "link": 1796 1021 | } 1022 | ], 1023 | "outputs": [ 1024 | { 1025 | "name": "images", 1026 | "type": "IMAGE", 1027 | "links": [ 1028 | 1797, 1029 | 1799 1030 | ], 1031 | "shape": 3, 1032 | "slot_index": 0 1033 | }, 1034 | { 1035 | "name": "uncrop_masks", 1036 | "type": "IMAGE", 1037 | "links": null, 1038 | "shape": 3 1039 | }, 1040 | { 1041 | "name": "uncroped_images", 1042 | "type": "IMAGE", 1043 | "links": null, 1044 | "shape": 3 1045 | }, 1046 | { 1047 | "name": "face_masks", 1048 | "type": "IMAGE", 1049 | "links": null, 1050 | "shape": 3 1051 | } 1052 | ], 1053 | "properties": { 1054 | "Node name for S&R": "MuseTalkPostprocess" 1055 | }, 1056 | "widgets_values": [ 1057 | 0, 1058 | 0, 1059 | -5, 1060 | 5 1061 | ] 1062 | }, 1063 | { 1064 | "id": 1080, 1065 | "type": "MuseTalkPreprocess", 1066 | "pos": [ 1067 | -5926, 1068 | 900 1069 | ], 1070 | "size": { 1071 | "0": 354.3999938964844, 1072 | "1": 174 1073 | }, 1074 | "flags": {}, 
1075 | "order": 31, 1076 | "mode": 0, 1077 | "inputs": [ 1078 | { 1079 | "name": "origin_images", 1080 | "type": "IMAGE", 1081 | "link": 1775 1082 | }, 1083 | { 1084 | "name": "pose_kps", 1085 | "type": "POSE_KEYPOINT", 1086 | "link": 1776 1087 | } 1088 | ], 1089 | "outputs": [ 1090 | { 1091 | "name": "rotated_faces", 1092 | "type": "IMAGE", 1093 | "links": [ 1094 | 1778, 1095 | 1780, 1096 | 1784 1097 | ], 1098 | "shape": 3, 1099 | "slot_index": 0 1100 | }, 1101 | { 1102 | "name": "rotated_faces_with_landmarks", 1103 | "type": "IMAGE", 1104 | "links": [ 1105 | 1779 1106 | ], 1107 | "shape": 3, 1108 | "slot_index": 1 1109 | } 1110 | ], 1111 | "properties": { 1112 | "Node name for S&R": "MuseTalkPreprocess" 1113 | }, 1114 | "widgets_values": [ 1115 | "full", 1116 | 0, 1117 | 8, 1118 | 0, 1119 | 0 1120 | ] 1121 | }, 1122 | { 1123 | "id": 1090, 1124 | "type": "JWImageResize", 1125 | "pos": [ 1126 | -4220, 1127 | 459 1128 | ], 1129 | "size": { 1130 | "0": 315, 1131 | "1": 106 1132 | }, 1133 | "flags": {}, 1134 | "order": 44, 1135 | "mode": 0, 1136 | "inputs": [ 1137 | { 1138 | "name": "image", 1139 | "type": "IMAGE", 1140 | "link": 1791 1141 | } 1142 | ], 1143 | "outputs": [ 1144 | { 1145 | "name": "IMAGE", 1146 | "type": "IMAGE", 1147 | "links": [ 1148 | 1792, 1149 | 1848 1150 | ], 1151 | "shape": 3, 1152 | "slot_index": 0 1153 | } 1154 | ], 1155 | "properties": { 1156 | "Node name for S&R": "JWImageResize" 1157 | }, 1158 | "widgets_values": [ 1159 | 256, 1160 | 256, 1161 | "nearest" 1162 | ] 1163 | }, 1164 | { 1165 | "id": 28, 1166 | "type": "VHS_LoadAudio", 1167 | "pos": [ 1168 | -8000, 1169 | -590 1170 | ], 1171 | "size": { 1172 | "0": 315, 1173 | "1": 82 1174 | }, 1175 | "flags": {}, 1176 | "order": 6, 1177 | "mode": 0, 1178 | "outputs": [ 1179 | { 1180 | "name": "audio", 1181 | "type": "VHS_AUDIO", 1182 | "links": [ 1183 | 45, 1184 | 287 1185 | ], 1186 | "shape": 3, 1187 | "slot_index": 0, 1188 | "label": "audio" 1189 | } 1190 | ], 1191 | "properties": { 1192 | 
"Node name for S&R": "VHS_LoadAudio" 1193 | }, 1194 | "widgets_values": { 1195 | "audio_file": "C:\\Users\\Administrator\\Videos\\视频素材\\xiwang.wav", 1196 | "seek_seconds": 0 1197 | } 1198 | }, 1199 | { 1200 | "id": 947, 1201 | "type": "VHS_LoadVideo", 1202 | "pos": [ 1203 | -8396, 1204 | 870 1205 | ], 1206 | "size": [ 1207 | 240, 1208 | 643.1111111111111 1209 | ], 1210 | "flags": {}, 1211 | "order": 12, 1212 | "mode": 0, 1213 | "inputs": [ 1214 | { 1215 | "name": "batch_manager", 1216 | "type": "VHS_BatchManager", 1217 | "link": null 1218 | }, 1219 | { 1220 | "name": "frame_load_cap", 1221 | "type": "INT", 1222 | "link": 1761, 1223 | "widget": { 1224 | "name": "frame_load_cap" 1225 | } 1226 | } 1227 | ], 1228 | "outputs": [ 1229 | { 1230 | "name": "IMAGE", 1231 | "type": "IMAGE", 1232 | "links": [ 1233 | 1849 1234 | ], 1235 | "shape": 3, 1236 | "slot_index": 0 1237 | }, 1238 | { 1239 | "name": "frame_count", 1240 | "type": "INT", 1241 | "links": [ 1242 | 1515 1243 | ], 1244 | "shape": 3, 1245 | "slot_index": 1 1246 | }, 1247 | { 1248 | "name": "audio", 1249 | "type": "VHS_AUDIO", 1250 | "links": null, 1251 | "shape": 3, 1252 | "slot_index": 2 1253 | }, 1254 | { 1255 | "name": "video_info", 1256 | "type": "VHS_VIDEOINFO", 1257 | "links": [ 1258 | 1516 1259 | ], 1260 | "shape": 3, 1261 | "slot_index": 3 1262 | } 1263 | ], 1264 | "properties": { 1265 | "Node name for S&R": "VHS_LoadVideo" 1266 | }, 1267 | "widgets_values": { 1268 | "video": "2.mp4", 1269 | "force_rate": 25, 1270 | "force_size": "Disabled", 1271 | "custom_width": 512, 1272 | "custom_height": 512, 1273 | "frame_load_cap": 0, 1274 | "skip_first_frames": 0, 1275 | "select_every_nth": 1, 1276 | "choose video to upload": "image", 1277 | "videopreview": { 1278 | "hidden": false, 1279 | "paused": false, 1280 | "params": { 1281 | "frame_load_cap": 0, 1282 | "skip_first_frames": 0, 1283 | "force_rate": 25, 1284 | "filename": "2.mp4", 1285 | "type": "input", 1286 | "format": "video/mp4", 1287 | 
"select_every_nth": 1 1288 | } 1289 | } 1290 | } 1291 | }, 1292 | { 1293 | "id": 958, 1294 | "type": "Reroute", 1295 | "pos": [ 1296 | -6798, 1297 | 748 1298 | ], 1299 | "size": [ 1300 | 75, 1301 | 26 1302 | ], 1303 | "flags": {}, 1304 | "order": 16, 1305 | "mode": 0, 1306 | "inputs": [ 1307 | { 1308 | "name": "", 1309 | "type": "*", 1310 | "link": 1849 1311 | } 1312 | ], 1313 | "outputs": [ 1314 | { 1315 | "name": "", 1316 | "type": "IMAGE", 1317 | "links": [ 1318 | 1529, 1319 | 1530, 1320 | 1531, 1321 | 1775, 1322 | 1828 1323 | ], 1324 | "slot_index": 0 1325 | } 1326 | ], 1327 | "properties": { 1328 | "showOutputText": false, 1329 | "horizontal": false 1330 | } 1331 | }, 1332 | { 1333 | "id": 97, 1334 | "type": "Display Any (rgthree)", 1335 | "pos": [ 1336 | -7762, 1337 | 1341 1338 | ], 1339 | "size": { 1340 | "0": 226.42002868652344, 1341 | "1": 116.54998779296875 1342 | }, 1343 | "flags": {}, 1344 | "order": 17, 1345 | "mode": 0, 1346 | "inputs": [ 1347 | { 1348 | "name": "source", 1349 | "type": "*", 1350 | "link": 1515, 1351 | "dir": 3, 1352 | "label": "source" 1353 | } 1354 | ], 1355 | "properties": { 1356 | "Node name for S&R": "Display Any (rgthree)" 1357 | }, 1358 | "widgets_values": [ 1359 | "" 1360 | ] 1361 | }, 1362 | { 1363 | "id": 734, 1364 | "type": "VHS_VideoInfo", 1365 | "pos": [ 1366 | -7816, 1367 | 1039 1368 | ], 1369 | "size": { 1370 | "0": 393, 1371 | "1": 206 1372 | }, 1373 | "flags": {}, 1374 | "order": 18, 1375 | "mode": 0, 1376 | "inputs": [ 1377 | { 1378 | "name": "video_info", 1379 | "type": "VHS_VIDEOINFO", 1380 | "link": 1516 1381 | } 1382 | ], 1383 | "outputs": [ 1384 | { 1385 | "name": "source_fps🟨", 1386 | "type": "FLOAT", 1387 | "links": [ 1388 | 1165 1389 | ], 1390 | "shape": 3, 1391 | "slot_index": 0 1392 | }, 1393 | { 1394 | "name": "source_frame_count🟨", 1395 | "type": "INT", 1396 | "links": null, 1397 | "shape": 3 1398 | }, 1399 | { 1400 | "name": "source_duration🟨", 1401 | "type": "FLOAT", 1402 | "links": null, 1403 | 
"shape": 3 1404 | }, 1405 | { 1406 | "name": "source_width🟨", 1407 | "type": "INT", 1408 | "links": null, 1409 | "shape": 3 1410 | }, 1411 | { 1412 | "name": "source_height🟨", 1413 | "type": "INT", 1414 | "links": null, 1415 | "shape": 3 1416 | }, 1417 | { 1418 | "name": "loaded_fps🟦", 1419 | "type": "FLOAT", 1420 | "links": null, 1421 | "shape": 3 1422 | }, 1423 | { 1424 | "name": "loaded_frame_count🟦", 1425 | "type": "INT", 1426 | "links": null, 1427 | "shape": 3 1428 | }, 1429 | { 1430 | "name": "loaded_duration🟦", 1431 | "type": "FLOAT", 1432 | "links": null, 1433 | "shape": 3 1434 | }, 1435 | { 1436 | "name": "loaded_width🟦", 1437 | "type": "INT", 1438 | "links": null, 1439 | "shape": 3 1440 | }, 1441 | { 1442 | "name": "loaded_height🟦", 1443 | "type": "INT", 1444 | "links": null, 1445 | "shape": 3 1446 | } 1447 | ], 1448 | "properties": { 1449 | "Node name for S&R": "VHS_VideoInfo" 1450 | }, 1451 | "widgets_values": {} 1452 | }, 1453 | { 1454 | "id": 4, 1455 | "type": "VAELoader", 1456 | "pos": [ 1457 | -5971.280013365008, 1458 | -734.1600439000975 1459 | ], 1460 | "size": { 1461 | "0": 389.75921630859375, 1462 | "1": 58 1463 | }, 1464 | "flags": {}, 1465 | "order": 7, 1466 | "mode": 0, 1467 | "outputs": [ 1468 | { 1469 | "name": "VAE", 1470 | "type": "VAE", 1471 | "links": [ 1472 | 279 1473 | ], 1474 | "shape": 3, 1475 | "slot_index": 0, 1476 | "label": "VAE" 1477 | } 1478 | ], 1479 | "properties": { 1480 | "Node name for S&R": "VAELoader" 1481 | }, 1482 | "widgets_values": [ 1483 | "vae-ft-mse-840000-ema-pruned.safetensors" 1484 | ] 1485 | }, 1486 | { 1487 | "id": 121, 1488 | "type": "UNETLoader_MuseTalk", 1489 | "pos": [ 1490 | -5884.311109737003, 1491 | -986.4442407796415 1492 | ], 1493 | "size": { 1494 | "0": 214.1832275390625, 1495 | "1": 58 1496 | }, 1497 | "flags": {}, 1498 | "order": 8, 1499 | "mode": 0, 1500 | "outputs": [ 1501 | { 1502 | "name": "MODEL", 1503 | "type": "MODEL", 1504 | "links": [ 1505 | 280 1506 | ], 1507 | "shape": 3, 1508 | 
"slot_index": 0, 1509 | "label": "MODEL" 1510 | } 1511 | ], 1512 | "properties": { 1513 | "Node name for S&R": "UNETLoader_MuseTalk" 1514 | }, 1515 | "widgets_values": [ 1516 | "pytorch_model.bin" 1517 | ] 1518 | }, 1519 | { 1520 | "id": 979, 1521 | "type": "JWImageResize", 1522 | "pos": [ 1523 | -6410, 1524 | 316 1525 | ], 1526 | "size": { 1527 | "0": 315, 1528 | "1": 106 1529 | }, 1530 | "flags": {}, 1531 | "order": 37, 1532 | "mode": 4, 1533 | "inputs": [ 1534 | { 1535 | "name": "image", 1536 | "type": "IMAGE", 1537 | "link": 1587 1538 | } 1539 | ], 1540 | "outputs": [ 1541 | { 1542 | "name": "IMAGE", 1543 | "type": "IMAGE", 1544 | "links": [ 1545 | 1790 1546 | ], 1547 | "shape": 3, 1548 | "slot_index": 0 1549 | } 1550 | ], 1551 | "properties": { 1552 | "Node name for S&R": "JWImageResize" 1553 | }, 1554 | "widgets_values": [ 1555 | 256, 1556 | 256, 1557 | "nearest" 1558 | ] 1559 | }, 1560 | { 1561 | "id": 866, 1562 | "type": "FaceEnhancement", 1563 | "pos": [ 1564 | -4240, 1565 | 310 1566 | ], 1567 | "size": { 1568 | "0": 300.08770751953125, 1569 | "1": 73.68206024169922 1570 | }, 1571 | "flags": {}, 1572 | "order": 42, 1573 | "mode": 0, 1574 | "inputs": [ 1575 | { 1576 | "name": "images", 1577 | "type": "IMAGE", 1578 | "link": 1575 1579 | } 1580 | ], 1581 | "outputs": [ 1582 | { 1583 | "name": "images", 1584 | "type": "IMAGE", 1585 | "links": [ 1586 | 1791 1587 | ], 1588 | "shape": 3, 1589 | "slot_index": 0 1590 | } 1591 | ], 1592 | "properties": { 1593 | "Node name for S&R": "FaceEnhancement" 1594 | } 1595 | }, 1596 | { 1597 | "id": 911, 1598 | "type": "GetNode", 1599 | "pos": [ 1600 | -2844, 1601 | 1067 1602 | ], 1603 | "size": { 1604 | "0": 210, 1605 | "1": 58 1606 | }, 1607 | "flags": { 1608 | "collapsed": false 1609 | }, 1610 | "order": 9, 1611 | "mode": 0, 1612 | "outputs": [ 1613 | { 1614 | "name": "VHS_AUDIO", 1615 | "type": "VHS_AUDIO", 1616 | "links": [ 1617 | 1460, 1618 | 1570 1619 | ], 1620 | "slot_index": 0, 1621 | "label": "VHS_AUDIO" 1622 | } 
1623 | ], 1624 | "title": "Get_audio", 1625 | "properties": {}, 1626 | "widgets_values": [ 1627 | "audio" 1628 | ] 1629 | }, 1630 | { 1631 | "id": 236, 1632 | "type": "INTConstant", 1633 | "pos": [ 1634 | -8753, 1635 | 1060 1636 | ], 1637 | "size": { 1638 | "0": 224.5601348876953, 1639 | "1": 58 1640 | }, 1641 | "flags": { 1642 | "collapsed": false 1643 | }, 1644 | "order": 10, 1645 | "mode": 0, 1646 | "outputs": [ 1647 | { 1648 | "name": "value", 1649 | "type": "INT", 1650 | "links": [], 1651 | "shape": 3, 1652 | "slot_index": 0 1653 | } 1654 | ], 1655 | "properties": { 1656 | "Node name for S&R": "INTConstant" 1657 | }, 1658 | "widgets_values": [ 1659 | 100 1660 | ], 1661 | "color": "#1b4669", 1662 | "bgcolor": "#29699c" 1663 | }, 1664 | { 1665 | "id": 1084, 1666 | "type": "Reroute", 1667 | "pos": [ 1668 | -3254, 1669 | 496 1670 | ], 1671 | "size": [ 1672 | 75, 1673 | 26 1674 | ], 1675 | "flags": {}, 1676 | "order": 29, 1677 | "mode": 0, 1678 | "inputs": [ 1679 | { 1680 | "name": "", 1681 | "type": "*", 1682 | "link": 1787 1683 | } 1684 | ], 1685 | "outputs": [ 1686 | { 1687 | "name": "", 1688 | "type": "IMAGE", 1689 | "links": [ 1690 | 1796, 1691 | 1825 1692 | ], 1693 | "slot_index": 0 1694 | } 1695 | ], 1696 | "properties": { 1697 | "showOutputText": false, 1698 | "horizontal": false 1699 | } 1700 | }, 1701 | { 1702 | "id": 913, 1703 | "type": "GetNode", 1704 | "pos": [ 1705 | -4285, 1706 | 938 1707 | ], 1708 | "size": { 1709 | "0": 210, 1710 | "1": 58 1711 | }, 1712 | "flags": { 1713 | "collapsed": false 1714 | }, 1715 | "order": 11, 1716 | "mode": 0, 1717 | "outputs": [ 1718 | { 1719 | "name": "VHS_AUDIO", 1720 | "type": "VHS_AUDIO", 1721 | "links": [ 1722 | 1462 1723 | ], 1724 | "slot_index": 0, 1725 | "label": "VHS_AUDIO" 1726 | } 1727 | ], 1728 | "title": "Get_audio", 1729 | "properties": {}, 1730 | "widgets_values": [ 1731 | "audio" 1732 | ] 1733 | }, 1734 | { 1735 | "id": 1107, 1736 | "type": "Reroute", 1737 | "pos": [ 1738 | -3847, 1739 | 1582 1740 | ], 
1741 | "size": [ 1742 | 75, 1743 | 26 1744 | ], 1745 | "flags": {}, 1746 | "order": 22, 1747 | "mode": 0, 1748 | "inputs": [ 1749 | { 1750 | "name": "", 1751 | "type": "*", 1752 | "link": 1828 1753 | } 1754 | ], 1755 | "outputs": [ 1756 | { 1757 | "name": "", 1758 | "type": "IMAGE", 1759 | "links": [ 1760 | 1822, 1761 | 1823 1762 | ], 1763 | "slot_index": 0 1764 | } 1765 | ], 1766 | "properties": { 1767 | "showOutputText": false, 1768 | "horizontal": false 1769 | } 1770 | }, 1771 | { 1772 | "id": 912, 1773 | "type": "VHS_VideoCombine", 1774 | "pos": [ 1775 | -3994, 1776 | 840 1777 | ], 1778 | "size": [ 1779 | 320, 1780 | 290 1781 | ], 1782 | "flags": {}, 1783 | "order": 46, 1784 | "mode": 0, 1785 | "inputs": [ 1786 | { 1787 | "name": "images", 1788 | "type": "IMAGE", 1789 | "link": 1792 1790 | }, 1791 | { 1792 | "name": "audio", 1793 | "type": "VHS_AUDIO", 1794 | "link": 1462 1795 | }, 1796 | { 1797 | "name": "batch_manager", 1798 | "type": "VHS_BatchManager", 1799 | "link": null 1800 | } 1801 | ], 1802 | "outputs": [ 1803 | { 1804 | "name": "Filenames", 1805 | "type": "VHS_FILENAMES", 1806 | "links": null, 1807 | "shape": 3 1808 | } 1809 | ], 1810 | "properties": { 1811 | "Node name for S&R": "VHS_VideoCombine" 1812 | }, 1813 | "widgets_values": { 1814 | "frame_rate": 25, 1815 | "loop_count": 0, 1816 | "filename_prefix": "AnimateDiff", 1817 | "format": "video/h264-mp4", 1818 | "pix_fmt": "yuv420p", 1819 | "crf": 19, 1820 | "save_metadata": false, 1821 | "pingpong": false, 1822 | "save_output": false, 1823 | "videopreview": { 1824 | "hidden": false, 1825 | "paused": false, 1826 | "params": { 1827 | "filename": "AnimateDiff_00004-audio.mp4", 1828 | "subfolder": "", 1829 | "type": "temp", 1830 | "format": "video/h264-mp4" 1831 | } 1832 | } 1833 | } 1834 | }, 1835 | { 1836 | "id": 1054, 1837 | "type": "GrowMaskWithBlur", 1838 | "pos": [ 1839 | -2952, 1840 | -205 1841 | ], 1842 | "size": { 1843 | "0": 315, 1844 | "1": 246 1845 | }, 1846 | "flags": {}, 1847 | "order": 
19, 1848 | "mode": 0, 1849 | "inputs": [ 1850 | { 1851 | "name": "mask", 1852 | "type": "MASK", 1853 | "link": 1705 1854 | } 1855 | ], 1856 | "outputs": [ 1857 | { 1858 | "name": "mask", 1859 | "type": "MASK", 1860 | "links": [ 1861 | 1706 1862 | ], 1863 | "shape": 3, 1864 | "slot_index": 0 1865 | }, 1866 | { 1867 | "name": "mask_inverted", 1868 | "type": "MASK", 1869 | "links": null, 1870 | "shape": 3 1871 | } 1872 | ], 1873 | "properties": { 1874 | "Node name for S&R": "GrowMaskWithBlur" 1875 | }, 1876 | "widgets_values": [ 1877 | -5, 1878 | 0, 1879 | true, 1880 | false, 1881 | 2, 1882 | 1, 1883 | 1, 1884 | false 1885 | ] 1886 | }, 1887 | { 1888 | "id": 971, 1889 | "type": "ImageFilterGaussianBlur", 1890 | "size": { 1891 | "0": 315, 1892 | "1": 82 1893 | }, 1894 | "flags": {}, 1895 | "mode": 4, 1896 | "inputs": [ 1897 | { 1898 | "name": "images", 1899 | "type": "IMAGE", 1900 | "link": 1780 1901 | } 1902 | ], 1903 | "outputs": [ 1904 | { 1905 | "name": "IMAGE", 1906 | "type": "IMAGE", 1907 | "links": [ 1908 | 1558, 1909 | 1559 1910 | ], 1911 | "shape": 3, 1912 | "slot_index": 0 1913 | } 1914 | ], 1915 | "properties": { 1916 | "Node name for S&R": "ImageFilterGaussianBlur" 1917 | }, 1918 | "widgets_values": [ 1919 | 10, 1920 | 10 1921 | ], 1922 | "order": 34, 1923 | "pos": [ 1924 | -5311, 1925 | 410 1926 | ] 1927 | }, 1928 | { 1929 | "id": 833, 1930 | "type": "VHS_VideoCombine", 1931 | "pos": [ 1932 | -5418, 1933 | 1001 1934 | ], 1935 | "size": [ 1936 | 320, 1937 | 290 1938 | ], 1939 | "flags": {}, 1940 | "order": 35, 1941 | "mode": 0, 1942 | "inputs": [ 1943 | { 1944 | "name": "images", 1945 | "type": "IMAGE", 1946 | "link": 1779 1947 | }, 1948 | { 1949 | "name": "audio", 1950 | "type": "VHS_AUDIO", 1951 | "link": null 1952 | }, 1953 | { 1954 | "name": "batch_manager", 1955 | "type": "VHS_BatchManager", 1956 | "link": null 1957 | } 1958 | ], 1959 | "outputs": [ 1960 | { 1961 | "name": "Filenames", 1962 | "type": "VHS_FILENAMES", 1963 | "links": null, 1964 | 
"shape": 3 1965 | } 1966 | ], 1967 | "properties": { 1968 | "Node name for S&R": "VHS_VideoCombine" 1969 | }, 1970 | "widgets_values": { 1971 | "frame_rate": 25, 1972 | "loop_count": 0, 1973 | "filename_prefix": "musetalk", 1974 | "format": "video/h264-mp4", 1975 | "pix_fmt": "yuv420p", 1976 | "crf": 19, 1977 | "save_metadata": false, 1978 | "pingpong": false, 1979 | "save_output": false, 1980 | "videopreview": { 1981 | "hidden": false, 1982 | "paused": false, 1983 | "params": { 1984 | "filename": "AnimateDiff_00001.mp4", 1985 | "subfolder": "", 1986 | "type": "temp", 1987 | "format": "video/h264-mp4" 1988 | } 1989 | } 1990 | } 1991 | }, 1992 | { 1993 | "id": 844, 1994 | "type": "VHS_VideoCombine", 1995 | "pos": [ 1996 | -4933, 1997 | 1001 1998 | ], 1999 | "size": [ 2000 | 320, 2001 | 290 2002 | ], 2003 | "flags": {}, 2004 | "order": 32, 2005 | "mode": 0, 2006 | "inputs": [ 2007 | { 2008 | "name": "images", 2009 | "type": "IMAGE", 2010 | "link": 1778 2011 | }, 2012 | { 2013 | "name": "audio", 2014 | "type": "VHS_AUDIO", 2015 | "link": null 2016 | }, 2017 | { 2018 | "name": "batch_manager", 2019 | "type": "VHS_BatchManager", 2020 | "link": null 2021 | } 2022 | ], 2023 | "outputs": [ 2024 | { 2025 | "name": "Filenames", 2026 | "type": "VHS_FILENAMES", 2027 | "links": null, 2028 | "shape": 3 2029 | } 2030 | ], 2031 | "properties": { 2032 | "Node name for S&R": "VHS_VideoCombine" 2033 | }, 2034 | "widgets_values": { 2035 | "frame_rate": 25, 2036 | "loop_count": 0, 2037 | "filename_prefix": "musetalk", 2038 | "format": "video/h264-mp4", 2039 | "pix_fmt": "yuv420p", 2040 | "crf": 19, 2041 | "save_metadata": false, 2042 | "pingpong": false, 2043 | "save_output": false, 2044 | "videopreview": { 2045 | "hidden": false, 2046 | "paused": false, 2047 | "params": { 2048 | "filename": "AnimateDiff_00002.mp4", 2049 | "subfolder": "", 2050 | "type": "temp", 2051 | "format": "video/h264-mp4" 2052 | } 2053 | } 2054 | } 2055 | }, 2056 | { 2057 | "id": 970, 2058 | "type": 
"VHS_VideoCombine", 2059 | "pos": [ 2060 | -4794.311109737003, 2061 | -480.4442407796421 2062 | ], 2063 | "size": [ 2064 | 320, 2065 | 290 2066 | ], 2067 | "flags": {}, 2068 | "order": 36, 2069 | "mode": 0, 2070 | "inputs": [ 2071 | { 2072 | "name": "images", 2073 | "type": "IMAGE", 2074 | "link": 1558 2075 | }, 2076 | { 2077 | "name": "audio", 2078 | "type": "VHS_AUDIO", 2079 | "link": null 2080 | }, 2081 | { 2082 | "name": "batch_manager", 2083 | "type": "VHS_BatchManager", 2084 | "link": null 2085 | } 2086 | ], 2087 | "outputs": [ 2088 | { 2089 | "name": "Filenames", 2090 | "type": "VHS_FILENAMES", 2091 | "links": null, 2092 | "shape": 3 2093 | } 2094 | ], 2095 | "properties": { 2096 | "Node name for S&R": "VHS_VideoCombine" 2097 | }, 2098 | "widgets_values": { 2099 | "frame_rate": 25, 2100 | "loop_count": 0, 2101 | "filename_prefix": "musetalk", 2102 | "format": "video/h264-mp4", 2103 | "pix_fmt": "yuv420p", 2104 | "crf": 19, 2105 | "save_metadata": false, 2106 | "pingpong": false, 2107 | "save_output": false, 2108 | "videopreview": { 2109 | "hidden": false, 2110 | "paused": false, 2111 | "params": { 2112 | "filename": "AnimateDiff_00003.mp4", 2113 | "subfolder": "", 2114 | "type": "temp", 2115 | "format": "video/h264-mp4" 2116 | } 2117 | } 2118 | } 2119 | }, 2120 | { 2121 | "id": 842, 2122 | "type": "VHS_VideoCombine", 2123 | "pos": [ 2124 | -4134.311109737003, 2125 | -941.4442407796415 2126 | ], 2127 | "size": [ 2128 | 320, 2129 | 604 2130 | ], 2131 | "flags": {}, 2132 | "order": 41, 2133 | "mode": 0, 2134 | "inputs": [ 2135 | { 2136 | "name": "images", 2137 | "type": "IMAGE", 2138 | "link": 1379 2139 | }, 2140 | { 2141 | "name": "audio", 2142 | "type": "VHS_AUDIO", 2143 | "link": 1380 2144 | }, 2145 | { 2146 | "name": "batch_manager", 2147 | "type": "VHS_BatchManager", 2148 | "link": null 2149 | } 2150 | ], 2151 | "outputs": [ 2152 | { 2153 | "name": "Filenames", 2154 | "type": "VHS_FILENAMES", 2155 | "links": null, 2156 | "shape": 3, 2157 | "slot_index": 0 
2158 | } 2159 | ], 2160 | "properties": { 2161 | "Node name for S&R": "VHS_VideoCombine" 2162 | }, 2163 | "widgets_values": { 2164 | "frame_rate": 25, 2165 | "loop_count": 0, 2166 | "filename_prefix": "musetalk", 2167 | "format": "video/h264-mp4", 2168 | "pix_fmt": "yuv420p", 2169 | "crf": 19, 2170 | "save_metadata": false, 2171 | "pingpong": false, 2172 | "save_output": true, 2173 | "videopreview": { 2174 | "hidden": false, 2175 | "paused": false, 2176 | "params": { 2177 | "filename": "AnimateDiff_00021-audio.mp4", 2178 | "subfolder": "", 2179 | "type": "output", 2180 | "format": "video/h264-mp4" 2181 | } 2182 | } 2183 | } 2184 | }, 2185 | { 2186 | "id": 867, 2187 | "type": "VHS_VideoCombine", 2188 | "pos": [ 2189 | -1978, 2190 | 924 2191 | ], 2192 | "size": [ 2193 | 320, 2194 | 290 2195 | ], 2196 | "flags": {}, 2197 | "order": 45, 2198 | "mode": 0, 2199 | "inputs": [ 2200 | { 2201 | "name": "images", 2202 | "type": "IMAGE", 2203 | "link": 1827 2204 | }, 2205 | { 2206 | "name": "audio", 2207 | "type": "VHS_AUDIO", 2208 | "link": 1460 2209 | }, 2210 | { 2211 | "name": "batch_manager", 2212 | "type": "VHS_BatchManager", 2213 | "link": null 2214 | } 2215 | ], 2216 | "outputs": [ 2217 | { 2218 | "name": "Filenames", 2219 | "type": "VHS_FILENAMES", 2220 | "links": null, 2221 | "shape": 3, 2222 | "slot_index": 0 2223 | } 2224 | ], 2225 | "properties": { 2226 | "Node name for S&R": "VHS_VideoCombine" 2227 | }, 2228 | "widgets_values": { 2229 | "frame_rate": 25, 2230 | "loop_count": 0, 2231 | "filename_prefix": "musetalk", 2232 | "format": "video/h264-mp4", 2233 | "pix_fmt": "yuv420p", 2234 | "crf": 19, 2235 | "save_metadata": false, 2236 | "pingpong": false, 2237 | "save_output": false, 2238 | "videopreview": { 2239 | "hidden": false, 2240 | "paused": false, 2241 | "params": { 2242 | "filename": "AnimateDiff_00005-audio.mp4", 2243 | "subfolder": "", 2244 | "type": "temp", 2245 | "format": "video/h264-mp4" 2246 | } 2247 | } 2248 | } 2249 | }, 2250 | { 2251 | "id": 973, 
2252 | "type": "VHS_VideoCombine", 2253 | "pos": [ 2254 | -2420, 2255 | 1279 2256 | ], 2257 | "size": [ 2258 | 320, 2259 | 290 2260 | ], 2261 | "flags": {}, 2262 | "order": 48, 2263 | "mode": 0, 2264 | "inputs": [ 2265 | { 2266 | "name": "images", 2267 | "type": "IMAGE", 2268 | "link": 1797 2269 | }, 2270 | { 2271 | "name": "audio", 2272 | "type": "VHS_AUDIO", 2273 | "link": 1570 2274 | }, 2275 | { 2276 | "name": "batch_manager", 2277 | "type": "VHS_BatchManager", 2278 | "link": null 2279 | } 2280 | ], 2281 | "outputs": [ 2282 | { 2283 | "name": "Filenames", 2284 | "type": "VHS_FILENAMES", 2285 | "links": null, 2286 | "shape": 3 2287 | } 2288 | ], 2289 | "properties": { 2290 | "Node name for S&R": "VHS_VideoCombine" 2291 | }, 2292 | "widgets_values": { 2293 | "frame_rate": 25, 2294 | "loop_count": 0, 2295 | "filename_prefix": "musetalk", 2296 | "format": "video/h264-mp4", 2297 | "pix_fmt": "yuv420p", 2298 | "crf": 19, 2299 | "save_metadata": false, 2300 | "pingpong": false, 2301 | "save_output": false, 2302 | "videopreview": { 2303 | "hidden": false, 2304 | "paused": false, 2305 | "params": { 2306 | "filename": "AnimateDiff_00006-audio.mp4", 2307 | "subfolder": "", 2308 | "type": "temp", 2309 | "format": "video/h264-mp4" 2310 | } 2311 | } 2312 | } 2313 | }, 2314 | { 2315 | "id": 1093, 2316 | "type": "VHS_VideoCombine", 2317 | "pos": [ 2318 | -948, 2319 | 754 2320 | ], 2321 | "size": [ 2322 | 418.9444885253906, 2323 | 290 2324 | ], 2325 | "flags": {}, 2326 | "order": 50, 2327 | "mode": 0, 2328 | "inputs": [ 2329 | { 2330 | "name": "images", 2331 | "type": "IMAGE", 2332 | "link": 1806 2333 | }, 2334 | { 2335 | "name": "audio", 2336 | "type": "VHS_AUDIO", 2337 | "link": 1801 2338 | }, 2339 | { 2340 | "name": "batch_manager", 2341 | "type": "VHS_BatchManager", 2342 | "link": null 2343 | } 2344 | ], 2345 | "outputs": [ 2346 | { 2347 | "name": "Filenames", 2348 | "type": "VHS_FILENAMES", 2349 | "links": null, 2350 | "shape": 3, 2351 | "slot_index": 0 2352 | } 2353 | ], 
2354 | "properties": { 2355 | "Node name for S&R": "VHS_VideoCombine" 2356 | }, 2357 | "widgets_values": { 2358 | "frame_rate": 25, 2359 | "loop_count": 0, 2360 | "filename_prefix": "musetalk", 2361 | "format": "video/h264-mp4", 2362 | "pix_fmt": "yuv420p", 2363 | "crf": 19, 2364 | "save_metadata": false, 2365 | "pingpong": false, 2366 | "save_output": false, 2367 | "videopreview": { 2368 | "hidden": false, 2369 | "paused": false, 2370 | "params": { 2371 | "filename": "AnimateDiff_00007-audio.mp4", 2372 | "subfolder": "", 2373 | "type": "temp", 2374 | "format": "video/h264-mp4" 2375 | } 2376 | } 2377 | } 2378 | }, 2379 | { 2380 | "id": 976, 2381 | "type": "FaceEnhancement", 2382 | "pos": [ 2383 | -6383, 2384 | 488 2385 | ], 2386 | "size": { 2387 | "0": 300.08770751953125, 2388 | "1": 73.68206024169922 2389 | }, 2390 | "flags": {}, 2391 | "order": 34, 2392 | "mode": 4, 2393 | "inputs": [ 2394 | { 2395 | "name": "images", 2396 | "type": "IMAGE", 2397 | "link": 1784 2398 | } 2399 | ], 2400 | "outputs": [ 2401 | { 2402 | "name": "images", 2403 | "type": "IMAGE", 2404 | "links": [ 2405 | 1587 2406 | ], 2407 | "shape": 3, 2408 | "slot_index": 0 2409 | } 2410 | ], 2411 | "properties": { 2412 | "Node name for S&R": "FaceEnhancement" 2413 | } 2414 | } 2415 | ], 2416 | "links": [ 2417 | [ 2418 | 45, 2419 | 28, 2420 | 0, 2421 | 27, 2422 | 0, 2423 | "VHS_AUDIO" 2424 | ], 2425 | [ 2426 | 67, 2427 | 27, 2428 | 0, 2429 | 36, 2430 | 0, 2431 | "VCAUDIOTENSOR" 2432 | ], 2433 | [ 2434 | 279, 2435 | 4, 2436 | 0, 2437 | 122, 2438 | 1, 2439 | "VAE" 2440 | ], 2441 | [ 2442 | 280, 2443 | 121, 2444 | 0, 2445 | 122, 2446 | 0, 2447 | "MODEL" 2448 | ], 2449 | [ 2450 | 281, 2451 | 36, 2452 | 0, 2453 | 122, 2454 | 2, 2455 | "WHISPERFEAT" 2456 | ], 2457 | [ 2458 | 287, 2459 | 28, 2460 | 0, 2461 | 124, 2462 | 0, 2463 | "*" 2464 | ], 2465 | [ 2466 | 297, 2467 | 36, 2468 | 1, 2469 | 129, 2470 | 0, 2471 | "*" 2472 | ], 2473 | [ 2474 | 422, 2475 | 36, 2476 | 1, 2477 | 223, 2478 | 0, 2479 | "*" 2480 | 
], 2481 | [ 2482 | 626, 2483 | 15, 2484 | 0, 2485 | 16, 2486 | 1, 2487 | "IMAGE" 2488 | ], 2489 | [ 2490 | 871, 2491 | 529, 2492 | 0, 2493 | 527, 2494 | 1, 2495 | "INT" 2496 | ], 2497 | [ 2498 | 872, 2499 | 529, 2500 | 1, 2501 | 527, 2502 | 2, 2503 | "INT" 2504 | ], 2505 | [ 2506 | 873, 2507 | 527, 2508 | 0, 2509 | 526, 2510 | 1, 2511 | "INT" 2512 | ], 2513 | [ 2514 | 1165, 2515 | 734, 2516 | 0, 2517 | 735, 2518 | 0, 2519 | "*" 2520 | ], 2521 | [ 2522 | 1379, 2523 | 122, 2524 | 0, 2525 | 842, 2526 | 0, 2527 | "IMAGE" 2528 | ], 2529 | [ 2530 | 1380, 2531 | 125, 2532 | 0, 2533 | 842, 2534 | 1, 2535 | "VHS_AUDIO" 2536 | ], 2537 | [ 2538 | 1460, 2539 | 911, 2540 | 0, 2541 | 867, 2542 | 1, 2543 | "VHS_AUDIO" 2544 | ], 2545 | [ 2546 | 1462, 2547 | 913, 2548 | 0, 2549 | 912, 2550 | 1, 2551 | "VHS_AUDIO" 2552 | ], 2553 | [ 2554 | 1512, 2555 | 16, 2556 | 0, 2557 | 122, 2558 | 4, 2559 | "IMAGE" 2560 | ], 2561 | [ 2562 | 1515, 2563 | 947, 2564 | 1, 2565 | 97, 2566 | 0, 2567 | "*" 2568 | ], 2569 | [ 2570 | 1516, 2571 | 947, 2572 | 3, 2573 | 734, 2574 | 0, 2575 | "VHS_VIDEOINFO" 2576 | ], 2577 | [ 2578 | 1529, 2579 | 958, 2580 | 0, 2581 | 526, 2582 | 0, 2583 | "IMAGE" 2584 | ], 2585 | [ 2586 | 1530, 2587 | 958, 2588 | 0, 2589 | 529, 2590 | 0, 2591 | "IMAGE" 2592 | ], 2593 | [ 2594 | 1531, 2595 | 958, 2596 | 0, 2597 | 527, 2598 | 0, 2599 | "IMAGE" 2600 | ], 2601 | [ 2602 | 1558, 2603 | 971, 2604 | 0, 2605 | 970, 2606 | 0, 2607 | "IMAGE" 2608 | ], 2609 | [ 2610 | 1559, 2611 | 971, 2612 | 0, 2613 | 122, 2614 | 3, 2615 | "IMAGE" 2616 | ], 2617 | [ 2618 | 1570, 2619 | 911, 2620 | 0, 2621 | 973, 2622 | 1, 2623 | "VHS_AUDIO" 2624 | ], 2625 | [ 2626 | 1575, 2627 | 122, 2628 | 0, 2629 | 866, 2630 | 0, 2631 | "IMAGE" 2632 | ], 2633 | [ 2634 | 1587, 2635 | 976, 2636 | 0, 2637 | 979, 2638 | 0, 2639 | "IMAGE" 2640 | ], 2641 | [ 2642 | 1704, 2643 | 1048, 2644 | 0, 2645 | 1055, 2646 | 0, 2647 | "IMAGE" 2648 | ], 2649 | [ 2650 | 1705, 2651 | 1055, 2652 | 0, 2653 | 1054, 2654 | 0, 2655 | "MASK" 
2656 | ], 2657 | [ 2658 | 1706, 2659 | 1054, 2660 | 0, 2661 | 1056, 2662 | 0, 2663 | "MASK" 2664 | ], 2665 | [ 2666 | 1708, 2667 | 1056, 2668 | 0, 2669 | 1051, 2670 | 0, 2671 | "IMAGE" 2672 | ], 2673 | [ 2674 | 1761, 2675 | 224, 2676 | 0, 2677 | 947, 2678 | 1, 2679 | "INT" 2680 | ], 2681 | [ 2682 | 1775, 2683 | 958, 2684 | 0, 2685 | 1080, 2686 | 0, 2687 | "IMAGE" 2688 | ], 2689 | [ 2690 | 1776, 2691 | 526, 2692 | 1, 2693 | 1080, 2694 | 1, 2695 | "POSE_KEYPOINT" 2696 | ], 2697 | [ 2698 | 1778, 2699 | 1080, 2700 | 0, 2701 | 844, 2702 | 0, 2703 | "IMAGE" 2704 | ], 2705 | [ 2706 | 1779, 2707 | 1080, 2708 | 1, 2709 | 833, 2710 | 0, 2711 | "IMAGE" 2712 | ], 2713 | [ 2714 | 1780, 2715 | 1080, 2716 | 0, 2717 | 971, 2718 | 0, 2719 | "IMAGE" 2720 | ], 2721 | [ 2722 | 1784, 2723 | 1080, 2724 | 0, 2725 | 976, 2726 | 0, 2727 | "IMAGE" 2728 | ], 2729 | [ 2730 | 1786, 2731 | 1083, 2732 | 0, 2733 | 16, 2734 | 0, 2735 | "IMAGE" 2736 | ], 2737 | [ 2738 | 1787, 2739 | 1056, 2740 | 0, 2741 | 1084, 2742 | 0, 2743 | "*" 2744 | ], 2745 | [ 2746 | 1790, 2747 | 979, 2748 | 0, 2749 | 1083, 2750 | 0, 2751 | "*" 2752 | ], 2753 | [ 2754 | 1791, 2755 | 866, 2756 | 0, 2757 | 1090, 2758 | 0, 2759 | "IMAGE" 2760 | ], 2761 | [ 2762 | 1792, 2763 | 1090, 2764 | 0, 2765 | 912, 2766 | 0, 2767 | "IMAGE" 2768 | ], 2769 | [ 2770 | 1796, 2771 | 1084, 2772 | 0, 2773 | 1091, 2774 | 2, 2775 | "IMAGE" 2776 | ], 2777 | [ 2778 | 1797, 2779 | 1091, 2780 | 0, 2781 | 973, 2782 | 0, 2783 | "IMAGE" 2784 | ], 2785 | [ 2786 | 1799, 2787 | 1091, 2788 | 0, 2789 | 1092, 2790 | 1, 2791 | "IMAGE" 2792 | ], 2793 | [ 2794 | 1801, 2795 | 1094, 2796 | 0, 2797 | 1093, 2798 | 1, 2799 | "VHS_AUDIO" 2800 | ], 2801 | [ 2802 | 1806, 2803 | 1092, 2804 | 0, 2805 | 1093, 2806 | 0, 2807 | "IMAGE" 2808 | ], 2809 | [ 2810 | 1822, 2811 | 1107, 2812 | 0, 2813 | 1091, 2814 | 0, 2815 | "IMAGE" 2816 | ], 2817 | [ 2818 | 1823, 2819 | 1107, 2820 | 0, 2821 | 1108, 2822 | 0, 2823 | "IMAGE" 2824 | ], 2825 | [ 2826 | 1824, 2827 | 122, 2828 | 0, 2829 
| 1108, 2830 | 1, 2831 | "IMAGE" 2832 | ], 2833 | [ 2834 | 1825, 2835 | 1084, 2836 | 0, 2837 | 1108, 2838 | 2, 2839 | "IMAGE" 2840 | ], 2841 | [ 2842 | 1826, 2843 | 1108, 2844 | 0, 2845 | 1092, 2846 | 0, 2847 | "IMAGE" 2848 | ], 2849 | [ 2850 | 1827, 2851 | 1108, 2852 | 0, 2853 | 867, 2854 | 0, 2855 | "IMAGE" 2856 | ], 2857 | [ 2858 | 1828, 2859 | 958, 2860 | 0, 2861 | 1107, 2862 | 0, 2863 | "*" 2864 | ], 2865 | [ 2866 | 1848, 2867 | 1090, 2868 | 0, 2869 | 1091, 2870 | 1, 2871 | "IMAGE" 2872 | ], 2873 | [ 2874 | 1849, 2875 | 947, 2876 | 0, 2877 | 958, 2878 | 0, 2879 | "*" 2880 | ] 2881 | ], 2882 | "groups": [ 2883 | { 2884 | "title": "loadvideo", 2885 | "bounding": [ 2886 | -8845, 2887 | 639, 2888 | 1883, 2889 | 1047 2890 | ], 2891 | "color": "#3f789e", 2892 | "font_size": 24, 2893 | "locked": false 2894 | }, 2895 | { 2896 | "title": "load audio and cal video frame", 2897 | "bounding": [ 2898 | -8064, 2899 | -883, 2900 | 1878, 2901 | 675 2902 | ], 2903 | "color": "#3f789e", 2904 | "font_size": 24, 2905 | "locked": false 2906 | }, 2907 | { 2908 | "title": "musetalk sampler", 2909 | "bounding": [ 2910 | -6125, 2911 | -1096, 2912 | 2378, 2913 | 1226 2914 | ], 2915 | "color": "#3f789e", 2916 | "font_size": 24, 2917 | "locked": false 2918 | }, 2919 | { 2920 | "title": "preprocess and crop", 2921 | "bounding": [ 2922 | -6917, 2923 | 650, 2924 | 2505, 2925 | 1051 2926 | ], 2927 | "color": "#3f789e", 2928 | "font_size": 24, 2929 | "locked": false 2930 | }, 2931 | { 2932 | "title": "postprocess and uncrop", 2933 | "bounding": [ 2934 | -4369, 2935 | 619, 2936 | 2815, 2937 | 1523 2938 | ], 2939 | "color": "#3f789e", 2940 | "font_size": 24, 2941 | "locked": false 2942 | }, 2943 | { 2944 | "title": "uncrop face mask", 2945 | "bounding": [ 2946 | -3651, 2947 | -332, 2948 | 1440, 2949 | 917 2950 | ], 2951 | "color": "#3f789e", 2952 | "font_size": 24, 2953 | "locked": false 2954 | }, 2955 | { 2956 | "title": "hires", 2957 | "bounding": [ 2958 | -6520, 2959 | 220, 2960 | 1033, 2961 
| 387 2962 | ], 2963 | "color": "#3f789e", 2964 | "font_size": 24, 2965 | "locked": false 2966 | }, 2967 | { 2968 | "title": "blur", 2969 | "bounding": [ 2970 | -5448, 2971 | 217, 2972 | 916, 2973 | 396 2974 | ], 2975 | "color": "#3f789e", 2976 | "font_size": 24, 2977 | "locked": false 2978 | }, 2979 | { 2980 | "title": "hires", 2981 | "bounding": [ 2982 | -4374, 2983 | 227, 2984 | 631, 2985 | 364 2986 | ], 2987 | "color": "#3f789e", 2988 | "font_size": 24, 2989 | "locked": false 2990 | }, 2991 | { 2992 | "title": "diff", 2993 | "bounding": [ 2994 | -1514, 2995 | 613, 2996 | 1121, 2997 | 1519 2998 | ], 2999 | "color": "#3f789e", 3000 | "font_size": 24, 3001 | "locked": false 3002 | } 3003 | ], 3004 | "config": {}, 3005 | "extra": { 3006 | "workspace_info": { 3007 | "id": "Ar3D9He4S5MoHhCb_zi9p" 3008 | } 3009 | }, 3010 | "version": 0.4 3011 | } -------------------------------------------------------------------------------- /workflow/musetalk flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xuhongming251/ComfyUI-MuseTalkUtils/df7b26788afb765a7ad0920bf9be81e213710068/workflow/musetalk flow.png -------------------------------------------------------------------------------- /workflow/musetalk train flow.json: -------------------------------------------------------------------------------- 1 | { 2 | "last_node_id": 48, 3 | "last_link_id": 59, 4 | "nodes": [ 5 | { 6 | "id": 14, 7 | "type": "PixelPerfectResolution", 8 | "pos": [ 9 | 1910, 10 | 980 11 | ], 12 | "size": { 13 | "0": 393, 14 | "1": 106 15 | }, 16 | "flags": {}, 17 | "order": 8, 18 | "mode": 0, 19 | "inputs": [ 20 | { 21 | "name": "original_image", 22 | "type": "IMAGE", 23 | "link": 13, 24 | "label": "original_image" 25 | }, 26 | { 27 | "name": "image_gen_width", 28 | "type": "INT", 29 | "link": 14, 30 | "widget": { 31 | "name": "image_gen_width" 32 | }, 33 | "slot_index": 1 34 | }, 35 | { 36 | "name": "image_gen_height", 37 | "type": 
"INT", 38 | "link": 15, 39 | "widget": { 40 | "name": "image_gen_height" 41 | } 42 | } 43 | ], 44 | "outputs": [ 45 | { 46 | "name": "RESOLUTION (INT)", 47 | "type": "INT", 48 | "links": [ 49 | 12 50 | ], 51 | "shape": 3, 52 | "label": "RESOLUTION (INT)", 53 | "slot_index": 0 54 | } 55 | ], 56 | "properties": { 57 | "Node name for S&R": "PixelPerfectResolution" 58 | }, 59 | "widgets_values": [ 60 | 800, 61 | 536, 62 | "Just Resize" 63 | ] 64 | }, 65 | { 66 | "id": 15, 67 | "type": "GetImageSize+", 68 | "pos": [ 69 | 1850, 70 | 770 71 | ], 72 | "size": { 73 | "0": 210, 74 | "1": 46 75 | }, 76 | "flags": {}, 77 | "order": 6, 78 | "mode": 0, 79 | "inputs": [ 80 | { 81 | "name": "image", 82 | "type": "IMAGE", 83 | "link": 16 84 | } 85 | ], 86 | "outputs": [ 87 | { 88 | "name": "width", 89 | "type": "INT", 90 | "links": [ 91 | 14 92 | ], 93 | "shape": 3, 94 | "slot_index": 0 95 | }, 96 | { 97 | "name": "height", 98 | "type": "INT", 99 | "links": [ 100 | 15 101 | ], 102 | "shape": 3, 103 | "slot_index": 1 104 | } 105 | ], 106 | "properties": { 107 | "Node name for S&R": "GetImageSize+" 108 | } 109 | }, 110 | { 111 | "id": 22, 112 | "type": "Reroute", 113 | "pos": [ 114 | 1473, 115 | 390 116 | ], 117 | "size": [ 118 | 75, 119 | 26 120 | ], 121 | "flags": {}, 122 | "order": 4, 123 | "mode": 0, 124 | "inputs": [ 125 | { 126 | "name": "", 127 | "type": "*", 128 | "link": 57 129 | } 130 | ], 131 | "outputs": [ 132 | { 133 | "name": "", 134 | "type": "IMAGE", 135 | "links": [ 136 | 11, 137 | 13, 138 | 16, 139 | 49 140 | ], 141 | "slot_index": 0 142 | } 143 | ], 144 | "properties": { 145 | "showOutputText": false, 146 | "horizontal": false 147 | } 148 | }, 149 | { 150 | "id": 35, 151 | "type": "Display Any (rgthree)", 152 | "pos": [ 153 | 4113, 154 | 938 155 | ], 156 | "size": { 157 | "0": 226.42002868652344, 158 | "1": 116.54998779296875 159 | }, 160 | "flags": {}, 161 | "order": 9, 162 | "mode": 0, 163 | "inputs": [ 164 | { 165 | "name": "source", 166 | "type": "*", 167 | 
"link": 34, 168 | "dir": 3, 169 | "label": "source" 170 | } 171 | ], 172 | "properties": { 173 | "Node name for S&R": "Display Any (rgthree)" 174 | }, 175 | "widgets_values": [ 176 | "" 177 | ] 178 | }, 179 | { 180 | "id": 25, 181 | "type": "MuseTalkTrainPreprocess", 182 | "pos": [ 183 | 2894, 184 | 393 185 | ], 186 | "size": { 187 | "0": 354.3999938964844, 188 | "1": 174 189 | }, 190 | "flags": {}, 191 | "order": 11, 192 | "mode": 0, 193 | "inputs": [ 194 | { 195 | "name": "origin_images", 196 | "type": "IMAGE", 197 | "link": 49 198 | }, 199 | { 200 | "name": "pose_kps", 201 | "type": "POSE_KEYPOINT", 202 | "link": 25 203 | } 204 | ], 205 | "outputs": [ 206 | { 207 | "name": "rotated_faces", 208 | "type": "IMAGE", 209 | "links": [ 210 | 44 211 | ], 212 | "shape": 3, 213 | "slot_index": 0 214 | }, 215 | { 216 | "name": "rotated_faces_with_landmarks", 217 | "type": "IMAGE", 218 | "links": [], 219 | "shape": 3, 220 | "slot_index": 1 221 | } 222 | ], 223 | "properties": { 224 | "Node name for S&R": "MuseTalkTrainPreprocess" 225 | }, 226 | "widgets_values": [ 227 | "full", 228 | 0, 229 | 0, 230 | 0, 231 | 0 232 | ] 233 | }, 234 | { 235 | "id": 41, 236 | "type": "MuseTalkTrain", 237 | "pos": [ 238 | 4021, 239 | 379 240 | ], 241 | "size": { 242 | "0": 315, 243 | "1": 78 244 | }, 245 | "flags": {}, 246 | "order": 12, 247 | "mode": 0, 248 | "inputs": [ 249 | { 250 | "name": "images", 251 | "type": "IMAGE", 252 | "link": 44 253 | }, 254 | { 255 | "name": "whisper_features", 256 | "type": "WHISPERFEAT", 257 | "link": 45 258 | } 259 | ], 260 | "outputs": [ 261 | { 262 | "name": "images", 263 | "type": "IMAGE", 264 | "links": [ 265 | 46 266 | ], 267 | "shape": 3, 268 | "slot_index": 0 269 | } 270 | ], 271 | "properties": { 272 | "Node name for S&R": "MuseTalkTrain" 273 | }, 274 | "widgets_values": [ 275 | 1 276 | ] 277 | }, 278 | { 279 | "id": 38, 280 | "type": "VHS_VideoCombine", 281 | "pos": [ 282 | 4788, 283 | 360 284 | ], 285 | "size": { 286 | "0": 320, 287 | "1": 290 288 
| }, 289 | "flags": {}, 290 | "order": 13, 291 | "mode": 0, 292 | "inputs": [ 293 | { 294 | "name": "images", 295 | "type": "IMAGE", 296 | "link": 46 297 | }, 298 | { 299 | "name": "audio", 300 | "type": "VHS_AUDIO", 301 | "link": null 302 | }, 303 | { 304 | "name": "batch_manager", 305 | "type": "VHS_BatchManager", 306 | "link": null 307 | } 308 | ], 309 | "outputs": [ 310 | { 311 | "name": "Filenames", 312 | "type": "VHS_FILENAMES", 313 | "links": null, 314 | "shape": 3 315 | } 316 | ], 317 | "properties": { 318 | "Node name for S&R": "VHS_VideoCombine" 319 | }, 320 | "widgets_values": { 321 | "frame_rate": 25, 322 | "loop_count": 0, 323 | "filename_prefix": "AnimateDiff", 324 | "format": "video/h264-mp4", 325 | "pix_fmt": "yuv420p", 326 | "crf": 19, 327 | "save_metadata": false, 328 | "pingpong": false, 329 | "save_output": false, 330 | "videopreview": { 331 | "hidden": false, 332 | "paused": false, 333 | "params": { 334 | "filename": "AnimateDiff_00019.gif", 335 | "subfolder": "", 336 | "type": "output", 337 | "format": "image/gif" 338 | } 339 | } 340 | } 341 | }, 342 | { 343 | "id": 13, 344 | "type": "DWPreprocessor", 345 | "pos": [ 346 | 2320, 347 | 670 348 | ], 349 | "size": { 350 | "0": 315, 351 | "1": 198 352 | }, 353 | "flags": {}, 354 | "order": 10, 355 | "mode": 0, 356 | "inputs": [ 357 | { 358 | "name": "image", 359 | "type": "IMAGE", 360 | "link": 11 361 | }, 362 | { 363 | "name": "resolution", 364 | "type": "INT", 365 | "link": 12, 366 | "widget": { 367 | "name": "resolution" 368 | } 369 | } 370 | ], 371 | "outputs": [ 372 | { 373 | "name": "IMAGE", 374 | "type": "IMAGE", 375 | "links": [], 376 | "shape": 3, 377 | "slot_index": 0 378 | }, 379 | { 380 | "name": "POSE_KEYPOINT", 381 | "type": "POSE_KEYPOINT", 382 | "links": [ 383 | 25 384 | ], 385 | "shape": 3, 386 | "slot_index": 1 387 | } 388 | ], 389 | "properties": { 390 | "Node name for S&R": "DWPreprocessor" 391 | }, 392 | "widgets_values": [ 393 | "disable", 394 | "disable", 395 | "enable", 396 
| 512, 397 | "yolox_l.torchscript.pt", 398 | "dw-ll_ucoco_384_bs5.torchscript.pt" 399 | ] 400 | }, 401 | { 402 | "id": 47, 403 | "type": "ImageCrop", 404 | "pos": [ 405 | 1049, 406 | 473 407 | ], 408 | "size": { 409 | "0": 315, 410 | "1": 130 411 | }, 412 | "flags": {}, 413 | "order": 1, 414 | "mode": 4, 415 | "inputs": [ 416 | { 417 | "name": "image", 418 | "type": "IMAGE", 419 | "link": 55 420 | } 421 | ], 422 | "outputs": [ 423 | { 424 | "name": "IMAGE", 425 | "type": "IMAGE", 426 | "links": [ 427 | 56, 428 | 57 429 | ], 430 | "shape": 3, 431 | "slot_index": 0 432 | } 433 | ], 434 | "properties": { 435 | "Node name for S&R": "ImageCrop" 436 | }, 437 | "widgets_values": [ 438 | 1000, 439 | 1000, 440 | 600, 441 | 120 442 | ] 443 | }, 444 | { 445 | "id": 46, 446 | "type": "VHS_VideoCombine", 447 | "pos": [ 448 | 1039, 449 | 42 450 | ], 451 | "size": [ 452 | 320, 453 | 290 454 | ], 455 | "flags": {}, 456 | "order": 3, 457 | "mode": 4, 458 | "inputs": [ 459 | { 460 | "name": "images", 461 | "type": "IMAGE", 462 | "link": 56 463 | }, 464 | { 465 | "name": "audio", 466 | "type": "VHS_AUDIO", 467 | "link": null 468 | }, 469 | { 470 | "name": "batch_manager", 471 | "type": "VHS_BatchManager", 472 | "link": null 473 | } 474 | ], 475 | "outputs": [ 476 | { 477 | "name": "Filenames", 478 | "type": "VHS_FILENAMES", 479 | "links": null, 480 | "shape": 3 481 | } 482 | ], 483 | "properties": { 484 | "Node name for S&R": "VHS_VideoCombine" 485 | }, 486 | "widgets_values": { 487 | "frame_rate": 25, 488 | "loop_count": 0, 489 | "filename_prefix": "AnimateDiff", 490 | "format": "video/h264-mp4", 491 | "pix_fmt": "yuv420p", 492 | "crf": 19, 493 | "save_metadata": false, 494 | "pingpong": false, 495 | "save_output": false, 496 | "videopreview": { 497 | "hidden": false, 498 | "paused": false, 499 | "params": { 500 | "filename": "AnimateDiff_00013.mp4", 501 | "subfolder": "", 502 | "type": "temp", 503 | "format": "video/h264-mp4" 504 | } 505 | } 506 | } 507 | }, 508 | { 509 | "id": 42, 
510 | "type": "VHS_LoadVideo", 511 | "pos": [ 512 | 663, 513 | 600 514 | ], 515 | "size": [ 516 | 240, 517 | 262 518 | ], 519 | "flags": {}, 520 | "order": 0, 521 | "mode": 0, 522 | "inputs": [ 523 | { 524 | "name": "batch_manager", 525 | "type": "VHS_BatchManager", 526 | "link": null 527 | } 528 | ], 529 | "outputs": [ 530 | { 531 | "name": "IMAGE", 532 | "type": "IMAGE", 533 | "links": [ 534 | 55 535 | ], 536 | "shape": 3, 537 | "slot_index": 0 538 | }, 539 | { 540 | "name": "frame_count", 541 | "type": "INT", 542 | "links": [], 543 | "shape": 3, 544 | "slot_index": 1 545 | }, 546 | { 547 | "name": "audio", 548 | "type": "VHS_AUDIO", 549 | "links": [ 550 | 59 551 | ], 552 | "shape": 3, 553 | "slot_index": 2 554 | }, 555 | { 556 | "name": "video_info", 557 | "type": "VHS_VIDEOINFO", 558 | "links": [], 559 | "shape": 3, 560 | "slot_index": 3 561 | } 562 | ], 563 | "properties": { 564 | "Node name for S&R": "VHS_LoadVideo" 565 | }, 566 | "widgets_values": { 567 | "video": "高清.mp4", 568 | "force_rate": 25, 569 | "force_size": "Disabled", 570 | "custom_width": 512, 571 | "custom_height": 512, 572 | "frame_load_cap": 0, 573 | "skip_first_frames": 0, 574 | "select_every_nth": 1, 575 | "choose video to upload": "image", 576 | "videopreview": { 577 | "hidden": false, 578 | "paused": false, 579 | "params": { 580 | "frame_load_cap": 0, 581 | "skip_first_frames": 0, 582 | "force_rate": 25, 583 | "filename": "高清.mp4", 584 | "type": "input", 585 | "format": "video/mp4", 586 | "select_every_nth": 1 587 | } 588 | } 589 | } 590 | }, 591 | { 592 | "id": 31, 593 | "type": "vhs_audio_to_audio_tensor", 594 | "pos": [ 595 | 2577, 596 | 1354 597 | ], 598 | "size": { 599 | "0": 315, 600 | "1": 102 601 | }, 602 | "flags": {}, 603 | "order": 5, 604 | "mode": 0, 605 | "inputs": [ 606 | { 607 | "name": "vhs_audio", 608 | "type": "VHS_AUDIO", 609 | "link": 38, 610 | "slot_index": 0, 611 | "label": "vhs_audio" 612 | } 613 | ], 614 | "outputs": [ 615 | { 616 | "name": "audio_tensor", 617 | 
"type": "VCAUDIOTENSOR", 618 | "links": [ 619 | 32 620 | ], 621 | "shape": 3, 622 | "slot_index": 0, 623 | "label": "audio_tensor" 624 | }, 625 | { 626 | "name": "audio_dur", 627 | "type": "INT", 628 | "links": null, 629 | "shape": 3, 630 | "label": "audio_dur" 631 | } 632 | ], 633 | "properties": { 634 | "Node name for S&R": "vhs_audio_to_audio_tensor" 635 | }, 636 | "widgets_values": [ 637 | 16000, 638 | 1 639 | ] 640 | }, 641 | { 642 | "id": 33, 643 | "type": "whisper_to_features", 644 | "pos": [ 645 | 3209, 646 | 1353 647 | ], 648 | "size": { 649 | "0": 342.5999755859375, 650 | "1": 78 651 | }, 652 | "flags": {}, 653 | "order": 7, 654 | "mode": 0, 655 | "inputs": [ 656 | { 657 | "name": "audio_tensor", 658 | "type": "VCAUDIOTENSOR", 659 | "link": 32, 660 | "slot_index": 0, 661 | "label": "audio_tensor" 662 | } 663 | ], 664 | "outputs": [ 665 | { 666 | "name": "whisper_chunks", 667 | "type": "WHISPERFEAT", 668 | "links": [ 669 | 45 670 | ], 671 | "shape": 3, 672 | "slot_index": 0, 673 | "label": "whisper_chunks" 674 | }, 675 | { 676 | "name": "frame_count", 677 | "type": "INT", 678 | "links": [ 679 | 34 680 | ], 681 | "shape": 3, 682 | "slot_index": 1, 683 | "label": "frame_count" 684 | } 685 | ], 686 | "properties": { 687 | "Node name for S&R": "whisper_to_features" 688 | }, 689 | "widgets_values": [ 690 | 25 691 | ] 692 | }, 693 | { 694 | "id": 37, 695 | "type": "Reroute", 696 | "pos": [ 697 | 1802, 698 | 1338 699 | ], 700 | "size": [ 701 | 75, 702 | 26 703 | ], 704 | "flags": {}, 705 | "order": 2, 706 | "mode": 0, 707 | "inputs": [ 708 | { 709 | "name": "", 710 | "type": "*", 711 | "link": 59 712 | } 713 | ], 714 | "outputs": [ 715 | { 716 | "name": "", 717 | "type": "VHS_AUDIO", 718 | "links": [ 719 | 38 720 | ], 721 | "slot_index": 0 722 | } 723 | ], 724 | "properties": { 725 | "showOutputText": false, 726 | "horizontal": false 727 | } 728 | } 729 | ], 730 | "links": [ 731 | [ 732 | 11, 733 | 22, 734 | 0, 735 | 13, 736 | 0, 737 | "IMAGE" 738 | ], 739 | [ 
740 | 12, 741 | 14, 742 | 0, 743 | 13, 744 | 1, 745 | "INT" 746 | ], 747 | [ 748 | 13, 749 | 22, 750 | 0, 751 | 14, 752 | 0, 753 | "IMAGE" 754 | ], 755 | [ 756 | 14, 757 | 15, 758 | 0, 759 | 14, 760 | 1, 761 | "INT" 762 | ], 763 | [ 764 | 15, 765 | 15, 766 | 1, 767 | 14, 768 | 2, 769 | "INT" 770 | ], 771 | [ 772 | 16, 773 | 22, 774 | 0, 775 | 15, 776 | 0, 777 | "IMAGE" 778 | ], 779 | [ 780 | 25, 781 | 13, 782 | 1, 783 | 25, 784 | 1, 785 | "POSE_KEYPOINT" 786 | ], 787 | [ 788 | 32, 789 | 31, 790 | 0, 791 | 33, 792 | 0, 793 | "VCAUDIOTENSOR" 794 | ], 795 | [ 796 | 34, 797 | 33, 798 | 1, 799 | 35, 800 | 0, 801 | "*" 802 | ], 803 | [ 804 | 38, 805 | 37, 806 | 0, 807 | 31, 808 | 0, 809 | "VHS_AUDIO" 810 | ], 811 | [ 812 | 44, 813 | 25, 814 | 0, 815 | 41, 816 | 0, 817 | "IMAGE" 818 | ], 819 | [ 820 | 45, 821 | 33, 822 | 0, 823 | 41, 824 | 1, 825 | "WHISPERFEAT" 826 | ], 827 | [ 828 | 46, 829 | 41, 830 | 0, 831 | 38, 832 | 0, 833 | "IMAGE" 834 | ], 835 | [ 836 | 49, 837 | 22, 838 | 0, 839 | 25, 840 | 0, 841 | "IMAGE" 842 | ], 843 | [ 844 | 55, 845 | 42, 846 | 0, 847 | 47, 848 | 0, 849 | "IMAGE" 850 | ], 851 | [ 852 | 56, 853 | 47, 854 | 0, 855 | 46, 856 | 0, 857 | "IMAGE" 858 | ], 859 | [ 860 | 57, 861 | 47, 862 | 0, 863 | 22, 864 | 0, 865 | "*" 866 | ], 867 | [ 868 | 59, 869 | 42, 870 | 2, 871 | 37, 872 | 0, 873 | "*" 874 | ] 875 | ], 876 | "groups": [], 877 | "config": {}, 878 | "extra": { 879 | "workspace_info": { 880 | "id": "yYiOrtl5rbrkJhUJ9UPwT" 881 | } 882 | }, 883 | "version": 0.4 884 | } -------------------------------------------------------------------------------- /workflow/sampleimage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xuhongming251/ComfyUI-MuseTalkUtils/df7b26788afb765a7ad0920bf9be81e213710068/workflow/sampleimage.png -------------------------------------------------------------------------------- /workflow/train.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/xuhongming251/ComfyUI-MuseTalkUtils/df7b26788afb765a7ad0920bf9be81e213710068/workflow/train.png -------------------------------------------------------------------------------- /workflow/trainsample.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xuhongming251/ComfyUI-MuseTalkUtils/df7b26788afb765a7ad0920bf9be81e213710068/workflow/trainsample.png --------------------------------------------------------------------------------