├── .github └── workflows │ └── publish.yml ├── ReadMe.md ├── __init__.py ├── musetalk_global_data.py ├── musetalk_postprocess.py ├── musetalk_preprocess.py ├── musetalk_train.py ├── musetalk_train_preprocess.py ├── musetalk_utils.py ├── pyproject.toml ├── unet.py ├── vae.py └── workflow ├── musetalk flow.json ├── musetalk flow.png ├── musetalk train flow.json ├── sampleimage.png ├── train.png └── trainsample.png /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish to Comfy registry 2 | on: 3 | workflow_dispatch: 4 | push: 5 | branches: 6 | - main 7 | - master 8 | paths: 9 | - "pyproject.toml" 10 | 11 | permissions: 12 | issues: write 13 | 14 | jobs: 15 | publish-node: 16 | name: Publish Custom Node to registry 17 | runs-on: ubuntu-latest 18 | if: ${{ github.repository_owner == 'xuhongming251' }} 19 | steps: 20 | - name: Check out code 21 | uses: actions/checkout@v4 22 | - name: Publish Custom Node 23 | uses: Comfy-Org/publish-node-action@v1 24 | with: 25 | ## Add your own personal access token to your Github Repository secrets and reference it here. 26 | personal_access_token: ${{ secrets.REGISTRY_ACCESS_TOKEN }} 27 | -------------------------------------------------------------------------------- /ReadMe.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # MuseTalk ComfyUI Preprocess and Postprocess Nodes 4 | 5 | # Preprocess Node 6 | 1. rotated image、crop face images 7 | 8 | # Postprocess Node 9 | 1. uncrop faces and rotated images 10 | 11 | 12 | # MuseTalk Work Flow 13 | 1. open musetalk flow and upload video(or image) 14 | 2. set audio path(wav or mp3 ...) 15 | 3. run the flow 16 | 4. video tutorial: https://www.bilibili.com/video/BV1ni421X7ok/?share_source=copy_web&vd_source=43ee8c0ef3a0b12097f69db4423c1332 17 | 18 | 5. GPU < 5G can run 19 | 20 | ![image](./workflow/sampleimage.png) 21 | 22 | # MuseTalk Train Work Flow 23 | 1. 
open the train flow and upload a video
2. run the train flow
3. `epoch_0.pth`, `epoch_1.pth`, `epoch_2.pth` ... will be generated into the `models\musetalk\musetalk` folder
4. watch the loss value in the cmd terminal, and manually stop the process once the training loss has decreased to 0.005 or lower
5. select the musetalk model by `epoch_x.pth` in the musetalk flow.
6. run the musetalk flow to test.
7. the train flow is just a demo for testing.
8. a 16G GPU can run it.

![image](./workflow/train.png)
![image](./workflow/trainsample.png)


Original repo:
https://github.com/TMElyralab/MuseTalk
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------

from .musetalk_preprocess import *
from .musetalk_postprocess import *
from .musetalk_train_preprocess import *
from .musetalk_train import *


NODE_CLASS_MAPPINGS = {

    # "MuseTalkUncropMask": MuseTalkUncropMask,

    "MuseTalkPreprocess": MuseTalkPreprocess,
    "MuseTalkPostprocess": MuseTalkPostprocess,

    "MuseTalkTrainPreprocess": MuseTalkTrainPreprocess,
    "MuseTalkTrain": MuseTalkTrain,

}


NODE_DISPLAY_NAME_MAPPINGS = {

    # "MuseTalkUncropMask": "MuseTalkUncropMask",

    "MuseTalkPreprocess": "MuseTalkPreprocess",
    "MuseTalkPostprocess": "MuseTalkPostprocess",

    "MuseTalkTrainPreprocess": "MuseTalkTrainPreprocess",
    "MuseTalkTrain": "MuseTalkTrain",

}
--------------------------------------------------------------------------------
/musetalk_global_data.py:
--------------------------------------------------------------------------------


# for debug
rotated_faces_with_landmarks = []
# origin_face_bboxs = []
# origin_face_landmarks = []


# generated in the preprocess node, used in the postprocess node
rotated_faces = []
rotated_bboxs = []
| rotated_images = [] 13 | 14 | face_center_points = [] 15 | rotated_angles = [] 16 | origin_face_masks = [] 17 | 18 | rotated_resized_half_face_masks = [] 19 | 20 | 21 | # for train 22 | faces_latent_list = [] 23 | resized_cv2_frame_list = [] 24 | -------------------------------------------------------------------------------- /musetalk_postprocess.py: -------------------------------------------------------------------------------- 1 | 2 | import cv2 3 | import numpy as np 4 | import torch 5 | import torch.nn.functional as F 6 | from PIL import Image 7 | import comfy 8 | import time 9 | 10 | from . import musetalk_utils 11 | from . import musetalk_global_data 12 | 13 | 14 | # def create_uncrop_mask(width, height, center, v_axes, h_axes): 15 | class MuseTalkUncropMask: 16 | def __init__(self): 17 | pass 18 | 19 | @classmethod 20 | def INPUT_TYPES(s): 21 | return { 22 | "required": { 23 | "width": ("INT", {"default": 256, "min": -9999, "max": 9999, "step": 1}), 24 | "height": ("INT", {"default": 256, "min": -9999, "max": 9999, "step": 1}), 25 | "ellipse_center_x": ("INT", {"default": 128, "min": -9999, "max": 9999, "step": 1}), 26 | "ellipse_center_y": ("INT", {"default": 192, "min": -9999, "max": 9999, "step": 1}), 27 | "ellipse_center_v_axes": ("INT", {"default": 128, "min": -9999, "max": 9999, "step": 1}), 28 | "ellipse_center_h_axes": ("INT", {"default": 64, "min": -9999, "max": 9999, "step": 1}), 29 | }, 30 | } 31 | # top_reserve, bottom_reserve, left_reserve, right_reserve 32 | RETURN_TYPES = ("IMAGE",) 33 | RETURN_NAMES = ( 34 | "images", 35 | ) 36 | 37 | FUNCTION = "run" 38 | CATEGORY = "MuseTalkUtils" 39 | 40 | def run(self, width, height, ellipse_center_x, ellipse_center_y, ellipse_center_v_axes, ellipse_center_h_axes): 41 | pil_image_mask = musetalk_utils.create_uncrop_mask(width, height, (ellipse_center_x, ellipse_center_y), ellipse_center_v_axes, ellipse_center_h_axes) 42 | image = pil_image_mask.convert("RGB") 43 | image = 
class MuseTalkUncropMask:
    """Build an elliptical feathered mask used when pasting the generated
    face back onto the rotated frame (debug / manual-override node)."""

    def __init__(self):
        pass

    @classmethod
    def INPUT_TYPES(s):
        return {
            "required": {
                "width": ("INT", {"default": 256, "min": -9999, "max": 9999, "step": 1}),
                "height": ("INT", {"default": 256, "min": -9999, "max": 9999, "step": 1}),
                "ellipse_center_x": ("INT", {"default": 128, "min": -9999, "max": 9999, "step": 1}),
                "ellipse_center_y": ("INT", {"default": 192, "min": -9999, "max": 9999, "step": 1}),
                "ellipse_center_v_axes": ("INT", {"default": 128, "min": -9999, "max": 9999, "step": 1}),
                "ellipse_center_h_axes": ("INT", {"default": 64, "min": -9999, "max": 9999, "step": 1}),
            },
        }

    RETURN_TYPES = ("IMAGE",)
    RETURN_NAMES = ("images",)

    FUNCTION = "run"
    CATEGORY = "MuseTalkUtils"

    def run(self, width, height, ellipse_center_x, ellipse_center_y, ellipse_center_v_axes, ellipse_center_h_axes):
        """Return the elliptical mask as a (1, H, W, 3) float tensor in [0, 1]."""
        pil_image_mask = musetalk_utils.create_uncrop_mask(
            width, height, (ellipse_center_x, ellipse_center_y),
            ellipse_center_v_axes, ellipse_center_h_axes)
        image = pil_image_mask.convert("RGB")
        image = np.array(image).astype(np.float32) / 255.0
        image = torch.from_numpy(image)[None,]
        return (image, )


class MuseTalkPostprocess:
    """Paste MuseTalk-generated face crops back into the original frames.

    Consumes the per-frame geometry (rotated bboxes/images, rotation angles,
    face masks, ...) stashed in `musetalk_global_data` by MuseTalkPreprocess,
    so this node must run in the same process after that node.
    """

    def __init__(self):
        pass

    @classmethod
    def INPUT_TYPES(s):
        return {
            "required": {
                "origin_images": ("IMAGE",),
                "musetalk_faces": ("IMAGE",),
                "extend": ("INT", {"default": 0, "min": -9999, "max": 9999, "step": 1}),
                "blur_radius": ("INT", {"default": 0, "min": -9999, "max": 9999, "step": 1}),
                "extend1": ("INT", {"default": -5, "min": -9999, "max": 9999, "step": 1}),
                "blur_radius1": ("INT", {"default": 5, "min": -9999, "max": 9999, "step": 1}),
            },
            "optional": {
                "uncrop_mask": ("IMAGE",),
            }
        }

    RETURN_TYPES = ("IMAGE",)
    RETURN_NAMES = ("images",)

    FUNCTION = "postprocess"
    CATEGORY = "MuseTalkUtils"

    def getRealIndex(self, index, origin_img_len):
        """Map an output index onto the source frames with ping-pong
        (forward-then-backward) looping, so more generated faces than
        source frames still find a matching frame."""
        if index >= origin_img_len:
            return (origin_img_len * 2 - index - 1)
        else:
            return index

    def postprocess(self, origin_images, musetalk_faces,
                    extend, blur_radius, extend1, blur_radius1,
                    uncrop_mask=None):
        """Uncrop each generated face into its rotated frame, rotate back,
        then blend into the original frame.  Returns a single IMAGE batch.

        Raises nothing; on length mismatch between the cached preprocess
        data and the inputs it logs and returns (None,).
        """
        if uncrop_mask is not None:
            # Batched IMAGE input: only the first mask is meaningful.
            uncrop_mask = uncrop_mask[0]

        print(f"MuseTalkPreprocess postprocess, len(origin_images): {len(origin_images)}, len(musetalk_faces): {len(musetalk_faces)}")

        origin_img_len = len(origin_images)
        rotated_bboxs_len = len(musetalk_global_data.rotated_bboxs)
        rotated_images_len = len(musetalk_global_data.rotated_images)
        face_center_points_len = len(musetalk_global_data.face_center_points)
        rotated_angles_len = len(musetalk_global_data.rotated_angles)
        origin_face_bboxs_len = len(musetalk_global_data.origin_face_bboxs)
        origin_face_masks_len = len(musetalk_global_data.origin_face_masks)

        # Ping-pong period: forward over the frames, then backward.
        MAX_LEN = origin_img_len * 2

        print(f"origin_img_len: {origin_img_len}, rotated_bboxs_len: {rotated_bboxs_len}, rotated_images_len: {rotated_images_len}, face_center_points_len: {face_center_points_len}, rotated_angles_len: {rotated_angles_len},origin_face_bboxs_len: {origin_face_bboxs_len}, origin_face_masks_len: {origin_face_masks_len}")

        # All cached per-frame lists must be aligned with origin_images.
        # Fix: bail out as a 1-tuple (the node declares one output) instead
        # of the bare `return (None)` of the original.
        if not (origin_img_len == rotated_bboxs_len == rotated_images_len ==
                face_center_points_len == rotated_angles_len ==
                origin_face_bboxs_len == origin_face_masks_len):
            print("the len is not same")
            return (None,)

        result_images = []
        idx = 0

        pbar = comfy.utils.ProgressBar(len(musetalk_faces))

        for musetalk_face in musetalk_faces:
            start_time0 = time.time()

            real_index = self.getRealIndex(idx, origin_img_len)

            origin_image = origin_images[real_index]
            rotated_bbox = musetalk_global_data.rotated_bboxs[real_index]
            rotated_image = musetalk_global_data.rotated_images[real_index]
            face_center_point = musetalk_global_data.face_center_points[real_index]
            rotate_angle = musetalk_global_data.rotated_angles[real_index]
            origin_face_mask = musetalk_global_data.origin_face_masks[real_index]
            rotated_face = musetalk_global_data.rotated_faces[real_index]

            origin_image_height, origin_image_width = musetalk_utils.tensorimg_to_cv2img(origin_image).shape[:2]

            # Fix: `is None` — the original compared a torch tensor with `== None`.
            if uncrop_mask is None:
                # Use the per-frame half-face mask computed by the preprocess node.
                pil_uncrop_mask_image = musetalk_global_data.rotated_resized_half_face_masks[real_index]

                start_time1 = time.time()
                musetalk_rotated_image, pil_uncrop_mask_image = musetalk_utils.uncrop_to_rotated_image(
                    musetalk_utils.tensorimg_to_pilimg(rotated_face),
                    musetalk_utils.tensorimg_to_pilimg(musetalk_face),
                    rotated_bbox,
                    musetalk_utils.tensorimg_to_pilimg(rotated_image),
                    pil_uncrop_mask_image, extend, blur_radius)
                print(f"frame index: {idx}, real_index: {real_index}, uncrop one frame, use: {((time.time() - start_time1)*1000):.2f} ms")
            else:
                # A user-supplied mask: no extra extend/blur is applied.
                musetalk_rotated_image, _ = musetalk_utils.uncrop_to_rotated_image(
                    musetalk_utils.tensorimg_to_pilimg(rotated_face),
                    musetalk_utils.tensorimg_to_pilimg(musetalk_face),
                    rotated_bbox,
                    musetalk_utils.tensorimg_to_pilimg(rotated_image),
                    musetalk_utils.tensorimg_to_pilimg(uncrop_mask), 0, 0)

            musetalk_rotated_image_tensor = musetalk_utils.pilimg_to_tensorimg(musetalk_rotated_image)

            # Undo the rotation applied during preprocessing.
            musetalk_origin_image = musetalk_utils.unrotated_image(
                musetalk_utils.tensorimg_to_cv2img(musetalk_rotated_image_tensor),
                face_center_point, rotate_angle,
                origin_image_width, origin_image_height)
            musetalk_origin_image_tensor = musetalk_utils.cv2img_to_tensorimg(musetalk_origin_image)

            start_time1 = time.time()
            result_image, face_mask = musetalk_utils.blend_to_origin_image(
                musetalk_utils.tensorimg_to_pilimg(origin_image),
                musetalk_utils.tensorimg_to_pilimg(musetalk_origin_image_tensor),
                musetalk_utils.tensorimg_to_pilimg(origin_face_mask),
                extend1, blur_radius1)
            print(f"frame index: {idx}, real_index: {real_index}, blend one frame, use: {((time.time() - start_time1)*1000):.2f} ms")

            result_images.append(musetalk_utils.pilimg_to_tensorimg(result_image))

            pbar.update(1)
            print(f"frame index: {idx}, real_index: {real_index}, processed one frame, total use: {((time.time() - start_time0)*1000):.2f} ms")

            idx = (idx + 1) % MAX_LEN

        return (
            torch.stack(result_images, dim=0),
        )
238 | 239 | 240 | 241 | if __name__ == "__main__": 242 | 243 | # def postprocess(self, origin_images, 244 | # musetalk_faces, 245 | # rotated_bboxs, 246 | # rotated_images, 247 | # face_center_points, 248 | # rotated_angles, 249 | # origin_face_bboxs, 250 | # origin_face_masks, 251 | # landmarks): 252 | 253 | ori_img = Image.open("./ori.png") 254 | ori_img_cv2 = cv2.cvtColor(np.array(ori_img), cv2.COLOR_RGB2BGR) 255 | ori_img_tensor = musetalk_utils.cv2img_to_tensorimg(ori_img_cv2) 256 | 257 | musetalk_face = Image.open("./failed_image_musetalk_face36.jpg") 258 | musetalk_face_cv2 = cv2.cvtColor(np.array(musetalk_face), cv2.COLOR_RGB2BGR) 259 | musetalk_face_tensor = musetalk_utils.cv2img_to_tensorimg(musetalk_face_cv2) 260 | 261 | rotated_bbox = (150, 201, 721, 729) 262 | 263 | rotated_image = Image.open("./failed_image_rotated_image36.jpg") 264 | rotated_image_cv2 = cv2.cvtColor(np.array(rotated_image), cv2.COLOR_RGB2BGR) 265 | rotated_image_tensor = musetalk_utils.cv2img_to_tensorimg(rotated_image_cv2) 266 | 267 | face_center_point = [(50,50)] 268 | 269 | rotated_angle = [20] 270 | 271 | origin_face_bbox = [(0,0,500,500)] 272 | 273 | origin_face_mask = [None] 274 | 275 | isok, musetalk_rotated_image = musetalk_utils.uncrop_to_rotated_image(musetalk_utils.tensorimg_to_cv2img(musetalk_face_tensor), 276 | rotated_bbox, 277 | musetalk_utils.tensorimg_to_cv2img(rotated_image_tensor)) 278 | print(isok) 279 | 280 | 281 | 282 | # test = MuseTalkPostprocess() 283 | # test.postprocess() 284 | 285 | -------------------------------------------------------------------------------- /musetalk_preprocess.py: -------------------------------------------------------------------------------- 1 | 2 | import cv2 3 | from einops import rearrange 4 | import torch 5 | 6 | from . import musetalk_utils 7 | from . 
import musetalk_global_data 8 | 9 | import comfy 10 | 11 | class MuseTalkPreprocess: 12 | def __init__(self): 13 | pass 14 | 15 | @classmethod 16 | def INPUT_TYPES(s): 17 | return { 18 | "required": { 19 | "origin_images": ("IMAGE",), 20 | "pose_kps": ("POSE_KEYPOINT",), 21 | "crop_type": (["full", "middle-min", "middle-max"],), 22 | "top_reserve": ("INT", {"default": 0, "min": -9999, "max": 9999, "step": 1}), 23 | "bottom_reserve": ("INT", {"default": 0, "min": -9999, "max": 9999, "step": 1}), 24 | "left_reserve": ("INT", {"default": 0, "min": -9999, "max": 9999, "step": 1}), 25 | "right_reserve": ("INT", {"default": 0, "min": -9999, "max": 9999, "step": 1}), 26 | }, 27 | } 28 | 29 | RETURN_TYPES = ("IMAGE", 30 | # "FACE_BBOX", "IMAGE", "FACE_CENTER_POINT", "ROTATE_ANGLE", "FACE_BBOX", "IMAGE", "LANDMARK", 31 | "IMAGE", ) 32 | RETURN_NAMES = ( 33 | "rotated_faces", 34 | # "rotated_bboxs", 35 | # "rotated_images", 36 | # "face_center_points", 37 | # "rotated_angles", 38 | # "origin_face_bboxs", 39 | # "origin_face_masks", 40 | # "landmarks", 41 | "rotated_faces_with_landmarks" 42 | ) 43 | 44 | FUNCTION = "preprocess" 45 | CATEGORY = "MuseTalkUtils" 46 | 47 | def preprocess(self, origin_images, pose_kps, crop_type, top_reserve, bottom_reserve, left_reserve, right_reserve): 48 | 49 | print(f"MuseTalkPreprocess preprocess, len(origin_images): {len(origin_images)}") 50 | 51 | global rotated_faces 52 | global rotated_faces_with_landmarks 53 | 54 | global rotated_bboxs 55 | global rotated_images 56 | global face_center_points 57 | global rotated_angles 58 | global origin_face_bboxs 59 | global origin_face_masks 60 | global origin_face_landmarks 61 | 62 | 63 | musetalk_global_data.rotated_faces = [] 64 | musetalk_global_data.rotated_faces_with_landmarks = [] 65 | 66 | musetalk_global_data.rotated_bboxs = [] 67 | musetalk_global_data.rotated_images = [] 68 | 69 | musetalk_global_data.face_center_points = [] 70 | musetalk_global_data.rotated_angles = [] 71 | 
musetalk_global_data.origin_face_bboxs = [] 72 | musetalk_global_data.origin_face_masks = [] 73 | musetalk_global_data.rotated_resized_half_face_masks = [] 74 | 75 | 76 | if len(origin_images) != len(pose_kps): 77 | print("origin_images is not same with pose_kps by len") 78 | return None 79 | 80 | musetalk_global_data.origin_face_landmarks = musetalk_utils.get_landmards_by_posekey(pose_kps) 81 | 82 | idx = -1 83 | 84 | pbar = comfy.utils.ProgressBar(len(origin_images)) 85 | 86 | for image, landmark in zip(origin_images, musetalk_global_data.origin_face_landmarks): 87 | 88 | idx = idx + 1 89 | 90 | # print("landmark len: ", len(landmark)) 91 | # print("landmark: ", landmark) 92 | 93 | if len(landmark) == 0: 94 | if len(musetalk_global_data.rotated_faces) > 0: 95 | cur_index = len(musetalk_global_data.rotated_faces)-1 96 | musetalk_global_data.rotated_faces.append(musetalk_global_data.rotated_faces[cur_index]) 97 | musetalk_global_data.rotated_faces_with_landmarks.append(musetalk_global_data.rotated_faces_with_landmarks[cur_index]) 98 | 99 | musetalk_global_data.rotated_bboxs.append(musetalk_global_data.rotated_bboxs[cur_index]) 100 | musetalk_global_data.rotated_images.append(musetalk_global_data.rotated_images[cur_index]) 101 | musetalk_global_data.face_center_points.append(musetalk_global_data.face_center_points[cur_index]) 102 | musetalk_global_data.rotated_angles.append(musetalk_global_data.rotated_angles[cur_index]) 103 | musetalk_global_data.origin_face_bboxs.append(musetalk_global_data.origin_face_bboxs[cur_index]) 104 | musetalk_global_data.origin_face_masks.append(musetalk_global_data.origin_face_masks[cur_index]) 105 | musetalk_global_data.origin_face_landmarks.append(musetalk_global_data.origin_face_landmarks[cur_index]) 106 | print(f"not found face, image index: {idx}") 107 | continue 108 | else: 109 | # TODO: process no face in first frame 110 | continue 111 | 112 | landmark = landmark[0] 113 | 114 | origin_image = 
musetalk_utils.tensorimg_to_cv2img(image) 115 | origin_height, origin_width = image.shape[:2] 116 | # print("origin_image shape: ", image.shape) 117 | 118 | origin_face_bbox = musetalk_utils.get_image_face_bbox(landmark) 119 | musetalk_global_data.origin_face_bboxs.append(origin_face_bbox) 120 | # print("origin_face_bbox: ", origin_face_bbox) 121 | 122 | origin_face_mask = musetalk_utils.get_half_face_mask(landmark, origin_width, origin_height) 123 | musetalk_global_data.origin_face_masks.append(musetalk_utils.pilimg_to_tensorimg(origin_face_mask)) 124 | # print("origin_face_mask: ", origin_face_mask.size) 125 | 126 | face_center_point, rotate_angle = musetalk_utils.get_face_center_point_and_rotate_angles(landmark) 127 | musetalk_global_data.face_center_points.append(face_center_point) 128 | musetalk_global_data.rotated_angles.append(rotate_angle) 129 | 130 | # print("face_center_point: ", face_center_point) 131 | # print("rotate_angle: ", rotate_angle) 132 | 133 | rotated_image = musetalk_utils.get_rotated_image(origin_image, face_center_point, rotate_angle) 134 | musetalk_global_data.rotated_images.append(musetalk_utils.cv2img_to_tensorimg(rotated_image)) 135 | # print("rotated_image: ", rotated_image) 136 | 137 | rotated_landmark = musetalk_utils.get_rotatedimage_landmarks(landmark, face_center_point, rotate_angle) 138 | 139 | # print("rotated_landmark:",rotated_landmark) 140 | 141 | rotated_face, resized_rotated_face, rotated_face_bbox = musetalk_utils.get_face_img_and_face_bbox(rotated_image, rotated_landmark, crop_type, top_reserve, bottom_reserve, left_reserve, right_reserve) 142 | 143 | rotated_face_landmark = musetalk_utils.adjust_landmarks_to_crop(rotated_landmark, rotated_face_bbox) 144 | 145 | left, top, right, bottom = rotated_face_bbox 146 | 147 | # print(rotated_face_bbox, right - left, top - bottom) 148 | 149 | rotated_resized_half_face_mask = musetalk_utils.get_half_face_mask(rotated_face_landmark, right - left, bottom - top) 150 | 151 | 
rotated_resized_half_face_mask = rotated_resized_half_face_mask.resize((256, 256)) 152 | 153 | musetalk_global_data.rotated_resized_half_face_masks.append(rotated_resized_half_face_mask) 154 | 155 | rotated_face_with_landmark = musetalk_utils.draw_landmarks(rotated_face, rotated_face_landmark) 156 | 157 | rotated_face_with_landmark = cv2.resize(rotated_face_with_landmark, (256, 256)) 158 | 159 | width = rotated_face_with_landmark.shape[1] 160 | height = rotated_face_with_landmark.shape[0] 161 | 162 | cv2.line(rotated_face_with_landmark, (width//2, 0), (width//2, height-1), (255, 0, 0), 1) # v-center-line, blue 163 | cv2.line(rotated_face_with_landmark, (0, height//2), (width-1, height//2), (255, 0, 0), 1) # h-center-line, blue 164 | 165 | musetalk_global_data.rotated_faces_with_landmarks.append(musetalk_utils.cv2img_to_tensorimg(rotated_face_with_landmark)) 166 | 167 | musetalk_global_data.rotated_bboxs.append(rotated_face_bbox) 168 | musetalk_global_data.rotated_faces.append(musetalk_utils.cv2img_to_tensorimg(resized_rotated_face)) 169 | 170 | pbar.update(1) 171 | 172 | return ( 173 | torch.stack(musetalk_global_data.rotated_faces, dim=0), 174 | # rotated_bboxs, 175 | # torch.stack(rotated_images, dim=0), 176 | # face_center_points, 177 | # rotated_angles, 178 | # origin_face_bboxs, 179 | # torch.stack(origin_face_masks, dim=0), 180 | # origin_face_landmarks, 181 | torch.stack(musetalk_global_data.rotated_faces_with_landmarks, dim=0), 182 | ) 183 | 184 | 185 | 186 | if __name__ == "__main__": 187 | musetalk_utils.get_landmards_by_posekey(None) 188 | print("hello") 189 | -------------------------------------------------------------------------------- /musetalk_train.py: -------------------------------------------------------------------------------- 1 | 2 | import cv2 3 | import os 4 | import numpy as np 5 | import torch 6 | from torch import nn 7 | import torchvision.transforms as transforms 8 | import tqdm 9 | 10 | from torch.utils.data import DataLoader 11 | 
from torch.utils.data import Dataset 12 | 13 | import pickle 14 | import glob 15 | 16 | from .vae import VAE 17 | from .unet import UNet 18 | from .import musetalk_global_data 19 | 20 | import folder_paths 21 | 22 | image_transform = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) 23 | 24 | def preprocess_img(cv2_img_frame, image_size=256, device="cuda"): 25 | window = [] 26 | if isinstance(cv2_img_frame, str): 27 | window_fnames = [cv2_img_frame] 28 | for fname in window_fnames: 29 | img = cv2.imread(fname) 30 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 31 | img = cv2.resize(img, (image_size, image_size), 32 | interpolation=cv2.INTER_LANCZOS4) 33 | window.append(img) 34 | else: 35 | img = cv2.cvtColor(cv2_img_frame, cv2.COLOR_BGR2RGB) 36 | window.append(img) 37 | x = np.asarray(window) / 255. 38 | x = np.transpose(x, (3, 0, 1, 2)) 39 | x = torch.squeeze(torch.FloatTensor(x)) 40 | x = image_transform(x) 41 | # x = x.unsqueeze(0) # [1, 3, 256, 256] torch tensor 42 | x = x.to(device) 43 | return x 44 | 45 | 46 | class FaceDataset(Dataset): 47 | def __init__(self, face_latents, audio_features, cv2_frames): 48 | 49 | super(FaceDataset, self).__init__() 50 | 51 | self.face_latents = face_latents 52 | self.audio_features = audio_features 53 | self.cv2_frames = cv2_frames 54 | 55 | print('FaceDataset', len(self.face_latents)) 56 | 57 | def __getitem__(self, item): 58 | 59 | frame_tensor = preprocess_img(self.cv2_frames[item]) 60 | latent = self.face_latents[item].squeeze(0) 61 | audio_feature = self.audio_features[item] 62 | audio_feature = torch.tensor(audio_feature).cuda() 63 | 64 | # print(f"frame_tensor: {frame_tensor.shape}, latent: {latent.shape}, audio_feature: {audio_feature.shape}") 65 | return frame_tensor, latent, audio_feature 66 | 67 | def __len__(self): 68 | return len(self.face_latents) 69 | 70 | # only for debug 71 | class FaceDataset2(Dataset): 72 | def __init__(self, dataset_root): 73 | super(FaceDataset2, self).__init__() 74 | 
self.dataset_root = dataset_root 75 | self.frame_root = os.path.join(self.dataset_root, "frame") 76 | with open(os.path.join(self.dataset_root, "face_latent.pkl"), 'rb') as f: 77 | self.face_latents = pickle.load(f) 78 | with open(os.path.join(self.dataset_root, "whisper_chunks.pkl"), 'rb') as f: 79 | self.audio_features = pickle.load(f) 80 | self.frames_im_path_list = list(sorted(glob.glob(os.path.join(self.frame_root, "*.png")))) 81 | 82 | def __getitem__(self, item): 83 | frame = cv2.imread(self.frames_im_path_list[item]) 84 | frame_tensor = preprocess_img(frame) 85 | latent = self.face_latents[item].squeeze(0) 86 | audio_feature = self.audio_features[item] 87 | audio_feature = torch.tensor(audio_feature).cuda() 88 | 89 | # print(f"frame_tensor: {frame_tensor.shape}, latent: {latent.shape}, audio_feature: {audio_feature.shape}") 90 | return frame_tensor, latent, audio_feature 91 | 92 | def __len__(self): 93 | return len(self.frames_im_path_list) 94 | 95 | 96 | class MuseTalkTrain: 97 | def __init__(self): 98 | pass 99 | 100 | @classmethod 101 | def INPUT_TYPES(s): 102 | return { 103 | "required": { 104 | "images": ("IMAGE",), 105 | "whisper_features" : ("WHISPERFEAT",), 106 | "batch_size": ("INT", {"default": 4, "min": 1, "max": 4096, "step": 1}), 107 | }, 108 | } 109 | 110 | RETURN_TYPES = ("IMAGE", ) 111 | RETURN_NAMES = ("images", ) 112 | 113 | FUNCTION = "train" 114 | CATEGORY = "MuseTalkUtils" 115 | 116 | # TODO, images 117 | def train(self, images, whisper_features, batch_size): 118 | 119 | with torch.inference_mode(False): 120 | 121 | model_path_base = os.path.join(folder_paths.models_dir,'musetalk') 122 | model_config_path = os.path.join(model_path_base, "musetalk", "musetalk.json") 123 | model_bin_path = os.path.join(model_path_base, "musetalk", "pytorch_model.bin")# TODO, name 124 | vae_path = os.path.join(model_path_base, "sd-vae-ft-mse") 125 | 126 | # model_config_path = "F:/MuseTalk/talk/models/musetalk/musetalk.json" 127 | # model_bin_path = 
"F:/MuseTalk/talk/models/musetalk/pytorch_model.bin" 128 | # vae_path = "F:/MuseTalk/talk/models/sd-vae-ft-mse/" 129 | 130 | unet = UNet(unet_config = model_config_path, model_path = model_bin_path) 131 | vae = VAE(model_path = vae_path) 132 | 133 | # global unet 134 | global resized_cv2_frame_list 135 | global faces_latent_list 136 | 137 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 138 | 139 | vae.vae.eval() 140 | unet.model.train() 141 | 142 | timesteps = torch.tensor([0], device=device) 143 | lr = 1e-4 144 | # lr = 5e-5 145 | criterion = nn.HuberLoss() 146 | optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, unet.model.parameters()), lr=lr) 147 | # optimizer = torch.optim.Adamax(filter(lambda p: p.requires_grad, unet.model.parameters()), lr=lr) 148 | 149 | # 150 | save_ckpt_dir = os.path.join(model_path_base, "musetalk") 151 | 152 | print("len", len(musetalk_global_data.faces_latent_list), len(whisper_features), len(musetalk_global_data.resized_cv2_frame_list)) 153 | 154 | face_dataset = FaceDataset(musetalk_global_data.faces_latent_list, whisper_features, musetalk_global_data.resized_cv2_frame_list) 155 | 156 | # face_dataset = FaceDataset2("F:/MuseTalk/talk/data/train_dataset/v2") 157 | 158 | face_dataloader = DataLoader(face_dataset, batch_size = batch_size, shuffle=True, num_workers=0) 159 | 160 | # TODO param 161 | for epoch in range(0, 100): 162 | pbar = tqdm.tqdm(enumerate(face_dataloader), total=len(face_dataloader)) 163 | loss_log = [] 164 | for i, (face_tensor, latent_tensor, audio_feat) in pbar: 165 | 166 | audio_feat = audio_feat.to(torch.float32) 167 | 168 | pred_latents = unet.model(latent_tensor, timesteps, encoder_hidden_states=audio_feat).sample 169 | 170 | # print(f"pred_latents: {pred_latents.requires_grad}") 171 | recon = vae.just_decode_latents(pred_latents) 172 | 173 | gt_latent = vae.encode_latents(face_tensor) 174 | loss = 0.2 * criterion(pred_latents, gt_latent) + 0.8 * criterion(recon, face_tensor) 
175 | loss.backward() 176 | loss_log.append(loss.item()) 177 | optimizer.step() 178 | optimizer.zero_grad() 179 | pbar.set_description("(Epoch {}) TRAIN LOSS:{:.8f}".format((epoch + 1), np.mean(loss_log))) 180 | 181 | torch.save(unet.model.state_dict(), os.path.join(save_ckpt_dir, "epoch_{}.pth".format(epoch))) 182 | 183 | return (images,) 184 | 185 | if __name__ == "__main__": 186 | 187 | train = MuseTalkTrain() 188 | train.train(None, [], 4) 189 | 190 | 191 | # print("dgdg") 192 | 193 | # print("hehehh ") 194 | # train = MuseTalkTrain() 195 | # train.train(None, [], 4) 196 | -------------------------------------------------------------------------------- /musetalk_train_preprocess.py: -------------------------------------------------------------------------------- 1 | 2 | import cv2 3 | import os 4 | import random 5 | import torch 6 | 7 | from . import musetalk_utils 8 | from . import vae 9 | from . import musetalk_global_data 10 | 11 | import comfy 12 | import folder_paths 13 | 14 | model_path = os.path.join(folder_paths.models_dir,'musetalk') 15 | 16 | vae_module = vae.VAE(model_path = os.path.join(model_path, "sd-vae-ft-mse")) 17 | 18 | class MuseTalkTrainPreprocess: 19 | def __init__(self): 20 | pass 21 | 22 | @classmethod 23 | def INPUT_TYPES(s): 24 | return { 25 | "required": { 26 | "origin_images": ("IMAGE",), 27 | "pose_kps": ("POSE_KEYPOINT",), 28 | "crop_type": (["full", "middle-min", "middle-max"],), 29 | "top_reserve": ("INT", {"default": 0, "min": -9999, "max": 9999, "step": 1}), 30 | "bottom_reserve": ("INT", {"default": 0, "min": -9999, "max": 9999, "step": 1}), 31 | "left_reserve": ("INT", {"default": 0, "min": -9999, "max": 9999, "step": 1}), 32 | "right_reserve": ("INT", {"default": 0, "min": -9999, "max": 9999, "step": 1}), 33 | }, 34 | } 35 | 36 | RETURN_TYPES = ("IMAGE", 37 | # "FACE_BBOX", "IMAGE", "FACE_CENTER_POINT", "ROTATE_ANGLE", "FACE_BBOX", "IMAGE", "LANDMARK", 38 | "IMAGE", ) 39 | RETURN_NAMES = ( 40 | "rotated_faces", 41 | # 
"rotated_bboxs", 42 | # "rotated_images", 43 | # "face_center_points", 44 | # "rotated_angles", 45 | # "origin_face_bboxs", 46 | # "origin_face_masks", 47 | # "landmarks", 48 | "rotated_faces_with_landmarks" 49 | ) 50 | 51 | FUNCTION = "preprocess" 52 | CATEGORY = "MuseTalkUtils" 53 | 54 | def preprocess(self, origin_images, pose_kps, crop_type, top_reserve, bottom_reserve, left_reserve, right_reserve): 55 | 56 | print(f"MuseTalkPreprocess preprocess, len(origin_images): {len(origin_images)}") 57 | 58 | global rotated_faces 59 | global rotated_faces_with_landmarks 60 | 61 | global rotated_bboxs 62 | global rotated_images 63 | global face_center_points 64 | global rotated_angles 65 | global origin_face_bboxs 66 | global origin_face_masks 67 | global origin_face_landmarks 68 | global faces_latent_list 69 | global resized_cv2_frame_list 70 | 71 | 72 | musetalk_global_data.rotated_faces = [] 73 | musetalk_global_data.rotated_faces_with_landmarks = [] 74 | 75 | musetalk_global_data.rotated_bboxs = [] 76 | musetalk_global_data.rotated_images = [] 77 | 78 | musetalk_global_data.face_center_points = [] 79 | musetalk_global_data.rotated_angles = [] 80 | musetalk_global_data.origin_face_bboxs = [] 81 | musetalk_global_data.origin_face_masks = [] 82 | musetalk_global_data.rotated_resized_half_face_masks = [] 83 | 84 | musetalk_global_data.faces_latent_list = [] 85 | musetalk_global_data.resized_cv2_frame_list = [] 86 | 87 | 88 | if len(origin_images) != len(pose_kps): 89 | print("origin_images is not same with pose_kps by len") 90 | return None 91 | 92 | musetalk_global_data.origin_face_landmarks = musetalk_utils.get_landmards_by_posekey(pose_kps) 93 | 94 | idx = -1 95 | 96 | # pbar = comfy.utils.ProgressBar(len(origin_images)) 97 | 98 | for image, landmark in zip(origin_images, musetalk_global_data.origin_face_landmarks): 99 | 100 | idx = idx + 1 101 | 102 | # print("landmark len: ", len(landmark)) 103 | # print("landmark: ", landmark) 104 | 105 | if len(landmark) == 0: 106 
| if len(musetalk_global_data.rotated_faces) > 0: 107 | cur_index = len(musetalk_global_data.rotated_faces)-1 108 | musetalk_global_data.rotated_faces.append(musetalk_global_data.rotated_faces[cur_index]) 109 | musetalk_global_data.rotated_faces_with_landmarks.append(musetalk_global_data.rotated_faces_with_landmarks[cur_index]) 110 | 111 | musetalk_global_data.rotated_bboxs.append(musetalk_global_data.rotated_bboxs[cur_index]) 112 | musetalk_global_data.rotated_images.append(musetalk_global_data.rotated_images[cur_index]) 113 | musetalk_global_data.face_center_points.append(musetalk_global_data.face_center_points[cur_index]) 114 | musetalk_global_data.rotated_angles.append(musetalk_global_data.rotated_angles[cur_index]) 115 | musetalk_global_data.origin_face_bboxs.append(musetalk_global_data.origin_face_bboxs[cur_index]) 116 | musetalk_global_data.origin_face_masks.append(musetalk_global_data.origin_face_masks[cur_index]) 117 | musetalk_global_data.origin_face_landmarks.append(musetalk_global_data.origin_face_landmarks[cur_index]) 118 | print(f"not found face, image index: {idx}") 119 | continue 120 | else: 121 | # TODO: process no face first frame 122 | continue 123 | 124 | landmark = landmark[0] 125 | 126 | origin_image = musetalk_utils.tensorimg_to_cv2img(image) 127 | origin_height, origin_width = image.shape[:2] 128 | # print("origin_image shape: ", image.shape) 129 | 130 | origin_face_bbox = musetalk_utils.get_image_face_bbox(landmark) 131 | musetalk_global_data.origin_face_bboxs.append(origin_face_bbox) 132 | # print("origin_face_bbox: ", origin_face_bbox) 133 | 134 | origin_face_mask = musetalk_utils.get_half_face_mask(landmark, origin_width, origin_height) 135 | musetalk_global_data.origin_face_masks.append(musetalk_utils.pilimg_to_tensorimg(origin_face_mask)) 136 | # print("origin_face_mask: ", origin_face_mask.size) 137 | 138 | face_center_point, rotate_angle = musetalk_utils.get_face_center_point_and_rotate_angles(landmark) 139 | 
musetalk_global_data.face_center_points.append(face_center_point) 140 | musetalk_global_data.rotated_angles.append(rotate_angle) 141 | 142 | # print("face_center_point: ", face_center_point) 143 | # print("rotate_angle: ", rotate_angle) 144 | 145 | rotated_image = musetalk_utils.get_rotated_image(origin_image, face_center_point, rotate_angle) 146 | musetalk_global_data.rotated_images.append(musetalk_utils.cv2img_to_tensorimg(rotated_image)) 147 | # print("rotated_image: ", rotated_image) 148 | 149 | rotated_landmark = musetalk_utils.get_rotatedimage_landmarks(landmark, face_center_point, rotate_angle) 150 | 151 | # print("rotated_landmark:",rotated_landmark) 152 | 153 | rotated_face, resized_rotated_face, rotated_face_bbox = musetalk_utils.get_face_img_and_face_bbox(rotated_image, rotated_landmark, crop_type, top_reserve, bottom_reserve, left_reserve, right_reserve) 154 | 155 | musetalk_global_data.resized_cv2_frame_list.append(resized_rotated_face) 156 | 157 | rotated_face_landmark = musetalk_utils.adjust_landmarks_to_crop(rotated_landmark, rotated_face_bbox) 158 | 159 | left, top, right, bottom = rotated_face_bbox 160 | 161 | # print(rotated_face_bbox, right - left, top - bottom) 162 | 163 | rotated_resized_half_face_mask = musetalk_utils.get_half_face_mask(rotated_face_landmark, right - left, bottom - top) 164 | 165 | rotated_resized_half_face_mask = rotated_resized_half_face_mask.resize((256, 256)) 166 | 167 | musetalk_global_data.rotated_resized_half_face_masks.append(rotated_resized_half_face_mask) 168 | 169 | rotated_face_with_landmark = musetalk_utils.draw_landmarks(rotated_face, rotated_face_landmark) 170 | 171 | rotated_face_with_landmark = cv2.resize(rotated_face_with_landmark, (256, 256)) 172 | 173 | # draw debug center line 174 | width = rotated_face_with_landmark.shape[1] 175 | height = rotated_face_with_landmark.shape[0] 176 | cv2.line(rotated_face_with_landmark, (width//2, 0), (width//2, height-1), (255, 0, 0), 1) # v line, blue 177 | 
cv2.line(rotated_face_with_landmark, (0, height//2), (width-1, height//2), (255, 0, 0), 1) # h line, blue 178 | 179 | musetalk_global_data.rotated_faces_with_landmarks.append(musetalk_utils.cv2img_to_tensorimg(rotated_face_with_landmark)) 180 | 181 | musetalk_global_data.rotated_bboxs.append(rotated_face_bbox) 182 | musetalk_global_data.rotated_faces.append(musetalk_utils.cv2img_to_tensorimg(resized_rotated_face)) 183 | # pbar.update(1) 184 | 185 | 186 | frame_count = len(musetalk_global_data.resized_cv2_frame_list) 187 | pbar = comfy.utils.ProgressBar(frame_count) 188 | 189 | print("frame_count", frame_count) 190 | 191 | for fid in range(frame_count): 192 | gt_face = musetalk_global_data.resized_cv2_frame_list[fid] 193 | rand_num = random.randint(0, frame_count-1) 194 | ref_face = musetalk_global_data.resized_cv2_frame_list[rand_num] 195 | latents = vae_module.get_train_latents_for_unet(gt_face, ref_face) 196 | musetalk_global_data.faces_latent_list.append(latents) 197 | 198 | pbar.update(1) 199 | 200 | return ( 201 | torch.stack(musetalk_global_data.rotated_faces, dim=0), 202 | # rotated_bboxs, 203 | # torch.stack(rotated_images, dim=0), 204 | # face_center_points, 205 | # rotated_angles, 206 | # origin_face_bboxs, 207 | # torch.stack(origin_face_masks, dim=0), 208 | # origin_face_landmarks, 209 | torch.stack(musetalk_global_data.rotated_faces_with_landmarks, dim=0), 210 | ) 211 | 212 | 213 | -------------------------------------------------------------------------------- /musetalk_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import torch 4 | import numpy as np 5 | from einops import rearrange 6 | from PIL import Image, ImageDraw,ImageFilter 7 | import scipy.ndimage 8 | 9 | def pilimg_to_cv2img(pil_img): 10 | 11 | numpy_image = np.array(pil_img) 12 | 13 | # to 3 channels 14 | if numpy_image.ndim == 2: 15 | numpy_image = np.repeat(numpy_image[:, :, np.newaxis], 3, axis=2) 16 | 17 | # 
remove Alpha 18 | if numpy_image.shape[2] == 4: 19 | numpy_image = numpy_image[:, :, :3] 20 | 21 | # to BRG 22 | bgr_image = cv2.cvtColor(numpy_image, cv2.COLOR_RGB2BGR) 23 | 24 | return bgr_image 25 | 26 | def tensorimg_to_cv2img(tensor_img): 27 | numpy_image = tensor_img.numpy() 28 | numpy_image = numpy_image * 255.0 29 | numpy_image = numpy_image.astype('uint8') 30 | rgb_image = cv2.cvtColor(numpy_image, cv2.COLOR_BGR2RGB) 31 | return rgb_image 32 | 33 | def cv2img_to_tensorimg(img): 34 | img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 35 | numpy_image = np.array(img_rgb) 36 | numpy_image = numpy_image / 255.0 37 | tensor_img = torch.from_numpy(numpy_image) 38 | return tensor_img 39 | 40 | def pilimg_to_tensorimg(pil_img): 41 | numpy_image = np.array(pil_img) 42 | tensor_img = torch.tensor(numpy_image, dtype=torch.float32) / 255.0 43 | return tensor_img 44 | 45 | def tensorimg_to_pilimg(tensor_img): 46 | numpy_image = (tensor_img * 255).byte().numpy() 47 | 48 | numpy_image = np.clip(numpy_image, 0, 255).astype(np.uint8) 49 | 50 | pil_img = Image.fromarray(numpy_image) 51 | 52 | return pil_img 53 | 54 | def is_normalized(keypoints) -> bool: 55 | point_normalized = [ 56 | 0 <= np.abs(k[0]) <= 1 and 0 <= np.abs(k[1]) <= 1 57 | for k in keypoints 58 | if k is not None 59 | ] 60 | if not point_normalized: 61 | return False 62 | return np.all(point_normalized) 63 | 64 | 65 | def get_half_face_mask(landmark, width, height): 66 | 67 | mask = Image.new("RGB", (width, height), (0,0,0)) 68 | 69 | # https://www.researchgate.net/profile/Fabrizio-Falchi/publication/338048224/figure/fig1/AS:837860722741255@1576772971540/68-facial-landmarks.jpg 70 | points = landmark[0:17] 71 | # points.append(landmark[30]) 72 | 73 | draw = ImageDraw.Draw(mask) 74 | draw.polygon(points, fill=(255,255,255)) 75 | 76 | return mask 77 | 78 | def draw_landmarks(img, landmarks): 79 | 80 | img_copy = img.copy() 81 | for i, (x, y) in enumerate(landmarks): 82 | # # 
https://www.researchgate.net/profile/Fabrizio-Falchi/publication/338048224/figure/fig1/AS:837860722741255@1576772971540/68-facial-landmarks.jpg 83 | if i == 29 or i==48 or i == 54: 84 | # center nose , left mouth, right mouth 85 | cv2.circle(img_copy, (x, y), 2, (0, 0, 255), -1) # red 86 | else: 87 | cv2.circle(img_copy, (x, y), 2, (0, 255, 0), -1) # green 88 | 89 | return img_copy 90 | 91 | 92 | def get_landmards_by_posekey(pose_kps): 93 | # print("in get_landmards_by_posekey len(pose_kps)", len(pose_kps)) 94 | land_marks = [] 95 | for pose_frame in pose_kps: 96 | width, height = pose_frame["canvas_width"], pose_frame["canvas_height"] 97 | person_landmark = [] 98 | for person in pose_frame["people"]: 99 | 100 | if "face_keypoints_2d" in person and person["face_keypoints_2d"] is not None: 101 | 102 | n = len(person["face_keypoints_2d"]) // 3 103 | 104 | facial_kps = rearrange(np.array(person["face_keypoints_2d"]), "(n c) -> n c", n=n, c=3)[:, :2] 105 | 106 | if is_normalized(facial_kps): 107 | facial_kps *= (width, height) 108 | 109 | facial_kps = facial_kps.astype(np.int32) 110 | 111 | one_person_land_marks = [(x, y) for x, y in facial_kps] 112 | 113 | person_landmark.append(one_person_land_marks) 114 | else: 115 | print("not found face!!!") 116 | 117 | land_marks.append(person_landmark) 118 | 119 | return land_marks 120 | 121 | def get_mouth_center_point_by_landmark(landmark): 122 | mouth_center_x = (landmark[51][0] + landmark[57][0]) // 2 123 | mouth_center_y = (landmark[51][1] + landmark[57][1]) // 2 124 | return (mouth_center_x, mouth_center_y) 125 | 126 | def get_mouth_width_by_landmark(landmark): 127 | left_mouth_x = landmark[48][0] # left mouth 128 | right_mouth_x = landmark[54][0] # right mouth 129 | return right_mouth_x - left_mouth_x 130 | 131 | 132 | def get_image_face_bbox(landmark): 133 | 134 | # face bbox 135 | left = min(landmark[i][0] for i in range(0, 17)) 136 | right = max(landmark[i][0] for i in range(0, 17)) 137 | bottom = max(landmark[i][1] 
for i in range(0, 27)) 138 | 139 | # 51 top mouth 140 | # 57 bottom mouth 141 | # mouth_center_x = (landmark[51][0] + landmark[57][0]) // 2 142 | mouth_center_y = (landmark[51][1] + landmark[57][1]) // 2 143 | 144 | 145 | # left_mouth_x = landmark[48][0] 146 | # right_mouth_x = landmark[54][0] 147 | 148 | # mouth_width = right_mouth_x - left_mouth_x 149 | 150 | # harf_x_left = mouth_center_x - left 151 | # harf_x_right = right - mouth_center_x 152 | # harf_x = max(harf_x_left, harf_x_right) 153 | 154 | # print("left:", bottom) 155 | # print("harf_x:", harf_x) 156 | 157 | # left = mouth_center_x - harf_x 158 | # right = mouth_center_x + harf_x 159 | 160 | one_fourth_y = bottom - mouth_center_y 161 | top = bottom - one_fourth_y*4 162 | 163 | # middle_y = bottom - landmark[29][1] 164 | # top = bottom - middle_y * 2 165 | 166 | 167 | # TODO,out-of-bounds process 168 | # top = top + top_reserve 169 | # bottom = bottom + bottom_reserve 170 | # left = left + left_reserve 171 | # right = right + right_reserve 172 | 173 | face_bbox = left, top, right, bottom 174 | 175 | return face_bbox 176 | 177 | def get_face_center_point_and_rotate_angles(landmarks): 178 | 179 | landmarks = np.array(landmarks) 180 | 181 | # face center point 182 | center_point = np.mean(landmarks, axis=0) 183 | 184 | # left eye and right eye 185 | # left_point = landmarks[36] 186 | # right_point = landmarks[45] 187 | 188 | # left mouth and right mouth 189 | left_point = landmarks[48] 190 | right_point = landmarks[54] 191 | 192 | # cal angle 193 | angle = np.arctan2(right_point[1] - left_point[1], right_point[0] - left_point[0]) * 180 / np.pi 194 | 195 | return center_point, angle 196 | 197 | def get_rotated_image(origin_image, face_center_point, rotate_angle): 198 | 199 | rotation_matrix = cv2.getRotationMatrix2D(tuple(face_center_point), rotate_angle, 1) 200 | rotated_image = cv2.warpAffine(origin_image, rotation_matrix, (origin_image.shape[1], origin_image.shape[0]), flags=cv2.INTER_NEAREST) 201 | 202 
| return rotated_image 203 | 204 | 205 | def get_rotatedimage_landmarks(landmark, face_center_point, rotate_angle): 206 | 207 | landmark = np.array(landmark) 208 | 209 | rotation_matrix = cv2.getRotationMatrix2D(tuple(face_center_point), rotate_angle, 1) 210 | 211 | adjusted_landmarks = landmark - face_center_point 212 | rotated_landmark = np.dot(rotation_matrix[:, :2], adjusted_landmarks.T).T + face_center_point 213 | 214 | converted_landmarks = [(int(point[0]), int(point[1])) for point in rotated_landmark] 215 | 216 | return converted_landmarks 217 | 218 | def adjust_landmarks_to_crop(landmarks, bbox): 219 | 220 | left, top, right, bottom = bbox 221 | width = right - left 222 | height = bottom - top 223 | 224 | offset_x = left 225 | offset_y = top 226 | 227 | adjusted_landmarks = [(x - offset_x, y - offset_y) for x, y in landmarks] 228 | 229 | return adjusted_landmarks 230 | 231 | def get_face_img_and_face_bbox(image, landmark, crop_type, top_reserve, bottom_reserve, left_reserve, right_reserve): 232 | 233 | # face bbox 234 | left = min(landmark[i][0] for i in range(0, 17)) 235 | right = max(landmark[i][0] for i in range(0, 17)) 236 | bottom = max(landmark[i][1] for i in range(0, 27)) 237 | 238 | # modify top last 239 | bottom = bottom + bottom_reserve 240 | left = left - left_reserve 241 | right = right + right_reserve 242 | 243 | # mouth up center: 51 244 | # mouth down center: 57 245 | mouth_center_x = (landmark[51][0] + landmark[57][0]) // 2 246 | mouth_center_y = (landmark[51][1] + landmark[57][1]) // 2 247 | 248 | left_mouth_x = landmark[48][0] 249 | right_mouth_x = landmark[54][0] 250 | 251 | mouth_width = right_mouth_x - left_mouth_x 252 | 253 | harf_x_left = mouth_center_x - left 254 | harf_x_right = right - mouth_center_x 255 | 256 | if crop_type == "middle-min": 257 | harf_x = min(harf_x_left, harf_x_right) 258 | 259 | # print("left:", bottom) 260 | # print("harf_x:", harf_x) 261 | 262 | left = mouth_center_x - harf_x 263 | right = mouth_center_x + 
harf_x 264 | elif crop_type == "middle-max": 265 | harf_x = max(harf_x_left, harf_x_right) 266 | 267 | # print("left:", bottom) 268 | # print("harf_x:", harf_x) 269 | 270 | left = mouth_center_x - harf_x 271 | right = mouth_center_x + harf_x 272 | elif crop_type == "full": 273 | pass 274 | 275 | # left = int(mouth_center_x - mouth_width) 276 | # right = int(mouth_center_x + mouth_width) 277 | 278 | # one_fourth_height = bottom - mouth_center_y 279 | # half_height = one_fourth_height * 2 280 | 281 | # middle_y = bottom - half_height 282 | 283 | # if middle_y < landmark[28][1]: 284 | # middle_y = landmark[28][1] 285 | # if middle_y > landmark[30][1]: 286 | # middle_y = landmark[30][1] 287 | 288 | # half_height = bottom - middle_y 289 | # top = bottom - half_height * 2 290 | 291 | # landmark29 in v-center 292 | middle_y = bottom - landmark[29][1] 293 | top = bottom - middle_y * 2 294 | 295 | top = top - top_reserve 296 | 297 | # out of bounds 298 | left = max(0, left) 299 | top = max(0, top) 300 | right = min(image.shape[1], right) 301 | bottom = min(image.shape[0], bottom) 302 | 303 | # print(f"left: {left}, top: {top}, right: {right}, bottom: {bottom}") 304 | 305 | face_image = image[top:bottom, left:right] 306 | 307 | resized_face_image = cv2.resize(face_image,(256,256)) 308 | 309 | face_bbox = left, top, right, bottom 310 | 311 | return face_image, resized_face_image, face_bbox 312 | 313 | 314 | def create_uncrop_mask(width, height, center, v_axes, h_axes): 315 | 316 | 317 | mask = np.zeros((height, width), dtype=np.uint8) 318 | 319 | axes = (h_axes, v_axes) 320 | angle = 90 321 | color = 255 322 | 323 | cv2.ellipse(mask, center, axes, angle, 0, 360, color, thickness=-1) 324 | 325 | pil_image = Image.fromarray(mask) 326 | 327 | return pil_image 328 | 329 | 330 | 331 | def uncrop_to_rotated_image(rotated_face, musetalk_face, rotated_bbox, rotated_image, uncrop_mask, extend, radius): 332 | 333 | mask = uncrop_mask.copy() 334 | 335 | # TODO,optimize 336 | mask = 
mask.convert('L') 337 | 338 | rotated_face_copy = rotated_face.copy() 339 | 340 | rotated_face_copy.paste(musetalk_face, (0, 0), mask) 341 | 342 | x_min, y_min, x_max, y_max = rotated_bbox 343 | 344 | origin_width, origin_height = rotated_image.size 345 | 346 | x_min = max(0, x_min) 347 | y_min = max(0, y_min) 348 | 349 | x_max = min(x_max, origin_width) 350 | y_max = min(y_max, origin_height) 351 | 352 | width = x_max - x_min 353 | height = y_max - y_min 354 | 355 | rotated_face_copy = rotated_face_copy.resize((width, height)) 356 | 357 | # print("width:", width) 358 | # print("Height:", height) 359 | 360 | # musetalk_face = musetalk_face.resize((width, height)) 361 | 362 | # mask = uncrop_mask 363 | 364 | # mask = mask.convert('L') 365 | 366 | # mask = mask.resize((width, height)) 367 | 368 | if extend != 0: 369 | mask = expand_mask(mask, extend, True) 370 | 371 | if radius != 0: 372 | mask = mask.filter(ImageFilter.GaussianBlur(radius=radius)) 373 | 374 | # print(f"musetalk_face mode:{musetalk_face.mode} {musetalk_face.size}, rotated_image mode: {rotated_image.mode}, {rotated_image.size}, {mask.size}") 375 | 376 | rotated_image.paste(rotated_face_copy, (x_min, y_min)) 377 | 378 | mask = mask.convert('RGB') 379 | 380 | return rotated_image, mask 381 | 382 | def unrotated_image(musetalk_rotated_image, face_center_point, rotate_angle, width, height): 383 | rotation_matrix = cv2.getRotationMatrix2D(face_center_point, -rotate_angle, 1) 384 | musetalk_origin_image = cv2.warpAffine(musetalk_rotated_image, rotation_matrix, (width, height)) 385 | 386 | return musetalk_origin_image 387 | 388 | def expand_mask(mask, expand, tapered_corners): 389 | 390 | mask = np.array(mask) 391 | c = 0 if tapered_corners else 1 392 | 393 | kernel = np.array([[c, 1, c], 394 | [1, 1, 1], 395 | [c, 1, c]]) 396 | 397 | iterations = abs(expand) 398 | 399 | operation = scipy.ndimage.morphology.binary_erosion if expand < 0 else scipy.ndimage.morphology.binary_dilation 400 | 401 | mask = 
operation(mask, structure=kernel, iterations=iterations) 402 | 403 | return Image.fromarray(mask.astype(np.uint8) * 255) 404 | 405 | 406 | def blend_to_origin_image(origin_image, musetalk_origin_image, origin_face_mask, extend, radius): 407 | 408 | origin_face_mask = origin_face_mask.convert('L') 409 | 410 | origin_face_mask = expand_mask(origin_face_mask, extend, True) 411 | 412 | origin_face_mask = origin_face_mask.resize(musetalk_origin_image.size) 413 | 414 | origin_face_mask = origin_face_mask.filter(ImageFilter.BoxBlur(radius=radius)) 415 | origin_face_mask = origin_face_mask.filter(ImageFilter.GaussianBlur(radius=radius)) 416 | 417 | origin_image.paste(musetalk_origin_image, (0, 0), origin_face_mask) 418 | 419 | return origin_image, origin_face_mask -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "comfyui-musetalkutils" 3 | description = "MuseTalk ComfyUI Preprocess and Postprocess Nodes" 4 | version = "1.0.0" 5 | license = "LICENSE" 6 | 7 | [project.urls] 8 | Repository = "https://github.com/xuhongming251/ComfyUI-MuseTalkUtils" 9 | # Used by Comfy Registry https://comfyregistry.org 10 | 11 | [tool.comfy] 12 | PublisherId = "" 13 | DisplayName = "ComfyUI-MuseTalkUtils" 14 | Icon = "" 15 | -------------------------------------------------------------------------------- /unet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import math 4 | import json 5 | 6 | from diffusers import UNet2DConditionModel 7 | import sys 8 | import time 9 | import numpy as np 10 | import os 11 | 12 | class PositionalEncoding(nn.Module): 13 | def __init__(self, d_model=384, max_len=5000): 14 | super(PositionalEncoding, self).__init__() 15 | pe = torch.zeros(max_len, d_model) 16 | position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) 
17 | div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) 18 | pe[:, 0::2] = torch.sin(position * div_term) 19 | pe[:, 1::2] = torch.cos(position * div_term) 20 | pe = pe.unsqueeze(0) 21 | self.register_buffer('pe', pe) 22 | 23 | def forward(self, x): 24 | b, seq_len, d_model = x.size() 25 | pe = self.pe[:, :seq_len, :] 26 | x = x + pe.to(x.device) 27 | return x 28 | 29 | class UNet(): 30 | def __init__(self, 31 | unet_config, 32 | model_path, 33 | use_float16=False, 34 | ): 35 | with open(unet_config, 'r') as f: 36 | unet_config = json.load(f) 37 | self.model = UNet2DConditionModel(**unet_config) 38 | self.pe = PositionalEncoding(d_model=384) 39 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 40 | weights = torch.load(model_path) if torch.cuda.is_available() else torch.load(model_path, map_location=self.device) 41 | self.model.load_state_dict(weights) 42 | if use_float16: 43 | self.model = self.model.half() 44 | self.model.to(self.device) 45 | 46 | if __name__ == "__main__": 47 | unet = UNet() 48 | -------------------------------------------------------------------------------- /vae.py: -------------------------------------------------------------------------------- 1 | from diffusers import AutoencoderKL 2 | import torch 3 | import torchvision.transforms as transforms 4 | import torch.nn.functional as F 5 | import cv2 6 | import numpy as np 7 | from PIL import Image 8 | import os 9 | 10 | class VAE(): 11 | """ 12 | VAE (Variational Autoencoder) class for image processing. 13 | """ 14 | 15 | def __init__(self, model_path="./models/sd-vae-ft-mse/", resized_img=256, use_float16=False): 16 | """ 17 | Initialize the VAE instance. 18 | 19 | :param model_path: Path to the trained model. 20 | :param resized_img: The size to which images are resized. 21 | :param use_float16: Whether to use float16 precision. 
22 | """ 23 | self.model_path = model_path 24 | self.vae = AutoencoderKL.from_pretrained(self.model_path) 25 | 26 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 27 | self.vae.to(self.device) 28 | 29 | if use_float16: 30 | self.vae = self.vae.half() 31 | self._use_float16 = True 32 | else: 33 | self._use_float16 = False 34 | 35 | self.scaling_factor = self.vae.config.scaling_factor 36 | self.transform = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) 37 | self._resized_img = resized_img 38 | self._mask_tensor = self.get_mask_tensor() 39 | 40 | def get_mask_tensor(self): 41 | """ 42 | Creates a mask tensor for image processing. 43 | :return: A mask tensor. 44 | """ 45 | mask_tensor = torch.zeros((self._resized_img,self._resized_img)) 46 | mask_tensor[:self._resized_img//2,:] = 1 47 | mask_tensor[mask_tensor< 0.5] = 0 48 | mask_tensor[mask_tensor>= 0.5] = 1 49 | return mask_tensor 50 | 51 | def preprocess_img(self,img_name,half_mask=False): 52 | """ 53 | Preprocess an image for the VAE. 54 | 55 | :param img_name: The image file path or a list of image file paths. 56 | :param half_mask: Whether to apply a half mask to the image. 57 | :return: A preprocessed image tensor. 58 | """ 59 | window = [] 60 | if isinstance(img_name, str): 61 | window_fnames = [img_name] 62 | for fname in window_fnames: 63 | img = cv2.imread(fname) 64 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 65 | img = cv2.resize(img, (self._resized_img, self._resized_img), 66 | interpolation=cv2.INTER_LANCZOS4) 67 | window.append(img) 68 | else: 69 | img = cv2.cvtColor(img_name, cv2.COLOR_BGR2RGB) 70 | window.append(img) 71 | 72 | x = np.asarray(window) / 255. 
73 | # print("0x shape:", x.shape) 74 | x = np.transpose(x, (3, 0, 1, 2)) 75 | # print("1x shape:", x.shape) 76 | # print("self._mask_tensor shape", self._mask_tensor.shape) 77 | x = torch.squeeze(torch.FloatTensor(x)) 78 | if half_mask: 79 | # print("_mask_tensor:", self._mask_tensor) 80 | # print("x:", x) 81 | 82 | x = x * (self._mask_tensor>0.5) 83 | x = self.transform(x) 84 | 85 | x = x.unsqueeze(0) # [1, 3, 256, 256] torch tensor 86 | x = x.to(self.vae.device) 87 | 88 | return x 89 | 90 | def encode_latents(self,image): 91 | """ 92 | Encode an image into latent variables. 93 | 94 | :param image: The image tensor to encode. 95 | :return: The encoded latent variables. 96 | """ 97 | with torch.no_grad(): 98 | init_latent_dist = self.vae.encode(image.to(self.vae.dtype)).latent_dist 99 | init_latents = self.scaling_factor * init_latent_dist.sample() 100 | return init_latents 101 | 102 | def decode_latents(self, latents): 103 | """ 104 | Decode latent variables back into an image. 105 | :param latents: The latent variables to decode. 106 | :return: A NumPy array representing the decoded image. 107 | """ 108 | latents = (1/ self.scaling_factor) * latents 109 | image = self.vae.decode(latents.to(self.vae.dtype)).sample 110 | image = (image / 2 + 0.5).clamp(0, 1) 111 | image = image.detach().cpu().permute(0, 2, 3, 1).float().numpy() 112 | image = (image * 255).round().astype("uint8") 113 | image = image[...,::-1] # RGB to BGR 114 | return image 115 | 116 | def just_decode_latents(self, latents): 117 | latents = (1 / self.scaling_factor) * latents 118 | image = self.vae.decode(latents.to(self.vae.dtype)).sample 119 | return image 120 | 121 | def get_latents_for_unet(self,img): 122 | """ 123 | Prepare latent variables for a U-Net model. 124 | :param img: The image to process. 125 | :return: A concatenated tensor of latents for U-Net input. 
126 | """ 127 | 128 | ref_image = self.preprocess_img(img,half_mask=True) # [1, 3, 256, 256] RGB, torch tensor 129 | masked_latents = self.encode_latents(ref_image) # [1, 4, 32, 32], torch tensor 130 | ref_image = self.preprocess_img(img,half_mask=False) # [1, 3, 256, 256] RGB, torch tensor 131 | ref_latents = self.encode_latents(ref_image) # [1, 4, 32, 32], torch tensor 132 | latent_model_input = torch.cat([masked_latents, ref_latents], dim=1) 133 | return latent_model_input 134 | 135 | def get_train_latents_for_unet(self, hal_face, ref_face): 136 | ref_image = self.preprocess_img(hal_face,half_mask=True) # [1, 3, 256, 256] RGB, torch tensor 137 | masked_latents = self.encode_latents(ref_image) # [1, 4, 32, 32], torch tensor 138 | ref_image = self.preprocess_img(ref_face,half_mask=False) # [1, 3, 256, 256] RGB, torch tensor 139 | ref_latents = self.encode_latents(ref_image) # [1, 4, 32, 32], torch tensor 140 | latent_model_input = torch.cat([masked_latents, ref_latents], dim=1) 141 | return latent_model_input 142 | 143 | 144 | 145 | if __name__ == "__main__": 146 | window = [] 147 | img = cv2.imread("d:/11.png") 148 | # print(img) 149 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 150 | img = cv2.resize(img, (256, 256), interpolation=cv2.INTER_LANCZOS4) 151 | window.append(img) 152 | x = np.asarray(window) / 255. 
153 | print("0x shape:", x.shape) 154 | x = np.transpose(x, (3, 0, 1, 2)) 155 | print("1x shape:", x.shape) 156 | # vv = VAE() 157 | # _mask_tensor = vv.get_mask_tensor() 158 | # print("self._mask_tensor shape", _mask_tensor.shape) 159 | # x = torch.squeeze(torch.FloatTensor(x)) 160 | 161 | # print("_mask_tensor:", _mask_tensor) 162 | # print("x:", x) 163 | 164 | 165 | # x = x * (_mask_tensor>0.5) 166 | # x = selftransform(x) 167 | 168 | # x = x.unsqueeze(0) # [1, 3, 256, 256] torch tensor 169 | 170 | # print(x.shape) 171 | 172 | # vae_mode_path = "./models/sd-vae-ft-mse/" 173 | # vae = VAE(model_path = vae_mode_path,use_float16=False) 174 | # img_path = "./results/sun001_crop/00000.png" 175 | 176 | # crop_imgs_path = "./results/sun001_crop/" 177 | # latents_out_path = "./results/latents/" 178 | # if not os.path.exists(latents_out_path): 179 | # os.mkdir(latents_out_path) 180 | 181 | # files = os.listdir(crop_imgs_path) 182 | # files.sort() 183 | # files = [file for file in files if file.split(".")[-1] == "png"] 184 | 185 | # for file in files: 186 | # index = file.split(".")[0] 187 | # img_path = crop_imgs_path + file 188 | # latents = vae.get_latents_for_unet(img_path) 189 | # print(img_path,"latents",latents.size()) 190 | # #torch.save(latents,os.path.join(latents_out_path,index+".pt")) 191 | # #reload_tensor = torch.load('tensor.pt') 192 | # #print(reload_tensor.size()) 193 | 194 | 195 | -------------------------------------------------------------------------------- /workflow/musetalk flow.json: -------------------------------------------------------------------------------- 1 | { 2 | "last_node_id": 1116, 3 | "last_link_id": 1849, 4 | "nodes": [ 5 | { 6 | "id": 529, 7 | "type": "GetImageSize+", 8 | "pos": [ 9 | -6837.223017960602, 10 | 1181.0887697221267 11 | ], 12 | "size": { 13 | "0": 210, 14 | "1": 46 15 | }, 16 | "flags": {}, 17 | "order": 21, 18 | "mode": 0, 19 | "inputs": [ 20 | { 21 | "name": "image", 22 | "type": "IMAGE", 23 | "link": 1530 24 | } 25 | 
], 26 | "outputs": [ 27 | { 28 | "name": "width", 29 | "type": "INT", 30 | "links": [ 31 | 871 32 | ], 33 | "shape": 3, 34 | "slot_index": 0 35 | }, 36 | { 37 | "name": "height", 38 | "type": "INT", 39 | "links": [ 40 | 872 41 | ], 42 | "shape": 3, 43 | "slot_index": 1 44 | } 45 | ], 46 | "properties": { 47 | "Node name for S&R": "GetImageSize+" 48 | } 49 | }, 50 | { 51 | "id": 27, 52 | "type": "vhs_audio_to_audio_tensor", 53 | "pos": [ 54 | -7600.1983555191255, 55 | -491.89309959092043 56 | ], 57 | "size": { 58 | "0": 315, 59 | "1": 102 60 | }, 61 | "flags": {}, 62 | "order": 14, 63 | "mode": 0, 64 | "inputs": [ 65 | { 66 | "name": "vhs_audio", 67 | "type": "VHS_AUDIO", 68 | "link": 45, 69 | "slot_index": 0, 70 | "label": "vhs_audio" 71 | } 72 | ], 73 | "outputs": [ 74 | { 75 | "name": "audio_tensor", 76 | "type": "VCAUDIOTENSOR", 77 | "links": [ 78 | 67 79 | ], 80 | "shape": 3, 81 | "slot_index": 0, 82 | "label": "audio_tensor" 83 | }, 84 | { 85 | "name": "audio_dur", 86 | "type": "INT", 87 | "links": null, 88 | "shape": 3, 89 | "label": "audio_dur" 90 | } 91 | ], 92 | "properties": { 93 | "Node name for S&R": "vhs_audio_to_audio_tensor" 94 | }, 95 | "widgets_values": [ 96 | 16000, 97 | 1 98 | ] 99 | }, 100 | { 101 | "id": 223, 102 | "type": "SetNode", 103 | "pos": [ 104 | -6827.198355519126, 105 | -760.8930995909205 106 | ], 107 | "size": { 108 | "0": 235.1999969482422, 109 | "1": 58 110 | }, 111 | "flags": {}, 112 | "order": 26, 113 | "mode": 0, 114 | "inputs": [ 115 | { 116 | "name": "INT", 117 | "type": "INT", 118 | "link": 422 119 | } 120 | ], 121 | "outputs": [ 122 | { 123 | "name": "*", 124 | "type": "*", 125 | "links": null 126 | } 127 | ], 128 | "title": "Set_output_video_frame_count", 129 | "properties": { 130 | "previousName": "output_video_frame_count" 131 | }, 132 | "widgets_values": [ 133 | "output_video_frame_count" 134 | ] 135 | }, 136 | { 137 | "id": 125, 138 | "type": "GetNode", 139 | "pos": [ 140 | -4474.410906003187, 141 | -641.4087271741254 
142 | ], 143 | "size": { 144 | "0": 210, 145 | "1": 58 146 | }, 147 | "flags": { 148 | "collapsed": false 149 | }, 150 | "order": 0, 151 | "mode": 0, 152 | "outputs": [ 153 | { 154 | "name": "VHS_AUDIO", 155 | "type": "VHS_AUDIO", 156 | "links": [ 157 | 1380 158 | ], 159 | "slot_index": 0, 160 | "label": "VHS_AUDIO" 161 | } 162 | ], 163 | "title": "Get_audio", 164 | "properties": {}, 165 | "widgets_values": [ 166 | "audio" 167 | ] 168 | }, 169 | { 170 | "id": 224, 171 | "type": "GetNode", 172 | "pos": [ 173 | -8757, 174 | 874 175 | ], 176 | "size": { 177 | "0": 285.89874267578125, 178 | "1": 97.85186767578125 179 | }, 180 | "flags": {}, 181 | "order": 1, 182 | "mode": 0, 183 | "outputs": [ 184 | { 185 | "name": "INT", 186 | "type": "INT", 187 | "links": [ 188 | 1761 189 | ], 190 | "slot_index": 0 191 | } 192 | ], 193 | "title": "Get_output_video_frame_count", 194 | "properties": {}, 195 | "widgets_values": [ 196 | "output_video_frame_count" 197 | ] 198 | }, 199 | { 200 | "id": 527, 201 | "type": "PixelPerfectResolution", 202 | "pos": [ 203 | -6777.911213012693, 204 | 1385.9297019486887 205 | ], 206 | "size": { 207 | "0": 393, 208 | "1": 106 209 | }, 210 | "flags": {}, 211 | "order": 27, 212 | "mode": 0, 213 | "inputs": [ 214 | { 215 | "name": "original_image", 216 | "type": "IMAGE", 217 | "link": 1531, 218 | "label": "original_image" 219 | }, 220 | { 221 | "name": "image_gen_width", 222 | "type": "INT", 223 | "link": 871, 224 | "widget": { 225 | "name": "image_gen_width" 226 | }, 227 | "slot_index": 1 228 | }, 229 | { 230 | "name": "image_gen_height", 231 | "type": "INT", 232 | "link": 872, 233 | "widget": { 234 | "name": "image_gen_height" 235 | } 236 | } 237 | ], 238 | "outputs": [ 239 | { 240 | "name": "RESOLUTION (INT)", 241 | "type": "INT", 242 | "links": [ 243 | 873 244 | ], 245 | "shape": 3, 246 | "label": "RESOLUTION (INT)", 247 | "slot_index": 0 248 | } 249 | ], 250 | "properties": { 251 | "Node name for S&R": "PixelPerfectResolution" 252 | }, 253 | 
"widgets_values": [ 254 | 800, 255 | 536, 256 | "Just Resize" 257 | ] 258 | }, 259 | { 260 | "id": 36, 261 | "type": "whisper_to_features", 262 | "pos": [ 263 | -7224.752500484394, 264 | -626.0040445507748 265 | ], 266 | "size": { 267 | "0": 342.5999755859375, 268 | "1": 78 269 | }, 270 | "flags": {}, 271 | "order": 20, 272 | "mode": 0, 273 | "inputs": [ 274 | { 275 | "name": "audio_tensor", 276 | "type": "VCAUDIOTENSOR", 277 | "link": 67, 278 | "slot_index": 0, 279 | "label": "audio_tensor" 280 | } 281 | ], 282 | "outputs": [ 283 | { 284 | "name": "whisper_chunks", 285 | "type": "WHISPERFEAT", 286 | "links": [ 287 | 281 288 | ], 289 | "shape": 3, 290 | "slot_index": 0, 291 | "label": "whisper_chunks" 292 | }, 293 | { 294 | "name": "frame_count", 295 | "type": "INT", 296 | "links": [ 297 | 297, 298 | 422 299 | ], 300 | "shape": 3, 301 | "slot_index": 1, 302 | "label": "frame_count" 303 | } 304 | ], 305 | "properties": { 306 | "Node name for S&R": "whisper_to_features" 307 | }, 308 | "widgets_values": [ 309 | 25 310 | ] 311 | }, 312 | { 313 | "id": 526, 314 | "type": "DWPreprocessor", 315 | "pos": [ 316 | -6370.911213012693, 317 | 1083.9297019486887 318 | ], 319 | "size": { 320 | "0": 315, 321 | "1": 198 322 | }, 323 | "flags": {}, 324 | "order": 30, 325 | "mode": 0, 326 | "inputs": [ 327 | { 328 | "name": "image", 329 | "type": "IMAGE", 330 | "link": 1529 331 | }, 332 | { 333 | "name": "resolution", 334 | "type": "INT", 335 | "link": 873, 336 | "widget": { 337 | "name": "resolution" 338 | } 339 | } 340 | ], 341 | "outputs": [ 342 | { 343 | "name": "IMAGE", 344 | "type": "IMAGE", 345 | "links": [], 346 | "shape": 3, 347 | "slot_index": 0 348 | }, 349 | { 350 | "name": "POSE_KEYPOINT", 351 | "type": "POSE_KEYPOINT", 352 | "links": [ 353 | 1776 354 | ], 355 | "shape": 3, 356 | "slot_index": 1 357 | } 358 | ], 359 | "properties": { 360 | "Node name for S&R": "DWPreprocessor" 361 | }, 362 | "widgets_values": [ 363 | "disable", 364 | "disable", 365 | "enable", 366 | 512, 
367 | "yolox_l.torchscript.pt", 368 | "dw-ll_ucoco_384_bs5.torchscript.pt" 369 | ] 370 | }, 371 | { 372 | "id": 1083, 373 | "type": "Reroute", 374 | "pos": [ 375 | -6032.854807912913, 376 | -332.51994679218205 377 | ], 378 | "size": [ 379 | 75, 380 | 26 381 | ], 382 | "flags": {}, 383 | "order": 38, 384 | "mode": 0, 385 | "inputs": [ 386 | { 387 | "name": "", 388 | "type": "*", 389 | "link": 1790 390 | } 391 | ], 392 | "outputs": [ 393 | { 394 | "name": "", 395 | "type": "IMAGE", 396 | "links": [ 397 | 1786 398 | ], 399 | "slot_index": 0 400 | } 401 | ], 402 | "properties": { 403 | "showOutputText": false, 404 | "horizontal": false 405 | } 406 | }, 407 | { 408 | "id": 16, 409 | "type": "ImageCompositeMasked", 410 | "pos": [ 411 | -5506.854807912913, 412 | -418.519946792182 413 | ], 414 | "size": { 415 | "0": 291.8965759277344, 416 | "1": 146 417 | }, 418 | "flags": {}, 419 | "order": 39, 420 | "mode": 0, 421 | "inputs": [ 422 | { 423 | "name": "destination", 424 | "type": "IMAGE", 425 | "link": 1786, 426 | "label": "destination" 427 | }, 428 | { 429 | "name": "source", 430 | "type": "IMAGE", 431 | "link": 626, 432 | "label": "source" 433 | }, 434 | { 435 | "name": "mask", 436 | "type": "MASK", 437 | "link": null, 438 | "label": "mask" 439 | } 440 | ], 441 | "outputs": [ 442 | { 443 | "name": "IMAGE", 444 | "type": "IMAGE", 445 | "links": [ 446 | 1512 447 | ], 448 | "shape": 3, 449 | "slot_index": 0, 450 | "label": "IMAGE" 451 | } 452 | ], 453 | "properties": { 454 | "Node name for S&R": "ImageCompositeMasked" 455 | }, 456 | "widgets_values": [ 457 | 0, 458 | 128, 459 | false 460 | ] 461 | }, 462 | { 463 | "id": 980, 464 | "type": "ReActorRestoreFace", 465 | "pos": [ 466 | -5975.633963343581, 467 | 385.2136881125497 468 | ], 469 | "size": { 470 | "0": 315, 471 | "1": 130 472 | }, 473 | "flags": {}, 474 | "order": 2, 475 | "mode": 4, 476 | "inputs": [ 477 | { 478 | "name": "image", 479 | "type": "IMAGE", 480 | "link": null 481 | } 482 | ], 483 | "outputs": [ 484 | { 
485 | "name": "IMAGE", 486 | "type": "IMAGE", 487 | "links": [], 488 | "shape": 3, 489 | "slot_index": 0 490 | } 491 | ], 492 | "properties": { 493 | "Node name for S&R": "ReActorRestoreFace" 494 | }, 495 | "widgets_values": [ 496 | "retinaface_resnet50", 497 | "GFPGANv1.4.pth", 498 | 1, 499 | 0.5 500 | ] 501 | }, 502 | { 503 | "id": 1055, 504 | "type": "easy imageToMask", 505 | "pos": [ 506 | -3226, 507 | -140 508 | ], 509 | "size": { 510 | "0": 213.45140075683594, 511 | "1": 58 512 | }, 513 | "flags": {}, 514 | "order": 13, 515 | "mode": 0, 516 | "inputs": [ 517 | { 518 | "name": "image", 519 | "type": "IMAGE", 520 | "link": 1704 521 | } 522 | ], 523 | "outputs": [ 524 | { 525 | "name": "MASK", 526 | "type": "MASK", 527 | "links": [ 528 | 1705 529 | ], 530 | "shape": 3, 531 | "slot_index": 0 532 | } 533 | ], 534 | "properties": { 535 | "Node name for S&R": "easy imageToMask" 536 | }, 537 | "widgets_values": [ 538 | "red" 539 | ] 540 | }, 541 | { 542 | "id": 15, 543 | "type": "EmptyImage", 544 | "pos": [ 545 | -5951.311109737003, 546 | -171.4442407796418 547 | ], 548 | "size": { 549 | "0": 315, 550 | "1": 130 551 | }, 552 | "flags": { 553 | "collapsed": false 554 | }, 555 | "order": 3, 556 | "mode": 0, 557 | "outputs": [ 558 | { 559 | "name": "IMAGE", 560 | "type": "IMAGE", 561 | "links": [ 562 | 626 563 | ], 564 | "shape": 3, 565 | "slot_index": 0, 566 | "label": "IMAGE" 567 | } 568 | ], 569 | "properties": { 570 | "Node name for S&R": "EmptyImage" 571 | }, 572 | "widgets_values": [ 573 | 256, 574 | 256, 575 | 1, 576 | 0 577 | ] 578 | }, 579 | { 580 | "id": 124, 581 | "type": "SetNode", 582 | "pos": [ 583 | -7534.198355519126, 584 | -717.8930995909205 585 | ], 586 | "size": { 587 | "0": 210, 588 | "1": 58 589 | }, 590 | "flags": { 591 | "collapsed": false 592 | }, 593 | "order": 15, 594 | "mode": 0, 595 | "inputs": [ 596 | { 597 | "name": "VHS_AUDIO", 598 | "type": "VHS_AUDIO", 599 | "link": 287, 600 | "label": "VHS_AUDIO" 601 | } 602 | ], 603 | "outputs": [ 604 
| { 605 | "name": "*", 606 | "type": "*", 607 | "links": null, 608 | "label": "*" 609 | } 610 | ], 611 | "title": "Set_audio", 612 | "properties": { 613 | "previousName": "audio" 614 | }, 615 | "widgets_values": [ 616 | "audio" 617 | ] 618 | }, 619 | { 620 | "id": 1056, 621 | "type": "MaskToImage", 622 | "pos": [ 623 | -2952, 624 | 272 625 | ], 626 | "size": { 627 | "0": 210, 628 | "1": 26 629 | }, 630 | "flags": {}, 631 | "order": 24, 632 | "mode": 0, 633 | "inputs": [ 634 | { 635 | "name": "mask", 636 | "type": "MASK", 637 | "link": 1706 638 | } 639 | ], 640 | "outputs": [ 641 | { 642 | "name": "IMAGE", 643 | "type": "IMAGE", 644 | "links": [ 645 | 1708, 646 | 1787 647 | ], 648 | "shape": 3, 649 | "slot_index": 0 650 | } 651 | ], 652 | "properties": { 653 | "Node name for S&R": "MaskToImage" 654 | } 655 | }, 656 | { 657 | "id": 1048, 658 | "type": "CR Draw Shape", 659 | "pos": [ 660 | -3597, 661 | -170 662 | ], 663 | "size": { 664 | "0": 315, 665 | "1": 318 666 | }, 667 | "flags": {}, 668 | "order": 4, 669 | "mode": 0, 670 | "outputs": [ 671 | { 672 | "name": "IMAGE", 673 | "type": "IMAGE", 674 | "links": [ 675 | 1704 676 | ], 677 | "shape": 3, 678 | "slot_index": 0 679 | }, 680 | { 681 | "name": "show_help", 682 | "type": "STRING", 683 | "links": null, 684 | "shape": 3 685 | } 686 | ], 687 | "properties": { 688 | "Node name for S&R": "CR Draw Shape" 689 | }, 690 | "widgets_values": [ 691 | 256, 692 | 256, 693 | "half circle", 694 | "white", 695 | "custom", 696 | 0, 697 | 0, 698 | 0.98, 699 | 0, 700 | "#000000", 701 | "#000000" 702 | ] 703 | }, 704 | { 705 | "id": 129, 706 | "type": "Display Any (rgthree)", 707 | "pos": [ 708 | -6840.198355519126, 709 | -480.89309959092043 710 | ], 711 | "size": { 712 | "0": 226.42002868652344, 713 | "1": 116.54998779296875 714 | }, 715 | "flags": {}, 716 | "order": 25, 717 | "mode": 0, 718 | "inputs": [ 719 | { 720 | "name": "source", 721 | "type": "*", 722 | "link": 297, 723 | "dir": 3, 724 | "label": "source" 725 | } 726 | ], 
727 | "properties": { 728 | "Node name for S&R": "Display Any (rgthree)" 729 | }, 730 | "widgets_values": [ 731 | "" 732 | ] 733 | }, 734 | { 735 | "id": 1092, 736 | "type": "ImageConcanate", 737 | "pos": [ 738 | -1414.0301884973185, 739 | 783.7210591809077 740 | ], 741 | "size": { 742 | "0": 315, 743 | "1": 102 744 | }, 745 | "flags": {}, 746 | "order": 49, 747 | "mode": 0, 748 | "inputs": [ 749 | { 750 | "name": "image1", 751 | "type": "IMAGE", 752 | "link": 1826 753 | }, 754 | { 755 | "name": "image2", 756 | "type": "IMAGE", 757 | "link": 1799 758 | } 759 | ], 760 | "outputs": [ 761 | { 762 | "name": "IMAGE", 763 | "type": "IMAGE", 764 | "links": [ 765 | 1806 766 | ], 767 | "shape": 3, 768 | "slot_index": 0 769 | } 770 | ], 771 | "properties": { 772 | "Node name for S&R": "ImageConcanate" 773 | }, 774 | "widgets_values": [ 775 | "right", 776 | false 777 | ] 778 | }, 779 | { 780 | "id": 1094, 781 | "type": "GetNode", 782 | "pos": [ 783 | -1384.0301884973185, 784 | 1034.7210591809078 785 | ], 786 | "size": { 787 | "0": 210, 788 | "1": 58 789 | }, 790 | "flags": { 791 | "collapsed": false 792 | }, 793 | "order": 5, 794 | "mode": 0, 795 | "outputs": [ 796 | { 797 | "name": "VHS_AUDIO", 798 | "type": "VHS_AUDIO", 799 | "links": [ 800 | 1801 801 | ], 802 | "slot_index": 0, 803 | "label": "VHS_AUDIO" 804 | } 805 | ], 806 | "title": "Get_audio", 807 | "properties": {}, 808 | "widgets_values": [ 809 | "audio" 810 | ] 811 | }, 812 | { 813 | "id": 1108, 814 | "type": "MuseTalkPostprocess", 815 | "pos": [ 816 | -2876, 817 | 788 818 | ], 819 | "size": { 820 | "0": 315, 821 | "1": 170 822 | }, 823 | "flags": {}, 824 | "order": 43, 825 | "mode": 0, 826 | "inputs": [ 827 | { 828 | "name": "origin_images", 829 | "type": "IMAGE", 830 | "link": 1823 831 | }, 832 | { 833 | "name": "musetalk_faces", 834 | "type": "IMAGE", 835 | "link": 1824 836 | }, 837 | { 838 | "name": "uncrop_mask", 839 | "type": "IMAGE", 840 | "link": 1825 841 | } 842 | ], 843 | "outputs": [ 844 | { 845 | 
"name": "images", 846 | "type": "IMAGE", 847 | "links": [ 848 | 1826, 849 | 1827 850 | ], 851 | "shape": 3, 852 | "slot_index": 0 853 | } 854 | ], 855 | "properties": { 856 | "Node name for S&R": "MuseTalkPostprocess" 857 | }, 858 | "widgets_values": [ 859 | 0, 860 | 0, 861 | -5, 862 | 5 863 | ] 864 | }, 865 | { 866 | "id": 1051, 867 | "type": "PreviewImage", 868 | "pos": [ 869 | -2469, 870 | -178 871 | ], 872 | "size": { 873 | "0": 210, 874 | "1": 246 875 | }, 876 | "flags": {}, 877 | "order": 28, 878 | "mode": 0, 879 | "inputs": [ 880 | { 881 | "name": "images", 882 | "type": "IMAGE", 883 | "link": 1708 884 | } 885 | ], 886 | "properties": { 887 | "Node name for S&R": "PreviewImage" 888 | } 889 | }, 890 | { 891 | "id": 122, 892 | "type": "muse_talk_sampler", 893 | "pos": [ 894 | -4927.311109737003, 895 | -911.4442407796415 896 | ], 897 | "size": { 898 | "0": 315, 899 | "1": 162 900 | }, 901 | "flags": {}, 902 | "order": 40, 903 | "mode": 0, 904 | "inputs": [ 905 | { 906 | "name": "model", 907 | "type": "MODEL", 908 | "link": 280, 909 | "label": "model" 910 | }, 911 | { 912 | "name": "vae", 913 | "type": "VAE", 914 | "link": 279, 915 | "slot_index": 1, 916 | "label": "vae" 917 | }, 918 | { 919 | "name": "whisper_features", 920 | "type": "WHISPERFEAT", 921 | "link": 281, 922 | "slot_index": 2, 923 | "label": "whisper_features" 924 | }, 925 | { 926 | "name": "images", 927 | "type": "IMAGE", 928 | "link": 1559, 929 | "slot_index": 3, 930 | "label": "images" 931 | }, 932 | { 933 | "name": "masked_images", 934 | "type": "IMAGE", 935 | "link": 1512, 936 | "slot_index": 4, 937 | "label": "masked_images" 938 | } 939 | ], 940 | "outputs": [ 941 | { 942 | "name": "image", 943 | "type": "IMAGE", 944 | "links": [ 945 | 1379, 946 | 1575, 947 | 1824 948 | ], 949 | "shape": 3, 950 | "slot_index": 0, 951 | "label": "image" 952 | } 953 | ], 954 | "properties": { 955 | "Node name for S&R": "muse_talk_sampler" 956 | }, 957 | "widgets_values": [ 958 | 1, 959 | 0 960 | ] 961 | }, 962 
| { 963 | "id": 735, 964 | "type": "Display Any (rgthree)", 965 | "pos": [ 966 | -7238, 967 | 1339 968 | ], 969 | "size": { 970 | "0": 226.42002868652344, 971 | "1": 116.54998779296875 972 | }, 973 | "flags": {}, 974 | "order": 23, 975 | "mode": 0, 976 | "inputs": [ 977 | { 978 | "name": "source", 979 | "type": "*", 980 | "link": 1165, 981 | "dir": 3, 982 | "label": "source" 983 | } 984 | ], 985 | "properties": { 986 | "Node name for S&R": "Display Any (rgthree)" 987 | }, 988 | "widgets_values": [ 989 | "" 990 | ] 991 | }, 992 | { 993 | "id": 1091, 994 | "type": "MuseTalkPostprocess", 995 | "pos": [ 996 | -3011, 997 | 1271 998 | ], 999 | "size": { 1000 | "0": 380.4000244140625, 1001 | "1": 190 1002 | }, 1003 | "flags": {}, 1004 | "order": 47, 1005 | "mode": 0, 1006 | "inputs": [ 1007 | { 1008 | "name": "origin_images", 1009 | "type": "IMAGE", 1010 | "link": 1822 1011 | }, 1012 | { 1013 | "name": "musetalk_faces", 1014 | "type": "IMAGE", 1015 | "link": 1848 1016 | }, 1017 | { 1018 | "name": "uncrop_mask", 1019 | "type": "IMAGE", 1020 | "link": 1796 1021 | } 1022 | ], 1023 | "outputs": [ 1024 | { 1025 | "name": "images", 1026 | "type": "IMAGE", 1027 | "links": [ 1028 | 1797, 1029 | 1799 1030 | ], 1031 | "shape": 3, 1032 | "slot_index": 0 1033 | }, 1034 | { 1035 | "name": "uncrop_masks", 1036 | "type": "IMAGE", 1037 | "links": null, 1038 | "shape": 3 1039 | }, 1040 | { 1041 | "name": "uncroped_images", 1042 | "type": "IMAGE", 1043 | "links": null, 1044 | "shape": 3 1045 | }, 1046 | { 1047 | "name": "face_masks", 1048 | "type": "IMAGE", 1049 | "links": null, 1050 | "shape": 3 1051 | } 1052 | ], 1053 | "properties": { 1054 | "Node name for S&R": "MuseTalkPostprocess" 1055 | }, 1056 | "widgets_values": [ 1057 | 0, 1058 | 0, 1059 | -5, 1060 | 5 1061 | ] 1062 | }, 1063 | { 1064 | "id": 1080, 1065 | "type": "MuseTalkPreprocess", 1066 | "pos": [ 1067 | -5926, 1068 | 900 1069 | ], 1070 | "size": { 1071 | "0": 354.3999938964844, 1072 | "1": 174 1073 | }, 1074 | "flags": {}, 
1075 | "order": 31, 1076 | "mode": 0, 1077 | "inputs": [ 1078 | { 1079 | "name": "origin_images", 1080 | "type": "IMAGE", 1081 | "link": 1775 1082 | }, 1083 | { 1084 | "name": "pose_kps", 1085 | "type": "POSE_KEYPOINT", 1086 | "link": 1776 1087 | } 1088 | ], 1089 | "outputs": [ 1090 | { 1091 | "name": "rotated_faces", 1092 | "type": "IMAGE", 1093 | "links": [ 1094 | 1778, 1095 | 1780, 1096 | 1784 1097 | ], 1098 | "shape": 3, 1099 | "slot_index": 0 1100 | }, 1101 | { 1102 | "name": "rotated_faces_with_landmarks", 1103 | "type": "IMAGE", 1104 | "links": [ 1105 | 1779 1106 | ], 1107 | "shape": 3, 1108 | "slot_index": 1 1109 | } 1110 | ], 1111 | "properties": { 1112 | "Node name for S&R": "MuseTalkPreprocess" 1113 | }, 1114 | "widgets_values": [ 1115 | "full", 1116 | 0, 1117 | 8, 1118 | 0, 1119 | 0 1120 | ] 1121 | }, 1122 | { 1123 | "id": 1090, 1124 | "type": "JWImageResize", 1125 | "pos": [ 1126 | -4220, 1127 | 459 1128 | ], 1129 | "size": { 1130 | "0": 315, 1131 | "1": 106 1132 | }, 1133 | "flags": {}, 1134 | "order": 44, 1135 | "mode": 0, 1136 | "inputs": [ 1137 | { 1138 | "name": "image", 1139 | "type": "IMAGE", 1140 | "link": 1791 1141 | } 1142 | ], 1143 | "outputs": [ 1144 | { 1145 | "name": "IMAGE", 1146 | "type": "IMAGE", 1147 | "links": [ 1148 | 1792, 1149 | 1848 1150 | ], 1151 | "shape": 3, 1152 | "slot_index": 0 1153 | } 1154 | ], 1155 | "properties": { 1156 | "Node name for S&R": "JWImageResize" 1157 | }, 1158 | "widgets_values": [ 1159 | 256, 1160 | 256, 1161 | "nearest" 1162 | ] 1163 | }, 1164 | { 1165 | "id": 28, 1166 | "type": "VHS_LoadAudio", 1167 | "pos": [ 1168 | -8000, 1169 | -590 1170 | ], 1171 | "size": { 1172 | "0": 315, 1173 | "1": 82 1174 | }, 1175 | "flags": {}, 1176 | "order": 6, 1177 | "mode": 0, 1178 | "outputs": [ 1179 | { 1180 | "name": "audio", 1181 | "type": "VHS_AUDIO", 1182 | "links": [ 1183 | 45, 1184 | 287 1185 | ], 1186 | "shape": 3, 1187 | "slot_index": 0, 1188 | "label": "audio" 1189 | } 1190 | ], 1191 | "properties": { 1192 | 
"Node name for S&R": "VHS_LoadAudio" 1193 | }, 1194 | "widgets_values": { 1195 | "audio_file": "C:\\Users\\Administrator\\Videos\\视频素材\\xiwang.wav", 1196 | "seek_seconds": 0 1197 | } 1198 | }, 1199 | { 1200 | "id": 947, 1201 | "type": "VHS_LoadVideo", 1202 | "pos": [ 1203 | -8396, 1204 | 870 1205 | ], 1206 | "size": [ 1207 | 240, 1208 | 643.1111111111111 1209 | ], 1210 | "flags": {}, 1211 | "order": 12, 1212 | "mode": 0, 1213 | "inputs": [ 1214 | { 1215 | "name": "batch_manager", 1216 | "type": "VHS_BatchManager", 1217 | "link": null 1218 | }, 1219 | { 1220 | "name": "frame_load_cap", 1221 | "type": "INT", 1222 | "link": 1761, 1223 | "widget": { 1224 | "name": "frame_load_cap" 1225 | } 1226 | } 1227 | ], 1228 | "outputs": [ 1229 | { 1230 | "name": "IMAGE", 1231 | "type": "IMAGE", 1232 | "links": [ 1233 | 1849 1234 | ], 1235 | "shape": 3, 1236 | "slot_index": 0 1237 | }, 1238 | { 1239 | "name": "frame_count", 1240 | "type": "INT", 1241 | "links": [ 1242 | 1515 1243 | ], 1244 | "shape": 3, 1245 | "slot_index": 1 1246 | }, 1247 | { 1248 | "name": "audio", 1249 | "type": "VHS_AUDIO", 1250 | "links": null, 1251 | "shape": 3, 1252 | "slot_index": 2 1253 | }, 1254 | { 1255 | "name": "video_info", 1256 | "type": "VHS_VIDEOINFO", 1257 | "links": [ 1258 | 1516 1259 | ], 1260 | "shape": 3, 1261 | "slot_index": 3 1262 | } 1263 | ], 1264 | "properties": { 1265 | "Node name for S&R": "VHS_LoadVideo" 1266 | }, 1267 | "widgets_values": { 1268 | "video": "2.mp4", 1269 | "force_rate": 25, 1270 | "force_size": "Disabled", 1271 | "custom_width": 512, 1272 | "custom_height": 512, 1273 | "frame_load_cap": 0, 1274 | "skip_first_frames": 0, 1275 | "select_every_nth": 1, 1276 | "choose video to upload": "image", 1277 | "videopreview": { 1278 | "hidden": false, 1279 | "paused": false, 1280 | "params": { 1281 | "frame_load_cap": 0, 1282 | "skip_first_frames": 0, 1283 | "force_rate": 25, 1284 | "filename": "2.mp4", 1285 | "type": "input", 1286 | "format": "video/mp4", 1287 | 
"select_every_nth": 1 1288 | } 1289 | } 1290 | } 1291 | }, 1292 | { 1293 | "id": 958, 1294 | "type": "Reroute", 1295 | "pos": [ 1296 | -6798, 1297 | 748 1298 | ], 1299 | "size": [ 1300 | 75, 1301 | 26 1302 | ], 1303 | "flags": {}, 1304 | "order": 16, 1305 | "mode": 0, 1306 | "inputs": [ 1307 | { 1308 | "name": "", 1309 | "type": "*", 1310 | "link": 1849 1311 | } 1312 | ], 1313 | "outputs": [ 1314 | { 1315 | "name": "", 1316 | "type": "IMAGE", 1317 | "links": [ 1318 | 1529, 1319 | 1530, 1320 | 1531, 1321 | 1775, 1322 | 1828 1323 | ], 1324 | "slot_index": 0 1325 | } 1326 | ], 1327 | "properties": { 1328 | "showOutputText": false, 1329 | "horizontal": false 1330 | } 1331 | }, 1332 | { 1333 | "id": 97, 1334 | "type": "Display Any (rgthree)", 1335 | "pos": [ 1336 | -7762, 1337 | 1341 1338 | ], 1339 | "size": { 1340 | "0": 226.42002868652344, 1341 | "1": 116.54998779296875 1342 | }, 1343 | "flags": {}, 1344 | "order": 17, 1345 | "mode": 0, 1346 | "inputs": [ 1347 | { 1348 | "name": "source", 1349 | "type": "*", 1350 | "link": 1515, 1351 | "dir": 3, 1352 | "label": "source" 1353 | } 1354 | ], 1355 | "properties": { 1356 | "Node name for S&R": "Display Any (rgthree)" 1357 | }, 1358 | "widgets_values": [ 1359 | "" 1360 | ] 1361 | }, 1362 | { 1363 | "id": 734, 1364 | "type": "VHS_VideoInfo", 1365 | "pos": [ 1366 | -7816, 1367 | 1039 1368 | ], 1369 | "size": { 1370 | "0": 393, 1371 | "1": 206 1372 | }, 1373 | "flags": {}, 1374 | "order": 18, 1375 | "mode": 0, 1376 | "inputs": [ 1377 | { 1378 | "name": "video_info", 1379 | "type": "VHS_VIDEOINFO", 1380 | "link": 1516 1381 | } 1382 | ], 1383 | "outputs": [ 1384 | { 1385 | "name": "source_fps🟨", 1386 | "type": "FLOAT", 1387 | "links": [ 1388 | 1165 1389 | ], 1390 | "shape": 3, 1391 | "slot_index": 0 1392 | }, 1393 | { 1394 | "name": "source_frame_count🟨", 1395 | "type": "INT", 1396 | "links": null, 1397 | "shape": 3 1398 | }, 1399 | { 1400 | "name": "source_duration🟨", 1401 | "type": "FLOAT", 1402 | "links": null, 1403 | 
"shape": 3 1404 | }, 1405 | { 1406 | "name": "source_width🟨", 1407 | "type": "INT", 1408 | "links": null, 1409 | "shape": 3 1410 | }, 1411 | { 1412 | "name": "source_height🟨", 1413 | "type": "INT", 1414 | "links": null, 1415 | "shape": 3 1416 | }, 1417 | { 1418 | "name": "loaded_fps🟦", 1419 | "type": "FLOAT", 1420 | "links": null, 1421 | "shape": 3 1422 | }, 1423 | { 1424 | "name": "loaded_frame_count🟦", 1425 | "type": "INT", 1426 | "links": null, 1427 | "shape": 3 1428 | }, 1429 | { 1430 | "name": "loaded_duration🟦", 1431 | "type": "FLOAT", 1432 | "links": null, 1433 | "shape": 3 1434 | }, 1435 | { 1436 | "name": "loaded_width🟦", 1437 | "type": "INT", 1438 | "links": null, 1439 | "shape": 3 1440 | }, 1441 | { 1442 | "name": "loaded_height🟦", 1443 | "type": "INT", 1444 | "links": null, 1445 | "shape": 3 1446 | } 1447 | ], 1448 | "properties": { 1449 | "Node name for S&R": "VHS_VideoInfo" 1450 | }, 1451 | "widgets_values": {} 1452 | }, 1453 | { 1454 | "id": 4, 1455 | "type": "VAELoader", 1456 | "pos": [ 1457 | -5971.280013365008, 1458 | -734.1600439000975 1459 | ], 1460 | "size": { 1461 | "0": 389.75921630859375, 1462 | "1": 58 1463 | }, 1464 | "flags": {}, 1465 | "order": 7, 1466 | "mode": 0, 1467 | "outputs": [ 1468 | { 1469 | "name": "VAE", 1470 | "type": "VAE", 1471 | "links": [ 1472 | 279 1473 | ], 1474 | "shape": 3, 1475 | "slot_index": 0, 1476 | "label": "VAE" 1477 | } 1478 | ], 1479 | "properties": { 1480 | "Node name for S&R": "VAELoader" 1481 | }, 1482 | "widgets_values": [ 1483 | "vae-ft-mse-840000-ema-pruned.safetensors" 1484 | ] 1485 | }, 1486 | { 1487 | "id": 121, 1488 | "type": "UNETLoader_MuseTalk", 1489 | "pos": [ 1490 | -5884.311109737003, 1491 | -986.4442407796415 1492 | ], 1493 | "size": { 1494 | "0": 214.1832275390625, 1495 | "1": 58 1496 | }, 1497 | "flags": {}, 1498 | "order": 8, 1499 | "mode": 0, 1500 | "outputs": [ 1501 | { 1502 | "name": "MODEL", 1503 | "type": "MODEL", 1504 | "links": [ 1505 | 280 1506 | ], 1507 | "shape": 3, 1508 | 
"slot_index": 0, 1509 | "label": "MODEL" 1510 | } 1511 | ], 1512 | "properties": { 1513 | "Node name for S&R": "UNETLoader_MuseTalk" 1514 | }, 1515 | "widgets_values": [ 1516 | "pytorch_model.bin" 1517 | ] 1518 | }, 1519 | { 1520 | "id": 979, 1521 | "type": "JWImageResize", 1522 | "pos": [ 1523 | -6410, 1524 | 316 1525 | ], 1526 | "size": { 1527 | "0": 315, 1528 | "1": 106 1529 | }, 1530 | "flags": {}, 1531 | "order": 37, 1532 | "mode": 4, 1533 | "inputs": [ 1534 | { 1535 | "name": "image", 1536 | "type": "IMAGE", 1537 | "link": 1587 1538 | } 1539 | ], 1540 | "outputs": [ 1541 | { 1542 | "name": "IMAGE", 1543 | "type": "IMAGE", 1544 | "links": [ 1545 | 1790 1546 | ], 1547 | "shape": 3, 1548 | "slot_index": 0 1549 | } 1550 | ], 1551 | "properties": { 1552 | "Node name for S&R": "JWImageResize" 1553 | }, 1554 | "widgets_values": [ 1555 | 256, 1556 | 256, 1557 | "nearest" 1558 | ] 1559 | }, 1560 | { 1561 | "id": 866, 1562 | "type": "FaceEnhancement", 1563 | "pos": [ 1564 | -4240, 1565 | 310 1566 | ], 1567 | "size": { 1568 | "0": 300.08770751953125, 1569 | "1": 73.68206024169922 1570 | }, 1571 | "flags": {}, 1572 | "order": 42, 1573 | "mode": 0, 1574 | "inputs": [ 1575 | { 1576 | "name": "images", 1577 | "type": "IMAGE", 1578 | "link": 1575 1579 | } 1580 | ], 1581 | "outputs": [ 1582 | { 1583 | "name": "images", 1584 | "type": "IMAGE", 1585 | "links": [ 1586 | 1791 1587 | ], 1588 | "shape": 3, 1589 | "slot_index": 0 1590 | } 1591 | ], 1592 | "properties": { 1593 | "Node name for S&R": "FaceEnhancement" 1594 | } 1595 | }, 1596 | { 1597 | "id": 911, 1598 | "type": "GetNode", 1599 | "pos": [ 1600 | -2844, 1601 | 1067 1602 | ], 1603 | "size": { 1604 | "0": 210, 1605 | "1": 58 1606 | }, 1607 | "flags": { 1608 | "collapsed": false 1609 | }, 1610 | "order": 9, 1611 | "mode": 0, 1612 | "outputs": [ 1613 | { 1614 | "name": "VHS_AUDIO", 1615 | "type": "VHS_AUDIO", 1616 | "links": [ 1617 | 1460, 1618 | 1570 1619 | ], 1620 | "slot_index": 0, 1621 | "label": "VHS_AUDIO" 1622 | } 
1623 | ], 1624 | "title": "Get_audio", 1625 | "properties": {}, 1626 | "widgets_values": [ 1627 | "audio" 1628 | ] 1629 | }, 1630 | { 1631 | "id": 236, 1632 | "type": "INTConstant", 1633 | "pos": [ 1634 | -8753, 1635 | 1060 1636 | ], 1637 | "size": { 1638 | "0": 224.5601348876953, 1639 | "1": 58 1640 | }, 1641 | "flags": { 1642 | "collapsed": false 1643 | }, 1644 | "order": 10, 1645 | "mode": 0, 1646 | "outputs": [ 1647 | { 1648 | "name": "value", 1649 | "type": "INT", 1650 | "links": [], 1651 | "shape": 3, 1652 | "slot_index": 0 1653 | } 1654 | ], 1655 | "properties": { 1656 | "Node name for S&R": "INTConstant" 1657 | }, 1658 | "widgets_values": [ 1659 | 100 1660 | ], 1661 | "color": "#1b4669", 1662 | "bgcolor": "#29699c" 1663 | }, 1664 | { 1665 | "id": 1084, 1666 | "type": "Reroute", 1667 | "pos": [ 1668 | -3254, 1669 | 496 1670 | ], 1671 | "size": [ 1672 | 75, 1673 | 26 1674 | ], 1675 | "flags": {}, 1676 | "order": 29, 1677 | "mode": 0, 1678 | "inputs": [ 1679 | { 1680 | "name": "", 1681 | "type": "*", 1682 | "link": 1787 1683 | } 1684 | ], 1685 | "outputs": [ 1686 | { 1687 | "name": "", 1688 | "type": "IMAGE", 1689 | "links": [ 1690 | 1796, 1691 | 1825 1692 | ], 1693 | "slot_index": 0 1694 | } 1695 | ], 1696 | "properties": { 1697 | "showOutputText": false, 1698 | "horizontal": false 1699 | } 1700 | }, 1701 | { 1702 | "id": 913, 1703 | "type": "GetNode", 1704 | "pos": [ 1705 | -4285, 1706 | 938 1707 | ], 1708 | "size": { 1709 | "0": 210, 1710 | "1": 58 1711 | }, 1712 | "flags": { 1713 | "collapsed": false 1714 | }, 1715 | "order": 11, 1716 | "mode": 0, 1717 | "outputs": [ 1718 | { 1719 | "name": "VHS_AUDIO", 1720 | "type": "VHS_AUDIO", 1721 | "links": [ 1722 | 1462 1723 | ], 1724 | "slot_index": 0, 1725 | "label": "VHS_AUDIO" 1726 | } 1727 | ], 1728 | "title": "Get_audio", 1729 | "properties": {}, 1730 | "widgets_values": [ 1731 | "audio" 1732 | ] 1733 | }, 1734 | { 1735 | "id": 1107, 1736 | "type": "Reroute", 1737 | "pos": [ 1738 | -3847, 1739 | 1582 1740 | ], 
1741 | "size": [ 1742 | 75, 1743 | 26 1744 | ], 1745 | "flags": {}, 1746 | "order": 22, 1747 | "mode": 0, 1748 | "inputs": [ 1749 | { 1750 | "name": "", 1751 | "type": "*", 1752 | "link": 1828 1753 | } 1754 | ], 1755 | "outputs": [ 1756 | { 1757 | "name": "", 1758 | "type": "IMAGE", 1759 | "links": [ 1760 | 1822, 1761 | 1823 1762 | ], 1763 | "slot_index": 0 1764 | } 1765 | ], 1766 | "properties": { 1767 | "showOutputText": false, 1768 | "horizontal": false 1769 | } 1770 | }, 1771 | { 1772 | "id": 912, 1773 | "type": "VHS_VideoCombine", 1774 | "pos": [ 1775 | -3994, 1776 | 840 1777 | ], 1778 | "size": [ 1779 | 320, 1780 | 290 1781 | ], 1782 | "flags": {}, 1783 | "order": 46, 1784 | "mode": 0, 1785 | "inputs": [ 1786 | { 1787 | "name": "images", 1788 | "type": "IMAGE", 1789 | "link": 1792 1790 | }, 1791 | { 1792 | "name": "audio", 1793 | "type": "VHS_AUDIO", 1794 | "link": 1462 1795 | }, 1796 | { 1797 | "name": "batch_manager", 1798 | "type": "VHS_BatchManager", 1799 | "link": null 1800 | } 1801 | ], 1802 | "outputs": [ 1803 | { 1804 | "name": "Filenames", 1805 | "type": "VHS_FILENAMES", 1806 | "links": null, 1807 | "shape": 3 1808 | } 1809 | ], 1810 | "properties": { 1811 | "Node name for S&R": "VHS_VideoCombine" 1812 | }, 1813 | "widgets_values": { 1814 | "frame_rate": 25, 1815 | "loop_count": 0, 1816 | "filename_prefix": "AnimateDiff", 1817 | "format": "video/h264-mp4", 1818 | "pix_fmt": "yuv420p", 1819 | "crf": 19, 1820 | "save_metadata": false, 1821 | "pingpong": false, 1822 | "save_output": false, 1823 | "videopreview": { 1824 | "hidden": false, 1825 | "paused": false, 1826 | "params": { 1827 | "filename": "AnimateDiff_00004-audio.mp4", 1828 | "subfolder": "", 1829 | "type": "temp", 1830 | "format": "video/h264-mp4" 1831 | } 1832 | } 1833 | } 1834 | }, 1835 | { 1836 | "id": 1054, 1837 | "type": "GrowMaskWithBlur", 1838 | "pos": [ 1839 | -2952, 1840 | -205 1841 | ], 1842 | "size": { 1843 | "0": 315, 1844 | "1": 246 1845 | }, 1846 | "flags": {}, 1847 | "order": 
19, 1848 | "mode": 0, 1849 | "inputs": [ 1850 | { 1851 | "name": "mask", 1852 | "type": "MASK", 1853 | "link": 1705 1854 | } 1855 | ], 1856 | "outputs": [ 1857 | { 1858 | "name": "mask", 1859 | "type": "MASK", 1860 | "links": [ 1861 | 1706 1862 | ], 1863 | "shape": 3, 1864 | "slot_index": 0 1865 | }, 1866 | { 1867 | "name": "mask_inverted", 1868 | "type": "MASK", 1869 | "links": null, 1870 | "shape": 3 1871 | } 1872 | ], 1873 | "properties": { 1874 | "Node name for S&R": "GrowMaskWithBlur" 1875 | }, 1876 | "widgets_values": [ 1877 | -5, 1878 | 0, 1879 | true, 1880 | false, 1881 | 2, 1882 | 1, 1883 | 1, 1884 | false 1885 | ] 1886 | }, 1887 | { 1888 | "id": 971, 1889 | "type": "ImageFilterGaussianBlur", 1890 | "size": { 1891 | "0": 315, 1892 | "1": 82 1893 | }, 1894 | "flags": {}, 1895 | "mode": 4, 1896 | "inputs": [ 1897 | { 1898 | "name": "images", 1899 | "type": "IMAGE", 1900 | "link": 1780 1901 | } 1902 | ], 1903 | "outputs": [ 1904 | { 1905 | "name": "IMAGE", 1906 | "type": "IMAGE", 1907 | "links": [ 1908 | 1558, 1909 | 1559 1910 | ], 1911 | "shape": 3, 1912 | "slot_index": 0 1913 | } 1914 | ], 1915 | "properties": { 1916 | "Node name for S&R": "ImageFilterGaussianBlur" 1917 | }, 1918 | "widgets_values": [ 1919 | 10, 1920 | 10 1921 | ], 1922 | "order": 34, 1923 | "pos": [ 1924 | -5311, 1925 | 410 1926 | ] 1927 | }, 1928 | { 1929 | "id": 833, 1930 | "type": "VHS_VideoCombine", 1931 | "pos": [ 1932 | -5418, 1933 | 1001 1934 | ], 1935 | "size": [ 1936 | 320, 1937 | 290 1938 | ], 1939 | "flags": {}, 1940 | "order": 35, 1941 | "mode": 0, 1942 | "inputs": [ 1943 | { 1944 | "name": "images", 1945 | "type": "IMAGE", 1946 | "link": 1779 1947 | }, 1948 | { 1949 | "name": "audio", 1950 | "type": "VHS_AUDIO", 1951 | "link": null 1952 | }, 1953 | { 1954 | "name": "batch_manager", 1955 | "type": "VHS_BatchManager", 1956 | "link": null 1957 | } 1958 | ], 1959 | "outputs": [ 1960 | { 1961 | "name": "Filenames", 1962 | "type": "VHS_FILENAMES", 1963 | "links": null, 1964 | 
"shape": 3 1965 | } 1966 | ], 1967 | "properties": { 1968 | "Node name for S&R": "VHS_VideoCombine" 1969 | }, 1970 | "widgets_values": { 1971 | "frame_rate": 25, 1972 | "loop_count": 0, 1973 | "filename_prefix": "musetalk", 1974 | "format": "video/h264-mp4", 1975 | "pix_fmt": "yuv420p", 1976 | "crf": 19, 1977 | "save_metadata": false, 1978 | "pingpong": false, 1979 | "save_output": false, 1980 | "videopreview": { 1981 | "hidden": false, 1982 | "paused": false, 1983 | "params": { 1984 | "filename": "AnimateDiff_00001.mp4", 1985 | "subfolder": "", 1986 | "type": "temp", 1987 | "format": "video/h264-mp4" 1988 | } 1989 | } 1990 | } 1991 | }, 1992 | { 1993 | "id": 844, 1994 | "type": "VHS_VideoCombine", 1995 | "pos": [ 1996 | -4933, 1997 | 1001 1998 | ], 1999 | "size": [ 2000 | 320, 2001 | 290 2002 | ], 2003 | "flags": {}, 2004 | "order": 32, 2005 | "mode": 0, 2006 | "inputs": [ 2007 | { 2008 | "name": "images", 2009 | "type": "IMAGE", 2010 | "link": 1778 2011 | }, 2012 | { 2013 | "name": "audio", 2014 | "type": "VHS_AUDIO", 2015 | "link": null 2016 | }, 2017 | { 2018 | "name": "batch_manager", 2019 | "type": "VHS_BatchManager", 2020 | "link": null 2021 | } 2022 | ], 2023 | "outputs": [ 2024 | { 2025 | "name": "Filenames", 2026 | "type": "VHS_FILENAMES", 2027 | "links": null, 2028 | "shape": 3 2029 | } 2030 | ], 2031 | "properties": { 2032 | "Node name for S&R": "VHS_VideoCombine" 2033 | }, 2034 | "widgets_values": { 2035 | "frame_rate": 25, 2036 | "loop_count": 0, 2037 | "filename_prefix": "musetalk", 2038 | "format": "video/h264-mp4", 2039 | "pix_fmt": "yuv420p", 2040 | "crf": 19, 2041 | "save_metadata": false, 2042 | "pingpong": false, 2043 | "save_output": false, 2044 | "videopreview": { 2045 | "hidden": false, 2046 | "paused": false, 2047 | "params": { 2048 | "filename": "AnimateDiff_00002.mp4", 2049 | "subfolder": "", 2050 | "type": "temp", 2051 | "format": "video/h264-mp4" 2052 | } 2053 | } 2054 | } 2055 | }, 2056 | { 2057 | "id": 970, 2058 | "type": 
"VHS_VideoCombine", 2059 | "pos": [ 2060 | -4794.311109737003, 2061 | -480.4442407796421 2062 | ], 2063 | "size": [ 2064 | 320, 2065 | 290 2066 | ], 2067 | "flags": {}, 2068 | "order": 36, 2069 | "mode": 0, 2070 | "inputs": [ 2071 | { 2072 | "name": "images", 2073 | "type": "IMAGE", 2074 | "link": 1558 2075 | }, 2076 | { 2077 | "name": "audio", 2078 | "type": "VHS_AUDIO", 2079 | "link": null 2080 | }, 2081 | { 2082 | "name": "batch_manager", 2083 | "type": "VHS_BatchManager", 2084 | "link": null 2085 | } 2086 | ], 2087 | "outputs": [ 2088 | { 2089 | "name": "Filenames", 2090 | "type": "VHS_FILENAMES", 2091 | "links": null, 2092 | "shape": 3 2093 | } 2094 | ], 2095 | "properties": { 2096 | "Node name for S&R": "VHS_VideoCombine" 2097 | }, 2098 | "widgets_values": { 2099 | "frame_rate": 25, 2100 | "loop_count": 0, 2101 | "filename_prefix": "musetalk", 2102 | "format": "video/h264-mp4", 2103 | "pix_fmt": "yuv420p", 2104 | "crf": 19, 2105 | "save_metadata": false, 2106 | "pingpong": false, 2107 | "save_output": false, 2108 | "videopreview": { 2109 | "hidden": false, 2110 | "paused": false, 2111 | "params": { 2112 | "filename": "AnimateDiff_00003.mp4", 2113 | "subfolder": "", 2114 | "type": "temp", 2115 | "format": "video/h264-mp4" 2116 | } 2117 | } 2118 | } 2119 | }, 2120 | { 2121 | "id": 842, 2122 | "type": "VHS_VideoCombine", 2123 | "pos": [ 2124 | -4134.311109737003, 2125 | -941.4442407796415 2126 | ], 2127 | "size": [ 2128 | 320, 2129 | 604 2130 | ], 2131 | "flags": {}, 2132 | "order": 41, 2133 | "mode": 0, 2134 | "inputs": [ 2135 | { 2136 | "name": "images", 2137 | "type": "IMAGE", 2138 | "link": 1379 2139 | }, 2140 | { 2141 | "name": "audio", 2142 | "type": "VHS_AUDIO", 2143 | "link": 1380 2144 | }, 2145 | { 2146 | "name": "batch_manager", 2147 | "type": "VHS_BatchManager", 2148 | "link": null 2149 | } 2150 | ], 2151 | "outputs": [ 2152 | { 2153 | "name": "Filenames", 2154 | "type": "VHS_FILENAMES", 2155 | "links": null, 2156 | "shape": 3, 2157 | "slot_index": 0 
2158 | } 2159 | ], 2160 | "properties": { 2161 | "Node name for S&R": "VHS_VideoCombine" 2162 | }, 2163 | "widgets_values": { 2164 | "frame_rate": 25, 2165 | "loop_count": 0, 2166 | "filename_prefix": "musetalk", 2167 | "format": "video/h264-mp4", 2168 | "pix_fmt": "yuv420p", 2169 | "crf": 19, 2170 | "save_metadata": false, 2171 | "pingpong": false, 2172 | "save_output": true, 2173 | "videopreview": { 2174 | "hidden": false, 2175 | "paused": false, 2176 | "params": { 2177 | "filename": "AnimateDiff_00021-audio.mp4", 2178 | "subfolder": "", 2179 | "type": "output", 2180 | "format": "video/h264-mp4" 2181 | } 2182 | } 2183 | } 2184 | }, 2185 | { 2186 | "id": 867, 2187 | "type": "VHS_VideoCombine", 2188 | "pos": [ 2189 | -1978, 2190 | 924 2191 | ], 2192 | "size": [ 2193 | 320, 2194 | 290 2195 | ], 2196 | "flags": {}, 2197 | "order": 45, 2198 | "mode": 0, 2199 | "inputs": [ 2200 | { 2201 | "name": "images", 2202 | "type": "IMAGE", 2203 | "link": 1827 2204 | }, 2205 | { 2206 | "name": "audio", 2207 | "type": "VHS_AUDIO", 2208 | "link": 1460 2209 | }, 2210 | { 2211 | "name": "batch_manager", 2212 | "type": "VHS_BatchManager", 2213 | "link": null 2214 | } 2215 | ], 2216 | "outputs": [ 2217 | { 2218 | "name": "Filenames", 2219 | "type": "VHS_FILENAMES", 2220 | "links": null, 2221 | "shape": 3, 2222 | "slot_index": 0 2223 | } 2224 | ], 2225 | "properties": { 2226 | "Node name for S&R": "VHS_VideoCombine" 2227 | }, 2228 | "widgets_values": { 2229 | "frame_rate": 25, 2230 | "loop_count": 0, 2231 | "filename_prefix": "musetalk", 2232 | "format": "video/h264-mp4", 2233 | "pix_fmt": "yuv420p", 2234 | "crf": 19, 2235 | "save_metadata": false, 2236 | "pingpong": false, 2237 | "save_output": false, 2238 | "videopreview": { 2239 | "hidden": false, 2240 | "paused": false, 2241 | "params": { 2242 | "filename": "AnimateDiff_00005-audio.mp4", 2243 | "subfolder": "", 2244 | "type": "temp", 2245 | "format": "video/h264-mp4" 2246 | } 2247 | } 2248 | } 2249 | }, 2250 | { 2251 | "id": 973, 
2252 | "type": "VHS_VideoCombine", 2253 | "pos": [ 2254 | -2420, 2255 | 1279 2256 | ], 2257 | "size": [ 2258 | 320, 2259 | 290 2260 | ], 2261 | "flags": {}, 2262 | "order": 48, 2263 | "mode": 0, 2264 | "inputs": [ 2265 | { 2266 | "name": "images", 2267 | "type": "IMAGE", 2268 | "link": 1797 2269 | }, 2270 | { 2271 | "name": "audio", 2272 | "type": "VHS_AUDIO", 2273 | "link": 1570 2274 | }, 2275 | { 2276 | "name": "batch_manager", 2277 | "type": "VHS_BatchManager", 2278 | "link": null 2279 | } 2280 | ], 2281 | "outputs": [ 2282 | { 2283 | "name": "Filenames", 2284 | "type": "VHS_FILENAMES", 2285 | "links": null, 2286 | "shape": 3 2287 | } 2288 | ], 2289 | "properties": { 2290 | "Node name for S&R": "VHS_VideoCombine" 2291 | }, 2292 | "widgets_values": { 2293 | "frame_rate": 25, 2294 | "loop_count": 0, 2295 | "filename_prefix": "musetalk", 2296 | "format": "video/h264-mp4", 2297 | "pix_fmt": "yuv420p", 2298 | "crf": 19, 2299 | "save_metadata": false, 2300 | "pingpong": false, 2301 | "save_output": false, 2302 | "videopreview": { 2303 | "hidden": false, 2304 | "paused": false, 2305 | "params": { 2306 | "filename": "AnimateDiff_00006-audio.mp4", 2307 | "subfolder": "", 2308 | "type": "temp", 2309 | "format": "video/h264-mp4" 2310 | } 2311 | } 2312 | } 2313 | }, 2314 | { 2315 | "id": 1093, 2316 | "type": "VHS_VideoCombine", 2317 | "pos": [ 2318 | -948, 2319 | 754 2320 | ], 2321 | "size": [ 2322 | 418.9444885253906, 2323 | 290 2324 | ], 2325 | "flags": {}, 2326 | "order": 50, 2327 | "mode": 0, 2328 | "inputs": [ 2329 | { 2330 | "name": "images", 2331 | "type": "IMAGE", 2332 | "link": 1806 2333 | }, 2334 | { 2335 | "name": "audio", 2336 | "type": "VHS_AUDIO", 2337 | "link": 1801 2338 | }, 2339 | { 2340 | "name": "batch_manager", 2341 | "type": "VHS_BatchManager", 2342 | "link": null 2343 | } 2344 | ], 2345 | "outputs": [ 2346 | { 2347 | "name": "Filenames", 2348 | "type": "VHS_FILENAMES", 2349 | "links": null, 2350 | "shape": 3, 2351 | "slot_index": 0 2352 | } 2353 | ], 
2354 | "properties": { 2355 | "Node name for S&R": "VHS_VideoCombine" 2356 | }, 2357 | "widgets_values": { 2358 | "frame_rate": 25, 2359 | "loop_count": 0, 2360 | "filename_prefix": "musetalk", 2361 | "format": "video/h264-mp4", 2362 | "pix_fmt": "yuv420p", 2363 | "crf": 19, 2364 | "save_metadata": false, 2365 | "pingpong": false, 2366 | "save_output": false, 2367 | "videopreview": { 2368 | "hidden": false, 2369 | "paused": false, 2370 | "params": { 2371 | "filename": "AnimateDiff_00007-audio.mp4", 2372 | "subfolder": "", 2373 | "type": "temp", 2374 | "format": "video/h264-mp4" 2375 | } 2376 | } 2377 | } 2378 | }, 2379 | { 2380 | "id": 976, 2381 | "type": "FaceEnhancement", 2382 | "pos": [ 2383 | -6383, 2384 | 488 2385 | ], 2386 | "size": { 2387 | "0": 300.08770751953125, 2388 | "1": 73.68206024169922 2389 | }, 2390 | "flags": {}, 2391 | "order": 34, 2392 | "mode": 4, 2393 | "inputs": [ 2394 | { 2395 | "name": "images", 2396 | "type": "IMAGE", 2397 | "link": 1784 2398 | } 2399 | ], 2400 | "outputs": [ 2401 | { 2402 | "name": "images", 2403 | "type": "IMAGE", 2404 | "links": [ 2405 | 1587 2406 | ], 2407 | "shape": 3, 2408 | "slot_index": 0 2409 | } 2410 | ], 2411 | "properties": { 2412 | "Node name for S&R": "FaceEnhancement" 2413 | } 2414 | } 2415 | ], 2416 | "links": [ 2417 | [ 2418 | 45, 2419 | 28, 2420 | 0, 2421 | 27, 2422 | 0, 2423 | "VHS_AUDIO" 2424 | ], 2425 | [ 2426 | 67, 2427 | 27, 2428 | 0, 2429 | 36, 2430 | 0, 2431 | "VCAUDIOTENSOR" 2432 | ], 2433 | [ 2434 | 279, 2435 | 4, 2436 | 0, 2437 | 122, 2438 | 1, 2439 | "VAE" 2440 | ], 2441 | [ 2442 | 280, 2443 | 121, 2444 | 0, 2445 | 122, 2446 | 0, 2447 | "MODEL" 2448 | ], 2449 | [ 2450 | 281, 2451 | 36, 2452 | 0, 2453 | 122, 2454 | 2, 2455 | "WHISPERFEAT" 2456 | ], 2457 | [ 2458 | 287, 2459 | 28, 2460 | 0, 2461 | 124, 2462 | 0, 2463 | "*" 2464 | ], 2465 | [ 2466 | 297, 2467 | 36, 2468 | 1, 2469 | 129, 2470 | 0, 2471 | "*" 2472 | ], 2473 | [ 2474 | 422, 2475 | 36, 2476 | 1, 2477 | 223, 2478 | 0, 2479 | "*" 2480 | 
], 2481 | [ 2482 | 626, 2483 | 15, 2484 | 0, 2485 | 16, 2486 | 1, 2487 | "IMAGE" 2488 | ], 2489 | [ 2490 | 871, 2491 | 529, 2492 | 0, 2493 | 527, 2494 | 1, 2495 | "INT" 2496 | ], 2497 | [ 2498 | 872, 2499 | 529, 2500 | 1, 2501 | 527, 2502 | 2, 2503 | "INT" 2504 | ], 2505 | [ 2506 | 873, 2507 | 527, 2508 | 0, 2509 | 526, 2510 | 1, 2511 | "INT" 2512 | ], 2513 | [ 2514 | 1165, 2515 | 734, 2516 | 0, 2517 | 735, 2518 | 0, 2519 | "*" 2520 | ], 2521 | [ 2522 | 1379, 2523 | 122, 2524 | 0, 2525 | 842, 2526 | 0, 2527 | "IMAGE" 2528 | ], 2529 | [ 2530 | 1380, 2531 | 125, 2532 | 0, 2533 | 842, 2534 | 1, 2535 | "VHS_AUDIO" 2536 | ], 2537 | [ 2538 | 1460, 2539 | 911, 2540 | 0, 2541 | 867, 2542 | 1, 2543 | "VHS_AUDIO" 2544 | ], 2545 | [ 2546 | 1462, 2547 | 913, 2548 | 0, 2549 | 912, 2550 | 1, 2551 | "VHS_AUDIO" 2552 | ], 2553 | [ 2554 | 1512, 2555 | 16, 2556 | 0, 2557 | 122, 2558 | 4, 2559 | "IMAGE" 2560 | ], 2561 | [ 2562 | 1515, 2563 | 947, 2564 | 1, 2565 | 97, 2566 | 0, 2567 | "*" 2568 | ], 2569 | [ 2570 | 1516, 2571 | 947, 2572 | 3, 2573 | 734, 2574 | 0, 2575 | "VHS_VIDEOINFO" 2576 | ], 2577 | [ 2578 | 1529, 2579 | 958, 2580 | 0, 2581 | 526, 2582 | 0, 2583 | "IMAGE" 2584 | ], 2585 | [ 2586 | 1530, 2587 | 958, 2588 | 0, 2589 | 529, 2590 | 0, 2591 | "IMAGE" 2592 | ], 2593 | [ 2594 | 1531, 2595 | 958, 2596 | 0, 2597 | 527, 2598 | 0, 2599 | "IMAGE" 2600 | ], 2601 | [ 2602 | 1558, 2603 | 971, 2604 | 0, 2605 | 970, 2606 | 0, 2607 | "IMAGE" 2608 | ], 2609 | [ 2610 | 1559, 2611 | 971, 2612 | 0, 2613 | 122, 2614 | 3, 2615 | "IMAGE" 2616 | ], 2617 | [ 2618 | 1570, 2619 | 911, 2620 | 0, 2621 | 973, 2622 | 1, 2623 | "VHS_AUDIO" 2624 | ], 2625 | [ 2626 | 1575, 2627 | 122, 2628 | 0, 2629 | 866, 2630 | 0, 2631 | "IMAGE" 2632 | ], 2633 | [ 2634 | 1587, 2635 | 976, 2636 | 0, 2637 | 979, 2638 | 0, 2639 | "IMAGE" 2640 | ], 2641 | [ 2642 | 1704, 2643 | 1048, 2644 | 0, 2645 | 1055, 2646 | 0, 2647 | "IMAGE" 2648 | ], 2649 | [ 2650 | 1705, 2651 | 1055, 2652 | 0, 2653 | 1054, 2654 | 0, 2655 | "MASK" 
2656 | ], 2657 | [ 2658 | 1706, 2659 | 1054, 2660 | 0, 2661 | 1056, 2662 | 0, 2663 | "MASK" 2664 | ], 2665 | [ 2666 | 1708, 2667 | 1056, 2668 | 0, 2669 | 1051, 2670 | 0, 2671 | "IMAGE" 2672 | ], 2673 | [ 2674 | 1761, 2675 | 224, 2676 | 0, 2677 | 947, 2678 | 1, 2679 | "INT" 2680 | ], 2681 | [ 2682 | 1775, 2683 | 958, 2684 | 0, 2685 | 1080, 2686 | 0, 2687 | "IMAGE" 2688 | ], 2689 | [ 2690 | 1776, 2691 | 526, 2692 | 1, 2693 | 1080, 2694 | 1, 2695 | "POSE_KEYPOINT" 2696 | ], 2697 | [ 2698 | 1778, 2699 | 1080, 2700 | 0, 2701 | 844, 2702 | 0, 2703 | "IMAGE" 2704 | ], 2705 | [ 2706 | 1779, 2707 | 1080, 2708 | 1, 2709 | 833, 2710 | 0, 2711 | "IMAGE" 2712 | ], 2713 | [ 2714 | 1780, 2715 | 1080, 2716 | 0, 2717 | 971, 2718 | 0, 2719 | "IMAGE" 2720 | ], 2721 | [ 2722 | 1784, 2723 | 1080, 2724 | 0, 2725 | 976, 2726 | 0, 2727 | "IMAGE" 2728 | ], 2729 | [ 2730 | 1786, 2731 | 1083, 2732 | 0, 2733 | 16, 2734 | 0, 2735 | "IMAGE" 2736 | ], 2737 | [ 2738 | 1787, 2739 | 1056, 2740 | 0, 2741 | 1084, 2742 | 0, 2743 | "*" 2744 | ], 2745 | [ 2746 | 1790, 2747 | 979, 2748 | 0, 2749 | 1083, 2750 | 0, 2751 | "*" 2752 | ], 2753 | [ 2754 | 1791, 2755 | 866, 2756 | 0, 2757 | 1090, 2758 | 0, 2759 | "IMAGE" 2760 | ], 2761 | [ 2762 | 1792, 2763 | 1090, 2764 | 0, 2765 | 912, 2766 | 0, 2767 | "IMAGE" 2768 | ], 2769 | [ 2770 | 1796, 2771 | 1084, 2772 | 0, 2773 | 1091, 2774 | 2, 2775 | "IMAGE" 2776 | ], 2777 | [ 2778 | 1797, 2779 | 1091, 2780 | 0, 2781 | 973, 2782 | 0, 2783 | "IMAGE" 2784 | ], 2785 | [ 2786 | 1799, 2787 | 1091, 2788 | 0, 2789 | 1092, 2790 | 1, 2791 | "IMAGE" 2792 | ], 2793 | [ 2794 | 1801, 2795 | 1094, 2796 | 0, 2797 | 1093, 2798 | 1, 2799 | "VHS_AUDIO" 2800 | ], 2801 | [ 2802 | 1806, 2803 | 1092, 2804 | 0, 2805 | 1093, 2806 | 0, 2807 | "IMAGE" 2808 | ], 2809 | [ 2810 | 1822, 2811 | 1107, 2812 | 0, 2813 | 1091, 2814 | 0, 2815 | "IMAGE" 2816 | ], 2817 | [ 2818 | 1823, 2819 | 1107, 2820 | 0, 2821 | 1108, 2822 | 0, 2823 | "IMAGE" 2824 | ], 2825 | [ 2826 | 1824, 2827 | 122, 2828 | 0, 2829 
| 1108, 2830 | 1, 2831 | "IMAGE" 2832 | ], 2833 | [ 2834 | 1825, 2835 | 1084, 2836 | 0, 2837 | 1108, 2838 | 2, 2839 | "IMAGE" 2840 | ], 2841 | [ 2842 | 1826, 2843 | 1108, 2844 | 0, 2845 | 1092, 2846 | 0, 2847 | "IMAGE" 2848 | ], 2849 | [ 2850 | 1827, 2851 | 1108, 2852 | 0, 2853 | 867, 2854 | 0, 2855 | "IMAGE" 2856 | ], 2857 | [ 2858 | 1828, 2859 | 958, 2860 | 0, 2861 | 1107, 2862 | 0, 2863 | "*" 2864 | ], 2865 | [ 2866 | 1848, 2867 | 1090, 2868 | 0, 2869 | 1091, 2870 | 1, 2871 | "IMAGE" 2872 | ], 2873 | [ 2874 | 1849, 2875 | 947, 2876 | 0, 2877 | 958, 2878 | 0, 2879 | "*" 2880 | ] 2881 | ], 2882 | "groups": [ 2883 | { 2884 | "title": "loadvideo", 2885 | "bounding": [ 2886 | -8845, 2887 | 639, 2888 | 1883, 2889 | 1047 2890 | ], 2891 | "color": "#3f789e", 2892 | "font_size": 24, 2893 | "locked": false 2894 | }, 2895 | { 2896 | "title": "load audio and cal video frame", 2897 | "bounding": [ 2898 | -8064, 2899 | -883, 2900 | 1878, 2901 | 675 2902 | ], 2903 | "color": "#3f789e", 2904 | "font_size": 24, 2905 | "locked": false 2906 | }, 2907 | { 2908 | "title": "musetalk sampler", 2909 | "bounding": [ 2910 | -6125, 2911 | -1096, 2912 | 2378, 2913 | 1226 2914 | ], 2915 | "color": "#3f789e", 2916 | "font_size": 24, 2917 | "locked": false 2918 | }, 2919 | { 2920 | "title": "preprocess and crop", 2921 | "bounding": [ 2922 | -6917, 2923 | 650, 2924 | 2505, 2925 | 1051 2926 | ], 2927 | "color": "#3f789e", 2928 | "font_size": 24, 2929 | "locked": false 2930 | }, 2931 | { 2932 | "title": "postprocess and uncrop", 2933 | "bounding": [ 2934 | -4369, 2935 | 619, 2936 | 2815, 2937 | 1523 2938 | ], 2939 | "color": "#3f789e", 2940 | "font_size": 24, 2941 | "locked": false 2942 | }, 2943 | { 2944 | "title": "uncrop face mask", 2945 | "bounding": [ 2946 | -3651, 2947 | -332, 2948 | 1440, 2949 | 917 2950 | ], 2951 | "color": "#3f789e", 2952 | "font_size": 24, 2953 | "locked": false 2954 | }, 2955 | { 2956 | "title": "hires", 2957 | "bounding": [ 2958 | -6520, 2959 | 220, 2960 | 1033, 2961 
| 387 2962 | ], 2963 | "color": "#3f789e", 2964 | "font_size": 24, 2965 | "locked": false 2966 | }, 2967 | { 2968 | "title": "blur", 2969 | "bounding": [ 2970 | -5448, 2971 | 217, 2972 | 916, 2973 | 396 2974 | ], 2975 | "color": "#3f789e", 2976 | "font_size": 24, 2977 | "locked": false 2978 | }, 2979 | { 2980 | "title": "hires", 2981 | "bounding": [ 2982 | -4374, 2983 | 227, 2984 | 631, 2985 | 364 2986 | ], 2987 | "color": "#3f789e", 2988 | "font_size": 24, 2989 | "locked": false 2990 | }, 2991 | { 2992 | "title": "diff", 2993 | "bounding": [ 2994 | -1514, 2995 | 613, 2996 | 1121, 2997 | 1519 2998 | ], 2999 | "color": "#3f789e", 3000 | "font_size": 24, 3001 | "locked": false 3002 | } 3003 | ], 3004 | "config": {}, 3005 | "extra": { 3006 | "workspace_info": { 3007 | "id": "Ar3D9He4S5MoHhCb_zi9p" 3008 | } 3009 | }, 3010 | "version": 0.4 3011 | } -------------------------------------------------------------------------------- /workflow/musetalk flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xuhongming251/ComfyUI-MuseTalkUtils/df7b26788afb765a7ad0920bf9be81e213710068/workflow/musetalk flow.png -------------------------------------------------------------------------------- /workflow/musetalk train flow.json: -------------------------------------------------------------------------------- 1 | { 2 | "last_node_id": 48, 3 | "last_link_id": 59, 4 | "nodes": [ 5 | { 6 | "id": 14, 7 | "type": "PixelPerfectResolution", 8 | "pos": [ 9 | 1910, 10 | 980 11 | ], 12 | "size": { 13 | "0": 393, 14 | "1": 106 15 | }, 16 | "flags": {}, 17 | "order": 8, 18 | "mode": 0, 19 | "inputs": [ 20 | { 21 | "name": "original_image", 22 | "type": "IMAGE", 23 | "link": 13, 24 | "label": "original_image" 25 | }, 26 | { 27 | "name": "image_gen_width", 28 | "type": "INT", 29 | "link": 14, 30 | "widget": { 31 | "name": "image_gen_width" 32 | }, 33 | "slot_index": 1 34 | }, 35 | { 36 | "name": "image_gen_height", 37 | "type": 
"INT", 38 | "link": 15, 39 | "widget": { 40 | "name": "image_gen_height" 41 | } 42 | } 43 | ], 44 | "outputs": [ 45 | { 46 | "name": "RESOLUTION (INT)", 47 | "type": "INT", 48 | "links": [ 49 | 12 50 | ], 51 | "shape": 3, 52 | "label": "RESOLUTION (INT)", 53 | "slot_index": 0 54 | } 55 | ], 56 | "properties": { 57 | "Node name for S&R": "PixelPerfectResolution" 58 | }, 59 | "widgets_values": [ 60 | 800, 61 | 536, 62 | "Just Resize" 63 | ] 64 | }, 65 | { 66 | "id": 15, 67 | "type": "GetImageSize+", 68 | "pos": [ 69 | 1850, 70 | 770 71 | ], 72 | "size": { 73 | "0": 210, 74 | "1": 46 75 | }, 76 | "flags": {}, 77 | "order": 6, 78 | "mode": 0, 79 | "inputs": [ 80 | { 81 | "name": "image", 82 | "type": "IMAGE", 83 | "link": 16 84 | } 85 | ], 86 | "outputs": [ 87 | { 88 | "name": "width", 89 | "type": "INT", 90 | "links": [ 91 | 14 92 | ], 93 | "shape": 3, 94 | "slot_index": 0 95 | }, 96 | { 97 | "name": "height", 98 | "type": "INT", 99 | "links": [ 100 | 15 101 | ], 102 | "shape": 3, 103 | "slot_index": 1 104 | } 105 | ], 106 | "properties": { 107 | "Node name for S&R": "GetImageSize+" 108 | } 109 | }, 110 | { 111 | "id": 22, 112 | "type": "Reroute", 113 | "pos": [ 114 | 1473, 115 | 390 116 | ], 117 | "size": [ 118 | 75, 119 | 26 120 | ], 121 | "flags": {}, 122 | "order": 4, 123 | "mode": 0, 124 | "inputs": [ 125 | { 126 | "name": "", 127 | "type": "*", 128 | "link": 57 129 | } 130 | ], 131 | "outputs": [ 132 | { 133 | "name": "", 134 | "type": "IMAGE", 135 | "links": [ 136 | 11, 137 | 13, 138 | 16, 139 | 49 140 | ], 141 | "slot_index": 0 142 | } 143 | ], 144 | "properties": { 145 | "showOutputText": false, 146 | "horizontal": false 147 | } 148 | }, 149 | { 150 | "id": 35, 151 | "type": "Display Any (rgthree)", 152 | "pos": [ 153 | 4113, 154 | 938 155 | ], 156 | "size": { 157 | "0": 226.42002868652344, 158 | "1": 116.54998779296875 159 | }, 160 | "flags": {}, 161 | "order": 9, 162 | "mode": 0, 163 | "inputs": [ 164 | { 165 | "name": "source", 166 | "type": "*", 167 | 
"link": 34, 168 | "dir": 3, 169 | "label": "source" 170 | } 171 | ], 172 | "properties": { 173 | "Node name for S&R": "Display Any (rgthree)" 174 | }, 175 | "widgets_values": [ 176 | "" 177 | ] 178 | }, 179 | { 180 | "id": 25, 181 | "type": "MuseTalkTrainPreprocess", 182 | "pos": [ 183 | 2894, 184 | 393 185 | ], 186 | "size": { 187 | "0": 354.3999938964844, 188 | "1": 174 189 | }, 190 | "flags": {}, 191 | "order": 11, 192 | "mode": 0, 193 | "inputs": [ 194 | { 195 | "name": "origin_images", 196 | "type": "IMAGE", 197 | "link": 49 198 | }, 199 | { 200 | "name": "pose_kps", 201 | "type": "POSE_KEYPOINT", 202 | "link": 25 203 | } 204 | ], 205 | "outputs": [ 206 | { 207 | "name": "rotated_faces", 208 | "type": "IMAGE", 209 | "links": [ 210 | 44 211 | ], 212 | "shape": 3, 213 | "slot_index": 0 214 | }, 215 | { 216 | "name": "rotated_faces_with_landmarks", 217 | "type": "IMAGE", 218 | "links": [], 219 | "shape": 3, 220 | "slot_index": 1 221 | } 222 | ], 223 | "properties": { 224 | "Node name for S&R": "MuseTalkTrainPreprocess" 225 | }, 226 | "widgets_values": [ 227 | "full", 228 | 0, 229 | 0, 230 | 0, 231 | 0 232 | ] 233 | }, 234 | { 235 | "id": 41, 236 | "type": "MuseTalkTrain", 237 | "pos": [ 238 | 4021, 239 | 379 240 | ], 241 | "size": { 242 | "0": 315, 243 | "1": 78 244 | }, 245 | "flags": {}, 246 | "order": 12, 247 | "mode": 0, 248 | "inputs": [ 249 | { 250 | "name": "images", 251 | "type": "IMAGE", 252 | "link": 44 253 | }, 254 | { 255 | "name": "whisper_features", 256 | "type": "WHISPERFEAT", 257 | "link": 45 258 | } 259 | ], 260 | "outputs": [ 261 | { 262 | "name": "images", 263 | "type": "IMAGE", 264 | "links": [ 265 | 46 266 | ], 267 | "shape": 3, 268 | "slot_index": 0 269 | } 270 | ], 271 | "properties": { 272 | "Node name for S&R": "MuseTalkTrain" 273 | }, 274 | "widgets_values": [ 275 | 1 276 | ] 277 | }, 278 | { 279 | "id": 38, 280 | "type": "VHS_VideoCombine", 281 | "pos": [ 282 | 4788, 283 | 360 284 | ], 285 | "size": { 286 | "0": 320, 287 | "1": 290 288 
| }, 289 | "flags": {}, 290 | "order": 13, 291 | "mode": 0, 292 | "inputs": [ 293 | { 294 | "name": "images", 295 | "type": "IMAGE", 296 | "link": 46 297 | }, 298 | { 299 | "name": "audio", 300 | "type": "VHS_AUDIO", 301 | "link": null 302 | }, 303 | { 304 | "name": "batch_manager", 305 | "type": "VHS_BatchManager", 306 | "link": null 307 | } 308 | ], 309 | "outputs": [ 310 | { 311 | "name": "Filenames", 312 | "type": "VHS_FILENAMES", 313 | "links": null, 314 | "shape": 3 315 | } 316 | ], 317 | "properties": { 318 | "Node name for S&R": "VHS_VideoCombine" 319 | }, 320 | "widgets_values": { 321 | "frame_rate": 25, 322 | "loop_count": 0, 323 | "filename_prefix": "AnimateDiff", 324 | "format": "video/h264-mp4", 325 | "pix_fmt": "yuv420p", 326 | "crf": 19, 327 | "save_metadata": false, 328 | "pingpong": false, 329 | "save_output": false, 330 | "videopreview": { 331 | "hidden": false, 332 | "paused": false, 333 | "params": { 334 | "filename": "AnimateDiff_00019.gif", 335 | "subfolder": "", 336 | "type": "output", 337 | "format": "image/gif" 338 | } 339 | } 340 | } 341 | }, 342 | { 343 | "id": 13, 344 | "type": "DWPreprocessor", 345 | "pos": [ 346 | 2320, 347 | 670 348 | ], 349 | "size": { 350 | "0": 315, 351 | "1": 198 352 | }, 353 | "flags": {}, 354 | "order": 10, 355 | "mode": 0, 356 | "inputs": [ 357 | { 358 | "name": "image", 359 | "type": "IMAGE", 360 | "link": 11 361 | }, 362 | { 363 | "name": "resolution", 364 | "type": "INT", 365 | "link": 12, 366 | "widget": { 367 | "name": "resolution" 368 | } 369 | } 370 | ], 371 | "outputs": [ 372 | { 373 | "name": "IMAGE", 374 | "type": "IMAGE", 375 | "links": [], 376 | "shape": 3, 377 | "slot_index": 0 378 | }, 379 | { 380 | "name": "POSE_KEYPOINT", 381 | "type": "POSE_KEYPOINT", 382 | "links": [ 383 | 25 384 | ], 385 | "shape": 3, 386 | "slot_index": 1 387 | } 388 | ], 389 | "properties": { 390 | "Node name for S&R": "DWPreprocessor" 391 | }, 392 | "widgets_values": [ 393 | "disable", 394 | "disable", 395 | "enable", 396 
| 512, 397 | "yolox_l.torchscript.pt", 398 | "dw-ll_ucoco_384_bs5.torchscript.pt" 399 | ] 400 | }, 401 | { 402 | "id": 47, 403 | "type": "ImageCrop", 404 | "pos": [ 405 | 1049, 406 | 473 407 | ], 408 | "size": { 409 | "0": 315, 410 | "1": 130 411 | }, 412 | "flags": {}, 413 | "order": 1, 414 | "mode": 4, 415 | "inputs": [ 416 | { 417 | "name": "image", 418 | "type": "IMAGE", 419 | "link": 55 420 | } 421 | ], 422 | "outputs": [ 423 | { 424 | "name": "IMAGE", 425 | "type": "IMAGE", 426 | "links": [ 427 | 56, 428 | 57 429 | ], 430 | "shape": 3, 431 | "slot_index": 0 432 | } 433 | ], 434 | "properties": { 435 | "Node name for S&R": "ImageCrop" 436 | }, 437 | "widgets_values": [ 438 | 1000, 439 | 1000, 440 | 600, 441 | 120 442 | ] 443 | }, 444 | { 445 | "id": 46, 446 | "type": "VHS_VideoCombine", 447 | "pos": [ 448 | 1039, 449 | 42 450 | ], 451 | "size": [ 452 | 320, 453 | 290 454 | ], 455 | "flags": {}, 456 | "order": 3, 457 | "mode": 4, 458 | "inputs": [ 459 | { 460 | "name": "images", 461 | "type": "IMAGE", 462 | "link": 56 463 | }, 464 | { 465 | "name": "audio", 466 | "type": "VHS_AUDIO", 467 | "link": null 468 | }, 469 | { 470 | "name": "batch_manager", 471 | "type": "VHS_BatchManager", 472 | "link": null 473 | } 474 | ], 475 | "outputs": [ 476 | { 477 | "name": "Filenames", 478 | "type": "VHS_FILENAMES", 479 | "links": null, 480 | "shape": 3 481 | } 482 | ], 483 | "properties": { 484 | "Node name for S&R": "VHS_VideoCombine" 485 | }, 486 | "widgets_values": { 487 | "frame_rate": 25, 488 | "loop_count": 0, 489 | "filename_prefix": "AnimateDiff", 490 | "format": "video/h264-mp4", 491 | "pix_fmt": "yuv420p", 492 | "crf": 19, 493 | "save_metadata": false, 494 | "pingpong": false, 495 | "save_output": false, 496 | "videopreview": { 497 | "hidden": false, 498 | "paused": false, 499 | "params": { 500 | "filename": "AnimateDiff_00013.mp4", 501 | "subfolder": "", 502 | "type": "temp", 503 | "format": "video/h264-mp4" 504 | } 505 | } 506 | } 507 | }, 508 | { 509 | "id": 42, 
510 | "type": "VHS_LoadVideo", 511 | "pos": [ 512 | 663, 513 | 600 514 | ], 515 | "size": [ 516 | 240, 517 | 262 518 | ], 519 | "flags": {}, 520 | "order": 0, 521 | "mode": 0, 522 | "inputs": [ 523 | { 524 | "name": "batch_manager", 525 | "type": "VHS_BatchManager", 526 | "link": null 527 | } 528 | ], 529 | "outputs": [ 530 | { 531 | "name": "IMAGE", 532 | "type": "IMAGE", 533 | "links": [ 534 | 55 535 | ], 536 | "shape": 3, 537 | "slot_index": 0 538 | }, 539 | { 540 | "name": "frame_count", 541 | "type": "INT", 542 | "links": [], 543 | "shape": 3, 544 | "slot_index": 1 545 | }, 546 | { 547 | "name": "audio", 548 | "type": "VHS_AUDIO", 549 | "links": [ 550 | 59 551 | ], 552 | "shape": 3, 553 | "slot_index": 2 554 | }, 555 | { 556 | "name": "video_info", 557 | "type": "VHS_VIDEOINFO", 558 | "links": [], 559 | "shape": 3, 560 | "slot_index": 3 561 | } 562 | ], 563 | "properties": { 564 | "Node name for S&R": "VHS_LoadVideo" 565 | }, 566 | "widgets_values": { 567 | "video": "高清.mp4", 568 | "force_rate": 25, 569 | "force_size": "Disabled", 570 | "custom_width": 512, 571 | "custom_height": 512, 572 | "frame_load_cap": 0, 573 | "skip_first_frames": 0, 574 | "select_every_nth": 1, 575 | "choose video to upload": "image", 576 | "videopreview": { 577 | "hidden": false, 578 | "paused": false, 579 | "params": { 580 | "frame_load_cap": 0, 581 | "skip_first_frames": 0, 582 | "force_rate": 25, 583 | "filename": "高清.mp4", 584 | "type": "input", 585 | "format": "video/mp4", 586 | "select_every_nth": 1 587 | } 588 | } 589 | } 590 | }, 591 | { 592 | "id": 31, 593 | "type": "vhs_audio_to_audio_tensor", 594 | "pos": [ 595 | 2577, 596 | 1354 597 | ], 598 | "size": { 599 | "0": 315, 600 | "1": 102 601 | }, 602 | "flags": {}, 603 | "order": 5, 604 | "mode": 0, 605 | "inputs": [ 606 | { 607 | "name": "vhs_audio", 608 | "type": "VHS_AUDIO", 609 | "link": 38, 610 | "slot_index": 0, 611 | "label": "vhs_audio" 612 | } 613 | ], 614 | "outputs": [ 615 | { 616 | "name": "audio_tensor", 617 | 
"type": "VCAUDIOTENSOR", 618 | "links": [ 619 | 32 620 | ], 621 | "shape": 3, 622 | "slot_index": 0, 623 | "label": "audio_tensor" 624 | }, 625 | { 626 | "name": "audio_dur", 627 | "type": "INT", 628 | "links": null, 629 | "shape": 3, 630 | "label": "audio_dur" 631 | } 632 | ], 633 | "properties": { 634 | "Node name for S&R": "vhs_audio_to_audio_tensor" 635 | }, 636 | "widgets_values": [ 637 | 16000, 638 | 1 639 | ] 640 | }, 641 | { 642 | "id": 33, 643 | "type": "whisper_to_features", 644 | "pos": [ 645 | 3209, 646 | 1353 647 | ], 648 | "size": { 649 | "0": 342.5999755859375, 650 | "1": 78 651 | }, 652 | "flags": {}, 653 | "order": 7, 654 | "mode": 0, 655 | "inputs": [ 656 | { 657 | "name": "audio_tensor", 658 | "type": "VCAUDIOTENSOR", 659 | "link": 32, 660 | "slot_index": 0, 661 | "label": "audio_tensor" 662 | } 663 | ], 664 | "outputs": [ 665 | { 666 | "name": "whisper_chunks", 667 | "type": "WHISPERFEAT", 668 | "links": [ 669 | 45 670 | ], 671 | "shape": 3, 672 | "slot_index": 0, 673 | "label": "whisper_chunks" 674 | }, 675 | { 676 | "name": "frame_count", 677 | "type": "INT", 678 | "links": [ 679 | 34 680 | ], 681 | "shape": 3, 682 | "slot_index": 1, 683 | "label": "frame_count" 684 | } 685 | ], 686 | "properties": { 687 | "Node name for S&R": "whisper_to_features" 688 | }, 689 | "widgets_values": [ 690 | 25 691 | ] 692 | }, 693 | { 694 | "id": 37, 695 | "type": "Reroute", 696 | "pos": [ 697 | 1802, 698 | 1338 699 | ], 700 | "size": [ 701 | 75, 702 | 26 703 | ], 704 | "flags": {}, 705 | "order": 2, 706 | "mode": 0, 707 | "inputs": [ 708 | { 709 | "name": "", 710 | "type": "*", 711 | "link": 59 712 | } 713 | ], 714 | "outputs": [ 715 | { 716 | "name": "", 717 | "type": "VHS_AUDIO", 718 | "links": [ 719 | 38 720 | ], 721 | "slot_index": 0 722 | } 723 | ], 724 | "properties": { 725 | "showOutputText": false, 726 | "horizontal": false 727 | } 728 | } 729 | ], 730 | "links": [ 731 | [ 732 | 11, 733 | 22, 734 | 0, 735 | 13, 736 | 0, 737 | "IMAGE" 738 | ], 739 | [ 
740 | 12, 741 | 14, 742 | 0, 743 | 13, 744 | 1, 745 | "INT" 746 | ], 747 | [ 748 | 13, 749 | 22, 750 | 0, 751 | 14, 752 | 0, 753 | "IMAGE" 754 | ], 755 | [ 756 | 14, 757 | 15, 758 | 0, 759 | 14, 760 | 1, 761 | "INT" 762 | ], 763 | [ 764 | 15, 765 | 15, 766 | 1, 767 | 14, 768 | 2, 769 | "INT" 770 | ], 771 | [ 772 | 16, 773 | 22, 774 | 0, 775 | 15, 776 | 0, 777 | "IMAGE" 778 | ], 779 | [ 780 | 25, 781 | 13, 782 | 1, 783 | 25, 784 | 1, 785 | "POSE_KEYPOINT" 786 | ], 787 | [ 788 | 32, 789 | 31, 790 | 0, 791 | 33, 792 | 0, 793 | "VCAUDIOTENSOR" 794 | ], 795 | [ 796 | 34, 797 | 33, 798 | 1, 799 | 35, 800 | 0, 801 | "*" 802 | ], 803 | [ 804 | 38, 805 | 37, 806 | 0, 807 | 31, 808 | 0, 809 | "VHS_AUDIO" 810 | ], 811 | [ 812 | 44, 813 | 25, 814 | 0, 815 | 41, 816 | 0, 817 | "IMAGE" 818 | ], 819 | [ 820 | 45, 821 | 33, 822 | 0, 823 | 41, 824 | 1, 825 | "WHISPERFEAT" 826 | ], 827 | [ 828 | 46, 829 | 41, 830 | 0, 831 | 38, 832 | 0, 833 | "IMAGE" 834 | ], 835 | [ 836 | 49, 837 | 22, 838 | 0, 839 | 25, 840 | 0, 841 | "IMAGE" 842 | ], 843 | [ 844 | 55, 845 | 42, 846 | 0, 847 | 47, 848 | 0, 849 | "IMAGE" 850 | ], 851 | [ 852 | 56, 853 | 47, 854 | 0, 855 | 46, 856 | 0, 857 | "IMAGE" 858 | ], 859 | [ 860 | 57, 861 | 47, 862 | 0, 863 | 22, 864 | 0, 865 | "*" 866 | ], 867 | [ 868 | 59, 869 | 42, 870 | 2, 871 | 37, 872 | 0, 873 | "*" 874 | ] 875 | ], 876 | "groups": [], 877 | "config": {}, 878 | "extra": { 879 | "workspace_info": { 880 | "id": "yYiOrtl5rbrkJhUJ9UPwT" 881 | } 882 | }, 883 | "version": 0.4 884 | } -------------------------------------------------------------------------------- /workflow/sampleimage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xuhongming251/ComfyUI-MuseTalkUtils/df7b26788afb765a7ad0920bf9be81e213710068/workflow/sampleimage.png -------------------------------------------------------------------------------- /workflow/train.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/xuhongming251/ComfyUI-MuseTalkUtils/df7b26788afb765a7ad0920bf9be81e213710068/workflow/train.png -------------------------------------------------------------------------------- /workflow/trainsample.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xuhongming251/ComfyUI-MuseTalkUtils/df7b26788afb765a7ad0920bf9be81e213710068/workflow/trainsample.png --------------------------------------------------------------------------------