├── .gitignore
├── README.md
├── __init__.py
├── doc
│   ├── demo1.jpg
│   ├── mimicmotion_demo_20240702092927.mp4
│   └── mimicmotion_workflow.json
├── mimicmotion
│   ├── __init__.py
│   ├── dwpose
│   │   ├── .gitignore
│   │   ├── __init__.py
│   │   ├── dwpose_detector.py
│   │   ├── onnxdet.py
│   │   ├── onnxpose.py
│   │   ├── preprocess.py
│   │   ├── util.py
│   │   └── wholebody.py
│   ├── modules
│   │   ├── __init__.py
│   │   ├── attention.py
│   │   ├── pose_net.py
│   │   └── unet.py
│   ├── pipelines
│   │   └── pipeline_mimicmotion.py
│   └── utils
│       ├── __init__.py
│       ├── loader.py
│       └── utils.py
├── nodes.py
├── requirements.txt
├── test.yaml
└── web
    └── js
        ├── previewVideo.js
        └── uploadVideo.js

/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | models
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ComfyUI-MimicMotion
2 | A ComfyUI custom node for [MimicMotion](https://github.com/Tencent/MimicMotion)
3 | [workflow](./doc/mimicmotion_workflow.json)
4 | 
5 | ## Example
6 | tested on a 2080 Ti (11 GB), torch==2.3.0+cu121, Python 3.10.8
7 | - input
8 | 
9 | refer_img
10 | 
11 | 
12 | 
13 | 
14 | 
15 | 
16 | refer_video
17 | 
18 | - output
19 | 
20 | https://github.com/Tencent/MimicMotion/assets/149982694/940a4aa0-a174-48e6-add7-96bb74ea916e
21 | 
22 | ## How to use
23 | make sure `ffmpeg` works in your command line
24 | for Linux
25 | ```
26 | apt update
27 | apt install ffmpeg
28 | ```
29 | for Windows, you can install `ffmpeg` automatically with [WingetUI](https://github.com/marticliment/WingetUI)
30 | 
31 | then!
32 | ```
33 | ## install xformers matching your torch; for torch==2.1.0+cu121
34 | pip install xformers==0.0.22.post7
35 | 
36 | ## in ComfyUI/custom_nodes
37 | git clone https://github.com/AIFSH/ComfyUI-MimicMotion.git
38 | cd ComfyUI-MimicMotion
39 | pip install -r requirements.txt
40 | ```
41 | weights will be downloaded from Hugging Face
42 | 
43 | ## Tutorial
44 | - MimicMotion! The ComfyUI plugin is here (Bilibili): https://b23.tv/McnRUpd
45 | - QQ group: 852228202
46 | 
47 | ## Thanks
48 | 
49 | [MimicMotion](https://github.com/Tencent/MimicMotion)
50 | 
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | # Set the web directory; any .js file in that directory will be loaded by the frontend as a frontend extension
2 | WEB_DIRECTORY = "./web"
3 | 
4 | from .nodes import MimicMotionNode, LoadVideo, PreViewVideo
5 | # A dictionary that contains all nodes you want to export with their names
6 | # NOTE: names should be globally unique
7 | NODE_CLASS_MAPPINGS = {
8 | "MimicMotionNode": MimicMotionNode,
9 | "LoadVideo": LoadVideo,
10 | "PreViewVideo": PreViewVideo
11 | }
12 | 
--------------------------------------------------------------------------------
/doc/demo1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-MimicMotion/0f376479219afe8431f634539359eb26b981d1e5/doc/demo1.jpg
--------------------------------------------------------------------------------
/doc/mimicmotion_demo_20240702092927.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AIFSH/ComfyUI-MimicMotion/0f376479219afe8431f634539359eb26b981d1e5/doc/mimicmotion_demo_20240702092927.mp4
--------------------------------------------------------------------------------
/doc/mimicmotion_workflow.json:
--------------------------------------------------------------------------------
1 | {
2 | "last_node_id": 5,
3 | "last_link_id": 6,
4 | "nodes": [
5 | {
6 | "id": 2,
7 | "type": "LoadImage",
8 | "pos": [
9 | 39,
10 | 28
11 | ],
12 | "size": {
13 | "0": 315,
14 | "1": 314
15 | },
16 | "flags": {},
17 | "order": 0,
18 | "mode": 0,
19 | "outputs": [
20 | {
21 | "name": "IMAGE",
22 | "type": "IMAGE",
23 | "links": [
24 | 4
25 | ],
26 | "shape": 3
27 | },
28 | {
29 | "name": "MASK",
30 | "type": "MASK",
31 | "links": null,
32 | "shape": 3
33 | }
34 | ],
35 | "properties": {
36 | "Node name for S&R": "LoadImage"
37 | },
38 | "widgets_values": [
39 | "demo1.jpg",
40 | "image"
41 | ]
42 | },
43 | {
44 | "id": 3,
45 | "type": "LoadVideo",
46 | "pos": [
47 | 51,
48 | 356
49 | ],
50 | "size": {
51 | "0": 315,
52 | "1": 612.4444580078125
53 | },
54 | "flags": {},
55 | "order": 1,
56 | "mode": 0,
57 | "outputs": [
58 | {
59 | "name": "VIDEO",
60 | "type": "VIDEO",
61 | "links": [
62 | 5
63 | ],
64 | "shape": 3,
65 | "slot_index": 0
66 | }
67 | ],
68 | "properties": {
69 | "Node name for S&R": "LoadVideo"
70 | },
71 | "widgets_values": [
72 | "demo.mp4",
73 | "Video",
74 | {
75 | "hidden": 
false, 76 | "paused": false, 77 | "params": {} 78 | } 79 | ] 80 | }, 81 | { 82 | "id": 5, 83 | "type": "MimicMotionNode", 84 | "pos": [ 85 | 459, 86 | 54 87 | ], 88 | "size": { 89 | "0": 315, 90 | "1": 294 91 | }, 92 | "flags": {}, 93 | "order": 2, 94 | "mode": 0, 95 | "inputs": [ 96 | { 97 | "name": "ref_image", 98 | "type": "IMAGE", 99 | "link": 4, 100 | "slot_index": 0 101 | }, 102 | { 103 | "name": "ref_video_path", 104 | "type": "VIDEO", 105 | "link": 5 106 | } 107 | ], 108 | "outputs": [ 109 | { 110 | "name": "VIDEO", 111 | "type": "VIDEO", 112 | "links": [ 113 | 6 114 | ], 115 | "shape": 3, 116 | "slot_index": 0 117 | } 118 | ], 119 | "properties": { 120 | "Node name for S&R": "MimicMotionNode" 121 | }, 122 | "widgets_values": [ 123 | 576, 124 | 2, 125 | 8, 126 | 6, 127 | 4, 128 | 25, 129 | 2, 130 | 15, 131 | 415, 132 | "randomize" 133 | ] 134 | }, 135 | { 136 | "id": 4, 137 | "type": "PreViewVideo", 138 | "pos": [ 139 | 816, 140 | 74 141 | ], 142 | "size": { 143 | "0": 210, 144 | "1": 377.77777099609375 145 | }, 146 | "flags": {}, 147 | "order": 3, 148 | "mode": 0, 149 | "inputs": [ 150 | { 151 | "name": "video", 152 | "type": "VIDEO", 153 | "link": 6 154 | } 155 | ], 156 | "properties": { 157 | "Node name for S&R": "PreViewVideo" 158 | }, 159 | "widgets_values": [ 160 | { 161 | "hidden": false, 162 | "paused": false, 163 | "params": {} 164 | } 165 | ] 166 | } 167 | ], 168 | "links": [ 169 | [ 170 | 4, 171 | 2, 172 | 0, 173 | 5, 174 | 0, 175 | "IMAGE" 176 | ], 177 | [ 178 | 5, 179 | 3, 180 | 0, 181 | 5, 182 | 1, 183 | "VIDEO" 184 | ], 185 | [ 186 | 6, 187 | 5, 188 | 0, 189 | 4, 190 | 0, 191 | "VIDEO" 192 | ] 193 | ], 194 | "groups": [], 195 | "config": {}, 196 | "extra": { 197 | "ds": { 198 | "scale": 1.1000000000000005, 199 | "offset": [ 200 | 86.66106242715553, 201 | 12.114120018825606 202 | ] 203 | } 204 | }, 205 | "version": 0.4 206 | } -------------------------------------------------------------------------------- /mimicmotion/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-MimicMotion/0f376479219afe8431f634539359eb26b981d1e5/mimicmotion/__init__.py -------------------------------------------------------------------------------- /mimicmotion/dwpose/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | -------------------------------------------------------------------------------- /mimicmotion/dwpose/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIFSH/ComfyUI-MimicMotion/0f376479219afe8431f634539359eb26b981d1e5/mimicmotion/dwpose/__init__.py -------------------------------------------------------------------------------- /mimicmotion/dwpose/dwpose_detector.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import torch 5 | 6 | from .wholebody import Wholebody 7 | 8 | os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE" 9 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 10 | 11 | class DWposeDetector: 12 | """ 13 | A pose detect method for image-like data. 
14 | 15 | Parameters: 16 | model_det: (str) serialized ONNX format model path, 17 | such as https://huggingface.co/yzd-v/DWPose/blob/main/yolox_l.onnx 18 | model_pose: (str) serialized ONNX format model path, 19 | such as https://huggingface.co/yzd-v/DWPose/blob/main/dw-ll_ucoco_384.onnx 20 | device: (str) 'cpu' or 'cuda:{device_id}' 21 | """ 22 | def __init__(self, model_det, model_pose, device='cpu'): 23 | self.pose_estimation = Wholebody(model_det=model_det, model_pose=model_pose, device=device) 24 | 25 | def __call__(self, oriImg): 26 | oriImg = oriImg.copy() 27 | H, W, C = oriImg.shape 28 | with torch.no_grad(): 29 | candidate, score = self.pose_estimation(oriImg) 30 | nums, _, locs = candidate.shape 31 | candidate[..., 0] /= float(W) 32 | candidate[..., 1] /= float(H) 33 | body = candidate[:, :18].copy() 34 | body = body.reshape(nums * 18, locs) 35 | subset = score[:, :18].copy() 36 | for i in range(len(subset)): 37 | for j in range(len(subset[i])): 38 | if subset[i][j] > 0.3: 39 | subset[i][j] = int(18 * i + j) 40 | else: 41 | subset[i][j] = -1 42 | 43 | # un_visible = subset < 0.3 44 | # candidate[un_visible] = -1 45 | 46 | # foot = candidate[:, 18:24] 47 | 48 | faces = candidate[:, 24:92] 49 | 50 | hands = candidate[:, 92:113] 51 | hands = np.vstack([hands, candidate[:, 113:]]) 52 | 53 | faces_score = score[:, 24:92] 54 | hands_score = np.vstack([score[:, 92:113], score[:, 113:]]) 55 | 56 | bodies = dict(candidate=body, subset=subset, score=score[:, :18]) 57 | pose = dict(bodies=bodies, hands=hands, hands_score=hands_score, faces=faces, faces_score=faces_score) 58 | 59 | return pose 60 | 61 | 62 | dwpose_detector = DWposeDetector( 63 | model_det=os.path.join(os.environ["dwpose"],"yolox_l.onnx"), 64 | model_pose=os.path.join(os.environ["dwpose"],"dw-ll_ucoco_384.onnx"), 65 | device=device) 66 | -------------------------------------------------------------------------------- /mimicmotion/dwpose/onnxdet.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | 4 | 5 | def nms(boxes, scores, nms_thr): 6 | """Single class NMS implemented in Numpy. 7 | 8 | Args: 9 | boxes (np.ndarray): shape=(N,4); N is number of boxes 10 | scores (np.ndarray): the score of bboxes 11 | nms_thr (float): the threshold in NMS 12 | 13 | Returns: 14 | List[int]: output bbox ids 15 | """ 16 | x1 = boxes[:, 0] 17 | y1 = boxes[:, 1] 18 | x2 = boxes[:, 2] 19 | y2 = boxes[:, 3] 20 | 21 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 22 | order = scores.argsort()[::-1] 23 | 24 | keep = [] 25 | while order.size > 0: 26 | i = order[0] 27 | keep.append(i) 28 | xx1 = np.maximum(x1[i], x1[order[1:]]) 29 | yy1 = np.maximum(y1[i], y1[order[1:]]) 30 | xx2 = np.minimum(x2[i], x2[order[1:]]) 31 | yy2 = np.minimum(y2[i], y2[order[1:]]) 32 | 33 | w = np.maximum(0.0, xx2 - xx1 + 1) 34 | h = np.maximum(0.0, yy2 - yy1 + 1) 35 | inter = w * h 36 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 37 | 38 | inds = np.where(ovr <= nms_thr)[0] 39 | order = order[inds + 1] 40 | 41 | return keep 42 | 43 | def multiclass_nms(boxes, scores, nms_thr, score_thr): 44 | """Multiclass NMS implemented in Numpy. Class-aware version. 
45 | 46 | Args: 47 | boxes (np.ndarray): shape=(N,4); N is number of boxes 48 | scores (np.ndarray): the score of bboxes 49 | nms_thr (float): the threshold in NMS 50 | score_thr (float): the threshold of cls score 51 | 52 | Returns: 53 | np.ndarray: outputs bboxes coordinate 54 | """ 55 | final_dets = [] 56 | num_classes = scores.shape[1] 57 | for cls_ind in range(num_classes): 58 | cls_scores = scores[:, cls_ind] 59 | valid_score_mask = cls_scores > score_thr 60 | if valid_score_mask.sum() == 0: 61 | continue 62 | else: 63 | valid_scores = cls_scores[valid_score_mask] 64 | valid_boxes = boxes[valid_score_mask] 65 | keep = nms(valid_boxes, valid_scores, nms_thr) 66 | if len(keep) > 0: 67 | cls_inds = np.ones((len(keep), 1)) * cls_ind 68 | dets = np.concatenate( 69 | [valid_boxes[keep], valid_scores[keep, None], cls_inds], 1 70 | ) 71 | final_dets.append(dets) 72 | if len(final_dets) == 0: 73 | return None 74 | return np.concatenate(final_dets, 0) 75 | 76 | def demo_postprocess(outputs, img_size, p6=False): 77 | grids = [] 78 | expanded_strides = [] 79 | strides = [8, 16, 32] if not p6 else [8, 16, 32, 64] 80 | 81 | hsizes = [img_size[0] // stride for stride in strides] 82 | wsizes = [img_size[1] // stride for stride in strides] 83 | 84 | for hsize, wsize, stride in zip(hsizes, wsizes, strides): 85 | xv, yv = np.meshgrid(np.arange(wsize), np.arange(hsize)) 86 | grid = np.stack((xv, yv), 2).reshape(1, -1, 2) 87 | grids.append(grid) 88 | shape = grid.shape[:2] 89 | expanded_strides.append(np.full((*shape, 1), stride)) 90 | 91 | grids = np.concatenate(grids, 1) 92 | expanded_strides = np.concatenate(expanded_strides, 1) 93 | outputs[..., :2] = (outputs[..., :2] + grids) * expanded_strides 94 | outputs[..., 2:4] = np.exp(outputs[..., 2:4]) * expanded_strides 95 | 96 | return outputs 97 | 98 | def preprocess(img, input_size, swap=(2, 0, 1)): 99 | if len(img.shape) == 3: 100 | padded_img = np.ones((input_size[0], input_size[1], 3), dtype=np.uint8) * 114 101 | else: 102 | padded_img = np.ones(input_size, dtype=np.uint8) * 114 103 | 104 | r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1]) 105 | resized_img = cv2.resize( 106 | img, 107 | (int(img.shape[1] * r), int(img.shape[0] * r)), 108 | interpolation=cv2.INTER_LINEAR, 109 | ).astype(np.uint8) 110 | padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img 111 | 112 | padded_img = padded_img.transpose(swap) 113 | padded_img = np.ascontiguousarray(padded_img, dtype=np.float32) 114 | return padded_img, r 115 | 116 | def inference_detector(session, oriImg): 117 | """run human detect 118 | """ 119 | input_shape = (640,640) 120 | img, ratio = preprocess(oriImg, input_shape) 121 | 122 | ort_inputs = {session.get_inputs()[0].name: img[None, :, :, :]} 123 | output = session.run(None, ort_inputs) 124 | predictions = demo_postprocess(output[0], input_shape)[0] 125 | 126 | boxes = predictions[:, :4] 127 | scores = predictions[:, 4:5] * predictions[:, 5:] 128 | 129 | boxes_xyxy = np.ones_like(boxes) 130 | boxes_xyxy[:, 0] = boxes[:, 0] - boxes[:, 2]/2. 131 | boxes_xyxy[:, 1] = boxes[:, 1] - boxes[:, 3]/2. 132 | boxes_xyxy[:, 2] = boxes[:, 0] + boxes[:, 2]/2. 133 | boxes_xyxy[:, 3] = boxes[:, 1] + boxes[:, 3]/2. 
134 | boxes_xyxy /= ratio 135 | dets = multiclass_nms(boxes_xyxy, scores, nms_thr=0.45, score_thr=0.1) 136 | if dets is not None: 137 | final_boxes, final_scores, final_cls_inds = dets[:, :4], dets[:, 4], dets[:, 5] 138 | isscore = final_scores>0.3 139 | iscat = final_cls_inds == 0 140 | isbbox = [ i and j for (i, j) in zip(isscore, iscat)] 141 | final_boxes = final_boxes[isbbox] 142 | else: 143 | final_boxes = np.array([]) 144 | 145 | return final_boxes 146 | -------------------------------------------------------------------------------- /mimicmotion/dwpose/onnxpose.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | import cv2 4 | import numpy as np 5 | import onnxruntime as ort 6 | 7 | def preprocess( 8 | img: np.ndarray, out_bbox, input_size: Tuple[int, int] = (192, 256) 9 | ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: 10 | """Do preprocessing for RTMPose model inference. 11 | 12 | Args: 13 | img (np.ndarray): Input image in shape. 14 | input_size (tuple): Input image size in shape (w, h). 15 | 16 | Returns: 17 | tuple: 18 | - resized_img (np.ndarray): Preprocessed image. 19 | - center (np.ndarray): Center of image. 20 | - scale (np.ndarray): Scale of image. 21 | """ 22 | # get shape of image 23 | img_shape = img.shape[:2] 24 | out_img, out_center, out_scale = [], [], [] 25 | if len(out_bbox) == 0: 26 | out_bbox = [[0, 0, img_shape[1], img_shape[0]]] 27 | for i in range(len(out_bbox)): 28 | x0 = out_bbox[i][0] 29 | y0 = out_bbox[i][1] 30 | x1 = out_bbox[i][2] 31 | y1 = out_bbox[i][3] 32 | bbox = np.array([x0, y0, x1, y1]) 33 | 34 | # get center and scale 35 | center, scale = bbox_xyxy2cs(bbox, padding=1.25) 36 | 37 | # do affine transformation 38 | resized_img, scale = top_down_affine(input_size, scale, center, img) 39 | 40 | # normalize image 41 | mean = np.array([123.675, 116.28, 103.53]) 42 | std = np.array([58.395, 57.12, 57.375]) 43 | resized_img = (resized_img - mean) / std 44 | 45 | out_img.append(resized_img) 46 | out_center.append(center) 47 | out_scale.append(scale) 48 | 49 | return out_img, out_center, out_scale 50 | 51 | 52 | def inference(sess: ort.InferenceSession, img: np.ndarray) -> np.ndarray: 53 | """Inference RTMPose model. 54 | 55 | Args: 56 | sess (ort.InferenceSession): ONNXRuntime session. 57 | img (np.ndarray): Input image in shape. 58 | 59 | Returns: 60 | outputs (np.ndarray): Output of RTMPose model. 61 | """ 62 | all_out = [] 63 | # build input 64 | for i in range(len(img)): 65 | input = [img[i].transpose(2, 0, 1)] 66 | 67 | # build output 68 | sess_input = {sess.get_inputs()[0].name: input} 69 | sess_output = [] 70 | for out in sess.get_outputs(): 71 | sess_output.append(out.name) 72 | 73 | # run model 74 | outputs = sess.run(sess_output, sess_input) 75 | all_out.append(outputs) 76 | 77 | return all_out 78 | 79 | 80 | def postprocess(outputs: List[np.ndarray], 81 | model_input_size: Tuple[int, int], 82 | center: Tuple[int, int], 83 | scale: Tuple[int, int], 84 | simcc_split_ratio: float = 2.0 85 | ) -> Tuple[np.ndarray, np.ndarray]: 86 | """Postprocess for RTMPose model output. 87 | 88 | Args: 89 | outputs (np.ndarray): Output of RTMPose model. 90 | model_input_size (tuple): RTMPose model Input image size. 91 | center (tuple): Center of bbox in shape (x, y). 92 | scale (tuple): Scale of bbox in shape (w, h). 93 | simcc_split_ratio (float): Split ratio of simcc. 94 | 95 | Returns: 96 | tuple: 97 | - keypoints (np.ndarray): Rescaled keypoints. 
98 | - scores (np.ndarray): Model predict scores. 99 | """ 100 | all_key = [] 101 | all_score = [] 102 | for i in range(len(outputs)): 103 | # use simcc to decode 104 | simcc_x, simcc_y = outputs[i] 105 | keypoints, scores = decode(simcc_x, simcc_y, simcc_split_ratio) 106 | 107 | # rescale keypoints 108 | keypoints = keypoints / model_input_size * scale[i] + center[i] - scale[i] / 2 109 | all_key.append(keypoints[0]) 110 | all_score.append(scores[0]) 111 | 112 | return np.array(all_key), np.array(all_score) 113 | 114 | 115 | def bbox_xyxy2cs(bbox: np.ndarray, 116 | padding: float = 1.) -> Tuple[np.ndarray, np.ndarray]: 117 | """Transform the bbox format from (x,y,w,h) into (center, scale) 118 | 119 | Args: 120 | bbox (ndarray): Bounding box(es) in shape (4,) or (n, 4), formatted 121 | as (left, top, right, bottom) 122 | padding (float): BBox padding factor that will be multilied to scale. 123 | Default: 1.0 124 | 125 | Returns: 126 | tuple: A tuple containing center and scale. 127 | - np.ndarray[float32]: Center (x, y) of the bbox in shape (2,) or 128 | (n, 2) 129 | - np.ndarray[float32]: Scale (w, h) of the bbox in shape (2,) or 130 | (n, 2) 131 | """ 132 | # convert single bbox from (4, ) to (1, 4) 133 | dim = bbox.ndim 134 | if dim == 1: 135 | bbox = bbox[None, :] 136 | 137 | # get bbox center and scale 138 | x1, y1, x2, y2 = np.hsplit(bbox, [1, 2, 3]) 139 | center = np.hstack([x1 + x2, y1 + y2]) * 0.5 140 | scale = np.hstack([x2 - x1, y2 - y1]) * padding 141 | 142 | if dim == 1: 143 | center = center[0] 144 | scale = scale[0] 145 | 146 | return center, scale 147 | 148 | 149 | def _fix_aspect_ratio(bbox_scale: np.ndarray, 150 | aspect_ratio: float) -> np.ndarray: 151 | """Extend the scale to match the given aspect ratio. 152 | 153 | Args: 154 | scale (np.ndarray): The image scale (w, h) in shape (2, ) 155 | aspect_ratio (float): The ratio of ``w/h`` 156 | 157 | Returns: 158 | np.ndarray: The reshaped image scale in (2, ) 159 | """ 160 | w, h = np.hsplit(bbox_scale, [1]) 161 | bbox_scale = np.where(w > h * aspect_ratio, 162 | np.hstack([w, w / aspect_ratio]), 163 | np.hstack([h * aspect_ratio, h])) 164 | return bbox_scale 165 | 166 | 167 | def _rotate_point(pt: np.ndarray, angle_rad: float) -> np.ndarray: 168 | """Rotate a point by an angle. 169 | 170 | Args: 171 | pt (np.ndarray): 2D point coordinates (x, y) in shape (2, ) 172 | angle_rad (float): rotation angle in radian 173 | 174 | Returns: 175 | np.ndarray: Rotated point in shape (2, ) 176 | """ 177 | sn, cs = np.sin(angle_rad), np.cos(angle_rad) 178 | rot_mat = np.array([[cs, -sn], [sn, cs]]) 179 | return rot_mat @ pt 180 | 181 | 182 | def _get_3rd_point(a: np.ndarray, b: np.ndarray) -> np.ndarray: 183 | """To calculate the affine matrix, three pairs of points are required. This 184 | function is used to get the 3rd point, given 2D points a & b. 185 | 186 | The 3rd point is defined by rotating vector `a - b` by 90 degrees 187 | anticlockwise, using b as the rotation center. 188 | 189 | Args: 190 | a (np.ndarray): The 1st point (x,y) in shape (2, ) 191 | b (np.ndarray): The 2nd point (x,y) in shape (2, ) 192 | 193 | Returns: 194 | np.ndarray: The 3rd point. 
195 | """ 196 | direction = a - b 197 | c = b + np.r_[-direction[1], direction[0]] 198 | return c 199 | 200 | 201 | def get_warp_matrix(center: np.ndarray, 202 | scale: np.ndarray, 203 | rot: float, 204 | output_size: Tuple[int, int], 205 | shift: Tuple[float, float] = (0., 0.), 206 | inv: bool = False) -> np.ndarray: 207 | """Calculate the affine transformation matrix that can warp the bbox area 208 | in the input image to the output size. 209 | 210 | Args: 211 | center (np.ndarray[2, ]): Center of the bounding box (x, y). 212 | scale (np.ndarray[2, ]): Scale of the bounding box 213 | wrt [width, height]. 214 | rot (float): Rotation angle (degree). 215 | output_size (np.ndarray[2, ] | list(2,)): Size of the 216 | destination heatmaps. 217 | shift (0-100%): Shift translation ratio wrt the width/height. 218 | Default (0., 0.). 219 | inv (bool): Option to inverse the affine transform direction. 220 | (inv=False: src->dst or inv=True: dst->src) 221 | 222 | Returns: 223 | np.ndarray: A 2x3 transformation matrix 224 | """ 225 | shift = np.array(shift) 226 | src_w = scale[0] 227 | dst_w = output_size[0] 228 | dst_h = output_size[1] 229 | 230 | # compute transformation matrix 231 | rot_rad = np.deg2rad(rot) 232 | src_dir = _rotate_point(np.array([0., src_w * -0.5]), rot_rad) 233 | dst_dir = np.array([0., dst_w * -0.5]) 234 | 235 | # get four corners of the src rectangle in the original image 236 | src = np.zeros((3, 2), dtype=np.float32) 237 | src[0, :] = center + scale * shift 238 | src[1, :] = center + src_dir + scale * shift 239 | src[2, :] = _get_3rd_point(src[0, :], src[1, :]) 240 | 241 | # get four corners of the dst rectangle in the input image 242 | dst = np.zeros((3, 2), dtype=np.float32) 243 | dst[0, :] = [dst_w * 0.5, dst_h * 0.5] 244 | dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir 245 | dst[2, :] = _get_3rd_point(dst[0, :], dst[1, :]) 246 | 247 | if inv: 248 | warp_mat = cv2.getAffineTransform(np.float32(dst), np.float32(src)) 249 | else: 250 | warp_mat = cv2.getAffineTransform(np.float32(src), np.float32(dst)) 251 | 252 | return warp_mat 253 | 254 | 255 | def top_down_affine(input_size: dict, bbox_scale: dict, bbox_center: dict, 256 | img: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: 257 | """Get the bbox image as the model input by affine transform. 258 | 259 | Args: 260 | input_size (dict): The input size of the model. 261 | bbox_scale (dict): The bbox scale of the img. 262 | bbox_center (dict): The bbox center of the img. 263 | img (np.ndarray): The original image. 264 | 265 | Returns: 266 | tuple: A tuple containing center and scale. 267 | - np.ndarray[float32]: img after affine transform. 268 | - np.ndarray[float32]: bbox scale after affine transform. 269 | """ 270 | w, h = input_size 271 | warp_size = (int(w), int(h)) 272 | 273 | # reshape bbox to fixed aspect ratio 274 | bbox_scale = _fix_aspect_ratio(bbox_scale, aspect_ratio=w / h) 275 | 276 | # get the affine matrix 277 | center = bbox_center 278 | scale = bbox_scale 279 | rot = 0 280 | warp_mat = get_warp_matrix(center, scale, rot, output_size=(w, h)) 281 | 282 | # do affine transform 283 | img = cv2.warpAffine(img, warp_mat, warp_size, flags=cv2.INTER_LINEAR) 284 | 285 | return img, bbox_scale 286 | 287 | 288 | def get_simcc_maximum(simcc_x: np.ndarray, 289 | simcc_y: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: 290 | """Get maximum response location and value from simcc representations. 
291 | 292 | Note: 293 | instance number: N 294 | num_keypoints: K 295 | heatmap height: H 296 | heatmap width: W 297 | 298 | Args: 299 | simcc_x (np.ndarray): x-axis SimCC in shape (K, Wx) or (N, K, Wx) 300 | simcc_y (np.ndarray): y-axis SimCC in shape (K, Wy) or (N, K, Wy) 301 | 302 | Returns: 303 | tuple: 304 | - locs (np.ndarray): locations of maximum heatmap responses in shape 305 | (K, 2) or (N, K, 2) 306 | - vals (np.ndarray): values of maximum heatmap responses in shape 307 | (K,) or (N, K) 308 | """ 309 | N, K, Wx = simcc_x.shape 310 | simcc_x = simcc_x.reshape(N * K, -1) 311 | simcc_y = simcc_y.reshape(N * K, -1) 312 | 313 | # get maximum value locations 314 | x_locs = np.argmax(simcc_x, axis=1) 315 | y_locs = np.argmax(simcc_y, axis=1) 316 | locs = np.stack((x_locs, y_locs), axis=-1).astype(np.float32) 317 | max_val_x = np.amax(simcc_x, axis=1) 318 | max_val_y = np.amax(simcc_y, axis=1) 319 | 320 | # get maximum value across x and y axis 321 | mask = max_val_x > max_val_y 322 | max_val_x[mask] = max_val_y[mask] 323 | vals = max_val_x 324 | locs[vals <= 0.] = -1 325 | 326 | # reshape 327 | locs = locs.reshape(N, K, 2) 328 | vals = vals.reshape(N, K) 329 | 330 | return locs, vals 331 | 332 | 333 | def decode(simcc_x: np.ndarray, simcc_y: np.ndarray, 334 | simcc_split_ratio) -> Tuple[np.ndarray, np.ndarray]: 335 | """Modulate simcc distribution with Gaussian. 336 | 337 | Args: 338 | simcc_x (np.ndarray[K, Wx]): model predicted simcc in x. 339 | simcc_y (np.ndarray[K, Wy]): model predicted simcc in y. 340 | simcc_split_ratio (int): The split ratio of simcc. 341 | 342 | Returns: 343 | tuple: A tuple containing center and scale. 344 | - np.ndarray[float32]: keypoints in shape (K, 2) or (n, K, 2) 345 | - np.ndarray[float32]: scores in shape (K,) or (n, K) 346 | """ 347 | keypoints, scores = get_simcc_maximum(simcc_x, simcc_y) 348 | keypoints /= simcc_split_ratio 349 | 350 | return keypoints, scores 351 | 352 | 353 | def inference_pose(session, out_bbox, oriImg): 354 | """run pose detect 355 | 356 | Args: 357 | session (ort.InferenceSession): ONNXRuntime session. 358 | out_bbox (np.ndarray): bbox list 359 | oriImg (np.ndarray): Input image in shape. 360 | 361 | Returns: 362 | tuple: 363 | - keypoints (np.ndarray): Rescaled keypoints. 364 | - scores (np.ndarray): Model predict scores. 365 | """ 366 | h, w = session.get_inputs()[0].shape[2:] 367 | model_input_size = (w, h) 368 | # preprocess for rtm-pose model inference. 369 | resized_img, center, scale = preprocess(oriImg, out_bbox, model_input_size) 370 | # run pose estimation for processed img 371 | outputs = inference(session, resized_img) 372 | # postprocess for rtm-pose model output. 373 | keypoints, scores = postprocess(outputs, model_input_size, center, scale) 374 | 375 | return keypoints, scores 376 | -------------------------------------------------------------------------------- /mimicmotion/dwpose/preprocess.py: -------------------------------------------------------------------------------- 1 | import decord 2 | import numpy as np 3 | from tqdm import tqdm 4 | from .util import draw_pose 5 | from .dwpose_detector import dwpose_detector as dwprocessor 6 | 7 | 8 | def get_video_pose( 9 | video_path: str, 10 | ref_image: np.ndarray, 11 | sample_stride: int=1): 12 | """preprocess ref image pose and video pose 13 | 14 | Args: 15 | video_path (str): video pose path 16 | ref_image (np.ndarray): reference image 17 | sample_stride (int, optional): Defaults to 1. 
18 | 19 | Returns: 20 | np.ndarray: sequence of video pose 21 | """ 22 | # select ref-keypoint from reference pose for pose rescale 23 | ref_pose = dwprocessor(ref_image) 24 | ref_keypoint_id = [0, 1, 2, 5, 8, 11, 14, 15, 16, 17] 25 | ref_keypoint_id = [i for i in ref_keypoint_id \ 26 | if ref_pose['bodies']['score'].shape[0] > 0 and ref_pose['bodies']['score'][0][i] > 0.3] 27 | ref_body = ref_pose['bodies']['candidate'][ref_keypoint_id] 28 | 29 | height, width, _ = ref_image.shape 30 | 31 | # read input video 32 | vr = decord.VideoReader(video_path, ctx=decord.cpu(0)) 33 | sample_stride *= max(1, int(vr.get_avg_fps() / 24)) 34 | 35 | detected_poses = [dwprocessor(frm) for frm in tqdm(vr.get_batch(list(range(0, len(vr), sample_stride))).asnumpy(),desc="detect video poses",total=len(range(0, len(vr), sample_stride)))] 36 | 37 | detected_bodies = np.stack( 38 | [p['bodies']['candidate'] for p in detected_poses if p['bodies']['candidate'].shape[0] == 18])[:, 39 | ref_keypoint_id] 40 | # compute linear-rescale params 41 | ay, by = np.polyfit(detected_bodies[:, :, 1].flatten(), np.tile(ref_body[:, 1], len(detected_bodies)), 1) 42 | fh, fw, _ = vr[0].shape 43 | ax = ay / (fh / fw / height * width) 44 | bx = np.mean(np.tile(ref_body[:, 0], len(detected_bodies)) - detected_bodies[:, :, 0].flatten() * ax) 45 | a = np.array([ax, ay]) 46 | b = np.array([bx, by]) 47 | output_pose = [] 48 | # pose rescale 49 | for detected_pose in detected_poses: 50 | detected_pose['bodies']['candidate'] = detected_pose['bodies']['candidate'] * a + b 51 | detected_pose['faces'] = detected_pose['faces'] * a + b 52 | detected_pose['hands'] = detected_pose['hands'] * a + b 53 | im = draw_pose(detected_pose, height, width) 54 | output_pose.append(np.array(im)) 55 | return np.stack(output_pose) 56 | 57 | 58 | def get_image_pose(ref_image): 59 | """process image pose 60 | 61 | Args: 62 | ref_image (np.ndarray): reference image pixel value 63 | 64 | Returns: 65 | np.ndarray: pose visual image in RGB-mode 66 | """ 67 | height, width, _ = ref_image.shape 68 | ref_pose = dwprocessor(ref_image) 69 | pose_img = draw_pose(ref_pose, height, width) 70 | return np.array(pose_img) 71 | -------------------------------------------------------------------------------- /mimicmotion/dwpose/util.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import matplotlib 4 | import cv2 5 | 6 | 7 | eps = 0.01 8 | 9 | def alpha_blend_color(color, alpha): 10 | """blend color according to point conf 11 | """ 12 | return [int(c * alpha) for c in color] 13 | 14 | def draw_bodypose(canvas, candidate, subset, score): 15 | H, W, C = canvas.shape 16 | candidate = np.array(candidate) 17 | subset = np.array(subset) 18 | 19 | stickwidth = 4 20 | 21 | limbSeq = [[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], [9, 10], \ 22 | [10, 11], [2, 12], [12, 13], [13, 14], [2, 1], [1, 15], [15, 17], \ 23 | [1, 16], [16, 18], [3, 17], [6, 18]] 24 | 25 | colors = [[255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], [85, 255, 0], [0, 255, 0], \ 26 | [0, 255, 85], [0, 255, 170], [0, 255, 255], [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], \ 27 | [170, 0, 255], [255, 0, 255], [255, 0, 170], [255, 0, 85]] 28 | 29 | for i in range(17): 30 | for n in range(len(subset)): 31 | index = subset[n][np.array(limbSeq[i]) - 1] 32 | conf = score[n][np.array(limbSeq[i]) - 1] 33 | if conf[0] < 0.3 or conf[1] < 0.3: 34 | continue 35 | Y = candidate[index.astype(int), 0] * float(W) 
36 | X = candidate[index.astype(int), 1] * float(H) 37 | mX = np.mean(X) 38 | mY = np.mean(Y) 39 | length = ((X[0] - X[1]) ** 2 + (Y[0] - Y[1]) ** 2) ** 0.5 40 | angle = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1])) 41 | polygon = cv2.ellipse2Poly((int(mY), int(mX)), (int(length / 2), stickwidth), int(angle), 0, 360, 1) 42 | cv2.fillConvexPoly(canvas, polygon, alpha_blend_color(colors[i], conf[0] * conf[1])) 43 | 44 | canvas = (canvas * 0.6).astype(np.uint8) 45 | 46 | for i in range(18): 47 | for n in range(len(subset)): 48 | index = int(subset[n][i]) 49 | if index == -1: 50 | continue 51 | x, y = candidate[index][0:2] 52 | conf = score[n][i] 53 | x = int(x * W) 54 | y = int(y * H) 55 | cv2.circle(canvas, (int(x), int(y)), 4, alpha_blend_color(colors[i], conf), thickness=-1) 56 | 57 | return canvas 58 | 59 | def draw_handpose(canvas, all_hand_peaks, all_hand_scores): 60 | H, W, C = canvas.shape 61 | 62 | edges = [[0, 1], [1, 2], [2, 3], [3, 4], [0, 5], [5, 6], [6, 7], [7, 8], [0, 9], [9, 10], \ 63 | [10, 11], [11, 12], [0, 13], [13, 14], [14, 15], [15, 16], [0, 17], [17, 18], [18, 19], [19, 20]] 64 | 65 | for peaks, scores in zip(all_hand_peaks, all_hand_scores): 66 | 67 | for ie, e in enumerate(edges): 68 | x1, y1 = peaks[e[0]] 69 | x2, y2 = peaks[e[1]] 70 | x1 = int(x1 * W) 71 | y1 = int(y1 * H) 72 | x2 = int(x2 * W) 73 | y2 = int(y2 * H) 74 | score = int(scores[e[0]] * scores[e[1]] * 255) 75 | if x1 > eps and y1 > eps and x2 > eps and y2 > eps: 76 | cv2.line(canvas, (x1, y1), (x2, y2), 77 | matplotlib.colors.hsv_to_rgb([ie / float(len(edges)), 1.0, 1.0]) * score, thickness=2) 78 | 79 | for i, keyponit in enumerate(peaks): 80 | x, y = keyponit 81 | x = int(x * W) 82 | y = int(y * H) 83 | score = int(scores[i] * 255) 84 | if x > eps and y > eps: 85 | cv2.circle(canvas, (x, y), 4, (0, 0, score), thickness=-1) 86 | return canvas 87 | 88 | def draw_facepose(canvas, all_lmks, all_scores): 89 | H, W, C = canvas.shape 90 | for lmks, scores in zip(all_lmks, all_scores): 91 | for lmk, score in zip(lmks, scores): 92 | x, y = lmk 93 | x = int(x * W) 94 | y = int(y * H) 95 | conf = int(score * 255) 96 | if x > eps and y > eps: 97 | cv2.circle(canvas, (x, y), 3, (conf, conf, conf), thickness=-1) 98 | return canvas 99 | 100 | def draw_pose(pose, H, W, ref_w=2160): 101 | """vis dwpose outputs 102 | 103 | Args: 104 | pose (List): DWposeDetector outputs in dwpose_detector.py 105 | H (int): height 106 | W (int): width 107 | ref_w (int, optional) Defaults to 2160. 
108 | 109 | Returns: 110 | np.ndarray: image pixel value in RGB mode 111 | """ 112 | bodies = pose['bodies'] 113 | faces = pose['faces'] 114 | hands = pose['hands'] 115 | candidate = bodies['candidate'] 116 | subset = bodies['subset'] 117 | 118 | sz = min(H, W) 119 | sr = (ref_w / sz) if sz != ref_w else 1 120 | 121 | ########################################## create zero canvas ################################################## 122 | canvas = np.zeros(shape=(int(H*sr), int(W*sr), 3), dtype=np.uint8) 123 | 124 | ########################################### draw body pose ##################################################### 125 | canvas = draw_bodypose(canvas, candidate, subset, score=bodies['score']) 126 | 127 | ########################################### draw hand pose ##################################################### 128 | canvas = draw_handpose(canvas, hands, pose['hands_score']) 129 | 130 | ########################################### draw face pose ##################################################### 131 | canvas = draw_facepose(canvas, faces, pose['faces_score']) 132 | 133 | return cv2.cvtColor(cv2.resize(canvas, (W, H)), cv2.COLOR_BGR2RGB).transpose(2, 0, 1) 134 | -------------------------------------------------------------------------------- /mimicmotion/dwpose/wholebody.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import onnxruntime as ort 3 | 4 | from .onnxdet import inference_detector 5 | from .onnxpose import inference_pose 6 | 7 | 8 | class Wholebody: 9 | """detect human pose by dwpose 10 | """ 11 | def __init__(self, model_det, model_pose, device="cpu"): 12 | providers = ['CPUExecutionProvider'] if device == 'cpu' else ['CUDAExecutionProvider'] 13 | provider_options = None if device == 'cpu' else [{'device_id': 0}] 14 | 15 | self.session_det = ort.InferenceSession( 16 | path_or_bytes=model_det, providers=providers, provider_options=provider_options 17 | ) 18 | self.session_pose = ort.InferenceSession( 19 | path_or_bytes=model_pose, providers=providers, provider_options=provider_options 20 | ) 21 | 22 | def __call__(self, oriImg): 23 | """call to process dwpose-detect 24 | 25 | Args: 26 | oriImg (np.ndarray): detected image 27 | 28 | """ 29 | det_result = inference_detector(self.session_det, oriImg) 30 | keypoints, scores = inference_pose(self.session_pose, det_result, oriImg) 31 | 32 | keypoints_info = np.concatenate( 33 | (keypoints, scores[..., None]), axis=-1) 34 | # compute neck joint 35 | neck = np.mean(keypoints_info[:, [5, 6]], axis=1) 36 | # neck score when visualizing pred 37 | neck[:, 2:4] = np.logical_and( 38 | keypoints_info[:, 5, 2:4] > 0.3, 39 | keypoints_info[:, 6, 2:4] > 0.3).astype(int) 40 | new_keypoints_info = np.insert( 41 | keypoints_info, 17, neck, axis=1) 42 | mmpose_idx = [ 43 | 17, 6, 8, 10, 7, 9, 12, 14, 16, 13, 15, 2, 1, 4, 3 44 | ] 45 | openpose_idx = [ 46 | 1, 2, 3, 4, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17 47 | ] 48 | new_keypoints_info[:, openpose_idx] = \ 49 | new_keypoints_info[:, mmpose_idx] 50 | keypoints_info = new_keypoints_info 51 | 52 | keypoints, scores = keypoints_info[ 53 | ..., :2], keypoints_info[..., 2] 54 | 55 | return keypoints, scores 56 | 57 | 58 | -------------------------------------------------------------------------------- /mimicmotion/modules/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AIFSH/ComfyUI-MimicMotion/0f376479219afe8431f634539359eb26b981d1e5/mimicmotion/modules/__init__.py -------------------------------------------------------------------------------- /mimicmotion/modules/attention.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Any, Dict, Optional 3 | 4 | import torch 5 | from diffusers.configuration_utils import ConfigMixin, register_to_config 6 | from diffusers.models.attention import BasicTransformerBlock, TemporalBasicTransformerBlock 7 | from diffusers.models.embeddings import TimestepEmbedding, Timesteps 8 | from diffusers.models.modeling_utils import ModelMixin 9 | from diffusers.models.resnet import AlphaBlender 10 | from diffusers.utils import BaseOutput 11 | from torch import nn 12 | 13 | 14 | @dataclass 15 | class TransformerTemporalModelOutput(BaseOutput): 16 | """ 17 | The output of [`TransformerTemporalModel`]. 18 | 19 | Args: 20 | sample (`torch.FloatTensor` of shape `(batch_size x num_frames, num_channels, height, width)`): 21 | The hidden states output conditioned on `encoder_hidden_states` input. 22 | """ 23 | 24 | sample: torch.FloatTensor 25 | 26 | 27 | class TransformerTemporalModel(ModelMixin, ConfigMixin): 28 | """ 29 | A Transformer model for video-like data. 30 | 31 | Parameters: 32 | num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention. 33 | attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head. 34 | in_channels (`int`, *optional*): 35 | The number of channels in the input and output (specify if the input is **continuous**). 36 | num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use. 37 | dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. 38 | cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use. 39 | attention_bias (`bool`, *optional*): 40 | Configure if the `TransformerBlock` attention should contain a bias parameter. 41 | sample_size (`int`, *optional*): The width of the latent images (specify if the input is **discrete**). 42 | This is fixed during training since it is used to learn a number of position embeddings. 43 | activation_fn (`str`, *optional*, defaults to `"geglu"`): 44 | Activation function to use in feed-forward. See `diffusers.models.activations.get_activation` for supported 45 | activation functions. 46 | norm_elementwise_affine (`bool`, *optional*): 47 | Configure if the `TransformerBlock` should use learnable elementwise affine parameters for normalization. 48 | double_self_attention (`bool`, *optional*): 49 | Configure if each `TransformerBlock` should contain two self-attention layers. 50 | positional_embeddings: (`str`, *optional*): 51 | The type of positional embeddings to apply to the sequence input before passing use. 52 | num_positional_embeddings: (`int`, *optional*): 53 | The maximum length of the sequence over which to apply positional embeddings. 
54 | """ 55 | 56 | @register_to_config 57 | def __init__( 58 | self, 59 | num_attention_heads: int = 16, 60 | attention_head_dim: int = 88, 61 | in_channels: Optional[int] = None, 62 | out_channels: Optional[int] = None, 63 | num_layers: int = 1, 64 | dropout: float = 0.0, 65 | norm_num_groups: int = 32, 66 | cross_attention_dim: Optional[int] = None, 67 | attention_bias: bool = False, 68 | sample_size: Optional[int] = None, 69 | activation_fn: str = "geglu", 70 | norm_elementwise_affine: bool = True, 71 | double_self_attention: bool = True, 72 | positional_embeddings: Optional[str] = None, 73 | num_positional_embeddings: Optional[int] = None, 74 | ): 75 | super().__init__() 76 | self.num_attention_heads = num_attention_heads 77 | self.attention_head_dim = attention_head_dim 78 | inner_dim = num_attention_heads * attention_head_dim 79 | 80 | self.in_channels = in_channels 81 | 82 | self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True) 83 | self.proj_in = nn.Linear(in_channels, inner_dim) 84 | 85 | # 3. Define transformers blocks 86 | self.transformer_blocks = nn.ModuleList( 87 | [ 88 | BasicTransformerBlock( 89 | inner_dim, 90 | num_attention_heads, 91 | attention_head_dim, 92 | dropout=dropout, 93 | cross_attention_dim=cross_attention_dim, 94 | activation_fn=activation_fn, 95 | attention_bias=attention_bias, 96 | double_self_attention=double_self_attention, 97 | norm_elementwise_affine=norm_elementwise_affine, 98 | positional_embeddings=positional_embeddings, 99 | num_positional_embeddings=num_positional_embeddings, 100 | ) 101 | for d in range(num_layers) 102 | ] 103 | ) 104 | 105 | self.proj_out = nn.Linear(inner_dim, in_channels) 106 | 107 | def forward( 108 | self, 109 | hidden_states: torch.FloatTensor, 110 | encoder_hidden_states: Optional[torch.LongTensor] = None, 111 | timestep: Optional[torch.LongTensor] = None, 112 | class_labels: torch.LongTensor = None, 113 | num_frames: int = 1, 114 | cross_attention_kwargs: Optional[Dict[str, Any]] = None, 115 | return_dict: bool = True, 116 | ) -> TransformerTemporalModelOutput: 117 | """ 118 | The [`TransformerTemporal`] forward method. 119 | 120 | Args: 121 | hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, 122 | `torch.FloatTensor` of shape `(batch size, channel, height, width)`if continuous): Input hidden_states. 123 | encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*): 124 | Conditional embeddings for cross attention layer. If not given, cross-attention defaults to 125 | self-attention. 126 | timestep ( `torch.LongTensor`, *optional*): 127 | Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`. 128 | class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*): 129 | Used to indicate class labels conditioning. Optional class labels to be applied as an embedding in 130 | `AdaLayerZeroNorm`. 131 | num_frames (`int`, *optional*, defaults to 1): 132 | The number of frames to be processed per batch. This is used to reshape the hidden states. 133 | cross_attention_kwargs (`dict`, *optional*): 134 | A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under 135 | `self.processor` in [diffusers.models.attention_processor]( 136 | https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). 
137 | return_dict (`bool`, *optional*, defaults to `True`): 138 | Whether or not to return a [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain 139 | tuple. 140 | 141 | Returns: 142 | [`~models.transformer_temporal.TransformerTemporalModelOutput`] or `tuple`: 143 | If `return_dict` is True, an [`~models.transformer_temporal.TransformerTemporalModelOutput`] is 144 | returned, otherwise a `tuple` where the first element is the sample tensor. 145 | """ 146 | # 1. Input 147 | batch_frames, channel, height, width = hidden_states.shape 148 | batch_size = batch_frames // num_frames 149 | 150 | residual = hidden_states 151 | 152 | hidden_states = hidden_states[None, :].reshape(batch_size, num_frames, channel, height, width) 153 | hidden_states = hidden_states.permute(0, 2, 1, 3, 4) 154 | 155 | hidden_states = self.norm(hidden_states) 156 | hidden_states = hidden_states.permute(0, 3, 4, 2, 1).reshape(batch_size * height * width, num_frames, channel) 157 | 158 | hidden_states = self.proj_in(hidden_states) 159 | 160 | # 2. Blocks 161 | for block in self.transformer_blocks: 162 | hidden_states = block( 163 | hidden_states, 164 | encoder_hidden_states=encoder_hidden_states, 165 | timestep=timestep, 166 | cross_attention_kwargs=cross_attention_kwargs, 167 | class_labels=class_labels, 168 | ) 169 | 170 | # 3. Output 171 | hidden_states = self.proj_out(hidden_states) 172 | hidden_states = ( 173 | hidden_states[None, None, :] 174 | .reshape(batch_size, height, width, num_frames, channel) 175 | .permute(0, 3, 4, 1, 2) 176 | .contiguous() 177 | ) 178 | hidden_states = hidden_states.reshape(batch_frames, channel, height, width) 179 | 180 | output = hidden_states + residual 181 | 182 | if not return_dict: 183 | return (output,) 184 | 185 | return TransformerTemporalModelOutput(sample=output) 186 | 187 | 188 | class TransformerSpatioTemporalModel(nn.Module): 189 | """ 190 | A Transformer model for video-like data. 191 | 192 | Parameters: 193 | num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention. 194 | attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head. 195 | in_channels (`int`, *optional*): 196 | The number of channels in the input and output (specify if the input is **continuous**). 197 | out_channels (`int`, *optional*): 198 | The number of channels in the output (specify if the input is **continuous**). 199 | num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use. 200 | cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use. 201 | """ 202 | 203 | def __init__( 204 | self, 205 | num_attention_heads: int = 16, 206 | attention_head_dim: int = 88, 207 | in_channels: int = 320, 208 | out_channels: Optional[int] = None, 209 | num_layers: int = 1, 210 | cross_attention_dim: Optional[int] = None, 211 | ): 212 | super().__init__() 213 | self.num_attention_heads = num_attention_heads 214 | self.attention_head_dim = attention_head_dim 215 | 216 | inner_dim = num_attention_heads * attention_head_dim 217 | self.inner_dim = inner_dim 218 | 219 | # 2. Define input layers 220 | self.in_channels = in_channels 221 | self.norm = torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6) 222 | self.proj_in = nn.Linear(in_channels, inner_dim) 223 | 224 | # 3. 
Define transformers blocks 225 | self.transformer_blocks = nn.ModuleList( 226 | [ 227 | BasicTransformerBlock( 228 | inner_dim, 229 | num_attention_heads, 230 | attention_head_dim, 231 | cross_attention_dim=cross_attention_dim, 232 | ) 233 | for d in range(num_layers) 234 | ] 235 | ) 236 | 237 | time_mix_inner_dim = inner_dim 238 | self.temporal_transformer_blocks = nn.ModuleList( 239 | [ 240 | TemporalBasicTransformerBlock( 241 | inner_dim, 242 | time_mix_inner_dim, 243 | num_attention_heads, 244 | attention_head_dim, 245 | cross_attention_dim=cross_attention_dim, 246 | ) 247 | for _ in range(num_layers) 248 | ] 249 | ) 250 | 251 | time_embed_dim = in_channels * 4 252 | self.time_pos_embed = TimestepEmbedding(in_channels, time_embed_dim, out_dim=in_channels) 253 | self.time_proj = Timesteps(in_channels, True, 0) 254 | self.time_mixer = AlphaBlender(alpha=0.5, merge_strategy="learned_with_images") 255 | 256 | # 4. Define output layers 257 | self.out_channels = in_channels if out_channels is None else out_channels 258 | # TODO: should use out_channels for continuous projections 259 | self.proj_out = nn.Linear(inner_dim, in_channels) 260 | 261 | self.gradient_checkpointing = False 262 | 263 | def forward( 264 | self, 265 | hidden_states: torch.Tensor, 266 | encoder_hidden_states: Optional[torch.Tensor] = None, 267 | image_only_indicator: Optional[torch.Tensor] = None, 268 | return_dict: bool = True, 269 | ): 270 | """ 271 | Args: 272 | hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`): 273 | Input hidden_states. 274 | num_frames (`int`): 275 | The number of frames to be processed per batch. This is used to reshape the hidden states. 276 | encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*): 277 | Conditional embeddings for cross attention layer. If not given, cross-attention defaults to 278 | self-attention. 279 | image_only_indicator (`torch.LongTensor` of shape `(batch size, num_frames)`, *optional*): 280 | A tensor indicating whether the input contains only images. 1 indicates that the input contains only 281 | images, 0 indicates that the input contains video frames. 282 | return_dict (`bool`, *optional*, defaults to `True`): 283 | Whether or not to return a [`~models.transformer_temporal.TransformerTemporalModelOutput`] 284 | instead of a plain tuple. 285 | 286 | Returns: 287 | [`~models.transformer_temporal.TransformerTemporalModelOutput`] or `tuple`: 288 | If `return_dict` is True, an [`~models.transformer_temporal.TransformerTemporalModelOutput`] is 289 | returned, otherwise a `tuple` where the first element is the sample tensor. 290 | """ 291 | # 1. 
Input 292 | batch_frames, _, height, width = hidden_states.shape 293 | num_frames = image_only_indicator.shape[-1] 294 | batch_size = batch_frames // num_frames 295 | 296 | time_context = encoder_hidden_states 297 | time_context_first_timestep = time_context[None, :].reshape( 298 | batch_size, num_frames, -1, time_context.shape[-1] 299 | )[:, 0] 300 | time_context = time_context_first_timestep[None, :].broadcast_to( 301 | height * width, batch_size, 1, time_context.shape[-1] 302 | ) 303 | time_context = time_context.reshape(height * width * batch_size, 1, time_context.shape[-1]) 304 | 305 | residual = hidden_states 306 | 307 | hidden_states = self.norm(hidden_states) 308 | inner_dim = hidden_states.shape[1] 309 | hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch_frames, height * width, inner_dim) 310 | hidden_states = torch.utils.checkpoint.checkpoint(self.proj_in, hidden_states) 311 | 312 | num_frames_emb = torch.arange(num_frames, device=hidden_states.device) 313 | num_frames_emb = num_frames_emb.repeat(batch_size, 1) 314 | num_frames_emb = num_frames_emb.reshape(-1) 315 | t_emb = self.time_proj(num_frames_emb) 316 | 317 | # `Timesteps` does not contain any weights and will always return f32 tensors 318 | # but time_embedding might actually be running in fp16. so we need to cast here. 319 | # there might be better ways to encapsulate this. 320 | t_emb = t_emb.to(dtype=hidden_states.dtype) 321 | 322 | emb = self.time_pos_embed(t_emb) 323 | emb = emb[:, None, :] 324 | 325 | # 2. Blocks 326 | for block, temporal_block in zip(self.transformer_blocks, self.temporal_transformer_blocks): 327 | if self.gradient_checkpointing: 328 | hidden_states = torch.utils.checkpoint.checkpoint( 329 | block, 330 | hidden_states, 331 | None, 332 | encoder_hidden_states, 333 | None, 334 | use_reentrant=False, 335 | ) 336 | else: 337 | hidden_states = block( 338 | hidden_states, 339 | encoder_hidden_states=encoder_hidden_states, 340 | ) 341 | 342 | hidden_states_mix = hidden_states 343 | hidden_states_mix = hidden_states_mix + emb 344 | 345 | if self.gradient_checkpointing: 346 | hidden_states_mix = torch.utils.checkpoint.checkpoint( 347 | temporal_block, 348 | hidden_states_mix, 349 | num_frames, 350 | time_context, 351 | ) 352 | hidden_states = self.time_mixer( 353 | x_spatial=hidden_states, 354 | x_temporal=hidden_states_mix, 355 | image_only_indicator=image_only_indicator, 356 | ) 357 | else: 358 | hidden_states_mix = temporal_block( 359 | hidden_states_mix, 360 | num_frames=num_frames, 361 | encoder_hidden_states=time_context, 362 | ) 363 | hidden_states = self.time_mixer( 364 | x_spatial=hidden_states, 365 | x_temporal=hidden_states_mix, 366 | image_only_indicator=image_only_indicator, 367 | ) 368 | 369 | # 3. 
Output 370 | hidden_states = torch.utils.checkpoint.checkpoint(self.proj_out, hidden_states) 371 | hidden_states = hidden_states.reshape(batch_frames, height, width, inner_dim).permute(0, 3, 1, 2).contiguous() 372 | 373 | output = hidden_states + residual 374 | 375 | if not return_dict: 376 | return (output,) 377 | 378 | return TransformerTemporalModelOutput(sample=output) 379 | -------------------------------------------------------------------------------- /mimicmotion/modules/pose_net.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import einops 4 | import numpy as np 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.init as init 8 | 9 | 10 | class PoseNet(nn.Module): 11 | """a tiny conv network for introducing pose sequence as the condition 12 | """ 13 | def __init__(self, noise_latent_channels=320, *args, **kwargs): 14 | super().__init__(*args, **kwargs) 15 | # multiple convolution layers 16 | self.conv_layers = nn.Sequential( 17 | nn.Conv2d(in_channels=3, out_channels=3, kernel_size=3, padding=1), 18 | nn.SiLU(), 19 | nn.Conv2d(in_channels=3, out_channels=16, kernel_size=4, stride=2, padding=1), 20 | nn.SiLU(), 21 | 22 | nn.Conv2d(in_channels=16, out_channels=16, kernel_size=3, padding=1), 23 | nn.SiLU(), 24 | nn.Conv2d(in_channels=16, out_channels=32, kernel_size=4, stride=2, padding=1), 25 | nn.SiLU(), 26 | 27 | nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, padding=1), 28 | nn.SiLU(), 29 | nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2, padding=1), 30 | nn.SiLU(), 31 | 32 | nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, padding=1), 33 | nn.SiLU(), 34 | nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1), 35 | nn.SiLU() 36 | ) 37 | 38 | # Final projection layer 39 | self.final_proj = nn.Conv2d(in_channels=128, out_channels=noise_latent_channels, kernel_size=1) 40 | 41 | # Initialize layers 42 | self._initialize_weights() 43 | 44 | self.scale = nn.Parameter(torch.ones(1) * 2) 45 | 46 | def _initialize_weights(self): 47 | """Initialize weights with He. initialization and zero out the biases 48 | """ 49 | for m in self.conv_layers: 50 | if isinstance(m, nn.Conv2d): 51 | n = m.kernel_size[0] * m.kernel_size[1] * m.in_channels 52 | init.normal_(m.weight, mean=0.0, std=np.sqrt(2. 
/ n)) 53 | if m.bias is not None: 54 | init.zeros_(m.bias) 55 | init.zeros_(self.final_proj.weight) 56 | if self.final_proj.bias is not None: 57 | init.zeros_(self.final_proj.bias) 58 | 59 | def forward(self, x): 60 | if x.ndim == 5: 61 | x = einops.rearrange(x, "b f c h w -> (b f) c h w") 62 | x = self.conv_layers(x) 63 | x = self.final_proj(x) 64 | 65 | return x * self.scale 66 | 67 | @classmethod 68 | def from_pretrained(cls, pretrained_model_path): 69 | """load pretrained pose-net weights 70 | """ 71 | if not Path(pretrained_model_path).exists(): 72 | print(f"There is no model file in {pretrained_model_path}") 73 | print(f"loaded PoseNet's pretrained weights from {pretrained_model_path}.") 74 | 75 | state_dict = torch.load(pretrained_model_path, map_location="cpu") 76 | model = PoseNet(noise_latent_channels=320) 77 | 78 | model.load_state_dict(state_dict, strict=True) 79 | 80 | return model 81 | -------------------------------------------------------------------------------- /mimicmotion/modules/unet.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Dict, Optional, Tuple, Union 3 | 4 | import torch 5 | import torch.nn as nn 6 | from diffusers.configuration_utils import ConfigMixin, register_to_config 7 | from diffusers.loaders import UNet2DConditionLoadersMixin 8 | from diffusers.models.attention_processor import CROSS_ATTENTION_PROCESSORS, AttentionProcessor, AttnProcessor 9 | from diffusers.models.embeddings import TimestepEmbedding, Timesteps 10 | from diffusers.models.modeling_utils import ModelMixin 11 | from diffusers.utils import BaseOutput, logging 12 | 13 | from diffusers.models.unets.unet_3d_blocks import get_down_block, get_up_block, UNetMidBlockSpatioTemporal 14 | 15 | logger = logging.get_logger(__name__) # pylint: disable=invalid-name 16 | 17 | 18 | @dataclass 19 | class UNetSpatioTemporalConditionOutput(BaseOutput): 20 | """ 21 | The output of [`UNetSpatioTemporalConditionModel`]. 22 | 23 | Args: 24 | sample (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`): 25 | The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model. 26 | """ 27 | 28 | sample: torch.FloatTensor = None 29 | 30 | 31 | class UNetSpatioTemporalConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin): 32 | r""" 33 | A conditional Spatio-Temporal UNet model that takes a noisy video frames, conditional state, 34 | and a timestep and returns a sample shaped output. 35 | 36 | This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented 37 | for all models (such as downloading or saving). 38 | 39 | Parameters: 40 | sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`): 41 | Height and width of input/output sample. 42 | in_channels (`int`, *optional*, defaults to 8): Number of channels in the input sample. 43 | out_channels (`int`, *optional*, defaults to 4): Number of channels in the output. 44 | down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlockSpatioTemporal", 45 | "CrossAttnDownBlockSpatioTemporal", "CrossAttnDownBlockSpatioTemporal", "DownBlockSpatioTemporal")`): 46 | The tuple of downsample blocks to use. 
47 | up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlockSpatioTemporal", 48 | "CrossAttnUpBlockSpatioTemporal", "CrossAttnUpBlockSpatioTemporal", "CrossAttnUpBlockSpatioTemporal")`): 49 | The tuple of upsample blocks to use. 50 | block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`): 51 | The tuple of output channels for each block. 52 | addition_time_embed_dim: (`int`, defaults to 256): 53 | Dimension to to encode the additional time ids. 54 | projection_class_embeddings_input_dim (`int`, defaults to 768): 55 | The dimension of the projection of encoded `added_time_ids`. 56 | layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block. 57 | cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280): 58 | The dimension of the cross attention features. 59 | transformer_layers_per_block (`int`, `Tuple[int]`, or `Tuple[Tuple]` , *optional*, defaults to 1): 60 | The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for 61 | [`~models.unet_3d_blocks.CrossAttnDownBlockSpatioTemporal`], 62 | [`~models.unet_3d_blocks.CrossAttnUpBlockSpatioTemporal`], 63 | [`~models.unet_3d_blocks.UNetMidBlockSpatioTemporal`]. 64 | num_attention_heads (`int`, `Tuple[int]`, defaults to `(5, 10, 10, 20)`): 65 | The number of attention heads. 66 | dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. 67 | """ 68 | 69 | _supports_gradient_checkpointing = True 70 | 71 | @register_to_config 72 | def __init__( 73 | self, 74 | sample_size: Optional[int] = None, 75 | in_channels: int = 8, 76 | out_channels: int = 4, 77 | down_block_types: Tuple[str] = ( 78 | "CrossAttnDownBlockSpatioTemporal", 79 | "CrossAttnDownBlockSpatioTemporal", 80 | "CrossAttnDownBlockSpatioTemporal", 81 | "DownBlockSpatioTemporal", 82 | ), 83 | up_block_types: Tuple[str] = ( 84 | "UpBlockSpatioTemporal", 85 | "CrossAttnUpBlockSpatioTemporal", 86 | "CrossAttnUpBlockSpatioTemporal", 87 | "CrossAttnUpBlockSpatioTemporal", 88 | ), 89 | block_out_channels: Tuple[int] = (320, 640, 1280, 1280), 90 | addition_time_embed_dim: int = 256, 91 | projection_class_embeddings_input_dim: int = 768, 92 | layers_per_block: Union[int, Tuple[int]] = 2, 93 | cross_attention_dim: Union[int, Tuple[int]] = 1024, 94 | transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple]] = 1, 95 | num_attention_heads: Union[int, Tuple[int]] = (5, 10, 10, 20), 96 | num_frames: int = 25, 97 | ): 98 | super().__init__() 99 | 100 | self.sample_size = sample_size 101 | 102 | # Check inputs 103 | if len(down_block_types) != len(up_block_types): 104 | raise ValueError( 105 | f"Must provide the same number of `down_block_types` as `up_block_types`. " \ 106 | f"`down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}." 107 | ) 108 | 109 | if len(block_out_channels) != len(down_block_types): 110 | raise ValueError( 111 | f"Must provide the same number of `block_out_channels` as `down_block_types`. " \ 112 | f"`block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}." 113 | ) 114 | 115 | if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types): 116 | raise ValueError( 117 | f"Must provide the same number of `num_attention_heads` as `down_block_types`. " \ 118 | f"`num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}." 
119 | ) 120 | 121 | if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types): 122 | raise ValueError( 123 | f"Must provide the same number of `cross_attention_dim` as `down_block_types`. " \ 124 | f"`cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}." 125 | ) 126 | 127 | if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types): 128 | raise ValueError( 129 | f"Must provide the same number of `layers_per_block` as `down_block_types`. " \ 130 | f"`layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}." 131 | ) 132 | 133 | # input 134 | self.conv_in = nn.Conv2d( 135 | in_channels, 136 | block_out_channels[0], 137 | kernel_size=3, 138 | padding=1, 139 | ) 140 | 141 | # time 142 | time_embed_dim = block_out_channels[0] * 4 143 | 144 | self.time_proj = Timesteps(block_out_channels[0], True, downscale_freq_shift=0) 145 | timestep_input_dim = block_out_channels[0] 146 | 147 | self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim) 148 | 149 | self.add_time_proj = Timesteps(addition_time_embed_dim, True, downscale_freq_shift=0) 150 | self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim) 151 | 152 | self.down_blocks = nn.ModuleList([]) 153 | self.up_blocks = nn.ModuleList([]) 154 | 155 | if isinstance(num_attention_heads, int): 156 | num_attention_heads = (num_attention_heads,) * len(down_block_types) 157 | 158 | if isinstance(cross_attention_dim, int): 159 | cross_attention_dim = (cross_attention_dim,) * len(down_block_types) 160 | 161 | if isinstance(layers_per_block, int): 162 | layers_per_block = [layers_per_block] * len(down_block_types) 163 | 164 | if isinstance(transformer_layers_per_block, int): 165 | transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types) 166 | 167 | blocks_time_embed_dim = time_embed_dim 168 | 169 | # down 170 | output_channel = block_out_channels[0] 171 | for i, down_block_type in enumerate(down_block_types): 172 | input_channel = output_channel 173 | output_channel = block_out_channels[i] 174 | is_final_block = i == len(block_out_channels) - 1 175 | 176 | down_block = get_down_block( 177 | down_block_type, 178 | num_layers=layers_per_block[i], 179 | transformer_layers_per_block=transformer_layers_per_block[i], 180 | in_channels=input_channel, 181 | out_channels=output_channel, 182 | temb_channels=blocks_time_embed_dim, 183 | add_downsample=not is_final_block, 184 | resnet_eps=1e-5, 185 | cross_attention_dim=cross_attention_dim[i], 186 | num_attention_heads=num_attention_heads[i], 187 | resnet_act_fn="silu", 188 | ) 189 | self.down_blocks.append(down_block) 190 | 191 | # mid 192 | self.mid_block = UNetMidBlockSpatioTemporal( 193 | block_out_channels[-1], 194 | temb_channels=blocks_time_embed_dim, 195 | transformer_layers_per_block=transformer_layers_per_block[-1], 196 | cross_attention_dim=cross_attention_dim[-1], 197 | num_attention_heads=num_attention_heads[-1], 198 | ) 199 | 200 | # count how many layers upsample the images 201 | self.num_upsamplers = 0 202 | 203 | # up 204 | reversed_block_out_channels = list(reversed(block_out_channels)) 205 | reversed_num_attention_heads = list(reversed(num_attention_heads)) 206 | reversed_layers_per_block = list(reversed(layers_per_block)) 207 | reversed_cross_attention_dim = list(reversed(cross_attention_dim)) 208 | reversed_transformer_layers_per_block = list(reversed(transformer_layers_per_block)) 209 | 210 | 
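        # Illustrative note (assuming the default block_out_channels=(320, 640, 1280, 1280)):
        # reversed_block_out_channels == [1280, 1280, 640, 320], so the loop below wires the up blocks as
        #   i=0: prev_output_channel=1280, in_channels=1280, out_channels=1280, add_upsample=True
        #   i=1: prev_output_channel=1280, in_channels=640,  out_channels=1280, add_upsample=True
        #   i=2: prev_output_channel=1280, in_channels=320,  out_channels=640,  add_upsample=True
        #   i=3: prev_output_channel=640,  in_channels=320,  out_channels=320,  add_upsample=False (final block)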
output_channel = reversed_block_out_channels[0] 211 | for i, up_block_type in enumerate(up_block_types): 212 | is_final_block = i == len(block_out_channels) - 1 213 | 214 | prev_output_channel = output_channel 215 | output_channel = reversed_block_out_channels[i] 216 | input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)] 217 | 218 | # add upsample block for all BUT final layer 219 | if not is_final_block: 220 | add_upsample = True 221 | self.num_upsamplers += 1 222 | else: 223 | add_upsample = False 224 | 225 | up_block = get_up_block( 226 | up_block_type, 227 | num_layers=reversed_layers_per_block[i] + 1, 228 | transformer_layers_per_block=reversed_transformer_layers_per_block[i], 229 | in_channels=input_channel, 230 | out_channels=output_channel, 231 | prev_output_channel=prev_output_channel, 232 | temb_channels=blocks_time_embed_dim, 233 | add_upsample=add_upsample, 234 | resnet_eps=1e-5, 235 | resolution_idx=i, 236 | cross_attention_dim=reversed_cross_attention_dim[i], 237 | num_attention_heads=reversed_num_attention_heads[i], 238 | resnet_act_fn="silu", 239 | ) 240 | self.up_blocks.append(up_block) 241 | prev_output_channel = output_channel 242 | 243 | # out 244 | self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=32, eps=1e-5) 245 | self.conv_act = nn.SiLU() 246 | 247 | self.conv_out = nn.Conv2d( 248 | block_out_channels[0], 249 | out_channels, 250 | kernel_size=3, 251 | padding=1, 252 | ) 253 | 254 | @property 255 | def attn_processors(self) -> Dict[str, AttentionProcessor]: 256 | r""" 257 | Returns: 258 | `dict` of attention processors: A dictionary containing all attention processors used in the model with 259 | indexed by its weight name. 260 | """ 261 | # set recursively 262 | processors = {} 263 | 264 | def fn_recursive_add_processors( 265 | name: str, 266 | module: torch.nn.Module, 267 | processors: Dict[str, AttentionProcessor], 268 | ): 269 | if hasattr(module, "get_processor"): 270 | processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True) 271 | 272 | for sub_name, child in module.named_children(): 273 | fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) 274 | 275 | return processors 276 | 277 | for name, module in self.named_children(): 278 | fn_recursive_add_processors(name, module, processors) 279 | 280 | return processors 281 | 282 | def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): 283 | r""" 284 | Sets the attention processor to use to compute attention. 285 | 286 | Parameters: 287 | processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): 288 | The instantiated processor class or a dictionary of processor classes that will be set as the processor 289 | for **all** `Attention` layers. 290 | 291 | If `processor` is a dict, the key needs to define the path to the corresponding cross attention 292 | processor. This is strongly recommended when setting trainable attention processors. 293 | 294 | """ 295 | count = len(self.attn_processors.keys()) 296 | 297 | if isinstance(processor, dict) and len(processor) != count: 298 | raise ValueError( 299 | f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" 300 | f" number of attention layers: {count}. Please make sure to pass {count} processor classes." 
301 | ) 302 | 303 | def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): 304 | if hasattr(module, "set_processor"): 305 | if not isinstance(processor, dict): 306 | module.set_processor(processor) 307 | else: 308 | module.set_processor(processor.pop(f"{name}.processor")) 309 | 310 | for sub_name, child in module.named_children(): 311 | fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) 312 | 313 | for name, module in self.named_children(): 314 | fn_recursive_attn_processor(name, module, processor) 315 | 316 | def set_default_attn_processor(self): 317 | """ 318 | Disables custom attention processors and sets the default attention implementation. 319 | """ 320 | if all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): 321 | processor = AttnProcessor() 322 | else: 323 | raise ValueError( 324 | f"Cannot call `set_default_attn_processor` " \ 325 | f"when attention processors are of type {next(iter(self.attn_processors.values()))}" 326 | ) 327 | 328 | self.set_attn_processor(processor) 329 | 330 | def _set_gradient_checkpointing(self, module, value=False): 331 | if hasattr(module, "gradient_checkpointing"): 332 | module.gradient_checkpointing = value 333 | 334 | # Copied from diffusers.models.unets.unet_3d_condition.UNet3DConditionModel.enable_forward_chunking 335 | def enable_forward_chunking(self, chunk_size: Optional[int] = None, dim: int = 0) -> None: 336 | """ 337 | Sets the attention processor to use [feed forward 338 | chunking](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers). 339 | 340 | Parameters: 341 | chunk_size (`int`, *optional*): 342 | The chunk size of the feed-forward layers. If not specified, will run feed-forward layer individually 343 | over each tensor of dim=`dim`. 344 | dim (`int`, *optional*, defaults to `0`): 345 | The dimension over which the feed-forward computation should be chunked. Choose between dim=0 (batch) 346 | or dim=1 (sequence length). 347 | """ 348 | if dim not in [0, 1]: 349 | raise ValueError(f"Make sure to set `dim` to either 0 or 1, not {dim}") 350 | 351 | # By default chunk size is 1 352 | chunk_size = chunk_size or 1 353 | 354 | def fn_recursive_feed_forward(module: torch.nn.Module, chunk_size: int, dim: int): 355 | if hasattr(module, "set_chunk_feed_forward"): 356 | module.set_chunk_feed_forward(chunk_size=chunk_size, dim=dim) 357 | 358 | for child in module.children(): 359 | fn_recursive_feed_forward(child, chunk_size, dim) 360 | 361 | for module in self.children(): 362 | fn_recursive_feed_forward(module, chunk_size, dim) 363 | 364 | def forward( 365 | self, 366 | sample: torch.FloatTensor, 367 | timestep: Union[torch.Tensor, float, int], 368 | encoder_hidden_states: torch.Tensor, 369 | added_time_ids: torch.Tensor, 370 | pose_latents: torch.Tensor = None, 371 | image_only_indicator: bool = False, 372 | return_dict: bool = True, 373 | ) -> Union[UNetSpatioTemporalConditionOutput, Tuple]: 374 | r""" 375 | The [`UNetSpatioTemporalConditionModel`] forward method. 376 | 377 | Args: 378 | sample (`torch.FloatTensor`): 379 | The noisy input tensor with the following shape `(batch, num_frames, channel, height, width)`. 380 | timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. 381 | encoder_hidden_states (`torch.FloatTensor`): 382 | The encoder hidden states with shape `(batch, sequence_length, cross_attention_dim)`. 
383 | added_time_ids: (`torch.FloatTensor`): 384 | The additional time ids with shape `(batch, num_additional_ids)`. These are encoded with sinusoidal 385 | embeddings and added to the time embeddings. 386 | pose_latents: (`torch.FloatTensor`): 387 | The additional latents for pose sequences. 388 | image_only_indicator (`bool`, *optional*, defaults to `False`): 389 | Whether or not training with all images. 390 | return_dict (`bool`, *optional*, defaults to `True`): 391 | Whether or not to return a [`~models.unet_slatio_temporal.UNetSpatioTemporalConditionOutput`] 392 | instead of a plain tuple. 393 | Returns: 394 | [`~models.unet_slatio_temporal.UNetSpatioTemporalConditionOutput`] or `tuple`: 395 | If `return_dict` is True, 396 | an [`~models.unet_slatio_temporal.UNetSpatioTemporalConditionOutput`] is returned, 397 | otherwise a `tuple` is returned where the first element is the sample tensor. 398 | """ 399 | # 1. time 400 | timesteps = timestep 401 | if not torch.is_tensor(timesteps): 402 | # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can 403 | # This would be a good case for the `match` statement (Python 3.10+) 404 | is_mps = sample.device.type == "mps" 405 | if isinstance(timestep, float): 406 | dtype = torch.float32 if is_mps else torch.float64 407 | else: 408 | dtype = torch.int32 if is_mps else torch.int64 409 | timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device) 410 | elif len(timesteps.shape) == 0: 411 | timesteps = timesteps[None].to(sample.device) 412 | 413 | # broadcast to batch dimension in a way that's compatible with ONNX/Core ML 414 | batch_size, num_frames = sample.shape[:2] 415 | timesteps = timesteps.expand(batch_size) 416 | 417 | t_emb = self.time_proj(timesteps) 418 | 419 | # `Timesteps` does not contain any weights and will always return f32 tensors 420 | # but time_embedding might actually be running in fp16. so we need to cast here. 421 | # there might be better ways to encapsulate this. 422 | t_emb = t_emb.to(dtype=sample.dtype) 423 | 424 | emb = self.time_embedding(t_emb) 425 | 426 | time_embeds = self.add_time_proj(added_time_ids.flatten()) 427 | time_embeds = time_embeds.reshape((batch_size, -1)) 428 | time_embeds = time_embeds.to(emb.dtype) 429 | aug_emb = self.add_embedding(time_embeds) 430 | emb = emb + aug_emb 431 | 432 | # Flatten the batch and frames dimensions 433 | # sample: [batch, frames, channels, height, width] -> [batch * frames, channels, height, width] 434 | sample = sample.flatten(0, 1) 435 | # Repeat the embeddings num_video_frames times 436 | # emb: [batch, channels] -> [batch * frames, channels] 437 | emb = emb.repeat_interleave(num_frames, dim=0) 438 | # encoder_hidden_states: [batch, 1, channels] -> [batch * frames, 1, channels] 439 | encoder_hidden_states = encoder_hidden_states.repeat_interleave(num_frames, dim=0) 440 | 441 | # 2. 
pre-process 442 | sample = self.conv_in(sample) 443 | if pose_latents is not None: 444 | sample = sample + pose_latents 445 | 446 | image_only_indicator = torch.ones(batch_size, num_frames, dtype=sample.dtype, device=sample.device) \ 447 | if image_only_indicator else torch.zeros(batch_size, num_frames, dtype=sample.dtype, device=sample.device) 448 | 449 | down_block_res_samples = (sample,) 450 | for downsample_block in self.down_blocks: 451 | if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention: 452 | sample, res_samples = downsample_block( 453 | hidden_states=sample, 454 | temb=emb, 455 | encoder_hidden_states=encoder_hidden_states, 456 | image_only_indicator=image_only_indicator, 457 | ) 458 | else: 459 | sample, res_samples = downsample_block( 460 | hidden_states=sample, 461 | temb=emb, 462 | image_only_indicator=image_only_indicator, 463 | ) 464 | 465 | down_block_res_samples += res_samples 466 | 467 | # 4. mid 468 | sample = self.mid_block( 469 | hidden_states=sample, 470 | temb=emb, 471 | encoder_hidden_states=encoder_hidden_states, 472 | image_only_indicator=image_only_indicator, 473 | ) 474 | 475 | # 5. up 476 | for i, upsample_block in enumerate(self.up_blocks): 477 | res_samples = down_block_res_samples[-len(upsample_block.resnets):] 478 | down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)] 479 | 480 | if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention: 481 | sample = upsample_block( 482 | hidden_states=sample, 483 | temb=emb, 484 | res_hidden_states_tuple=res_samples, 485 | encoder_hidden_states=encoder_hidden_states, 486 | image_only_indicator=image_only_indicator, 487 | ) 488 | else: 489 | sample = upsample_block( 490 | hidden_states=sample, 491 | temb=emb, 492 | res_hidden_states_tuple=res_samples, 493 | image_only_indicator=image_only_indicator, 494 | ) 495 | 496 | # 6. post-process 497 | sample = self.conv_norm_out(sample) 498 | sample = self.conv_act(sample) 499 | sample = self.conv_out(sample) 500 | 501 | # 7. 
Reshape back to original shape 502 | sample = sample.reshape(batch_size, num_frames, *sample.shape[1:]) 503 | 504 | if not return_dict: 505 | return (sample,) 506 | 507 | return UNetSpatioTemporalConditionOutput(sample=sample) 508 | -------------------------------------------------------------------------------- /mimicmotion/pipelines/pipeline_mimicmotion.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | from dataclasses import dataclass 3 | from typing import Callable, Dict, List, Optional, Union 4 | 5 | import PIL.Image 6 | import einops 7 | import numpy as np 8 | import torch 9 | from diffusers.image_processor import VaeImageProcessor, PipelineImageInput 10 | from diffusers.models import AutoencoderKLTemporalDecoder, UNetSpatioTemporalConditionModel 11 | from diffusers.pipelines.pipeline_utils import DiffusionPipeline 12 | from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import retrieve_timesteps 13 | from diffusers.pipelines.stable_video_diffusion.pipeline_stable_video_diffusion \ 14 | import _resize_with_antialiasing, _append_dims 15 | from diffusers.schedulers import EulerDiscreteScheduler 16 | from diffusers.utils import BaseOutput, logging 17 | from diffusers.utils.torch_utils import is_compiled_module, randn_tensor 18 | from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection 19 | 20 | from ..modules.pose_net import PoseNet 21 | 22 | logger = logging.get_logger(__name__) # pylint: disable=invalid-name 23 | 24 | 25 | def _append_dims(x, target_dims): 26 | """Appends dimensions to the end of a tensor until it has target_dims dimensions.""" 27 | dims_to_append = target_dims - x.ndim 28 | if dims_to_append < 0: 29 | raise ValueError(f"input has {x.ndim} dims but target_dims is {target_dims}, which is less") 30 | return x[(...,) + (None,) * dims_to_append] 31 | 32 | 33 | # Copied from diffusers.pipelines.animatediff.pipeline_animatediff.tensor2vid 34 | def tensor2vid(video: torch.Tensor, processor: "VaeImageProcessor", output_type: str = "np"): 35 | batch_size, channels, num_frames, height, width = video.shape 36 | outputs = [] 37 | for batch_idx in range(batch_size): 38 | batch_vid = video[batch_idx].permute(1, 0, 2, 3) 39 | batch_output = processor.postprocess(batch_vid, output_type) 40 | 41 | outputs.append(batch_output) 42 | 43 | if output_type == "np": 44 | outputs = np.stack(outputs) 45 | 46 | elif output_type == "pt": 47 | outputs = torch.stack(outputs) 48 | 49 | elif not output_type == "pil": 50 | raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil]") 51 | 52 | return outputs 53 | 54 | 55 | @dataclass 56 | class MimicMotionPipelineOutput(BaseOutput): 57 | r""" 58 | Output class for mimicmotion pipeline. 59 | 60 | Args: 61 | frames (`[List[List[PIL.Image.Image]]`, `np.ndarray`, `torch.Tensor`]): 62 | List of denoised PIL images of length `batch_size` or numpy array or torch tensor of shape `(batch_size, 63 | num_frames, height, width, num_channels)`. 64 | """ 65 | 66 | frames: Union[List[List[PIL.Image.Image]], np.ndarray, torch.Tensor] 67 | 68 | 69 | class MimicMotionPipeline(DiffusionPipeline): 70 | r""" 71 | Pipeline to generate video from an input image using Stable Video Diffusion. 72 | 73 | This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods 74 | implemented for all pipelines (downloading, saving, running on a particular device, etc.). 
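    Compared with the stock Stable Video Diffusion image-to-video pipeline, the main additions
    here are the extra `pose_net` module, whose latents are added to the UNet input features,
    and the tiled, overlap-blended denoising loop in `__call__`. A minimal construction sketch
    (an illustration only; it assumes an OmegaConf config exposing `base_model_path` and
    `ckpt_path`, as in the bundled `test.yaml`):

    ```py
    from omegaconf import OmegaConf
    from mimicmotion.utils.loader import create_pipeline

    infer_config = OmegaConf.load("test.yaml")
    pipeline = create_pipeline(infer_config, device="cuda")
    ```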
75 | 76 | Args: 77 | vae ([`AutoencoderKLTemporalDecoder`]): 78 | Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. 79 | image_encoder ([`~transformers.CLIPVisionModelWithProjection`]): 80 | Frozen CLIP image-encoder ([laion/CLIP-ViT-H-14-laion2B-s32B-b79K] 81 | (https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K)). 82 | unet ([`UNetSpatioTemporalConditionModel`]): 83 | A `UNetSpatioTemporalConditionModel` to denoise the encoded image latents. 84 | scheduler ([`EulerDiscreteScheduler`]): 85 | A scheduler to be used in combination with `unet` to denoise the encoded image latents. 86 | feature_extractor ([`~transformers.CLIPImageProcessor`]): 87 | A `CLIPImageProcessor` to extract features from generated images. 88 | pose_net ([`PoseNet`]): 89 | A `` to inject pose signals into unet. 90 | """ 91 | 92 | model_cpu_offload_seq = "image_encoder->unet->vae" 93 | _callback_tensor_inputs = ["latents"] 94 | 95 | def __init__( 96 | self, 97 | vae: AutoencoderKLTemporalDecoder, 98 | image_encoder: CLIPVisionModelWithProjection, 99 | unet: UNetSpatioTemporalConditionModel, 100 | scheduler: EulerDiscreteScheduler, 101 | feature_extractor: CLIPImageProcessor, 102 | pose_net: PoseNet, 103 | ): 104 | super().__init__() 105 | 106 | self.register_modules( 107 | vae=vae, 108 | image_encoder=image_encoder, 109 | unet=unet, 110 | scheduler=scheduler, 111 | feature_extractor=feature_extractor, 112 | pose_net=pose_net, 113 | ) 114 | self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) 115 | self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) 116 | 117 | def _encode_image( 118 | self, 119 | image: PipelineImageInput, 120 | device: Union[str, torch.device], 121 | num_videos_per_prompt: int, 122 | do_classifier_free_guidance: bool): 123 | dtype = next(self.image_encoder.parameters()).dtype 124 | 125 | if not isinstance(image, torch.Tensor): 126 | image = self.image_processor.pil_to_numpy(image) 127 | image = self.image_processor.numpy_to_pt(image) 128 | 129 | # We normalize the image before resizing to match with the original implementation. 130 | # Then we unnormalize it after resizing. 131 | image = image * 2.0 - 1.0 132 | image = _resize_with_antialiasing(image, (224, 224)) 133 | image = (image + 1.0) / 2.0 134 | 135 | # Normalize the image with for CLIP input 136 | image = self.feature_extractor( 137 | images=image, 138 | do_normalize=True, 139 | do_center_crop=False, 140 | do_resize=False, 141 | do_rescale=False, 142 | return_tensors="pt", 143 | ).pixel_values 144 | 145 | image = image.to(device=device, dtype=dtype) 146 | image_embeddings = self.image_encoder(image).image_embeds 147 | image_embeddings = image_embeddings.unsqueeze(1) 148 | 149 | # duplicate image embeddings for each generation per prompt, using mps friendly method 150 | bs_embed, seq_len, _ = image_embeddings.shape 151 | image_embeddings = image_embeddings.repeat(1, num_videos_per_prompt, 1) 152 | image_embeddings = image_embeddings.view(bs_embed * num_videos_per_prompt, seq_len, -1) 153 | 154 | if do_classifier_free_guidance: 155 | negative_image_embeddings = torch.zeros_like(image_embeddings) 156 | 157 | # For classifier free guidance, we need to do two forward passes. 
158 | # Here we concatenate the unconditional and text embeddings into a single batch 159 | # to avoid doing two forward passes 160 | image_embeddings = torch.cat([negative_image_embeddings, image_embeddings]) 161 | 162 | return image_embeddings 163 | 164 | def _encode_pose_image( 165 | self, 166 | pose_image: torch.Tensor, 167 | do_classifier_free_guidance: bool, 168 | ): 169 | # Get latents_pose 170 | pose_latents = self.pose_net(pose_image) 171 | 172 | if do_classifier_free_guidance: 173 | negative_pose_latents = torch.zeros_like(pose_latents) 174 | 175 | # For classifier free guidance, we need to do two forward passes. 176 | # Here we concatenate the unconditional and text embeddings into a single batch 177 | # to avoid doing two forward passes 178 | pose_latents = torch.cat([negative_pose_latents, pose_latents]) 179 | 180 | return pose_latents 181 | 182 | def _encode_vae_image( 183 | self, 184 | image: torch.Tensor, 185 | device: Union[str, torch.device], 186 | num_videos_per_prompt: int, 187 | do_classifier_free_guidance: bool, 188 | ): 189 | image = image.to(device=device) 190 | image_latents = self.vae.encode(image).latent_dist.mode() 191 | 192 | if do_classifier_free_guidance: 193 | negative_image_latents = torch.zeros_like(image_latents) 194 | 195 | # For classifier free guidance, we need to do two forward passes. 196 | # Here we concatenate the unconditional and text embeddings into a single batch 197 | # to avoid doing two forward passes 198 | image_latents = torch.cat([negative_image_latents, image_latents]) 199 | 200 | # duplicate image_latents for each generation per prompt, using mps friendly method 201 | image_latents = image_latents.repeat(num_videos_per_prompt, 1, 1, 1) 202 | 203 | return image_latents 204 | 205 | def _get_add_time_ids( 206 | self, 207 | fps: int, 208 | motion_bucket_id: int, 209 | noise_aug_strength: float, 210 | dtype: torch.dtype, 211 | batch_size: int, 212 | num_videos_per_prompt: int, 213 | do_classifier_free_guidance: bool, 214 | ): 215 | add_time_ids = [fps, motion_bucket_id, noise_aug_strength] 216 | 217 | passed_add_embed_dim = self.unet.config.addition_time_embed_dim * len(add_time_ids) 218 | expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features 219 | 220 | if expected_add_embed_dim != passed_add_embed_dim: 221 | raise ValueError( 222 | f"Model expects an added time embedding vector of length {expected_add_embed_dim}, " \ 223 | f"but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. " \ 224 | f"Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`." 
225 | ) 226 | 227 | add_time_ids = torch.tensor([add_time_ids], dtype=dtype) 228 | add_time_ids = add_time_ids.repeat(batch_size * num_videos_per_prompt, 1) 229 | 230 | if do_classifier_free_guidance: 231 | add_time_ids = torch.cat([add_time_ids, add_time_ids]) 232 | 233 | return add_time_ids 234 | 235 | def decode_latents( 236 | self, 237 | latents: torch.Tensor, 238 | num_frames: int, 239 | decode_chunk_size: int = 8): 240 | # [batch, frames, channels, height, width] -> [batch*frames, channels, height, width] 241 | latents = latents.flatten(0, 1) 242 | 243 | latents = 1 / self.vae.config.scaling_factor * latents 244 | 245 | forward_vae_fn = self.vae._orig_mod.forward if is_compiled_module(self.vae) else self.vae.forward 246 | accepts_num_frames = "num_frames" in set(inspect.signature(forward_vae_fn).parameters.keys()) 247 | 248 | # decode decode_chunk_size frames at a time to avoid OOM 249 | frames = [] 250 | for i in range(0, latents.shape[0], decode_chunk_size): 251 | num_frames_in = latents[i: i + decode_chunk_size].shape[0] 252 | decode_kwargs = {} 253 | if accepts_num_frames: 254 | # we only pass num_frames_in if it's expected 255 | decode_kwargs["num_frames"] = num_frames_in 256 | 257 | frame = self.vae.decode(latents[i: i + decode_chunk_size], **decode_kwargs).sample 258 | frames.append(frame.cpu()) 259 | frames = torch.cat(frames, dim=0) 260 | 261 | # [batch*frames, channels, height, width] -> [batch, channels, frames, height, width] 262 | frames = frames.reshape(-1, num_frames, *frames.shape[1:]).permute(0, 2, 1, 3, 4) 263 | 264 | # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 265 | frames = frames.float() 266 | return frames 267 | 268 | def check_inputs(self, image, height, width): 269 | if ( 270 | not isinstance(image, torch.Tensor) 271 | and not isinstance(image, PIL.Image.Image) 272 | and not isinstance(image, list) 273 | ): 274 | raise ValueError( 275 | "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" 276 | f" {type(image)}" 277 | ) 278 | 279 | if height % 8 != 0 or width % 8 != 0: 280 | raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") 281 | 282 | def prepare_latents( 283 | self, 284 | batch_size: int, 285 | num_frames: int, 286 | num_channels_latents: int, 287 | height: int, 288 | width: int, 289 | dtype: torch.dtype, 290 | device: Union[str, torch.device], 291 | generator: torch.Generator, 292 | latents: Optional[torch.Tensor] = None, 293 | ): 294 | shape = ( 295 | batch_size, 296 | num_frames, 297 | num_channels_latents // 2, 298 | height // self.vae_scale_factor, 299 | width // self.vae_scale_factor, 300 | ) 301 | if isinstance(generator, list) and len(generator) != batch_size: 302 | raise ValueError( 303 | f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" 304 | f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
305 | ) 306 | 307 | if latents is None: 308 | latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) 309 | else: 310 | latents = latents.to(device) 311 | 312 | # scale the initial noise by the standard deviation required by the scheduler 313 | latents = latents * self.scheduler.init_noise_sigma 314 | return latents 315 | 316 | @property 317 | def guidance_scale(self): 318 | return self._guidance_scale 319 | 320 | # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) 321 | # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` 322 | # corresponds to doing no classifier free guidance. 323 | @property 324 | def do_classifier_free_guidance(self): 325 | return True # TODO 326 | if isinstance(self.guidance_scale, (int, float)): 327 | return self.guidance_scale 328 | return self.guidance_scale.max() > 1 329 | 330 | @property 331 | def num_timesteps(self): 332 | return self._num_timesteps 333 | 334 | def prepare_extra_step_kwargs(self, generator, eta): 335 | # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature 336 | # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 337 | # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 338 | # and should be between [0, 1] 339 | 340 | accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) 341 | extra_step_kwargs = {} 342 | if accepts_eta: 343 | extra_step_kwargs["eta"] = eta 344 | 345 | # check if the scheduler accepts generator 346 | accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) 347 | if accepts_generator: 348 | extra_step_kwargs["generator"] = generator 349 | return extra_step_kwargs 350 | 351 | @torch.no_grad() 352 | def __call__( 353 | self, 354 | image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.FloatTensor], 355 | image_pose: Union[torch.FloatTensor], 356 | height: int = 576, 357 | width: int = 1024, 358 | num_frames: Optional[int] = None, 359 | tile_size: Optional[int] = 16, 360 | tile_overlap: Optional[int] = 4, 361 | num_inference_steps: int = 25, 362 | min_guidance_scale: float = 1.0, 363 | max_guidance_scale: float = 3.0, 364 | fps: int = 7, 365 | motion_bucket_id: int = 127, 366 | noise_aug_strength: float = 0.02, 367 | image_only_indicator: bool = False, 368 | decode_chunk_size: Optional[int] = None, 369 | num_videos_per_prompt: Optional[int] = 1, 370 | generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, 371 | latents: Optional[torch.FloatTensor] = None, 372 | first_n_frames=None, 373 | output_type: Optional[str] = "pil", 374 | callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, 375 | callback_on_step_end_tensor_inputs: List[str] = ["latents"], 376 | return_dict: bool = True, 377 | device: Union[str, torch.device] =None, 378 | ): 379 | r""" 380 | The call function to the pipeline for generation. 381 | 382 | Args: 383 | image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`): 384 | Image or images to guide image generation. If you provide a tensor, it needs to be compatible with 385 | [`CLIPImageProcessor`](https://huggingface.co/lambdalabs/sd-image-variations-diffusers/blob/main/ 386 | feature_extractor/preprocessor_config.json). 387 | height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): 388 | The height in pixels of the generated image. 
389 | width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): 390 | The width in pixels of the generated image. 391 | num_frames (`int`, *optional*): 392 | The number of video frames to generate. Defaults to 14 for `stable-video-diffusion-img2vid` 393 | and to 25 for `stable-video-diffusion-img2vid-xt` 394 | num_inference_steps (`int`, *optional*, defaults to 25): 395 | The number of denoising steps. More denoising steps usually lead to a higher quality image at the 396 | expense of slower inference. This parameter is modulated by `strength`. 397 | min_guidance_scale (`float`, *optional*, defaults to 1.0): 398 | The minimum guidance scale. Used for the classifier free guidance with first frame. 399 | max_guidance_scale (`float`, *optional*, defaults to 3.0): 400 | The maximum guidance scale. Used for the classifier free guidance with last frame. 401 | fps (`int`, *optional*, defaults to 7): 402 | Frames per second.The rate at which the generated images shall be exported to a video after generation. 403 | Note that Stable Diffusion Video's UNet was micro-conditioned on fps-1 during training. 404 | motion_bucket_id (`int`, *optional*, defaults to 127): 405 | The motion bucket ID. Used as conditioning for the generation. 406 | The higher the number the more motion will be in the video. 407 | noise_aug_strength (`float`, *optional*, defaults to 0.02): 408 | The amount of noise added to the init image, 409 | the higher it is the less the video will look like the init image. Increase it for more motion. 410 | image_only_indicator (`bool`, *optional*, defaults to False): 411 | Whether to treat the inputs as batch of images instead of videos. 412 | decode_chunk_size (`int`, *optional*): 413 | The number of frames to decode at a time.The higher the chunk size, the higher the temporal consistency 414 | between frames, but also the higher the memory consumption. 415 | By default, the decoder will decode all frames at once for maximal quality. 416 | Reduce `decode_chunk_size` to reduce memory usage. 417 | num_videos_per_prompt (`int`, *optional*, defaults to 1): 418 | The number of images to generate per prompt. 419 | generator (`torch.Generator` or `List[torch.Generator]`, *optional*): 420 | A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make 421 | generation deterministic. 422 | latents (`torch.FloatTensor`, *optional*): 423 | Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image 424 | generation. Can be used to tweak the same generation with different prompts. If not provided, a latents 425 | tensor is generated by sampling using the supplied random `generator`. 426 | output_type (`str`, *optional*, defaults to `"pil"`): 427 | The output format of the generated image. Choose between `PIL.Image` or `np.array`. 428 | callback_on_step_end (`Callable`, *optional*): 429 | A function that calls at the end of each denoising steps during the inference. The function is called 430 | with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, 431 | callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by 432 | `callback_on_step_end_tensor_inputs`. 433 | callback_on_step_end_tensor_inputs (`List`, *optional*): 434 | The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list 435 | will be passed as `callback_kwargs` argument. 
You will only be able to include variables listed in the 436 | `._callback_tensor_inputs` attribute of your pipeline class. 437 | return_dict (`bool`, *optional*, defaults to `True`): 438 | Whether to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a 439 | plain tuple. 440 | device: 441 | On which device the pipeline runs on. 442 | 443 | Returns: 444 | [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] or `tuple`: 445 | If `return_dict` is `True`, 446 | [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] is returned, 447 | otherwise a `tuple` is returned where the first element is a list of list with the generated frames. 448 | 449 | Examples: 450 | 451 | ```py 452 | from diffusers import StableVideoDiffusionPipeline 453 | from diffusers.utils import load_image, export_to_video 454 | 455 | pipe = StableVideoDiffusionPipeline.from_pretrained( 456 | "stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16") 457 | pipe.to("cuda") 458 | 459 | image = load_image( 460 | "https://lh3.googleusercontent.com/y-iFOHfLTwkuQSUegpwDdgKmOjRSTvPxat63dQLB25xkTs4lhIbRUFeNBWZzYf370g=s1200") 461 | image = image.resize((1024, 576)) 462 | 463 | frames = pipe(image, num_frames=25, decode_chunk_size=8).frames[0] 464 | export_to_video(frames, "generated.mp4", fps=7) 465 | ``` 466 | """ 467 | # 0. Default height and width to unet 468 | height = height or self.unet.config.sample_size * self.vae_scale_factor 469 | width = width or self.unet.config.sample_size * self.vae_scale_factor 470 | 471 | num_frames = num_frames if num_frames is not None else self.unet.config.num_frames 472 | decode_chunk_size = decode_chunk_size if decode_chunk_size is not None else num_frames 473 | 474 | # 1. Check inputs. Raise error if not correct 475 | self.check_inputs(image, height, width) 476 | 477 | # 2. Define call parameters 478 | if isinstance(image, PIL.Image.Image): 479 | batch_size = 1 480 | elif isinstance(image, list): 481 | batch_size = len(image) 482 | else: 483 | batch_size = image.shape[0] 484 | device = device if device is not None else self._execution_device 485 | # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) 486 | # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` 487 | # corresponds to doing no classifier free guidance. 488 | self._guidance_scale = max_guidance_scale 489 | 490 | # 3. Encode input image 491 | image_embeddings = self._encode_image(image, device, num_videos_per_prompt, self.do_classifier_free_guidance) 492 | 493 | # NOTE: Stable Diffusion Video was conditioned on fps - 1, which 494 | # is why it is reduced here. 495 | fps = fps - 1 496 | 497 | # 4. 
Encode input image using VAE 498 | image = self.image_processor.preprocess(image, height=height, width=width).to(device) 499 | noise = randn_tensor(image.shape, generator=generator, device=device, dtype=image.dtype) 500 | image = image + noise_aug_strength * noise 501 | 502 | needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast 503 | if needs_upcasting: 504 | self.vae.to(dtype=torch.float32) 505 | 506 | image_latents = self._encode_vae_image( 507 | image, 508 | device=device, 509 | num_videos_per_prompt=num_videos_per_prompt, 510 | do_classifier_free_guidance=self.do_classifier_free_guidance, 511 | ) 512 | image_latents = image_latents.to(image_embeddings.dtype) 513 | 514 | ref_latent = first_n_frames[:, 0] if first_n_frames is not None else None 515 | pose_latents = self._encode_pose_image( 516 | image_pose, do_classifier_free_guidance=self.do_classifier_free_guidance, 517 | ) 518 | 519 | # cast back to fp16 if needed 520 | if needs_upcasting: 521 | self.vae.to(dtype=torch.float16) 522 | 523 | # Repeat the image latents for each frame so we can concatenate them with the noise 524 | # image_latents [batch, channels, height, width] ->[batch, num_frames, channels, height, width] 525 | image_latents = image_latents.unsqueeze(1).repeat(1, num_frames, 1, 1, 1) 526 | 527 | # 5. Get Added Time IDs 528 | added_time_ids = self._get_add_time_ids( 529 | fps, 530 | motion_bucket_id, 531 | noise_aug_strength, 532 | image_embeddings.dtype, 533 | batch_size, 534 | num_videos_per_prompt, 535 | self.do_classifier_free_guidance, 536 | ) 537 | added_time_ids = added_time_ids.to(device) 538 | 539 | # 4. Prepare timesteps 540 | timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, None) 541 | 542 | # 5. Prepare latent variables 543 | num_channels_latents = self.unet.config.in_channels 544 | latents = self.prepare_latents( 545 | batch_size * num_videos_per_prompt, 546 | tile_size, 547 | num_channels_latents, 548 | height, 549 | width, 550 | image_embeddings.dtype, 551 | device, 552 | generator, 553 | latents, 554 | ) 555 | latents = latents.repeat(1, num_frames // tile_size + 1, 1, 1, 1)[:, :num_frames] 556 | 557 | # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline 558 | extra_step_kwargs = self.prepare_extra_step_kwargs(generator, 0.0) 559 | 560 | # 7. Prepare guidance scale 561 | guidance_scale = torch.linspace(min_guidance_scale, max_guidance_scale, num_frames).unsqueeze(0) 562 | guidance_scale = guidance_scale.to(device, latents.dtype) 563 | guidance_scale = guidance_scale.repeat(batch_size * num_videos_per_prompt, 1) 564 | guidance_scale = _append_dims(guidance_scale, latents.ndim) 565 | 566 | self._guidance_scale = guidance_scale 567 | 568 | # 8. 
Denoising loop 569 | self._num_timesteps = len(timesteps) 570 | pose_latents = einops.rearrange(pose_latents, '(b f) c h w -> b f c h w', f=num_frames) 571 | indices = [[0, *range(i + 1, min(i + tile_size, num_frames))] for i in 572 | range(0, num_frames - tile_size + 1, tile_size - tile_overlap)] 573 | if indices[-1][-1] < num_frames - 1: 574 | indices.append([0, *range(num_frames - tile_size + 1, num_frames)]) 575 | 576 | with self.progress_bar(total=len(timesteps) * len(indices)) as progress_bar: 577 | for i, t in enumerate(timesteps): 578 | # expand the latents if we are doing classifier free guidance 579 | latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents 580 | latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) 581 | 582 | # Concatenate image_latents over channels dimension 583 | latent_model_input = torch.cat([latent_model_input, image_latents], dim=2) 584 | 585 | # predict the noise residual 586 | noise_pred = torch.zeros_like(image_latents) 587 | noise_pred_cnt = image_latents.new_zeros((num_frames,)) 588 | # image_pose = pixel_values_pose[:, frame_start:frame_start + self.num_frames, ...] 589 | weight = (torch.arange(tile_size, device=device) + 0.5) * 2. / tile_size 590 | weight = torch.minimum(weight, 2 - weight) 591 | for idx in indices: 592 | _noise_pred = self.unet( 593 | latent_model_input[:, idx], 594 | t, 595 | encoder_hidden_states=image_embeddings, 596 | added_time_ids=added_time_ids, 597 | pose_latents=pose_latents[:, idx].flatten(0, 1), 598 | image_only_indicator=image_only_indicator, 599 | return_dict=False, 600 | )[0] 601 | noise_pred[:, idx] += _noise_pred * weight[:, None, None, None] 602 | noise_pred_cnt[idx] += weight 603 | progress_bar.update() 604 | noise_pred.div_(noise_pred_cnt[:, None, None, None]) 605 | 606 | # perform guidance 607 | if self.do_classifier_free_guidance: 608 | noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2) 609 | noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_cond - noise_pred_uncond) 610 | 611 | if first_n_frames is not None: 612 | sigma = self.scheduler.sigmas[self.scheduler.step_index] 613 | _latents = latents[:, 1:1 + first_n_frames.size(1)] 614 | tmp = (first_n_frames - _latents / (sigma ** 2 + 1)) / (-sigma / ((sigma ** 2 + 1) ** 0.5)) 615 | noise_pred[:, 1:1 + first_n_frames.size(1)] = tmp 616 | 617 | # compute the previous noisy sample x_t -> x_t-1 618 | latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] 619 | 620 | if callback_on_step_end is not None: 621 | callback_kwargs = {} 622 | for k in callback_on_step_end_tensor_inputs: 623 | callback_kwargs[k] = locals()[k] 624 | callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) 625 | 626 | latents = callback_outputs.pop("latents", latents) 627 | 628 | if not output_type == "latent": 629 | # cast back to fp16 if needed 630 | if needs_upcasting: 631 | self.vae.to(dtype=torch.float16) 632 | frames = self.decode_latents(latents, num_frames, decode_chunk_size) 633 | frames = tensor2vid(frames, self.image_processor, output_type=output_type) 634 | else: 635 | frames = latents 636 | 637 | self.maybe_free_model_hooks() 638 | 639 | if not return_dict: 640 | return frames 641 | 642 | return MimicMotionPipelineOutput(frames=frames) 643 | -------------------------------------------------------------------------------- /mimicmotion/utils/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AIFSH/ComfyUI-MimicMotion/0f376479219afe8431f634539359eb26b981d1e5/mimicmotion/utils/__init__.py -------------------------------------------------------------------------------- /mimicmotion/utils/loader.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import torch 4 | import torch.utils.checkpoint 5 | from diffusers.models import AutoencoderKLTemporalDecoder 6 | from diffusers.schedulers import EulerDiscreteScheduler 7 | from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection 8 | 9 | from ..modules.unet import UNetSpatioTemporalConditionModel 10 | from ..modules.pose_net import PoseNet 11 | from ..pipelines.pipeline_mimicmotion import MimicMotionPipeline 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | class MimicMotionModel(torch.nn.Module): 16 | def __init__(self, base_model_path): 17 | """construnct base model components and load pretrained svd model except pose-net 18 | Args: 19 | base_model_path (str): pretrained svd model path 20 | """ 21 | super().__init__() 22 | self.unet = UNetSpatioTemporalConditionModel.from_config( 23 | UNetSpatioTemporalConditionModel.load_config(base_model_path, subfolder="unet"), 24 | use_safetensors=True,variant="fp16") 25 | self.vae = AutoencoderKLTemporalDecoder.from_pretrained( 26 | base_model_path, subfolder="vae",use_safetensors=True,variant="fp16").half() 27 | self.image_encoder = CLIPVisionModelWithProjection.from_pretrained( 28 | base_model_path, subfolder="image_encoder",use_safetensors=True,variant="fp16") 29 | self.noise_scheduler = EulerDiscreteScheduler.from_pretrained( 30 | base_model_path, subfolder="scheduler") 31 | self.feature_extractor = CLIPImageProcessor.from_pretrained( 32 | base_model_path, subfolder="feature_extractor") 33 | # pose_net 34 | self.pose_net = PoseNet(noise_latent_channels=self.unet.config.block_out_channels[0]) 35 | 36 | def create_pipeline(infer_config, device): 37 | """create mimicmotion pipeline and load pretrained weight 38 | 39 | Args: 40 | infer_config (str): 41 | device (str or torch.device): "cpu" or "cuda:{device_id}" 42 | """ 43 | mimicmotion_models = MimicMotionModel(infer_config.base_model_path).to(device=device).eval() 44 | mimicmotion_models.load_state_dict(torch.load(infer_config.ckpt_path, map_location=device), strict=False) 45 | pipeline = MimicMotionPipeline( 46 | vae=mimicmotion_models.vae, 47 | image_encoder=mimicmotion_models.image_encoder, 48 | unet=mimicmotion_models.unet, 49 | scheduler=mimicmotion_models.noise_scheduler, 50 | feature_extractor=mimicmotion_models.feature_extractor, 51 | pose_net=mimicmotion_models.pose_net 52 | ) 53 | return pipeline 54 | 55 | -------------------------------------------------------------------------------- /mimicmotion/utils/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | 4 | from torchvision.io import write_video 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | def save_to_mp4(frames, save_path, fps=7): 9 | frames = frames.permute((0, 2, 3, 1)) # (f, c, h, w) to (f, h, w, c) 10 | Path(save_path).parent.mkdir(parents=True, exist_ok=True) 11 | write_video(save_path, frames, fps=fps) 12 | 13 | -------------------------------------------------------------------------------- /nodes.py: -------------------------------------------------------------------------------- 1 | import os,sys 2 | now_dir = os.path.dirname(os.path.abspath(__file__)) 3 | 
sys.path.append(now_dir) 4 | 5 | import math 6 | import torch 7 | # import logging 8 | import cuda_malloc 9 | import folder_paths 10 | import numpy as np 11 | from PIL import Image 12 | from datetime import datetime 13 | from omegaconf import OmegaConf 14 | from huggingface_hub import snapshot_download 15 | from moviepy.editor import VideoFileClip,AudioFileClip 16 | from torchvision.transforms.functional import pil_to_tensor, resize, center_crop,to_pil_image 17 | 18 | from mimicmotion.pipelines.pipeline_mimicmotion import MimicMotionPipeline 19 | from mimicmotion.utils.loader import create_pipeline 20 | from mimicmotion.utils.utils import save_to_mp4 21 | 22 | # logging.basicConfig(level=logging.INFO, format="%(asctime)s: [%(levelname)s] %(message)s") 23 | # logger = logging.getLogger(__name__) 24 | device = torch.device("cuda" if cuda_malloc.cuda_malloc_supported() else "cpu") 25 | 26 | input_path = folder_paths.get_input_directory() 27 | output_dir = folder_paths.get_output_directory() 28 | ckpt_dir = os.path.join(now_dir, "models") 29 | svd_dir = os.path.join(ckpt_dir,"stable-video-diffusion-img2vid-xt-1-1") 30 | ASPECT_RATIO = 9 / 16 31 | # yzd-v/DWPose 32 | os.environ["dwpose"] = os.path.join(ckpt_dir,"DWPose") 33 | snapshot_download(repo_id="yzd-v/DWPose",local_dir=os.environ["dwpose"], 34 | allow_patterns=["dw-ll_ucoco_384.onnx","yolox_l.onnx"]) 35 | 36 | from mimicmotion.dwpose.preprocess import get_video_pose, get_image_pose 37 | 38 | class MimicMotionNode: 39 | def __init__(self) -> None: 40 | # weights/stable-video-diffusion-img2vid-xt-1-1 41 | snapshot_download(repo_id="weights/stable-video-diffusion-img2vid-xt-1-1",local_dir=svd_dir, 42 | ignore_patterns=["svd_xt*"],allow_patterns=["*.json","*fp16*"]) 43 | 44 | # ixaac/MimicMotion 45 | snapshot_download(repo_id="ixaac/MimicMotion",local_dir=ckpt_dir, 46 | allow_patterns="*.pth") 47 | 48 | 49 | @classmethod 50 | def INPUT_TYPES(s): 51 | return { 52 | "required":{ 53 | "ref_image":("IMAGE",), 54 | "ref_video_path":("VIDEO",), 55 | "resolution":([576,768],{ 56 | "default":576, 57 | }), 58 | "sample_stride":("INT",{ 59 | "default": 2 60 | }), 61 | "tile_size": ("INT",{ 62 | "default": 16 63 | }), 64 | "tile_overlap": ("INT",{ 65 | "default": 6 66 | }), 67 | "decode_chunk_size":("INT",{ 68 | "default": 8 69 | }), 70 | "num_inference_steps": ("INT",{ 71 | "default": 25 72 | }), 73 | "guidance_scale":("FLOAT",{ 74 | "default": 2.0 75 | }), 76 | "fps": ("INT",{ 77 | "default": 15 78 | }), 79 | "seed": ("INT",{ 80 | "default": 42 81 | }), 82 | } 83 | } 84 | 85 | RETURN_TYPES = ("VIDEO",) 86 | #RETURN_NAMES = ("image_output_name",) 87 | 88 | FUNCTION = "gen_video" 89 | 90 | #OUTPUT_NODE = False 91 | 92 | CATEGORY = "AIFSH_MimicMotion" 93 | 94 | @torch.no_grad() 95 | def gen_video(self,ref_image,ref_video_path,resolution,sample_stride, 96 | tile_size,tile_overlap,decode_chunk_size,num_inference_steps, 97 | guidance_scale,fps,seed): 98 | torch.set_default_dtype(torch.float16) 99 | infer_config = OmegaConf.load(os.path.join(now_dir,"test.yaml")) 100 | infer_config.base_model_path = svd_dir 101 | infer_config.ckpt_path = os.path.join(ckpt_dir,"MimicMotion.pth") 102 | pipeline = create_pipeline(infer_config,device) 103 | 104 | ############################################## Pre-process data ############################################## 105 | ref_image = ref_image.numpy()[0] * 255 106 | ref_image = ref_image.astype(np.uint8) 107 | ref_image = Image.fromarray(ref_image) 108 | pose_pixels, image_pixels = preprocess( 109 | ref_video_path, 
ref_image, 110 | resolution=resolution, sample_stride=sample_stride 111 | ) 112 | task_config = { 113 | "tile_size": tile_size, 114 | "tile_overlap": tile_overlap, 115 | "decode_chunk_size": decode_chunk_size, 116 | "num_inference_steps": num_inference_steps, 117 | "noise_aug_strength": 0, 118 | "guidance_scale": guidance_scale, 119 | "fps": fps, 120 | "seed": seed, 121 | } 122 | ########################################### Run MimicMotion pipeline ########################################### 123 | _video_frames = run_pipeline( 124 | pipeline, 125 | image_pixels, pose_pixels, 126 | device, task_config 127 | ) 128 | ################################### save results to output folder. ########################################### 129 | outfile = f"{output_dir}/mimicmotion_{os.path.basename(ref_video_path).split('.')[0]}" \ 130 | f"_{datetime.now().strftime('%Y%m%d%H%M%S')}.mp4" 131 | save_to_mp4(_video_frames,outfile,fps=fps,) 132 | if os.path.isfile(ref_video_path+".wav"): 133 | video_clip = VideoFileClip(outfile) 134 | audio_clip = AudioFileClip(ref_video_path+".wav") 135 | video_clip = video_clip.set_audio(audio_clip) 136 | outfile = f"{output_dir}/mimicmotion_{os.path.basename(ref_video_path).split('.')[0]}" \ 137 | f"_{datetime.now().strftime('%Y%m%d%H%M%S')}.mp4" 138 | video_clip.write_videofile(outfile) 139 | return (outfile, ) 140 | 141 | 142 | class PreViewVideo: 143 | @classmethod 144 | def INPUT_TYPES(s): 145 | return {"required":{ 146 | "video":("VIDEO",), 147 | }} 148 | 149 | CATEGORY = "AIFSH_MimicMotion" 150 | DESCRIPTION = "hello world!" 151 | 152 | RETURN_TYPES = () 153 | 154 | OUTPUT_NODE = True 155 | 156 | FUNCTION = "load_video" 157 | 158 | def load_video(self, video): 159 | video_name = os.path.basename(video) 160 | video_path_name = os.path.basename(os.path.dirname(video)) 161 | return {"ui":{"video":[video_name,video_path_name]}} 162 | 163 | class LoadVideo: 164 | @classmethod 165 | def INPUT_TYPES(s): 166 | files = [f for f in os.listdir(input_path) if os.path.isfile(os.path.join(input_path, f)) and f.split('.')[-1] in ["mp4", "webm","mkv","avi"]] 167 | return {"required":{ 168 | "video":(files,), 169 | }} 170 | 171 | CATEGORY = "AIFSH_MimicMotion" 172 | DESCRIPTION = "hello world!" 
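    # Note: load_video() below extracts the input video's audio track to "<video>.wav" next to the
    # input file; MimicMotionNode.gen_video() checks for that file and re-muxes it onto the output.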
173 | 174 | RETURN_TYPES = ("VIDEO",) 175 | 176 | OUTPUT_NODE = False 177 | 178 | FUNCTION = "load_video" 179 | 180 | def load_video(self, video): 181 | video_path = os.path.join(input_path,video) 182 | video_clip = VideoFileClip(video_path) 183 | audio_path = os.path.join(input_path,video+".wav") 184 | try: 185 | video_clip.audio.write_audiofile(audio_path) 186 | print(f"bgm save at {audio_path}") 187 | except: 188 | print("none audio") 189 | return (video_path,) 190 | 191 | 192 | def run_pipeline(pipeline: MimicMotionPipeline, image_pixels, pose_pixels, device, task_config): 193 | image_pixels = [to_pil_image(img.to(torch.uint8)) for img in (image_pixels + 1.0) * 127.5] 194 | pose_pixels = pose_pixels.unsqueeze(0).to(device) 195 | generator = torch.Generator(device=device) 196 | generator.manual_seed(task_config["seed"]) 197 | frames = pipeline( 198 | image_pixels, image_pose=pose_pixels, num_frames=pose_pixels.size(1), 199 | tile_size=task_config["tile_size"], tile_overlap=task_config["tile_overlap"], 200 | height=pose_pixels.shape[-2], width=pose_pixels.shape[-1], fps=task_config["fps"], 201 | noise_aug_strength=task_config["noise_aug_strength"], num_inference_steps=task_config["num_inference_steps"], 202 | generator=generator, min_guidance_scale=task_config["guidance_scale"], 203 | max_guidance_scale=task_config["guidance_scale"], decode_chunk_size=task_config['decode_chunk_size'], output_type="pt", device=device 204 | ).frames.cpu() 205 | video_frames = (frames * 255.0).to(torch.uint8) 206 | 207 | for vid_idx in range(video_frames.shape[0]): 208 | # deprecated first frame because of ref image 209 | _video_frames = video_frames[vid_idx, 1:] 210 | 211 | return _video_frames 212 | 213 | def preprocess(video_path, image_pixels, resolution=576, sample_stride=2): 214 | """preprocess ref image pose and video pose 215 | 216 | Args: 217 | video_path (str): input video pose path 218 | image_pixels (Image): reference image pil 219 | resolution (int, optional): Defaults to 576. 220 | sample_stride (int, optional): Defaults to 2. 
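    Example (illustrative, assuming resolution=576 and a 1500x1000 (h, w) reference image):
        h > w, so the target size is (h_target, w_target) = (1024, 576); the image is resized
        to (1024, 683) to keep its aspect ratio, then center-cropped to (1024, 576).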
221 | """ 222 | # image_pixels = pil_loader(image_path) 223 | image_pixels = pil_to_tensor(image_pixels) # (c, h, w) 224 | h, w = image_pixels.shape[-2:] 225 | ############################ compute target h/w according to original aspect ratio ############################### 226 | if h>w: 227 | w_target, h_target = resolution, int(resolution / ASPECT_RATIO // 64) * 64 228 | else: 229 | w_target, h_target = int(resolution / ASPECT_RATIO // 64) * 64, resolution 230 | h_w_ratio = float(h) / float(w) 231 | if h_w_ratio < h_target / w_target: 232 | h_resize, w_resize = h_target, math.ceil(h_target / h_w_ratio) 233 | else: 234 | h_resize, w_resize = math.ceil(w_target * h_w_ratio), w_target 235 | image_pixels = resize(image_pixels, [h_resize, w_resize], antialias=None) 236 | image_pixels = center_crop(image_pixels, [h_target, w_target]) 237 | image_pixels = image_pixels.permute((1, 2, 0)).numpy() 238 | ##################################### get image&video pose value ################################################# 239 | image_pose = get_image_pose(image_pixels) 240 | video_pose = get_video_pose(video_path, image_pixels, sample_stride=sample_stride) 241 | pose_pixels = np.concatenate([np.expand_dims(image_pose, 0), video_pose]) 242 | image_pixels = np.transpose(np.expand_dims(image_pixels, 0), (0, 3, 1, 2)) 243 | return torch.from_numpy(pose_pixels.copy()) / 127.5 - 1, torch.from_numpy(image_pixels) / 127.5 - 1 244 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | diffusers 2 | transformers 3 | decord 4 | einops 5 | omegaconf 6 | onnxruntime-gpu 7 | moviepy 8 | matplotlib 9 | opencv-python 10 | accelerate 11 | av -------------------------------------------------------------------------------- /test.yaml: -------------------------------------------------------------------------------- 1 | # base svd model path 2 | base_model_path: models/SVD/stable-video-diffusion-img2vid-xt-1-1 3 | 4 | # checkpoint path 5 | ckpt_path: models/MimicMotion.pth 6 | 7 | test_case: 8 | - ref_video_path: assets/example_data/videos/pose1.mp4 9 | ref_image_path: assets/example_data/images/demo1.jpg 10 | num_frames: 16 11 | resolution: 576 12 | frames_overlap: 6 13 | num_inference_steps: 25 14 | noise_aug_strength: 0 15 | guidance_scale: 2.0 16 | sample_stride: 2 17 | fps: 15 18 | seed: 42 19 | 20 | 21 | -------------------------------------------------------------------------------- /web/js/previewVideo.js: -------------------------------------------------------------------------------- 1 | import { app } from "../../../scripts/app.js"; 2 | import { api } from '../../../scripts/api.js' 3 | 4 | function fitHeight(node) { 5 | node.setSize([node.size[0], node.computeSize([node.size[0], node.size[1]])[1]]) 6 | node?.graph?.setDirtyCanvas(true); 7 | } 8 | function chainCallback(object, property, callback) { 9 | if (object == undefined) { 10 | //This should not happen. 
11 | console.error("Tried to add callback to non-existant object") 12 | return; 13 | } 14 | if (property in object) { 15 | const callback_orig = object[property] 16 | object[property] = function () { 17 | const r = callback_orig.apply(this, arguments); 18 | callback.apply(this, arguments); 19 | return r 20 | }; 21 | } else { 22 | object[property] = callback; 23 | } 24 | } 25 | 26 | function addPreviewOptions(nodeType) { 27 | chainCallback(nodeType.prototype, "getExtraMenuOptions", function(_, options) { 28 | // The intended way of appending options is returning a list of extra options, 29 | // but this isn't used in widgetInputs.js and would require 30 | // less generalization of chainCallback 31 | let optNew = [] 32 | try { 33 | const previewWidget = this.widgets.find((w) => w.name === "videopreview"); 34 | 35 | let url = null 36 | if (previewWidget.videoEl?.hidden == false && previewWidget.videoEl.src) { 37 | //Use full quality video 38 | //url = api.apiURL('/view?' + new URLSearchParams(previewWidget.value.params)); 39 | url = previewWidget.videoEl.src 40 | } 41 | if (url) { 42 | optNew.push( 43 | { 44 | content: "Open preview", 45 | callback: () => { 46 | window.open(url, "_blank") 47 | }, 48 | }, 49 | { 50 | content: "Save preview", 51 | callback: () => { 52 | const a = document.createElement("a"); 53 | a.href = url; 54 | a.setAttribute("download", new URLSearchParams(previewWidget.value.params).get("filename")); 55 | document.body.append(a); 56 | a.click(); 57 | requestAnimationFrame(() => a.remove()); 58 | }, 59 | } 60 | ); 61 | } 62 | if(options.length > 0 && options[0] != null && optNew.length > 0) { 63 | optNew.push(null); 64 | } 65 | options.unshift(...optNew); 66 | 67 | } catch (error) { 68 | console.log(error); 69 | } 70 | 71 | }); 72 | } 73 | function previewVideo(node,file,type){ 74 | var element = document.createElement("div"); 75 | const previewNode = node; 76 | var previewWidget = node.addDOMWidget("videopreview", "preview", element, { 77 | serialize: false, 78 | hideOnZoom: false, 79 | getValue() { 80 | return element.value; 81 | }, 82 | setValue(v) { 83 | element.value = v; 84 | }, 85 | }); 86 | previewWidget.computeSize = function(width) { 87 | if (this.aspectRatio && !this.parentEl.hidden) { 88 | let height = (previewNode.size[0]-20)/ this.aspectRatio + 10; 89 | if (!(height > 0)) { 90 | height = 0; 91 | } 92 | this.computedHeight = height + 10; 93 | return [width, height]; 94 | } 95 | return [width, -4];//no loaded src, widget should not display 96 | } 97 | // element.style['pointer-events'] = "none" 98 | previewWidget.value = {hidden: false, paused: false, params: {}} 99 | previewWidget.parentEl = document.createElement("div"); 100 | previewWidget.parentEl.className = "video_preview"; 101 | previewWidget.parentEl.style['width'] = "100%" 102 | element.appendChild(previewWidget.parentEl); 103 | previewWidget.videoEl = document.createElement("video"); 104 | previewWidget.videoEl.controls = true; 105 | previewWidget.videoEl.loop = false; 106 | previewWidget.videoEl.muted = false; 107 | previewWidget.videoEl.style['width'] = "100%" 108 | previewWidget.videoEl.addEventListener("loadedmetadata", () => { 109 | 110 | previewWidget.aspectRatio = previewWidget.videoEl.videoWidth / previewWidget.videoEl.videoHeight; 111 | fitHeight(this); 112 | }); 113 | previewWidget.videoEl.addEventListener("error", () => { 114 | //TODO: consider a way to properly notify the user why a preview isn't shown. 
115 |         previewWidget.parentEl.hidden = true;
116 |         fitHeight(previewNode);
117 |     });
118 | 
119 |     let params = {
120 |         "filename": file,
121 |         "type": type,
122 |     }
123 | 
124 |     previewWidget.parentEl.hidden = previewWidget.value.hidden;
125 |     previewWidget.videoEl.autoplay = !previewWidget.value.paused && !previewWidget.value.hidden;
126 |     let target_width = 256
127 |     if (element.style?.width) {
128 |         //overscale to allow scrolling. Endpoint won't return higher than native
129 |         target_width = element.style.width.slice(0,-2)*2;
130 |     }
131 |     if (!params.force_size || params.force_size.includes("?") || params.force_size == "Disabled") {
132 |         params.force_size = target_width+"x?"
133 |     } else {
134 |         let size = params.force_size.split("x")
135 |         let ar = parseInt(size[0])/parseInt(size[1])
136 |         params.force_size = target_width+"x"+(target_width/ar)
137 |     }
138 | 
139 |     previewWidget.videoEl.src = api.apiURL('/view?' + new URLSearchParams(params));
140 | 
141 |     previewWidget.videoEl.hidden = false;
142 |     previewWidget.parentEl.appendChild(previewWidget.videoEl)
143 | }
144 | 
145 | app.registerExtension({
146 |     name: "MimicMotion.VideoPreviewer",
147 |     async beforeRegisterNodeDef(nodeType, nodeData, app) {
148 |         if (nodeData?.name == "PreViewVideo") {
149 |             nodeType.prototype.onExecuted = function (data) {
150 |                 previewVideo(this, data.video[0], data.video[1]);
151 |             }
152 |             //addPreviewOptions(nodeType)
153 |         }
154 |     }
155 | });
156 | 
--------------------------------------------------------------------------------
/web/js/uploadVideo.js:
--------------------------------------------------------------------------------
1 | import { app } from "../../../scripts/app.js";
2 | import { api } from '../../../scripts/api.js'
3 | import { ComfyWidgets } from "../../../scripts/widgets.js"
4 | 
5 | function fitHeight(node) {
6 |     node.setSize([node.size[0], node.computeSize([node.size[0], node.size[1]])[1]])
7 |     node?.graph?.setDirtyCanvas(true);
8 | }
9 | 
10 | function previewVideo(node,file){
11 |     while (node.widgets.length > 2){
12 |         node.widgets.pop()
13 |     }
14 |     // remove any preview element left over from a previous upload
15 |     var el = document.getElementById("uploadVideo");
16 |     if (el) {
17 |         el.remove();
18 |     }
19 | 
20 |     var element = document.createElement("div");
21 |     element.id = "uploadVideo";
22 |     const previewNode = node;
23 |     var previewWidget = node.addDOMWidget("videopreview", "preview", element, {
24 |         serialize: false,
25 |         hideOnZoom: false,
26 |         getValue() {
27 |             return element.value;
28 |         },
29 |         setValue(v) {
30 |             element.value = v;
31 |         },
32 |     });
33 |     previewWidget.computeSize = function(width) {
34 |         if (this.aspectRatio && !this.parentEl.hidden) {
35 |             let height = (previewNode.size[0]-20)/ this.aspectRatio + 10;
36 |             if (!(height > 0)) {
37 |                 height = 0;
38 |             }
39 |             this.computedHeight = height + 10;
40 |             return [width, height];
41 |         }
42 |         return [width, -4];//no loaded src, widget should not display
43 |     }
44 |     // element.style['pointer-events'] = "none"
45 |     previewWidget.value = {hidden: false, paused: false, params: {}}
46 |     previewWidget.parentEl = document.createElement("div");
47 |     previewWidget.parentEl.className = "video_preview";
48 |     previewWidget.parentEl.style['width'] = "100%"
49 |     element.appendChild(previewWidget.parentEl);
50 |     previewWidget.videoEl = document.createElement("video");
51 |     previewWidget.videoEl.controls = true;
52 |     previewWidget.videoEl.loop = false;
53 |     previewWidget.videoEl.muted = false;
54 |     previewWidget.videoEl.style['width'] = "100%"
previewWidget.videoEl.addEventListener("loadedmetadata", () => { 56 | 57 | previewWidget.aspectRatio = previewWidget.videoEl.videoWidth / previewWidget.videoEl.videoHeight; 58 | fitHeight(this); 59 | }); 60 | previewWidget.videoEl.addEventListener("error", () => { 61 | //TODO: consider a way to properly notify the user why a preview isn't shown. 62 | previewWidget.parentEl.hidden = true; 63 | fitHeight(this); 64 | }); 65 | 66 | let params = { 67 | "filename": file, 68 | "type": "input", 69 | } 70 | 71 | previewWidget.parentEl.hidden = previewWidget.value.hidden; 72 | previewWidget.videoEl.autoplay = !previewWidget.value.paused && !previewWidget.value.hidden; 73 | let target_width = 256 74 | if (element.style?.width) { 75 | //overscale to allow scrolling. Endpoint won't return higher than native 76 | target_width = element.style.width.slice(0,-2)*2; 77 | } 78 | if (!params.force_size || params.force_size.includes("?") || params.force_size == "Disabled") { 79 | params.force_size = target_width+"x?" 80 | } else { 81 | let size = params.force_size.split("x") 82 | let ar = parseInt(size[0])/parseInt(size[1]) 83 | params.force_size = target_width+"x"+(target_width/ar) 84 | } 85 | 86 | previewWidget.videoEl.src = api.apiURL('/view?' + new URLSearchParams(params)); 87 | 88 | previewWidget.videoEl.hidden = false; 89 | previewWidget.parentEl.appendChild(previewWidget.videoEl) 90 | } 91 | 92 | function videoUpload(node, inputName, inputData, app) { 93 | const videoWidget = node.widgets.find((w) => w.name === "video"); 94 | let uploadWidget; 95 | /* 96 | A method that returns the required style for the html 97 | */ 98 | var default_value = videoWidget.value; 99 | Object.defineProperty(videoWidget, "value", { 100 | set : function(value) { 101 | this._real_value = value; 102 | }, 103 | 104 | get : function() { 105 | let value = ""; 106 | if (this._real_value) { 107 | value = this._real_value; 108 | } else { 109 | return default_value; 110 | } 111 | 112 | if (value.filename) { 113 | let real_value = value; 114 | value = ""; 115 | if (real_value.subfolder) { 116 | value = real_value.subfolder + "/"; 117 | } 118 | 119 | value += real_value.filename; 120 | 121 | if(real_value.type && real_value.type !== "input") 122 | value += ` [${real_value.type}]`; 123 | } 124 | return value; 125 | } 126 | }); 127 | async function uploadFile(file, updateNode, pasted = false) { 128 | try { 129 | // Wrap file in formdata so it includes filename 130 | const body = new FormData(); 131 | body.append("image", file); 132 | if (pasted) body.append("subfolder", "pasted"); 133 | const resp = await api.fetchApi("/upload/image", { 134 | method: "POST", 135 | body, 136 | }); 137 | 138 | if (resp.status === 200) { 139 | const data = await resp.json(); 140 | // Add the file to the dropdown list and update the widget value 141 | let path = data.name; 142 | if (data.subfolder) path = data.subfolder + "/" + path; 143 | 144 | if (!videoWidget.options.values.includes(path)) { 145 | videoWidget.options.values.push(path); 146 | } 147 | 148 | if (updateNode) { 149 | videoWidget.value = path; 150 | previewVideo(node,path) 151 | 152 | } 153 | } else { 154 | alert(resp.status + " - " + resp.statusText); 155 | } 156 | } catch (error) { 157 | alert(error); 158 | } 159 | } 160 | 161 | const fileInput = document.createElement("input"); 162 | Object.assign(fileInput, { 163 | type: "file", 164 | accept: "video/webm,video/mp4,video/mkv,video/avi", 165 | style: "display: none", 166 | onchange: async () => { 167 | if (fileInput.files.length) { 168 | await 
168 |                 await uploadFile(fileInput.files[0], true);
169 |             }
170 |         },
171 |     });
172 |     document.body.append(fileInput);
173 | 
174 |     // Create the button widget for selecting the files
175 |     uploadWidget = node.addWidget("button", "choose video file to upload", "Video", () => {
176 |         fileInput.click();
177 |     });
178 | 
179 |     uploadWidget.serialize = false;
180 | 
181 |     previewVideo(node, videoWidget.value);
182 |     const cb = node.callback;
183 |     videoWidget.callback = function () {
184 |         previewVideo(node,videoWidget.value);
185 |         if (cb) {
186 |             return cb.apply(this, arguments);
187 |         }
188 |     };
189 | 
190 |     return { widget: uploadWidget };
191 | }
192 | 
193 | ComfyWidgets.VIDEOPLOAD = videoUpload;
194 | 
195 | app.registerExtension({
196 |     name: "MimicMotion.UploadVideo",
197 |     async beforeRegisterNodeDef(nodeType, nodeData, app) {
198 |         if (nodeData?.name == "LoadVideo") {
199 |             nodeData.input.required.upload = ["VIDEOPLOAD"];
200 |         }
201 |     },
202 | });
203 | 
204 | 
--------------------------------------------------------------------------------