├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── app.py ├── dwpose-l_384x288.py ├── main.py ├── requirements.txt ├── sample_videos │   ├── input_video.mp4 │   ├── output_video.mp4 │   ├── side_by_side.gif │   └── side_by_side.mp4 ├── video2openpose2.py └── yolox_l_8xb8-300e_coco.py /.gitignore: -------------------------------------------------------------------------------- 1 | detectron2/ -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "detectron2"] 2 | path = detectron2 3 | url = https://github.com/facebookresearch/detectron2 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Flode Labs 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Vid2DensePose 2 | 3 | Open In Colab 4 | 5 | 6 | ![](https://github.com/Flode-Labs/vid2densepose/blob/main/sample_videos/side_by_side.gif) 7 | 8 | ## Overview 9 | 10 | Vid2DensePose is a tool for applying the DensePose model to videos, generating detailed "Part Index" visualizations for each frame. It is particularly useful for enhancing animations, especially when used together with MagicAnimate for temporally consistent human image animation. 11 | 12 | ## Key Features 13 | 14 | 15 | - **Enhanced Output**: Produces video files showcasing DensePose data in a vivid, color-coded format. 16 | - **MagicAnimate Integration**: Works seamlessly with MagicAnimate for advanced human animation projects. 17 | 18 | ## Prerequisites 19 | 20 | To use this tool, ensure the following are installed: 21 | - Python 3.8 or later 22 | - PyTorch (preferably with CUDA for GPU support) 23 | - Detectron2 24 | 25 | ## Installation Steps 26 | 27 | 1. Clone the repository: 28 | ```bash 29 | git clone https://github.com/Flode-Labs/vid2densepose.git 30 | cd vid2densepose 31 | ``` 32 | 33 | 2. Install the necessary Python packages: 34 | ```bash 35 | pip install -r requirements.txt 36 | ``` 37 | 38 | 3. Clone the Detectron2 repository: 39 | ```bash 40 | git clone https://github.com/facebookresearch/detectron2.git 41 | ``` 42 |
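If the `densepose` package cannot be imported after these steps, one option — taken from the commented-out first line of requirements.txt rather than from the documented setup — is to install Detectron2's DensePose project directly with pip:

```bash
# Optional: install the DensePose project as a package
# (same URL as the commented-out line in requirements.txt)
pip install "git+https://github.com/facebookresearch/detectron2@main#subdirectory=projects/DensePose"
```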
43 | ## Usage Guide 44 | 45 | Run the script: 46 | ```bash 47 | python main.py -i input_video.mp4 -o output_video.mp4 48 | ``` 49 | 50 | The script processes the input video and generates an output video in the DensePose format. 51 | 52 | #### Gradio version 53 | You can also use Gradio to run the script with a web interface. To do so, run the following command: 54 | ```bash 55 | python app.py 56 | ``` 57 | 58 | ## Integration with MagicAnimate 59 | 60 | For integration with MagicAnimate: 61 | 62 | 1. Create the DensePose video using the steps outlined above. 63 | 2. Use this output as an input to MagicAnimate for generating temporally consistent animations. 64 | 65 | 66 | ## Acknowledgments 67 | 68 | Special thanks to: 69 | - Facebook AI Research (FAIR) for the development of DensePose. 70 | - The contributors of the Detectron2 project. 71 | - [Gonzalo Vidal](https://www.tiktok.com/@_gonzavidal) for the sample videos. 72 | - [Sylvain Filoni](https://twitter.com/fffiloni) for the deployment of the Gradio Space on [Hugging Face](https://huggingface.co/spaces/fffiloni/video2densepose). 73 | 74 | ## Support 75 | 76 | For any inquiries or support, please file an issue in our GitHub repository's issue tracker. 77 | 78 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import gradio as gr 2 | from detectron2.config import get_cfg 3 | import torch 4 | import cv2 5 | import numpy as np 6 | from detectron2.engine import DefaultPredictor 7 | from densepose import add_densepose_config 8 | from densepose.vis.extractor import DensePoseResultExtractor 9 | from densepose.vis.densepose_results import DensePoseResultsFineSegmentationVisualizer as Visualizer 10 | import tempfile 11 | import shutil 12 | 13 | # Function to process video 14 | def process_video(input_video_path): 15 | # Temporary path for output video 16 | output_video_path = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name 17 | 18 | # Initialize Detectron2 configuration for DensePose 19 | cfg = get_cfg() 20 | add_densepose_config(cfg) 21 | cfg.merge_from_file("detectron2/projects/DensePose/configs/densepose_rcnn_R_50_FPN_s1x.yaml") 22 | cfg.MODEL.WEIGHTS = "https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_s1x/165712039/model_final_162be9.pkl" 23 | predictor = DefaultPredictor(cfg) 24 | 25 | # Open the input video 26 | cap = cv2.VideoCapture(input_video_path) 27 | fps = cap.get(cv2.CAP_PROP_FPS) 28 | width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) 29 | height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) 30 | 31 | # Initialize video writer 32 | fourcc = cv2.VideoWriter_fourcc(*'mp4v') 33 | out = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height)) 34 | 35 | # Process each frame 36 | while cap.isOpened(): 37 | ret, frame = cap.read() 38 | if not ret: 39 | break 40 | 41 | with torch.no_grad(): 42 | outputs = predictor(frame)['instances'] 43 | 44 | results = DensePoseResultExtractor()(outputs) 45 | cmap = cv2.COLORMAP_VIRIDIS 46 | # Visualizer outputs black for background, but we want the 0 value of 47 | # the colormap, so we initialize the array with that value 48 | arr = cv2.applyColorMap(np.zeros((height, width), dtype=np.uint8), cmap) 49 | out_frame = Visualizer(alpha=1, cmap=cmap).visualize(arr, results) 50 | out.write(out_frame) 51 | 52 | # Release resources 53 |
cap.release() 54 | out.release() 55 | 56 | # Return processed video 57 | return output_video_path 58 | 59 | # Gradio interface 60 | iface = gr.Interface( 61 | fn=process_video, 62 | inputs=gr.Video(label="Input Video"), 63 | outputs=gr.Video(label="Output DensePose Video"), 64 | title="Video 2 DensePose" 65 | ) 66 | 67 | # Run the app 68 | iface.launch() 69 | -------------------------------------------------------------------------------- /dwpose-l_384x288.py: -------------------------------------------------------------------------------- 1 | # runtime 2 | max_epochs = 270 3 | stage2_num_epochs = 30 4 | base_lr = 4e-3 5 | 6 | train_cfg = dict(max_epochs=max_epochs, val_interval=10) 7 | randomness = dict(seed=21) 8 | 9 | # optimizer 10 | optim_wrapper = dict( 11 | type='OptimWrapper', 12 | optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), 13 | paramwise_cfg=dict( 14 | norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) 15 | 16 | # learning rate 17 | param_scheduler = [ 18 | dict( 19 | type='LinearLR', 20 | start_factor=1.0e-5, 21 | by_epoch=False, 22 | begin=0, 23 | end=1000), 24 | dict( 25 | # use cosine lr from 150 to 300 epoch 26 | type='CosineAnnealingLR', 27 | eta_min=base_lr * 0.05, 28 | begin=max_epochs // 2, 29 | end=max_epochs, 30 | T_max=max_epochs // 2, 31 | by_epoch=True, 32 | convert_to_iter_based=True), 33 | ] 34 | 35 | # automatically scaling LR based on the actual training batch size 36 | auto_scale_lr = dict(base_batch_size=512) 37 | 38 | # codec settings 39 | codec = dict( 40 | type='SimCCLabel', 41 | input_size=(288, 384), 42 | sigma=(6., 6.93), 43 | simcc_split_ratio=2.0, 44 | normalize=False, 45 | use_dark=False) 46 | 47 | # model settings 48 | model = dict( 49 | type='TopdownPoseEstimator', 50 | data_preprocessor=dict( 51 | type='PoseDataPreprocessor', 52 | mean=[123.675, 116.28, 103.53], 53 | std=[58.395, 57.12, 57.375], 54 | bgr_to_rgb=True), 55 | backbone=dict( 56 | _scope_='mmdet', 57 | type='CSPNeXt', 58 | arch='P5', 59 | expand_ratio=0.5, 60 | deepen_factor=1., 61 | widen_factor=1., 62 | out_indices=(4, ), 63 | channel_attention=True, 64 | norm_cfg=dict(type='SyncBN'), 65 | act_cfg=dict(type='SiLU'), 66 | init_cfg=dict( 67 | type='Pretrained', 68 | prefix='backbone.', 69 | checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' 70 | 'rtmpose/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.pth' # noqa 71 | )), 72 | head=dict( 73 | type='RTMCCHead', 74 | in_channels=1024, 75 | out_channels=133, 76 | input_size=codec['input_size'], 77 | in_featuremap_size=(9, 12), 78 | simcc_split_ratio=codec['simcc_split_ratio'], 79 | final_layer_kernel_size=7, 80 | gau_cfg=dict( 81 | hidden_dims=256, 82 | s=128, 83 | expansion_factor=2, 84 | dropout_rate=0., 85 | drop_path=0., 86 | act_fn='SiLU', 87 | use_rel_bias=False, 88 | pos_enc=False), 89 | loss=dict( 90 | type='KLDiscretLoss', 91 | use_target_weight=True, 92 | beta=10., 93 | label_softmax=True), 94 | decoder=codec), 95 | test_cfg=dict(flip_test=True, )) 96 | 97 | # base dataset settings 98 | dataset_type = 'CocoWholeBodyDataset' 99 | data_mode = 'topdown' 100 | data_root = '/data/' 101 | 102 | backend_args = dict(backend='local') 103 | # backend_args = dict( 104 | # backend='petrel', 105 | # path_mapping=dict({ 106 | # f'{data_root}': 's3://openmmlab/datasets/detection/coco/', 107 | # f'{data_root}': 's3://openmmlab/datasets/detection/coco/' 108 | # })) 109 | 110 | # pipelines 111 | train_pipeline = [ 112 | dict(type='LoadImage', backend_args=backend_args), 113 | 
dict(type='GetBBoxCenterScale'), 114 | dict(type='RandomFlip', direction='horizontal'), 115 | dict(type='RandomHalfBody'), 116 | dict( 117 | type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), 118 | dict(type='TopdownAffine', input_size=codec['input_size']), 119 | dict(type='mmdet.YOLOXHSVRandomAug'), 120 | dict( 121 | type='Albumentation', 122 | transforms=[ 123 | dict(type='Blur', p=0.1), 124 | dict(type='MedianBlur', p=0.1), 125 | dict( 126 | type='CoarseDropout', 127 | max_holes=1, 128 | max_height=0.4, 129 | max_width=0.4, 130 | min_holes=1, 131 | min_height=0.2, 132 | min_width=0.2, 133 | p=1.0), 134 | ]), 135 | dict(type='GenerateTarget', encoder=codec), 136 | dict(type='PackPoseInputs') 137 | ] 138 | val_pipeline = [ 139 | dict(type='LoadImage', backend_args=backend_args), 140 | dict(type='GetBBoxCenterScale'), 141 | dict(type='TopdownAffine', input_size=codec['input_size']), 142 | dict(type='PackPoseInputs') 143 | ] 144 | 145 | train_pipeline_stage2 = [ 146 | dict(type='LoadImage', backend_args=backend_args), 147 | dict(type='GetBBoxCenterScale'), 148 | dict(type='RandomFlip', direction='horizontal'), 149 | dict(type='RandomHalfBody'), 150 | dict( 151 | type='RandomBBoxTransform', 152 | shift_factor=0., 153 | scale_factor=[0.75, 1.25], 154 | rotate_factor=60), 155 | dict(type='TopdownAffine', input_size=codec['input_size']), 156 | dict(type='mmdet.YOLOXHSVRandomAug'), 157 | dict( 158 | type='Albumentation', 159 | transforms=[ 160 | dict(type='Blur', p=0.1), 161 | dict(type='MedianBlur', p=0.1), 162 | dict( 163 | type='CoarseDropout', 164 | max_holes=1, 165 | max_height=0.4, 166 | max_width=0.4, 167 | min_holes=1, 168 | min_height=0.2, 169 | min_width=0.2, 170 | p=0.5), 171 | ]), 172 | dict(type='GenerateTarget', encoder=codec), 173 | dict(type='PackPoseInputs') 174 | ] 175 | 176 | datasets = [] 177 | dataset_coco=dict( 178 | type=dataset_type, 179 | data_root=data_root, 180 | data_mode=data_mode, 181 | ann_file='coco/annotations/coco_wholebody_train_v1.0.json', 182 | data_prefix=dict(img='coco/train2017/'), 183 | pipeline=[], 184 | ) 185 | datasets.append(dataset_coco) 186 | 187 | scene = ['Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 188 | 'TalkShow', 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 189 | 'Singing', 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference'] 190 | 191 | for i in range(len(scene)): 192 | datasets.append( 193 | dict( 194 | type=dataset_type, 195 | data_root=data_root, 196 | data_mode=data_mode, 197 | ann_file='UBody/annotations/'+scene[i]+'/keypoint_annotation.json', 198 | data_prefix=dict(img='UBody/images/'+scene[i]+'/'), 199 | pipeline=[], 200 | ) 201 | ) 202 | 203 | # data loaders 204 | train_dataloader = dict( 205 | batch_size=32, 206 | num_workers=10, 207 | persistent_workers=True, 208 | sampler=dict(type='DefaultSampler', shuffle=True), 209 | dataset=dict( 210 | type='CombinedDataset', 211 | metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), 212 | datasets=datasets, 213 | pipeline=train_pipeline, 214 | test_mode=False, 215 | )) 216 | val_dataloader = dict( 217 | batch_size=32, 218 | num_workers=10, 219 | persistent_workers=True, 220 | drop_last=False, 221 | sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), 222 | dataset=dict( 223 | type=dataset_type, 224 | data_root=data_root, 225 | data_mode=data_mode, 226 | ann_file='coco/annotations/coco_wholebody_val_v1.0.json', 227 | bbox_file=f'{data_root}coco/person_detection_results/' 228 | 
'COCO_val2017_detections_AP_H_56_person.json', 229 | data_prefix=dict(img='coco/val2017/'), 230 | test_mode=True, 231 | pipeline=val_pipeline, 232 | )) 233 | test_dataloader = val_dataloader 234 | 235 | # hooks 236 | default_hooks = dict( 237 | checkpoint=dict( 238 | save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) 239 | 240 | custom_hooks = [ 241 | dict( 242 | type='EMAHook', 243 | ema_type='ExpMomentumEMA', 244 | momentum=0.0002, 245 | update_buffers=True, 246 | priority=49), 247 | dict( 248 | type='mmdet.PipelineSwitchHook', 249 | switch_epoch=max_epochs - stage2_num_epochs, 250 | switch_pipeline=train_pipeline_stage2) 251 | ] 252 | 253 | # evaluators 254 | val_evaluator = dict( 255 | type='CocoWholeBodyMetric', 256 | ann_file=data_root + 'coco/annotations/coco_wholebody_val_v1.0.json') 257 | test_evaluator = val_evaluator 258 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import cv2 4 | import numpy as np 5 | import torch 6 | from densepose import add_densepose_config 7 | from densepose.vis.densepose_results import ( 8 | DensePoseResultsFineSegmentationVisualizer as Visualizer, 9 | ) 10 | from densepose.vis.extractor import DensePoseResultExtractor 11 | 12 | from detectron2.config import get_cfg 13 | from detectron2.engine import DefaultPredictor 14 | 15 | 16 | def main(input_video_path="./input_video.mp4", output_video_path="./output_video.mp4"): 17 | # Initialize Detectron2 configuration for DensePose 18 | cfg = get_cfg() 19 | add_densepose_config(cfg) 20 | cfg.merge_from_file( 21 | "detectron2/projects/DensePose/configs/densepose_rcnn_R_50_FPN_s1x.yaml" 22 | ) 23 | cfg.MODEL.WEIGHTS = "https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_s1x/165712039/model_final_162be9.pkl" 24 | predictor = DefaultPredictor(cfg) 25 | 26 | # Open the input video 27 | cap = cv2.VideoCapture(input_video_path) 28 | fps = cap.get(cv2.CAP_PROP_FPS) 29 | width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) 30 | height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) 31 | 32 | # Initialize video writer 33 | fourcc = cv2.VideoWriter_fourcc(*"mp4v") 34 | out = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height)) 35 | 36 | # Process each frame 37 | while cap.isOpened(): 38 | ret, frame = cap.read() 39 | if not ret: 40 | break 41 | 42 | with torch.no_grad(): 43 | outputs = predictor(frame)["instances"] 44 | 45 | results = DensePoseResultExtractor()(outputs) 46 | 47 | # MagicAnimate uses the Viridis colormap for their training data 48 | cmap = cv2.COLORMAP_VIRIDIS 49 | # Visualizer outputs black for background, but we want the 0 value of 50 | # the colormap, so we initialize the array with that value 51 | arr = cv2.applyColorMap(np.zeros((height, width), dtype=np.uint8), cmap) 52 | out_frame = Visualizer(alpha=1, cmap=cmap).visualize(arr, results) 53 | out.write(out_frame) 54 | 55 | # Release resources 56 | cap.release() 57 | out.release() 58 | 59 | 60 | if __name__ == "__main__": 61 | parser = argparse.ArgumentParser() 62 | parser.add_argument( 63 | "-i", "--input_video_path", type=str, default="./input_video.mp4" 64 | ) 65 | parser.add_argument( 66 | "-o", "--output_video_path", type=str, default="./output_video.mp4" 67 | ) 68 | args = parser.parse_args() 69 | 70 | main(args.input_video_path, args.output_video_path) 71 | -------------------------------------------------------------------------------- /requirements.txt: 
-------------------------------------------------------------------------------- 1 | #git+https://github.com/facebookresearch/detectron2@main#subdirectory=projects/DensePose 2 | moviepy 3 | controlnet_aux 4 | mediapipe 5 | openmim 6 | -------------------------------------------------------------------------------- /sample_videos/input_video.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdbds/vid2pose/97957ca92b70fde93754fa861f2fec8df48e6c68/sample_videos/input_video.mp4 -------------------------------------------------------------------------------- /sample_videos/output_video.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdbds/vid2pose/97957ca92b70fde93754fa861f2fec8df48e6c68/sample_videos/output_video.mp4 -------------------------------------------------------------------------------- /sample_videos/side_by_side.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdbds/vid2pose/97957ca92b70fde93754fa861f2fec8df48e6c68/sample_videos/side_by_side.gif -------------------------------------------------------------------------------- /sample_videos/side_by_side.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdbds/vid2pose/97957ca92b70fde93754fa861f2fec8df48e6c68/sample_videos/side_by_side.mp4 -------------------------------------------------------------------------------- /video2openpose2.py: -------------------------------------------------------------------------------- 1 | import gradio as gr 2 | from controlnet_aux import OpenposeDetector, DWposeDetector 3 | import os 4 | import cv2 5 | import numpy as np 6 | from PIL import Image 7 | from moviepy.editor import * 8 | import argparse 9 | import torch 10 | import re 11 | 12 | 13 | def main( 14 | input_path="vid2pose/sample_videos/input_video.mp4", 15 | output_path="./outputs/", 16 | pose_model="dwpose", 17 | ): 18 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 19 | if pose_model.__contains__("openpose"): 20 | openpose = OpenposeDetector.from_pretrained("lllyasviel/ControlNet") 21 | else: 22 | dwpose = DWposeDetector( 23 | det_config=os.path.dirname(__file__) 24 | + "/yolox_l_8xb8-300e_coco.py", 25 | pose_config=os.path.dirname(__file__) 26 | + "/dwpose-l_384x288.py", 27 | device=device, 28 | ) 29 | 30 | def regex(string): 31 | return re.findall(r"\d+", str(string))[-1] 32 | 33 | def get_frames(video_in): 34 | frames = [] 35 | # resize the video 36 | clip = VideoFileClip(video_in) 37 | 38 | # check fps 39 | video_path = os.path.join(output_path, "video_resized.mp4") 40 | if clip.fps > 30: 41 | print("video rate is over 30, resetting to 30") 42 | clip_resized = clip.resize(height=512) 43 | clip_resized.write_videofile(video_path, fps=30) 44 | else: 45 | print("video rate is OK") 46 | clip_resized = clip.resize(height=512) 47 | clip_resized.write_videofile(video_path, fps=clip.fps) 48 | 49 | print("video resized to 512 height") 50 | 51 | # Opens the Video file with CV2 52 | cap = cv2.VideoCapture(video_path) 53 | 54 | fps = cap.get(cv2.CAP_PROP_FPS) 55 | print("video fps: " + str(fps)) 56 | i = 0 57 | while cap.isOpened(): 58 | ret, frame = cap.read() 59 | if ret == False: 60 | break 61 | path = os.path.join(output_path, "raw" + str(i) + ".jpg") 62 | cv2.imwrite(path, frame) 63 | frames.append(path) 64 | i += 1 65 | 66 | cap.release() 67
| cv2.destroyAllWindows() 68 | print("broke the video into frames") 69 | 70 | return frames, fps 71 | 72 | def get_openpose_filter(i): 73 | image = Image.open(i) 74 | 75 | # image = np.array(image) 76 | openpose.to(device) 77 | 78 | if pose_model.__contains__("full"): 79 | image = openpose(image, include_hand=True, include_face=True) 80 | elif pose_model.__contains__("hand"): 81 | image = openpose(image, include_hand=True) 82 | elif pose_model.__contains__("face"): 83 | image = openpose(image, include_face=True) 84 | elif pose_model.__contains__("openpose"): 85 | image = openpose(image) 86 | else: 87 | image = dwpose(image) 88 | # image = Image.fromarray(image) 89 | path = os.path.join(output_path, "openpose_frame_" + regex(i) + ".jpeg") 90 | image.save(path) 91 | return path 92 | 93 | def create_video(frames, fps, type): 94 | print("building video result") 95 | clip = ImageSequenceClip(frames, fps=fps) 96 | path = os.path.join(output_path, type + "_result.mp4") 97 | clip.write_videofile(path, fps=fps) 98 | 99 | return path 100 | 101 | def convertG2V(imported_gif): 102 | clip = VideoFileClip(imported_gif.name) 103 | path = os.path.join(output_path, "my_gif_video.mp4") 104 | clip.write_videofile(path) 105 | return path 106 | 107 | def infer(video_in): 108 | # 1. break video into frames and get FPS 109 | break_vid = get_frames(video_in) 110 | frames_list = break_vid[0] 111 | fps = break_vid[1] 112 | # n_frame = int(trim_value*fps) 113 | n_frame = len(frames_list) 114 | 115 | if n_frame >= len(frames_list): 116 | print("video is shorter than the cut value") 117 | n_frame = len(frames_list) 118 | 119 | # 2. prepare frames result arrays 120 | result_frames = [] 121 | print("set stop frames to: " + str(n_frame)) 122 | 123 | for i in frames_list[0 : int(n_frame)]: 124 | openpose_frame = get_openpose_filter(i) 125 | result_frames.append(openpose_frame) 126 | print("frame " + i + "/" + str(n_frame) + ": done;") 127 | 128 | final_vid = create_video(result_frames, fps, "openpose") 129 | 130 | files = [final_vid] 131 | 132 | return final_vid, files 133 | 134 | title = """ 135 |
136-149 | [centered HTML heading: "Video to OpenPose"]
150 | """ 151 | 152 | with gr.Blocks() as demo: 153 | with gr.Column(): 154 | gr.HTML(title) 155 | with gr.Row(): 156 | with gr.Column(): 157 | video_input = gr.Video( 158 | source="upload", 159 | type="filepath", 160 | value=input_path if not input_path.endswith(".gif") else None, 161 | ) 162 | gif_input = gr.File( 163 | label="import a GIF instead", 164 | file_types=[".gif"], 165 | value=input_path if input_path.endswith(".gif") else None, 166 | ) 167 | gif_input.change( 168 | fn=convertG2V, inputs=gif_input, outputs=video_input 169 | ) 170 | submit_btn = gr.Button("Submit") 171 | 172 | with gr.Column(): 173 | video_output = gr.Video() 174 | file_output = gr.Files() 175 | 176 | submit_btn.click( 177 | fn=infer, inputs=[video_input], outputs=[video_output, file_output] 178 | ) 179 | 180 | demo.launch() 181 | 182 | 183 | if __name__ == "__main__": 184 | parser = argparse.ArgumentParser() 185 | parser.add_argument( 186 | "-i", "--input_path", type=str, default="vid2pose/sample_videos/input_video.mp4" 187 | ) 188 | parser.add_argument("-o", "--output_path", type=str, default="./outputs/") 189 | parser.add_argument("--pose_model", type=str, default="dwpose") 190 | args = parser.parse_args() 191 | 192 | if not os.path.exists(args.output_path): 193 | os.makedirs(args.output_path) 194 | 195 | main(args.input_path, args.output_path, args.pose_model) 196 | -------------------------------------------------------------------------------- /yolox_l_8xb8-300e_coco.py: -------------------------------------------------------------------------------- 1 | img_scale = (640, 640) # width, height 2 | 3 | # model settings 4 | model = dict( 5 | type='YOLOX', 6 | data_preprocessor=dict( 7 | type='DetDataPreprocessor', 8 | pad_size_divisor=32, 9 | batch_augments=[ 10 | dict( 11 | type='BatchSyncRandomResize', 12 | random_size_range=(480, 800), 13 | size_divisor=32, 14 | interval=10) 15 | ]), 16 | backbone=dict( 17 | type='CSPDarknet', 18 | deepen_factor=1.0, 19 | widen_factor=1.0, 20 | out_indices=(2, 3, 4), 21 | use_depthwise=False, 22 | spp_kernal_sizes=(5, 9, 13), 23 | norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), 24 | act_cfg=dict(type='Swish'), 25 | ), 26 | neck=dict( 27 | type='YOLOXPAFPN', 28 | in_channels=[256, 512, 1024], 29 | out_channels=256, 30 | num_csp_blocks=3, 31 | use_depthwise=False, 32 | upsample_cfg=dict(scale_factor=2, mode='nearest'), 33 | norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), 34 | act_cfg=dict(type='Swish')), 35 | bbox_head=dict( 36 | type='YOLOXHead', 37 | num_classes=80, 38 | in_channels=256, 39 | feat_channels=256, 40 | stacked_convs=2, 41 | strides=(8, 16, 32), 42 | use_depthwise=False, 43 | norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), 44 | act_cfg=dict(type='Swish'), 45 | loss_cls=dict( 46 | type='CrossEntropyLoss', 47 | use_sigmoid=True, 48 | reduction='sum', 49 | loss_weight=1.0), 50 | loss_bbox=dict( 51 | type='IoULoss', 52 | mode='square', 53 | eps=1e-16, 54 | reduction='sum', 55 | loss_weight=5.0), 56 | loss_obj=dict( 57 | type='CrossEntropyLoss', 58 | use_sigmoid=True, 59 | reduction='sum', 60 | loss_weight=1.0), 61 | loss_l1=dict(type='L1Loss', reduction='sum', loss_weight=1.0)), 62 | train_cfg=dict(assigner=dict(type='SimOTAAssigner', center_radius=2.5)), 63 | # In order to align the source code, the threshold of the val phase is 64 | # 0.01, and the threshold of the test phase is 0.001. 
65 | test_cfg=dict(score_thr=0.01, nms=dict(type='nms', iou_threshold=0.65))) 66 | 67 | # dataset settings 68 | data_root = 'data/coco/' 69 | dataset_type = 'CocoDataset' 70 | 71 | # Example to use different file client 72 | # Method 1: simply set the data root and let the file I/O module 73 | # automatically infer from prefix (not support LMDB and Memcache yet) 74 | 75 | # data_root = 's3://openmmlab/datasets/detection/coco/' 76 | 77 | # Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 78 | # backend_args = dict( 79 | # backend='petrel', 80 | # path_mapping=dict({ 81 | # './data/': 's3://openmmlab/datasets/detection/', 82 | # 'data/': 's3://openmmlab/datasets/detection/' 83 | # })) 84 | backend_args = None 85 | 86 | train_pipeline = [ 87 | dict(type='Mosaic', img_scale=img_scale, pad_val=114.0), 88 | dict( 89 | type='RandomAffine', 90 | scaling_ratio_range=(0.1, 2), 91 | # img_scale is (width, height) 92 | border=(-img_scale[0] // 2, -img_scale[1] // 2)), 93 | dict( 94 | type='MixUp', 95 | img_scale=img_scale, 96 | ratio_range=(0.8, 1.6), 97 | pad_val=114.0), 98 | dict(type='YOLOXHSVRandomAug'), 99 | dict(type='RandomFlip', prob=0.5), 100 | # According to the official implementation, multi-scale 101 | # training is not considered here but in the 102 | # 'mmdet/models/detectors/yolox.py'. 103 | # Resize and Pad are for the last 15 epochs when Mosaic, 104 | # RandomAffine, and MixUp are closed by YOLOXModeSwitchHook. 105 | dict(type='Resize', scale=img_scale, keep_ratio=True), 106 | dict( 107 | type='Pad', 108 | pad_to_square=True, 109 | # If the image is three-channel, the pad value needs 110 | # to be set separately for each channel. 111 | pad_val=dict(img=(114.0, 114.0, 114.0))), 112 | dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1), keep_empty=False), 113 | dict(type='PackDetInputs') 114 | ] 115 | 116 | train_dataset = dict( 117 | # use MultiImageMixDataset wrapper to support mosaic and mixup 118 | type='MultiImageMixDataset', 119 | dataset=dict( 120 | type=dataset_type, 121 | data_root=data_root, 122 | ann_file='annotations/instances_train2017.json', 123 | data_prefix=dict(img='train2017/'), 124 | pipeline=[ 125 | dict(type='LoadImageFromFile', backend_args=backend_args), 126 | dict(type='LoadAnnotations', with_bbox=True) 127 | ], 128 | filter_cfg=dict(filter_empty_gt=False, min_size=32), 129 | backend_args=backend_args), 130 | pipeline=train_pipeline) 131 | 132 | test_pipeline = [ 133 | dict(type='LoadImageFromFile', backend_args=backend_args), 134 | dict(type='Resize', scale=img_scale, keep_ratio=True), 135 | dict( 136 | type='Pad', 137 | pad_to_square=True, 138 | pad_val=dict(img=(114.0, 114.0, 114.0))), 139 | dict(type='LoadAnnotations', with_bbox=True), 140 | dict( 141 | type='PackDetInputs', 142 | meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 143 | 'scale_factor')) 144 | ] 145 | 146 | train_dataloader = dict( 147 | batch_size=8, 148 | num_workers=4, 149 | persistent_workers=True, 150 | sampler=dict(type='DefaultSampler', shuffle=True), 151 | dataset=train_dataset) 152 | val_dataloader = dict( 153 | batch_size=8, 154 | num_workers=4, 155 | persistent_workers=True, 156 | drop_last=False, 157 | sampler=dict(type='DefaultSampler', shuffle=False), 158 | dataset=dict( 159 | type=dataset_type, 160 | data_root=data_root, 161 | ann_file='annotations/instances_val2017.json', 162 | data_prefix=dict(img='val2017/'), 163 | test_mode=True, 164 | pipeline=test_pipeline, 165 | backend_args=backend_args)) 166 | test_dataloader = val_dataloader 167 | 
168 | val_evaluator = dict( 169 | type='CocoMetric', 170 | ann_file=data_root + 'annotations/instances_val2017.json', 171 | metric='bbox', 172 | backend_args=backend_args) 173 | test_evaluator = val_evaluator 174 | 175 | # training settings 176 | max_epochs = 300 177 | num_last_epochs = 15 178 | interval = 10 179 | 180 | train_cfg = dict(max_epochs=max_epochs, val_interval=interval) 181 | 182 | # optimizer 183 | # default 8 gpu 184 | base_lr = 0.01 185 | optim_wrapper = dict( 186 | type='OptimWrapper', 187 | optimizer=dict( 188 | type='SGD', lr=base_lr, momentum=0.9, weight_decay=5e-4, 189 | nesterov=True), 190 | paramwise_cfg=dict(norm_decay_mult=0., bias_decay_mult=0.)) 191 | 192 | # learning rate 193 | param_scheduler = [ 194 | dict( 195 | # use quadratic formula to warm up 5 epochs 196 | # and lr is updated by iteration 197 | # TODO: fix default scope in get function 198 | type='mmdet.QuadraticWarmupLR', 199 | by_epoch=True, 200 | begin=0, 201 | end=5, 202 | convert_to_iter_based=True), 203 | dict( 204 | # use cosine lr from 5 to 285 epoch 205 | type='CosineAnnealingLR', 206 | eta_min=base_lr * 0.05, 207 | begin=5, 208 | T_max=max_epochs - num_last_epochs, 209 | end=max_epochs - num_last_epochs, 210 | by_epoch=True, 211 | convert_to_iter_based=True), 212 | dict( 213 | # use fixed lr during last 15 epochs 214 | type='ConstantLR', 215 | by_epoch=True, 216 | factor=1, 217 | begin=max_epochs - num_last_epochs, 218 | end=max_epochs, 219 | ) 220 | ] 221 | 222 | default_hooks = dict( 223 | checkpoint=dict( 224 | interval=interval, 225 | max_keep_ckpts=3 # only keep latest 3 checkpoints 226 | )) 227 | 228 | custom_hooks = [ 229 | dict( 230 | type='YOLOXModeSwitchHook', 231 | num_last_epochs=num_last_epochs, 232 | priority=48), 233 | dict(type='SyncNormHook', priority=48), 234 | dict( 235 | type='EMAHook', 236 | ema_type='ExpMomentumEMA', 237 | momentum=0.0001, 238 | update_buffers=True, 239 | priority=49) 240 | ] 241 | 242 | # NOTE: `auto_scale_lr` is for automatically scaling LR, 243 | # USER SHOULD NOT CHANGE ITS VALUES. 244 | # base_batch_size = (8 GPUs) x (8 samples per GPU) 245 | auto_scale_lr = dict(base_batch_size=64) 246 | --------------------------------------------------------------------------------
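The two MM-style config files above — yolox_l_8xb8-300e_coco.py (person detector) and dwpose-l_384x288.py (whole-body pose model) — are consumed by the DWposeDetector used in video2openpose2.py. The sketch below simply mirrors that call; the frame filename is a placeholder, and checkpoint resolution is assumed to be handled by controlnet_aux, as in the script itself.

```python
import torch
from PIL import Image
from controlnet_aux import DWposeDetector

# Same construction as in video2openpose2.py: the YOLOX config drives person
# detection, the DWPose config drives whole-body keypoint estimation.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dwpose = DWposeDetector(
    det_config="yolox_l_8xb8-300e_coco.py",  # detector config from this repo
    pose_config="dwpose-l_384x288.py",       # pose config from this repo
    device=device,
)

# "frame.jpg" stands in for one extracted video frame; as in
# get_openpose_filter(), the detector returns a saveable pose image.
pose_map = dwpose(Image.open("frame.jpg"))
pose_map.save("pose_frame.jpg")
```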