├── .idea ├── .gitignore ├── Wav2Lip-Ultra.iml ├── inspectionProfiles │ ├── Project_Default.xml │ └── profiles_settings.xml ├── misc.xml ├── modules.xml └── vcs.xml └── script └── syncnet_python ├── .gitignore ├── LICENSE.md ├── README.md ├── SyncNetInstance.py ├── SyncNetModel.py ├── demo_feature.py ├── demo_syncnet.py ├── detectors ├── README.md ├── __init__.py └── s3fd │ ├── __init__.py │ ├── box_utils.py │ └── nets.py ├── download_model.sh ├── requirements.txt ├── run_pipeline.py ├── run_pipeline_dir.py ├── run_syncnet.py ├── run_syncnet_dir.py ├── run_visualise.py └── syncnet_videos.py /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Editor-based HTTP Client requests 5 | /httpRequests/ 6 | # Datasource local storage ignored files 7 | /dataSources/ 8 | /dataSources.local.xml 9 | -------------------------------------------------------------------------------- /.idea/Wav2Lip-Ultra.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 32 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /script/syncnet_python/.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled source # 2 | ################### 3 | *.com 4 | *.class 5 | *.dll 6 | *.exe 7 | *.o 8 | *.so 9 | *.pyc 10 | 11 | # Packages # 12 | ############ 13 | # it's better to unpack these files and commit the raw source 14 | # git has its own built in compression methods 15 | *.7z 16 | *.dmg 17 | *.gz 18 | *.iso 19 | *.jar 20 | *.rar 21 | *.tar 22 | *.zip 23 | 24 | # Logs and databases # 25 | ###################### 26 | *.log 27 | *.sql 28 | *.sqlite 29 | 30 | # OS generated files # 31 | ###################### 32 | .DS_Store 33 | .DS_Store? 34 | ._* 35 | .Spotlight-V100 36 | .Trashes 37 | ehthumbs.db 38 | Thumbs.db 39 | 40 | # Specific to this demo # 41 | ######################### 42 | data/ 43 | protos/ 44 | utils/ 45 | *.pth 46 | -------------------------------------------------------------------------------- /script/syncnet_python/LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016-present Joon Son Chung. 
2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /script/syncnet_python/README.md: -------------------------------------------------------------------------------- 1 | # SyncNet 2 | 3 | This repository contains the demo for the audio-to-video synchronisation network (SyncNet). This network can be used for audio-visual synchronisation tasks including: 4 | 1. Removing temporal lags between the audio and visual streams in a video; 5 | 2. Determining who is speaking amongst multiple faces in a video. 6 | 7 | Please cite the paper below if you make use of the software. 8 | 9 | ## Dependencies 10 | ``` 11 | pip install -r requirements.txt 12 | ``` 13 | 14 | In addition, `ffmpeg` is required. 15 | 16 | 17 | ## Demo 18 | 19 | SyncNet demo: 20 | ``` 21 | python demo_syncnet.py --videofile data/example.avi --tmp_dir /path/to/temp/directory 22 | ``` 23 | 24 | Check that this script returns: 25 | ``` 26 | AV offset: 3 27 | Min dist: 5.353 28 | Confidence: 10.021 29 | ``` 30 | 31 | Full pipeline: 32 | ``` 33 | sh download_model.sh 34 | python run_pipeline.py --videofile /path/to/video.mp4 --reference name_of_video --data_dir /path/to/output 35 | python run_syncnet.py --videofile /path/to/video.mp4 --reference name_of_video --data_dir /path/to/output 36 | python run_visualise.py --videofile /path/to/video.mp4 --reference name_of_video --data_dir /path/to/output 37 | ``` 38 | 39 | Outputs: 40 | ``` 41 | $DATA_DIR/pycrop/$REFERENCE/*.avi - cropped face tracks 42 | $DATA_DIR/pywork/$REFERENCE/offsets.txt - audio-video offset values 43 | $DATA_DIR/pyavi/$REFERENCE/video_out.avi - output video (as shown below) 44 | ``` 45 |

46 | [example output images] 47 | 48 |
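The demo can also be driven from Python. The sketch below mirrors what `demo_syncnet.py` in this repository does internally; the paths, batch size and `vshift` search range are illustrative values, and a CUDA-capable GPU plus `ffmpeg` are assumed.
```
# Minimal programmatic use of SyncNet, following demo_syncnet.py (illustrative values).
import argparse
from SyncNetInstance import SyncNetInstance

opt = argparse.Namespace(
    initial_model="data/syncnet_v2.model",  # weights fetched by download_model.sh
    batch_size=20,                          # frames per forward pass
    vshift=15,                              # +/- frame range searched for the AV offset
    tmp_dir="data/work/pytmp",              # scratch space for extracted frames/audio
    reference="demo",                       # sub-directory name inside tmp_dir
    videofile="data/example.avi",
)

s = SyncNetInstance()
s.loadParameters(opt.initial_model)
offset = s.evaluate(opt, videofile=opt.videofile)  # prints AV offset, min dist and confidence
```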

49 | 50 | ## Publications 51 | 52 | ``` 53 | @InProceedings{Chung16a, 54 | author = "Chung, J.~S. and Zisserman, A.", 55 | title = "Out of time: automated lip sync in the wild", 56 | booktitle = "Workshop on Multi-view Lip-reading, ACCV", 57 | year = "2016", 58 | } 59 | ``` 60 | -------------------------------------------------------------------------------- /script/syncnet_python/SyncNetInstance.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #-*- coding: utf-8 -*- 3 | # Video 25 FPS, Audio 16000HZ 4 | 5 | import torch 6 | import numpy 7 | import time, pdb, argparse, subprocess, os, math, glob 8 | import cv2 9 | import python_speech_features 10 | 11 | from scipy import signal 12 | from scipy.io import wavfile 13 | from SyncNetModel import * 14 | from shutil import rmtree 15 | 16 | 17 | # ==================== Get OFFSET ==================== 18 | 19 | def calc_pdist(feat1, feat2, vshift=10): 20 | 21 | win_size = vshift*2+1 22 | 23 | feat2p = torch.nn.functional.pad(feat2,(0,0,vshift,vshift)) 24 | 25 | dists = [] 26 | 27 | for i in range(0,len(feat1)): 28 | 29 | dists.append(torch.nn.functional.pairwise_distance(feat1[[i],:].repeat(win_size, 1), feat2p[i:i+win_size,:])) 30 | 31 | return dists 32 | 33 | # ==================== MAIN DEF ==================== 34 | 35 | class SyncNetInstance(torch.nn.Module): 36 | 37 | def __init__(self, dropout = 0, num_layers_in_fc_layers = 1024): 38 | super(SyncNetInstance, self).__init__(); 39 | 40 | self.__S__ = S(num_layers_in_fc_layers = num_layers_in_fc_layers).cuda(); 41 | 42 | def evaluate(self, opt, videofile): 43 | 44 | self.__S__.eval(); 45 | 46 | # ========== ========== 47 | # Convert files 48 | # ========== ========== 49 | 50 | if os.path.exists(os.path.join(opt.tmp_dir,opt.reference)): 51 | rmtree(os.path.join(opt.tmp_dir,opt.reference)) 52 | 53 | os.makedirs(os.path.join(opt.tmp_dir,opt.reference)) 54 | 55 | command = ("ffmpeg -y -i %s -threads 1 -f image2 %s" % (videofile,os.path.join(opt.tmp_dir,opt.reference,'%06d.jpg'))) 56 | output = subprocess.call(command, shell=True, stdout=None) 57 | 58 | command = ("ffmpeg -y -i %s -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 %s" % (videofile,os.path.join(opt.tmp_dir,opt.reference,'audio.wav'))) 59 | output = subprocess.call(command, shell=True, stdout=None) 60 | 61 | # ========== ========== 62 | # Load video 63 | # ========== ========== 64 | 65 | images = [] 66 | 67 | flist = glob.glob(os.path.join(opt.tmp_dir,opt.reference,'*.jpg')) 68 | flist.sort() 69 | 70 | for fname in flist: 71 | images.append(cv2.imread(fname)) 72 | 73 | im = numpy.stack(images,axis=3) 74 | im = numpy.expand_dims(im,axis=0) 75 | im = numpy.transpose(im,(0,3,4,1,2)) 76 | 77 | imtv = torch.autograd.Variable(torch.from_numpy(im.astype(float)).float()) 78 | 79 | # ========== ========== 80 | # Load audio 81 | # ========== ========== 82 | 83 | sample_rate, audio = wavfile.read(os.path.join(opt.tmp_dir,opt.reference,'audio.wav')) 84 | mfcc = zip(*python_speech_features.mfcc(audio,sample_rate)) 85 | mfcc = numpy.stack([numpy.array(i) for i in mfcc]) 86 | 87 | cc = numpy.expand_dims(numpy.expand_dims(mfcc,axis=0),axis=0) 88 | cct = torch.autograd.Variable(torch.from_numpy(cc.astype(float)).float()) 89 | 90 | # ========== ========== 91 | # Check audio and video input length 92 | # ========== ========== 93 | 94 | if (float(len(audio))/16000) != (float(len(images))/25) : 95 | print("WARNING: Audio (%.4fs) and video (%.4fs) lengths are 
different."%(float(len(audio))/16000,float(len(images))/25)) 96 | 97 | min_length = min(len(images),math.floor(len(audio)/640)) 98 | 99 | # ========== ========== 100 | # Generate video and audio feats 101 | # ========== ========== 102 | 103 | lastframe = min_length-5 104 | im_feat = [] 105 | cc_feat = [] 106 | 107 | tS = time.time() 108 | for i in range(0,lastframe,opt.batch_size): 109 | 110 | im_batch = [ imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i,min(lastframe,i+opt.batch_size)) ] 111 | im_in = torch.cat(im_batch,0) 112 | im_out = self.__S__.forward_lip(im_in.cuda()); 113 | im_feat.append(im_out.data.cpu()) 114 | 115 | cc_batch = [ cct[:,:,:,vframe*4:vframe*4+20] for vframe in range(i,min(lastframe,i+opt.batch_size)) ] 116 | cc_in = torch.cat(cc_batch,0) 117 | cc_out = self.__S__.forward_aud(cc_in.cuda()) 118 | cc_feat.append(cc_out.data.cpu()) 119 | 120 | im_feat = torch.cat(im_feat,0) 121 | cc_feat = torch.cat(cc_feat,0) 122 | 123 | # ========== ========== 124 | # Compute offset 125 | # ========== ========== 126 | 127 | print('Compute time %.3f sec.' % (time.time()-tS)) 128 | 129 | dists = calc_pdist(im_feat,cc_feat,vshift=opt.vshift) 130 | mdist = torch.mean(torch.stack(dists,1),1) 131 | 132 | minval, minidx = torch.min(mdist,0) 133 | 134 | offset = opt.vshift-minidx 135 | conf = torch.median(mdist) - minval 136 | 137 | fdist = numpy.stack([dist[minidx].numpy() for dist in dists]) 138 | # fdist = numpy.pad(fdist, (3,3), 'constant', constant_values=15) 139 | fconf = torch.median(mdist).numpy() - fdist 140 | fconfm = signal.medfilt(fconf,kernel_size=9) 141 | 142 | numpy.set_printoptions(formatter={'float': '{: 0.3f}'.format}) 143 | print('Framewise conf: ') 144 | print(fconfm) 145 | print('AV offset: \t%d \nMin dist: \t%.3f\nConfidence: \t%.3f' % (offset,minval,conf)) 146 | 147 | #dists_npy = numpy.array([ dist.numpy() for dist in dists ]) 148 | return offset 149 | 150 | def extract_feature(self, opt, videofile): 151 | 152 | self.__S__.eval(); 153 | 154 | # ========== ========== 155 | # Load video 156 | # ========== ========== 157 | cap = cv2.VideoCapture(videofile) 158 | 159 | frame_num = 1; 160 | images = [] 161 | while frame_num: 162 | frame_num += 1 163 | ret, image = cap.read() 164 | if ret == 0: 165 | break 166 | 167 | images.append(image) 168 | 169 | im = numpy.stack(images,axis=3) 170 | im = numpy.expand_dims(im,axis=0) 171 | im = numpy.transpose(im,(0,3,4,1,2)) 172 | 173 | imtv = torch.autograd.Variable(torch.from_numpy(im.astype(float)).float()) 174 | 175 | # ========== ========== 176 | # Generate video feats 177 | # ========== ========== 178 | 179 | lastframe = len(images)-4 180 | im_feat = [] 181 | 182 | tS = time.time() 183 | for i in range(0,lastframe,opt.batch_size): 184 | 185 | im_batch = [ imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i,min(lastframe,i+opt.batch_size)) ] 186 | im_in = torch.cat(im_batch,0) 187 | im_out = self.__S__.forward_lipfeat(im_in.cuda()); 188 | im_feat.append(im_out.data.cpu()) 189 | 190 | im_feat = torch.cat(im_feat,0) 191 | 192 | # ========== ========== 193 | # Compute offset 194 | # ========== ========== 195 | 196 | print('Compute time %.3f sec.' 
% (time.time()-tS)) 197 | 198 | return im_feat 199 | 200 | 201 | def loadParameters(self, path): 202 | loaded_state = torch.load(path, map_location=lambda storage, loc: storage); 203 | 204 | self_state = self.__S__.state_dict(); 205 | 206 | for name, param in loaded_state.items(): 207 | 208 | self_state[name].copy_(param); 209 | -------------------------------------------------------------------------------- /script/syncnet_python/SyncNetModel.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #-*- coding: utf-8 -*- 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | def save(model, filename): 8 | with open(filename, "wb") as f: 9 | torch.save(model, f); 10 | print("%s saved."%filename); 11 | 12 | def load(filename): 13 | net = torch.load(filename) 14 | return net; 15 | 16 | class S(nn.Module): 17 | def __init__(self, num_layers_in_fc_layers = 1024): 18 | super(S, self).__init__(); 19 | 20 | self.__nFeatures__ = 24; 21 | self.__nChs__ = 32; 22 | self.__midChs__ = 32; 23 | 24 | self.netcnnaud = nn.Sequential( 25 | nn.Conv2d(1, 64, kernel_size=(3,3), stride=(1,1), padding=(1,1)), 26 | nn.BatchNorm2d(64), 27 | nn.ReLU(inplace=True), 28 | nn.MaxPool2d(kernel_size=(1,1), stride=(1,1)), 29 | 30 | nn.Conv2d(64, 192, kernel_size=(3,3), stride=(1,1), padding=(1,1)), 31 | nn.BatchNorm2d(192), 32 | nn.ReLU(inplace=True), 33 | nn.MaxPool2d(kernel_size=(3,3), stride=(1,2)), 34 | 35 | nn.Conv2d(192, 384, kernel_size=(3,3), padding=(1,1)), 36 | nn.BatchNorm2d(384), 37 | nn.ReLU(inplace=True), 38 | 39 | nn.Conv2d(384, 256, kernel_size=(3,3), padding=(1,1)), 40 | nn.BatchNorm2d(256), 41 | nn.ReLU(inplace=True), 42 | 43 | nn.Conv2d(256, 256, kernel_size=(3,3), padding=(1,1)), 44 | nn.BatchNorm2d(256), 45 | nn.ReLU(inplace=True), 46 | nn.MaxPool2d(kernel_size=(3,3), stride=(2,2)), 47 | 48 | nn.Conv2d(256, 512, kernel_size=(5,4), padding=(0,0)), 49 | nn.BatchNorm2d(512), 50 | nn.ReLU(), 51 | ); 52 | 53 | self.netfcaud = nn.Sequential( 54 | nn.Linear(512, 512), 55 | nn.BatchNorm1d(512), 56 | nn.ReLU(), 57 | nn.Linear(512, num_layers_in_fc_layers), 58 | ); 59 | 60 | self.netfclip = nn.Sequential( 61 | nn.Linear(512, 512), 62 | nn.BatchNorm1d(512), 63 | nn.ReLU(), 64 | nn.Linear(512, num_layers_in_fc_layers), 65 | ); 66 | 67 | self.netcnnlip = nn.Sequential( 68 | nn.Conv3d(3, 96, kernel_size=(5,7,7), stride=(1,2,2), padding=0), 69 | nn.BatchNorm3d(96), 70 | nn.ReLU(inplace=True), 71 | nn.MaxPool3d(kernel_size=(1,3,3), stride=(1,2,2)), 72 | 73 | nn.Conv3d(96, 256, kernel_size=(1,5,5), stride=(1,2,2), padding=(0,1,1)), 74 | nn.BatchNorm3d(256), 75 | nn.ReLU(inplace=True), 76 | nn.MaxPool3d(kernel_size=(1,3,3), stride=(1,2,2), padding=(0,1,1)), 77 | 78 | nn.Conv3d(256, 256, kernel_size=(1,3,3), padding=(0,1,1)), 79 | nn.BatchNorm3d(256), 80 | nn.ReLU(inplace=True), 81 | 82 | nn.Conv3d(256, 256, kernel_size=(1,3,3), padding=(0,1,1)), 83 | nn.BatchNorm3d(256), 84 | nn.ReLU(inplace=True), 85 | 86 | nn.Conv3d(256, 256, kernel_size=(1,3,3), padding=(0,1,1)), 87 | nn.BatchNorm3d(256), 88 | nn.ReLU(inplace=True), 89 | nn.MaxPool3d(kernel_size=(1,3,3), stride=(1,2,2)), 90 | 91 | nn.Conv3d(256, 512, kernel_size=(1,6,6), padding=0), 92 | nn.BatchNorm3d(512), 93 | nn.ReLU(inplace=True), 94 | ); 95 | 96 | def forward_aud(self, x): 97 | 98 | mid = self.netcnnaud(x); # N x ch x 24 x M 99 | mid = mid.view((mid.size()[0], -1)); # N x (ch x 24) 100 | out = self.netfcaud(mid); 101 | 102 | return out; 103 | 104 | def forward_lip(self, x): 105 | 106 | mid = 
self.netcnnlip(x); 107 | mid = mid.view((mid.size()[0], -1)); # N x (ch x 24) 108 | out = self.netfclip(mid); 109 | 110 | return out; 111 | 112 | def forward_lipfeat(self, x): 113 | 114 | mid = self.netcnnlip(x); 115 | out = mid.view((mid.size()[0], -1)); # N x (ch x 24) 116 | 117 | return out; -------------------------------------------------------------------------------- /script/syncnet_python/demo_feature.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #-*- coding: utf-8 -*- 3 | 4 | import time, pdb, argparse, subprocess 5 | 6 | from SyncNetInstance import * 7 | 8 | # ==================== LOAD PARAMS ==================== 9 | 10 | 11 | parser = argparse.ArgumentParser(description = "SyncNet"); 12 | 13 | parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help=''); 14 | parser.add_argument('--batch_size', type=int, default='20', help=''); 15 | parser.add_argument('--vshift', type=int, default='15', help=''); 16 | parser.add_argument('--videofile', type=str, default="data/example.avi", help=''); 17 | parser.add_argument('--tmp_dir', type=str, default="data", help=''); 18 | parser.add_argument('--save_as', type=str, default="data/features.pt", help=''); 19 | 20 | opt = parser.parse_args(); 21 | 22 | 23 | # ==================== RUN EVALUATION ==================== 24 | 25 | s = SyncNetInstance(); 26 | 27 | s.loadParameters(opt.initial_model); 28 | print("Model %s loaded."%opt.initial_model); 29 | 30 | feats = s.extract_feature(opt, videofile=opt.videofile) 31 | 32 | torch.save(feats, opt.save_as) 33 | -------------------------------------------------------------------------------- /script/syncnet_python/demo_syncnet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #-*- coding: utf-8 -*- 3 | 4 | import time, pdb, argparse, subprocess 5 | 6 | from SyncNetInstance import * 7 | 8 | # ==================== LOAD PARAMS ==================== 9 | 10 | 11 | parser = argparse.ArgumentParser(description = "SyncNet"); 12 | 13 | parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help=''); 14 | parser.add_argument('--batch_size', type=int, default='16', help=''); 15 | parser.add_argument('--vshift', type=int, default='15', help=''); 16 | parser.add_argument('--videofile', type=str, default="data/example.mp4", help=''); 17 | parser.add_argument('--tmp_dir', type=str, default="data/work/pytmp", help=''); 18 | parser.add_argument('--reference', type=str, default="demo", help=''); 19 | 20 | opt = parser.parse_args(); 21 | 22 | 23 | # ==================== RUN EVALUATION ==================== 24 | 25 | s = SyncNetInstance(); 26 | 27 | s.loadParameters(opt.initial_model); 28 | print("Model %s loaded."%opt.initial_model); 29 | 30 | s.evaluate(opt, videofile=opt.videofile) 31 | -------------------------------------------------------------------------------- /script/syncnet_python/detectors/README.md: -------------------------------------------------------------------------------- 1 | # Face detector 2 | 3 | This face detector is adapted from `https://github.com/cs-giung/face-detection-pytorch`. 
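A minimal usage sketch, mirroring `inference_video()` in `run_pipeline.py`: it assumes the weights have been fetched into `detectors/s3fd/weights/` by `download_model.sh`, that the script is run from the `syncnet_python` directory, that a CUDA device is available, and that `frame.jpg` stands in for any frame you want to run detection on.
```
# Illustrative S3FD usage (hypothetical input file, CUDA assumed).
import cv2
from detectors import S3FD

det = S3FD(device='cuda')                           # loads detectors/s3fd/weights/sfd_face.pth

image = cv2.imread('frame.jpg')                     # any BGR frame
image_np = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)   # the detector expects RGB
bboxes = det.detect_faces(image_np, conf_th=0.9, scales=[0.25])
# bboxes: one row per detected face, (x1, y1, x2, y2, confidence) in original image pixels
```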
4 | -------------------------------------------------------------------------------- /script/syncnet_python/detectors/__init__.py: -------------------------------------------------------------------------------- 1 | from .s3fd import S3FD -------------------------------------------------------------------------------- /script/syncnet_python/detectors/s3fd/__init__.py: -------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | import cv2 4 | import torch 5 | from torchvision import transforms 6 | from .nets import S3FDNet 7 | from .box_utils import nms_ 8 | 9 | PATH_WEIGHT = './detectors/s3fd/weights/sfd_face.pth' 10 | img_mean = np.array([104., 117., 123.])[:, np.newaxis, np.newaxis].astype('float32') 11 | 12 | 13 | class S3FD(): 14 | 15 | def __init__(self, device='cuda'): 16 | 17 | tstamp = time.time() 18 | self.device = device 19 | 20 | print('[S3FD] loading with', self.device) 21 | self.net = S3FDNet(device=self.device).to(self.device) 22 | state_dict = torch.load(PATH_WEIGHT, map_location=self.device) 23 | self.net.load_state_dict(state_dict) 24 | self.net.eval() 25 | print('[S3FD] finished loading (%.4f sec)' % (time.time() - tstamp)) 26 | 27 | def detect_faces(self, image, conf_th=0.8, scales=[1]): 28 | 29 | w, h = image.shape[1], image.shape[0] 30 | 31 | bboxes = np.empty(shape=(0, 5)) 32 | 33 | with torch.no_grad(): 34 | for s in scales: 35 | scaled_img = cv2.resize(image, dsize=(0, 0), fx=s, fy=s, interpolation=cv2.INTER_LINEAR) 36 | 37 | scaled_img = np.swapaxes(scaled_img, 1, 2) 38 | scaled_img = np.swapaxes(scaled_img, 1, 0) 39 | scaled_img = scaled_img[[2, 1, 0], :, :] 40 | scaled_img = scaled_img.astype('float32') 41 | scaled_img -= img_mean 42 | scaled_img = scaled_img[[2, 1, 0], :, :] 43 | x = torch.from_numpy(scaled_img).unsqueeze(0).to(self.device) 44 | y = self.net(x) 45 | 46 | detections = y.data 47 | scale = torch.Tensor([w, h, w, h]) 48 | 49 | for i in range(detections.size(1)): 50 | j = 0 51 | while detections[0, i, j, 0] > conf_th: 52 | score = detections[0, i, j, 0] 53 | pt = (detections[0, i, j, 1:] * scale).cpu().numpy() 54 | bbox = (pt[0], pt[1], pt[2], pt[3], score) 55 | bboxes = np.vstack((bboxes, bbox)) 56 | j += 1 57 | 58 | keep = nms_(bboxes, 0.1) 59 | bboxes = bboxes[keep] 60 | 61 | return bboxes 62 | -------------------------------------------------------------------------------- /script/syncnet_python/detectors/s3fd/box_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from itertools import product as product 3 | import torch 4 | from torch.autograd import Function 5 | 6 | 7 | def nms_(dets, thresh): 8 | """ 9 | Courtesy of Ross Girshick 10 | [https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/nms/py_cpu_nms.py] 11 | """ 12 | x1 = dets[:, 0] 13 | y1 = dets[:, 1] 14 | x2 = dets[:, 2] 15 | y2 = dets[:, 3] 16 | scores = dets[:, 4] 17 | 18 | areas = (x2 - x1) * (y2 - y1) 19 | order = scores.argsort()[::-1] 20 | 21 | keep = [] 22 | while order.size > 0: 23 | i = order[0] 24 | keep.append(int(i)) 25 | xx1 = np.maximum(x1[i], x1[order[1:]]) 26 | yy1 = np.maximum(y1[i], y1[order[1:]]) 27 | xx2 = np.minimum(x2[i], x2[order[1:]]) 28 | yy2 = np.minimum(y2[i], y2[order[1:]]) 29 | 30 | w = np.maximum(0.0, xx2 - xx1) 31 | h = np.maximum(0.0, yy2 - yy1) 32 | inter = w * h 33 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 34 | 35 | inds = np.where(ovr <= thresh)[0] 36 | order = order[inds + 1] 37 | 38 | return 
np.array(keep).astype(np.int_) 39 | 40 | 41 | def decode(loc, priors, variances): 42 | """Decode locations from predictions using priors to undo 43 | the encoding we did for offset regression at train time. 44 | Args: 45 | loc (tensor): location predictions for loc layers, 46 | Shape: [num_priors,4] 47 | priors (tensor): Prior boxes in center-offset form. 48 | Shape: [num_priors,4]. 49 | variances: (list[float]) Variances of priorboxes 50 | Return: 51 | decoded bounding box predictions 52 | """ 53 | 54 | boxes = torch.cat(( 55 | priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:], 56 | priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1) 57 | boxes[:, :2] -= boxes[:, 2:] / 2 58 | boxes[:, 2:] += boxes[:, :2] 59 | return boxes 60 | 61 | 62 | def nms(boxes, scores, overlap=0.5, top_k=200): 63 | """Apply non-maximum suppression at test time to avoid detecting too many 64 | overlapping bounding boxes for a given object. 65 | Args: 66 | boxes: (tensor) The location preds for the img, Shape: [num_priors,4]. 67 | scores: (tensor) The class predscores for the img, Shape:[num_priors]. 68 | overlap: (float) The overlap thresh for suppressing unnecessary boxes. 69 | top_k: (int) The Maximum number of box preds to consider. 70 | Return: 71 | The indices of the kept boxes with respect to num_priors. 72 | """ 73 | 74 | keep = scores.new(scores.size(0)).zero_().long() 75 | if boxes.numel() == 0: 76 | return keep, 0 77 | x1 = boxes[:, 0] 78 | y1 = boxes[:, 1] 79 | x2 = boxes[:, 2] 80 | y2 = boxes[:, 3] 81 | area = torch.mul(x2 - x1, y2 - y1) 82 | v, idx = scores.sort(0) # sort in ascending order 83 | # I = I[v >= 0.01] 84 | idx = idx[-top_k:] # indices of the top-k largest vals 85 | xx1 = boxes.new() 86 | yy1 = boxes.new() 87 | xx2 = boxes.new() 88 | yy2 = boxes.new() 89 | w = boxes.new() 90 | h = boxes.new() 91 | 92 | # keep = torch.Tensor() 93 | count = 0 94 | while idx.numel() > 0: 95 | i = idx[-1] # index of current largest val 96 | # keep.append(i) 97 | keep[count] = i 98 | count += 1 99 | if idx.size(0) == 1: 100 | break 101 | idx = idx[:-1] # remove kept element from view 102 | # load bboxes of next highest vals 103 | torch.index_select(x1, 0, idx, out=xx1) 104 | torch.index_select(y1, 0, idx, out=yy1) 105 | torch.index_select(x2, 0, idx, out=xx2) 106 | torch.index_select(y2, 0, idx, out=yy2) 107 | # store element-wise max with next highest score 108 | xx1 = torch.clamp(xx1, min=x1[i]) 109 | yy1 = torch.clamp(yy1, min=y1[i]) 110 | xx2 = torch.clamp(xx2, max=x2[i]) 111 | yy2 = torch.clamp(yy2, max=y2[i]) 112 | w.resize_as_(xx2) 113 | h.resize_as_(yy2) 114 | w = xx2 - xx1 115 | h = yy2 - yy1 116 | # check sizes of xx1 and xx2.. 
after each iteration 117 | w = torch.clamp(w, min=0.0) 118 | h = torch.clamp(h, min=0.0) 119 | inter = w * h 120 | # IoU = i / (area(a) + area(b) - i) 121 | rem_areas = torch.index_select(area, 0, idx) # load remaining areas) 122 | union = (rem_areas - inter) + area[i] 123 | IoU = inter / union # store result in iou 124 | # keep only elements with an IoU <= overlap 125 | idx = idx[IoU.le(overlap)] 126 | return keep, count 127 | 128 | 129 | class Detect(object): 130 | 131 | def __init__(self, num_classes=2, 132 | top_k=750, nms_thresh=0.3, conf_thresh=0.05, 133 | variance=[0.1, 0.2], nms_top_k=5000): 134 | 135 | self.num_classes = num_classes 136 | self.top_k = top_k 137 | self.nms_thresh = nms_thresh 138 | self.conf_thresh = conf_thresh 139 | self.variance = variance 140 | self.nms_top_k = nms_top_k 141 | 142 | def forward(self, loc_data, conf_data, prior_data): 143 | 144 | num = loc_data.size(0) 145 | num_priors = prior_data.size(0) 146 | 147 | conf_preds = conf_data.view(num, num_priors, self.num_classes).transpose(2, 1) 148 | batch_priors = prior_data.view(-1, num_priors, 4).expand(num, num_priors, 4) 149 | batch_priors = batch_priors.contiguous().view(-1, 4) 150 | 151 | decoded_boxes = decode(loc_data.view(-1, 4), batch_priors, self.variance) 152 | decoded_boxes = decoded_boxes.view(num, num_priors, 4) 153 | 154 | output = torch.zeros(num, self.num_classes, self.top_k, 5) 155 | 156 | for i in range(num): 157 | boxes = decoded_boxes[i].clone() 158 | conf_scores = conf_preds[i].clone() 159 | 160 | for cl in range(1, self.num_classes): 161 | c_mask = conf_scores[cl].gt(self.conf_thresh) 162 | scores = conf_scores[cl][c_mask] 163 | 164 | if scores.dim() == 0: 165 | continue 166 | l_mask = c_mask.unsqueeze(1).expand_as(boxes) 167 | boxes_ = boxes[l_mask].view(-1, 4) 168 | ids, count = nms(boxes_, scores, self.nms_thresh, self.nms_top_k) 169 | count = count if count < self.top_k else self.top_k 170 | 171 | output[i, cl, :count] = torch.cat((scores[ids[:count]].unsqueeze(1), boxes_[ids[:count]]), 1) 172 | 173 | return output 174 | 175 | 176 | class PriorBox(object): 177 | 178 | def __init__(self, input_size, feature_maps, 179 | variance=[0.1, 0.2], 180 | min_sizes=[16, 32, 64, 128, 256, 512], 181 | steps=[4, 8, 16, 32, 64, 128], 182 | clip=False): 183 | 184 | super(PriorBox, self).__init__() 185 | 186 | self.imh = input_size[0] 187 | self.imw = input_size[1] 188 | self.feature_maps = feature_maps 189 | 190 | self.variance = variance 191 | self.min_sizes = min_sizes 192 | self.steps = steps 193 | self.clip = clip 194 | 195 | def forward(self): 196 | mean = [] 197 | for k, fmap in enumerate(self.feature_maps): 198 | feath = fmap[0] 199 | featw = fmap[1] 200 | for i, j in product(range(feath), range(featw)): 201 | f_kw = self.imw / self.steps[k] 202 | f_kh = self.imh / self.steps[k] 203 | 204 | cx = (j + 0.5) / f_kw 205 | cy = (i + 0.5) / f_kh 206 | 207 | s_kw = self.min_sizes[k] / self.imw 208 | s_kh = self.min_sizes[k] / self.imh 209 | 210 | mean += [cx, cy, s_kw, s_kh] 211 | 212 | output = torch.FloatTensor(mean).view(-1, 4) 213 | 214 | if self.clip: 215 | output.clamp_(max=1, min=0) 216 | 217 | return output 218 | -------------------------------------------------------------------------------- /script/syncnet_python/detectors/s3fd/nets.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.nn.init as init 5 | from .box_utils import Detect, PriorBox 6 | 7 | 8 | class 
L2Norm(nn.Module): 9 | 10 | def __init__(self, n_channels, scale): 11 | super(L2Norm, self).__init__() 12 | self.n_channels = n_channels 13 | self.gamma = scale or None 14 | self.eps = 1e-10 15 | self.weight = nn.Parameter(torch.Tensor(self.n_channels)) 16 | self.reset_parameters() 17 | 18 | def reset_parameters(self): 19 | init.constant_(self.weight, self.gamma) 20 | 21 | def forward(self, x): 22 | norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps 23 | x = torch.div(x, norm) 24 | out = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as(x) * x 25 | return out 26 | 27 | 28 | class S3FDNet(nn.Module): 29 | 30 | def __init__(self, device='cuda'): 31 | super(S3FDNet, self).__init__() 32 | self.device = device 33 | 34 | self.vgg = nn.ModuleList([ 35 | nn.Conv2d(3, 64, 3, 1, padding=1), 36 | nn.ReLU(inplace=True), 37 | nn.Conv2d(64, 64, 3, 1, padding=1), 38 | nn.ReLU(inplace=True), 39 | nn.MaxPool2d(2, 2), 40 | 41 | nn.Conv2d(64, 128, 3, 1, padding=1), 42 | nn.ReLU(inplace=True), 43 | nn.Conv2d(128, 128, 3, 1, padding=1), 44 | nn.ReLU(inplace=True), 45 | nn.MaxPool2d(2, 2), 46 | 47 | nn.Conv2d(128, 256, 3, 1, padding=1), 48 | nn.ReLU(inplace=True), 49 | nn.Conv2d(256, 256, 3, 1, padding=1), 50 | nn.ReLU(inplace=True), 51 | nn.Conv2d(256, 256, 3, 1, padding=1), 52 | nn.ReLU(inplace=True), 53 | nn.MaxPool2d(2, 2, ceil_mode=True), 54 | 55 | nn.Conv2d(256, 512, 3, 1, padding=1), 56 | nn.ReLU(inplace=True), 57 | nn.Conv2d(512, 512, 3, 1, padding=1), 58 | nn.ReLU(inplace=True), 59 | nn.Conv2d(512, 512, 3, 1, padding=1), 60 | nn.ReLU(inplace=True), 61 | nn.MaxPool2d(2, 2), 62 | 63 | nn.Conv2d(512, 512, 3, 1, padding=1), 64 | nn.ReLU(inplace=True), 65 | nn.Conv2d(512, 512, 3, 1, padding=1), 66 | nn.ReLU(inplace=True), 67 | nn.Conv2d(512, 512, 3, 1, padding=1), 68 | nn.ReLU(inplace=True), 69 | nn.MaxPool2d(2, 2), 70 | 71 | nn.Conv2d(512, 1024, 3, 1, padding=6, dilation=6), 72 | nn.ReLU(inplace=True), 73 | nn.Conv2d(1024, 1024, 1, 1), 74 | nn.ReLU(inplace=True), 75 | ]) 76 | 77 | self.L2Norm3_3 = L2Norm(256, 10) 78 | self.L2Norm4_3 = L2Norm(512, 8) 79 | self.L2Norm5_3 = L2Norm(512, 5) 80 | 81 | self.extras = nn.ModuleList([ 82 | nn.Conv2d(1024, 256, 1, 1), 83 | nn.Conv2d(256, 512, 3, 2, padding=1), 84 | nn.Conv2d(512, 128, 1, 1), 85 | nn.Conv2d(128, 256, 3, 2, padding=1), 86 | ]) 87 | 88 | self.loc = nn.ModuleList([ 89 | nn.Conv2d(256, 4, 3, 1, padding=1), 90 | nn.Conv2d(512, 4, 3, 1, padding=1), 91 | nn.Conv2d(512, 4, 3, 1, padding=1), 92 | nn.Conv2d(1024, 4, 3, 1, padding=1), 93 | nn.Conv2d(512, 4, 3, 1, padding=1), 94 | nn.Conv2d(256, 4, 3, 1, padding=1), 95 | ]) 96 | 97 | self.conf = nn.ModuleList([ 98 | nn.Conv2d(256, 4, 3, 1, padding=1), 99 | nn.Conv2d(512, 2, 3, 1, padding=1), 100 | nn.Conv2d(512, 2, 3, 1, padding=1), 101 | nn.Conv2d(1024, 2, 3, 1, padding=1), 102 | nn.Conv2d(512, 2, 3, 1, padding=1), 103 | nn.Conv2d(256, 2, 3, 1, padding=1), 104 | ]) 105 | 106 | self.softmax = nn.Softmax(dim=-1) 107 | self.detect = Detect() 108 | 109 | def forward(self, x): 110 | size = x.size()[2:] 111 | sources = list() 112 | loc = list() 113 | conf = list() 114 | 115 | for k in range(16): 116 | x = self.vgg[k](x) 117 | s = self.L2Norm3_3(x) 118 | sources.append(s) 119 | 120 | for k in range(16, 23): 121 | x = self.vgg[k](x) 122 | s = self.L2Norm4_3(x) 123 | sources.append(s) 124 | 125 | for k in range(23, 30): 126 | x = self.vgg[k](x) 127 | s = self.L2Norm5_3(x) 128 | sources.append(s) 129 | 130 | for k in range(30, len(self.vgg)): 131 | x = self.vgg[k](x) 132 | sources.append(x) 133 | 134 | 
# apply extra layers and cache source layer outputs 135 | for k, v in enumerate(self.extras): 136 | x = F.relu(v(x), inplace=True) 137 | if k % 2 == 1: 138 | sources.append(x) 139 | 140 | # apply multibox head to source layers 141 | loc_x = self.loc[0](sources[0]) 142 | conf_x = self.conf[0](sources[0]) 143 | 144 | max_conf, _ = torch.max(conf_x[:, 0:3, :, :], dim=1, keepdim=True) 145 | conf_x = torch.cat((max_conf, conf_x[:, 3:, :, :]), dim=1) 146 | 147 | loc.append(loc_x.permute(0, 2, 3, 1).contiguous()) 148 | conf.append(conf_x.permute(0, 2, 3, 1).contiguous()) 149 | 150 | for i in range(1, len(sources)): 151 | x = sources[i] 152 | conf.append(self.conf[i](x).permute(0, 2, 3, 1).contiguous()) 153 | loc.append(self.loc[i](x).permute(0, 2, 3, 1).contiguous()) 154 | 155 | features_maps = [] 156 | for i in range(len(loc)): 157 | feat = [] 158 | feat += [loc[i].size(1), loc[i].size(2)] 159 | features_maps += [feat] 160 | 161 | loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1) 162 | conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1) 163 | 164 | with torch.no_grad(): 165 | self.priorbox = PriorBox(size, features_maps) 166 | self.priors = self.priorbox.forward() 167 | 168 | output = self.detect.forward( 169 | loc.view(loc.size(0), -1, 4), 170 | self.softmax(conf.view(conf.size(0), -1, 2)), 171 | self.priors.type(type(x.data)).to(self.device) 172 | ) 173 | 174 | return output 175 | -------------------------------------------------------------------------------- /script/syncnet_python/download_model.sh: -------------------------------------------------------------------------------- 1 | # SyncNet model 2 | 3 | mkdir data 4 | wget http://www.robots.ox.ac.uk/~vgg/software/lipsync/data/syncnet_v2.model -O data/syncnet_v2.model 5 | wget http://www.robots.ox.ac.uk/~vgg/software/lipsync/data/example.avi -O data/example.avi 6 | 7 | # For the pre-processing pipeline 8 | mkdir detectors/s3fd/weights 9 | wget https://www.robots.ox.ac.uk/~vgg/software/lipsync/data/sfd_face.pth -O detectors/s3fd/weights/sfd_face.pth -------------------------------------------------------------------------------- /script/syncnet_python/requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.4.0 2 | torchvision>=0.5.0 3 | numpy>=1.18.1 4 | scipy>=1.2.1 5 | scenedetect==0.5.1 6 | opencv-contrib-python 7 | python_speech_features 8 | -------------------------------------------------------------------------------- /script/syncnet_python/run_pipeline.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import sys, time, os, pdb, argparse, pickle, subprocess, glob, cv2 4 | import numpy as np 5 | from shutil import rmtree 6 | 7 | import scenedetect 8 | from scenedetect.video_manager import VideoManager 9 | from scenedetect.scene_manager import SceneManager 10 | from scenedetect.frame_timecode import FrameTimecode 11 | from scenedetect.stats_manager import StatsManager 12 | from scenedetect.detectors import ContentDetector 13 | 14 | from scipy.interpolate import interp1d 15 | from scipy.io import wavfile 16 | from scipy import signal 17 | 18 | from detectors import S3FD 19 | 20 | # ========== ========== ========== ========== 21 | # # PARSE ARGS 22 | # ========== ========== ========== ========== 23 | 24 | parser = argparse.ArgumentParser(description = "FaceTracker"); 25 | parser.add_argument('--data_dir', type=str, default='data/work', help='Output direcotry'); 26 | parser.add_argument('--videofile', 
type=str, default='', help='Input video file'); 27 | parser.add_argument('--reference', type=str, default='', help='Video reference'); 28 | parser.add_argument('--facedet_scale', type=float, default=0.25, help='Scale factor for face detection'); 29 | parser.add_argument('--crop_scale', type=float, default=0.40, help='Scale bounding box'); 30 | parser.add_argument('--min_track', type=int, default=25, help='Minimum facetrack duration'); 31 | parser.add_argument('--frame_rate', type=int, default=25, help='Frame rate'); 32 | parser.add_argument('--num_failed_det', type=int, default=25, help='Number of missed detections allowed before tracking is stopped'); 33 | parser.add_argument('--min_face_size', type=int, default=100, help='Minimum face size in pixels'); 34 | opt = parser.parse_args(); 35 | 36 | setattr(opt,'avi_dir',os.path.join(opt.data_dir,'pyavi')) 37 | setattr(opt,'tmp_dir',os.path.join(opt.data_dir,'pytmp')) 38 | setattr(opt,'work_dir',os.path.join(opt.data_dir,'pywork')) 39 | setattr(opt,'crop_dir',os.path.join(opt.data_dir,'pycrop')) 40 | setattr(opt,'frames_dir',os.path.join(opt.data_dir,'pyframes')) 41 | 42 | # ========== ========== ========== ========== 43 | # # IOU FUNCTION 44 | # ========== ========== ========== ========== 45 | 46 | def bb_intersection_over_union(boxA, boxB): 47 | 48 | xA = max(boxA[0], boxB[0]) 49 | yA = max(boxA[1], boxB[1]) 50 | xB = min(boxA[2], boxB[2]) 51 | yB = min(boxA[3], boxB[3]) 52 | 53 | interArea = max(0, xB - xA) * max(0, yB - yA) 54 | 55 | boxAArea = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1]) 56 | boxBArea = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1]) 57 | 58 | iou = interArea / float(boxAArea + boxBArea - interArea) 59 | 60 | return iou 61 | 62 | # ========== ========== ========== ========== 63 | # # FACE TRACKING 64 | # ========== ========== ========== ========== 65 | 66 | def track_shot(opt,scenefaces): 67 | 68 | iouThres = 0.5 # Minimum IOU between consecutive face detections 69 | tracks = [] 70 | 71 | while True: 72 | track = [] 73 | for framefaces in scenefaces: 74 | for face in framefaces: 75 | if track == []: 76 | track.append(face) 77 | framefaces.remove(face) 78 | elif face['frame'] - track[-1]['frame'] <= opt.num_failed_det: 79 | iou = bb_intersection_over_union(face['bbox'], track[-1]['bbox']) 80 | if iou > iouThres: 81 | track.append(face) 82 | framefaces.remove(face) 83 | continue 84 | else: 85 | break 86 | 87 | if track == []: 88 | break 89 | elif len(track) > opt.min_track: 90 | 91 | framenum = np.array([ f['frame'] for f in track ]) 92 | bboxes = np.array([np.array(f['bbox']) for f in track]) 93 | 94 | frame_i = np.arange(framenum[0],framenum[-1]+1) 95 | 96 | bboxes_i = [] 97 | for ij in range(0,4): 98 | interpfn = interp1d(framenum, bboxes[:,ij]) 99 | bboxes_i.append(interpfn(frame_i)) 100 | bboxes_i = np.stack(bboxes_i, axis=1) 101 | 102 | if max(np.mean(bboxes_i[:,2]-bboxes_i[:,0]), np.mean(bboxes_i[:,3]-bboxes_i[:,1])) > opt.min_face_size: 103 | tracks.append({'frame':frame_i,'bbox':bboxes_i}) 104 | 105 | return tracks 106 | 107 | # ========== ========== ========== ========== 108 | # # VIDEO CROP AND SAVE 109 | # ========== ========== ========== ========== 110 | 111 | def crop_video(opt,track,cropfile): 112 | 113 | flist = glob.glob(os.path.join(opt.frames_dir,opt.reference,'*.jpg')) 114 | flist.sort() 115 | 116 | fourcc = cv2.VideoWriter_fourcc(*'XVID') 117 | vOut = cv2.VideoWriter(cropfile+'t.avi', fourcc, opt.frame_rate, (224,224)) 118 | 119 | dets = {'x':[], 'y':[], 's':[]} 120 | 121 | for det in track['bbox']: 122 | 123 | 
dets['s'].append(max((det[3]-det[1]),(det[2]-det[0]))/2) 124 | dets['y'].append((det[1]+det[3])/2) # crop center x 125 | dets['x'].append((det[0]+det[2])/2) # crop center y 126 | 127 | # Smooth detections 128 | dets['s'] = signal.medfilt(dets['s'],kernel_size=13) 129 | dets['x'] = signal.medfilt(dets['x'],kernel_size=13) 130 | dets['y'] = signal.medfilt(dets['y'],kernel_size=13) 131 | 132 | for fidx, frame in enumerate(track['frame']): 133 | 134 | cs = opt.crop_scale 135 | 136 | bs = dets['s'][fidx] # Detection box size 137 | bsi = int(bs*(1+2*cs)) # Pad videos by this amount 138 | 139 | image = cv2.imread(flist[frame]) 140 | 141 | frame = np.pad(image,((bsi,bsi),(bsi,bsi),(0,0)), 'constant', constant_values=(110,110)) 142 | my = dets['y'][fidx]+bsi # BBox center Y 143 | mx = dets['x'][fidx]+bsi # BBox center X 144 | 145 | face = frame[int(my-bs):int(my+bs*(1+2*cs)),int(mx-bs*(1+cs)):int(mx+bs*(1+cs))] 146 | 147 | vOut.write(cv2.resize(face,(224,224))) 148 | 149 | audiotmp = os.path.join(opt.tmp_dir,opt.reference,'audio.wav') 150 | audiostart = (track['frame'][0])/opt.frame_rate 151 | audioend = (track['frame'][-1]+1)/opt.frame_rate 152 | 153 | vOut.release() 154 | 155 | # ========== CROP AUDIO FILE ========== 156 | 157 | command = ("ffmpeg -y -i %s -ss %.3f -to %.3f %s" % (os.path.join(opt.avi_dir,opt.reference,'audio.wav'),audiostart,audioend,audiotmp)) 158 | output = subprocess.call(command, shell=True, stdout=None) 159 | 160 | if output != 0: 161 | pdb.set_trace() 162 | 163 | sample_rate, audio = wavfile.read(audiotmp) 164 | 165 | # ========== COMBINE AUDIO AND VIDEO FILES ========== 166 | 167 | command = ("ffmpeg -y -i %st.avi -i %s -c:v copy -c:a copy %s.avi" % (cropfile,audiotmp,cropfile)) 168 | output = subprocess.call(command, shell=True, stdout=None) 169 | 170 | if output != 0: 171 | pdb.set_trace() 172 | 173 | print('Written %s'%cropfile) 174 | 175 | os.remove(cropfile+'t.avi') 176 | 177 | print('Mean pos: x %.2f y %.2f s %.2f'%(np.mean(dets['x']),np.mean(dets['y']),np.mean(dets['s']))) 178 | 179 | return {'track':track, 'proc_track':dets} 180 | 181 | 182 | def crop_hq_video(opt, track, cropfile): 183 | flist = glob.glob(os.path.join(opt.frames_dir, opt.reference, '*.jpg')) 184 | flist.sort() 185 | 186 | fourcc = cv2.VideoWriter_fourcc(*'XVID') 187 | 188 | 189 | dets = {'x': [], 'y': [], 's': []} 190 | 191 | for det in track['bbox']: 192 | dets['s'].append(max((det[3] - det[1]), (det[2] - det[0])) / 2) 193 | dets['y'].append((det[1] + det[3]) / 2) # crop center x 194 | dets['x'].append((det[0] + det[2]) / 2) # crop center y 195 | 196 | # Smooth detections 197 | dets['s'] = signal.medfilt(dets['s'], kernel_size=13) 198 | dets['x'] = signal.medfilt(dets['x'], kernel_size=13) 199 | dets['y'] = signal.medfilt(dets['y'], kernel_size=13) 200 | cs = opt.crop_scale 201 | lengh = int(dets['s'].max() * 2 * (1 + cs)) 202 | vOut = cv2.VideoWriter(cropfile + 't.avi', fourcc, opt.frame_rate, (lengh, lengh)) 203 | 204 | for fidx, frame in enumerate(track['frame']): 205 | cs = opt.crop_scale 206 | 207 | bs = dets['s'][fidx] # Detection box size 208 | bsi = int(bs * (1 + 2 * cs)) # Pad videos by this amount 209 | 210 | image = cv2.imread(flist[frame]) 211 | 212 | frame = np.pad(image, ((bsi, bsi), (bsi, bsi), (0, 0)), 'constant', constant_values=(110, 110)) 213 | my = dets['y'][fidx] + bsi # BBox center Y 214 | mx = dets['x'][fidx] + bsi # BBox center X 215 | 216 | face = frame[int(my - bs):int(my + bs * (1 + 2 * cs)), int(mx - bs * (1 + cs)):int(mx + bs * (1 + cs))] 217 | 218 | 
vOut.write(cv2.resize(face,(lengh,lengh))) 219 | audiotmp = os.path.join(opt.tmp_dir, opt.reference, 'audio.wav') 220 | audiostart = (track['frame'][0]) / opt.frame_rate 221 | audioend = (track['frame'][-1] + 1) / opt.frame_rate 222 | 223 | vOut.release() 224 | 225 | # ========== CROP AUDIO FILE ========== 226 | 227 | command = ("ffmpeg -y -i %s -ss %.3f -to %.3f %s" % ( 228 | os.path.join(opt.avi_dir, opt.reference, 'audio.wav'), audiostart, audioend, audiotmp)) 229 | output = subprocess.call(command, shell=True, stdout=None) 230 | 231 | if output != 0: 232 | pdb.set_trace() 233 | 234 | sample_rate, audio = wavfile.read(audiotmp) 235 | 236 | # ========== COMBINE AUDIO AND VIDEO FILES ========== 237 | 238 | command = ("ffmpeg -y -i %st.avi -i %s -c:v copy -c:a copy %s_hq.avi" % (cropfile, audiotmp, cropfile)) 239 | output = subprocess.call(command, shell=True, stdout=None) 240 | 241 | if output != 0: 242 | pdb.set_trace() 243 | 244 | print('Written %s' % cropfile) 245 | 246 | os.remove(cropfile + 't.avi') 247 | 248 | print('Mean pos: x %.2f y %.2f s %.2f' % (np.mean(dets['x']), np.mean(dets['y']), np.mean(dets['s']))) 249 | 250 | return {'track': track, 'proc_track': dets} 251 | 252 | 253 | # ========== ========== ========== ========== 254 | # # FACE DETECTION 255 | # ========== ========== ========== ========== 256 | 257 | def inference_video(opt): 258 | 259 | DET = S3FD(device='cuda') 260 | 261 | flist = glob.glob(os.path.join(opt.frames_dir,opt.reference,'*.jpg')) 262 | flist.sort() 263 | 264 | dets = [] 265 | 266 | for fidx, fname in enumerate(flist): 267 | 268 | start_time = time.time() 269 | 270 | image = cv2.imread(fname) 271 | 272 | image_np = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) 273 | bboxes = DET.detect_faces(image_np, conf_th=0.9, scales=[opt.facedet_scale]) 274 | 275 | dets.append([]); 276 | for bbox in bboxes: 277 | dets[-1].append({'frame':fidx, 'bbox':(bbox[:-1]).tolist(), 'conf':bbox[-1]}) 278 | 279 | elapsed_time = time.time() - start_time 280 | 281 | print('%s-%05d; %d dets; %.2f Hz' % (os.path.join(opt.avi_dir,opt.reference,'video.avi'),fidx,len(dets[-1]),(1/elapsed_time))) 282 | 283 | savepath = os.path.join(opt.work_dir,opt.reference,'faces.pckl') 284 | 285 | with open(savepath, 'wb') as fil: 286 | pickle.dump(dets, fil) 287 | 288 | return dets 289 | 290 | # ========== ========== ========== ========== 291 | # # SCENE DETECTION 292 | # ========== ========== ========== ========== 293 | 294 | def scene_detect(opt): 295 | 296 | video_manager = VideoManager([os.path.join(opt.avi_dir,opt.reference,'video.avi')]) 297 | stats_manager = StatsManager() 298 | scene_manager = SceneManager(stats_manager) 299 | # Add ContentDetector algorithm (constructor takes detector options like threshold). 
300 | scene_manager.add_detector(ContentDetector()) 301 | base_timecode = video_manager.get_base_timecode() 302 | 303 | video_manager.set_downscale_factor() 304 | 305 | video_manager.start() 306 | 307 | scene_manager.detect_scenes(frame_source=video_manager) 308 | 309 | scene_list = scene_manager.get_scene_list(base_timecode) 310 | 311 | savepath = os.path.join(opt.work_dir,opt.reference,'scene.pckl') 312 | 313 | if scene_list == []: 314 | scene_list = [(video_manager.get_base_timecode(),video_manager.get_current_timecode())] 315 | 316 | with open(savepath, 'wb') as fil: 317 | pickle.dump(scene_list, fil) 318 | 319 | print('%s - scenes detected %d'%(os.path.join(opt.avi_dir,opt.reference,'video.avi'),len(scene_list))) 320 | 321 | return scene_list 322 | 323 | 324 | # ========== ========== ========== ========== 325 | # # EXECUTE DEMO 326 | # ========== ========== ========== ========== 327 | 328 | # ========== DELETE EXISTING DIRECTORIES ========== 329 | 330 | if os.path.exists(os.path.join(opt.work_dir,opt.reference)): 331 | rmtree(os.path.join(opt.work_dir,opt.reference)) 332 | 333 | if os.path.exists(os.path.join(opt.crop_dir,opt.reference)): 334 | rmtree(os.path.join(opt.crop_dir,opt.reference)) 335 | 336 | if os.path.exists(os.path.join(opt.avi_dir,opt.reference)): 337 | rmtree(os.path.join(opt.avi_dir,opt.reference)) 338 | 339 | if os.path.exists(os.path.join(opt.frames_dir,opt.reference)): 340 | rmtree(os.path.join(opt.frames_dir,opt.reference)) 341 | 342 | if os.path.exists(os.path.join(opt.tmp_dir,opt.reference)): 343 | rmtree(os.path.join(opt.tmp_dir,opt.reference)) 344 | 345 | # ========== MAKE NEW DIRECTORIES ========== 346 | 347 | os.makedirs(os.path.join(opt.work_dir,opt.reference)) 348 | os.makedirs(os.path.join(opt.crop_dir,opt.reference)) 349 | os.makedirs(os.path.join(opt.avi_dir,opt.reference)) 350 | os.makedirs(os.path.join(opt.frames_dir,opt.reference)) 351 | os.makedirs(os.path.join(opt.tmp_dir,opt.reference)) 352 | 353 | # ========== CONVERT VIDEO AND EXTRACT FRAMES ========== 354 | 355 | command = ("ffmpeg -y -i %s -qscale:v 2 -async 1 -r 25 %s" % (opt.videofile,os.path.join(opt.avi_dir,opt.reference,'video.avi'))) 356 | output = subprocess.call(command, shell=True, stdout=None) 357 | 358 | command = ("ffmpeg -y -i %s -qscale:v 2 -threads 1 -f image2 %s" % (os.path.join(opt.avi_dir,opt.reference,'video.avi'),os.path.join(opt.frames_dir,opt.reference,'%06d.jpg'))) 359 | output = subprocess.call(command, shell=True, stdout=None) 360 | 361 | command = ("ffmpeg -y -i %s -ac 1 -vn -acodec pcm_s16le -ar 16000 %s" % (os.path.join(opt.avi_dir,opt.reference,'video.avi'),os.path.join(opt.avi_dir,opt.reference,'audio.wav'))) 362 | output = subprocess.call(command, shell=True, stdout=None) 363 | 364 | # ========== FACE DETECTION ========== 365 | 366 | faces = inference_video(opt) 367 | 368 | # ========== SCENE DETECTION ========== 369 | 370 | scene = scene_detect(opt) 371 | 372 | # ========== FACE TRACKING ========== 373 | 374 | alltracks = [] 375 | vidtracks = [] 376 | 377 | for shot in scene: 378 | 379 | if shot[1].frame_num - shot[0].frame_num >= opt.min_track : 380 | alltracks.extend(track_shot(opt,faces[shot[0].frame_num:shot[1].frame_num])) 381 | 382 | # ========== FACE TRACK CROP ========== 383 | 384 | for ii, track in enumerate(alltracks): 385 | vidtracks.append(crop_video(opt,track,os.path.join(opt.crop_dir,opt.reference,'%05d'%ii))) 386 | crop_hq_video(opt,track,os.path.join(opt.crop_dir,opt.reference,'%05d'%ii)) 387 | 388 | # ========== SAVE RESULTS ========== 389 | 
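# Each vidtracks entry pairs the interpolated face track ({'frame', 'bbox'}) with the smoothed crop parameters ({'x', 'y', 's'}) returned by crop_video().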
390 | savepath = os.path.join(opt.work_dir,opt.reference,'tracks.pckl') 391 | 392 | with open(savepath, 'wb') as fil: 393 | pickle.dump(vidtracks, fil) 394 | 395 | rmtree(os.path.join(opt.tmp_dir,opt.reference)) 396 | -------------------------------------------------------------------------------- /script/syncnet_python/run_pipeline_dir.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import sys, time, os, pdb, argparse, pickle, subprocess, glob, cv2 4 | import numpy as np 5 | from shutil import rmtree 6 | 7 | import scenedetect 8 | from scenedetect.video_manager import VideoManager 9 | from scenedetect.scene_manager import SceneManager 10 | from scenedetect.frame_timecode import FrameTimecode 11 | from scenedetect.stats_manager import StatsManager 12 | from scenedetect.detectors import ContentDetector 13 | 14 | from scipy.interpolate import interp1d 15 | from scipy.io import wavfile 16 | from scipy import signal 17 | 18 | from detectors import S3FD 19 | 20 | # ========== ========== ========== ========== 21 | # # PARSE ARGS 22 | # ========== ========== ========== ========== 23 | 24 | parser = argparse.ArgumentParser(description = "FaceTracker"); 25 | parser.add_argument('--data_dir', type=str, default='output', help='Output direcotry'); 26 | parser.add_argument('--video_dir', type=str, default='input', help='Input video file'); 27 | parser.add_argument('--reference', type=str, default='', help='Video reference'); 28 | parser.add_argument('--log_file', help='dataset', default='output/input.txt', type=str); 29 | parser.add_argument('--facedet_scale', type=float, default=0.25, help='Scale factor for face detection'); 30 | parser.add_argument('--crop_scale', type=float, default=0.40, help='Scale bounding box'); 31 | parser.add_argument('--min_track', type=int, default=25, help='Minimum facetrack duration'); 32 | parser.add_argument('--frame_rate', type=int, default=25, help='Frame rate'); 33 | parser.add_argument('--num_failed_det', type=int, default=25, help='Number of missed detections allowed before tracking is stopped'); 34 | parser.add_argument('--min_face_size', type=int, default=100, help='Minimum face size in pixels'); 35 | opt = parser.parse_args(); 36 | log_file = opt.log_file 37 | if not os.path.exists(log_file): 38 | print(f"Create log file {log_file}") 39 | os.system(f'touch {log_file}') 40 | setattr(opt,'avi_dir',os.path.join(opt.data_dir,'pyavi')) 41 | setattr(opt,'tmp_dir',os.path.join(opt.data_dir,'pytmp')) 42 | setattr(opt,'work_dir',os.path.join(opt.data_dir,'pywork')) 43 | setattr(opt,'crop_dir',os.path.join(opt.data_dir,'pycrop')) 44 | setattr(opt,'frames_dir',os.path.join(opt.data_dir,'pyframes')) 45 | 46 | # ========== ========== ========== ========== 47 | # # IOU FUNCTION 48 | # ========== ========== ========== ========== 49 | 50 | def bb_intersection_over_union(boxA, boxB): 51 | 52 | xA = max(boxA[0], boxB[0]) 53 | yA = max(boxA[1], boxB[1]) 54 | xB = min(boxA[2], boxB[2]) 55 | yB = min(boxA[3], boxB[3]) 56 | 57 | interArea = max(0, xB - xA) * max(0, yB - yA) 58 | 59 | boxAArea = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1]) 60 | boxBArea = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1]) 61 | 62 | iou = interArea / float(boxAArea + boxBArea - interArea) 63 | 64 | return iou 65 | 66 | # ========== ========== ========== ========== 67 | # # FACE TRACKING 68 | # ========== ========== ========== ========== 69 | 70 | def track_shot(opt,scenefaces): 71 | 72 | iouThres = 0.5 # Minimum IOU between consecutive face 
detections 73 | tracks = [] 74 | 75 | while True: 76 | track = [] 77 | for framefaces in scenefaces: 78 | for face in framefaces: 79 | if track == []: 80 | track.append(face) 81 | framefaces.remove(face) 82 | elif face['frame'] - track[-1]['frame'] <= opt.num_failed_det: 83 | iou = bb_intersection_over_union(face['bbox'], track[-1]['bbox']) 84 | if iou > iouThres: 85 | track.append(face) 86 | framefaces.remove(face) 87 | continue 88 | else: 89 | break 90 | 91 | if track == []: 92 | break 93 | elif len(track) > opt.min_track: 94 | 95 | framenum = np.array([ f['frame'] for f in track ]) 96 | bboxes = np.array([np.array(f['bbox']) for f in track]) 97 | 98 | frame_i = np.arange(framenum[0],framenum[-1]+1) 99 | 100 | bboxes_i = [] 101 | for ij in range(0,4): 102 | interpfn = interp1d(framenum, bboxes[:,ij]) 103 | bboxes_i.append(interpfn(frame_i)) 104 | bboxes_i = np.stack(bboxes_i, axis=1) 105 | 106 | if max(np.mean(bboxes_i[:,2]-bboxes_i[:,0]), np.mean(bboxes_i[:,3]-bboxes_i[:,1])) > opt.min_face_size: 107 | tracks.append({'frame':frame_i,'bbox':bboxes_i}) 108 | 109 | return tracks 110 | 111 | # ========== ========== ========== ========== 112 | # # VIDEO CROP AND SAVE 113 | # ========== ========== ========== ========== 114 | 115 | def crop_video(opt,track,cropfile): 116 | 117 | flist = glob.glob(os.path.join(opt.frames_dir,opt.reference,'*.jpg')) 118 | flist.sort() 119 | 120 | fourcc = cv2.VideoWriter_fourcc(*'XVID') 121 | vOut = cv2.VideoWriter(cropfile+'t.avi', fourcc, opt.frame_rate, (224,224)) 122 | 123 | dets = {'x':[], 'y':[], 's':[]} 124 | 125 | for det in track['bbox']: 126 | 127 | dets['s'].append(max((det[3]-det[1]),(det[2]-det[0]))/2) 128 | dets['y'].append((det[1]+det[3])/2) # crop center x 129 | dets['x'].append((det[0]+det[2])/2) # crop center y 130 | 131 | # Smooth detections 132 | dets['s'] = signal.medfilt(dets['s'],kernel_size=13) 133 | dets['x'] = signal.medfilt(dets['x'],kernel_size=13) 134 | dets['y'] = signal.medfilt(dets['y'],kernel_size=13) 135 | 136 | for fidx, frame in enumerate(track['frame']): 137 | 138 | cs = opt.crop_scale 139 | 140 | bs = dets['s'][fidx] # Detection box size 141 | bsi = int(bs*(1+2*cs)) # Pad videos by this amount 142 | 143 | image = cv2.imread(flist[frame]) 144 | 145 | frame = np.pad(image,((bsi,bsi),(bsi,bsi),(0,0)), 'constant', constant_values=(110,110)) 146 | my = dets['y'][fidx]+bsi # BBox center Y 147 | mx = dets['x'][fidx]+bsi # BBox center X 148 | 149 | face = frame[int(my-bs):int(my+bs*(1+2*cs)),int(mx-bs*(1+cs)):int(mx+bs*(1+cs))] 150 | 151 | vOut.write(cv2.resize(face,(224,224))) 152 | 153 | audiotmp = os.path.join(opt.tmp_dir,opt.reference,'audio.wav') 154 | audiostart = (track['frame'][0])/opt.frame_rate 155 | audioend = (track['frame'][-1]+1)/opt.frame_rate 156 | 157 | vOut.release() 158 | 159 | # ========== CROP AUDIO FILE ========== 160 | 161 | command = ("ffmpeg -y -i %s -ss %.3f -to %.3f %s" % (os.path.join(opt.avi_dir,opt.reference,'audio.wav'),audiostart,audioend,audiotmp)) 162 | output = subprocess.call(command, shell=True, stdout=None) 163 | 164 | if output != 0: 165 | pdb.set_trace() 166 | 167 | sample_rate, audio = wavfile.read(audiotmp) 168 | 169 | # ========== COMBINE AUDIO AND VIDEO FILES ========== 170 | 171 | command = ("ffmpeg -y -i %st.avi -i %s -c:v copy -c:a copy %s.avi" % (cropfile,audiotmp,cropfile)) 172 | output = subprocess.call(command, shell=True, stdout=None) 173 | 174 | if output != 0: 175 | pdb.set_trace() 176 | 177 | print('Written %s'%cropfile) 178 | 179 | os.remove(cropfile+'t.avi') 180 | 181 | 
print('Mean pos: x %.2f y %.2f s %.2f'%(np.mean(dets['x']),np.mean(dets['y']),np.mean(dets['s']))) 182 | 183 | return {'track':track, 'proc_track':dets} 184 | 185 | 186 | def crop_hq_video(opt, track, cropfile): 187 | flist = glob.glob(os.path.join(opt.frames_dir, opt.reference, '*.jpg')) 188 | flist.sort() 189 | 190 | fourcc = cv2.VideoWriter_fourcc(*'XVID') 191 | 192 | 193 | dets = {'x': [], 'y': [], 's': []} 194 | 195 | for det in track['bbox']: 196 | dets['s'].append(max((det[3] - det[1]), (det[2] - det[0])) / 2) 197 | dets['y'].append((det[1] + det[3]) / 2) # crop center y 198 | dets['x'].append((det[0] + det[2]) / 2) # crop center x 199 | 200 | # Smooth detections 201 | dets['s'] = signal.medfilt(dets['s'], kernel_size=13) 202 | dets['x'] = signal.medfilt(dets['x'], kernel_size=13) 203 | dets['y'] = signal.medfilt(dets['y'], kernel_size=13) 204 | cs = opt.crop_scale 205 | length = int(dets['s'].max() * 2 * (1 + cs)) # Output crop size, sized to the largest detection in the track 206 | vOut = cv2.VideoWriter(cropfile + 't.avi', fourcc, opt.frame_rate, (length, length)) 207 | 208 | for fidx, frame in enumerate(track['frame']): 209 | cs = opt.crop_scale 210 | 211 | bs = dets['s'][fidx] # Detection box size 212 | bsi = int(bs * (1 + 2 * cs)) # Pad videos by this amount 213 | 214 | image = cv2.imread(flist[frame]) 215 | 216 | frame = np.pad(image, ((bsi, bsi), (bsi, bsi), (0, 0)), 'constant', constant_values=(110, 110)) 217 | my = dets['y'][fidx] + bsi # BBox center Y 218 | mx = dets['x'][fidx] + bsi # BBox center X 219 | 220 | face = frame[int(my - bs):int(my + bs * (1 + 2 * cs)), int(mx - bs * (1 + cs)):int(mx + bs * (1 + cs))] 221 | 222 | vOut.write(cv2.resize(face,(length,length))) 223 | audiotmp = os.path.join(opt.tmp_dir, opt.reference, 'audio.wav') 224 | audiostart = (track['frame'][0]) / opt.frame_rate 225 | audioend = (track['frame'][-1] + 1) / opt.frame_rate 226 | 227 | vOut.release() 228 | 229 | # ========== CROP AUDIO FILE ========== 230 | 231 | command = ("ffmpeg -y -i %s -ss %.3f -to %.3f %s" % ( 232 | os.path.join(opt.avi_dir, opt.reference, 'audio.wav'), audiostart, audioend, audiotmp)) 233 | output = subprocess.call(command, shell=True, stdout=None) 234 | 235 | if output != 0: 236 | pdb.set_trace() 237 | 238 | sample_rate, audio = wavfile.read(audiotmp) 239 | 240 | # ========== COMBINE AUDIO AND VIDEO FILES ========== 241 | 242 | command = ("ffmpeg -y -i %st.avi -i %s -c:v copy -c:a copy %s_hq.avi" % (cropfile, audiotmp, cropfile)) 243 | output = subprocess.call(command, shell=True, stdout=None) 244 | 245 | if output != 0: 246 | pdb.set_trace() 247 | 248 | print('Written %s' % cropfile) 249 | 250 | os.remove(cropfile + 't.avi') 251 | 252 | print('Mean pos: x %.2f y %.2f s %.2f' % (np.mean(dets['x']), np.mean(dets['y']), np.mean(dets['s']))) 253 | 254 | return {'track': track, 'proc_track': dets} 255 | 256 | 257 | # ========== ========== ========== ========== 258 | # # FACE DETECTION 259 | # ========== ========== ========== ========== 260 | 261 | def inference_video(opt): 262 | 263 | DET = S3FD(device='cuda') 264 | 265 | flist = glob.glob(os.path.join(opt.frames_dir,opt.reference,'*.jpg')) 266 | flist.sort() 267 | 268 | dets = [] 269 | 270 | for fidx, fname in enumerate(flist): 271 | 272 | start_time = time.time() 273 | 274 | image = cv2.imread(fname) 275 | 276 | image_np = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) 277 | bboxes = DET.detect_faces(image_np, conf_th=0.9, scales=[opt.facedet_scale]) 278 | 279 | dets.append([]); 280 | for bbox in bboxes: 281 | dets[-1].append({'frame':fidx, 'bbox':(bbox[:-1]).tolist(), 'conf':bbox[-1]}) 282 | 283
| elapsed_time = time.time() - start_time 284 | 285 | print('%s-%05d; %d dets; %.2f Hz' % (os.path.join(opt.avi_dir,opt.reference,'video.avi'),fidx,len(dets[-1]),(1/elapsed_time))) 286 | 287 | savepath = os.path.join(opt.work_dir,opt.reference,'faces.pckl') 288 | 289 | with open(savepath, 'wb') as fil: 290 | pickle.dump(dets, fil) 291 | 292 | return dets 293 | 294 | # ========== ========== ========== ========== 295 | # # SCENE DETECTION 296 | # ========== ========== ========== ========== 297 | 298 | def scene_detect(opt): 299 | 300 | video_manager = VideoManager([os.path.join(opt.avi_dir,opt.reference,'video.avi')]) 301 | stats_manager = StatsManager() 302 | scene_manager = SceneManager(stats_manager) 303 | # Add ContentDetector algorithm (constructor takes detector options like threshold). 304 | scene_manager.add_detector(ContentDetector()) 305 | base_timecode = video_manager.get_base_timecode() 306 | 307 | video_manager.set_downscale_factor() 308 | 309 | video_manager.start() 310 | 311 | scene_manager.detect_scenes(frame_source=video_manager) 312 | 313 | scene_list = scene_manager.get_scene_list(base_timecode) 314 | 315 | savepath = os.path.join(opt.work_dir,opt.reference,'scene.pckl') 316 | 317 | if scene_list == []: 318 | scene_list = [(video_manager.get_base_timecode(),video_manager.get_current_timecode())] 319 | 320 | with open(savepath, 'wb') as fil: 321 | pickle.dump(scene_list, fil) 322 | 323 | print('%s - scenes detected %d'%(os.path.join(opt.avi_dir,opt.reference,'video.avi'),len(scene_list))) 324 | 325 | return scene_list 326 | 327 | 328 | # ========== ========== ========== ========== 329 | # # EXECUTE DEMO 330 | # ========== ========== ========== ========== 331 | 332 | # ========== LIST INPUT VIDEOS AND LOAD COMPLETED LOG ========== 333 | video_root = opt.video_dir 334 | videos = os.listdir(video_root) 335 | videos = [x for x in videos if x.endswith('.mp4') or os.remove(os.path.join(video_root, x))] # Keep only .mp4 files; anything else is deleted (os.remove() returns None, so those entries drop out of the list) 336 | videos.sort() 337 | # root_manifest = os.path.join('manifest', identity) 338 | with open(log_file, 'r') as f: 339 | CompleteID = f.read().split("\n") 340 | 341 | for video in videos: 342 | if video in CompleteID: 343 | fullPathVideo = os.path.join(video_root, video) 344 | print(f' video {fullPathVideo} was OK') 345 | else: 346 | video_name = video.split('.')[0] 347 | fullPathVideo = os.path.join(video_root, video) 348 | print(f'processing video {fullPathVideo}') 349 | # manifest = os.path.join(root_manifest, video.split('.')[0] + '.text') 350 | opt.reference = video_name 351 | opt.videofile = fullPathVideo 352 | if os.path.exists(os.path.join(opt.work_dir,opt.reference)): 353 | rmtree(os.path.join(opt.work_dir,opt.reference)) 354 | 355 | if os.path.exists(os.path.join(opt.crop_dir,opt.reference)): 356 | rmtree(os.path.join(opt.crop_dir,opt.reference)) 357 | 358 | if os.path.exists(os.path.join(opt.avi_dir,opt.reference)): 359 | rmtree(os.path.join(opt.avi_dir,opt.reference)) 360 | 361 | if os.path.exists(os.path.join(opt.frames_dir,opt.reference)): 362 | rmtree(os.path.join(opt.frames_dir,opt.reference)) 363 | 364 | if os.path.exists(os.path.join(opt.tmp_dir,opt.reference)): 365 | rmtree(os.path.join(opt.tmp_dir,opt.reference)) 366 | 367 | # ========== MAKE NEW DIRECTORIES ========== 368 | 369 | os.makedirs(os.path.join(opt.work_dir,opt.reference)) 370 | os.makedirs(os.path.join(opt.crop_dir,opt.reference)) 371 | os.makedirs(os.path.join(opt.avi_dir,opt.reference)) 372 | os.makedirs(os.path.join(opt.frames_dir,opt.reference)) 373 | os.makedirs(os.path.join(opt.tmp_dir,opt.reference))
374 | 375 | # ========== CONVERT VIDEO AND EXTRACT FRAMES ========== 376 | 377 | command = ("ffmpeg -y -i %s -qscale:v 2 -async 1 -r 25 %s" % (opt.videofile,os.path.join(opt.avi_dir,opt.reference,'video.avi'))) 378 | output = subprocess.call(command, shell=True, stdout=None) 379 | 380 | command = ("ffmpeg -y -i %s -qscale:v 2 -threads 1 -f image2 %s" % (os.path.join(opt.avi_dir,opt.reference,'video.avi'),os.path.join(opt.frames_dir,opt.reference,'%06d.jpg'))) 381 | output = subprocess.call(command, shell=True, stdout=None) 382 | 383 | command = ("ffmpeg -y -i %s -ac 1 -vn -acodec pcm_s16le -ar 16000 %s" % (os.path.join(opt.avi_dir,opt.reference,'video.avi'),os.path.join(opt.avi_dir,opt.reference,'audio.wav'))) 384 | output = subprocess.call(command, shell=True, stdout=None) 385 | 386 | # ========== FACE DETECTION ========== 387 | 388 | faces = inference_video(opt) 389 | 390 | # ========== SCENE DETECTION ========== 391 | 392 | scene = scene_detect(opt) 393 | 394 | # ========== FACE TRACKING ========== 395 | 396 | alltracks = [] 397 | vidtracks = [] 398 | 399 | for shot in scene: 400 | 401 | if shot[1].frame_num - shot[0].frame_num >= opt.min_track : 402 | alltracks.extend(track_shot(opt,faces[shot[0].frame_num:shot[1].frame_num])) 403 | 404 | # ========== FACE TRACK CROP ========== 405 | 406 | for ii, track in enumerate(alltracks): 407 | vidtracks.append(crop_video(opt,track,os.path.join(opt.crop_dir,opt.reference,'%05d'%ii))) 408 | crop_hq_video(opt,track,os.path.join(opt.crop_dir,opt.reference,'%05d'%ii)) 409 | 410 | # ========== SAVE RESULTS ========== 411 | 412 | savepath = os.path.join(opt.work_dir,opt.reference,'tracks.pckl') 413 | 414 | with open(savepath, 'wb') as fil: 415 | pickle.dump(vidtracks, fil) 416 | 417 | rmtree(os.path.join(opt.tmp_dir,opt.reference)) 418 | with open(log_file, 'a') as f: 419 | f.write(f"{video}\n") 420 | print(f' video {fullPathVideo} OK !!!') 421 | -------------------------------------------------------------------------------- /script/syncnet_python/run_syncnet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #-*- coding: utf-8 -*- 3 | 4 | import time, pdb, argparse, subprocess, pickle, os, gzip, glob 5 | 6 | from SyncNetInstance import * 7 | 8 | # ==================== PARSE ARGUMENT ==================== 9 | 10 | parser = argparse.ArgumentParser(description = "SyncNet"); 11 | parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help=''); 12 | parser.add_argument('--batch_size', type=int, default='20', help=''); 13 | parser.add_argument('--vshift', type=int, default='15', help=''); 14 | parser.add_argument('--data_dir', type=str, default='test', help=''); 15 | parser.add_argument('--videofile', type=str, default='', help=''); 16 | parser.add_argument('--reference', type=str, default='', help=''); 17 | opt = parser.parse_args(); 18 | 19 | setattr(opt,'avi_dir',os.path.join(opt.data_dir,'pyavi')) 20 | setattr(opt,'tmp_dir',os.path.join(opt.data_dir,'pytmp')) 21 | setattr(opt,'work_dir',os.path.join(opt.data_dir,'pywork')) 22 | setattr(opt,'crop_dir',os.path.join(opt.data_dir,'pycrop')) 23 | 24 | 25 | # ==================== LOAD MODEL AND FILE LIST ==================== 26 | 27 | s = SyncNetInstance(); 28 | 29 | s.loadParameters(opt.initial_model); 30 | print("Model %s loaded."%opt.initial_model); 31 | 32 | flist = glob.glob(os.path.join(opt.crop_dir,opt.reference,'0*.avi')) 33 | flist.sort() 34 | 35 | # ==================== GET OFFSETS ==================== 36 | 37 | 
dists = [] 38 | for idx, fname in enumerate(flist): 39 | offset, conf, dist = s.evaluate(opt,videofile=fname) 40 | dists.append(dist) 41 | 42 | # ==================== PRINT RESULTS TO FILE ==================== 43 | # 44 | # with open(os.path.join(opt.work_dir,opt.reference,'activesd.pckl'), 'wb') as fil: 45 | # pickle.dump(dists, fil) 46 | -------------------------------------------------------------------------------- /script/syncnet_python/run_syncnet_dir.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | import time, pdb, argparse, subprocess, pickle, os, gzip, glob 5 | 6 | from SyncNetInstance import * 7 | 8 | # ==================== PARSE ARGUMENT ==================== 9 | 10 | parser = argparse.ArgumentParser(description="SyncNet"); 11 | parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help=''); 12 | parser.add_argument('--batch_size', type=int, default=20, help=''); 13 | parser.add_argument('--vshift', type=int, default=15, help=''); 14 | parser.add_argument('--output_root', type=str, default='syncnet_output', help='Output directory'); 15 | parser.add_argument('--video_root', type=str, default='output/pycrop', help='Input video directory'); 16 | parser.add_argument('--reference', type=str, default='', help='Video reference'); 17 | parser.add_argument('--log_file', help='Log file listing identities that have already been processed', default='syncnet_output/complete.txt', type=str); 18 | parser.add_argument('--videofile', type=str, default='', help=''); 19 | opt = parser.parse_args(); 20 | 21 | setattr(opt, 'avi_dir', os.path.join(opt.output_root, 'pyavi')) 22 | setattr(opt, 'tmp_dir', os.path.join(opt.output_root, 'pytmp')) 23 | setattr(opt, 'work_dir', os.path.join(opt.output_root, 'pywork')) 24 | setattr(opt, 'crop_dir', os.path.join(opt.output_root, 'pycrop')) 25 | 26 | # ==================== LOAD MODEL AND FILE LIST ==================== 27 | 28 | s = SyncNetInstance(); 29 | 30 | s.loadParameters(opt.initial_model); 31 | print("Model %s loaded."
% opt.initial_model); 32 | 33 | video_root = opt.video_root 34 | output_root = opt.output_root 35 | identities = os.listdir(video_root) 36 | identities = [x for x in identities] 37 | identities.sort() 38 | log_file = opt.log_file 39 | with open(log_file, 'r') as f: 40 | identityID = f.read().split("\n") 41 | for identity in identities: 42 | fullPathVideo = os.path.join(video_root, identity) 43 | if identity in identityID: 44 | print(f' videosSet {fullPathVideo} was OK') 45 | continue 46 | print(f'Processing videosSet {identity}') 47 | videos = os.listdir(os.path.join(video_root, identity)) 48 | videos = [x for x in videos if x.endswith('.avi') or os.remove(os.path.join(video_root, identity, x))] # Keep only .avi crops; anything else is deleted (os.remove() returns None, so those entries drop out of the list) 49 | videosList = [] 50 | for video in videos: 51 | (shotname, extension) = os.path.splitext(video) 52 | if shotname.isalnum(): 53 | videosList.append(shotname) 54 | for idx, fname in enumerate(videosList): 55 | normalPath = os.path.join(video_root, identity, fname + '.avi') 56 | hqPath = os.path.join(video_root, identity, fname + '_hq.avi') 57 | targetPath = os.path.join(output_root, identity + fname + '_hq.avi') 58 | offset, conf, dist = s.evaluate(opt, videofile=normalPath) # evaluate() returns (offset, conf, dist), as in run_syncnet.py 59 | 60 | if offset == 0: 61 | os.rename(hqPath, targetPath) 62 | elif abs(offset) <= 5: 63 | fpss = offset / 25 # Frame offset converted to seconds at 25 fps 64 | cmd = f"ffmpeg -i {hqPath} -itsoffset {fpss} -i {hqPath} -map 0:v -map 1:a -b:v 8000k {targetPath}" 65 | output = subprocess.call(cmd, shell=True, stdout=None) 66 | if output != 0: 67 | pdb.set_trace() 68 | os.remove(hqPath) 69 | with open(log_file, 'a') as f: 70 | f.write(f"{identity}\n") 71 | print(f' videosSet {fullPathVideo} OK !!!') 72 | -------------------------------------------------------------------------------- /script/syncnet_python/run_visualise.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #-*- coding: utf-8 -*- 3 | 4 | import torch 5 | import numpy 6 | import time, pdb, argparse, subprocess, pickle, os, glob 7 | import cv2 8 | 9 | from scipy import signal 10 | 11 | # ==================== PARSE ARGUMENT ==================== 12 | 13 | parser = argparse.ArgumentParser(description = "SyncNet"); 14 | parser.add_argument('--data_dir', type=str, default='data/work', help=''); 15 | parser.add_argument('--videofile', type=str, default='', help=''); 16 | parser.add_argument('--reference', type=str, default='', help=''); 17 | parser.add_argument('--frame_rate', type=int, default=25, help='Frame rate'); 18 | opt = parser.parse_args(); 19 | 20 | setattr(opt,'avi_dir',os.path.join(opt.data_dir,'pyavi')) 21 | setattr(opt,'tmp_dir',os.path.join(opt.data_dir,'pytmp')) 22 | setattr(opt,'work_dir',os.path.join(opt.data_dir,'pywork')) 23 | setattr(opt,'crop_dir',os.path.join(opt.data_dir,'pycrop')) 24 | setattr(opt,'frames_dir',os.path.join(opt.data_dir,'pyframes')) 25 | 26 | # ==================== LOAD FILES ==================== 27 | 28 | with open(os.path.join(opt.work_dir,opt.reference,'tracks.pckl'), 'rb') as fil: 29 | tracks = pickle.load(fil, encoding='latin1') 30 | 31 | with open(os.path.join(opt.work_dir,opt.reference,'activesd.pckl'), 'rb') as fil: 32 | dists = pickle.load(fil, encoding='latin1') 33 | 34 | flist = glob.glob(os.path.join(opt.frames_dir,opt.reference,'*.jpg')) 35 | flist.sort() 36 | 37 | # ==================== SMOOTH FACES ==================== 38 | 39 | faces = [[] for i in range(len(flist))] 40 | 41 | for tidx, track in enumerate(tracks): 42 | 43 | mean_dists = numpy.mean(numpy.stack(dists[tidx],1),1) 44 | minidx = numpy.argmin(mean_dists,0) 45 |
minval = mean_dists[minidx] 46 | 47 | fdist = numpy.stack([dist[minidx] for dist in dists[tidx]]) 48 | fdist = numpy.pad(fdist, (3,3), 'constant', constant_values=10) 49 | 50 | fconf = numpy.median(mean_dists) - fdist 51 | fconfm = signal.medfilt(fconf,kernel_size=9) 52 | 53 | for fidx, frame in enumerate(track['track']['frame'].tolist()) : 54 | faces[frame].append({'track': tidx, 'conf':fconfm[fidx], 's':track['proc_track']['s'][fidx], 'x':track['proc_track']['x'][fidx], 'y':track['proc_track']['y'][fidx]}) 55 | 56 | # ==================== ADD DETECTIONS TO VIDEO ==================== 57 | 58 | first_image = cv2.imread(flist[0]) 59 | 60 | fw = first_image.shape[1] 61 | fh = first_image.shape[0] 62 | 63 | fourcc = cv2.VideoWriter_fourcc(*'XVID') 64 | vOut = cv2.VideoWriter(os.path.join(opt.avi_dir,opt.reference,'video_only.avi'), fourcc, opt.frame_rate, (fw,fh)) 65 | 66 | for fidx, fname in enumerate(flist): 67 | 68 | image = cv2.imread(fname) 69 | 70 | for face in faces[fidx]: 71 | 72 | clr = max(min(face['conf']*25,255),0) 73 | 74 | cv2.rectangle(image,(int(face['x']-face['s']),int(face['y']-face['s'])),(int(face['x']+face['s']),int(face['y']+face['s'])),(0,clr,255-clr),3) 75 | cv2.putText(image,'Track %d, Conf %.3f'%(face['track'],face['conf']), (int(face['x']-face['s']),int(face['y']-face['s'])),cv2.FONT_HERSHEY_SIMPLEX,0.5,(255,255,255),2) 76 | 77 | vOut.write(image) 78 | 79 | print('Frame %d'%fidx) 80 | 81 | vOut.release() 82 | 83 | # ========== COMBINE AUDIO AND VIDEO FILES ========== 84 | 85 | command = ("ffmpeg -y -i %s -i %s -c:v copy -c:a copy %s" % (os.path.join(opt.avi_dir,opt.reference,'video_only.avi'),os.path.join(opt.avi_dir,opt.reference,'audio.wav'),os.path.join(opt.avi_dir,opt.reference,'video_out.avi'))) #-async 1 86 | output = subprocess.call(command, shell=True, stdout=None) 87 | 88 | 89 | -------------------------------------------------------------------------------- /script/syncnet_python/syncnet_videos.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kafeyun/Wav2Lip-Ultra/23f6a98c3785c2039d032a8c68ee4de63bc4ddd0/script/syncnet_python/syncnet_videos.py --------------------------------------------------------------------------------