├── .idea
│   ├── .gitignore
│   ├── Wav2Lip-Ultra.iml
│   ├── inspectionProfiles
│   │   ├── Project_Default.xml
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   └── vcs.xml
└── script
    └── syncnet_python
        ├── .gitignore
        ├── LICENSE.md
        ├── README.md
        ├── SyncNetInstance.py
        ├── SyncNetModel.py
        ├── demo_feature.py
        ├── demo_syncnet.py
        ├── detectors
        │   ├── README.md
        │   ├── __init__.py
        │   └── s3fd
        │       ├── __init__.py
        │       ├── box_utils.py
        │       └── nets.py
        ├── download_model.sh
        ├── requirements.txt
        ├── run_pipeline.py
        ├── run_pipeline_dir.py
        ├── run_syncnet.py
        ├── run_syncnet_dir.py
        ├── run_visualise.py
        └── syncnet_videos.py
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | # Editor-based HTTP Client requests
5 | /httpRequests/
6 | # Datasource local storage ignored files
7 | /dataSources/
8 | /dataSources.local.xml
9 |
--------------------------------------------------------------------------------
/.idea/Wav2Lip-Ultra.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/script/syncnet_python/.gitignore:
--------------------------------------------------------------------------------
1 | # Compiled source #
2 | ###################
3 | *.com
4 | *.class
5 | *.dll
6 | *.exe
7 | *.o
8 | *.so
9 | *.pyc
10 |
11 | # Packages #
12 | ############
13 | # it's better to unpack these files and commit the raw source
14 | # git has its own built in compression methods
15 | *.7z
16 | *.dmg
17 | *.gz
18 | *.iso
19 | *.jar
20 | *.rar
21 | *.tar
22 | *.zip
23 |
24 | # Logs and databases #
25 | ######################
26 | *.log
27 | *.sql
28 | *.sqlite
29 |
30 | # OS generated files #
31 | ######################
32 | .DS_Store
33 | .DS_Store?
34 | ._*
35 | .Spotlight-V100
36 | .Trashes
37 | ehthumbs.db
38 | Thumbs.db
39 |
40 | # Specific to this demo #
41 | #########################
42 | data/
43 | protos/
44 | utils/
45 | *.pth
46 |
--------------------------------------------------------------------------------
/script/syncnet_python/LICENSE.md:
--------------------------------------------------------------------------------
1 | Copyright (c) 2016-present Joon Son Chung.
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 |
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | THE SOFTWARE.
20 |
--------------------------------------------------------------------------------
/script/syncnet_python/README.md:
--------------------------------------------------------------------------------
1 | # SyncNet
2 |
3 | This repository contains the demo for the audio-to-video synchronisation network (SyncNet). This network can be used for audio-visual synchronisation tasks including:
4 | 1. Removing temporal lags between the audio and visual streams in a video;
5 | 2. Determining who is speaking amongst multiple faces in a video.
6 |
7 | Please cite the paper below if you make use of the software.
8 |
9 | ## Dependencies
10 | ```
11 | pip install -r requirements.txt
12 | ```
13 |
14 | In addition, `ffmpeg` is required.
15 |
16 |
17 | ## Demo
18 |
19 | SyncNet demo:
20 | ```
21 | python demo_syncnet.py --videofile data/example.avi --tmp_dir /path/to/temp/directory
22 | ```
23 |
24 | Check that this script returns:
25 | ```
26 | AV offset: 3
27 | Min dist: 5.353
28 | Confidence: 10.021
29 | ```
30 |
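The same check can be run from Python; a minimal sketch mirroring `demo_syncnet.py` (values shown are the typical defaults, and a CUDA-capable GPU is required because the model is moved to `.cuda()`):
```
import argparse
from SyncNetInstance import SyncNetInstance

opt = argparse.Namespace(
    initial_model="data/syncnet_v2.model",   # fetched by download_model.sh
    batch_size=20,
    vshift=15,
    videofile="data/example.avi",
    tmp_dir="data/work/pytmp",
    reference="demo",
)

s = SyncNetInstance()
s.loadParameters(opt.initial_model)
offset = s.evaluate(opt, videofile=opt.videofile)   # prints AV offset, min dist and confidence
```
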
31 | Full pipeline:
32 | ```
33 | sh download_model.sh
34 | python run_pipeline.py --videofile /path/to/video.mp4 --reference name_of_video --data_dir /path/to/output
35 | python run_syncnet.py --videofile /path/to/video.mp4 --reference name_of_video --data_dir /path/to/output
36 | python run_visualise.py --videofile /path/to/video.mp4 --reference name_of_video --data_dir /path/to/output
37 | ```
38 |
39 | Outputs:
40 | ```
41 | $DATA_DIR/pycrop/$REFERENCE/*.avi - cropped face tracks
42 | $DATA_DIR/pywork/$REFERENCE/offsets.txt - audio-video offset values
43 | $DATA_DIR/pyavi/$REFERENCE/video_out.avi - output video (as shown below)
44 | ```
45 |
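To actually remove a measured lag (use case 1 above), the reported offset can be converted to seconds at 25 fps and applied with `ffmpeg`. A hedged sketch, assuming a positive offset means the audio stream should be delayed; verify the sign convention on a clip with a known lag before batch use:
```
import subprocess

def apply_av_offset(videofile, offset_frames, outfile, fps=25):
    shift = offset_frames / fps              # offset in seconds (the pipeline runs at 25 fps)
    subprocess.call([
        "ffmpeg", "-y",
        "-i", videofile,                     # input 0: video stream, unshifted
        "-itsoffset", "%.3f" % shift,        # shift the timestamps of the next input
        "-i", videofile,                     # input 1: audio stream, shifted
        "-map", "0:v", "-map", "1:a",
        "-c", "copy", outfile,
    ])
```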
46 |
47 |
48 |
49 |
50 | ## Publications
51 |
52 | ```
53 | @InProceedings{Chung16a,
54 | author = "Chung, J.~S. and Zisserman, A.",
55 | title = "Out of time: automated lip sync in the wild",
56 | booktitle = "Workshop on Multi-view Lip-reading, ACCV",
57 | year = "2016",
58 | }
59 | ```
60 |
--------------------------------------------------------------------------------
/script/syncnet_python/SyncNetInstance.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #-*- coding: utf-8 -*-
3 | # Video 25 FPS, Audio 16000HZ
4 |
5 | import torch
6 | import numpy
7 | import time, pdb, argparse, subprocess, os, math, glob
8 | import cv2
9 | import python_speech_features
10 |
11 | from scipy import signal
12 | from scipy.io import wavfile
13 | from SyncNetModel import *
14 | from shutil import rmtree
15 |
16 |
17 | # ==================== Get OFFSET ====================
18 |
19 | def calc_pdist(feat1, feat2, vshift=10):
20 |
21 |   win_size = vshift*2+1  # number of candidate shifts: -vshift .. +vshift
22 |
23 |   feat2p = torch.nn.functional.pad(feat2,(0,0,vshift,vshift))  # pad audio features in time so every shift is valid
24 |
25 | dists = []
26 |
27 | for i in range(0,len(feat1)):
28 |
29 | dists.append(torch.nn.functional.pairwise_distance(feat1[[i],:].repeat(win_size, 1), feat2p[i:i+win_size,:]))
30 |
31 | return dists
32 |
33 | # ==================== MAIN DEF ====================
34 |
35 | class SyncNetInstance(torch.nn.Module):
36 |
37 | def __init__(self, dropout = 0, num_layers_in_fc_layers = 1024):
38 | super(SyncNetInstance, self).__init__();
39 |
40 | self.__S__ = S(num_layers_in_fc_layers = num_layers_in_fc_layers).cuda();
41 |
42 | def evaluate(self, opt, videofile):
43 |
44 | self.__S__.eval();
45 |
46 | # ========== ==========
47 | # Convert files
48 | # ========== ==========
49 |
50 | if os.path.exists(os.path.join(opt.tmp_dir,opt.reference)):
51 | rmtree(os.path.join(opt.tmp_dir,opt.reference))
52 |
53 | os.makedirs(os.path.join(opt.tmp_dir,opt.reference))
54 |
55 | command = ("ffmpeg -y -i %s -threads 1 -f image2 %s" % (videofile,os.path.join(opt.tmp_dir,opt.reference,'%06d.jpg')))
56 | output = subprocess.call(command, shell=True, stdout=None)
57 |
58 | command = ("ffmpeg -y -i %s -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 %s" % (videofile,os.path.join(opt.tmp_dir,opt.reference,'audio.wav')))
59 | output = subprocess.call(command, shell=True, stdout=None)
60 |
61 | # ========== ==========
62 | # Load video
63 | # ========== ==========
64 |
65 | images = []
66 |
67 | flist = glob.glob(os.path.join(opt.tmp_dir,opt.reference,'*.jpg'))
68 | flist.sort()
69 |
70 | for fname in flist:
71 | images.append(cv2.imread(fname))
72 |
73 | im = numpy.stack(images,axis=3)
74 | im = numpy.expand_dims(im,axis=0)
75 | im = numpy.transpose(im,(0,3,4,1,2))
76 |
77 | imtv = torch.autograd.Variable(torch.from_numpy(im.astype(float)).float())
78 |
79 | # ========== ==========
80 | # Load audio
81 | # ========== ==========
82 |
83 | sample_rate, audio = wavfile.read(os.path.join(opt.tmp_dir,opt.reference,'audio.wav'))
84 | mfcc = zip(*python_speech_features.mfcc(audio,sample_rate))
85 | mfcc = numpy.stack([numpy.array(i) for i in mfcc])
86 |
87 | cc = numpy.expand_dims(numpy.expand_dims(mfcc,axis=0),axis=0)
88 | cct = torch.autograd.Variable(torch.from_numpy(cc.astype(float)).float())
89 |
90 | # ========== ==========
91 | # Check audio and video input length
92 | # ========== ==========
93 |
94 | if (float(len(audio))/16000) != (float(len(images))/25) :
95 | print("WARNING: Audio (%.4fs) and video (%.4fs) lengths are different."%(float(len(audio))/16000,float(len(images))/25))
96 |
97 |     min_length = min(len(images),math.floor(len(audio)/640))  # 640 audio samples per video frame (16000 Hz / 25 fps)
98 |
99 | # ========== ==========
100 | # Generate video and audio feats
101 | # ========== ==========
102 |
103 | lastframe = min_length-5
104 | im_feat = []
105 | cc_feat = []
106 |
107 | tS = time.time()
108 | for i in range(0,lastframe,opt.batch_size):
109 |
110 | im_batch = [ imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i,min(lastframe,i+opt.batch_size)) ]
111 | im_in = torch.cat(im_batch,0)
112 | im_out = self.__S__.forward_lip(im_in.cuda());
113 | im_feat.append(im_out.data.cpu())
114 |
115 |       cc_batch = [ cct[:,:,:,vframe*4:vframe*4+20] for vframe in range(i,min(lastframe,i+opt.batch_size)) ]  # 4 MFCC steps (10 ms each) per video frame; 20 steps = 5-frame window
116 | cc_in = torch.cat(cc_batch,0)
117 | cc_out = self.__S__.forward_aud(cc_in.cuda())
118 | cc_feat.append(cc_out.data.cpu())
119 |
120 | im_feat = torch.cat(im_feat,0)
121 | cc_feat = torch.cat(cc_feat,0)
122 |
123 | # ========== ==========
124 | # Compute offset
125 | # ========== ==========
126 |
127 | print('Compute time %.3f sec.' % (time.time()-tS))
128 |
129 | dists = calc_pdist(im_feat,cc_feat,vshift=opt.vshift)
130 | mdist = torch.mean(torch.stack(dists,1),1)
131 |
132 | minval, minidx = torch.min(mdist,0)
133 |
134 | offset = opt.vshift-minidx
135 | conf = torch.median(mdist) - minval
136 |
137 | fdist = numpy.stack([dist[minidx].numpy() for dist in dists])
138 | # fdist = numpy.pad(fdist, (3,3), 'constant', constant_values=15)
139 | fconf = torch.median(mdist).numpy() - fdist
140 | fconfm = signal.medfilt(fconf,kernel_size=9)
141 |
142 | numpy.set_printoptions(formatter={'float': '{: 0.3f}'.format})
143 | print('Framewise conf: ')
144 | print(fconfm)
145 | print('AV offset: \t%d \nMin dist: \t%.3f\nConfidence: \t%.3f' % (offset,minval,conf))
146 |
147 | #dists_npy = numpy.array([ dist.numpy() for dist in dists ])
148 | return offset
149 |
150 | def extract_feature(self, opt, videofile):
151 |
152 | self.__S__.eval();
153 |
154 | # ========== ==========
155 | # Load video
156 | # ========== ==========
157 | cap = cv2.VideoCapture(videofile)
158 |
159 | frame_num = 1;
160 | images = []
161 | while frame_num:
162 | frame_num += 1
163 | ret, image = cap.read()
164 | if ret == 0:
165 | break
166 |
167 | images.append(image)
168 |
169 | im = numpy.stack(images,axis=3)
170 | im = numpy.expand_dims(im,axis=0)
171 | im = numpy.transpose(im,(0,3,4,1,2))
172 |
173 | imtv = torch.autograd.Variable(torch.from_numpy(im.astype(float)).float())
174 |
175 | # ========== ==========
176 | # Generate video feats
177 | # ========== ==========
178 |
179 | lastframe = len(images)-4
180 | im_feat = []
181 |
182 | tS = time.time()
183 | for i in range(0,lastframe,opt.batch_size):
184 |
185 | im_batch = [ imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i,min(lastframe,i+opt.batch_size)) ]
186 | im_in = torch.cat(im_batch,0)
187 | im_out = self.__S__.forward_lipfeat(im_in.cuda());
188 | im_feat.append(im_out.data.cpu())
189 |
190 | im_feat = torch.cat(im_feat,0)
191 |
192 | # ========== ==========
193 |     # Report compute time
194 | # ========== ==========
195 |
196 | print('Compute time %.3f sec.' % (time.time()-tS))
197 |
198 | return im_feat
199 |
200 |
201 | def loadParameters(self, path):
202 | loaded_state = torch.load(path, map_location=lambda storage, loc: storage);
203 |
204 | self_state = self.__S__.state_dict();
205 |
206 | for name, param in loaded_state.items():
207 |
208 | self_state[name].copy_(param);
209 |
--------------------------------------------------------------------------------
/script/syncnet_python/SyncNetModel.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #-*- coding: utf-8 -*-
3 |
4 | import torch
5 | import torch.nn as nn
6 |
7 | def save(model, filename):
8 | with open(filename, "wb") as f:
9 | torch.save(model, f);
10 | print("%s saved."%filename);
11 |
12 | def load(filename):
13 | net = torch.load(filename)
14 | return net;
15 |
16 | class S(nn.Module):
17 | def __init__(self, num_layers_in_fc_layers = 1024):
18 | super(S, self).__init__();
19 |
20 | self.__nFeatures__ = 24;
21 | self.__nChs__ = 32;
22 | self.__midChs__ = 32;
23 |
24 | self.netcnnaud = nn.Sequential(
25 | nn.Conv2d(1, 64, kernel_size=(3,3), stride=(1,1), padding=(1,1)),
26 | nn.BatchNorm2d(64),
27 | nn.ReLU(inplace=True),
28 | nn.MaxPool2d(kernel_size=(1,1), stride=(1,1)),
29 |
30 | nn.Conv2d(64, 192, kernel_size=(3,3), stride=(1,1), padding=(1,1)),
31 | nn.BatchNorm2d(192),
32 | nn.ReLU(inplace=True),
33 | nn.MaxPool2d(kernel_size=(3,3), stride=(1,2)),
34 |
35 | nn.Conv2d(192, 384, kernel_size=(3,3), padding=(1,1)),
36 | nn.BatchNorm2d(384),
37 | nn.ReLU(inplace=True),
38 |
39 | nn.Conv2d(384, 256, kernel_size=(3,3), padding=(1,1)),
40 | nn.BatchNorm2d(256),
41 | nn.ReLU(inplace=True),
42 |
43 | nn.Conv2d(256, 256, kernel_size=(3,3), padding=(1,1)),
44 | nn.BatchNorm2d(256),
45 | nn.ReLU(inplace=True),
46 | nn.MaxPool2d(kernel_size=(3,3), stride=(2,2)),
47 |
48 | nn.Conv2d(256, 512, kernel_size=(5,4), padding=(0,0)),
49 | nn.BatchNorm2d(512),
50 | nn.ReLU(),
51 | );
52 |
53 | self.netfcaud = nn.Sequential(
54 | nn.Linear(512, 512),
55 | nn.BatchNorm1d(512),
56 | nn.ReLU(),
57 | nn.Linear(512, num_layers_in_fc_layers),
58 | );
59 |
60 | self.netfclip = nn.Sequential(
61 | nn.Linear(512, 512),
62 | nn.BatchNorm1d(512),
63 | nn.ReLU(),
64 | nn.Linear(512, num_layers_in_fc_layers),
65 | );
66 |
67 | self.netcnnlip = nn.Sequential(
68 | nn.Conv3d(3, 96, kernel_size=(5,7,7), stride=(1,2,2), padding=0),
69 | nn.BatchNorm3d(96),
70 | nn.ReLU(inplace=True),
71 | nn.MaxPool3d(kernel_size=(1,3,3), stride=(1,2,2)),
72 |
73 | nn.Conv3d(96, 256, kernel_size=(1,5,5), stride=(1,2,2), padding=(0,1,1)),
74 | nn.BatchNorm3d(256),
75 | nn.ReLU(inplace=True),
76 | nn.MaxPool3d(kernel_size=(1,3,3), stride=(1,2,2), padding=(0,1,1)),
77 |
78 | nn.Conv3d(256, 256, kernel_size=(1,3,3), padding=(0,1,1)),
79 | nn.BatchNorm3d(256),
80 | nn.ReLU(inplace=True),
81 |
82 | nn.Conv3d(256, 256, kernel_size=(1,3,3), padding=(0,1,1)),
83 | nn.BatchNorm3d(256),
84 | nn.ReLU(inplace=True),
85 |
86 | nn.Conv3d(256, 256, kernel_size=(1,3,3), padding=(0,1,1)),
87 | nn.BatchNorm3d(256),
88 | nn.ReLU(inplace=True),
89 | nn.MaxPool3d(kernel_size=(1,3,3), stride=(1,2,2)),
90 |
91 | nn.Conv3d(256, 512, kernel_size=(1,6,6), padding=0),
92 | nn.BatchNorm3d(512),
93 | nn.ReLU(inplace=True),
94 | );
95 |
96 | def forward_aud(self, x):
97 |
98 | mid = self.netcnnaud(x); # N x ch x 24 x M
99 | mid = mid.view((mid.size()[0], -1)); # N x (ch x 24)
100 | out = self.netfcaud(mid);
101 |
102 | return out;
103 |
104 | def forward_lip(self, x):
105 |
106 | mid = self.netcnnlip(x);
107 | mid = mid.view((mid.size()[0], -1)); # N x (ch x 24)
108 | out = self.netfclip(mid);
109 |
110 | return out;
111 |
112 | def forward_lipfeat(self, x):
113 |
114 | mid = self.netcnnlip(x);
115 | out = mid.view((mid.size()[0], -1)); # N x (ch x 24)
116 |
117 | return out;
--------------------------------------------------------------------------------
/script/syncnet_python/demo_feature.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #-*- coding: utf-8 -*-
3 |
4 | import time, pdb, argparse, subprocess
5 |
6 | from SyncNetInstance import *
7 |
8 | # ==================== LOAD PARAMS ====================
9 |
10 |
11 | parser = argparse.ArgumentParser(description = "SyncNet");
12 |
13 | parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help='');
14 | parser.add_argument('--batch_size', type=int, default='20', help='');
15 | parser.add_argument('--vshift', type=int, default='15', help='');
16 | parser.add_argument('--videofile', type=str, default="data/example.avi", help='');
17 | parser.add_argument('--tmp_dir', type=str, default="data", help='');
18 | parser.add_argument('--save_as', type=str, default="data/features.pt", help='');
19 |
20 | opt = parser.parse_args();
21 |
22 |
23 | # ==================== RUN EVALUATION ====================
24 |
25 | s = SyncNetInstance();
26 |
27 | s.loadParameters(opt.initial_model);
28 | print("Model %s loaded."%opt.initial_model);
29 |
30 | feats = s.extract_feature(opt, videofile=opt.videofile)
31 |
32 | torch.save(feats, opt.save_as)
33 |
--------------------------------------------------------------------------------
/script/syncnet_python/demo_syncnet.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #-*- coding: utf-8 -*-
3 |
4 | import time, pdb, argparse, subprocess
5 |
6 | from SyncNetInstance import *
7 |
8 | # ==================== LOAD PARAMS ====================
9 |
10 |
11 | parser = argparse.ArgumentParser(description = "SyncNet");
12 |
13 | parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help='');
14 | parser.add_argument('--batch_size', type=int, default='16', help='');
15 | parser.add_argument('--vshift', type=int, default='15', help='');
16 | parser.add_argument('--videofile', type=str, default="data/example.mp4", help='');
17 | parser.add_argument('--tmp_dir', type=str, default="data/work/pytmp", help='');
18 | parser.add_argument('--reference', type=str, default="demo", help='');
19 |
20 | opt = parser.parse_args();
21 |
22 |
23 | # ==================== RUN EVALUATION ====================
24 |
25 | s = SyncNetInstance();
26 |
27 | s.loadParameters(opt.initial_model);
28 | print("Model %s loaded."%opt.initial_model);
29 |
30 | s.evaluate(opt, videofile=opt.videofile)
31 |
--------------------------------------------------------------------------------
/script/syncnet_python/detectors/README.md:
--------------------------------------------------------------------------------
1 | # Face detector
2 |
3 | This face detector is adapted from `https://github.com/cs-giung/face-detection-pytorch`.
4 |
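A minimal usage sketch, mirroring `inference_video()` in `run_pipeline.py` (it assumes the weights have already been fetched by `download_model.sh` and that the script is run from the `syncnet_python` root, since the weight path is relative):
```
import cv2
from detectors import S3FD

det = S3FD(device='cuda')                            # loads detectors/s3fd/weights/sfd_face.pth
image = cv2.imread('frame.jpg')                      # hypothetical input frame
image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)   # the pipeline passes RGB images
bboxes = det.detect_faces(image_rgb, conf_th=0.9, scales=[0.25])
# each row of bboxes is [x1, y1, x2, y2, score]
```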
--------------------------------------------------------------------------------
/script/syncnet_python/detectors/__init__.py:
--------------------------------------------------------------------------------
1 | from .s3fd import S3FD
--------------------------------------------------------------------------------
/script/syncnet_python/detectors/s3fd/__init__.py:
--------------------------------------------------------------------------------
1 | import time
2 | import numpy as np
3 | import cv2
4 | import torch
5 | from torchvision import transforms
6 | from .nets import S3FDNet
7 | from .box_utils import nms_
8 |
9 | PATH_WEIGHT = './detectors/s3fd/weights/sfd_face.pth'
10 | img_mean = np.array([104., 117., 123.])[:, np.newaxis, np.newaxis].astype('float32')
11 |
12 |
13 | class S3FD():
14 |
15 | def __init__(self, device='cuda'):
16 |
17 | tstamp = time.time()
18 | self.device = device
19 |
20 | print('[S3FD] loading with', self.device)
21 | self.net = S3FDNet(device=self.device).to(self.device)
22 | state_dict = torch.load(PATH_WEIGHT, map_location=self.device)
23 | self.net.load_state_dict(state_dict)
24 | self.net.eval()
25 | print('[S3FD] finished loading (%.4f sec)' % (time.time() - tstamp))
26 |
27 | def detect_faces(self, image, conf_th=0.8, scales=[1]):
28 |
29 | w, h = image.shape[1], image.shape[0]
30 |
31 | bboxes = np.empty(shape=(0, 5))
32 |
33 | with torch.no_grad():
34 | for s in scales:
35 | scaled_img = cv2.resize(image, dsize=(0, 0), fx=s, fy=s, interpolation=cv2.INTER_LINEAR)
36 |
37 | scaled_img = np.swapaxes(scaled_img, 1, 2)
38 | scaled_img = np.swapaxes(scaled_img, 1, 0)
39 | scaled_img = scaled_img[[2, 1, 0], :, :]
40 | scaled_img = scaled_img.astype('float32')
41 | scaled_img -= img_mean
42 | scaled_img = scaled_img[[2, 1, 0], :, :]
43 | x = torch.from_numpy(scaled_img).unsqueeze(0).to(self.device)
44 | y = self.net(x)
45 |
46 | detections = y.data
47 | scale = torch.Tensor([w, h, w, h])
48 |
49 | for i in range(detections.size(1)):
50 | j = 0
51 | while detections[0, i, j, 0] > conf_th:
52 | score = detections[0, i, j, 0]
53 | pt = (detections[0, i, j, 1:] * scale).cpu().numpy()
54 | bbox = (pt[0], pt[1], pt[2], pt[3], score)
55 | bboxes = np.vstack((bboxes, bbox))
56 | j += 1
57 |
58 | keep = nms_(bboxes, 0.1)
59 | bboxes = bboxes[keep]
60 |
61 | return bboxes
62 |
--------------------------------------------------------------------------------
/script/syncnet_python/detectors/s3fd/box_utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from itertools import product as product
3 | import torch
4 | from torch.autograd import Function
5 |
6 |
7 | def nms_(dets, thresh):
8 | """
9 | Courtesy of Ross Girshick
10 | [https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/nms/py_cpu_nms.py]
11 | """
12 | x1 = dets[:, 0]
13 | y1 = dets[:, 1]
14 | x2 = dets[:, 2]
15 | y2 = dets[:, 3]
16 | scores = dets[:, 4]
17 |
18 | areas = (x2 - x1) * (y2 - y1)
19 | order = scores.argsort()[::-1]
20 |
21 | keep = []
22 | while order.size > 0:
23 | i = order[0]
24 | keep.append(int(i))
25 | xx1 = np.maximum(x1[i], x1[order[1:]])
26 | yy1 = np.maximum(y1[i], y1[order[1:]])
27 | xx2 = np.minimum(x2[i], x2[order[1:]])
28 | yy2 = np.minimum(y2[i], y2[order[1:]])
29 |
30 | w = np.maximum(0.0, xx2 - xx1)
31 | h = np.maximum(0.0, yy2 - yy1)
32 | inter = w * h
33 | ovr = inter / (areas[i] + areas[order[1:]] - inter)
34 |
35 | inds = np.where(ovr <= thresh)[0]
36 | order = order[inds + 1]
37 |
38 | return np.array(keep).astype(np.int_)
39 |
40 |
41 | def decode(loc, priors, variances):
42 | """Decode locations from predictions using priors to undo
43 | the encoding we did for offset regression at train time.
44 | Args:
45 | loc (tensor): location predictions for loc layers,
46 | Shape: [num_priors,4]
47 | priors (tensor): Prior boxes in center-offset form.
48 | Shape: [num_priors,4].
49 | variances: (list[float]) Variances of priorboxes
50 | Return:
51 | decoded bounding box predictions
52 | """
53 |
54 | boxes = torch.cat((
55 | priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:],
56 | priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1)
57 | boxes[:, :2] -= boxes[:, 2:] / 2
58 | boxes[:, 2:] += boxes[:, :2]
59 | return boxes
60 |
61 |
62 | def nms(boxes, scores, overlap=0.5, top_k=200):
63 | """Apply non-maximum suppression at test time to avoid detecting too many
64 | overlapping bounding boxes for a given object.
65 | Args:
66 | boxes: (tensor) The location preds for the img, Shape: [num_priors,4].
67 | scores: (tensor) The class predscores for the img, Shape:[num_priors].
68 | overlap: (float) The overlap thresh for suppressing unnecessary boxes.
69 | top_k: (int) The Maximum number of box preds to consider.
70 | Return:
71 | The indices of the kept boxes with respect to num_priors.
72 | """
73 |
74 | keep = scores.new(scores.size(0)).zero_().long()
75 | if boxes.numel() == 0:
76 | return keep, 0
77 | x1 = boxes[:, 0]
78 | y1 = boxes[:, 1]
79 | x2 = boxes[:, 2]
80 | y2 = boxes[:, 3]
81 | area = torch.mul(x2 - x1, y2 - y1)
82 | v, idx = scores.sort(0) # sort in ascending order
83 | # I = I[v >= 0.01]
84 | idx = idx[-top_k:] # indices of the top-k largest vals
85 | xx1 = boxes.new()
86 | yy1 = boxes.new()
87 | xx2 = boxes.new()
88 | yy2 = boxes.new()
89 | w = boxes.new()
90 | h = boxes.new()
91 |
92 | # keep = torch.Tensor()
93 | count = 0
94 | while idx.numel() > 0:
95 | i = idx[-1] # index of current largest val
96 | # keep.append(i)
97 | keep[count] = i
98 | count += 1
99 | if idx.size(0) == 1:
100 | break
101 | idx = idx[:-1] # remove kept element from view
102 | # load bboxes of next highest vals
103 | torch.index_select(x1, 0, idx, out=xx1)
104 | torch.index_select(y1, 0, idx, out=yy1)
105 | torch.index_select(x2, 0, idx, out=xx2)
106 | torch.index_select(y2, 0, idx, out=yy2)
107 | # store element-wise max with next highest score
108 | xx1 = torch.clamp(xx1, min=x1[i])
109 | yy1 = torch.clamp(yy1, min=y1[i])
110 | xx2 = torch.clamp(xx2, max=x2[i])
111 | yy2 = torch.clamp(yy2, max=y2[i])
112 | w.resize_as_(xx2)
113 | h.resize_as_(yy2)
114 | w = xx2 - xx1
115 | h = yy2 - yy1
116 | # check sizes of xx1 and xx2.. after each iteration
117 | w = torch.clamp(w, min=0.0)
118 | h = torch.clamp(h, min=0.0)
119 | inter = w * h
120 | # IoU = i / (area(a) + area(b) - i)
121 | rem_areas = torch.index_select(area, 0, idx) # load remaining areas)
122 | union = (rem_areas - inter) + area[i]
123 | IoU = inter / union # store result in iou
124 | # keep only elements with an IoU <= overlap
125 | idx = idx[IoU.le(overlap)]
126 | return keep, count
127 |
128 |
129 | class Detect(object):
130 |
131 | def __init__(self, num_classes=2,
132 | top_k=750, nms_thresh=0.3, conf_thresh=0.05,
133 | variance=[0.1, 0.2], nms_top_k=5000):
134 |
135 | self.num_classes = num_classes
136 | self.top_k = top_k
137 | self.nms_thresh = nms_thresh
138 | self.conf_thresh = conf_thresh
139 | self.variance = variance
140 | self.nms_top_k = nms_top_k
141 |
142 | def forward(self, loc_data, conf_data, prior_data):
143 |
144 | num = loc_data.size(0)
145 | num_priors = prior_data.size(0)
146 |
147 | conf_preds = conf_data.view(num, num_priors, self.num_classes).transpose(2, 1)
148 | batch_priors = prior_data.view(-1, num_priors, 4).expand(num, num_priors, 4)
149 | batch_priors = batch_priors.contiguous().view(-1, 4)
150 |
151 | decoded_boxes = decode(loc_data.view(-1, 4), batch_priors, self.variance)
152 | decoded_boxes = decoded_boxes.view(num, num_priors, 4)
153 |
154 | output = torch.zeros(num, self.num_classes, self.top_k, 5)
155 |
156 | for i in range(num):
157 | boxes = decoded_boxes[i].clone()
158 | conf_scores = conf_preds[i].clone()
159 |
160 | for cl in range(1, self.num_classes):
161 | c_mask = conf_scores[cl].gt(self.conf_thresh)
162 | scores = conf_scores[cl][c_mask]
163 |
164 | if scores.dim() == 0:
165 | continue
166 | l_mask = c_mask.unsqueeze(1).expand_as(boxes)
167 | boxes_ = boxes[l_mask].view(-1, 4)
168 | ids, count = nms(boxes_, scores, self.nms_thresh, self.nms_top_k)
169 | count = count if count < self.top_k else self.top_k
170 |
171 | output[i, cl, :count] = torch.cat((scores[ids[:count]].unsqueeze(1), boxes_[ids[:count]]), 1)
172 |
173 | return output
174 |
175 |
176 | class PriorBox(object):
177 |
178 | def __init__(self, input_size, feature_maps,
179 | variance=[0.1, 0.2],
180 | min_sizes=[16, 32, 64, 128, 256, 512],
181 | steps=[4, 8, 16, 32, 64, 128],
182 | clip=False):
183 |
184 | super(PriorBox, self).__init__()
185 |
186 | self.imh = input_size[0]
187 | self.imw = input_size[1]
188 | self.feature_maps = feature_maps
189 |
190 | self.variance = variance
191 | self.min_sizes = min_sizes
192 | self.steps = steps
193 | self.clip = clip
194 |
195 | def forward(self):
196 | mean = []
197 | for k, fmap in enumerate(self.feature_maps):
198 | feath = fmap[0]
199 | featw = fmap[1]
200 | for i, j in product(range(feath), range(featw)):
201 | f_kw = self.imw / self.steps[k]
202 | f_kh = self.imh / self.steps[k]
203 |
204 | cx = (j + 0.5) / f_kw
205 | cy = (i + 0.5) / f_kh
206 |
207 | s_kw = self.min_sizes[k] / self.imw
208 | s_kh = self.min_sizes[k] / self.imh
209 |
210 | mean += [cx, cy, s_kw, s_kh]
211 |
212 | output = torch.FloatTensor(mean).view(-1, 4)
213 |
214 | if self.clip:
215 | output.clamp_(max=1, min=0)
216 |
217 | return output
218 |
--------------------------------------------------------------------------------
/script/syncnet_python/detectors/s3fd/nets.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | import torch.nn.init as init
5 | from .box_utils import Detect, PriorBox
6 |
7 |
8 | class L2Norm(nn.Module):
9 |
10 | def __init__(self, n_channels, scale):
11 | super(L2Norm, self).__init__()
12 | self.n_channels = n_channels
13 | self.gamma = scale or None
14 | self.eps = 1e-10
15 | self.weight = nn.Parameter(torch.Tensor(self.n_channels))
16 | self.reset_parameters()
17 |
18 | def reset_parameters(self):
19 | init.constant_(self.weight, self.gamma)
20 |
21 | def forward(self, x):
22 | norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps
23 | x = torch.div(x, norm)
24 | out = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as(x) * x
25 | return out
26 |
27 |
28 | class S3FDNet(nn.Module):
29 |
30 | def __init__(self, device='cuda'):
31 | super(S3FDNet, self).__init__()
32 | self.device = device
33 |
34 | self.vgg = nn.ModuleList([
35 | nn.Conv2d(3, 64, 3, 1, padding=1),
36 | nn.ReLU(inplace=True),
37 | nn.Conv2d(64, 64, 3, 1, padding=1),
38 | nn.ReLU(inplace=True),
39 | nn.MaxPool2d(2, 2),
40 |
41 | nn.Conv2d(64, 128, 3, 1, padding=1),
42 | nn.ReLU(inplace=True),
43 | nn.Conv2d(128, 128, 3, 1, padding=1),
44 | nn.ReLU(inplace=True),
45 | nn.MaxPool2d(2, 2),
46 |
47 | nn.Conv2d(128, 256, 3, 1, padding=1),
48 | nn.ReLU(inplace=True),
49 | nn.Conv2d(256, 256, 3, 1, padding=1),
50 | nn.ReLU(inplace=True),
51 | nn.Conv2d(256, 256, 3, 1, padding=1),
52 | nn.ReLU(inplace=True),
53 | nn.MaxPool2d(2, 2, ceil_mode=True),
54 |
55 | nn.Conv2d(256, 512, 3, 1, padding=1),
56 | nn.ReLU(inplace=True),
57 | nn.Conv2d(512, 512, 3, 1, padding=1),
58 | nn.ReLU(inplace=True),
59 | nn.Conv2d(512, 512, 3, 1, padding=1),
60 | nn.ReLU(inplace=True),
61 | nn.MaxPool2d(2, 2),
62 |
63 | nn.Conv2d(512, 512, 3, 1, padding=1),
64 | nn.ReLU(inplace=True),
65 | nn.Conv2d(512, 512, 3, 1, padding=1),
66 | nn.ReLU(inplace=True),
67 | nn.Conv2d(512, 512, 3, 1, padding=1),
68 | nn.ReLU(inplace=True),
69 | nn.MaxPool2d(2, 2),
70 |
71 | nn.Conv2d(512, 1024, 3, 1, padding=6, dilation=6),
72 | nn.ReLU(inplace=True),
73 | nn.Conv2d(1024, 1024, 1, 1),
74 | nn.ReLU(inplace=True),
75 | ])
76 |
77 | self.L2Norm3_3 = L2Norm(256, 10)
78 | self.L2Norm4_3 = L2Norm(512, 8)
79 | self.L2Norm5_3 = L2Norm(512, 5)
80 |
81 | self.extras = nn.ModuleList([
82 | nn.Conv2d(1024, 256, 1, 1),
83 | nn.Conv2d(256, 512, 3, 2, padding=1),
84 | nn.Conv2d(512, 128, 1, 1),
85 | nn.Conv2d(128, 256, 3, 2, padding=1),
86 | ])
87 |
88 | self.loc = nn.ModuleList([
89 | nn.Conv2d(256, 4, 3, 1, padding=1),
90 | nn.Conv2d(512, 4, 3, 1, padding=1),
91 | nn.Conv2d(512, 4, 3, 1, padding=1),
92 | nn.Conv2d(1024, 4, 3, 1, padding=1),
93 | nn.Conv2d(512, 4, 3, 1, padding=1),
94 | nn.Conv2d(256, 4, 3, 1, padding=1),
95 | ])
96 |
97 | self.conf = nn.ModuleList([
98 | nn.Conv2d(256, 4, 3, 1, padding=1),
99 | nn.Conv2d(512, 2, 3, 1, padding=1),
100 | nn.Conv2d(512, 2, 3, 1, padding=1),
101 | nn.Conv2d(1024, 2, 3, 1, padding=1),
102 | nn.Conv2d(512, 2, 3, 1, padding=1),
103 | nn.Conv2d(256, 2, 3, 1, padding=1),
104 | ])
105 |
106 | self.softmax = nn.Softmax(dim=-1)
107 | self.detect = Detect()
108 |
109 | def forward(self, x):
110 | size = x.size()[2:]
111 | sources = list()
112 | loc = list()
113 | conf = list()
114 |
115 | for k in range(16):
116 | x = self.vgg[k](x)
117 | s = self.L2Norm3_3(x)
118 | sources.append(s)
119 |
120 | for k in range(16, 23):
121 | x = self.vgg[k](x)
122 | s = self.L2Norm4_3(x)
123 | sources.append(s)
124 |
125 | for k in range(23, 30):
126 | x = self.vgg[k](x)
127 | s = self.L2Norm5_3(x)
128 | sources.append(s)
129 |
130 | for k in range(30, len(self.vgg)):
131 | x = self.vgg[k](x)
132 | sources.append(x)
133 |
134 | # apply extra layers and cache source layer outputs
135 | for k, v in enumerate(self.extras):
136 | x = F.relu(v(x), inplace=True)
137 | if k % 2 == 1:
138 | sources.append(x)
139 |
140 | # apply multibox head to source layers
141 | loc_x = self.loc[0](sources[0])
142 | conf_x = self.conf[0](sources[0])
143 |
144 | max_conf, _ = torch.max(conf_x[:, 0:3, :, :], dim=1, keepdim=True)
145 | conf_x = torch.cat((max_conf, conf_x[:, 3:, :, :]), dim=1)
146 |
147 | loc.append(loc_x.permute(0, 2, 3, 1).contiguous())
148 | conf.append(conf_x.permute(0, 2, 3, 1).contiguous())
149 |
150 | for i in range(1, len(sources)):
151 | x = sources[i]
152 | conf.append(self.conf[i](x).permute(0, 2, 3, 1).contiguous())
153 | loc.append(self.loc[i](x).permute(0, 2, 3, 1).contiguous())
154 |
155 | features_maps = []
156 | for i in range(len(loc)):
157 | feat = []
158 | feat += [loc[i].size(1), loc[i].size(2)]
159 | features_maps += [feat]
160 |
161 | loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1)
162 | conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1)
163 |
164 | with torch.no_grad():
165 | self.priorbox = PriorBox(size, features_maps)
166 | self.priors = self.priorbox.forward()
167 |
168 | output = self.detect.forward(
169 | loc.view(loc.size(0), -1, 4),
170 | self.softmax(conf.view(conf.size(0), -1, 2)),
171 | self.priors.type(type(x.data)).to(self.device)
172 | )
173 |
174 | return output
175 |
--------------------------------------------------------------------------------
/script/syncnet_python/download_model.sh:
--------------------------------------------------------------------------------
1 | # SyncNet model
2 |
3 | mkdir -p data
4 | wget http://www.robots.ox.ac.uk/~vgg/software/lipsync/data/syncnet_v2.model -O data/syncnet_v2.model
5 | wget http://www.robots.ox.ac.uk/~vgg/software/lipsync/data/example.avi -O data/example.avi
6 |
7 | # For the pre-processing pipeline
8 | mkdir -p detectors/s3fd/weights
9 | wget https://www.robots.ox.ac.uk/~vgg/software/lipsync/data/sfd_face.pth -O detectors/s3fd/weights/sfd_face.pth
--------------------------------------------------------------------------------
/script/syncnet_python/requirements.txt:
--------------------------------------------------------------------------------
1 | torch>=1.4.0
2 | torchvision>=0.5.0
3 | numpy>=1.18.1
4 | scipy>=1.2.1
5 | scenedetect==0.5.1
6 | opencv-contrib-python
7 | python_speech_features
8 |
--------------------------------------------------------------------------------
/script/syncnet_python/run_pipeline.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | import sys, time, os, pdb, argparse, pickle, subprocess, glob, cv2
4 | import numpy as np
5 | from shutil import rmtree
6 |
7 | import scenedetect
8 | from scenedetect.video_manager import VideoManager
9 | from scenedetect.scene_manager import SceneManager
10 | from scenedetect.frame_timecode import FrameTimecode
11 | from scenedetect.stats_manager import StatsManager
12 | from scenedetect.detectors import ContentDetector
13 |
14 | from scipy.interpolate import interp1d
15 | from scipy.io import wavfile
16 | from scipy import signal
17 |
18 | from detectors import S3FD
19 |
20 | # ========== ========== ========== ==========
21 | # # PARSE ARGS
22 | # ========== ========== ========== ==========
23 |
24 | parser = argparse.ArgumentParser(description = "FaceTracker");
25 | parser.add_argument('--data_dir', type=str, default='data/work', help='Output directory');
26 | parser.add_argument('--videofile', type=str, default='', help='Input video file');
27 | parser.add_argument('--reference', type=str, default='', help='Video reference');
28 | parser.add_argument('--facedet_scale', type=float, default=0.25, help='Scale factor for face detection');
29 | parser.add_argument('--crop_scale', type=float, default=0.40, help='Scale bounding box');
30 | parser.add_argument('--min_track', type=int, default=25, help='Minimum facetrack duration');
31 | parser.add_argument('--frame_rate', type=int, default=25, help='Frame rate');
32 | parser.add_argument('--num_failed_det', type=int, default=25, help='Number of missed detections allowed before tracking is stopped');
33 | parser.add_argument('--min_face_size', type=int, default=100, help='Minimum face size in pixels');
34 | opt = parser.parse_args();
35 |
36 | setattr(opt,'avi_dir',os.path.join(opt.data_dir,'pyavi'))
37 | setattr(opt,'tmp_dir',os.path.join(opt.data_dir,'pytmp'))
38 | setattr(opt,'work_dir',os.path.join(opt.data_dir,'pywork'))
39 | setattr(opt,'crop_dir',os.path.join(opt.data_dir,'pycrop'))
40 | setattr(opt,'frames_dir',os.path.join(opt.data_dir,'pyframes'))
41 |
42 | # ========== ========== ========== ==========
43 | # # IOU FUNCTION
44 | # ========== ========== ========== ==========
45 |
46 | def bb_intersection_over_union(boxA, boxB):
47 |
48 | xA = max(boxA[0], boxB[0])
49 | yA = max(boxA[1], boxB[1])
50 | xB = min(boxA[2], boxB[2])
51 | yB = min(boxA[3], boxB[3])
52 |
53 | interArea = max(0, xB - xA) * max(0, yB - yA)
54 |
55 | boxAArea = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
56 | boxBArea = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])
57 |
58 | iou = interArea / float(boxAArea + boxBArea - interArea)
59 |
60 | return iou
61 |
62 | # ========== ========== ========== ==========
63 | # # FACE TRACKING
64 | # ========== ========== ========== ==========
65 |
66 | def track_shot(opt,scenefaces):
67 |
68 | iouThres = 0.5 # Minimum IOU between consecutive face detections
69 | tracks = []
70 |
71 | while True:
72 | track = []
73 | for framefaces in scenefaces:
74 | for face in framefaces:
75 | if track == []:
76 | track.append(face)
77 | framefaces.remove(face)
78 | elif face['frame'] - track[-1]['frame'] <= opt.num_failed_det:
79 | iou = bb_intersection_over_union(face['bbox'], track[-1]['bbox'])
80 | if iou > iouThres:
81 | track.append(face)
82 | framefaces.remove(face)
83 | continue
84 | else:
85 | break
86 |
87 | if track == []:
88 | break
89 | elif len(track) > opt.min_track:
90 |
91 | framenum = np.array([ f['frame'] for f in track ])
92 | bboxes = np.array([np.array(f['bbox']) for f in track])
93 |
94 | frame_i = np.arange(framenum[0],framenum[-1]+1)
95 |
96 | bboxes_i = []
97 | for ij in range(0,4):
98 | interpfn = interp1d(framenum, bboxes[:,ij])
99 | bboxes_i.append(interpfn(frame_i))
100 | bboxes_i = np.stack(bboxes_i, axis=1)
101 |
102 | if max(np.mean(bboxes_i[:,2]-bboxes_i[:,0]), np.mean(bboxes_i[:,3]-bboxes_i[:,1])) > opt.min_face_size:
103 | tracks.append({'frame':frame_i,'bbox':bboxes_i})
104 |
105 | return tracks
106 |
107 | # ========== ========== ========== ==========
108 | # # VIDEO CROP AND SAVE
109 | # ========== ========== ========== ==========
110 |
111 | def crop_video(opt,track,cropfile):
112 |
113 | flist = glob.glob(os.path.join(opt.frames_dir,opt.reference,'*.jpg'))
114 | flist.sort()
115 |
116 | fourcc = cv2.VideoWriter_fourcc(*'XVID')
117 | vOut = cv2.VideoWriter(cropfile+'t.avi', fourcc, opt.frame_rate, (224,224))
118 |
119 | dets = {'x':[], 'y':[], 's':[]}
120 |
121 | for det in track['bbox']:
122 |
123 | dets['s'].append(max((det[3]-det[1]),(det[2]-det[0]))/2)
124 |     dets['y'].append((det[1]+det[3])/2) # crop center y
125 |     dets['x'].append((det[0]+det[2])/2) # crop center x
126 |
127 | # Smooth detections
128 | dets['s'] = signal.medfilt(dets['s'],kernel_size=13)
129 | dets['x'] = signal.medfilt(dets['x'],kernel_size=13)
130 | dets['y'] = signal.medfilt(dets['y'],kernel_size=13)
131 |
132 | for fidx, frame in enumerate(track['frame']):
133 |
134 | cs = opt.crop_scale
135 |
136 | bs = dets['s'][fidx] # Detection box size
137 | bsi = int(bs*(1+2*cs)) # Pad videos by this amount
138 |
139 | image = cv2.imread(flist[frame])
140 |
141 | frame = np.pad(image,((bsi,bsi),(bsi,bsi),(0,0)), 'constant', constant_values=(110,110))
142 | my = dets['y'][fidx]+bsi # BBox center Y
143 | mx = dets['x'][fidx]+bsi # BBox center X
144 |
145 | face = frame[int(my-bs):int(my+bs*(1+2*cs)),int(mx-bs*(1+cs)):int(mx+bs*(1+cs))]
146 |
147 | vOut.write(cv2.resize(face,(224,224)))
148 |
149 | audiotmp = os.path.join(opt.tmp_dir,opt.reference,'audio.wav')
150 | audiostart = (track['frame'][0])/opt.frame_rate
151 | audioend = (track['frame'][-1]+1)/opt.frame_rate
152 |
153 | vOut.release()
154 |
155 | # ========== CROP AUDIO FILE ==========
156 |
157 | command = ("ffmpeg -y -i %s -ss %.3f -to %.3f %s" % (os.path.join(opt.avi_dir,opt.reference,'audio.wav'),audiostart,audioend,audiotmp))
158 | output = subprocess.call(command, shell=True, stdout=None)
159 |
160 | if output != 0:
161 | pdb.set_trace()
162 |
163 | sample_rate, audio = wavfile.read(audiotmp)
164 |
165 | # ========== COMBINE AUDIO AND VIDEO FILES ==========
166 |
167 | command = ("ffmpeg -y -i %st.avi -i %s -c:v copy -c:a copy %s.avi" % (cropfile,audiotmp,cropfile))
168 | output = subprocess.call(command, shell=True, stdout=None)
169 |
170 | if output != 0:
171 | pdb.set_trace()
172 |
173 | print('Written %s'%cropfile)
174 |
175 | os.remove(cropfile+'t.avi')
176 |
177 | print('Mean pos: x %.2f y %.2f s %.2f'%(np.mean(dets['x']),np.mean(dets['y']),np.mean(dets['s'])))
178 |
179 | return {'track':track, 'proc_track':dets}
180 |
181 |
182 | def crop_hq_video(opt, track, cropfile):
183 | flist = glob.glob(os.path.join(opt.frames_dir, opt.reference, '*.jpg'))
184 | flist.sort()
185 |
186 | fourcc = cv2.VideoWriter_fourcc(*'XVID')
187 |
188 |
189 | dets = {'x': [], 'y': [], 's': []}
190 |
191 | for det in track['bbox']:
192 | dets['s'].append(max((det[3] - det[1]), (det[2] - det[0])) / 2)
193 |         dets['y'].append((det[1] + det[3]) / 2)  # crop center y
194 |         dets['x'].append((det[0] + det[2]) / 2)  # crop center x
195 |
196 | # Smooth detections
197 | dets['s'] = signal.medfilt(dets['s'], kernel_size=13)
198 | dets['x'] = signal.medfilt(dets['x'], kernel_size=13)
199 | dets['y'] = signal.medfilt(dets['y'], kernel_size=13)
200 | cs = opt.crop_scale
201 |     length = int(dets['s'].max() * 2 * (1 + cs))  # output crop size: largest face box plus padding
202 |     vOut = cv2.VideoWriter(cropfile + 't.avi', fourcc, opt.frame_rate, (length, length))
203 |
204 | for fidx, frame in enumerate(track['frame']):
205 | cs = opt.crop_scale
206 |
207 | bs = dets['s'][fidx] # Detection box size
208 | bsi = int(bs * (1 + 2 * cs)) # Pad videos by this amount
209 |
210 | image = cv2.imread(flist[frame])
211 |
212 | frame = np.pad(image, ((bsi, bsi), (bsi, bsi), (0, 0)), 'constant', constant_values=(110, 110))
213 | my = dets['y'][fidx] + bsi # BBox center Y
214 | mx = dets['x'][fidx] + bsi # BBox center X
215 |
216 | face = frame[int(my - bs):int(my + bs * (1 + 2 * cs)), int(mx - bs * (1 + cs)):int(mx + bs * (1 + cs))]
217 |
218 |         vOut.write(cv2.resize(face,(length,length)))
219 | audiotmp = os.path.join(opt.tmp_dir, opt.reference, 'audio.wav')
220 | audiostart = (track['frame'][0]) / opt.frame_rate
221 | audioend = (track['frame'][-1] + 1) / opt.frame_rate
222 |
223 | vOut.release()
224 |
225 | # ========== CROP AUDIO FILE ==========
226 |
227 | command = ("ffmpeg -y -i %s -ss %.3f -to %.3f %s" % (
228 | os.path.join(opt.avi_dir, opt.reference, 'audio.wav'), audiostart, audioend, audiotmp))
229 | output = subprocess.call(command, shell=True, stdout=None)
230 |
231 | if output != 0:
232 | pdb.set_trace()
233 |
234 | sample_rate, audio = wavfile.read(audiotmp)
235 |
236 | # ========== COMBINE AUDIO AND VIDEO FILES ==========
237 |
238 | command = ("ffmpeg -y -i %st.avi -i %s -c:v copy -c:a copy %s_hq.avi" % (cropfile, audiotmp, cropfile))
239 | output = subprocess.call(command, shell=True, stdout=None)
240 |
241 | if output != 0:
242 | pdb.set_trace()
243 |
244 | print('Written %s' % cropfile)
245 |
246 | os.remove(cropfile + 't.avi')
247 |
248 | print('Mean pos: x %.2f y %.2f s %.2f' % (np.mean(dets['x']), np.mean(dets['y']), np.mean(dets['s'])))
249 |
250 | return {'track': track, 'proc_track': dets}
251 |
252 |
253 | # ========== ========== ========== ==========
254 | # # FACE DETECTION
255 | # ========== ========== ========== ==========
256 |
257 | def inference_video(opt):
258 |
259 | DET = S3FD(device='cuda')
260 |
261 | flist = glob.glob(os.path.join(opt.frames_dir,opt.reference,'*.jpg'))
262 | flist.sort()
263 |
264 | dets = []
265 |
266 | for fidx, fname in enumerate(flist):
267 |
268 | start_time = time.time()
269 |
270 | image = cv2.imread(fname)
271 |
272 | image_np = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
273 | bboxes = DET.detect_faces(image_np, conf_th=0.9, scales=[opt.facedet_scale])
274 |
275 | dets.append([]);
276 | for bbox in bboxes:
277 | dets[-1].append({'frame':fidx, 'bbox':(bbox[:-1]).tolist(), 'conf':bbox[-1]})
278 |
279 | elapsed_time = time.time() - start_time
280 |
281 | print('%s-%05d; %d dets; %.2f Hz' % (os.path.join(opt.avi_dir,opt.reference,'video.avi'),fidx,len(dets[-1]),(1/elapsed_time)))
282 |
283 | savepath = os.path.join(opt.work_dir,opt.reference,'faces.pckl')
284 |
285 | with open(savepath, 'wb') as fil:
286 | pickle.dump(dets, fil)
287 |
288 | return dets
289 |
290 | # ========== ========== ========== ==========
291 | # # SCENE DETECTION
292 | # ========== ========== ========== ==========
293 |
294 | def scene_detect(opt):
295 |
296 | video_manager = VideoManager([os.path.join(opt.avi_dir,opt.reference,'video.avi')])
297 | stats_manager = StatsManager()
298 | scene_manager = SceneManager(stats_manager)
299 | # Add ContentDetector algorithm (constructor takes detector options like threshold).
300 | scene_manager.add_detector(ContentDetector())
301 | base_timecode = video_manager.get_base_timecode()
302 |
303 | video_manager.set_downscale_factor()
304 |
305 | video_manager.start()
306 |
307 | scene_manager.detect_scenes(frame_source=video_manager)
308 |
309 | scene_list = scene_manager.get_scene_list(base_timecode)
310 |
311 | savepath = os.path.join(opt.work_dir,opt.reference,'scene.pckl')
312 |
313 | if scene_list == []:
314 | scene_list = [(video_manager.get_base_timecode(),video_manager.get_current_timecode())]
315 |
316 | with open(savepath, 'wb') as fil:
317 | pickle.dump(scene_list, fil)
318 |
319 | print('%s - scenes detected %d'%(os.path.join(opt.avi_dir,opt.reference,'video.avi'),len(scene_list)))
320 |
321 | return scene_list
322 |
323 |
324 | # ========== ========== ========== ==========
325 | # # EXECUTE DEMO
326 | # ========== ========== ========== ==========
327 |
328 | # ========== DELETE EXISTING DIRECTORIES ==========
329 |
330 | if os.path.exists(os.path.join(opt.work_dir,opt.reference)):
331 | rmtree(os.path.join(opt.work_dir,opt.reference))
332 |
333 | if os.path.exists(os.path.join(opt.crop_dir,opt.reference)):
334 | rmtree(os.path.join(opt.crop_dir,opt.reference))
335 |
336 | if os.path.exists(os.path.join(opt.avi_dir,opt.reference)):
337 | rmtree(os.path.join(opt.avi_dir,opt.reference))
338 |
339 | if os.path.exists(os.path.join(opt.frames_dir,opt.reference)):
340 | rmtree(os.path.join(opt.frames_dir,opt.reference))
341 |
342 | if os.path.exists(os.path.join(opt.tmp_dir,opt.reference)):
343 | rmtree(os.path.join(opt.tmp_dir,opt.reference))
344 |
345 | # ========== MAKE NEW DIRECTORIES ==========
346 |
347 | os.makedirs(os.path.join(opt.work_dir,opt.reference))
348 | os.makedirs(os.path.join(opt.crop_dir,opt.reference))
349 | os.makedirs(os.path.join(opt.avi_dir,opt.reference))
350 | os.makedirs(os.path.join(opt.frames_dir,opt.reference))
351 | os.makedirs(os.path.join(opt.tmp_dir,opt.reference))
352 |
353 | # ========== CONVERT VIDEO AND EXTRACT FRAMES ==========
354 |
355 | command = ("ffmpeg -y -i %s -qscale:v 2 -async 1 -r 25 %s" % (opt.videofile,os.path.join(opt.avi_dir,opt.reference,'video.avi')))
356 | output = subprocess.call(command, shell=True, stdout=None)
357 |
358 | command = ("ffmpeg -y -i %s -qscale:v 2 -threads 1 -f image2 %s" % (os.path.join(opt.avi_dir,opt.reference,'video.avi'),os.path.join(opt.frames_dir,opt.reference,'%06d.jpg')))
359 | output = subprocess.call(command, shell=True, stdout=None)
360 |
361 | command = ("ffmpeg -y -i %s -ac 1 -vn -acodec pcm_s16le -ar 16000 %s" % (os.path.join(opt.avi_dir,opt.reference,'video.avi'),os.path.join(opt.avi_dir,opt.reference,'audio.wav')))
362 | output = subprocess.call(command, shell=True, stdout=None)
363 |
364 | # ========== FACE DETECTION ==========
365 |
366 | faces = inference_video(opt)
367 |
368 | # ========== SCENE DETECTION ==========
369 |
370 | scene = scene_detect(opt)
371 |
372 | # ========== FACE TRACKING ==========
373 |
374 | alltracks = []
375 | vidtracks = []
376 |
377 | for shot in scene:
378 |
379 | if shot[1].frame_num - shot[0].frame_num >= opt.min_track :
380 | alltracks.extend(track_shot(opt,faces[shot[0].frame_num:shot[1].frame_num]))
381 |
382 | # ========== FACE TRACK CROP ==========
383 |
384 | for ii, track in enumerate(alltracks):
385 | vidtracks.append(crop_video(opt,track,os.path.join(opt.crop_dir,opt.reference,'%05d'%ii)))
386 | crop_hq_video(opt,track,os.path.join(opt.crop_dir,opt.reference,'%05d'%ii))
387 |
388 | # ========== SAVE RESULTS ==========
389 |
390 | savepath = os.path.join(opt.work_dir,opt.reference,'tracks.pckl')
391 |
392 | with open(savepath, 'wb') as fil:
393 | pickle.dump(vidtracks, fil)
394 |
395 | rmtree(os.path.join(opt.tmp_dir,opt.reference))
396 |
--------------------------------------------------------------------------------
/script/syncnet_python/run_pipeline_dir.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | import sys, time, os, pdb, argparse, pickle, subprocess, glob, cv2
4 | import numpy as np
5 | from shutil import rmtree
6 |
7 | import scenedetect
8 | from scenedetect.video_manager import VideoManager
9 | from scenedetect.scene_manager import SceneManager
10 | from scenedetect.frame_timecode import FrameTimecode
11 | from scenedetect.stats_manager import StatsManager
12 | from scenedetect.detectors import ContentDetector
13 |
14 | from scipy.interpolate import interp1d
15 | from scipy.io import wavfile
16 | from scipy import signal
17 |
18 | from detectors import S3FD
19 |
20 | # ========== ========== ========== ==========
21 | # # PARSE ARGS
22 | # ========== ========== ========== ==========
23 |
24 | parser = argparse.ArgumentParser(description = "FaceTracker");
25 | parser.add_argument('--data_dir', type=str, default='output', help='Output directory');
26 | parser.add_argument('--video_dir', type=str, default='input', help='Input video directory');
27 | parser.add_argument('--reference', type=str, default='', help='Video reference');
28 | parser.add_argument('--log_file', help='Log file path', default='output/input.txt', type=str);
29 | parser.add_argument('--facedet_scale', type=float, default=0.25, help='Scale factor for face detection');
30 | parser.add_argument('--crop_scale', type=float, default=0.40, help='Scale bounding box');
31 | parser.add_argument('--min_track', type=int, default=25, help='Minimum facetrack duration');
32 | parser.add_argument('--frame_rate', type=int, default=25, help='Frame rate');
33 | parser.add_argument('--num_failed_det', type=int, default=25, help='Number of missed detections allowed before tracking is stopped');
34 | parser.add_argument('--min_face_size', type=int, default=100, help='Minimum face size in pixels');
35 | opt = parser.parse_args();
36 | log_file = opt.log_file
37 | if not os.path.exists(log_file):
38 | print(f"Create log file {log_file}")
39 | os.system(f'touch {log_file}')
40 | setattr(opt,'avi_dir',os.path.join(opt.data_dir,'pyavi'))
41 | setattr(opt,'tmp_dir',os.path.join(opt.data_dir,'pytmp'))
42 | setattr(opt,'work_dir',os.path.join(opt.data_dir,'pywork'))
43 | setattr(opt,'crop_dir',os.path.join(opt.data_dir,'pycrop'))
44 | setattr(opt,'frames_dir',os.path.join(opt.data_dir,'pyframes'))
45 |
46 | # ========== ========== ========== ==========
47 | # # IOU FUNCTION
48 | # ========== ========== ========== ==========
49 |
50 | def bb_intersection_over_union(boxA, boxB):
51 |
52 | xA = max(boxA[0], boxB[0])
53 | yA = max(boxA[1], boxB[1])
54 | xB = min(boxA[2], boxB[2])
55 | yB = min(boxA[3], boxB[3])
56 |
57 | interArea = max(0, xB - xA) * max(0, yB - yA)
58 |
59 | boxAArea = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
60 | boxBArea = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])
61 |
62 | iou = interArea / float(boxAArea + boxBArea - interArea)
63 |
64 | return iou
65 |
66 | # ========== ========== ========== ==========
67 | # # FACE TRACKING
68 | # ========== ========== ========== ==========
69 |
70 | def track_shot(opt,scenefaces):
71 |
72 | iouThres = 0.5 # Minimum IOU between consecutive face detections
73 | tracks = []
74 |
75 | while True:
76 | track = []
77 | for framefaces in scenefaces:
78 | for face in framefaces:
79 | if track == []:
80 | track.append(face)
81 | framefaces.remove(face)
82 | elif face['frame'] - track[-1]['frame'] <= opt.num_failed_det:
83 | iou = bb_intersection_over_union(face['bbox'], track[-1]['bbox'])
84 | if iou > iouThres:
85 | track.append(face)
86 | framefaces.remove(face)
87 | continue
88 | else:
89 | break
90 |
91 | if track == []:
92 | break
93 | elif len(track) > opt.min_track:
94 |
95 | framenum = np.array([ f['frame'] for f in track ])
96 | bboxes = np.array([np.array(f['bbox']) for f in track])
97 |
98 | frame_i = np.arange(framenum[0],framenum[-1]+1)
99 |
100 | bboxes_i = []
101 | for ij in range(0,4):
102 | interpfn = interp1d(framenum, bboxes[:,ij])
103 | bboxes_i.append(interpfn(frame_i))
104 | bboxes_i = np.stack(bboxes_i, axis=1)
105 |
106 | if max(np.mean(bboxes_i[:,2]-bboxes_i[:,0]), np.mean(bboxes_i[:,3]-bboxes_i[:,1])) > opt.min_face_size:
107 | tracks.append({'frame':frame_i,'bbox':bboxes_i})
108 |
109 | return tracks
110 |
111 | # ========== ========== ========== ==========
112 | # # VIDEO CROP AND SAVE
113 | # ========== ========== ========== ==========
114 |
115 | def crop_video(opt,track,cropfile):
116 |
117 | flist = glob.glob(os.path.join(opt.frames_dir,opt.reference,'*.jpg'))
118 | flist.sort()
119 |
120 | fourcc = cv2.VideoWriter_fourcc(*'XVID')
121 | vOut = cv2.VideoWriter(cropfile+'t.avi', fourcc, opt.frame_rate, (224,224))
122 |
123 | dets = {'x':[], 'y':[], 's':[]}
124 |
125 | for det in track['bbox']:
126 |
127 | dets['s'].append(max((det[3]-det[1]),(det[2]-det[0]))/2)
128 |     dets['y'].append((det[1]+det[3])/2) # crop center y
129 |     dets['x'].append((det[0]+det[2])/2) # crop center x
130 |
131 | # Smooth detections
132 | dets['s'] = signal.medfilt(dets['s'],kernel_size=13)
133 | dets['x'] = signal.medfilt(dets['x'],kernel_size=13)
134 | dets['y'] = signal.medfilt(dets['y'],kernel_size=13)
135 |
136 | for fidx, frame in enumerate(track['frame']):
137 |
138 | cs = opt.crop_scale
139 |
140 | bs = dets['s'][fidx] # Detection box size
141 | bsi = int(bs*(1+2*cs)) # Pad videos by this amount
142 |
143 | image = cv2.imread(flist[frame])
144 |
145 | frame = np.pad(image,((bsi,bsi),(bsi,bsi),(0,0)), 'constant', constant_values=(110,110))
146 | my = dets['y'][fidx]+bsi # BBox center Y
147 | mx = dets['x'][fidx]+bsi # BBox center X
148 |
149 | face = frame[int(my-bs):int(my+bs*(1+2*cs)),int(mx-bs*(1+cs)):int(mx+bs*(1+cs))]
150 |
151 | vOut.write(cv2.resize(face,(224,224)))
152 |
153 | audiotmp = os.path.join(opt.tmp_dir,opt.reference,'audio.wav')
154 | audiostart = (track['frame'][0])/opt.frame_rate
155 | audioend = (track['frame'][-1]+1)/opt.frame_rate
156 |
157 | vOut.release()
158 |
159 | # ========== CROP AUDIO FILE ==========
160 |
161 | command = ("ffmpeg -y -i %s -ss %.3f -to %.3f %s" % (os.path.join(opt.avi_dir,opt.reference,'audio.wav'),audiostart,audioend,audiotmp))
162 | output = subprocess.call(command, shell=True, stdout=None)
163 |
164 | if output != 0:
165 | pdb.set_trace()
166 |
167 | sample_rate, audio = wavfile.read(audiotmp)
168 |
169 | # ========== COMBINE AUDIO AND VIDEO FILES ==========
170 |
171 | command = ("ffmpeg -y -i %st.avi -i %s -c:v copy -c:a copy %s.avi" % (cropfile,audiotmp,cropfile))
172 | output = subprocess.call(command, shell=True, stdout=None)
173 |
174 | if output != 0:
175 | pdb.set_trace()
176 |
177 | print('Written %s'%cropfile)
178 |
179 | os.remove(cropfile+'t.avi')
180 |
181 | print('Mean pos: x %.2f y %.2f s %.2f'%(np.mean(dets['x']),np.mean(dets['y']),np.mean(dets['s'])))
182 |
183 | return {'track':track, 'proc_track':dets}
184 |
185 |
186 | def crop_hq_video(opt, track, cropfile):
187 | flist = glob.glob(os.path.join(opt.frames_dir, opt.reference, '*.jpg'))
188 | flist.sort()
189 |
190 | fourcc = cv2.VideoWriter_fourcc(*'XVID')
191 |
192 |
193 | dets = {'x': [], 'y': [], 's': []}
194 |
195 | for det in track['bbox']:
196 | dets['s'].append(max((det[3] - det[1]), (det[2] - det[0])) / 2)
197 |         dets['y'].append((det[1] + det[3]) / 2)  # crop center y
198 |         dets['x'].append((det[0] + det[2]) / 2)  # crop center x
199 |
200 | # Smooth detections
201 | dets['s'] = signal.medfilt(dets['s'], kernel_size=13)
202 | dets['x'] = signal.medfilt(dets['x'], kernel_size=13)
203 | dets['y'] = signal.medfilt(dets['y'], kernel_size=13)
204 | cs = opt.crop_scale
205 |     length = int(dets['s'].max() * 2 * (1 + cs))  # output frame size, including crop_scale padding
206 |     vOut = cv2.VideoWriter(cropfile + 't.avi', fourcc, opt.frame_rate, (length, length))
207 |
208 | for fidx, frame in enumerate(track['frame']):
209 | cs = opt.crop_scale
210 |
211 | bs = dets['s'][fidx] # Detection box size
212 | bsi = int(bs * (1 + 2 * cs)) # Pad videos by this amount
213 |
214 | image = cv2.imread(flist[frame])
215 |
216 | frame = np.pad(image, ((bsi, bsi), (bsi, bsi), (0, 0)), 'constant', constant_values=(110, 110))
217 | my = dets['y'][fidx] + bsi # BBox center Y
218 | mx = dets['x'][fidx] + bsi # BBox center X
219 |
220 | face = frame[int(my - bs):int(my + bs * (1 + 2 * cs)), int(mx - bs * (1 + cs)):int(mx + bs * (1 + cs))]
221 |
222 |         vOut.write(cv2.resize(face, (length, length)))
223 | audiotmp = os.path.join(opt.tmp_dir, opt.reference, 'audio.wav')
224 | audiostart = (track['frame'][0]) / opt.frame_rate
225 | audioend = (track['frame'][-1] + 1) / opt.frame_rate
226 |
227 | vOut.release()
228 |
229 | # ========== CROP AUDIO FILE ==========
230 |
231 | command = ("ffmpeg -y -i %s -ss %.3f -to %.3f %s" % (
232 | os.path.join(opt.avi_dir, opt.reference, 'audio.wav'), audiostart, audioend, audiotmp))
233 | output = subprocess.call(command, shell=True, stdout=None)
234 |
235 | if output != 0:
236 | pdb.set_trace()
237 |
238 | sample_rate, audio = wavfile.read(audiotmp)
239 |
240 | # ========== COMBINE AUDIO AND VIDEO FILES ==========
241 |
242 | command = ("ffmpeg -y -i %st.avi -i %s -c:v copy -c:a copy %s_hq.avi" % (cropfile, audiotmp, cropfile))
243 | output = subprocess.call(command, shell=True, stdout=None)
244 |
245 | if output != 0:
246 | pdb.set_trace()
247 |
248 | print('Written %s' % cropfile)
249 |
250 | os.remove(cropfile + 't.avi')
251 |
252 | print('Mean pos: x %.2f y %.2f s %.2f' % (np.mean(dets['x']), np.mean(dets['y']), np.mean(dets['s'])))
253 |
254 | return {'track': track, 'proc_track': dets}
255 |
256 |
257 | # ========== ========== ========== ==========
258 | # # FACE DETECTION
259 | # ========== ========== ========== ==========
260 |
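# inference_video runs the S3FD face detector on every extracted frame (confidence threshold 0.9)
# and pickles the per-frame detections to <work_dir>/<reference>/faces.pckl for the tracking stage.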
261 | def inference_video(opt):
262 |
263 | DET = S3FD(device='cuda')
264 |
265 | flist = glob.glob(os.path.join(opt.frames_dir,opt.reference,'*.jpg'))
266 | flist.sort()
267 |
268 | dets = []
269 |
270 | for fidx, fname in enumerate(flist):
271 |
272 | start_time = time.time()
273 |
274 | image = cv2.imread(fname)
275 |
276 | image_np = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
277 | bboxes = DET.detect_faces(image_np, conf_th=0.9, scales=[opt.facedet_scale])
278 |
279 |         dets.append([])
280 | for bbox in bboxes:
281 | dets[-1].append({'frame':fidx, 'bbox':(bbox[:-1]).tolist(), 'conf':bbox[-1]})
282 |
283 | elapsed_time = time.time() - start_time
284 |
285 | print('%s-%05d; %d dets; %.2f Hz' % (os.path.join(opt.avi_dir,opt.reference,'video.avi'),fidx,len(dets[-1]),(1/elapsed_time)))
286 |
287 | savepath = os.path.join(opt.work_dir,opt.reference,'faces.pckl')
288 |
289 | with open(savepath, 'wb') as fil:
290 | pickle.dump(dets, fil)
291 |
292 | return dets
293 |
294 | # ========== ========== ========== ==========
295 | # # SCENE DETECTION
296 | # ========== ========== ========== ==========
297 |
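# scene_detect splits the video into shots with PySceneDetect's ContentDetector; if no cut is
# found, the whole video is treated as a single scene so that face tracking still runs.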
298 | def scene_detect(opt):
299 |
300 | video_manager = VideoManager([os.path.join(opt.avi_dir,opt.reference,'video.avi')])
301 | stats_manager = StatsManager()
302 | scene_manager = SceneManager(stats_manager)
303 | # Add ContentDetector algorithm (constructor takes detector options like threshold).
304 | scene_manager.add_detector(ContentDetector())
305 | base_timecode = video_manager.get_base_timecode()
306 |
307 | video_manager.set_downscale_factor()
308 |
309 | video_manager.start()
310 |
311 | scene_manager.detect_scenes(frame_source=video_manager)
312 |
313 | scene_list = scene_manager.get_scene_list(base_timecode)
314 |
315 | savepath = os.path.join(opt.work_dir,opt.reference,'scene.pckl')
316 |
317 | if scene_list == []:
318 | scene_list = [(video_manager.get_base_timecode(),video_manager.get_current_timecode())]
319 |
320 | with open(savepath, 'wb') as fil:
321 | pickle.dump(scene_list, fil)
322 |
323 | print('%s - scenes detected %d'%(os.path.join(opt.avi_dir,opt.reference,'video.avi'),len(scene_list)))
324 |
325 | return scene_list
326 |
327 |
328 | # ========== ========== ========== ==========
329 | # # EXECUTE DEMO
330 | # ========== ========== ========== ==========
331 |
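# Driver loop: every .mp4 under opt.video_dir is processed once; finished videos are appended to
# log_file and skipped on later runs, so the script can be restarted after an interruption.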
332 | # ========== DELETE EXISTING DIRECTORIES ==========
333 | video_root = opt.video_dir
334 | videos = os.listdir(video_root)
335 | videos = [x for x in videos if x.endswith('.mp4') or os.remove(os.path.join(video_root, x))]  # keep .mp4 files; os.remove returns None, so other files are deleted and filtered out
336 | videos.sort()
337 | # root_manifest = os.path.join('manifest', identity)
338 | with open(log_file, 'r') as f:
339 | CompleteID = f.read().split("\n")
340 |
341 | for video in videos:
342 | if video in CompleteID:
343 | fullPathVideo = os.path.join(video_root, video)
344 | print(f' video {fullPathVideo} was OK')
345 | else:
346 | video_name = video.split('.')[0]
347 | fullPathVideo = os.path.join(video_root, video)
348 | print(f'processing video {fullPathVideo}')
349 | # manifest = os.path.join(root_manifest, video.split('.')[0] + '.text')
350 | opt.reference = video_name
351 | opt.videofile = fullPathVideo
352 | if os.path.exists(os.path.join(opt.work_dir,opt.reference)):
353 | rmtree(os.path.join(opt.work_dir,opt.reference))
354 |
355 | if os.path.exists(os.path.join(opt.crop_dir,opt.reference)):
356 | rmtree(os.path.join(opt.crop_dir,opt.reference))
357 |
358 | if os.path.exists(os.path.join(opt.avi_dir,opt.reference)):
359 | rmtree(os.path.join(opt.avi_dir,opt.reference))
360 |
361 | if os.path.exists(os.path.join(opt.frames_dir,opt.reference)):
362 | rmtree(os.path.join(opt.frames_dir,opt.reference))
363 |
364 | if os.path.exists(os.path.join(opt.tmp_dir,opt.reference)):
365 | rmtree(os.path.join(opt.tmp_dir,opt.reference))
366 |
367 | # ========== MAKE NEW DIRECTORIES ==========
368 |
369 | os.makedirs(os.path.join(opt.work_dir,opt.reference))
370 | os.makedirs(os.path.join(opt.crop_dir,opt.reference))
371 | os.makedirs(os.path.join(opt.avi_dir,opt.reference))
372 | os.makedirs(os.path.join(opt.frames_dir,opt.reference))
373 | os.makedirs(os.path.join(opt.tmp_dir,opt.reference))
374 |
375 | # ========== CONVERT VIDEO AND EXTRACT FRAMES ==========
376 |
377 | command = ("ffmpeg -y -i %s -qscale:v 2 -async 1 -r 25 %s" % (opt.videofile,os.path.join(opt.avi_dir,opt.reference,'video.avi')))
378 | output = subprocess.call(command, shell=True, stdout=None)
379 |
380 | command = ("ffmpeg -y -i %s -qscale:v 2 -threads 1 -f image2 %s" % (os.path.join(opt.avi_dir,opt.reference,'video.avi'),os.path.join(opt.frames_dir,opt.reference,'%06d.jpg')))
381 | output = subprocess.call(command, shell=True, stdout=None)
382 |
383 | command = ("ffmpeg -y -i %s -ac 1 -vn -acodec pcm_s16le -ar 16000 %s" % (os.path.join(opt.avi_dir,opt.reference,'video.avi'),os.path.join(opt.avi_dir,opt.reference,'audio.wav')))
384 | output = subprocess.call(command, shell=True, stdout=None)
385 |
386 | # ========== FACE DETECTION ==========
387 |
388 | faces = inference_video(opt)
389 |
390 | # ========== SCENE DETECTION ==========
391 |
392 | scene = scene_detect(opt)
393 |
394 | # ========== FACE TRACKING ==========
395 |
396 | alltracks = []
397 | vidtracks = []
398 |
399 | for shot in scene:
400 |
401 |         if shot[1].frame_num - shot[0].frame_num >= opt.min_track:
402 | alltracks.extend(track_shot(opt,faces[shot[0].frame_num:shot[1].frame_num]))
403 |
404 | # ========== FACE TRACK CROP ==========
405 |
406 | for ii, track in enumerate(alltracks):
407 | vidtracks.append(crop_video(opt,track,os.path.join(opt.crop_dir,opt.reference,'%05d'%ii)))
408 | crop_hq_video(opt,track,os.path.join(opt.crop_dir,opt.reference,'%05d'%ii))
409 |
410 | # ========== SAVE RESULTS ==========
411 |
412 | savepath = os.path.join(opt.work_dir,opt.reference,'tracks.pckl')
413 |
414 | with open(savepath, 'wb') as fil:
415 | pickle.dump(vidtracks, fil)
416 |
417 | rmtree(os.path.join(opt.tmp_dir,opt.reference))
418 | with open(log_file, 'a') as f:
419 | f.write(f"{video}\n")
420 | print(f' video {fullPathVideo} OK !!!')
421 |
--------------------------------------------------------------------------------
/script/syncnet_python/run_syncnet.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #-*- coding: utf-8 -*-
3 |
4 | import time, pdb, argparse, subprocess, pickle, os, gzip, glob
5 |
6 | from SyncNetInstance import *
7 |
8 | # ==================== PARSE ARGUMENT ====================
9 |
10 | parser = argparse.ArgumentParser(description = "SyncNet");
11 | parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help='');
12 | parser.add_argument('--batch_size', type=int, default=20, help='');
13 | parser.add_argument('--vshift', type=int, default=15, help='');
14 | parser.add_argument('--data_dir', type=str, default='test', help='');
15 | parser.add_argument('--videofile', type=str, default='', help='');
16 | parser.add_argument('--reference', type=str, default='', help='');
17 | opt = parser.parse_args();
18 |
19 | setattr(opt,'avi_dir',os.path.join(opt.data_dir,'pyavi'))
20 | setattr(opt,'tmp_dir',os.path.join(opt.data_dir,'pytmp'))
21 | setattr(opt,'work_dir',os.path.join(opt.data_dir,'pywork'))
22 | setattr(opt,'crop_dir',os.path.join(opt.data_dir,'pycrop'))
23 |
24 |
25 | # ==================== LOAD MODEL AND FILE LIST ====================
26 |
27 | s = SyncNetInstance();
28 |
29 | s.loadParameters(opt.initial_model);
30 | print("Model %s loaded."%opt.initial_model);
31 |
32 | flist = glob.glob(os.path.join(opt.crop_dir,opt.reference,'0*.avi'))
33 | flist.sort()
34 |
35 | # ==================== GET OFFSETS ====================
36 |
37 | dists = []
38 | for idx, fname in enumerate(flist):
39 | offset, conf, dist = s.evaluate(opt,videofile=fname)
40 | dists.append(dist)
41 |
42 | # ==================== PRINT RESULTS TO FILE ====================
43 | #
44 | # with open(os.path.join(opt.work_dir,opt.reference,'activesd.pckl'), 'wb') as fil:
45 | # pickle.dump(dists, fil)
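# NOTE: run_visualise.py reads activesd.pckl from the work directory, so the block above needs to
# be re-enabled before running the visualisation step.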
46 |
--------------------------------------------------------------------------------
/script/syncnet_python/run_syncnet_dir.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # -*- coding: utf-8 -*-
3 |
4 | import time, pdb, argparse, subprocess, pickle, os, gzip, glob
5 |
6 | from SyncNetInstance import *
7 |
8 | # ==================== PARSE ARGUMENT ====================
9 |
10 | parser = argparse.ArgumentParser(description="SyncNet");
11 | parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help='');
12 | parser.add_argument('--batch_size', type=int, default=20, help='');
13 | parser.add_argument('--vshift', type=int, default=15, help='');
14 | parser.add_argument('--output_root', type=str, default='syncnet_output', help='Output directory');
15 | parser.add_argument('--video_root', type=str, default='output/pycrop', help='Root directory of cropped input videos');
16 | parser.add_argument('--reference', type=str, default='', help='Video reference');
17 | parser.add_argument('--log_file', help='Log of completed identities', default='syncnet_output/complete.txt', type=str);
18 | parser.add_argument('--videofile', type=str, default='', help='');
19 | opt = parser.parse_args();
20 |
21 | setattr(opt, 'avi_dir', os.path.join(opt.output_root, 'pyavi'))
22 | setattr(opt, 'tmp_dir', os.path.join(opt.output_root, 'pytmp'))
23 | setattr(opt, 'work_dir', os.path.join(opt.output_root, 'pywork'))
24 | setattr(opt, 'crop_dir', os.path.join(opt.output_root, 'pycrop'))
25 |
26 | # ==================== LOAD MODEL AND FILE LIST ====================
27 |
28 | s = SyncNetInstance();
29 |
30 | s.loadParameters(opt.initial_model);
31 | print("Model %s loaded." % opt.initial_model);
32 |
33 | video_root = opt.video_root
34 | output_root = opt.output_root
35 | identities = os.listdir(video_root)
36 | identities = [x for x in identities]
37 | identities.sort()
38 | log_file = opt.log_file
39 | with open(log_file, 'r') as f:
40 | identityID = f.read().split("\n")
41 | for identity in identities:
42 | fullPathVideo = os.path.join(video_root, identity)
43 | if identity in identityID:
44 | print(f' videosSet {fullPathVideo} was OK')
45 | continue
46 | print(f'Processing videosSet {identity}')
47 | videos = os.listdir(os.path.join(video_root, identity))
48 |     videos = [x for x in videos if x.endswith('.avi') or os.remove(os.path.join(video_root, identity, x))]  # keep .avi files; os.remove returns None, so other files are deleted and filtered out
49 | videosList = []
50 | for video in videos:
51 | (shotname, extension) = os.path.splitext(video)
52 | if shotname.isalnum():
53 | videosList.append(shotname)
54 | for idx, fname in enumerate(videosList):
55 | normalPath = os.path.join(video_root, identity, fname + '.avi')
56 | hqPath = os.path.join(video_root, identity, fname + '_hq.avi')
57 | targetPath = os.path.join(output_root, identity + fname + '_hq.avi')
58 |         offset, conf, dist = s.evaluate(opt, videofile=normalPath)
59 |         # Zero AV offset: keep the HQ crop as-is; small offsets are corrected by shifting the audio with ffmpeg -itsoffset.
60 |         if offset == 0:
61 |             os.rename(hqPath, targetPath)
62 |         elif abs(offset) <= 5:
63 |             offset_sec = offset / 25  # convert the frame offset to seconds at 25 fps
64 |             cmd = f"ffmpeg -i {hqPath} -itsoffset {offset_sec} -i {hqPath} -map 0:v -map 1:a -b:v 8000k {targetPath}"
65 |             output = subprocess.call(cmd, shell=True, stdout=None)
66 |             if output != 0:
67 |                 pdb.set_trace()
68 |         if os.path.exists(hqPath): os.remove(hqPath)  # drop the intermediate HQ crop (renamed, re-muxed, or too far out of sync)
69 | with open(log_file, 'a') as f:
70 | f.write(f"{identity}\n")
71 | print(f' videosSet {fullPathVideo} OK !!!')
72 |
--------------------------------------------------------------------------------
/script/syncnet_python/run_visualise.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #-*- coding: utf-8 -*-
3 |
4 | import torch
5 | import numpy
6 | import time, pdb, argparse, subprocess, pickle, os, glob
7 | import cv2
8 |
9 | from scipy import signal
10 |
11 | # ==================== PARSE ARGUMENT ====================
12 |
13 | parser = argparse.ArgumentParser(description = "SyncNet");
14 | parser.add_argument('--data_dir', type=str, default='data/work', help='');
15 | parser.add_argument('--videofile', type=str, default='', help='');
16 | parser.add_argument('--reference', type=str, default='', help='');
17 | parser.add_argument('--frame_rate', type=int, default=25, help='Frame rate');
18 | opt = parser.parse_args();
19 |
20 | setattr(opt,'avi_dir',os.path.join(opt.data_dir,'pyavi'))
21 | setattr(opt,'tmp_dir',os.path.join(opt.data_dir,'pytmp'))
22 | setattr(opt,'work_dir',os.path.join(opt.data_dir,'pywork'))
23 | setattr(opt,'crop_dir',os.path.join(opt.data_dir,'pycrop'))
24 | setattr(opt,'frames_dir',os.path.join(opt.data_dir,'pyframes'))
25 |
26 | # ==================== LOAD FILES ====================
27 |
28 | with open(os.path.join(opt.work_dir,opt.reference,'tracks.pckl'), 'rb') as fil:
29 | tracks = pickle.load(fil, encoding='latin1')
30 |
31 | with open(os.path.join(opt.work_dir,opt.reference,'activesd.pckl'), 'rb') as fil:
32 | dists = pickle.load(fil, encoding='latin1')
33 |
34 | flist = glob.glob(os.path.join(opt.frames_dir,opt.reference,'*.jpg'))
35 | flist.sort()
36 |
37 | # ==================== SMOOTH FACES ====================
38 |
39 | faces = [[] for i in range(len(flist))]
40 |
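# Per-frame sync confidence: the median of the mean distance curve minus each frame's distance at
# the best audio-video offset, smoothed with a 9-point median filter.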
41 | for tidx, track in enumerate(tracks):
42 |
43 | mean_dists = numpy.mean(numpy.stack(dists[tidx],1),1)
44 | minidx = numpy.argmin(mean_dists,0)
45 | minval = mean_dists[minidx]
46 |
47 | fdist = numpy.stack([dist[minidx] for dist in dists[tidx]])
48 | fdist = numpy.pad(fdist, (3,3), 'constant', constant_values=10)
49 |
50 | fconf = numpy.median(mean_dists) - fdist
51 | fconfm = signal.medfilt(fconf,kernel_size=9)
52 |
53 |     for fidx, frame in enumerate(track['track']['frame'].tolist()):
54 | faces[frame].append({'track': tidx, 'conf':fconfm[fidx], 's':track['proc_track']['s'][fidx], 'x':track['proc_track']['x'][fidx], 'y':track['proc_track']['y'][fidx]})
55 |
56 | # ==================== ADD DETECTIONS TO VIDEO ====================
57 |
58 | first_image = cv2.imread(flist[0])
59 |
60 | fw = first_image.shape[1]
61 | fh = first_image.shape[0]
62 |
63 | fourcc = cv2.VideoWriter_fourcc(*'XVID')
64 | vOut = cv2.VideoWriter(os.path.join(opt.avi_dir,opt.reference,'video_only.avi'), fourcc, opt.frame_rate, (fw,fh))
65 |
66 | for fidx, fname in enumerate(flist):
67 |
68 | image = cv2.imread(fname)
69 |
70 | for face in faces[fidx]:
71 |
72 | clr = max(min(face['conf']*25,255),0)
73 |
74 | cv2.rectangle(image,(int(face['x']-face['s']),int(face['y']-face['s'])),(int(face['x']+face['s']),int(face['y']+face['s'])),(0,clr,255-clr),3)
75 | cv2.putText(image,'Track %d, Conf %.3f'%(face['track'],face['conf']), (int(face['x']-face['s']),int(face['y']-face['s'])),cv2.FONT_HERSHEY_SIMPLEX,0.5,(255,255,255),2)
76 |
77 | vOut.write(image)
78 |
79 | print('Frame %d'%fidx)
80 |
81 | vOut.release()
82 |
83 | # ========== COMBINE AUDIO AND VIDEO FILES ==========
84 |
85 | command = ("ffmpeg -y -i %s -i %s -c:v copy -c:a copy %s" % (os.path.join(opt.avi_dir,opt.reference,'video_only.avi'),os.path.join(opt.avi_dir,opt.reference,'audio.wav'),os.path.join(opt.avi_dir,opt.reference,'video_out.avi'))) #-async 1
86 | output = subprocess.call(command, shell=True, stdout=None)
87 |
88 |
89 |
--------------------------------------------------------------------------------
/script/syncnet_python/syncnet_videos.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Kafeyun/Wav2Lip-Ultra/23f6a98c3785c2039d032a8c68ee4de63bc4ddd0/script/syncnet_python/syncnet_videos.py
--------------------------------------------------------------------------------