├── README.md
├── audio_feature_extractor.py
├── data
│   ├── test_viseme_13.npy
│   └── train_viseme_13.npy
├── model
│   └── audio2pho_model_mfa13label_ep300_1e-4_32.h5
├── requirements.txt
├── train.py
└── utils.py

/README.md:
--------------------------------------------------------------------------------
# audio2viseme

This code generates visemes from audio features.

## Installation

The code uses Python 3.8.2 and was tested with TensorFlow 2.4.1.

Install ffmpeg.

Clone the repository:
```
$ git clone https://github.com/liukuangxiangzi/audio2viseme.git
```
The requirements (including tensorflow) can be installed using:
```
pip install -r requirements.txt
```
## Data
#### Data used to train

audio2viseme is trained on the first speaker of the GRID corpus: 47.2 minutes of audio, frame-aligned at 25 fps, with synchronized transcriptions. The corpus can be downloaded at [The GRID](http://spandh.dcs.shef.ac.uk/gridcorpus/).


## Audio Feature Extraction

You can run audio_feature_extractor.py to extract audio features from audio files. The arguments are as follows:

* -i --- Input folder containing audio files (if your audio files are not .wav, please modify the script accordingly)
* -d --- Delay in terms of frames, where one frame is 40 ms
* -c --- Number of context frames
* -o --- Output folder path to save the audio features

Usage:

```
python audio_feature_extractor.py -i path-to-audio-files/ -d number-of-delay-frames -c number-of-context-frames -o output-folder-to-save-audio-feature-file
```

## Training

The training code has the following arguments:

* -x --- Input audio feature file for training
* -X --- Input audio feature file for testing
* -y --- Input viseme file for training
* -Y --- Input viseme file for testing
* -o --- Output path to save the model file

Usage:

```
python train.py -x path-to-training-audio-feature-file -X path-to-testing-audio-feature-file -y path-to-training-viseme-file -Y path-to-testing-viseme-file -o path-to-save-model-file
```
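## Inference (illustrative sketch)

The repository does not ship an inference script. A minimal sketch of how the extracted features and the pretrained model in `model/` could be combined is shown below; the feature path follows the example command in audio_feature_extractor.py, and taking the prediction at the last timestep of each 75-frame window as the viseme for that frame is an assumption, not something prescribed by the original code.

```
# inference sketch -- illustrative only, not part of the original scripts
import numpy as np
from keras.models import load_model

features = np.load('data/result/Xtext.npy')   # saved by audio_feature_extractor.py, shape (num_frames, 75, 768)
model = load_model('model/audio2pho_model_mfa13label_ep300_1e-4_32.h5')

probs = model.predict(features)               # (num_frames, 75, 13) softmax over viseme classes
visemes = probs[:, -1, :].argmax(axis=-1)     # assumed: last timestep of each window gives the viseme id for that frame
print(visemes[:50])
```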
--------------------------------------------------------------------------------
/audio_feature_extractor.py:
--------------------------------------------------------------------------------
# Usage: python audio_feature_extractor.py -i audio/data/path/ -d number-of-delay-frames -c number-of-context-frames -o output/data/path
# e.g.:  python audio_feature_extractor.py -i data/test_audio/ -d 1 -c 5 -o data/result/

import librosa
import numpy as np
import matplotlib as mpl
mpl.use('Agg')
import os, shutil
from tqdm import tqdm
import utils
import argparse


def addContext(melSpc, ctxWin):
    # Stack each frame with the ctxWin preceding frames along the feature axis.
    ctx = melSpc[:, :]
    filler = melSpc[0, :]
    for i in range(ctxWin):
        melSpc = np.insert(melSpc, 0, filler, axis=0)[:ctx.shape[0], :]
        ctx = np.append(ctx, melSpc, axis=1)
    return ctx


parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("-i", "--in-file", type=str, help="input folder containing speech files")
parser.add_argument("-d", "--delay", type=int, help="delay in terms of number of frames, where each frame is 40 ms")
parser.add_argument("-c", "--ctx", type=int, help="context window size")
parser.add_argument("-o", "--out_fold", type=str, help="output folder")
args = parser.parse_args()

output_path = args.out_fold
num_features_Y = 136
num_frames = 75
wsize = 0.04
hsize = wsize
fs = 48000  # 44100
trainDelay = args.delay
ctxWin = args.ctx

if not os.path.exists(output_path):
    os.makedirs(output_path)
else:
    shutil.rmtree(output_path)
    os.mkdir(output_path)


audio_folder_path = args.in_file
generated_all = []
cur_features_to_save = []

for root, dirnames, filenames in os.walk(audio_folder_path):
    filenames.sort()
    if os.path.exists(os.path.join(root, '.DS_Store')) is True:
        filenames.remove('.DS_Store')
    for filename in filenames:
        # Used for padding zeros to first and second temporal differences
        zeroVecD = np.zeros((1, 64), dtype='f16')
        zeroVecDD = np.zeros((2, 64), dtype='f16')

        # Load speech and extract features
        sound, sr = librosa.load(os.path.join(audio_folder_path, filename), sr=fs)  # (132300,) sr=50000
        print("sound,sr", np.shape(sound), sr)

        melFrames = np.transpose(utils.melSpectra(sound, sr, wsize, hsize))
        print("melFrames", np.shape(melFrames))
        melDelta = np.insert(np.diff(melFrames, n=1, axis=0), 0, zeroVecD, axis=0)
        print("melDelta", np.shape(melDelta))
        melDDelta = np.insert(np.diff(melFrames, n=2, axis=0), 0, zeroVecDD, axis=0)
        print("melDDelta", np.shape(melDDelta))

        features = np.concatenate((melDelta, melDDelta), axis=1)  # 76,128
        print("features1", np.shape(features))
        features = addContext(features, ctxWin)  # 76,768
        print("features2 context5", np.shape(features))
        features = np.reshape(features, (1, features.shape[0], features.shape[1]))  # 1,76,768
        print("features3", np.shape(features))

        upper_limit = features.shape[1]
        print("upper_limit", upper_limit)
        lower = 0
        generated = np.zeros((0, num_features_Y))

        # Builds the sliding 75-frame input window for each frame one-by-one.
        # This part can be modified to process the whole sequence at once, but may introduce discontinuities.
        for i in tqdm(range(upper_limit)):
            cur_features = np.zeros((1, num_frames, features.shape[2]))
            if i + 1 > 75:
                lower = i + 1 - 75
            cur_features[:, -i-1:, :] = features[:, lower:i+1, :]
            cur_features.resize(np.shape(cur_features)[1], np.shape(cur_features)[2])
            print(np.shape(cur_features))
            cur_features_to_save.append(cur_features)

np.save(output_path + '/Xtext.npy', cur_features_to_save)
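A note on `addContext` (an illustration, not part of the repository): each output row stacks the current frame with the `ctxWin` preceding frames along the feature axis, padding with the first frame at the start, so an input of shape (T, F) becomes (T, F * (ctxWin + 1)). With the 128 delta/delta-delta features and the example `-c 5`, that gives the 768-dimensional vectors noted in the comments above.

```
import numpy as np

def addContext(melSpc, ctxWin):
    ctx = melSpc[:, :]
    filler = melSpc[0, :]
    for i in range(ctxWin):
        melSpc = np.insert(melSpc, 0, filler, axis=0)[:ctx.shape[0], :]
        ctx = np.append(ctx, melSpc, axis=1)
    return ctx

x = np.arange(8).reshape(4, 2)   # 4 frames, 2 features each
y = addContext(x, ctxWin=2)
print(y.shape)                    # (4, 6)
print(y[2])                       # [4 5 2 3 0 1] -> frame 2, frame 1, frame 0 side by side
```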
--------------------------------------------------------------------------------
/data/test_viseme_13.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/liukuangxiangzi/audio2viseme/24dcf48a6f8e24a0c332d797f5b1cb22cd3137e6/data/test_viseme_13.npy
--------------------------------------------------------------------------------
/data/train_viseme_13.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/liukuangxiangzi/audio2viseme/24dcf48a6f8e24a0c332d797f5b1cb22cd3137e6/data/train_viseme_13.npy
--------------------------------------------------------------------------------
/model/audio2pho_model_mfa13label_ep300_1e-4_32.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/liukuangxiangzi/audio2viseme/24dcf48a6f8e24a0c332d797f5b1cb22cd3137e6/model/audio2pho_model_mfa13label_ep300_1e-4_32.h5
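Before training, the bundled viseme label files can be sanity-checked with a short snippet (illustrative only; the expected counts follow the hard-coded reshapes in train.py below, and the exact stored shape of the .npy files is an assumption):

```
import numpy as np

train_y = np.load('data/train_viseme_13.npy')
test_y = np.load('data/test_viseme_13.npy')

print(train_y.shape, test_y.shape)   # expected to hold 753*75 and 191*75 frame labels
print(np.unique(train_y))            # expected: integer viseme ids 0..12 (13 classes)
```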
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
tensorflow == 2.4.1
Keras == 2.4.1
Librosa == 0.8.0
matplotlib == 3.3.4
numpy == 1.19.2
tqdm == 4.59.0
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
# preprocess Y

import os
from datetime import datetime
import numpy as np
import argparse
from keras.layers import Input, LSTM
from keras.models import Model
from keras.utils import to_categorical
from keras.callbacks import TensorBoard
from keras.layers import Dense
from keras.layers import Dropout
from keras.optimizers import *
from keras.utils import plot_model


# Usage: python train.py -x path-of-audio-feature-file-for-training -X path-of-audio-feature-file-for-testing -y path-of-viseme-file-for-training -Y path-of-viseme-file-for-testing -o path-of-saving-model
# e.g.:  python train.py -x data/audio_feature_train.npy -X data/audio_feature_test.npy -y data/train_viseme_13.npy -Y data/test_viseme_13.npy -o model/audio2pho_model_mfa13label_ep300_1e-4_32.h5


parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("-x", "--train_x", type=str, help="input audio feature file for training")
parser.add_argument("-X", "--test_X", type=str, help="input audio feature file for testing")
parser.add_argument("-y", "--train_y", type=str, help="input viseme file for training")
parser.add_argument("-Y", "--test_Y", type=str, help="input viseme file for testing")
parser.add_argument("-o", "--out_fold", type=str, help="path to save the model file")
args = parser.parse_args()


def labelcategorical():
    label_train = np.array(np.load(args.train_y))  # train_viseme_label14H.npy
    label_test = np.array(np.load(args.test_Y))

    # 753 training and 191 test sequences of 75 frames each
    label_train = label_train.reshape(753, 75, -1)
    label_test = label_test.reshape(191, 75, -1)

    # one hot encode y
    label_categorical_train = to_categorical(label_train)
    label_categorical_test = to_categorical(label_test)
    return label_categorical_train, label_categorical_test


# load the dataset, returns train and test X and y elements
def load_dataset(prefix=''):
    # load all x
    trainX = np.load(args.train_x)  # trainX.shape = (753, 75, 768)
    print('trainX.shape', trainX.shape)
    testX = np.load(args.test_X)  # testX.shape = (191, 75, 768)
    print('testX.shape', testX.shape)
    # load all y
    trainy, testy = labelcategorical()
    print(trainX.shape, trainy.shape, testX.shape, testy.shape)
    return trainX, trainy, testX, testy


# set up tensorboard
# tensorboard --logdir /Users/liukuangxiangzi/PycharmProjects/audio2viseme/logs/ --host=127.0.0.1
log_dir = os.path.join(
    "logs",
    "fit",
    datetime.now().strftime("%Y%m%d-%H%M%S"),
)
tbCallBack = TensorBoard(log_dir=log_dir, histogram_freq=0, write_graph=True, write_images=True)

# X y data
trainX, trainy, testX, testy = load_dataset(prefix='')

verbose, epochs, batch_size = 0, 300, 32
h_dim = 256
n_timesteps, n_features = trainX.shape[1], trainX.shape[2]
print(n_timesteps, n_features)

drpRate = 0.2
recDrpRate = 0.2
lr = 1e-4
initializer = 'glorot_uniform'

# define model
net_in = Input(shape=(n_timesteps, n_features))
lstm1 = LSTM(h_dim,
             activation='sigmoid',
             dropout=drpRate,
             recurrent_dropout=recDrpRate,
             return_sequences=True)(net_in)
lstm2 = LSTM(h_dim,
             activation='sigmoid',
             dropout=drpRate,
             recurrent_dropout=recDrpRate,
             return_sequences=True)(lstm1)
lstm3 = LSTM(h_dim,
             activation='sigmoid',
             dropout=drpRate,
             recurrent_dropout=recDrpRate,
             return_sequences=True)(lstm2)

dropout = Dropout(0.5)(lstm3)

l1 = Dense(128,
           kernel_initializer=initializer,
           name='lm_Dense1', activation='relu')(dropout)

out = Dense(13,
            kernel_initializer=initializer, name='lm_out', activation='softmax')(l1)

model = Model(inputs=net_in, outputs=out)
model.summary()
opt = Adam(learning_rate=lr)
# opt = SGD(learning_rate=lr)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['acc'])


plot_model(model, to_file='audio2viseme_model.png', show_shapes=True, show_layer_names=True)

model.fit(trainX, trainy,
          validation_data=(testX, testy),
          epochs=epochs,
          batch_size=batch_size,
          callbacks=[tbCallBack]
          )

model.save(args.out_fold)
print("Saved model to disk")
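`labelcategorical()` above hard-codes the 753/191 sequence counts of the GRID subject-1 split. If you retrain on a different split, one possible adjustment (an untested sketch, not part of train.py) is to derive those counts from the loaded feature arrays instead:

```
# sketch: derive sequence counts from the feature file rather than hard-coding them
# assumes the label file stores one viseme id per frame, in the same order as the features
import numpy as np

trainX = np.load('data/audio_feature_train.npy')   # (n_seqs, 75, 768)
label_train = np.load('data/train_viseme_13.npy')

n_seqs, n_frames = trainX.shape[0], trainX.shape[1]
label_train = label_train.reshape(n_seqs, n_frames, -1)
print(label_train.shape)
```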
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
import matplotlib as mpl

mpl.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.animation as manimation
import matplotlib.lines as mlines
from matplotlib import transforms
import argparse, os, fnmatch, shutil
import numpy as np
import cv2
import math
import copy
import librosa
import soundfile as sf  # used for writing wavs (librosa.output was removed in librosa 0.8)
# import dlib
import subprocess
from keras import backend as K
from tqdm import tqdm

font = {'size': 18}
mpl.rc('font', **font)

# Lookup tables for drawing lines between points
Mouth = [[48, 49], [49, 50], [50, 51], [51, 52], [52, 53], [53, 54], [54, 55], [55, 56], [56, 57],
         [57, 58], [58, 59], [59, 48], [60, 61], [61, 62], [62, 63], [63, 64], [64, 65], [65, 66],
         [66, 67], [67, 60]]

Nose = [[27, 28], [28, 29], [29, 30], [30, 31], [30, 35], [31, 32], [32, 33],
        [33, 34], [34, 35], [27, 31], [27, 35]]

leftBrow = [[17, 18], [18, 19], [19, 20], [20, 21]]
rightBrow = [[22, 23], [23, 24], [24, 25], [25, 26]]

leftEye = [[36, 37], [37, 38], [38, 39], [39, 40], [40, 41], [36, 41]]
rightEye = [[42, 43], [43, 44], [44, 45], [45, 46], [46, 47], [42, 47]]

other = [[0, 1], [1, 2], [2, 3], [3, 4], [4, 5], [5, 6],
         [6, 7], [7, 8], [8, 9], [9, 10], [10, 11], [11, 12],
         [12, 13], [13, 14], [14, 15], [15, 16]]

faceLmarkLookup = Mouth + Nose + leftBrow + rightBrow + leftEye + rightEye + other


class faceNormalizer(object):
    # Credits: http://www.learnopencv.com/face-morph-using-opencv-cpp-python/
    w = 600
    h = 600

    def __init__(self, w=600, h=600):
        self.w = w
        self.h = h

    def similarityTransform(self, inPoints, outPoints):
        s60 = math.sin(60 * math.pi / 180)
        c60 = math.cos(60 * math.pi / 180)

        inPts = np.copy(inPoints).tolist()
        outPts = np.copy(outPoints).tolist()

        xin = c60 * (inPts[0][0] - inPts[1][0]) - s60 * (inPts[0][1] - inPts[1][1]) + inPts[1][0]
        yin = s60 * (inPts[0][0] - inPts[1][0]) + c60 * (inPts[0][1] - inPts[1][1]) + inPts[1][1]

        inPts.append([np.int(xin), np.int(yin)])

        xout = c60 * (outPts[0][0] - outPts[1][0]) - s60 * (outPts[0][1] - outPts[1][1]) + outPts[1][0]
        yout = s60 * (outPts[0][0] - outPts[1][0]) + c60 * (outPts[0][1] - outPts[1][1]) + outPts[1][1]

        outPts.append([np.int(xout), np.int(yout)])

        tform = cv2.estimateRigidTransform(np.array([inPts]), np.array([outPts]), False)

        return tform

    def tformFlmarks(self, flmark, tform):
        transformed = np.reshape(np.array(flmark), (68, 1, 2))
        transformed = cv2.transform(transformed, tform)
        transformed = np.float32(np.reshape(transformed, (68, 2)))
        return transformed

    def alignEyePoints(self, lmarkSeq):
        w = self.w
        h = self.h

        alignedSeq = copy.deepcopy(lmarkSeq)
        firstFlmark = alignedSeq[0, :, :]

        eyecornerDst = [(np.float(0.3 * w), np.float(h / 3)), (np.float(0.7 * w), np.float(h / 3))]
        eyecornerSrc = [(firstFlmark[36, 0], firstFlmark[36, 1]), (firstFlmark[45, 0], firstFlmark[45, 1])]

        tform = self.similarityTransform(eyecornerSrc, eyecornerDst)

        for i, lmark in enumerate(alignedSeq):
            alignedSeq[i] = self.tformFlmarks(lmark, tform)

        return alignedSeq

    def alignEyePointsV2(self, lmarkSeq):
        w = self.w
        h = self.h

        alignedSeq = copy.deepcopy(lmarkSeq)

        eyecornerDst = [(np.float(0.3 * w), np.float(h / 3)), (np.float(0.7 * w), np.float(h / 3))]

        for i, lmark in enumerate(alignedSeq):
            curLmark = alignedSeq[i, :, :]
            eyecornerSrc = [(curLmark[36, 0], curLmark[36, 1]), (curLmark[45, 0], curLmark[45, 1])]
            tform = self.similarityTransform(eyecornerSrc, eyecornerDst)
            alignedSeq[i, :, :] = self.tformFlmarks(lmark, tform)

        return alignedSeq

    def transferExpression(self, lmarkSeq, meanShape):
        exptransSeq = copy.deepcopy(lmarkSeq)
        firstFlmark = exptransSeq[0, :, :]
        indexes = np.array([60, 64, 62, 67])

        tformMS = cv2.estimateRigidTransform(firstFlmark[:, :], np.float32(meanShape[:, :]), True)

        sx = np.sign(tformMS[0, 0]) * np.sqrt(tformMS[0, 0] ** 2 + tformMS[0, 1] ** 2)
        sy = np.sign(tformMS[1, 0]) * np.sqrt(tformMS[1, 0] ** 2 + tformMS[1, 1] ** 2)
        # print sx, sy
        prevLmark = copy.deepcopy(firstFlmark)
        prevExpTransFlmark = copy.deepcopy(meanShape)

        zeroVecD = np.zeros((1, 68, 2))
        diff = np.cumsum(np.insert(np.diff(exptransSeq, n=1, axis=0), 0, zeroVecD, axis=0), axis=0)
        msSeq = np.tile(np.reshape(meanShape, (1, 68, 2)), [lmarkSeq.shape[0], 1, 1])

        diff[:, :, 0] = abs(sx) * diff[:, :, 0]
        diff[:, :, 1] = abs(sy) * diff[:, :, 1]

        exptransSeq = diff + msSeq

        return exptransSeq

    def unitNorm(self, flmarkSeq):
        normSeq = copy.deepcopy(flmarkSeq)
        normSeq[:, :, 0] /= self.w
        normSeq[:, :, 1] /= self.h
        return normSeq


def write_video_wpts_wsound(frames, sound, fs, path, fname, xLim, yLim):
    try:
        os.remove(os.path.join(path, fname + '.mp4'))
        os.remove(os.path.join(path, fname + '.wav'))
        os.remove(os.path.join(path, fname + '_ws.mp4'))
    except OSError:
        print('No existing output files to remove')

    if len(frames.shape) < 3:
        frames = np.reshape(frames, (frames.shape[0], frames.shape[1] // 2, 2))
        print(frames.shape)

    FFMpegWriter = manimation.writers['ffmpeg']
    metadata = dict(title='Movie Test', artist='Matplotlib',
                    comment='Movie support!')
    writer = FFMpegWriter(fps=25, metadata=metadata)

    fig = plt.figure(figsize=(10, 10))
    l, = plt.plot([], [], 'ko', ms=4)

    plt.xlim(xLim)
    plt.ylim(yLim)

    sf.write(os.path.join(path, fname + '.wav'), sound, fs)

    rect = (0, 0, 600, 600)

    if frames.shape[1] == 20:
        lookup = [[x[0] - 48, x[1] - 48] for x in Mouth]
        print(lookup)
    else:
        lookup = faceLmarkLookup

    lines = [plt.plot([], [], 'k')[0] for _ in range(3 * len(lookup))]

    with writer.saving(fig, os.path.join(path, fname + '.mp4'), 150):
        plt.gca().invert_yaxis()
        for i in tqdm(range(frames.shape[0])):
            l.set_data(frames[i, :, 0], frames[i, :, 1])
            cnt = 0
            for refpts in lookup:
                lines[cnt].set_data([frames[i, refpts[1], 0], frames[i, refpts[0], 0]],
                                    [frames[i, refpts[1], 1], frames[i, refpts[0], 1]])
                cnt += 1
            writer.grab_frame()

    cmd = 'ffmpeg -y -i ' + os.path.join(path, fname) + '.mp4 -i ' + os.path.join(path, fname) + \
          '.wav -c:v copy -c:a aac -strict experimental ' + os.path.join(path, fname) + '_ws.mp4'
    subprocess.call(cmd, shell=True)
    print('Muxing Done')

    os.remove(os.path.join(path, fname + '.mp4'))
    os.remove(os.path.join(path, fname + '.wav'))


def plot_flmarks(pts, lab, xLim, yLim, xLab, yLab, figsize=(10, 10)):
    if len(pts.shape) != 3:
        pts = np.reshape(pts, (pts.shape[0] // 2, 2))
        print(pts.shape)

    if pts.shape[0] == 20:
        lookup = [[x[0] - 48, x[1] - 48] for x in Mouth]
        print(lookup)
    else:
        lookup = faceLmarkLookup

    plt.figure(figsize=figsize)
    plt.plot(pts[:, 0], pts[:, 1], 'ko', ms=4)
    for refpts in lookup:
        plt.plot([pts[refpts[1], 0], pts[refpts[0], 0]], [pts[refpts[1], 1], pts[refpts[0], 1]], 'k', ms=4)

    plt.xlabel(xLab, fontsize=font['size'] + 4, fontweight='bold')
    plt.gca().xaxis.tick_top()
    plt.gca().xaxis.set_label_position('top')
    plt.ylabel(yLab, fontsize=font['size'] + 4, fontweight='bold')
    plt.xlim(xLim)
    plt.ylim(yLim)
    plt.gca().invert_yaxis()

    plt.savefig(lab, dpi=300, bbox_inches='tight')
    plt.clf()
    plt.close()


def melSpectra(y, sr, wsize, hsize):  # sound: (132300,), sr=48000, wsize=0.04, hsize=0.04
    cnst = 1 + (int(sr * wsize) / 2)  # 883
    y_stft_abs = np.abs(librosa.stft(y,                           # 132300
                                     win_length=int(sr * wsize),  # 1764
                                     hop_length=int(sr * hsize),  # 1764
                                     n_fft=int(sr * wsize))) / cnst  # 883
    print("y_stft_abs", np.shape(y_stft_abs))

    melspec = np.log(1e-16 + librosa.feature.melspectrogram(sr=sr,
                                                            S=y_stft_abs ** 2,
                                                            n_mels=64))
    return melspec


def main():
    return


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
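Finally, a small usage sketch for `utils.melSpectra`, the only utils function the feature extractor calls (illustrative only; the synthetic tone and the exact frame count are assumptions). With wsize = hsize = 0.04 s the hop equals one 25 fps frame, so a 3-second clip yields roughly 76 columns of 64 log-mel bands:

```
import numpy as np
import utils

fs = 48000
t = np.linspace(0, 3.0, 3 * fs, endpoint=False)
tone = 0.1 * np.sin(2 * np.pi * 220.0 * t).astype(np.float32)

mel = utils.melSpectra(tone, fs, wsize=0.04, hsize=0.04)
print(mel.shape)   # roughly (64, 76); audio_feature_extractor.py transposes this to (frames, 64)
```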