├── README.md
├── audio_feature_extractor.py
├── data
│   ├── test_viseme_13.npy
│   └── train_viseme_13.npy
├── model
│   └── audio2pho_model_mfa13label_ep300_1e-4_32.h5
├── requirements.txt
├── train.py
└── utils.py

/README.md:
--------------------------------------------------------------------------------
# audio2viseme

This code generates visemes from audio features.

## Installation

The code uses Python 3.8.2 and was tested with TensorFlow 2.4.1.

Install ffmpeg.

Clone the repository:
```
$ git clone https://github.com/liukuangxiangzi/audio2viseme.git
```
The requirements (including tensorflow) can be installed using:
```
pip install -r requirements.txt
```
## Data
#### Data used to train

audio2viseme is trained on the first speaker of the GRID corpus: 47.2 minutes of audio, frame-aligned at 25 fps, with synchronized transcriptions. The corpus can be downloaded at [The GRID](http://spandh.dcs.shef.ac.uk/gridcorpus/).


## Audio Feature Extraction

You can run audio_feature_extractor.py to extract audio features from audio files. The arguments are as follows:

* -i --- Input folder containing audio files (if your audio files are not .wav, please modify the script accordingly)
* -d --- Delay in terms of frames, where one frame is 40 ms
* -c --- Number of context frames
* -o --- Output folder path to save the audio features

Usage:

```
python audio_feature_extractor.py -i path-to-audio-files/ -d number-of-delay-frames -c number-of-context-frames -o output-folder-to-save-audio-feature-file
```

## Training

The training code has the following arguments:

* -x --- Input audio feature file for training
* -X --- Input audio feature file for testing
* -y --- Input viseme file for training
* -Y --- Input viseme file for testing
* -o --- Output path to save the model file

Usage:

```
python train.py -x path-to-training-audio-feature-file -X path-to-testing-audio-feature-file -y path-to-training-viseme-file -Y path-to-testing-viseme-file -o path-to-save-model-file
```
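## Inference (illustrative sketch)

The repository does not ship an inference script. A minimal sketch of how the extracted features and the pretrained model in `model/` could be combined is shown below; the feature path follows the example command in audio_feature_extractor.py, and taking the prediction at the last timestep of each 75-frame window as the viseme for that frame is an assumption, not something prescribed by the original code.

```
# inference sketch -- illustrative only, not part of the original scripts
import numpy as np
from keras.models import load_model

features = np.load('data/result/Xtext.npy')   # saved by audio_feature_extractor.py, shape (num_frames, 75, 768)
model = load_model('model/audio2pho_model_mfa13label_ep300_1e-4_32.h5')

probs = model.predict(features)               # (num_frames, 75, 13) softmax over viseme classes
visemes = probs[:, -1, :].argmax(axis=-1)     # assumed: last timestep of each window gives the viseme id for that frame
print(visemes[:50])
```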
--------------------------------------------------------------------------------
/audio_feature_extractor.py:
--------------------------------------------------------------------------------
# Usage: python audio_feature_extractor.py -i audio/data/path/ -d number-of-delay-frames -c number-of-context-frames -o output/data/path
# e.g.:  python audio_feature_extractor.py -i data/test_audio/ -d 1 -c 5 -o data/result/

import librosa
import numpy as np
import matplotlib as mpl
mpl.use('Agg')
import os, shutil
from tqdm import tqdm
import utils
import argparse


def addContext(melSpc, ctxWin):
    # Stack each frame with the ctxWin preceding frames along the feature axis.
    ctx = melSpc[:, :]
    filler = melSpc[0, :]
    for i in range(ctxWin):
        melSpc = np.insert(melSpc, 0, filler, axis=0)[:ctx.shape[0], :]
        ctx = np.append(ctx, melSpc, axis=1)
    return ctx


parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("-i", "--in-file", type=str, help="input folder containing speech files")
parser.add_argument("-d", "--delay", type=int, help="delay in terms of number of frames, where each frame is 40 ms")
parser.add_argument("-c", "--ctx", type=int, help="context window size")
parser.add_argument("-o", "--out_fold", type=str, help="output folder")
args = parser.parse_args()

output_path = args.out_fold
num_features_Y = 136
num_frames = 75
wsize = 0.04
hsize = wsize
fs = 48000  # 44100
trainDelay = args.delay
ctxWin = args.ctx

if not os.path.exists(output_path):
    os.makedirs(output_path)
else:
    shutil.rmtree(output_path)
    os.mkdir(output_path)


audio_folder_path = args.in_file
generated_all = []
cur_features_to_save = []

for root, dirnames, filenames in os.walk(audio_folder_path):
    filenames.sort()
    if os.path.exists(os.path.join(root, '.DS_Store')) is True:
        filenames.remove('.DS_Store')
    for filename in filenames:
        # Used for padding zeros to first and second temporal differences
        zeroVecD = np.zeros((1, 64), dtype='f16')
        zeroVecDD = np.zeros((2, 64), dtype='f16')

        # Load speech and extract features
        sound, sr = librosa.load(os.path.join(audio_folder_path, filename), sr=fs)  # (132300,) sr=50000
        print("sound,sr", np.shape(sound), sr)

        melFrames = np.transpose(utils.melSpectra(sound, sr, wsize, hsize))
        print("melFrames", np.shape(melFrames))
        melDelta = np.insert(np.diff(melFrames, n=1, axis=0), 0, zeroVecD, axis=0)
        print("melDelta", np.shape(melDelta))
        melDDelta = np.insert(np.diff(melFrames, n=2, axis=0), 0, zeroVecDD, axis=0)
        print("melDDelta", np.shape(melDDelta))

        features = np.concatenate((melDelta, melDDelta), axis=1)  # 76,128
        print("features1", np.shape(features))
        features = addContext(features, ctxWin)  # 76,768
        print("features2 context5", np.shape(features))
        features = np.reshape(features, (1, features.shape[0], features.shape[1]))  # 1,76,768
        print("features3", np.shape(features))

        upper_limit = features.shape[1]
        print("upper_limit", upper_limit)
        lower = 0
        generated = np.zeros((0, num_features_Y))

        # Builds the sliding 75-frame input window for each frame one-by-one.
        # This part can be modified to process the whole sequence at once, but may introduce discontinuities.
        for i in tqdm(range(upper_limit)):
            cur_features = np.zeros((1, num_frames, features.shape[2]))
            if i + 1 > 75:
                lower = i + 1 - 75
            cur_features[:, -i-1:, :] = features[:, lower:i+1, :]
            cur_features.resize(np.shape(cur_features)[1], np.shape(cur_features)[2])
            print(np.shape(cur_features))
            cur_features_to_save.append(cur_features)

np.save(output_path + '/Xtext.npy', cur_features_to_save)
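A note on `addContext` (an illustration, not part of the repository): each output row stacks the current frame with the `ctxWin` preceding frames along the feature axis, padding with the first frame at the start, so an input of shape (T, F) becomes (T, F * (ctxWin + 1)). With the 128 delta/delta-delta features and the example `-c 5`, that gives the 768-dimensional vectors noted in the comments above.

```
import numpy as np

def addContext(melSpc, ctxWin):
    ctx = melSpc[:, :]
    filler = melSpc[0, :]
    for i in range(ctxWin):
        melSpc = np.insert(melSpc, 0, filler, axis=0)[:ctx.shape[0], :]
        ctx = np.append(ctx, melSpc, axis=1)
    return ctx

x = np.arange(8).reshape(4, 2)   # 4 frames, 2 features each
y = addContext(x, ctxWin=2)
print(y.shape)                    # (4, 6)
print(y[2])                       # [4 5 2 3 0 1] -> frame 2, frame 1, frame 0 side by side
```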
--------------------------------------------------------------------------------
/data/test_viseme_13.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/liukuangxiangzi/audio2viseme/24dcf48a6f8e24a0c332d797f5b1cb22cd3137e6/data/test_viseme_13.npy
--------------------------------------------------------------------------------
/data/train_viseme_13.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/liukuangxiangzi/audio2viseme/24dcf48a6f8e24a0c332d797f5b1cb22cd3137e6/data/train_viseme_13.npy
--------------------------------------------------------------------------------
/model/audio2pho_model_mfa13label_ep300_1e-4_32.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/liukuangxiangzi/audio2viseme/24dcf48a6f8e24a0c332d797f5b1cb22cd3137e6/model/audio2pho_model_mfa13label_ep300_1e-4_32.h5
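Before training, the bundled viseme label files can be sanity-checked with a short snippet (illustrative only; the expected counts follow the hard-coded reshapes in train.py below, and the exact stored shape of the .npy files is an assumption):

```
import numpy as np

train_y = np.load('data/train_viseme_13.npy')
test_y = np.load('data/test_viseme_13.npy')

print(train_y.shape, test_y.shape)   # expected to hold 753*75 and 191*75 frame labels
print(np.unique(train_y))            # expected: integer viseme ids 0..12 (13 classes)
```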
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
tensorflow == 2.4.1
Keras == 2.4.1
Librosa == 0.8.0
matplotlib == 3.3.4
numpy == 1.19.2
tqdm == 4.59.0
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
# preprocess Y

import os
from datetime import datetime
import numpy as np
import argparse
from keras.layers import Input, LSTM
from keras.models import Model
from keras.utils import to_categorical
from keras.callbacks import TensorBoard
from keras.layers import Dense
from keras.layers import Dropout
from keras.optimizers import *
from keras.utils import plot_model


# Usage: python train.py -x path-of-audio-feature-file-for-training -X path-of-audio-feature-file-for-testing -y path-of-viseme-file-for-training -Y path-of-viseme-file-for-testing -o path-of-saving-model
# e.g.:  python train.py -x data/audio_feature_train.npy -X data/audio_feature_test.npy -y data/train_viseme_13.npy -Y data/test_viseme_13.npy -o model/audio2pho_model_mfa13label_ep300_1e-4_32.h5


parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("-x", "--train_x", type=str, help="input audio feature file for training")
parser.add_argument("-X", "--test_X", type=str, help="input audio feature file for testing")
parser.add_argument("-y", "--train_y", type=str, help="input viseme file for training")
parser.add_argument("-Y", "--test_Y", type=str, help="input viseme file for testing")
parser.add_argument("-o", "--out_fold", type=str, help="path to save the model file")
args = parser.parse_args()


def labelcategorical():
    label_train = np.array(np.load(args.train_y))  # train_viseme_label14H.npy
    label_test = np.array(np.load(args.test_Y))

    # 753 training and 191 test sequences of 75 frames each
    label_train = label_train.reshape(753, 75, -1)
    label_test = label_test.reshape(191, 75, -1)

    # one hot encode y
    label_categorical_train = to_categorical(label_train)
    label_categorical_test = to_categorical(label_test)
    return label_categorical_train, label_categorical_test


# load the dataset, returns train and test X and y elements
def load_dataset(prefix=''):
    # load all x
    trainX = np.load(args.train_x)  # trainX.shape = (753, 75, 768)
    print('trainX.shape', trainX.shape)
    testX = np.load(args.test_X)  # testX.shape = (191, 75, 768)
    print('testX.shape', testX.shape)
    # load all y
    trainy, testy = labelcategorical()
    print(trainX.shape, trainy.shape, testX.shape, testy.shape)
    return trainX, trainy, testX, testy


# set up tensorboard
# tensorboard --logdir /Users/liukuangxiangzi/PycharmProjects/audio2viseme/logs/ --host=127.0.0.1
log_dir = os.path.join(
    "logs",
    "fit",
    datetime.now().strftime("%Y%m%d-%H%M%S"),
)
tbCallBack = TensorBoard(log_dir=log_dir, histogram_freq=0, write_graph=True, write_images=True)

# X y data
trainX, trainy, testX, testy = load_dataset(prefix='')

verbose, epochs, batch_size = 0, 300, 32
h_dim = 256
n_timesteps, n_features = trainX.shape[1], trainX.shape[2]
print(n_timesteps, n_features)

drpRate = 0.2
recDrpRate = 0.2
lr = 1e-4
initializer = 'glorot_uniform'

# define model
net_in = Input(shape=(n_timesteps, n_features))
lstm1 = LSTM(h_dim,
             activation='sigmoid',
             dropout=drpRate,
             recurrent_dropout=recDrpRate,
             return_sequences=True)(net_in)
lstm2 = LSTM(h_dim,
             activation='sigmoid',
             dropout=drpRate,
             recurrent_dropout=recDrpRate,
             return_sequences=True)(lstm1)
lstm3 = LSTM(h_dim,
             activation='sigmoid',
             dropout=drpRate,
             recurrent_dropout=recDrpRate,
             return_sequences=True)(lstm2)

dropout = Dropout(0.5)(lstm3)

l1 = Dense(128,
           kernel_initializer=initializer,
           name='lm_Dense1', activation='relu')(dropout)

out = Dense(13,
            kernel_initializer=initializer, name='lm_out', activation='softmax')(l1)

model = Model(inputs=net_in, outputs=out)
model.summary()
opt = Adam(learning_rate=lr)
# opt = SGD(learning_rate=lr)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['acc'])


plot_model(model, to_file='audio2viseme_model.png', show_shapes=True, show_layer_names=True)

model.fit(trainX, trainy,
          validation_data=(testX, testy),
          epochs=epochs,
          batch_size=batch_size,
          callbacks=[tbCallBack]
          )

model.save(args.out_fold)
print("Saved model to disk")
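`labelcategorical()` above hard-codes the 753/191 sequence counts of the GRID subject-1 split. If you retrain on a different split, one possible adjustment (an untested sketch, not part of train.py) is to derive those counts from the loaded feature arrays instead:

```
# sketch: derive sequence counts from the feature file rather than hard-coding them
# assumes the label file stores one viseme id per frame, in the same order as the features
import numpy as np

trainX = np.load('data/audio_feature_train.npy')   # (n_seqs, 75, 768)
label_train = np.load('data/train_viseme_13.npy')

n_seqs, n_frames = trainX.shape[0], trainX.shape[1]
label_train = label_train.reshape(n_seqs, n_frames, -1)
print(label_train.shape)
```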
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
import matplotlib as mpl

mpl.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.animation as manimation
import matplotlib.lines as mlines
from matplotlib import transforms
import argparse, os, fnmatch, shutil
import numpy as np
import cv2
import math
import copy
import librosa
import soundfile as sf  # used for writing wavs (librosa.output was removed in librosa 0.8)
# import dlib
import subprocess
from keras import backend as K
from tqdm import tqdm

font = {'size': 18}
mpl.rc('font', **font)

# Lookup tables for drawing lines between points
Mouth = [[48, 49], [49, 50], [50, 51], [51, 52], [52, 53], [53, 54], [54, 55], [55, 56], [56, 57],
         [57, 58], [58, 59], [59, 48], [60, 61], [61, 62], [62, 63], [63, 64], [64, 65], [65, 66],
         [66, 67], [67, 60]]

Nose = [[27, 28], [28, 29], [29, 30], [30, 31], [30, 35], [31, 32], [32, 33],
        [33, 34], [34, 35], [27, 31], [27, 35]]

leftBrow = [[17, 18], [18, 19], [19, 20], [20, 21]]
rightBrow = [[22, 23], [23, 24], [24, 25], [25, 26]]

leftEye = [[36, 37], [37, 38], [38, 39], [39, 40], [40, 41], [36, 41]]
rightEye = [[42, 43], [43, 44], [44, 45], [45, 46], [46, 47], [42, 47]]

other = [[0, 1], [1, 2], [2, 3], [3, 4], [4, 5], [5, 6],
         [6, 7], [7, 8], [8, 9], [9, 10], [10, 11], [11, 12],
         [12, 13], [13, 14], [14, 15], [15, 16]]

faceLmarkLookup = Mouth + Nose + leftBrow + rightBrow + leftEye + rightEye + other


class faceNormalizer(object):
    # Credits: http://www.learnopencv.com/face-morph-using-opencv-cpp-python/
    w = 600
    h = 600

    def __init__(self, w=600, h=600):
        self.w = w
        self.h = h

    def similarityTransform(self, inPoints, outPoints):
        s60 = math.sin(60 * math.pi / 180)
        c60 = math.cos(60 * math.pi / 180)

        inPts = np.copy(inPoints).tolist()
        outPts = np.copy(outPoints).tolist()

        xin = c60 * (inPts[0][0] - inPts[1][0]) - s60 * (inPts[0][1] - inPts[1][1]) + inPts[1][0]
        yin = s60 * (inPts[0][0] - inPts[1][0]) + c60 * (inPts[0][1] - inPts[1][1]) + inPts[1][1]

        inPts.append([np.int(xin), np.int(yin)])

        xout = c60 * (outPts[0][0] - outPts[1][0]) - s60 * (outPts[0][1] - outPts[1][1]) + outPts[1][0]
        yout = s60 * (outPts[0][0] - outPts[1][0]) + c60 * (outPts[0][1] - outPts[1][1]) + outPts[1][1]

        outPts.append([np.int(xout), np.int(yout)])

        tform = cv2.estimateRigidTransform(np.array([inPts]), np.array([outPts]), False)

        return tform

    def tformFlmarks(self, flmark, tform):
        transformed = np.reshape(np.array(flmark), (68, 1, 2))
        transformed = cv2.transform(transformed, tform)
        transformed = np.float32(np.reshape(transformed, (68, 2)))
        return transformed

    def alignEyePoints(self, lmarkSeq):
        w = self.w
        h = self.h

        alignedSeq = copy.deepcopy(lmarkSeq)
        firstFlmark = alignedSeq[0, :, :]

        eyecornerDst = [(np.float(0.3 * w), np.float(h / 3)), (np.float(0.7 * w), np.float(h / 3))]
        eyecornerSrc = [(firstFlmark[36, 0], firstFlmark[36, 1]), (firstFlmark[45, 0], firstFlmark[45, 1])]

        tform = self.similarityTransform(eyecornerSrc, eyecornerDst)

        for i, lmark in enumerate(alignedSeq):
            alignedSeq[i] = self.tformFlmarks(lmark, tform)

        return alignedSeq

    def alignEyePointsV2(self, lmarkSeq):
        w = self.w
        h = self.h

        alignedSeq = copy.deepcopy(lmarkSeq)

        eyecornerDst = [(np.float(0.3 * w), np.float(h / 3)), (np.float(0.7 * w), np.float(h / 3))]

        for i, lmark in enumerate(alignedSeq):
            curLmark = alignedSeq[i, :, :]
            eyecornerSrc = [(curLmark[36, 0], curLmark[36, 1]), (curLmark[45, 0], curLmark[45, 1])]
            tform = self.similarityTransform(eyecornerSrc, eyecornerDst)
            alignedSeq[i, :, :] = self.tformFlmarks(lmark, tform)

        return alignedSeq

    def transferExpression(self, lmarkSeq, meanShape):
        exptransSeq = copy.deepcopy(lmarkSeq)
        firstFlmark = exptransSeq[0, :, :]
        indexes = np.array([60, 64, 62, 67])

        tformMS = cv2.estimateRigidTransform(firstFlmark[:, :], np.float32(meanShape[:, :]), True)

        sx = np.sign(tformMS[0, 0]) * np.sqrt(tformMS[0, 0] ** 2 + tformMS[0, 1] ** 2)
        sy = np.sign(tformMS[1, 0]) * np.sqrt(tformMS[1, 0] ** 2 + tformMS[1, 1] ** 2)
        # print sx, sy
        prevLmark = copy.deepcopy(firstFlmark)
        prevExpTransFlmark = copy.deepcopy(meanShape)

        zeroVecD = np.zeros((1, 68, 2))
        diff = np.cumsum(np.insert(np.diff(exptransSeq, n=1, axis=0), 0, zeroVecD, axis=0), axis=0)
        msSeq = np.tile(np.reshape(meanShape, (1, 68, 2)), [lmarkSeq.shape[0], 1, 1])

        diff[:, :, 0] = abs(sx) * diff[:, :, 0]
        diff[:, :, 1] = abs(sy) * diff[:, :, 1]

        exptransSeq = diff + msSeq

        return exptransSeq

    def unitNorm(self, flmarkSeq):
        normSeq = copy.deepcopy(flmarkSeq)
        normSeq[:, :, 0] /= self.w
        normSeq[:, :, 1] /= self.h
        return normSeq


def write_video_wpts_wsound(frames, sound, fs, path, fname, xLim, yLim):
    try:
        os.remove(os.path.join(path, fname + '.mp4'))
        os.remove(os.path.join(path, fname + '.wav'))
        os.remove(os.path.join(path, fname + '_ws.mp4'))
    except OSError:
        print('No existing output files to remove')

    if len(frames.shape) < 3:
        frames = np.reshape(frames, (frames.shape[0], frames.shape[1] // 2, 2))
        print(frames.shape)

    FFMpegWriter = manimation.writers['ffmpeg']
    metadata = dict(title='Movie Test', artist='Matplotlib',
                    comment='Movie support!')
    writer = FFMpegWriter(fps=25, metadata=metadata)

    fig = plt.figure(figsize=(10, 10))
    l, = plt.plot([], [], 'ko', ms=4)

    plt.xlim(xLim)
    plt.ylim(yLim)

    sf.write(os.path.join(path, fname + '.wav'), sound, fs)

    rect = (0, 0, 600, 600)

    if frames.shape[1] == 20:
        lookup = [[x[0] - 48, x[1] - 48] for x in Mouth]
        print(lookup)
    else:
        lookup = faceLmarkLookup

    lines = [plt.plot([], [], 'k')[0] for _ in range(3 * len(lookup))]

    with writer.saving(fig, os.path.join(path, fname + '.mp4'), 150):
        plt.gca().invert_yaxis()
        for i in tqdm(range(frames.shape[0])):
            l.set_data(frames[i, :, 0], frames[i, :, 1])
            cnt = 0
            for refpts in lookup:
                lines[cnt].set_data([frames[i, refpts[1], 0], frames[i, refpts[0], 0]],
                                    [frames[i, refpts[1], 1], frames[i, refpts[0], 1]])
                cnt += 1
            writer.grab_frame()

    cmd = 'ffmpeg -y -i ' + os.path.join(path, fname) + '.mp4 -i ' + os.path.join(path, fname) + \
          '.wav -c:v copy -c:a aac -strict experimental ' + os.path.join(path, fname) + '_ws.mp4'
    subprocess.call(cmd, shell=True)
    print('Muxing Done')

    os.remove(os.path.join(path, fname + '.mp4'))
    os.remove(os.path.join(path, fname + '.wav'))


def plot_flmarks(pts, lab, xLim, yLim, xLab, yLab, figsize=(10, 10)):
    if len(pts.shape) != 3:
        pts = np.reshape(pts, (pts.shape[0] // 2, 2))
        print(pts.shape)

    if pts.shape[0] == 20:
        lookup = [[x[0] - 48, x[1] - 48] for x in Mouth]
        print(lookup)
    else:
        lookup = faceLmarkLookup

    plt.figure(figsize=figsize)
    plt.plot(pts[:, 0], pts[:, 1], 'ko', ms=4)
    for refpts in lookup:
        plt.plot([pts[refpts[1], 0], pts[refpts[0], 0]], [pts[refpts[1], 1], pts[refpts[0], 1]], 'k', ms=4)

    plt.xlabel(xLab, fontsize=font['size'] + 4, fontweight='bold')
    plt.gca().xaxis.tick_top()
    plt.gca().xaxis.set_label_position('top')
    plt.ylabel(yLab, fontsize=font['size'] + 4, fontweight='bold')
    plt.xlim(xLim)
    plt.ylim(yLim)
    plt.gca().invert_yaxis()

    plt.savefig(lab, dpi=300, bbox_inches='tight')
    plt.clf()
    plt.close()


def melSpectra(y, sr, wsize, hsize):  # sound: (132300,), sr=48000, wsize=0.04, hsize=0.04
    cnst = 1 + (int(sr * wsize) / 2)  # 883
    y_stft_abs = np.abs(librosa.stft(y,                           # 132300
                                     win_length=int(sr * wsize),  # 1764
                                     hop_length=int(sr * hsize),  # 1764
                                     n_fft=int(sr * wsize))) / cnst  # 883
    print("y_stft_abs", np.shape(y_stft_abs))

    melspec = np.log(1e-16 + librosa.feature.melspectrogram(sr=sr,
                                                            S=y_stft_abs ** 2,
                                                            n_mels=64))
    return melspec


def main():
    return


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
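Finally, a small usage sketch for `utils.melSpectra`, the only utils function the feature extractor calls (illustrative only; the synthetic tone and the exact frame count are assumptions). With wsize = hsize = 0.04 s the hop equals one 25 fps frame, so a 3-second clip yields roughly 76 columns of 64 log-mel bands:

```
import numpy as np
import utils

fs = 48000
t = np.linspace(0, 3.0, 3 * fs, endpoint=False)
tone = 0.1 * np.sin(2 * np.pi * 220.0 * t).astype(np.float32)

mel = utils.melSpectra(tone, fs, wsize=0.04, hsize=0.04)
print(mel.shape)   # roughly (64, 76); audio_feature_extractor.py transposes this to (frames, 64)
```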