├── cache └── readme.md ├── data └── readme.md ├── model └── readme.me ├── README.md ├── train.py ├── test.py ├── model.py └── utils.py /cache/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /data/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /model/readme.me: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Speech-to-Text-WaveNet : End-to-end sentence level Chinese speech recognition using DeepMind's WaveNet 2 | = 3 | A tensorflow implementation for Chinese speech recognition based on DeepMind's WaveNet: A Generative Model for Raw Audio. ([Hereafter the Paper]( https://arxiv.org/abs/1609.03499)) 4 | 5 | Version 6 | --- 7 | Current Version : 0.0.1 8 | 9 | Dependencies 10 | --- 11 | 1. python == 3.5 12 | 2. tensorflow == 1.0.0 13 | 3. librosa == 0.5.0 14 | 15 | Dataset 16 | --- 17 | [清华30小时中文数据集](http://data.cslt.org/thchs30/standalone.html) 18 | 19 | Directories 20 | --- 21 | 1. cache: save data featrue and word dictionary 22 | 2. data: wav files and related labels 23 | 3. model: save the models 24 | 25 | Network model 26 | --- 27 | 1. Data random shuffle per epoch 28 | 2. Xavier initialization 29 | 3. Adam optimization algorithms 30 | 4. Batch Normalization 31 | 32 | Train the network 33 | --- 34 | python3 train.py 35 | 36 | Test the network 37 | --- 38 | python3 test.py 39 | 40 | Other resources 41 | --- 42 | 1. [TensorFlow练习15: 中文语音识别](http://blog.topspeedsnail.com/archives/10696#more-10696) 43 | 2. [ibab's WaveNet(speech synthesis) tensorflow implementationt](https://github.com/ibab/tensorflow-wavenet) 44 | 3. [buriburisuri's WaveNet(English speech recognition) tensorflow and sugartensor implementationt](https://github.com/buriburisuri/speech-to-text-wavenet#version) 45 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | 3 | from __future__ import print_function 4 | from utils import SpeechLoader 5 | from model import Model 6 | import tensorflow as tf #1.0.0 7 | import time 8 | import os 9 | 10 | def train(): 11 | # setting parameters 12 | batch_size = 32 13 | n_epoch = 100 14 | n_mfcc = 60 15 | 16 | # load speech data 17 | wav_path = os.path.join(os.getcwd(),'data','wav','train') 18 | label_file = os.path.join(os.getcwd(),'data','doc','trans','train.word.txt') 19 | speech_loader = SpeechLoader(wav_path, label_file, batch_size, n_mfcc) 20 | n_out = speech_loader.vocab_size 21 | 22 | # load model 23 | model = Model(n_out, batch_size=batch_size, n_mfcc=n_mfcc) 24 | 25 | with tf.Session() as sess: 26 | sess.run(tf.global_variables_initializer()) 27 | 28 | saver = tf.train.Saver(tf.global_variables()) 29 | 30 | for epoch in range(n_epoch): 31 | speech_loader.create_batches() # random shuffle data 32 | speech_loader.reset_batch_pointer() 33 | for batch in range(speech_loader.n_batches): 34 | start = time.time() 35 | batches_wav, batches_label = speech_loader.next_batch() 36 | feed = {model.input_data: batches_wav, model.targets: batches_label} 37 | train_loss, _ = sess.run([model.cost, model.optimizer_op], feed_dict=feed) 38 | end = time.time() 39 | print("epoch: %d/%d, batch: %d/%d, loss: %s, time: %.3f."%(epoch, n_epoch, batch, speech_loader.n_batches, train_loss, end-start)) 40 | 41 | # save models 42 | if epoch % 5 ==0: 43 | saver.save(sess, os.path.join(os.getcwd(), 'model','speech.module'), global_step=epoch) 44 | 45 | 46 | if __name__ == '__main__': 47 | train() -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | 3 | from __future__ import print_function 4 | from model import Model 5 | from utils import SpeechLoader 6 | 7 | import tensorflow as tf # 1.0.0 8 | import numpy as np 9 | import librosa 10 | import os 11 | 12 | # 语音识别 13 | # 把batch_size改为1 14 | def speech_to_text(): 15 | n_mfcc = 60 16 | 17 | # load data 18 | speech_loader = SpeechLoader() 19 | 20 | # load model 21 | model = Model(speech_loader.vocab_size, n_mfcc=n_mfcc, is_training=False) 22 | 23 | saver = tf.train.Saver() 24 | 25 | with tf.Session() as sess: 26 | for j in range(750,755): 27 | # extract feature 28 | wav_file = os.path.join(os.getcwd(),'data','wav','test','D4','D4_'+str(j)+'.wav') 29 | wav, sr = librosa.load(wav_file, mono=True) 30 | mfcc = np.transpose(np.expand_dims(librosa.feature.mfcc(wav, sr, n_mfcc=n_mfcc), axis=0), [0,2,1]) 31 | mfcc = mfcc.tolist() 32 | 33 | # fill 0 34 | while len(mfcc[0]) < speech_loader.wav_max_len: 35 | mfcc[0].append([0] * n_mfcc) 36 | 37 | # word dict 38 | wmap = {value:key for key, value in speech_loader.wordmap.items()} 39 | 40 | # recognition 41 | saver.restore(sess, tf.train.latest_checkpoint('model')) 42 | decoded = tf.transpose(model.logit, perm=[1, 0, 2]) 43 | decoded, probs = tf.nn.ctc_beam_search_decoder(decoded, model.seq_len, top_paths=1, merge_repeated=True) 44 | predict = tf.sparse_to_dense(decoded[0].indices, decoded[0].dense_shape, decoded[0].values) + 1 45 | output, probs = sess.run([predict, probs], feed_dict={model.input_data: mfcc}) 46 | 47 | # print result 48 | words = '' 49 | for i in range(len(output[0])): 50 | words += wmap.get(output[0][i], -1) 51 | 52 | print("---------------------------") 53 | print("Input: " + wav_file) 54 | print("Output: " + words) 55 | 56 | 57 | if __name__ == '__main__': 58 | speech_to_text() 59 | 60 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | __author__ = 'Deeper' 3 | import tensorflow as tf # 1.0.0 4 | import numpy as np 5 | 6 | class Model(): 7 | def __init__(self, n_out, batch_size=1, n_mfcc=20, is_training=True): 8 | n_dim = 128 9 | self.is_training = is_training 10 | 11 | self.input_data = tf.placeholder(dtype=tf.float32, shape=[batch_size, None, n_mfcc]) 12 | self.seq_len = tf.reduce_sum(tf.cast(tf.not_equal(tf.reduce_sum(self.input_data, reduction_indices=2), 0.), tf.int32), reduction_indices=1) 13 | self.targets = tf.placeholder(dtype=tf.int32, shape=[batch_size, None]) 14 | 15 | # 1D convolution 16 | self.conv1d_index = 0 17 | out = self.conv1d_layer(self.input_data, dim=n_dim) 18 | 19 | # stack hole CNN 20 | n_blocks = 3 21 | skip = 0 22 | self.aconv1d_index = 0 23 | for _ in range(n_blocks): 24 | for r in [1, 2, 4, 8, 16]: 25 | out, s = self.residual_block(out, size=7, rate=r, dim=n_dim) 26 | skip += s 27 | 28 | logit = self.conv1d_layer(skip, dim=skip.get_shape().as_list()[-1]) 29 | self.logit = self.conv1d_layer(logit, dim=n_out, bias=True, activation=None) 30 | 31 | # CTC loss 32 | indices = tf.where(tf.not_equal(tf.cast(self.targets, tf.float32), 0.)) 33 | target = tf.SparseTensor(indices=indices, values=tf.gather_nd(self.targets, indices)-1, dense_shape=tf.cast(tf.shape(self.targets), tf.int64)) 34 | loss = tf.nn.ctc_loss(target, self.logit, self.seq_len, time_major=False) 35 | self.cost = tf.reduce_mean(loss) 36 | 37 | # optimizer 38 | optimizer = tf.train.AdamOptimizer() 39 | var_list = [var for var in tf.trainable_variables()] 40 | gradient = optimizer.compute_gradients(self.cost, var_list=var_list) 41 | self.optimizer_op = optimizer.apply_gradients(gradient) 42 | 43 | def residual_block(self, input_tensor, size, rate, dim): 44 | conv_filter = self.aconv1d_layer(input_tensor, size=size, rate=rate, activation='tanh') 45 | conv_gate = self.aconv1d_layer(input_tensor, size=size, rate=rate, activation='sigmoid') 46 | out = conv_filter * conv_gate 47 | out = self.conv1d_layer(out, size=1, dim=dim) 48 | return out + input_tensor, out 49 | 50 | def conv1d_layer(self, input_tensor, size=1, dim=128, bias=False, activation='tanh'): 51 | with tf.variable_scope('conv1d'+str(self.conv1d_index)): 52 | shape = input_tensor.get_shape().as_list() 53 | kernel = tf.get_variable('kernel', (size, shape[-1], dim), dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer()) 54 | if bias: 55 | b = tf.get_variable('b', [dim], dtype=tf.float32, initializer=tf.constant_initializer(0)) 56 | out = tf.nn.conv1d(input_tensor, kernel, stride=1, padding='SAME') + (b if bias else 0) 57 | if not bias: 58 | out = self.batch_norm_wrapper(out) 59 | 60 | out = self.activation_wrapper(out, activation) 61 | 62 | self.conv1d_index += 1 63 | return out 64 | 65 | def aconv1d_layer(self, input_tensor, size=7, rate=2, bias=False, activation='tanh'): 66 | with tf.variable_scope('aconv1d_'+str(self.aconv1d_index)): 67 | shape = input_tensor.get_shape().as_list() 68 | kernel = tf.get_variable('kernel',(1, size, shape[-1], shape[-1]), dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer()) 69 | if bias: 70 | b = tf.get_variable('b', [shape[-1]], dtype=tf.float32, initializer=tf.constant_initializer(0)) 71 | out = tf.nn.atrous_conv2d(tf.expand_dims(input_tensor, dim=1), kernel, rate=rate, padding='SAME') 72 | out = tf.squeeze(out, [1]) 73 | if not bias: 74 | out = self.batch_norm_wrapper(out) 75 | 76 | out = self.activation_wrapper(out, activation) 77 | 78 | self.aconv1d_index += 1 79 | return out 80 | 81 | def batch_norm_wrapper(self, inputs, decay=0.999): 82 | epsilon = 1e-3 83 | shape = inputs.get_shape().as_list() 84 | 85 | beta = tf.get_variable('beta', shape[-1], dtype=tf.float32, initializer=tf.constant_initializer(0)) 86 | gamma = tf.get_variable('gamma', shape[-1], dtype=tf.float32, initializer=tf.constant_initializer(1)) 87 | pop_mean = tf.get_variable('mean', shape[-1], dtype=tf.float32, initializer=tf.constant_initializer(0)) 88 | pop_var = tf.get_variable('variance', shape[-1], dtype=tf.float32, initializer=tf.constant_initializer(1)) 89 | if self.is_training: 90 | batch_mean, batch_var = tf.nn.moments(inputs, axes=list(range(len(shape)-1))) 91 | train_mean = tf.assign(pop_mean, pop_mean*decay+batch_mean*(1-decay)) 92 | train_var =tf.assign(pop_var, pop_var*decay+batch_var*(1-decay)) 93 | with tf.control_dependencies([train_mean, train_var]): 94 | return tf.nn.batch_normalization(inputs, batch_mean, batch_var, beta, gamma, epsilon) 95 | else: 96 | return tf.nn.batch_normalization(inputs, pop_mean, pop_var, beta, gamma, epsilon) 97 | 98 | def activation_wrapper(self, inputs, activation): 99 | out = inputs 100 | 101 | if activation == 'sigmoid': 102 | out = tf.nn.sigmoid(out) 103 | elif activation == 'tanh': 104 | out = tf.nn.tanh(out) 105 | elif activation == 'relu': 106 | out = tf.nn.relu(out) 107 | 108 | return out -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | __author__ = 'Deeper' 3 | import tensorflow as tf 4 | import numpy as np 5 | import os 6 | import codecs 7 | import librosa 8 | from six.moves import cPickle, reduce, map 9 | from collections import Counter 10 | import random 11 | 12 | class SpeechLoader(): 13 | 14 | def __init__(self, wav_path=None, label_file=None, batch_size=1, n_mfcc=20, encoding='utf-8'): 15 | self.batch_size = batch_size 16 | self.encoding = encoding 17 | self.n_mfcc = n_mfcc 18 | 19 | # path setting 20 | data_dir = os.path.join(os.getcwd(), 'cache', 'mfcc'+str(n_mfcc)) 21 | # data cache 22 | wavs_file = os.path.join(data_dir, "wavs.file") 23 | vocab_file = os.path.join(data_dir,"vocab.file") 24 | mfcc_tensor = os.path.join(data_dir, "mfcc.tensor") 25 | label_tensor = os.path.join(data_dir, "label.tensor") 26 | 27 | # data process 28 | if not (os.path.exists(vocab_file) and os.path.exists(mfcc_tensor) and os.path.exists(label_tensor)): 29 | print("reading wav files") 30 | self.preprocess(wav_path, label_file, wavs_file, vocab_file, mfcc_tensor, label_tensor) 31 | else: 32 | print("loading preprocessed files") 33 | self.load_preprocessed(vocab_file, mfcc_tensor, label_tensor) 34 | 35 | # minibatch 36 | self.create_batches() 37 | # pointer reset 38 | self.reset_batch_pointer() 39 | 40 | def preprocess(self, wav_path, label_file, wavs_file, vocab_file, mfcc_tensor, label_tensor): 41 | def handle_file(dirpath, filename): 42 | if filename.endswith('.wav') or filename.endswith('.WAV'): 43 | filename_path = os.path.join(dirpath, filename) 44 | if os.stat(filename_path).st_size < 240000: 45 | return 46 | return filename_path 47 | 48 | # read label file 49 | labels_dict = {} 50 | with codecs.open(label_file,"r", encoding=self.encoding) as f: 51 | for label in f: 52 | label = label.strip('\n') 53 | labels_id = label.split(' ',1)[0] 54 | labels_text = label.split(' ',1)[1] 55 | labels_dict[labels_id] = labels_text 56 | # print("",len(labels_dict)) # 10000 57 | 58 | # wav files 59 | wav_files = [] 60 | if wav_path: 61 | for (dirpath, dirnames, filenames) in os.walk(wav_path): 62 | for filename in filenames: 63 | if handle_file(dirpath,filename): 64 | wav_files.append(handle_file(dirpath,filename)) 65 | print("初始样本数:", len(wav_files)) #样本数 66 | 67 | # data filter and feature extraction 68 | wav_files_filter = [] 69 | labels_filter = [] 70 | self.mfcc_tensor = [] 71 | self.wav_max_len = 0 72 | cnt = 0 73 | for wav_file in wav_files: 74 | wav_id = os.path.basename(wav_file).split('.')[0] 75 | if wav_id in labels_dict: 76 | print('样本'+str(cnt), wav_file) 77 | labels_filter.append(labels_dict[wav_id]) 78 | wav_files_filter.append(wav_file) 79 | # mfcc feature 80 | wav_file, sr = librosa.load(wav_file, mono=True) 81 | mfcc = np.transpose(librosa.feature.mfcc(wav_file, sr, n_mfcc=self.n_mfcc),[1,0]) 82 | self.mfcc_tensor.append(mfcc.tolist()) 83 | cnt += 1 84 | self.wav_max_len = max(len(mfcc) for mfcc in self.mfcc_tensor) 85 | print("样本总数:", cnt) 86 | print("最长的语音:", self.wav_max_len) 87 | # print(len(wav_files_filter), len(labels_filter),len(wav2mfcc)) # assert check dimensions 88 | 89 | with open(wavs_file, 'wb') as f: 90 | cPickle.dump(wav_files_filter, f) 91 | 92 | with open(mfcc_tensor, 'wb') as f: 93 | cPickle.dump(self.mfcc_tensor, f) 94 | 95 | # vocab file 96 | vocabs = [] 97 | for label in labels_filter: 98 | vocabs += [word for word in label] 99 | count = Counter(vocabs) 100 | count_pairs = sorted(count.items(), key=lambda x:-x[1]) 101 | words, _ = zip(*count_pairs) 102 | self.wordmap = dict(zip(words, range(len(words)))) 103 | 104 | self.vocab_size = len(words) 105 | print("词汇表大小:",len(words)) 106 | 107 | with open(vocab_file,'wb') as f: 108 | cPickle.dump(self.wordmap, f) 109 | 110 | # label vector 111 | label_encoder = lambda word: self.wordmap.get(word, len(words)) 112 | self.label_tensor = [list(map(label_encoder, label)) for label in labels_filter] 113 | self.label_max_len = max(len(label) for label in self.label_tensor) 114 | print("最长的句子:", self.label_max_len) 115 | 116 | with open(label_tensor,'wb') as f: 117 | cPickle.dump(self.label_tensor, f) 118 | 119 | def load_preprocessed(self, vocab_file, mfcc_tensor, label_tensor): 120 | with open(vocab_file, 'rb') as f: 121 | self.wordmap = cPickle.load(f) 122 | self.vocab_size = len(self.wordmap) 123 | print("词汇表大小:",self.vocab_size) 124 | 125 | with open(mfcc_tensor, 'rb') as f: 126 | self.mfcc_tensor = cPickle.load(f) 127 | self.wav_max_len = max(len(mfcc) for mfcc in self.mfcc_tensor) 128 | print("最长的语音:", self.wav_max_len) 129 | 130 | with open(label_tensor, 'rb') as f: 131 | self.label_tensor = cPickle.load(f) 132 | self.label_max_len = max(len(label) for label in self.label_tensor) 133 | print("最长的句子:", self.label_max_len) 134 | 135 | def create_batches(self): 136 | self.n_batches = len(self.mfcc_tensor) // self.batch_size 137 | if self.n_batches==0: 138 | assert False, "Not enough data. Make seq_length and batch_size small." 139 | 140 | self.mfcc_tensor = self.mfcc_tensor[:self.n_batches*self.batch_size] 141 | self.label_tensor = self.label_tensor[:self.n_batches*self.batch_size] 142 | 143 | # random shuffle the data 144 | if len(self.mfcc_tensor) != len(self.label_tensor): 145 | assert False, "Data length does not match the label length!" 146 | 147 | data_tensor = [] 148 | for i in range(len(self.mfcc_tensor)): 149 | data_tensor.append([self.mfcc_tensor[i], self.label_tensor[i]]) 150 | 151 | random.shuffle(data_tensor) 152 | self.mfcc_tensor = [] 153 | self.label_tensor = [] 154 | for i in range(len(data_tensor)): 155 | self.mfcc_tensor.append(data_tensor[i][0]) 156 | self.label_tensor.append(data_tensor[i][1]) 157 | 158 | # create batches 159 | self.x_batches = [] 160 | self.y_batches = [] 161 | 162 | for i in range(self.n_batches): 163 | from_index = i*self.batch_size 164 | to_index = from_index + self.batch_size 165 | mfcc_batches = self.mfcc_tensor[from_index:to_index] 166 | label_batches = self.label_tensor[from_index:to_index] 167 | # 补零对齐 168 | for mfcc in mfcc_batches: 169 | while len(mfcc) < self.wav_max_len: 170 | mfcc.append([0]*self.n_mfcc) 171 | for label in label_batches: 172 | while len(label) < self.label_max_len: 173 | label.append(0) 174 | 175 | self.x_batches.append(mfcc_batches) 176 | self.y_batches.append(label_batches) 177 | 178 | def next_batch(self): 179 | x, y = self.x_batches[self.pointer], self.y_batches[self.pointer] 180 | self.pointer += 1 181 | return x, y 182 | 183 | def reset_batch_pointer(self): 184 | self.pointer = 0 --------------------------------------------------------------------------------