├── cache
    └── readme.md
├── data
    └── readme.md
├── model
    └── readme.me
├── README.md
├── train.py
├── test.py
├── model.py
└── utils.py


/cache/readme.md:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/data/readme.md:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/model/readme.me:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | Speech-to-Text-WaveNet : End-to-end sentence level Chinese speech recognition using DeepMind's WaveNet
 2 | =
 3 | A TensorFlow implementation of Chinese speech recognition based on DeepMind's [WaveNet: A Generative Model for Raw Audio](https://arxiv.org/abs/1609.03499) (hereafter "the Paper").
 4 | 
 5 | Version
 6 | ---
 7 | Current Version : 0.0.1
 8 | 
 9 | Dependencies
10 | ---
11 | 1. python == 3.5
12 | 2. tensorflow == 1.0.0
13 | 3. librosa == 0.5.0
14 | 
15 | Dataset
16 | ---
17 | [清华30小时中文数据集](http://data.cslt.org/thchs30/standalone.html)
18 | 
19 | Directories
20 | ---
21 | 1. cache: stores the extracted data features and the word dictionary
22 | 2. data: wav files and related labels
23 | 3. model: save the models
24 | 
25 | Network model
26 | ---
27 | 1. Data random shuffle per epoch
28 | 2. Xavier initialization
29 | 3. Adam optimization algorithms
30 | 4. Batch Normalization
31 | 
32 | Train the network
33 | ---
34 | python3 train.py
35 | 
36 | Test the network
37 | ---
38 | python3 test.py
39 | 
40 | Other resources
41 | ---
42 | 1. [TensorFlow练习15: 中文语音识别](http://blog.topspeedsnail.com/archives/10696#more-10696)
43 | 2. [ibab's WaveNet (speech synthesis) TensorFlow implementation](https://github.com/ibab/tensorflow-wavenet)
44 | 3. [buriburisuri's WaveNet (English speech recognition) TensorFlow and sugartensor implementation](https://github.com/buriburisuri/speech-to-text-wavenet#version)
45 | 


--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
 1 | #-*- coding:utf-8 -*-
 2 | 
 3 | from __future__ import print_function
 4 | from utils import SpeechLoader
 5 | from model import Model
 6 | import tensorflow as tf #1.0.0
 7 | import time
 8 | import os
 9 | 
def train(batch_size=32, n_epoch=100, n_mfcc=60):
	"""Train the speech recognition model and checkpoint it periodically.

	Data is read from ``<cwd>/data/wav/train`` with transcripts taken from
	``<cwd>/data/doc/trans/train.word.txt``; checkpoints are written to
	``<cwd>/model`` under the prefix ``speech.module``.

	Args:
		batch_size: number of utterances per training step.
		n_epoch: number of passes over the training data.
		n_mfcc: number of MFCC coefficients per frame.
	"""
	# load speech data
	wav_path = os.path.join(os.getcwd(), 'data', 'wav', 'train')
	label_file = os.path.join(os.getcwd(), 'data', 'doc', 'trans', 'train.word.txt')
	speech_loader = SpeechLoader(wav_path, label_file, batch_size, n_mfcc)
	n_out = speech_loader.vocab_size

	# load model
	model = Model(n_out, batch_size=batch_size, n_mfcc=n_mfcc)

	# make sure the checkpoint directory exists before the first save
	model_dir = os.path.join(os.getcwd(), 'model')
	if not os.path.isdir(model_dir):
		os.makedirs(model_dir)
	checkpoint_prefix = os.path.join(model_dir, 'speech.module')

	with tf.Session() as sess:
		sess.run(tf.global_variables_initializer())

		saver = tf.train.Saver(tf.global_variables())

		for epoch in range(n_epoch):
			speech_loader.create_batches() # random shuffle data
			speech_loader.reset_batch_pointer()
			for batch in range(speech_loader.n_batches):
				start = time.time()
				batches_wav, batches_label = speech_loader.next_batch()
				feed = {model.input_data: batches_wav, model.targets: batches_label}
				train_loss, _ = sess.run([model.cost, model.optimizer_op], feed_dict=feed)
				end = time.time()
				print("epoch: %d/%d, batch: %d/%d, loss: %s, time: %.3f."%(epoch, n_epoch, batch, speech_loader.n_batches, train_loss, end-start))

			# save every 5th epoch, and always the last one so that the final
			# epochs of training are not lost
			if epoch % 5 == 0 or epoch == n_epoch - 1:
				saver.save(sess, checkpoint_prefix, global_step=epoch)
44 | 
45 | 
# Run training when this file is executed as a script.
if __name__ == '__main__':
	train()


--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
 1 | #-*- coding:utf-8 -*-
 2 | 
 3 | from __future__ import print_function
 4 | from model import Model
 5 | from utils import SpeechLoader
 6 | 
 7 | import tensorflow as tf  # 1.0.0
 8 | import numpy as np
 9 | import librosa
10 | import os
11 |  
# Speech recognition.
# The model is built with its default batch_size of 1: one utterance per run.
def speech_to_text(n_mfcc=60, wav_files=None):
    """Transcribe wav files with the latest checkpoint found in ``model``.

    Args:
        n_mfcc: number of MFCC coefficients per frame; it must match the value
            used for training because it selects both the feature cache
            directory and the model input width.
        wav_files: iterable of wav file paths to transcribe. Defaults to
            ``D4_750.wav`` .. ``D4_754.wav`` under ``<cwd>/data/wav/test/D4``.

    Raises:
        IOError: if no checkpoint is found in the ``model`` directory.
    """
    if wav_files is None:
        wav_files = [
            os.path.join(os.getcwd(), 'data', 'wav', 'test', 'D4', 'D4_' + str(j) + '.wav')
            for j in range(750, 755)
        ]

    # load data; n_mfcc is passed on so that the loader reads the same
    # cache/mfcc<n_mfcc> directory that training populated
    speech_loader = SpeechLoader(n_mfcc=n_mfcc)

    # load model
    model = Model(speech_loader.vocab_size, n_mfcc=n_mfcc, is_training=False)

    # word dict: id -> word
    wmap = {value: key for key, value in speech_loader.wordmap.items()}

    # Build the decoding ops once; creating them per file would add new nodes
    # to the graph on every iteration.
    logits_time_major = tf.transpose(model.logit, perm=[1, 0, 2])
    decoded, _ = tf.nn.ctc_beam_search_decoder(logits_time_major, model.seq_len, top_paths=1, merge_repeated=True)
    # +1 undoes the -1 shift the model applies to target ids for the CTC loss
    predict = tf.sparse_to_dense(decoded[0].indices, decoded[0].dense_shape, decoded[0].values) + 1

    saver = tf.train.Saver()

    with tf.Session() as sess:
        checkpoint = tf.train.latest_checkpoint('model')
        if checkpoint is None:
            raise IOError("no checkpoint found in the 'model' directory; run train.py first")
        saver.restore(sess, checkpoint)

        for wav_file in wav_files:
            # extract feature
            wav, sr = librosa.load(wav_file, mono=True)
            mfcc = np.transpose(np.expand_dims(librosa.feature.mfcc(wav, sr, n_mfcc=n_mfcc), axis=0), [0, 2, 1])
            mfcc = mfcc.tolist()

            # fill 0
            while len(mfcc[0]) < speech_loader.wav_max_len:
                mfcc[0].append([0] * n_mfcc)

            # recognition
            output = sess.run(predict, feed_dict={model.input_data: mfcc})

            # ids without a dictionary entry are skipped
            words = ''.join(wmap.get(word_id, '') for word_id in output[0])

            # print result
            print("---------------------------")
            print("Input: " + wav_file)
            print("Output: " + words)
55 | 
56 | 
# Run recognition when this file is executed as a script.
if __name__ == '__main__':
        speech_to_text()
59 |         
60 |     


--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
  1 | #-*- coding:utf-8 -*-
  2 | __author__ = 'Deeper'
  3 | import tensorflow as tf  # 1.0.0
  4 | import numpy as np
  5 | 
  6 | class Model():
  7 | 	def __init__(self, n_out, batch_size=1, n_mfcc=20, is_training=True):
  8 | 		n_dim = 128
  9 | 		self.is_training = is_training
 10 | 
 11 | 		self.input_data = tf.placeholder(dtype=tf.float32, shape=[batch_size, None, n_mfcc])
 12 | 		self.seq_len = tf.reduce_sum(tf.cast(tf.not_equal(tf.reduce_sum(self.input_data, reduction_indices=2), 0.), tf.int32), reduction_indices=1)
 13 | 		self.targets = tf.placeholder(dtype=tf.int32, shape=[batch_size, None])
 14 | 
 15 | 		# 1D convolution
 16 | 		self.conv1d_index = 0
 17 | 		out = self.conv1d_layer(self.input_data, dim=n_dim)
 18 | 		
 19 | 		# stack hole CNN
 20 | 		n_blocks = 3
 21 | 		skip = 0
 22 | 		self.aconv1d_index = 0
 23 | 		for _ in range(n_blocks):
 24 | 			for r in [1, 2, 4, 8, 16]:
 25 | 				out, s = self.residual_block(out, size=7, rate=r, dim=n_dim)
 26 | 				skip += s
 27 | 
 28 | 		logit = self.conv1d_layer(skip, dim=skip.get_shape().as_list()[-1])
 29 | 		self.logit = self.conv1d_layer(logit, dim=n_out, bias=True, activation=None)
 30 | 
 31 | 		# CTC loss
 32 | 		indices = tf.where(tf.not_equal(tf.cast(self.targets, tf.float32), 0.))
 33 | 		target = tf.SparseTensor(indices=indices, values=tf.gather_nd(self.targets, indices)-1, dense_shape=tf.cast(tf.shape(self.targets), tf.int64))
 34 | 		loss = tf.nn.ctc_loss(target, self.logit, self.seq_len, time_major=False)
 35 | 		self.cost = tf.reduce_mean(loss)
 36 | 
 37 | 		# optimizer
 38 | 		optimizer = tf.train.AdamOptimizer()
 39 | 		var_list = [var for var in tf.trainable_variables()]
 40 | 		gradient = optimizer.compute_gradients(self.cost, var_list=var_list)
 41 | 		self.optimizer_op = optimizer.apply_gradients(gradient)
 42 | 
 43 | 	def residual_block(self, input_tensor, size, rate, dim):
 44 | 		conv_filter = self.aconv1d_layer(input_tensor, size=size, rate=rate, activation='tanh')
 45 | 		conv_gate = self.aconv1d_layer(input_tensor, size=size, rate=rate, activation='sigmoid')
 46 | 		out = conv_filter * conv_gate
 47 | 		out = self.conv1d_layer(out, size=1, dim=dim)
 48 | 		return out + input_tensor, out
 49 | 
 50 | 	def conv1d_layer(self, input_tensor, size=1, dim=128, bias=False, activation='tanh'):
 51 | 		with tf.variable_scope('conv1d'+str(self.conv1d_index)):
 52 | 			shape = input_tensor.get_shape().as_list()
 53 | 			kernel = tf.get_variable('kernel', (size, shape[-1], dim), dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer())
 54 | 			if bias:
 55 | 				b = tf.get_variable('b', [dim], dtype=tf.float32, initializer=tf.constant_initializer(0))
 56 | 			out = tf.nn.conv1d(input_tensor, kernel, stride=1, padding='SAME') + (b if bias else 0)
 57 | 			if not bias:
 58 | 				out = self.batch_norm_wrapper(out)
 59 | 
 60 | 			out = self.activation_wrapper(out, activation)
 61 | 			
 62 | 			self.conv1d_index += 1
 63 | 			return out
 64 | 
 65 | 	def aconv1d_layer(self, input_tensor, size=7, rate=2, bias=False, activation='tanh'):
 66 | 		with tf.variable_scope('aconv1d_'+str(self.aconv1d_index)):
 67 | 			shape = input_tensor.get_shape().as_list()
 68 | 			kernel = tf.get_variable('kernel',(1, size, shape[-1], shape[-1]), dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer())
 69 | 			if bias:
 70 | 				b = tf.get_variable('b', [shape[-1]], dtype=tf.float32, initializer=tf.constant_initializer(0))
 71 | 			out = tf.nn.atrous_conv2d(tf.expand_dims(input_tensor, dim=1), kernel, rate=rate, padding='SAME')
 72 | 			out = tf.squeeze(out, [1])
 73 | 			if not bias:
 74 | 				out = self.batch_norm_wrapper(out)
 75 | 
 76 | 			out = self.activation_wrapper(out, activation)
 77 | 			
 78 | 			self.aconv1d_index += 1
 79 | 			return out
 80 | 
 81 | 	def batch_norm_wrapper(self, inputs, decay=0.999):
 82 | 		epsilon = 1e-3
 83 | 		shape = inputs.get_shape().as_list()
 84 | 
 85 | 		beta = tf.get_variable('beta', shape[-1], dtype=tf.float32, initializer=tf.constant_initializer(0))
 86 | 		gamma = tf.get_variable('gamma', shape[-1], dtype=tf.float32, initializer=tf.constant_initializer(1))
 87 | 		pop_mean = tf.get_variable('mean', shape[-1], dtype=tf.float32, initializer=tf.constant_initializer(0))
 88 | 		pop_var = tf.get_variable('variance', shape[-1], dtype=tf.float32, initializer=tf.constant_initializer(1))
 89 | 		if self.is_training:
 90 | 			batch_mean, batch_var = tf.nn.moments(inputs, axes=list(range(len(shape)-1)))
 91 | 			train_mean = tf.assign(pop_mean, pop_mean*decay+batch_mean*(1-decay))
 92 | 			train_var =tf.assign(pop_var, pop_var*decay+batch_var*(1-decay))
 93 | 			with tf.control_dependencies([train_mean, train_var]):
 94 | 				return tf.nn.batch_normalization(inputs, batch_mean, batch_var, beta, gamma, epsilon)
 95 | 		else:
 96 | 			return tf.nn.batch_normalization(inputs, pop_mean, pop_var, beta, gamma, epsilon)
 97 | 
 98 | 	def activation_wrapper(self, inputs, activation):
 99 | 		out = inputs
100 | 
101 | 		if activation == 'sigmoid':
102 | 			out = tf.nn.sigmoid(out)
103 | 		elif activation == 'tanh':
104 | 			out = tf.nn.tanh(out)
105 | 		elif activation == 'relu':
106 | 			out = tf.nn.relu(out)
107 | 
108 | 		return out


--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
  1 | #-*- coding:utf-8 -*-
  2 | __author__ = 'Deeper'
  3 | import tensorflow as tf 
  4 | import numpy as np 
  5 | import os
  6 | import codecs
  7 | import librosa
  8 | from six.moves import cPickle, reduce, map
  9 | from collections import Counter
 10 | import random
 11 | 
 12 | class SpeechLoader():
 13 | 
 14 | 	def __init__(self, wav_path=None, label_file=None, batch_size=1, n_mfcc=20, encoding='utf-8'):
 15 | 		self.batch_size = batch_size
 16 | 		self.encoding = encoding
 17 | 		self.n_mfcc = n_mfcc
 18 | 
 19 | 		# path setting
 20 | 		data_dir = os.path.join(os.getcwd(), 'cache', 'mfcc'+str(n_mfcc))
 21 | 		# data cache
 22 | 		wavs_file = os.path.join(data_dir, "wavs.file")
 23 | 		vocab_file = os.path.join(data_dir,"vocab.file")
 24 | 		mfcc_tensor = os.path.join(data_dir, "mfcc.tensor")
 25 | 		label_tensor = os.path.join(data_dir, "label.tensor")
 26 | 
 27 | 		# data process
 28 | 		if not (os.path.exists(vocab_file) and os.path.exists(mfcc_tensor) and os.path.exists(label_tensor)):
 29 | 			print("reading wav files")
 30 | 			self.preprocess(wav_path, label_file, wavs_file, vocab_file, mfcc_tensor, label_tensor)
 31 | 		else:
 32 | 			print("loading preprocessed files")
 33 | 			self.load_preprocessed(vocab_file, mfcc_tensor, label_tensor)
 34 | 
 35 | 		# minibatch
 36 | 		self.create_batches()
 37 | 		# pointer reset
 38 | 		self.reset_batch_pointer()
 39 | 
 40 | 	def preprocess(self, wav_path, label_file, wavs_file, vocab_file, mfcc_tensor, label_tensor):
 41 | 		def handle_file(dirpath, filename):
 42 | 			if filename.endswith('.wav') or filename.endswith('.WAV'):
 43 | 				filename_path = os.path.join(dirpath, filename)
 44 | 				if os.stat(filename_path).st_size < 240000:
 45 | 					return
 46 | 				return filename_path
 47 | 
 48 | 		# read label file
 49 | 		labels_dict = {}
 50 | 		with codecs.open(label_file,"r", encoding=self.encoding) as f:
 51 | 			for label in f:
 52 | 				label = label.strip('\n')
 53 | 				labels_id = label.split(' ',1)[0]
 54 | 				labels_text = label.split(' ',1)[1]
 55 | 				labels_dict[labels_id] = labels_text
 56 | 		# print("",len(labels_dict)) # 10000
 57 | 		
 58 | 		# wav files
 59 | 		wav_files = []
 60 | 		if wav_path:
 61 | 			for (dirpath, dirnames, filenames) in os.walk(wav_path):
 62 | 				for filename in filenames:
 63 | 					if handle_file(dirpath,filename):
 64 | 						wav_files.append(handle_file(dirpath,filename))
 65 | 		print("初始样本数：", len(wav_files)) #样本数
 66 | 
 67 | 		# data filter and feature extraction
 68 | 		wav_files_filter = []
 69 | 		labels_filter = []
 70 | 		self.mfcc_tensor = []
 71 | 		self.wav_max_len = 0
 72 | 		cnt = 0
 73 | 		for wav_file in wav_files:
 74 | 			wav_id = os.path.basename(wav_file).split('.')[0]
 75 | 			if wav_id in labels_dict:
 76 | 				print('样本'+str(cnt), wav_file)
 77 | 				labels_filter.append(labels_dict[wav_id])
 78 | 				wav_files_filter.append(wav_file)
 79 | 				# mfcc feature
 80 | 				wav_file, sr = librosa.load(wav_file, mono=True)
 81 | 				mfcc = np.transpose(librosa.feature.mfcc(wav_file, sr, n_mfcc=self.n_mfcc),[1,0])
 82 | 				self.mfcc_tensor.append(mfcc.tolist())
 83 | 				cnt += 1
 84 | 		self.wav_max_len = max(len(mfcc) for mfcc in self.mfcc_tensor)
 85 | 		print("样本总数：", cnt)
 86 | 		print("最长的语音：", self.wav_max_len)
 87 | 		# print(len(wav_files_filter), len(labels_filter),len(wav2mfcc)) # assert check dimensions
 88 | 
 89 | 		with open(wavs_file, 'wb') as f:
 90 | 			cPickle.dump(wav_files_filter, f)
 91 | 
 92 | 		with open(mfcc_tensor, 'wb') as f:
 93 | 			cPickle.dump(self.mfcc_tensor, f)
 94 | 
 95 | 		# vocab file
 96 | 		vocabs = []
 97 | 		for label in labels_filter:
 98 | 			vocabs += [word for word in label]
 99 | 		count = Counter(vocabs)
100 | 		count_pairs = sorted(count.items(), key=lambda x:-x[1])
101 | 		words, _ = zip(*count_pairs)
102 | 		self.wordmap = dict(zip(words, range(len(words))))
103 | 
104 | 		self.vocab_size = len(words)
105 | 		print("词汇表大小：",len(words))
106 | 
107 | 		with open(vocab_file,'wb') as f:
108 | 			cPickle.dump(self.wordmap, f)
109 | 
110 | 		# label vector
111 | 		label_encoder = lambda word: self.wordmap.get(word, len(words))
112 | 		self.label_tensor = [list(map(label_encoder, label)) for label in labels_filter]
113 | 		self.label_max_len = max(len(label) for label in self.label_tensor)
114 | 		print("最长的句子：", self.label_max_len)
115 | 
116 | 		with open(label_tensor,'wb') as f:
117 | 			cPickle.dump(self.label_tensor, f)
118 | 
119 | 	def load_preprocessed(self, vocab_file, mfcc_tensor, label_tensor):
120 | 		with open(vocab_file, 'rb') as f:
121 | 			self.wordmap = cPickle.load(f) 
122 | 		self.vocab_size = len(self.wordmap)
123 | 		print("词汇表大小：",self.vocab_size)
124 | 
125 | 		with open(mfcc_tensor, 'rb') as f:
126 | 			self.mfcc_tensor = cPickle.load(f)
127 | 		self.wav_max_len = max(len(mfcc) for mfcc in self.mfcc_tensor)
128 | 		print("最长的语音：", self.wav_max_len)
129 | 
130 | 		with open(label_tensor, 'rb') as f:
131 | 			self.label_tensor = cPickle.load(f)
132 | 		self.label_max_len = max(len(label) for label in self.label_tensor)
133 | 		print("最长的句子：", self.label_max_len)
134 | 
135 | 	def create_batches(self):
136 | 		self.n_batches = len(self.mfcc_tensor) // self.batch_size
137 | 		if self.n_batches==0:
138 | 			assert False, "Not enough data. Make seq_length and batch_size small."
139 | 		
140 | 		self.mfcc_tensor = self.mfcc_tensor[:self.n_batches*self.batch_size]
141 | 		self.label_tensor = self.label_tensor[:self.n_batches*self.batch_size]
142 | 
143 | 		# random shuffle the data
144 | 		if len(self.mfcc_tensor) != len(self.label_tensor):
145 | 			assert False, "Data length does not match the label length!"
146 | 
147 | 		data_tensor = []
148 | 		for i in range(len(self.mfcc_tensor)):
149 | 			data_tensor.append([self.mfcc_tensor[i], self.label_tensor[i]])
150 | 
151 | 		random.shuffle(data_tensor)
152 | 		self.mfcc_tensor = []
153 | 		self.label_tensor = []
154 | 		for i in range(len(data_tensor)):
155 | 			self.mfcc_tensor.append(data_tensor[i][0])
156 | 			self.label_tensor.append(data_tensor[i][1])
157 | 
158 | 		# create batches
159 | 		self.x_batches = []
160 | 		self.y_batches = []
161 | 
162 | 		for i in range(self.n_batches):
163 | 			from_index = i*self.batch_size
164 | 			to_index = from_index + self.batch_size
165 | 			mfcc_batches = self.mfcc_tensor[from_index:to_index]
166 | 			label_batches = self.label_tensor[from_index:to_index]
167 | 			# 补零对齐
168 | 			for mfcc in mfcc_batches:
169 | 				while len(mfcc) < self.wav_max_len:
170 | 					mfcc.append([0]*self.n_mfcc)
171 | 			for label in label_batches:
172 | 				while len(label) < self.label_max_len:
173 | 					label.append(0)
174 | 
175 | 			self.x_batches.append(mfcc_batches)
176 | 			self.y_batches.append(label_batches)
177 | 
178 | 	def next_batch(self):
179 | 		x, y = self.x_batches[self.pointer], self.y_batches[self.pointer]
180 | 		self.pointer += 1
181 | 		return x, y
182 | 
183 | 	def reset_batch_pointer(self):
184 | 		self.pointer = 0


--------------------------------------------------------------------------------