├── README.md
├── TDeval.py
└── loss.py


/README.md:
--------------------------------------------------------------------------------
1 | # Characterizing-Audio-Adversarial-Examples-using-Temporal-Dependency
2 | This repository contains the code for the ICLR 2019 Paper, "Characterizing Audio Adversarial Examples using Temporal Dependency".
3 | 


--------------------------------------------------------------------------------
/TDeval.py:
--------------------------------------------------------------------------------
  1 | import scipy.io.wavfile as wav
  2 | 
  3 | import argparse
  4 | 
  5 | import tensorflow as tf 
  6 | import numpy as np
  7 | import os
  8 | import sys
  9 | from sklearn.metrics import roc_curve, auc
 10 | sys.path.append("DeepSpeech")
 11 | 
# Replace tf.load_op_library with an identity function: any call made to it
# afterwards simply returns its argument instead of loading a shared library.
# NOTE(review): presumably this lets DeepSpeech be imported without its
# compiled custom ops being available - confirm.
tf.load_op_library = lambda x: x
# While DeepSpeech is imported, os.path.exists answers True for every path;
# the real function is put back immediately after the import.
tmp = os.path.exists
os.path.exists = lambda x: True
import DeepSpeech
os.path.exists = tmp

from util.text import ctc_label_dense_to_sparse
from tf_logits3 import get_logits
from tqdm import tqdm
# Output alphabet, indexed by the integer labels produced by the decoder.
# The last symbol "-" doubles as the filler for unused positions and is
# stripped from every transcript by Decoder.transcribe.
toks = " abcdefghijklmnopqrstuvwxyz'-"

# Turn TrainingCoordinator's constructor and start() into no-ops.
DeepSpeech.TrainingCoordinator.__init__ = lambda x: None

DeepSpeech.TrainingCoordinator.start = lambda x: None
 26 | import loss
 27 | # Use the same decode framework as carlini used :) 
 28 | class Decoder:
 29 | 	def __init__(self, sess, max_audio_len):
 30 | 		self.sess = sess
 31 | 		self.max_audio_len = max_audio_len
 32 | 		self.original = original = tf.Variable(np.zeros((1, max_audio_len), dtype=np.float32), name='qq_original')
 33 | 		self.lengths = lengths = tf.Variable(np.zeros(1, dtype=np.int32), name='qq_lengths')
 34 | 
 35 | 		with tf.variable_scope("", reuse=tf.AUTO_REUSE):
 36 | 			logits, features = get_logits(original, lengths)
 37 | 
 38 | 		self.logits = logits
 39 | 		self.features = features
 40 | 		saver = tf.train.Saver([x for x in tf.global_variables() if 'qq' not in x.name])
 41 | 		saver.restore(sess, "models/session_dump")
 42 | 		self.decoded, _ = tf.nn.ctc_beam_search_decoder(logits, lengths, merge_repeated=False, beam_width=1000)
 43 | 
 44 | 	def transcribe(self, audio, lengths):
 45 | 		sess = self.sess
 46 | 		sess.run(self.original.assign(np.array(audio)))
 47 | 		sess.run(self.lengths.assign((np.array(lengths)-1)//320))
 48 | 		out, logits = sess.run((self.decoded, self.logits))
 49 | 		chars = out[0].values
 50 | 		res = np.zeros(out[0].dense_shape)+len(toks)-1		
 51 | 		for ii in range(len(out[0].values)):
 52 | 			x,y = out[0].indices[ii]
 53 | 			res[x,y] = out[0].values[ii]
 54 | 		res = ["".join(toks[int(x)] for x in y).replace("-","") for y in res]
 55 | 		return res[0]
 56 | 
 57 | def decode(audio, num):
 58 | 	global maxlen
 59 | 	global D
 60 | 	global ref
 61 | 	audios = [list(audio)]
 62 | 	lengths = [int(maxlen * num)]
 63 | 	audios = np.array(audios)
 64 | 	res = D.transcribe(audios, lengths)
 65 | 	return res
 66 | 
 67 | num_samples = 50
 68 | y_test = np.zeros(num_samples * 2)
 69 | roc_auc = np.zeros(3)
 70 | TD = np.zeros((3, num_samples * 2), dtype = np.float32)
 71 | count = 0
 72 | if __name__ == '__main__':
 73 | 	sess = tf.Session()
 74 | 	parser = argparse.ArgumentParser(description = None)
 75 | 	parser.add_argument('--cut', type = float, required = True)
 76 | 	args = parser.parse_args()
 77 | 
 78 | 
 79 | 	ratio = args.cut
 80 | 	pbar = tqdm(range(num_samples), unit='steps', ascii = True)
 81 | 	for epoch in pbar:
 82 | 		x, y = wav.read("librisample" + str(epoch) + ".wav")
 83 | 		z, w = wav.read("librifinal" + str(epoch) + ".wav")
 84 | 		maxlen = len(y)
 85 | 
 86 | 		#ratio = np.random.random_sample() * 0.6 + 0.2 
 87 | 		#ratio = (numcut) * 1.0 / (numcut - 1)
 88 | 		D = Decoder(sess, maxlen)
 89 | 		stry = decode(y, 1)
 90 | 		strw = decode(w, 1)
 91 | 		halfy = decode(y, ratio)
 92 | 		halfw = decode(w, ratio)
 93 | 
 94 | 		#print ("Origin: " + stry)
 95 | 		#print ("Half of Origin: " + halfy)
 96 | 		s1 = loss.newWER(stry, halfy)
 97 | 		s2 = loss.newCER(stry, halfy)
 98 | 		s3 = loss.lcp(stry, halfy)
 99 | 
100 | 		y_test[count] = 0
101 | 		TD[0][count] = float(s1)
102 | 		TD[1][count] = float(s2)
103 | 		TD[2][count] = float(s3)
104 | 
105 | 		count += 1
106 | 		#print ("WER: " + str(s1) + " CER: " + str(s2) + " LCP: " + str(s3))
107 | 		#print ("Adv: " + strw)
108 | 		#print ("Half of Adv: " + halfw)
109 | 		s1 = loss.newWER(strw, halfw)
110 | 		s2 = loss.newCER(strw, halfw)
111 | 		s3 = loss.lcp(strw, halfw)
112 | 		
113 | 		y_test[count] = 1
114 | 		TD[0][count] = float(s1)
115 | 		TD[1][count] = float(s2)
116 | 		TD[2][count] = float(s3)
117 | 		count += 1
118 | 		#print ("WER: " + str(s1) + " CER: " + str(s2) + " LCP: " + str(s3))
119 | 
120 | 
121 | 	for i in range(3):
122 | 		if (i == 2):
123 | 			y_test = 1 - y_test
124 | 		fpr, tpr, threshold = roc_curve(y_test, TD[i])
125 | 		roc_auc[i] = auc(fpr, tpr)
126 | 
127 | 	print ("WER: " + str(roc_auc[0]) + " CER: " + str(roc_auc[1]) + " LCP: " + str(roc_auc[2]))
128 | 
129 | 
130 | 
131 | 


--------------------------------------------------------------------------------
/loss.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | 
  3 | def lcp(x, y):
  4 | 	x = x.split()
  5 | 	y = y.split()
  6 | 	n = len(x)
  7 | 	m = len(y)
  8 | 
  9 | 	k = min(n, m)
 10 | 	idx = 0
 11 | 	for i in range(k):
 12 | 		if (x[i] != y[i]):
 13 | 			break
 14 | 		idx = i + 1
 15 | 	
 16 | 	return idx * 1.0 / max(n, m)
 17 | 
 18 | def newWER(x, y):
 19 | 	x = x.split()
 20 | 	y = y.split()
 21 | 	
 22 | 	n = len(x)
 23 | 	m = len(y)
 24 | 	k = min(n, m)
 25 | 	d = np.zeros((k + 1) * (k + 1), dtype = np.uint8).reshape(k + 1, k + 1)
 26 | 
 27 | 	for i in range(k + 1):
 28 | 		for j in range(k + 1):
 29 | 			if i == 0:
 30 | 				d[0][j] = j
 31 | 			elif j == 0:
 32 | 				d[i][0] = i
 33 | 
 34 | 	for i in range(1, k + 1):
 35 | 		for j in range(1, k + 1):
 36 | 			if (x[i - 1] == y[j - 1]):
 37 | 				d[i][j] = d[i - 1][j - 1]
 38 | 			else:
 39 | 				S = d[i - 1][j - 1] + 1
 40 | 				I = d[i][j - 1] + 1
 41 | 				D = d[i - 1][j] + 1
 42 | 				d[i][j] = min(S, I, D)
 43 | 	
 44 | 	print (d[k][k])
 45 | 	return d[k][k] * 1.0 / k
 46 | 
 47 | def WER(x, y):
 48 | 	x = x.split()
 49 | 	y = y.split()
 50 | 	
 51 | 	n = len(x)
 52 | 	m = len(y)
 53 | 	print (n)
 54 | 	print (m)
 55 | 	d = np.zeros((n + 1) * (m + 1), dtype = np.uint8).reshape(n + 1, m + 1)
 56 | 
 57 | 	for i in range(n + 1):
 58 | 		for j in range(m + 1):
 59 | 			if i == 0:
 60 | 				d[0][j] = j
 61 | 			elif j == 0:
 62 | 				d[i][0] = i
 63 | 
 64 | 	for i in range(1, n + 1):
 65 | 		for j in range(1, m + 1):
 66 | 			if (x[i - 1] == y[j - 1]):
 67 | 				d[i][j] = d[i - 1][j - 1]
 68 | 			else:
 69 | 				S = d[i - 1][j - 1] + 1
 70 | 				I = d[i][j - 1] + 1
 71 | 				D = d[i - 1][j] + 1
 72 | 				d[i][j] = min(S, I, D)
 73 | 	
 74 | 	print (d[n][m])
 75 | 	return d[n][m] * 1.0 / n
 76 | 
 77 | def newCER(x, y):
 78 | 	x = x.replace(" ", "")
 79 | 	y = y.replace(" ", "")
 80 | 	n = len(x)
 81 | 	m = len(y)
 82 | 	k = min(n, m)
 83 | 	d = np.zeros((k + 1) * (k + 1), dtype = np.uint8).reshape(k + 1, k + 1)
 84 | 
 85 | 	for i in range(k + 1):
 86 | 		for j in range(k + 1):
 87 | 			if i == 0:
 88 | 				d[0][j] = j
 89 | 			elif j == 0:
 90 | 				d[i][0] = i
 91 | 
 92 | 	for i in range(1, k + 1):
 93 | 		for j in range(1, k + 1):
 94 | 			if (x[i - 1] == y[j - 1]):
 95 | 				d[i][j] = d[i - 1][j - 1]
 96 | 			else:
 97 | 				S = d[i - 1][j - 1] + 1
 98 | 				I = d[i][j - 1] + 1
 99 | 				D = d[i - 1][j] + 1
100 | 				d[i][j] = min(S, I, D)
101 | 	
102 | 	return d[k][k] * 1.0 / k
103 | 
def CER(x, y):
    """Return the character error rate of `y` measured against `x`.

    Space characters are removed from both transcripts; the character-level
    edit distance between them is divided by the length of the stripped
    reference `x`.

    Args:
        x: reference transcript.
        y: hypothesis transcript.

    Returns:
        The error rate as a float (it can exceed 1.0). For an empty
        reference the result is 0.0 if `y` is empty too and infinity
        otherwise.
    """
    reference = x.replace(" ", "")
    hypothesis = y.replace(" ", "")
    n = len(reference)
    m = len(hypothesis)
    if n == 0:
        return 0.0 if m == 0 else float("inf")

    # Row-by-row edit distance on plain Python ints: transcripts routinely
    # exceed 255 characters, which would wrap around in a uint8 table.
    prev = list(range(m + 1))
    for i, ref_char in enumerate(reference, start=1):
        curr = [i]
        for j, hyp_char in enumerate(hypothesis, start=1):
            if ref_char == hyp_char:
                curr.append(prev[j - 1])
            else:
                # substitution, insertion, deletion
                curr.append(1 + min(prev[j - 1], curr[j - 1], prev[j]))
        prev = curr

    return prev[m] * 1.0 / n
129 | 
130 | 
131 | 


--------------------------------------------------------------------------------