# ├── README.md
# ├── TDeval.py
# └── loss.py
#
#
# /README.md:
# --------------------------------------------------------------------------------
# # Characterizing-Audio-Adversarial-Examples-using-Temporal-Dependency
# This repository contains the code for the ICLR 2019 Paper, "Characterizing Audio Adversarial Examples using Temporal Dependency".
#
#
# --------------------------------------------------------------------------------
# /TDeval.py:
# --------------------------------------------------------------------------------
#
# Temporal-dependency (TD) evaluation script: for each audio sample, the
# transcription of the full waveform is compared with the transcription of
# its first `--cut` fraction, for both the original and the adversarial
# recording, and the resulting consistency scores are turned into ROC AUCs.
import scipy.io.wavfile as wav

import argparse

import tensorflow as tf
import numpy as np
import os
import sys
from sklearn.metrics import roc_curve, auc
sys.path.append("DeepSpeech")

# The import order below is deliberate: the monkey-patches must be in place
# before `import DeepSpeech` runs.
# Replace custom-op loading with an identity function so no op library is loaded.
tf.load_op_library = lambda x: x
# Temporarily make every path "exist" while DeepSpeech is imported
# (presumably it checks for files at import time -- TODO confirm), then
# restore the real function.
tmp = os.path.exists
os.path.exists = lambda x: True
import DeepSpeech
os.path.exists = tmp

from util.text import ctc_label_dense_to_sparse
from tf_logits3 import get_logits
from tqdm import tqdm

# Output alphabet; the final '-' is used as a filler symbol that is stripped
# from decoded strings (see Decoder.transcribe).
toks = " abcdefghijklmnopqrstuvwxyz'-"

# Stub out the training coordinator so nothing training-related is started.
DeepSpeech.TrainingCoordinator.__init__ = lambda x: None

DeepSpeech.TrainingCoordinator.start = lambda x: None
import loss


# Use the same decode framework as carlini used :)
class Decoder:
    """Transcribe a single fixed-length waveform with CTC beam search.

    The graph is built from `get_logits` (imported from `tf_logits3`), the
    weights are restored from the checkpoint "models/session_dump", and
    decoding uses `tf.nn.ctc_beam_search_decoder` with a beam width of 1000.
    """

    def __init__(self, sess, max_audio_len):
        """Build the graph and restore the model weights.

        Args:
            sess: the TensorFlow session used for restoring and running.
            max_audio_len: number of samples held by the input variable;
                every waveform passed to `transcribe` must have this length.
        """
        self.sess = sess
        self.max_audio_len = max_audio_len
        # Input holders are variables (not placeholders); their names carry
        # the 'qq' marker so they can be excluded from checkpoint restore.
        self.original = original = tf.Variable(
            np.zeros((1, max_audio_len), dtype=np.float32), name='qq_original')
        self.lengths = lengths = tf.Variable(
            np.zeros(1, dtype=np.int32), name='qq_lengths')

        with tf.variable_scope("", reuse=tf.AUTO_REUSE):
            logits, features = get_logits(original, lengths)

        self.logits = logits
        self.features = features
        # Restore every variable except the 'qq' input holders created above.
        saver = tf.train.Saver(
            [x for x in tf.global_variables() if 'qq' not in x.name])
        saver.restore(sess, "models/session_dump")
        self.decoded, _ = tf.nn.ctc_beam_search_decoder(
            logits, lengths, merge_repeated=False, beam_width=1000)

    def transcribe(self, audio, lengths):
        """Return the decoded text of the first waveform in `audio`.

        Args:
            audio: array-like of shape (1, max_audio_len).
            lengths: one-element sequence with the number of samples of the
                waveform that should be taken into account.

        Returns:
            The transcription as a string, filler symbols removed.
        """
        sess = self.sess
        # NOTE(review): each call adds two new assign ops to the graph.
        sess.run(self.original.assign(np.array(audio)))
        # Sample count -> frame count; presumably 320 samples per output
        # frame of the feature pipeline -- TODO confirm against get_logits.
        sess.run(self.lengths.assign((np.array(lengths) - 1) // 320))
        best = sess.run(self.decoded)[0]

        # Densify the sparse result, padding with the index of '-' so that
        # padding can be stripped after the characters are joined.
        res = np.zeros(best.dense_shape) + len(toks) - 1
        for (row, col), value in zip(best.indices, best.values):
            res[row, col] = value
        res = ["".join(toks[int(x)] for x in y).replace("-", "") for y in res]
        return res[0]


def decode(audio, num):
    """Transcribe the leading `num` fraction of `audio`.

    Reads the module-level globals `D` (the current `Decoder`) and `maxlen`
    (its waveform length), both set by the main loop before each call. The
    full waveform is passed to the decoder; only the reported length is
    shortened to `int(maxlen * num)`.

    Args:
        audio: 1-D sequence of `maxlen` samples.
        num: fraction of the waveform to transcribe (1 = whole waveform).

    Returns:
        The transcription string.
    """
    audios = np.array([list(audio)])
    lengths = [int(maxlen * num)]
    return D.transcribe(audios, lengths)


num_samples = 50
# Label per scored clip: 0 = original recording, 1 = adversarial recording.
y_test = np.zeros(num_samples * 2)
# AUC per metric, in the order WER, CER, LCP.
roc_auc = np.zeros(3)
# Scores per metric (rows, same order as roc_auc) and clip (columns).
TD = np.zeros((3, num_samples * 2), dtype=np.float32)
count = 0
if __name__ == '__main__':
    sess = tf.Session()
    parser = argparse.ArgumentParser(description=None)
    parser.add_argument('--cut', type=float, required=True)
    args = parser.parse_args()

    ratio = args.cut
    pbar = tqdm(range(num_samples), unit='steps', ascii=True)
    for epoch in pbar:
        # wav.read returns (sample_rate, data); only the data is used.
        x, y = wav.read("librisample" + str(epoch) + ".wav")
        z, w = wav.read("librifinal" + str(epoch) + ".wav")
        maxlen = len(y)

        #ratio = np.random.random_sample() * 0.6 + 0.2
        #ratio = (numcut) * 1.0 / (numcut - 1)
        # NOTE(review): a new Decoder (new variables, new restore) is built
        # for every sample because the input variable's shape depends on
        # maxlen; the default graph therefore grows with each iteration.
        D = Decoder(sess, maxlen)
        stry = decode(y, 1)
        strw = decode(w, 1)
        halfy = decode(y, ratio)
        halfw = decode(w, ratio)

        # Score the original clip (label 0) and then the adversarial clip
        # (label 1): full transcription versus truncated transcription.
        for label, full, cut in ((0, stry, halfy), (1, strw, halfw)):
            y_test[count] = label
            TD[0][count] = float(loss.newWER(full, cut))
            TD[1][count] = float(loss.newCER(full, cut))
            TD[2][count] = float(loss.lcp(full, cut))
            count += 1

    for i in range(3):
        # WER and CER grow with inconsistency, LCP shrinks with it, so the
        # labels are inverted for LCP (index 2) to keep the AUCs comparable.
        labels = 1 - y_test if i == 2 else y_test
        fpr, tpr, threshold = roc_curve(labels, TD[i])
        roc_auc[i] = auc(fpr, tpr)

    print ("WER: " + str(roc_auc[0]) + " CER: " + str(roc_auc[1]) + " LCP: " + str(roc_auc[2]))


# --------------------------------------------------------------------------------
# /loss.py:
# --------------------------------------------------------------------------------
#
# Consistency metrics between two transcriptions: longest common prefix,
# word error rate and character error rate (plain and prefix-truncated).
import numpy as np


def _edit_distance(a, b):
    """Return the Levenshtein distance between the sequences `a` and `b`.

    Works on any pair of indexable sequences (lists of words, strings).
    Uses plain Python integers and two rolling rows, so the result cannot
    wrap around the way a fixed-width `uint8` table does past 255.
    """
    previous = list(range(len(b) + 1))
    for i, item_a in enumerate(a, 1):
        current = [i]
        for j, item_b in enumerate(b, 1):
            if item_a == item_b:
                current.append(previous[j - 1])
            else:
                # substitution, insertion, deletion
                current.append(
                    1 + min(previous[j - 1], current[j - 1], previous[j]))
        previous = current
    return previous[-1]


def _prefix_error_rate(x, y):
    """Edit distance between the first k items of `x` and `y`, divided by k.

    k is the length of the shorter sequence. When k is 0 the ratio is
    undefined; 0.0 is returned if both sequences are empty (identical) and
    1.0 if only one of them is (nothing in common).
    """
    k = min(len(x), len(y))
    if k == 0:
        return 0.0 if len(x) == len(y) else 1.0
    return _edit_distance(x[:k], y[:k]) * 1.0 / k


def _reference_error_rate(x, y):
    """Edit distance between `x` and `y` divided by the length of `x`.

    An empty `x` is normalised by 1 instead of 0, so the result is then
    simply the number of items in `y`.
    """
    return _edit_distance(x, y) * 1.0 / max(len(x), 1)


def lcp(x, y):
    """Return the length of the longest common word prefix of `x` and `y`,
    divided by the word count of the longer string.

    Two empty (or whitespace-only) strings are treated as fully matching
    and yield 1.0 instead of dividing by zero.
    """
    x = x.split()
    y = y.split()
    longest = max(len(x), len(y))
    if longest == 0:
        return 1.0

    idx = 0
    for word_x, word_y in zip(x, y):
        if word_x != word_y:
            break
        idx += 1

    return idx * 1.0 / longest


def newWER(x, y):
    """Word error rate restricted to the common length of `x` and `y`.

    Both strings are split on whitespace and cut to the word count of the
    shorter one before the edit distance is taken; the distance is divided
    by that count. See `_prefix_error_rate` for the empty-input convention.
    """
    return _prefix_error_rate(x.split(), y.split())


def WER(x, y):
    """Word error rate of hypothesis `y` against reference `x`.

    Word-level edit distance divided by the number of words in `x`
    (by 1 when `x` has no words).
    """
    return _reference_error_rate(x.split(), y.split())


def newCER(x, y):
    """Character error rate restricted to the common length of `x` and `y`.

    Spaces are removed, both strings are cut to the length of the shorter
    one, and the edit distance is divided by that length. See
    `_prefix_error_rate` for the empty-input convention.
    """
    return _prefix_error_rate(x.replace(" ", ""), y.replace(" ", ""))


def CER(x, y):
    """Character error rate of hypothesis `y` against reference `x`.

    Spaces are removed; the character-level edit distance is divided by the
    length of `x` (by 1 when `x` is empty).
    """
    return _reference_error_rate(x.replace(" ", ""), y.replace(" ", ""))


# --------------------------------------------------------------------------------