├── results.png ├── README.md ├── process_data.py ├── text_cnn.py ├── data_helpers.py └── active_learning.py /results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yezhang-xiaofan/Active-Learning-for-Neural-Networks/HEAD/results.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Active-Learning-for-Neural-Networks 2 | This repository re-implements the "EGL-word" method proposed in the AAAI 2017 paper ["Active Discriminative Text Representation Learning"](https://arxiv.org/pdf/1606.04212.pdf) in TensorFlow (the original implementation was in Theano). It compares "EGL-word" with two baseline methods, "random" and "entropy", on a sentiment analysis dataset. 3 | 4 | First, run `python process_data.py path/to/pre-trained-embedding` to generate the processed data. 5 | 6 | Then run `python active_learning.py --AL_method=active-learning-method`, where `active-learning-method` should be one of `random`, `entropy`, or `EGL`. 7 | 8 | The figure below shows the learning curves averaged over five runs of 10-fold cross-validation. 9 | 10 | ![Learning Curve](https://github.com/yezhang-xiaofan/Active-Learning-for-Neural-Networks/blob/master/results.png) 11 | -------------------------------------------------------------------------------- /process_data.py: -------------------------------------------------------------------------------- 1 | #this code is modified from https://github.com/yoonkim/CNN_sentence 2 | import numpy as np 3 | import cPickle 4 | from collections import defaultdict 5 | import sys, re 6 | def build_data_cv(data_folder, cv=10, clean_string=True): 7 | """ 8 | Loads data and splits it into 10 folds. 9 | """ 10 | revs = [] 11 | pos_file = data_folder[0] 12 | neg_file = data_folder[1] 13 | vocab = defaultdict(float) 14 | with open(pos_file, "rb") as f: 15 | for line in f: 16 | rev = [] 17 | rev.append(line.strip()) 18 | if clean_string: 19 | orig_rev = clean_str(" ".join(rev)) 20 | else: 21 | orig_rev = " ".join(rev).lower() 22 | words = set(orig_rev.split()) 23 | for word in words: 24 | vocab[word] += 1 25 | datum = {"y":1, 26 | "text": orig_rev, 27 | "num_words": len(orig_rev.split()), 28 | "split": np.random.randint(0,cv)} 29 | revs.append(datum) 30 | with open(neg_file, "rb") as f: 31 | for line in f: 32 | rev = [] 33 | rev.append(line.strip()) 34 | if clean_string: 35 | orig_rev = clean_str(" ".join(rev)) 36 | else: 37 | orig_rev = " ".join(rev).lower() 38 | words = set(orig_rev.split()) 39 | for word in words: 40 | vocab[word] += 1 41 | datum = {"y":0, 42 | "text": orig_rev, 43 | "num_words": len(orig_rev.split()), 44 | "split": np.random.randint(0,cv)} 45 | revs.append(datum) 46 | return revs, vocab 47 | 48 | def get_W(word_vecs, k=300): 49 | """ 50 | Get word matrix.
W[i] is the vector for word indexed by i 51 | """ 52 | vocab_size = len(word_vecs) 53 | word_idx_map = dict() 54 | W = np.zeros(shape=(vocab_size+1, k), dtype='float32') 55 | W[0] = np.zeros(k, dtype='float32') 56 | i = 1 57 | for word in word_vecs: 58 | W[i] = word_vecs[word] 59 | word_idx_map[word] = i 60 | i += 1 61 | return W, word_idx_map 62 | 63 | def load_bin_vec(fname, vocab): 64 | """ 65 | Loads 300x1 word vecs from Google (Mikolov) word2vec 66 | """ 67 | word_vecs = {} 68 | 69 | with open(fname, "rb") as f: 70 | header = f.readline() 71 | vocab_size, layer1_size = map(int, header.split()) 72 | binary_len = np.dtype('float32').itemsize * layer1_size 73 | for line in xrange(vocab_size): 74 | word = [] 75 | while True: 76 | ch = f.read(1) 77 | if ch == ' ': 78 | word = ''.join(word) 79 | break 80 | if ch != '\n': 81 | word.append(ch) 82 | if word in vocab: 83 | word_vecs[word] = np.fromstring(f.read(binary_len), dtype='float32') 84 | else: 85 | f.read(binary_len) 86 | return word_vecs 87 | 88 | def add_unknown_words(word_vecs, vocab, min_df=1, k=300): 89 | """ 90 | For words that occur in at least min_df documents, create a separate word vector. 91 | 0.25 is chosen so the unknown vectors have (approximately) same variance as pre-trained ones 92 | """ 93 | for word in vocab: 94 | if word not in word_vecs and vocab[word] >= min_df: 95 | word_vecs[word] = np.random.uniform(-0.25,0.25,k) 96 | 97 | def clean_str(string, TREC=False): 98 | """ 99 | Tokenization/string cleaning for all datasets except for SST. 100 | Every dataset is lower cased except for TREC 101 | """ 102 | string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string) 103 | string = re.sub(r"\'s", " \'s", string) 104 | string = re.sub(r"\'ve", " \'ve", string) 105 | string = re.sub(r"n\'t", " n\'t", string) 106 | string = re.sub(r"\'re", " \'re", string) 107 | string = re.sub(r"\'d", " \'d", string) 108 | string = re.sub(r"\'ll", " \'ll", string) 109 | string = re.sub(r",", " , ", string) 110 | string = re.sub(r"!", " ! ", string) 111 | string = re.sub(r"\(", " \( ", string) 112 | string = re.sub(r"\)", " \) ", string) 113 | string = re.sub(r"\?", " \? ", string) 114 | string = re.sub(r"\s{2,}", " ", string) 115 | return string.strip() if TREC else string.strip().lower() 116 | 117 | def clean_str_sst(string): 118 | """ 119 | Tokenization/string cleaning for the SST dataset 120 | """ 121 | string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string) 122 | string = re.sub(r"\s{2,}", " ", string) 123 | return string.strip().lower() 124 | 125 | if __name__=="__main__": 126 | w2v_file = sys.argv[1] 127 | data_folder = ["rt-polarity.pos","rt-polarity.neg"] 128 | print "loading data...", 129 | revs, vocab = build_data_cv(data_folder, cv=10, clean_string=True) 130 | #max_l = np.max(pd.DataFrame(revs)["num_words"]) 131 | max_l = 56 132 | print "data loaded!" 133 | print "number of sentences: " + str(len(revs)) 134 | print "vocab size: " + str(len(vocab)) 135 | print "max sentence length: " + str(max_l) 136 | print "loading word2vec vectors...", 137 | w2v = load_bin_vec(w2v_file, vocab) 138 | print "word2vec loaded!" 139 | print "num words already in word2vec: " + str(len(w2v)) 140 | add_unknown_words(w2v, vocab) 141 | W, word_idx_map = get_W(w2v) 142 | rand_vecs = {} 143 | add_unknown_words(rand_vecs, vocab) 144 | W2, _ = get_W(rand_vecs) 145 | cPickle.dump([revs, W, W2, word_idx_map, vocab], open("mr.p", "wb")) 146 | print "dataset created!" 
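# Usage note (added for clarity; the example path below is illustrative, not part of the original script):
#   python process_data.py /path/to/GoogleNews-vectors-negative300.bin
# load_bin_vec above expects a binary word2vec file, and rt-polarity.pos / rt-polarity.neg must sit in the
# working directory. The script writes "mr.p", a pickle of [revs, W, W2, word_idx_map, vocab], where W holds
# the pre-trained word vectors (with random vectors for out-of-vocabulary words) and W2 holds purely random vectors.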
-------------------------------------------------------------------------------- /text_cnn.py: -------------------------------------------------------------------------------- 1 | #This code is modified from https://github.com/dennybritz/cnn-text-classification-tf 2 | 3 | import sys 4 | sys.path.append('/scratch/cluster/yezhang/influence-release') 5 | import tensorflow as tf 6 | import numpy as np 7 | from scipy.optimize import fmin_ncg 8 | import os 9 | import time 10 | 11 | class TextCNN(object): 12 | """ 13 | A CNN for text classification. 14 | Uses an embedding layer, followed by a convolutional, max-pooling and softmax layer. 15 | """ 16 | def __init__( 17 | self, sequence_length, num_classes, vocab_size, 18 | embedding_size, filter_sizes, num_filters, train_dir=None, l2_reg_lambda=0.0, batch_size=100, damping=0.0, 19 | mini_batch=True, model_name='CNN',session=None): 20 | 21 | # Placeholders for input, output and dropout 22 | self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x") 23 | self.input_y = tf.placeholder(tf.int32, [None], name="input_y") 24 | self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob") 25 | self.batch_size = batch_size 26 | self.num_classes = num_classes 27 | # Keeping track of l2 regularization loss (optional) 28 | l2_loss = tf.constant(0.0) 29 | 30 | # Embedding layer 31 | with tf.device('/cpu:0'), tf.variable_scope("embedding"): 32 | self.W = tf.Variable( 33 | tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0), 34 | name="W") 35 | self.embedded_chars = tf.nn.embedding_lookup(self.W, self.input_x) 36 | self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1) 37 | 38 | # Create a convolution + maxpool layer for each filter size 39 | pooled_outputs = [] 40 | for i, filter_size in enumerate(filter_sizes): 41 | with tf.variable_scope("conv-maxpool-%s" % filter_size): 42 | # Convolution Layer 43 | filter_shape = [filter_size, embedding_size, 1, num_filters] 44 | W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W") 45 | b = tf.Variable(tf.constant(0.01, shape=[num_filters]), name="b") 46 | conv = tf.nn.conv2d( 47 | self.embedded_chars_expanded, 48 | W, 49 | strides=[1, 1, 1, 1], 50 | padding="VALID", 51 | name="conv") 52 | # Apply nonlinearity 53 | h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu") 54 | # Maxpooling over the outputs 55 | pooled = tf.nn.max_pool( 56 | h, 57 | ksize=[1, sequence_length - filter_size + 1, 1, 1], 58 | strides=[1, 1, 1, 1], 59 | padding='VALID', 60 | name="pool") 61 | pooled_outputs.append(pooled) 62 | 63 | # Combine all the pooled features 64 | num_filters_total = num_filters * len(filter_sizes) 65 | self.h_pool = tf.concat(pooled_outputs, 3) 66 | self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total]) 67 | 68 | # Add dropout 69 | with tf.variable_scope("dropout"): 70 | self.h_drop = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob) 71 | 72 | # Final (unnormalized) scores and predictions 73 | with tf.variable_scope("output"): 74 | W = tf.get_variable( 75 | "W", 76 | shape=[num_filters_total, num_classes], 77 | initializer=tf.contrib.layers.xavier_initializer()) 78 | self.softmax_W = W 79 | b = tf.Variable(tf.constant(0.0, shape=[num_classes]), name="b") 80 | self.softmax_b = b 81 | l2_loss += tf.nn.l2_loss(W) 82 | l2_loss += tf.nn.l2_loss(b) 83 | self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores") 84 | self.probabilies = tf.nn.softmax(self.scores) 85 | self.log_probabilies = tf.nn.log_softmax(self.scores) 86 | 
self.predictions = tf.argmax(self.scores, 1, name="predictions") 87 | 88 | self.params = self.get_all_params() 89 | # CalculateMean cross-entropy loss 90 | with tf.variable_scope("loss"): 91 | labels = tf.one_hot(self.input_y, depth=self.num_classes) 92 | cross_entropy = -tf.reduce_sum(tf.multiply(labels, tf.nn.log_softmax(self.scores)), axis=1) 93 | self.entropy = -tf.reduce_sum(tf.multiply(self.log_probabilies, self.probabilies), axis=1) 94 | self.average_entropy = tf.reduce_mean(self.entropy) 95 | self.indiv_loss_no_reg = cross_entropy 96 | self.loss_no_reg = tf.reduce_mean(cross_entropy, name='xentropy_mean') 97 | self.grad_loss_no_reg_op = tf.gradients(self.loss_no_reg, self.params) #average grad loss 98 | self.loss = self.loss_no_reg + l2_reg_lambda * l2_loss #average loss 99 | self.EGL_norm = tf.reshape(tf.norm(tf.gradients(self.loss_no_reg, self.W)[0],axis=-1),[-1]) #batch_size (should be 1) * |V| 100 | 101 | # Accuracy 102 | with tf.variable_scope("accuracy"): 103 | correct_predictions = tf.equal(tf.cast(self.predictions, "int32"), self.input_y) 104 | self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy") 105 | 106 | config = tf.ConfigProto() 107 | if session is None: 108 | self.sess = tf.Session(config=config) 109 | else: 110 | self.sess = session 111 | init = tf.global_variables_initializer() 112 | self.sess.run(init) 113 | self.saver = tf.train.Saver() 114 | self.damping = damping 115 | self.grad_total_loss_op = tf.gradients(self.loss, self.params) 116 | self.mini_batch = mini_batch 117 | self.train_dir = train_dir 118 | self.model_name = model_name 119 | if self.train_dir is not None: 120 | if not os.path.exists(self.train_dir): 121 | os.makedirs(self.train_dir) 122 | 123 | def get_all_params(self): 124 | trainable_vars = tf.trainable_variables() 125 | trainable_vars = [t for t in trainable_vars if "embedding" not in t.name 126 | and 'conv' not in t.name 127 | ] 128 | print "params used in Hessian: " 129 | for t in trainable_vars: 130 | print (t.name) 131 | print (t.shape) 132 | return trainable_vars 133 | 134 | def re_initialize(self, checkpoint_dir): 135 | checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir) 136 | print ("checkpoint file: ", checkpoint_file) 137 | self.saver.restore(self.sess, checkpoint_file) 138 | -------------------------------------------------------------------------------- /data_helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import re 3 | import itertools 4 | from collections import Counter 5 | 6 | 7 | def clean_str(string): 8 | """ 9 | Tokenization/string cleaning for all datasets except for SST. 10 | Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py 11 | """ 12 | string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string) 13 | string = re.sub(r"\'s", " \'s", string) 14 | string = re.sub(r"\'ve", " \'ve", string) 15 | string = re.sub(r"n\'t", " n\'t", string) 16 | string = re.sub(r"\'re", " \'re", string) 17 | string = re.sub(r"\'d", " \'d", string) 18 | string = re.sub(r"\'ll", " \'ll", string) 19 | string = re.sub(r",", " , ", string) 20 | string = re.sub(r"!", " ! ", string) 21 | string = re.sub(r"\(", " \( ", string) 22 | string = re.sub(r"\)", " \) ", string) 23 | string = re.sub(r"\?", " \? 
", string) 24 | string = re.sub(r"\s{2,}", " ", string) 25 | return string.strip().lower() 26 | 27 | 28 | def load_data_and_labels(positive_data_file, negative_data_file): 29 | """ 30 | Loads MR polarity data from files, splits the data into words and generates labels. 31 | Returns split sentences and labels. 32 | """ 33 | # Load data from files 34 | positive_examples = list(open(positive_data_file, "r").readlines()) 35 | positive_examples = [s.strip() for s in positive_examples] 36 | negative_examples = list(open(negative_data_file, "r").readlines()) 37 | negative_examples = [s.strip() for s in negative_examples] 38 | # Split by words 39 | x_text = positive_examples + negative_examples 40 | original_text = x_text 41 | x_text = [clean_str(sent) for sent in x_text] 42 | # Generate labels 43 | #positive_labels = [[0, 1] for _ in positive_examples] 44 | #negative_labels = [[1, 0] for _ in negative_examples] 45 | positive_labels = [1] * len(positive_examples) 46 | negative_labels = [0] * len(negative_examples) 47 | #y = np.concatenate([positive_labels, negative_labels], 0) 48 | y = positive_labels + negative_labels 49 | return x_text, y, original_text 50 | 51 | 52 | def batch_iter(data, batch_size, num_epochs, shuffle=True): 53 | """ 54 | Generates a batch iterator for a dataset. 55 | """ 56 | data = np.array(data) 57 | data_size = len(data) 58 | num_batches_per_epoch = int((len(data)-1)/batch_size) + 1 59 | for epoch in range(num_epochs): 60 | # Shuffle the data at each epoch 61 | if shuffle: 62 | shuffle_indices = np.random.permutation(np.arange(data_size)) 63 | shuffled_data = data[shuffle_indices] 64 | else: 65 | shuffled_data = data 66 | for batch_num in range(num_batches_per_epoch): 67 | start_index = batch_num * batch_size 68 | end_index = min((batch_num + 1) * batch_size, data_size) 69 | yield shuffled_data[start_index:end_index] 70 | 71 | def split_into_train_test(positive_data_file, negative_data_file): 72 | positive_examples = list(open(positive_data_file, "r").readlines()) 73 | negative_examples = list(open(negative_data_file, "r").readlines()) 74 | num_pos = len(positive_examples) 75 | num_neg = len(negative_examples) 76 | shuffle_indices = np.random.permutation(np.arange(num_pos)) 77 | dev_sample_index = -1 * int(0.1 * float(num_pos)) 78 | pos_train_indices = shuffle_indices[:dev_sample_index] 79 | #pos_test_indices = shuffle_indices[dev_sample_index:] 80 | pos_train = open('data/rt-polaritydata/train_pos.txt','wb') 81 | pos_test = open('data/rt-polaritydata/test_pos.txt','wb') 82 | neg_train = open('data/rt-polaritydata/train_neg.txt','wb') 83 | neg_test = open('data/rt-polaritydata/test_neg.txt','wb') 84 | for i in range(num_pos): 85 | if i in pos_train_indices: 86 | pos_train.write(positive_examples[i]) 87 | else: 88 | pos_test.write(positive_examples[i]) 89 | shuffle_indices = np.random.permutation(np.arange(num_neg)) 90 | dev_sample_index = -1 * int(0.1 * float(num_neg)) 91 | neg_train_indices = shuffle_indices[:dev_sample_index] 92 | for i in range(num_neg): 93 | if i in neg_train_indices: 94 | neg_train.write(negative_examples[i]) 95 | else: 96 | neg_test.write(negative_examples[i]) 97 | 98 | def split_into_folds(positive_data_file, negative_data_file): 99 | positive_examples = list(open(positive_data_file, "r").readlines()) 100 | negative_examples = list(open(negative_data_file, "r").readlines()) 101 | folds_pos = [] 102 | folds_neg = [] 103 | for i in range(10): 104 | folds_pos.append(open("data/folds/fold_pos_"+str(i), 'wb')) 105 | 
folds_neg.append(open("data/folds/fold_neg_"+str(i), 'wb')) 106 | for p in positive_examples: 107 | cv = np.random.randint(0,10) 108 | folds_pos[cv].write(p) 109 | for n in negative_examples: 110 | cv = np.random.randint(0,10) 111 | folds_neg[cv].write(n) 112 | for f in folds_pos: 113 | f.close() 114 | for f in folds_neg: 115 | f.close() 116 | 117 | def load_folds(cv): 118 | train_x, train_y, train_original = [], [], [] 119 | for i in range(10): 120 | if i == cv: 121 | pos_file = "data/folds/fold_pos_"+str(i) 122 | neg_file = "data/folds/fold_neg_"+str(i) 123 | test_x, test_y, test_original = load_data_and_labels(pos_file, neg_file) 124 | else: 125 | pos_file = "data/folds/fold_pos_" + str(i) 126 | neg_file = "data/folds/fold_neg_" + str(i) 127 | temp_train_x, temp_train_y, temp_train_original = load_data_and_labels(pos_file, neg_file) 128 | train_x += temp_train_x 129 | train_y += temp_train_y 130 | train_original += temp_train_original 131 | return train_x, train_y, train_original, test_x, test_y, test_original 132 | 133 | def idx_to_word(word_idx_map): 134 | ''' 135 | :param word_idx_map: map word to index 136 | :return: map index to word (not including zero) 137 | ''' 138 | result = {} 139 | for word in word_idx_map: 140 | result[word_idx_map[word]] = word 141 | return result 142 | 143 | def get_idx_from_sent(sent, word_idx_map, max_l=51, k=300, filter_h=5): 144 | """ 145 | Transforms sentence into a list of indices. Pad with zeroes. 146 | """ 147 | x = [] 148 | pad = filter_h - 1 149 | for i in xrange(pad): 150 | x.append(0) 151 | words = sent.split() 152 | for word in words: 153 | if word in word_idx_map: 154 | x.append(word_idx_map[word]) 155 | while len(x) < max_l+2*pad: 156 | x.append(0) 157 | return x 158 | 159 | def make_idx_data_cv(revs, word_idx_map, cv, max_l=51, k=300, filter_h=5): 160 | """ 161 | Transforms sentences into a 2-d matrix. 162 | """ 163 | train, test = [], [] 164 | for rev in revs: 165 | sent = get_idx_from_sent(rev["text"], word_idx_map, max_l, k, filter_h) 166 | sent.append(rev["y"]) 167 | if rev["split"]==cv: 168 | test.append(sent) 169 | else: 170 | train.append(sent) 171 | #train is a list of sent 172 | #each sent is a list of indices padded with zeroes, followed by the label 173 | train = np.array(train,dtype='int32') 174 | test = np.array(test,dtype='int32') 175 | return [train, test] 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | -------------------------------------------------------------------------------- /active_learning.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | #This code is modified from https://github.com/dennybritz/cnn-text-classification-tf 3 | import tensorflow as tf 4 | import numpy as np 5 | import os 6 | import time 7 | import datetime 8 | import data_helpers 9 | from text_cnn import TextCNN 10 | from tensorflow.contrib import learn 11 | import cPickle 12 | # Parameters 13 | # ================================================== 14 | 15 | # Data loading params 16 | tf.flags.DEFINE_float("dev_sample_percentage", .1, "Percentage of the training data to use for validation") 17 | #tf.flags.DEFINE_string("train_data_file", "train.txt", "Data source for the positive data.") 18 | tf.flags.DEFINE_string("positive_data_file", "./data/rt-polaritydata/train_pos.txt", "Data source for the positive training data.") 19 | tf.flags.DEFINE_string("negative_data_file", "./data/rt-polaritydata/train_neg.txt", "Data source for the negative training data.") 20 | tf.flags.DEFINE_string("test_positive_data_file", "./data/rt-polaritydata/test_pos.txt", "Data source for the positive test data.") 21 | tf.flags.DEFINE_string("test_negative_data_file", "./data/rt-polaritydata/test_neg.txt", "Data source for the negative test data.") 22 | # Model Hyperparameters 23 | tf.flags.DEFINE_integer("embedding_dim", 300, "Dimensionality of word embedding (default: 300)") 24 | tf.flags.DEFINE_string("filter_sizes", "3,4,5", "Comma-separated filter sizes (default: '3,4,5')") 25 | tf.flags.DEFINE_integer("num_filters", 128, "Number of filters per filter size (default: 128)") 26 | tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)") 27 | tf.flags.DEFINE_float("l2_reg_lambda", 0.0, "L2 regularization lambda (default: 0.0)") 28 | 29 | # Training parameters 30 | tf.flags.DEFINE_integer("batch_size", 25, "Batch Size (default: 25)") 31 | tf.flags.DEFINE_integer("num_epochs", 10, "Number of training epochs (default: 10)") 32 | tf.flags.DEFINE_integer("num_checkpoints", 5, "Number of checkpoints to store (default: 5)") 33 | # Misc Parameters 34 | tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow soft device placement") 35 | tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") 36 | tf.flags.DEFINE_string("word2vec_path", "/scratch/cluster/yezhang/glove.840B.300d.txt", "word2vec path") 37 | #active learning method 38 | tf.flags.DEFINE_string("AL_method", "EGL", "active learning method. Should be random/entropy/EGL") 39 | 40 | FLAGS = tf.flags.FLAGS 41 | FLAGS._parse_flags() 42 | print("\nParameters:") 43 | for attr, value in sorted(FLAGS.__flags.items()): 44 | print("{}={}".format(attr.upper(), value)) 45 | print("") 46 | 47 | def idx_to_word(word_idx_map): 48 | ''' 49 | :param word_idx_map: map word to index 50 | :return: map index to word (not including zero) 51 | ''' 52 | result = {} 53 | for word in word_idx_map: 54 | result[word_idx_map[word]] = word 55 | return result 56 | def get_idx_from_sent(sent, word_idx_map, max_l=51, k=300, filter_h=5): 57 | """ 58 | Transforms sentence into a list of indices. Pad with zeroes. 59 | """ 60 | x = [] 61 | pad = filter_h - 1 62 | for i in xrange(pad): 63 | x.append(0) 64 | words = sent.split() 65 | for word in words: 66 | if word in word_idx_map: 67 | x.append(word_idx_map[word]) 68 | while len(x) < max_l+2*pad: 69 | x.append(0) 70 | return x 71 | 72 | def make_idx_data_cv(revs, word_idx_map, cv, max_l=51, k=300, filter_h=5): 73 | """ 74 | Transforms sentences into a 2-d matrix.
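Each row holds the zero-padded word indices of one sentence followed by its label; rows whose "split" field equals cv form the test fold and all remaining rows form the training set.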
75 | """ 76 | train, test = [], [] 77 | for rev in revs: 78 | sent = get_idx_from_sent(rev["text"], word_idx_map, max_l, k, filter_h) 79 | sent.append(rev["y"]) 80 | if rev["split"]==cv: 81 | test.append(sent) 82 | else: 83 | train.append(sent) 84 | #train is a list of sent 85 | #each sent is a list of indices padded with zeroes, followed by the label 86 | train = np.array(train,dtype='int32') 87 | test = np.array(test,dtype='int32') 88 | return [train, test] 89 | 90 | x = cPickle.load(open("mr.p","rb")) 91 | revs, W, W2, word_idx_map, vocab = x[0], x[1], x[2], x[3], x[4] 92 | vocab_size = W.shape[0] 93 | idx_to_word_map = idx_to_word(word_idx_map) 94 | print "data loaded!" 95 | initW = W 96 | average_accuracy_across_folds = [] 97 | for k in range(10): 98 | print "test on fold: " + str(k) 99 | datasets = make_idx_data_cv(revs, word_idx_map, k, max_l=56,k=300, filter_h=5) 100 | img_h = len(datasets[0][0]) - 1 101 | x_train, y_train = datasets[0][:,:-1], datasets[0][:,-1] 102 | x_test, y_test = datasets[1][:,:-1], datasets[1][:,-1] 103 | 104 | 105 | with tf.Graph().as_default(): 106 | cnn = TextCNN( 107 | sequence_length=x_train.shape[1], 108 | num_classes=2, 109 | vocab_size=vocab_size, 110 | embedding_size=FLAGS.embedding_dim, 111 | filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))), 112 | num_filters=FLAGS.num_filters, 113 | l2_reg_lambda=FLAGS.l2_reg_lambda, 114 | ) 115 | # Define Training procedure 116 | global_step = tf.Variable(0, name="global_step", trainable=False) 117 | optimizer = tf.train.AdamOptimizer(1e-3) 118 | grads_and_vars = optimizer.compute_gradients(cnn.loss) 119 | train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) 120 | 121 | # Initialize all variables 122 | cnn.sess.run(tf.global_variables_initializer()) 123 | cnn.sess.run(cnn.W.assign(initW)) 124 | 125 | 126 | def train_step(x_batch, y_batch): 127 | """ 128 | A single training step 129 | """ 130 | feed_dict = { 131 | cnn.input_x: x_batch, 132 | cnn.input_y: y_batch, 133 | cnn.dropout_keep_prob: FLAGS.dropout_keep_prob 134 | } 135 | _, step, loss, accuracy = cnn.sess.run( 136 | [train_op, global_step, cnn.loss, cnn.accuracy], 137 | feed_dict) 138 | time_str = datetime.datetime.now().isoformat() 139 | #print("{}: step {}, train loss {:g}, train acc {:g}".format(time_str, step, loss, accuracy)) 140 | 141 | def dev_step(x_batch, y_batch, writer=None): 142 | """ 143 | Evaluates model on a dev set 144 | """ 145 | feed_dict = { 146 | cnn.input_x: x_batch, 147 | cnn.input_y: y_batch, 148 | cnn.dropout_keep_prob: 1.0 149 | } 150 | step, loss, accuracy = cnn.sess.run( 151 | [global_step, cnn.loss, cnn.accuracy], 152 | feed_dict) 153 | time_str = datetime.datetime.now().isoformat() 154 | #print("{}: step {}, dev loss {:g}, dev acc {:g}".format(time_str, step, loss, accuracy)) 155 | return accuracy 156 | 157 | def entropy_score(train_x, train_index): 158 | feed_dict = { 159 | cnn.input_x: train_x[train_index], 160 | cnn.dropout_keep_prob: 1.0 161 | } 162 | step, test_entropies = cnn.sess.run( 163 | [global_step, cnn.entropy], 164 | feed_dict) 165 | return test_entropies 166 | 167 | def EGL_score(train_x, train_index): 168 | EGL_scores = [] 169 | for j in train_index: 170 | EGL_norm = np.zeros(vocab_size) 171 | for k in range(num_classes): 172 | feed_dict = { 173 | cnn.input_x: np.expand_dims(train_x[j],0), 174 | cnn.dropout_keep_prob: 1.0, 175 | cnn.input_y: np.array([k]) 176 | } 177 | step, EGL, probs = cnn.sess.run( 178 | [global_step, cnn.EGL_norm, cnn.probabilies], 179 | feed_dict) 180 | 
EGL_norm += EGL * probs[0][k] 181 | EGL_max_norm = max(EGL_norm) 182 | EGL_scores.append(EGL_max_norm) 183 | return np.array(EGL_scores) 184 | 185 | 186 | print "number of training points in fold " + str(k) + ":"+ str(len(y_train)) 187 | print "number of test points in fold " + str(k) + ":" + str(len(y_test)) 188 | indices = np.arange(len(y_train)) 189 | num_classes = len(set(y_train)) 190 | best_dev_accuracy = 0.0 191 | index_in_labels_pool = [] 192 | index_in_unlabeled_pool = indices 193 | index_of_new_add_index = np.random.choice(np.arange(len(index_in_unlabeled_pool)), size=FLAGS.batch_size) 194 | index_in_labels_pool += list(index_in_unlabeled_pool[index_of_new_add_index]) 195 | index_in_unlabeled_pool = np.delete(index_in_unlabeled_pool, index_of_new_add_index) 196 | cur_train = x_train[np.array(index_in_labels_pool)] 197 | cur_labels = y_train[np.array(index_in_labels_pool)] 198 | accuracy_list = [] 199 | init = tf.global_variables_initializer() 200 | cnn.sess.run(init) 201 | cnn.sess.run(cnn.W.assign(initW)) 202 | for i in range(20): 203 | print "current number of labels: ", len(cur_labels) 204 | print "current positive labels: ", len(np.where(cur_labels == 1)[0]) 205 | print "current negative labels: ", len(np.where(cur_labels == 0)[0]) 206 | print "current number of unlabeled points: ", len(index_in_unlabeled_pool) 207 | batches = data_helpers.batch_iter( 208 | list(zip(cur_train, cur_labels)), FLAGS.batch_size, FLAGS.num_epochs) 209 | for batch in batches: 210 | x_batch, y_batch = zip(*batch) 211 | train_step(x_batch, y_batch) 212 | cnn.sess.run(cnn.W[0].assign(np.zeros(FLAGS.embedding_dim))) 213 | if FLAGS.AL_method == "entropy": 214 | entropy_scores = entropy_score(x_train, index_in_unlabeled_pool) 215 | index_of_new_add_index = np.argsort(entropy_scores)[-FLAGS.batch_size:] 216 | elif FLAGS.AL_method == "random": 217 | index_of_new_add_index = np.random.choice(np.arange(len(index_in_unlabeled_pool)), size=FLAGS.batch_size) 218 | elif FLAGS.AL_method == "EGL": 219 | EGL_scores = EGL_score(x_train, index_in_unlabeled_pool) 220 | index_of_new_add_index = np.argsort(EGL_scores)[-FLAGS.batch_size:] 221 | index_in_labels_pool += list(index_in_unlabeled_pool[index_of_new_add_index]) 222 | index_in_unlabeled_pool = np.delete(index_in_unlabeled_pool, index_of_new_add_index) 223 | cur_train = x_train[np.array(index_in_labels_pool)] 224 | cur_labels = y_train[np.array(index_in_labels_pool)] 225 | dev_accuracy = dev_step(x_test, y_test) 226 | print dev_accuracy 227 | accuracy_list.append(dev_accuracy) 228 | print accuracy_list 229 | average_accuracy_across_folds.append(accuracy_list) 230 | print list(np.average(np.array(average_accuracy_across_folds), axis=0)) 231 | --------------------------------------------------------------------------------
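As a reading aid, the short sketch below restates the three acquisition strategies used in active_learning.py above in plain NumPy. It is only an illustration of the scoring formulas under assumed array shapes; the function names (entropy_scores, egl_scores, select_batch) and the toy data are invented for this example, while the repository itself computes the entropy and per-row gradient norms with the TensorFlow graph defined in text_cnn.py.

import numpy as np

def entropy_scores(probs):
    # probs: (n, C) predicted class probabilities; higher entropy = more uncertain.
    return -np.sum(probs * np.log(probs + 1e-12), axis=1)

def egl_scores(probs, grad_norms):
    # probs:      (n, C) predicted class probabilities.
    # grad_norms: (n, C, V) L2 norm of the loss gradient w.r.t. each embedding row,
    #             computed once per candidate label (what EGL_norm provides above).
    expected = np.einsum('nc,ncv->nv', probs, grad_norms)  # expectation over labels
    return expected.max(axis=1)                            # max over vocabulary rows ("EGL-word")

def select_batch(scores, batch_size, method="EGL"):
    # Top-scoring items for "entropy"/"EGL", a uniform draw for "random".
    if method == "random":
        return np.random.choice(len(scores), size=batch_size, replace=False)
    return np.argsort(scores)[-batch_size:]

# Toy example: 5 unlabeled sentences, 2 classes, a 4-row vocabulary.
np.random.seed(0)
probs = np.random.dirichlet(np.ones(2), size=5)
grad_norms = np.random.rand(5, 2, 4)
print(select_batch(egl_scores(probs, grad_norms), batch_size=2))
print(select_batch(entropy_scores(probs), batch_size=2, method="entropy"))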