├── results.png ├── README.md ├── process_data.py ├── text_cnn.py ├── data_helpers.py └── active_learning.py /results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yezhang-xiaofan/Active-Learning-for-Neural-Networks/HEAD/results.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Active-Learning-for-Neural-Networks 2 | This repository re-implements the "EGL-word" method proposed in the AAAI 2017 paper ["Active Discriminative Text Representation Learning"](https://arxiv.org/pdf/1606.04212.pdf) in TensorFlow (the original implementation was in Theano). It compares "EGL-word" with two baseline methods, "random" and "entropy", on a sentiment analysis dataset. 3 | 4 | First, run `python process_data.py path/to/pre-trained-embedding` to generate the processed data. 5 | 6 | Then run `python active_learning.py --AL_method=active-learning-method`, where `active-learning-method` should be one of `random`, `entropy`, or `EGL`. 7 | 8 | The figure below shows the learning curves averaged over five runs of 10-fold cross-validation. 9 | 10 | ![Learning Curve](https://github.com/yezhang-xiaofan/Active-Learning-for-Neural-Networks/blob/master/results.png) 11 | -------------------------------------------------------------------------------- /process_data.py: -------------------------------------------------------------------------------- 1 | #this code is modified from https://github.com/yoonkim/CNN_sentence 2 | import numpy as np 3 | import cPickle 4 | from collections import defaultdict 5 | import sys, re 6 | def build_data_cv(data_folder, cv=10, clean_string=True): 7 | """ 8 | Loads data and splits it into 10 folds. 9 | """ 10 | revs = [] 11 | pos_file = data_folder[0] 12 | neg_file = data_folder[1] 13 | vocab = defaultdict(float) 14 | with open(pos_file, "rb") as f: 15 | for line in f: 16 | rev = [] 17 | rev.append(line.strip()) 18 | if clean_string: 19 | orig_rev = clean_str(" ".join(rev)) 20 | else: 21 | orig_rev = " ".join(rev).lower() 22 | words = set(orig_rev.split()) 23 | for word in words: 24 | vocab[word] += 1 25 | datum = {"y":1, 26 | "text": orig_rev, 27 | "num_words": len(orig_rev.split()), 28 | "split": np.random.randint(0,cv)} 29 | revs.append(datum) 30 | with open(neg_file, "rb") as f: 31 | for line in f: 32 | rev = [] 33 | rev.append(line.strip()) 34 | if clean_string: 35 | orig_rev = clean_str(" ".join(rev)) 36 | else: 37 | orig_rev = " ".join(rev).lower() 38 | words = set(orig_rev.split()) 39 | for word in words: 40 | vocab[word] += 1 41 | datum = {"y":0, 42 | "text": orig_rev, 43 | "num_words": len(orig_rev.split()), 44 | "split": np.random.randint(0,cv)} 45 | revs.append(datum) 46 | return revs, vocab 47 | 48 | def get_W(word_vecs, k=300): 49 | """ 50 | Get word matrix.
W[i] is the vector for word indexed by i 51 | """ 52 | vocab_size = len(word_vecs) 53 | word_idx_map = dict() 54 | W = np.zeros(shape=(vocab_size+1, k), dtype='float32') 55 | W[0] = np.zeros(k, dtype='float32') 56 | i = 1 57 | for word in word_vecs: 58 | W[i] = word_vecs[word] 59 | word_idx_map[word] = i 60 | i += 1 61 | return W, word_idx_map 62 | 63 | def load_bin_vec(fname, vocab): 64 | """ 65 | Loads 300x1 word vecs from Google (Mikolov) word2vec 66 | """ 67 | word_vecs = {} 68 | 69 | with open(fname, "rb") as f: 70 | header = f.readline() 71 | vocab_size, layer1_size = map(int, header.split()) 72 | binary_len = np.dtype('float32').itemsize * layer1_size 73 | for line in xrange(vocab_size): 74 | word = [] 75 | while True: 76 | ch = f.read(1) 77 | if ch == ' ': 78 | word = ''.join(word) 79 | break 80 | if ch != '\n': 81 | word.append(ch) 82 | if word in vocab: 83 | word_vecs[word] = np.fromstring(f.read(binary_len), dtype='float32') 84 | else: 85 | f.read(binary_len) 86 | return word_vecs 87 | 88 | def add_unknown_words(word_vecs, vocab, min_df=1, k=300): 89 | """ 90 | For words that occur in at least min_df documents, create a separate word vector. 91 | 0.25 is chosen so the unknown vectors have (approximately) same variance as pre-trained ones 92 | """ 93 | for word in vocab: 94 | if word not in word_vecs and vocab[word] >= min_df: 95 | word_vecs[word] = np.random.uniform(-0.25,0.25,k) 96 | 97 | def clean_str(string, TREC=False): 98 | """ 99 | Tokenization/string cleaning for all datasets except for SST. 100 | Every dataset is lower cased except for TREC 101 | """ 102 | string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string) 103 | string = re.sub(r"\'s", " \'s", string) 104 | string = re.sub(r"\'ve", " \'ve", string) 105 | string = re.sub(r"n\'t", " n\'t", string) 106 | string = re.sub(r"\'re", " \'re", string) 107 | string = re.sub(r"\'d", " \'d", string) 108 | string = re.sub(r"\'ll", " \'ll", string) 109 | string = re.sub(r",", " , ", string) 110 | string = re.sub(r"!", " ! ", string) 111 | string = re.sub(r"\(", " \( ", string) 112 | string = re.sub(r"\)", " \) ", string) 113 | string = re.sub(r"\?", " \? ", string) 114 | string = re.sub(r"\s{2,}", " ", string) 115 | return string.strip() if TREC else string.strip().lower() 116 | 117 | def clean_str_sst(string): 118 | """ 119 | Tokenization/string cleaning for the SST dataset 120 | """ 121 | string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string) 122 | string = re.sub(r"\s{2,}", " ", string) 123 | return string.strip().lower() 124 | 125 | if __name__=="__main__": 126 | w2v_file = sys.argv[1] 127 | data_folder = ["rt-polarity.pos","rt-polarity.neg"] 128 | print "loading data...", 129 | revs, vocab = build_data_cv(data_folder, cv=10, clean_string=True) 130 | #max_l = np.max(pd.DataFrame(revs)["num_words"]) 131 | max_l = 56 132 | print "data loaded!" 133 | print "number of sentences: " + str(len(revs)) 134 | print "vocab size: " + str(len(vocab)) 135 | print "max sentence length: " + str(max_l) 136 | print "loading word2vec vectors...", 137 | w2v = load_bin_vec(w2v_file, vocab) 138 | print "word2vec loaded!" 139 | print "num words already in word2vec: " + str(len(w2v)) 140 | add_unknown_words(w2v, vocab) 141 | W, word_idx_map = get_W(w2v) 142 | rand_vecs = {} 143 | add_unknown_words(rand_vecs, vocab) 144 | W2, _ = get_W(rand_vecs) 145 | cPickle.dump([revs, W, W2, word_idx_map, vocab], open("mr.p", "wb")) 146 | print "dataset created!" 
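# Usage note (added for clarity; the example path below is illustrative, not part of the original script):
#   python process_data.py /path/to/GoogleNews-vectors-negative300.bin
# load_bin_vec above expects a binary word2vec file, and rt-polarity.pos / rt-polarity.neg must sit in the
# working directory. The script writes "mr.p", a pickle of [revs, W, W2, word_idx_map, vocab], where W holds
# the pre-trained word vectors (with random vectors for out-of-vocabulary words) and W2 holds purely random vectors.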
-------------------------------------------------------------------------------- /text_cnn.py: -------------------------------------------------------------------------------- 1 | #This code is modified from https://github.com/dennybritz/cnn-text-classification-tf 2 | 3 | import sys 4 | sys.path.append('/scratch/cluster/yezhang/influence-release') 5 | import tensorflow as tf 6 | import numpy as np 7 | from scipy.optimize import fmin_ncg 8 | import os 9 | import time 10 | 11 | class TextCNN(object): 12 | """ 13 | A CNN for text classification. 14 | Uses an embedding layer, followed by a convolutional, max-pooling and softmax layer. 15 | """ 16 | def __init__( 17 | self, sequence_length, num_classes, vocab_size, 18 | embedding_size, filter_sizes, num_filters, train_dir=None, l2_reg_lambda=0.0, batch_size=100, damping=0.0, 19 | mini_batch=True, model_name='CNN',session=None): 20 | 21 | # Placeholders for input, output and dropout 22 | self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x") 23 | self.input_y = tf.placeholder(tf.int32, [None], name="input_y") 24 | self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob") 25 | self.batch_size = batch_size 26 | self.num_classes = num_classes 27 | # Keeping track of l2 regularization loss (optional) 28 | l2_loss = tf.constant(0.0) 29 | 30 | # Embedding layer 31 | with tf.device('/cpu:0'), tf.variable_scope("embedding"): 32 | self.W = tf.Variable( 33 | tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0), 34 | name="W") 35 | self.embedded_chars = tf.nn.embedding_lookup(self.W, self.input_x) 36 | self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1) 37 | 38 | # Create a convolution + maxpool layer for each filter size 39 | pooled_outputs = [] 40 | for i, filter_size in enumerate(filter_sizes): 41 | with tf.variable_scope("conv-maxpool-%s" % filter_size): 42 | # Convolution Layer 43 | filter_shape = [filter_size, embedding_size, 1, num_filters] 44 | W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W") 45 | b = tf.Variable(tf.constant(0.01, shape=[num_filters]), name="b") 46 | conv = tf.nn.conv2d( 47 | self.embedded_chars_expanded, 48 | W, 49 | strides=[1, 1, 1, 1], 50 | padding="VALID", 51 | name="conv") 52 | # Apply nonlinearity 53 | h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu") 54 | # Maxpooling over the outputs 55 | pooled = tf.nn.max_pool( 56 | h, 57 | ksize=[1, sequence_length - filter_size + 1, 1, 1], 58 | strides=[1, 1, 1, 1], 59 | padding='VALID', 60 | name="pool") 61 | pooled_outputs.append(pooled) 62 | 63 | # Combine all the pooled features 64 | num_filters_total = num_filters * len(filter_sizes) 65 | self.h_pool = tf.concat(pooled_outputs, 3) 66 | self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total]) 67 | 68 | # Add dropout 69 | with tf.variable_scope("dropout"): 70 | self.h_drop = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob) 71 | 72 | # Final (unnormalized) scores and predictions 73 | with tf.variable_scope("output"): 74 | W = tf.get_variable( 75 | "W", 76 | shape=[num_filters_total, num_classes], 77 | initializer=tf.contrib.layers.xavier_initializer()) 78 | self.softmax_W = W 79 | b = tf.Variable(tf.constant(0.0, shape=[num_classes]), name="b") 80 | self.softmax_b = b 81 | l2_loss += tf.nn.l2_loss(W) 82 | l2_loss += tf.nn.l2_loss(b) 83 | self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores") 84 | self.probabilies = tf.nn.softmax(self.scores) 85 | self.log_probabilies = tf.nn.log_softmax(self.scores) 86 | 
self.predictions = tf.argmax(self.scores, 1, name="predictions") 87 | 88 | self.params = self.get_all_params() 89 | # CalculateMean cross-entropy loss 90 | with tf.variable_scope("loss"): 91 | labels = tf.one_hot(self.input_y, depth=self.num_classes) 92 | cross_entropy = -tf.reduce_sum(tf.multiply(labels, tf.nn.log_softmax(self.scores)), axis=1) 93 | self.entropy = -tf.reduce_sum(tf.multiply(self.log_probabilies, self.probabilies), axis=1) 94 | self.average_entropy = tf.reduce_mean(self.entropy) 95 | self.indiv_loss_no_reg = cross_entropy 96 | self.loss_no_reg = tf.reduce_mean(cross_entropy, name='xentropy_mean') 97 | self.grad_loss_no_reg_op = tf.gradients(self.loss_no_reg, self.params) #average grad loss 98 | self.loss = self.loss_no_reg + l2_reg_lambda * l2_loss #average loss 99 | self.EGL_norm = tf.reshape(tf.norm(tf.gradients(self.loss_no_reg, self.W)[0],axis=-1),[-1]) #batch_size (should be 1) * |V| 100 | 101 | # Accuracy 102 | with tf.variable_scope("accuracy"): 103 | correct_predictions = tf.equal(tf.cast(self.predictions, "int32"), self.input_y) 104 | self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy") 105 | 106 | config = tf.ConfigProto() 107 | if session is None: 108 | self.sess = tf.Session(config=config) 109 | else: 110 | self.sess = session 111 | init = tf.global_variables_initializer() 112 | self.sess.run(init) 113 | self.saver = tf.train.Saver() 114 | self.damping = damping 115 | self.grad_total_loss_op = tf.gradients(self.loss, self.params) 116 | self.mini_batch = mini_batch 117 | self.train_dir = train_dir 118 | self.model_name = model_name 119 | if self.train_dir is not None: 120 | if not os.path.exists(self.train_dir): 121 | os.makedirs(self.train_dir) 122 | 123 | def get_all_params(self): 124 | trainable_vars = tf.trainable_variables() 125 | trainable_vars = [t for t in trainable_vars if "embedding" not in t.name 126 | and 'conv' not in t.name 127 | ] 128 | print "params used in Hessian: " 129 | for t in trainable_vars: 130 | print (t.name) 131 | print (t.shape) 132 | return trainable_vars 133 | 134 | def re_initialize(self, checkpoint_dir): 135 | checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir) 136 | print ("checkpoint file: ", checkpoint_file) 137 | self.saver.restore(self.sess, checkpoint_file) 138 | -------------------------------------------------------------------------------- /data_helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import re 3 | import itertools 4 | from collections import Counter 5 | 6 | 7 | def clean_str(string): 8 | """ 9 | Tokenization/string cleaning for all datasets except for SST. 10 | Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py 11 | """ 12 | string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string) 13 | string = re.sub(r"\'s", " \'s", string) 14 | string = re.sub(r"\'ve", " \'ve", string) 15 | string = re.sub(r"n\'t", " n\'t", string) 16 | string = re.sub(r"\'re", " \'re", string) 17 | string = re.sub(r"\'d", " \'d", string) 18 | string = re.sub(r"\'ll", " \'ll", string) 19 | string = re.sub(r",", " , ", string) 20 | string = re.sub(r"!", " ! ", string) 21 | string = re.sub(r"\(", " \( ", string) 22 | string = re.sub(r"\)", " \) ", string) 23 | string = re.sub(r"\?", " \? 
", string) 24 | string = re.sub(r"\s{2,}", " ", string) 25 | return string.strip().lower() 26 | 27 | 28 | def load_data_and_labels(positive_data_file, negative_data_file): 29 | """ 30 | Loads MR polarity data from files, splits the data into words and generates labels. 31 | Returns split sentences and labels. 32 | """ 33 | # Load data from files 34 | positive_examples = list(open(positive_data_file, "r").readlines()) 35 | positive_examples = [s.strip() for s in positive_examples] 36 | negative_examples = list(open(negative_data_file, "r").readlines()) 37 | negative_examples = [s.strip() for s in negative_examples] 38 | # Split by words 39 | x_text = positive_examples + negative_examples 40 | original_text = x_text 41 | x_text = [clean_str(sent) for sent in x_text] 42 | # Generate labels 43 | #positive_labels = [[0, 1] for _ in positive_examples] 44 | #negative_labels = [[1, 0] for _ in negative_examples] 45 | positive_labels = [1] * len(positive_examples) 46 | negative_labels = [0] * len(negative_examples) 47 | #y = np.concatenate([positive_labels, negative_labels], 0) 48 | y = positive_labels + negative_labels 49 | return x_text, y, original_text 50 | 51 | 52 | def batch_iter(data, batch_size, num_epochs, shuffle=True): 53 | """ 54 | Generates a batch iterator for a dataset. 55 | """ 56 | data = np.array(data) 57 | data_size = len(data) 58 | num_batches_per_epoch = int((len(data)-1)/batch_size) + 1 59 | for epoch in range(num_epochs): 60 | # Shuffle the data at each epoch 61 | if shuffle: 62 | shuffle_indices = np.random.permutation(np.arange(data_size)) 63 | shuffled_data = data[shuffle_indices] 64 | else: 65 | shuffled_data = data 66 | for batch_num in range(num_batches_per_epoch): 67 | start_index = batch_num * batch_size 68 | end_index = min((batch_num + 1) * batch_size, data_size) 69 | yield shuffled_data[start_index:end_index] 70 | 71 | def split_into_train_test(positive_data_file, negative_data_file): 72 | positive_examples = list(open(positive_data_file, "r").readlines()) 73 | negative_examples = list(open(negative_data_file, "r").readlines()) 74 | num_pos = len(positive_examples) 75 | num_neg = len(negative_examples) 76 | shuffle_indices = np.random.permutation(np.arange(num_pos)) 77 | dev_sample_index = -1 * int(0.1 * float(num_pos)) 78 | pos_train_indices = shuffle_indices[:dev_sample_index] 79 | #pos_test_indices = shuffle_indices[dev_sample_index:] 80 | pos_train = open('data/rt-polaritydata/train_pos.txt','wb') 81 | pos_test = open('data/rt-polaritydata/test_pos.txt','wb') 82 | neg_train = open('data/rt-polaritydata/train_neg.txt','wb') 83 | neg_test = open('data/rt-polaritydata/test_neg.txt','wb') 84 | for i in range(num_pos): 85 | if i in pos_train_indices: 86 | pos_train.write(positive_examples[i]) 87 | else: 88 | pos_test.write(positive_examples[i]) 89 | shuffle_indices = np.random.permutation(np.arange(num_neg)) 90 | dev_sample_index = -1 * int(0.1 * float(num_neg)) 91 | neg_train_indices = shuffle_indices[:dev_sample_index] 92 | for i in range(num_neg): 93 | if i in neg_train_indices: 94 | neg_train.write(negative_examples[i]) 95 | else: 96 | neg_test.write(negative_examples[i]) 97 | 98 | def split_into_folds(positive_data_file, negative_data_file): 99 | positive_examples = list(open(positive_data_file, "r").readlines()) 100 | negative_examples = list(open(negative_data_file, "r").readlines()) 101 | folds_pos = [] 102 | folds_neg = [] 103 | for i in range(10): 104 | folds_pos.append(open("data/folds/fold_pos_"+str(i), 'wb')) 105 | 
folds_neg.append(open("data/folds/fold_neg_"+str(i), 'wb')) 106 | for p in positive_examples: 107 | cv = np.random.randint(0,10) 108 | folds_pos[cv].write(p) 109 | for n in negative_examples: 110 | cv = np.random.randint(0,10) 111 | folds_neg[cv].write(n) 112 | for f in folds_pos: 113 | f.close() 114 | for f in folds_neg: 115 | f.close() 116 | 117 | def load_folds(cv): 118 | train_x, train_y, train_original = [], [], [] 119 | for i in range(10): 120 | if i == cv: 121 | pos_file = "data/folds/fold_pos_"+str(i) 122 | neg_file = "data/folds/fold_neg_"+str(i) 123 | test_x, test_y, test_original = load_data_and_labels(pos_file, neg_file) 124 | else: 125 | pos_file = "data/folds/fold_pos_" + str(i) 126 | neg_file = "data/folds/fold_neg_" + str(i) 127 | temp_train_x, temp_train_y, temp_train_original = load_data_and_labels(pos_file, neg_file) 128 | train_x += temp_train_x 129 | train_y += temp_train_y 130 | train_original += temp_train_original 131 | return train_x, train_y, train_original, test_x, test_y, test_original 132 | 133 | def idx_to_word(word_idx_map): 134 | ''' 135 | :param word_idx_map: map word to index 136 | :return: map index to word (not including zero) 137 | ''' 138 | result = {} 139 | for word in word_idx_map: 140 | result[word_idx_map[word]] = word 141 | return result 142 | 143 | def get_idx_from_sent(sent, word_idx_map, max_l=51, k=300, filter_h=5): 144 | """ 145 | Transforms sentence into a list of indices. Pad with zeroes. 146 | """ 147 | x = [] 148 | pad = filter_h - 1 149 | for i in xrange(pad): 150 | x.append(0) 151 | words = sent.split() 152 | for word in words: 153 | if word in word_idx_map: 154 | x.append(word_idx_map[word]) 155 | while len(x) < max_l+2*pad: 156 | x.append(0) 157 | return x 158 | 159 | def make_idx_data_cv(revs, word_idx_map, cv, max_l=51, k=300, filter_h=5): 160 | """ 161 | Transforms sentences into a 2-d matrix. 162 | """ 163 | train, test = [], [] 164 | for rev in revs: 165 | sent = get_idx_from_sent(rev["text"], word_idx_map, max_l, k, filter_h) 166 | sent.append(rev["y"]) 167 | if rev["split"]==cv: 168 | test.append(sent) 169 | else: 170 | train.append(sent) 171 | #train is a list of sent 172 | #each sent is a list of indices padded with zeroes, followed by the label 173 | train = np.array(train,dtype='int32') 174 | test = np.array(test,dtype='int32') 175 | return [train, test] 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | -------------------------------------------------------------------------------- /active_learning.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | #This code is modified from https://github.com/dennybritz/cnn-text-classification-tf 3 | import tensorflow as tf 4 | import numpy as np 5 | import os 6 | import time 7 | import datetime 8 | import data_helpers 9 | from text_cnn import TextCNN 10 | from tensorflow.contrib import learn 11 | import cPickle 12 | # Parameters 13 | # ================================================== 14 | 15 | # Data loading params 16 | tf.flags.DEFINE_float("dev_sample_percentage", .1, "Percentage of the training data to use for validation") 17 | #tf.flags.DEFINE_string("train_data_file", "train.txt", "Data source for the positive data.") 18 | tf.flags.DEFINE_string("positive_data_file", "./data/rt-polaritydata/train_pos.txt", "Data source for the positive training data.") 19 | tf.flags.DEFINE_string("negative_data_file", "./data/rt-polaritydata/train_neg.txt", "Data source for the negative training data.") 20 | tf.flags.DEFINE_string("test_positive_data_file", "./data/rt-polaritydata/test_pos.txt", "Data source for the positive test data.") 21 | tf.flags.DEFINE_string("test_negative_data_file", "./data/rt-polaritydata/test_neg.txt", "Data source for the negative test data.") 22 | # Model Hyperparameters 23 | tf.flags.DEFINE_integer("embedding_dim", 300, "Dimensionality of word embedding (default: 300)") 24 | tf.flags.DEFINE_string("filter_sizes", "3,4,5", "Comma-separated filter sizes (default: '3,4,5')") 25 | tf.flags.DEFINE_integer("num_filters", 128, "Number of filters per filter size (default: 128)") 26 | tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)") 27 | tf.flags.DEFINE_float("l2_reg_lambda", 0.0, "L2 regularization lambda (default: 0.0)") 28 | 29 | # Training parameters 30 | tf.flags.DEFINE_integer("batch_size", 25, "Batch Size (default: 25)") 31 | tf.flags.DEFINE_integer("num_epochs", 10, "Number of training epochs (default: 10)") 32 | tf.flags.DEFINE_integer("num_checkpoints", 5, "Number of checkpoints to store (default: 5)") 33 | # Misc Parameters 34 | tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow soft device placement") 35 | tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") 36 | tf.flags.DEFINE_string("word2vec_path", "/scratch/cluster/yezhang/glove.840B.300d.txt", "word2vec path") 37 | #active learning method 38 | tf.flags.DEFINE_string("AL_method", "EGL", "active learning method. Should be random/entropy/EGL") 39 | 40 | FLAGS = tf.flags.FLAGS 41 | FLAGS._parse_flags() 42 | print("\nParameters:") 43 | for attr, value in sorted(FLAGS.__flags.items()): 44 | print("{}={}".format(attr.upper(), value)) 45 | print("") 46 | 47 | def idx_to_word(word_idx_map): 48 | ''' 49 | :param word_idx_map: map word to index 50 | :return: map index to word (not including zero) 51 | ''' 52 | result = {} 53 | for word in word_idx_map: 54 | result[word_idx_map[word]] = word 55 | return result 56 | def get_idx_from_sent(sent, word_idx_map, max_l=51, k=300, filter_h=5): 57 | """ 58 | Transforms sentence into a list of indices. Pad with zeroes. 59 | """ 60 | x = [] 61 | pad = filter_h - 1 62 | for i in xrange(pad): 63 | x.append(0) 64 | words = sent.split() 65 | for word in words: 66 | if word in word_idx_map: 67 | x.append(word_idx_map[word]) 68 | while len(x) < max_l+2*pad: 69 | x.append(0) 70 | return x 71 | 72 | def make_idx_data_cv(revs, word_idx_map, cv, max_l=51, k=300, filter_h=5): 73 | """ 74 | Transforms sentences into a 2-d matrix.
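Each row holds the zero-padded word indices of one sentence followed by its label; rows whose "split" field equals cv form the test fold and all remaining rows form the training set.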
75 | """ 76 | train, test = [], [] 77 | for rev in revs: 78 | sent = get_idx_from_sent(rev["text"], word_idx_map, max_l, k, filter_h) 79 | sent.append(rev["y"]) 80 | if rev["split"]==cv: 81 | test.append(sent) 82 | else: 83 | train.append(sent) 84 | #train is a list of sent 85 | #each sent is a list of indices padded with zeroes, followed by the label 86 | train = np.array(train,dtype='int32') 87 | test = np.array(test,dtype='int32') 88 | return [train, test] 89 | 90 | x = cPickle.load(open("mr.p","rb")) 91 | revs, W, W2, word_idx_map, vocab = x[0], x[1], x[2], x[3], x[4] 92 | vocab_size = W.shape[0] 93 | idx_to_word_map = idx_to_word(word_idx_map) 94 | print "data loaded!" 95 | initW = W 96 | average_accuracy_across_folds = [] 97 | for k in range(10): 98 | print "test on fold: " + str(k) 99 | datasets = make_idx_data_cv(revs, word_idx_map, k, max_l=56,k=300, filter_h=5) 100 | img_h = len(datasets[0][0]) - 1 101 | x_train, y_train = datasets[0][:,:-1], datasets[0][:,-1] 102 | x_test, y_test = datasets[1][:,:-1], datasets[1][:,-1] 103 | 104 | 105 | with tf.Graph().as_default(): 106 | cnn = TextCNN( 107 | sequence_length=x_train.shape[1], 108 | num_classes=2, 109 | vocab_size=vocab_size, 110 | embedding_size=FLAGS.embedding_dim, 111 | filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))), 112 | num_filters=FLAGS.num_filters, 113 | l2_reg_lambda=FLAGS.l2_reg_lambda, 114 | ) 115 | # Define Training procedure 116 | global_step = tf.Variable(0, name="global_step", trainable=False) 117 | optimizer = tf.train.AdamOptimizer(1e-3) 118 | grads_and_vars = optimizer.compute_gradients(cnn.loss) 119 | train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) 120 | 121 | # Initialize all variables 122 | cnn.sess.run(tf.global_variables_initializer()) 123 | cnn.sess.run(cnn.W.assign(initW)) 124 | 125 | 126 | def train_step(x_batch, y_batch): 127 | """ 128 | A single training step 129 | """ 130 | feed_dict = { 131 | cnn.input_x: x_batch, 132 | cnn.input_y: y_batch, 133 | cnn.dropout_keep_prob: FLAGS.dropout_keep_prob 134 | } 135 | _, step, loss, accuracy = cnn.sess.run( 136 | [train_op, global_step, cnn.loss, cnn.accuracy], 137 | feed_dict) 138 | time_str = datetime.datetime.now().isoformat() 139 | #print("{}: step {}, train loss {:g}, train acc {:g}".format(time_str, step, loss, accuracy)) 140 | 141 | def dev_step(x_batch, y_batch, writer=None): 142 | """ 143 | Evaluates model on a dev set 144 | """ 145 | feed_dict = { 146 | cnn.input_x: x_batch, 147 | cnn.input_y: y_batch, 148 | cnn.dropout_keep_prob: 1.0 149 | } 150 | step, loss, accuracy = cnn.sess.run( 151 | [global_step, cnn.loss, cnn.accuracy], 152 | feed_dict) 153 | time_str = datetime.datetime.now().isoformat() 154 | #print("{}: step {}, dev loss {:g}, dev acc {:g}".format(time_str, step, loss, accuracy)) 155 | return accuracy 156 | 157 | def entropy_score(train_x, train_index): 158 | feed_dict = { 159 | cnn.input_x: train_x[train_index], 160 | cnn.dropout_keep_prob: 1.0 161 | } 162 | step, test_entropies = cnn.sess.run( 163 | [global_step, cnn.entropy], 164 | feed_dict) 165 | return test_entropies 166 | 167 | def EGL_score(train_x, train_index): 168 | EGL_scores = [] 169 | for j in train_index: 170 | EGL_norm = np.zeros(vocab_size) 171 | for k in range(num_classes): 172 | feed_dict = { 173 | cnn.input_x: np.expand_dims(train_x[j],0), 174 | cnn.dropout_keep_prob: 1.0, 175 | cnn.input_y: np.array([k]) 176 | } 177 | step, EGL, probs = cnn.sess.run( 178 | [global_step, cnn.EGL_norm, cnn.probabilies], 179 | feed_dict) 180 | 
EGL_norm += EGL * probs[0][k] 181 | EGL_max_norm = max(EGL_norm) 182 | EGL_scores.append(EGL_max_norm) 183 | return np.array(EGL_scores) 184 | 185 | 186 | print "number of training points in fold " + str(k) + ":"+ str(len(y_train)) 187 | print "number of test points in fold " + str(k) + ":" + str(len(y_test)) 188 | indices = np.arange(len(y_train)) 189 | num_classes = len(set(y_train)) 190 | best_dev_accuracy = 0.0 191 | index_in_labels_pool = [] 192 | index_in_unlabeled_pool = indices 193 | index_of_new_add_index = np.random.choice(np.arange(len(index_in_unlabeled_pool)), size=FLAGS.batch_size) 194 | index_in_labels_pool += list(index_in_unlabeled_pool[index_of_new_add_index]) 195 | index_in_unlabeled_pool = np.delete(index_in_unlabeled_pool, index_of_new_add_index) 196 | cur_train = x_train[np.array(index_in_labels_pool)] 197 | cur_labels = y_train[np.array(index_in_labels_pool)] 198 | accuracy_list = [] 199 | init = tf.global_variables_initializer() 200 | cnn.sess.run(init) 201 | cnn.sess.run(cnn.W.assign(initW)) 202 | for i in range(20): 203 | print "current number of labels: ", len(cur_labels) 204 | print "current positive labels: ", len(np.where(cur_labels == 1)[0]) 205 | print "current negative labels: ", len(np.where(cur_labels == 0)[0]) 206 | print "current number of unlabeled points: ", len(index_in_unlabeled_pool) 207 | batches = data_helpers.batch_iter( 208 | list(zip(cur_train, cur_labels)), FLAGS.batch_size, FLAGS.num_epochs) 209 | for batch in batches: 210 | x_batch, y_batch = zip(*batch) 211 | train_step(x_batch, y_batch) 212 | cnn.sess.run(cnn.W[0].assign(np.zeros(FLAGS.embedding_dim))) 213 | if FLAGS.AL_method == "entropy": 214 | entropy_scores = entropy_score(x_train, index_in_unlabeled_pool) 215 | index_of_new_add_index = np.argsort(entropy_scores)[-FLAGS.batch_size:] 216 | elif FLAGS.AL_method == "random": 217 | index_of_new_add_index = np.random.choice(np.arange(len(index_in_unlabeled_pool)), size=FLAGS.batch_size) 218 | elif FLAGS.AL_method == "EGL": 219 | EGL_scores = EGL_score(x_train, index_in_unlabeled_pool) 220 | index_of_new_add_index = np.argsort(EGL_scores)[-FLAGS.batch_size:] 221 | index_in_labels_pool += list(index_in_unlabeled_pool[index_of_new_add_index]) 222 | index_in_unlabeled_pool = np.delete(index_in_unlabeled_pool, index_of_new_add_index) 223 | cur_train = x_train[np.array(index_in_labels_pool)] 224 | cur_labels = y_train[np.array(index_in_labels_pool)] 225 | dev_accuracy = dev_step(x_test, y_test) 226 | print dev_accuracy 227 | accuracy_list.append(dev_accuracy) 228 | print accuracy_list 229 | average_accuracy_across_folds.append(accuracy_list) 230 | print list(np.average(np.array(average_accuracy_across_folds), axis=0)) 231 | --------------------------------------------------------------------------------
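As a reading aid, the short sketch below restates the three acquisition strategies used in active_learning.py above in plain NumPy. It is only an illustration of the scoring formulas under assumed array shapes; the function names (entropy_scores, egl_scores, select_batch) and the toy data are invented for this example, while the repository itself computes the entropy and per-row gradient norms with the TensorFlow graph defined in text_cnn.py.

import numpy as np

def entropy_scores(probs):
    # probs: (n, C) predicted class probabilities; higher entropy = more uncertain.
    return -np.sum(probs * np.log(probs + 1e-12), axis=1)

def egl_scores(probs, grad_norms):
    # probs:      (n, C) predicted class probabilities.
    # grad_norms: (n, C, V) L2 norm of the loss gradient w.r.t. each embedding row,
    #             computed once per candidate label (what EGL_norm provides above).
    expected = np.einsum('nc,ncv->nv', probs, grad_norms)  # expectation over labels
    return expected.max(axis=1)                            # max over vocabulary rows ("EGL-word")

def select_batch(scores, batch_size, method="EGL"):
    # Top-scoring items for "entropy"/"EGL", a uniform draw for "random".
    if method == "random":
        return np.random.choice(len(scores), size=batch_size, replace=False)
    return np.argsort(scores)[-batch_size:]

# Toy example: 5 unlabeled sentences, 2 classes, a 4-row vocabulary.
np.random.seed(0)
probs = np.random.dirichlet(np.ones(2), size=5)
grad_norms = np.random.rand(5, 2, 4)
print(select_batch(egl_scores(probs, grad_norms), batch_size=2))
print(select_batch(entropy_scores(probs), batch_size=2, method="entropy"))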