├── .gitignore
├── data
│   └── mr
│       ├── mr.neg
│       └── mr.pos
├── README.md
├── eval.py
├── model.py
├── train.py
└── text_input.py

/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | *vocab
3 | *.cPickle
4 | train/*
5 | *.sh
6 | *.out
7 | *.bin
8 | *.npy

--------------------------------------------------------------------------------
/data/mr/mr.neg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuhaozhang/sentence-convnet/HEAD/data/mr/mr.neg

--------------------------------------------------------------------------------
/data/mr/mr.pos:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuhaozhang/sentence-convnet/HEAD/data/mr/mr.pos

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | A TensorFlow implementation of a convolutional neural network for sentence classification
2 | =========
3 | This implementation uses TensorFlow's `tf.nn.conv2d` to perform 1D convolution over word sequences. It also supports initializing the word embeddings with the Google News word2vec pretrained vectors, which boosts accuracy on the movie review dataset from ~76% to ~81%.
4 | 
5 | The author's original Theano implementation of this model is [here](https://github.com/yoonkim/CNN_sentence). Another TensorFlow implementation, which does not support loading pretrained vectors, is [here](https://github.com/dennybritz/cnn-text-classification-tf).
6 | 
7 | ## Dependencies
8 | 
9 | - Python 2.7
10 | - NumPy
11 | - TensorFlow 1.0+
12 | 
13 | ## Data
14 | 
15 | The data in `data/mr/` are the movie review polarity data provided [here](http://www.cs.cornell.edu/people/pabo/movie-review-data/). The `data/word2vec` directory is initially empty. To use the pretrained word2vec embeddings, download the Google News pretrained vectors from [this Google Drive link](https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit) and unzip the archive into that directory; the result is a single `.bin` file (`GoogleNews-vectors-negative300.bin`, the path expected by `text_input.py`).
16 | 
17 | ## Usage
18 | 
19 | #### Preprocess the data
20 | 
21 |     python text_input.py
22 | 
23 | #### Train
24 | 
25 |     python train.py
26 | 
27 | By default, the pretrained vectors are loaded and used to initialize the embeddings. To disable this, use
28 | 
29 |     python train.py --use_pretrain False
30 | 
31 | #### Evaluate
32 | 
33 |     python eval.py
34 | 
35 | By default, evaluation is run on the test set. To evaluate on the training set, run
36 | 
37 |     python eval.py --train_data
38 | 
39 | ## References
40 | 
41 | 1. Kim, Yoon. "Convolutional neural networks for sentence classification." arXiv preprint arXiv:1408.5882 (2014). [link](http://arxiv.org/abs/1408.5882)
42 | 
43 | ## License
44 | 
45 | MIT
46 | 

--------------------------------------------------------------------------------
/eval.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 | import time
3 | import os
4 | import tensorflow as tf
5 | import numpy as np
6 | 
7 | import model
8 | import text_input
9 | 
10 | FLAGS = tf.app.flags.FLAGS
11 | 
12 | tf.app.flags.DEFINE_string('data_dir', './data/mr/', 'Directory of the data')
13 | tf.app.flags.DEFINE_string('train_dir', './train/', 'Where to read model')
14 | tf.app.flags.DEFINE_boolean('train_data', False, 'To evaluate on training data')
15 | 
16 | def evaluate():
17 |     """ Build evaluation graph and run. """
18 |     with tf.Graph().as_default():
19 |         with tf.variable_scope('cnn'):
20 |             m = model.Model(FLAGS, is_train=False)
21 |         saver = tf.train.Saver(tf.global_variables())
22 | 
23 |         # read test files
24 |         if FLAGS.train_data:
25 |             loader = text_input.DataLoader(os.path.join(FLAGS.data_dir, 'train.cPickle'), batch_size=FLAGS.batch_size)
26 |         else:
27 |             loader = text_input.DataLoader(os.path.join(FLAGS.data_dir, 'test.cPickle'), batch_size=FLAGS.batch_size)
28 |         print 'Start evaluation, %d batches needed, with %d examples per batch.' % (loader.num_batch, FLAGS.batch_size)
29 | 
30 |         true_count = 0
31 |         avg_loss = 0
32 | 
33 |         with tf.Session() as sess:
34 |             ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
35 |             if ckpt and ckpt.model_checkpoint_path:
36 |                 saver.restore(sess, ckpt.model_checkpoint_path)
37 |             else:
38 |                 raise IOError("Loading checkpoint file failed!")
39 | 
40 |             for _ in range(loader.num_batch):
41 |                 x, y = loader.next_batch()
42 |                 true_count_value, loss_value = sess.run([m.true_count_op, m.total_loss],
43 |                     feed_dict={m.inputs:x, m.labels:y})
44 |                 true_count += true_count_value
45 |                 avg_loss += loss_value
46 | 
47 |             accuracy = float(true_count) / (loader.num_batch * FLAGS.batch_size)
48 |             avg_loss = float(avg_loss) / loader.num_batch
49 |             print '%s: test_loss = %.6f, test_accuracy = %.3f' % (datetime.now(), avg_loss, accuracy)
50 | 
51 | def main(argv=None):
52 |     evaluate()
53 | 
54 | if __name__ == '__main__':
55 |     tf.app.run()
56 | 
57 | 

--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | 
3 | tf.app.flags.DEFINE_integer('batch_size', 50, 'Training batch size')
4 | tf.app.flags.DEFINE_integer('emb_size', 300, 'Size of word embeddings')
5 | tf.app.flags.DEFINE_integer('num_kernel', 100, 'Number of filters for each window size')
6 | tf.app.flags.DEFINE_integer('min_window', 3, 'Minimum size of filter window')
7 | tf.app.flags.DEFINE_integer('max_window', 5, 'Maximum size of filter window')
8 | tf.app.flags.DEFINE_integer('vocab_size', 15000, 'Vocabulary size')
9 | tf.app.flags.DEFINE_integer('num_class', 2, 'Number of classes to consider')
10 | tf.app.flags.DEFINE_integer('sent_len', 56, 'Input sentence length. This is after the padding is performed.')
11 | tf.app.flags.DEFINE_float('l2_reg', 0, 'l2 regularization weight')
12 | 
13 | def _variable_on_cpu(name, shape, initializer):
14 |     with tf.device('/cpu:0'):
15 |         var = tf.get_variable(name, shape, initializer=initializer)
16 |     return var
17 | 
18 | def _variable_with_weight_decay(name, shape, initializer, wd):
19 |     var = _variable_on_cpu(name, shape, initializer)
20 |     if wd is not None and wd != 0.:
21 |         weight_decay = tf.multiply(tf.nn.l2_loss(var), wd, name='weight_loss')
22 |     else:
23 |         weight_decay = tf.constant(0.0, dtype=tf.float32)
24 |     return var, weight_decay
25 | 
26 | class Model(object):
27 | 
28 |     def __init__(self, config, is_train=True):
29 |         self.is_train = is_train
30 |         self.emb_size = config.emb_size
31 |         self.batch_size = config.batch_size
32 |         self.num_kernel = config.num_kernel
33 |         self.min_window = config.min_window
34 |         self.max_window = config.max_window
35 |         self.vocab_size = config.vocab_size
36 |         self.num_class = config.num_class
37 |         self.sent_len = config.sent_len
38 |         self.l2_reg = config.l2_reg
39 |         if is_train:
40 |             self.optimizer = config.optimizer
41 |             self.dropout = config.dropout
42 |         self.build_graph()
43 | 
44 |     def build_graph(self):
45 |         """ Build the computation graph. """
46 |         self._inputs = tf.placeholder(dtype=tf.int64, shape=[self.batch_size, self.sent_len], name='input_x')
47 |         self._labels = tf.placeholder(dtype=tf.int64, shape=[self.batch_size], name='input_y')
48 |         losses = []
49 | 
50 |         # lookup layer
51 |         with tf.variable_scope('lookup') as scope:
52 |             self._W_emb = _variable_on_cpu(name='embedding', shape=[self.vocab_size, self.emb_size],
53 |                 initializer=tf.random_uniform_initializer(minval=-1.0, maxval=1.0))
54 |             # sent_batch is of shape: (batch_size, sent_len, emb_size, 1), in order to use conv2d
55 |             sent_batch = tf.nn.embedding_lookup(params=self._W_emb, ids=self._inputs)
56 |             sent_batch = tf.expand_dims(sent_batch, -1)
57 | 
58 |         # conv + pooling layer
59 |         with tf.variable_scope('conv') as scope:
60 |             pool_tensors = []
61 |             for k_size in range(self.min_window, self.max_window+1):
62 |                 kernel, wd = _variable_with_weight_decay(name='kernel_'+str(k_size),
63 |                     shape=[k_size, self.emb_size, 1, self.num_kernel], initializer=tf.truncated_normal_initializer(stddev=0.01), wd=self.l2_reg)
64 |                 losses.append(wd)
65 |                 conv = tf.nn.conv2d(input=sent_batch, filter=kernel, strides=[1,1,1,1], padding='VALID')
66 |                 biases = _variable_on_cpu('biases_'+str(k_size), [self.num_kernel], tf.constant_initializer(0.0))
67 |                 bias = tf.nn.bias_add(conv, biases)
68 |                 relu = tf.nn.relu(bias, name=scope.name)
69 |                 # shape of relu: [batch_size, conv_len, 1, num_kernel]
70 |                 conv_len = relu.get_shape()[1]
71 |                 pool = tf.nn.max_pool(relu, ksize=[1,conv_len,1,1], strides=[1,1,1,1], padding='VALID')
72 |                 # shape of pool: [batch_size, 1, 1, num_kernel]
73 |                 pool = tf.squeeze(pool, squeeze_dims=[1,2]) # size: [batch_size, num_kernel]
74 |                 pool_tensors.append(pool)
75 |             pool_layer = tf.concat(values=pool_tensors, axis=1, name='pool')
76 | 
77 |         # drop out layer
78 |         if self.is_train and self.dropout > 0:
79 |             pool_dropout = tf.nn.dropout(pool_layer, 1 - self.dropout)
80 |         else:
81 |             pool_dropout = pool_layer
82 | 
83 |         # fully-connected layer
84 |         pool_size = (self.max_window - self.min_window + 1) * self.num_kernel
85 |         with tf.variable_scope('fc') as scope:
86 |             W, wd = _variable_with_weight_decay('W', shape=[pool_size, self.num_class],
87 |                 initializer=tf.truncated_normal_initializer(stddev=0.05), wd=self.l2_reg)
88 | 
losses.append(wd) 89 | biases = _variable_on_cpu('biases', [self.num_class], tf.constant_initializer(0.01)) 90 | logits = tf.nn.bias_add(tf.matmul(pool_dropout, W), biases) 91 | 92 | # loss 93 | cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self._labels, logits=logits, name='cross_entropy_per_example') 94 | cross_entropy_loss = tf.reduce_mean(cross_entropy, name='cross_entropy_loss') 95 | losses.append(cross_entropy_loss) 96 | self._total_loss = tf.add_n(losses, name='total_loss') 97 | # self._total_loss = cross_entropy_loss 98 | 99 | # correct prediction count 100 | correct_prediction = tf.to_int32(tf.nn.in_top_k(logits, self._labels, 1)) 101 | self._true_count_op = tf.reduce_sum(correct_prediction) 102 | 103 | # train on a batch 104 | self._lr = tf.Variable(0.0, trainable=False) 105 | if self.is_train: 106 | if self.optimizer == 'adadelta': 107 | opt = tf.train.AdadeltaOptimizer(self._lr) 108 | elif self.optimizer == 'adagrad': 109 | opt = tf.train.AdagradOptimizer(self._lr) 110 | elif self.optimizer == 'adam': 111 | opt = tf.train.AdamOptimizer(self._lr) 112 | elif self.optimizer == 'sgd': 113 | opt = tf.train.GradientDescentOptimizer(self._lr) 114 | else: 115 | raise ValueError("Optimizer not supported.") 116 | grads = opt.compute_gradients(self._total_loss) 117 | self._train_op = opt.apply_gradients(grads) 118 | 119 | for var in tf.trainable_variables(): 120 | tf.summary.histogram(var.op.name, var) 121 | else: 122 | self._train_op = tf.no_op() 123 | 124 | return 125 | 126 | @property 127 | def inputs(self): 128 | return self._inputs 129 | 130 | @property 131 | def labels(self): 132 | return self._labels 133 | 134 | @property 135 | def lr(self): 136 | return self._lr 137 | 138 | @property 139 | def train_op(self): 140 | return self._train_op 141 | 142 | @property 143 | def total_loss(self): 144 | return self._total_loss 145 | 146 | @property 147 | def true_count_op(self): 148 | return self._true_count_op 149 | 150 | @property 151 | def W_emb(self): 152 | return self._W_emb 153 | 154 | def assign_lr(self, session, lr_value): 155 | session.run(tf.assign(self.lr, lr_value)) 156 | 157 | def assign_embedding(self, session, pretrained): 158 | session.run(tf.assign(self.W_emb, pretrained)) 159 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import time 3 | import os 4 | import sys 5 | import tensorflow as tf 6 | import numpy as np 7 | 8 | import model 9 | import text_input 10 | 11 | FLAGS = tf.app.flags.FLAGS 12 | 13 | tf.app.flags.DEFINE_string('data_dir', './data/mr/', 'Directory of the data') 14 | tf.app.flags.DEFINE_string('train_dir', './train/', 'Directory to save training checkpoint files') 15 | tf.app.flags.DEFINE_integer('num_epoch', 50, 'Number of epochs to run') 16 | tf.app.flags.DEFINE_boolean('use_pretrain', True, 'Use word2vec pretrained embeddings or not') 17 | tf.app.flags.DEFINE_boolean('log_device_placement', False, 'Whether log device information in summary') 18 | 19 | tf.app.flags.DEFINE_string('optimizer', 'adagrad', 'Optimizer to use. 
Must be one of "sgd", adagrad", "adadelta" and "adam"') 20 | tf.app.flags.DEFINE_float('init_lr', 0.01, 'Initial learning rate') 21 | tf.app.flags.DEFINE_float('lr_decay', 0.95, 'LR decay rate') 22 | tf.app.flags.DEFINE_integer('tolerance_step', 500, 'Decay the lr after loss remains unchanged for this number of steps') 23 | tf.app.flags.DEFINE_float('dropout', 0.5, 'Dropout rate. 0 is no dropout.') 24 | 25 | tf.app.flags.DEFINE_integer('log_step', 10, 'Write log to stdout after this step') 26 | tf.app.flags.DEFINE_integer('summary_step', 200, 'Write summary after this step') 27 | tf.app.flags.DEFINE_integer('save_epoch', 5, 'Save model after this epoch') 28 | 29 | def train(): 30 | # load data 31 | train_loader = text_input.DataLoader(os.path.join(FLAGS.data_dir, 'train.cPickle'), batch_size=FLAGS.batch_size) 32 | test_loader = text_input.DataLoader(os.path.join(FLAGS.data_dir, 'test.cPickle'), batch_size=FLAGS.batch_size) 33 | max_steps = train_loader.num_batch * FLAGS.num_epoch # this is just an estimated number 34 | 35 | with tf.Graph().as_default(): 36 | with tf.variable_scope('cnn', reuse=None): 37 | m = model.Model(FLAGS, is_train=True) 38 | with tf.variable_scope('cnn', reuse=True): 39 | mtest = model.Model(FLAGS, is_train=False) 40 | 41 | saver = tf.train.Saver(tf.global_variables()) 42 | save_path = os.path.join(FLAGS.train_dir, 'model.ckpt') 43 | summary_op = tf.summary.merge_all() 44 | 45 | sess = tf.Session(config=tf.ConfigProto(log_device_placement=FLAGS.log_device_placement)) 46 | summary_writer = tf.summary.FileWriter(FLAGS.train_dir, graph=sess.graph) 47 | sess.run(tf.global_variables_initializer()) 48 | 49 | if FLAGS.use_pretrain: 50 | print "Use pretrained embeddings to initialize model ..." 51 | pretrained_embedding = np.load(os.path.join(FLAGS.data_dir, 'emb.npy')) 52 | m.assign_embedding(sess, pretrained_embedding) 53 | 54 | current_lr = FLAGS.init_lr 55 | lowest_loss_value = float("inf") 56 | step_loss_ascend = 0 57 | global_step = 0 58 | 59 | def eval_once(mtest, sess, data_loader): 60 | test_loss = 0.0 61 | test_accuracy = 0 62 | for _ in xrange(data_loader.num_batch): 63 | x_batch, y_batch = data_loader.next_batch() 64 | x_batch = np.array(x_batch) 65 | loss_value, true_count_value = sess.run([mtest.total_loss, mtest.true_count_op], 66 | feed_dict={mtest.inputs: x_batch, mtest.labels: y_batch}) 67 | test_loss += loss_value 68 | test_accuracy += true_count_value 69 | test_loss /= data_loader.num_batch 70 | test_accuracy /= (1.0 * data_loader.num_batch * FLAGS.batch_size) 71 | data_loader.reset_pointer() 72 | return (test_loss, test_accuracy) 73 | 74 | # Note that this is a soft version of epoch. 75 | for epoch in xrange(FLAGS.num_epoch): 76 | train_loss = 0.0 77 | true_count_total = 0 78 | train_loader.reset_pointer() 79 | for _ in xrange(train_loader.num_batch): 80 | m.assign_lr(sess, current_lr) 81 | global_step += 1 82 | start_time = time.time() 83 | x_batch, y_batch = train_loader.next_batch() 84 | feed = {m.inputs: x_batch, m.labels: y_batch} 85 | _, loss_value, true_count_value = sess.run([m.train_op, m.total_loss, m.true_count_op], feed_dict=feed) 86 | duration = time.time() - start_time 87 | train_loss += loss_value 88 | true_count_total += true_count_value 89 | 90 | assert not np.isnan(loss_value), "Model loss is NaN." 
91 | 92 | if global_step % FLAGS.log_step == 0: 93 | examples_per_sec = FLAGS.batch_size / duration 94 | 95 | format_str = ('%s: step %d/%d (epoch %d/%d), loss = %.6f (%.1f examples/sec; %.3f sec/batch), lr: %.6f') 96 | print (format_str % (datetime.now(), global_step, max_steps, epoch+1, FLAGS.num_epoch, loss_value, 97 | examples_per_sec, duration, current_lr)) 98 | 99 | if global_step % FLAGS.summary_step == 0: 100 | summary_str = sess.run(summary_op) 101 | summary_writer.add_summary(summary_str, global_step) 102 | 103 | # decay learning rate if necessary 104 | if loss_value < lowest_loss_value: 105 | lowest_loss_value = loss_value 106 | step_loss_ascend = 0 107 | else: 108 | step_loss_ascend += 1 109 | if step_loss_ascend >= FLAGS.tolerance_step: 110 | current_lr *= FLAGS.lr_decay 111 | print '%s: step %d/%d (epoch %d/%d), LR decays to %.5f' % ((datetime.now(), global_step, max_steps, 112 | epoch+1, FLAGS.num_epoch, current_lr)) 113 | step_loss_ascend = 0 114 | 115 | # stop learning if learning rate is too low 116 | if current_lr < 1e-5: break 117 | 118 | # summary loss/accuracy after each epoch 119 | train_loss /= train_loader.num_batch 120 | train_accuracy = true_count_total * 1.0 / (train_loader.num_batch * FLAGS.batch_size) 121 | summary_writer.add_summary(_summary_for_scalar('eval/training_loss', train_loss), global_step=epoch) 122 | summary_writer.add_summary(_summary_for_scalar('eval/training_accuracy', train_accuracy), global_step=epoch) 123 | 124 | test_loss, test_accuracy = eval_once(mtest, sess, test_loader) 125 | summary_writer.add_summary(_summary_for_scalar('eval/test_loss', test_loss), global_step=epoch) 126 | summary_writer.add_summary(_summary_for_scalar('eval/test_accuracy', test_accuracy), global_step=epoch) 127 | 128 | print("Epoch %d: training_loss = %.6f, training_accuracy = %.3f" % (epoch+1, train_loss, train_accuracy)) 129 | print("Epoch %d: test_loss = %.6f, test_accuracy = %.3f" % (epoch+1, test_loss, test_accuracy)) 130 | 131 | # save after fixed epoch 132 | if epoch % FLAGS.save_epoch == 0: 133 | saver.save(sess, save_path, global_step=epoch) 134 | saver.save(sess, save_path, global_step=epoch) 135 | 136 | def _summary_for_scalar(name, value): 137 | return tf.Summary(value=[tf.Summary.Value(tag=name, simple_value=value)]) 138 | 139 | def main(argv=None): 140 | if tf.gfile.Exists(FLAGS.train_dir): 141 | tf.gfile.DeleteRecursively(FLAGS.train_dir) 142 | tf.gfile.MakeDirs(FLAGS.train_dir) 143 | train() 144 | 145 | if __name__ == '__main__': 146 | tf.app.run() 147 | -------------------------------------------------------------------------------- /text_input.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import re 4 | import random 5 | from collections import Counter 6 | import cPickle as pickle 7 | import numpy as np 8 | 9 | UNK_TOKEN = '' 10 | PAD_TOKEN = '' 11 | RANDOM_SEED = 1234 12 | 13 | # TODO: I need to clean up this preprocessing script a bit 14 | class TextReader(object): 15 | 16 | def __init__(self, data_dir, num_classes=2, suffix_list=None): 17 | self.data_dir = data_dir 18 | self.num_classes = num_classes 19 | if suffix_list: 20 | self.suffix_list = suffix_list 21 | else: 22 | self.suffix_list = [str(x) for x in range(num_classes)] 23 | self.data_files = None 24 | 25 | def get_filenames(self): 26 | if not os.path.exists(self.data_dir): 27 | sys.exit('Data directory does not exist.') 28 | data_files = [] 29 | for f in os.listdir(self.data_dir): 30 | f = os.path.join(self.data_dir, 
f) 31 | if os.path.isfile(f): 32 | chunks = f.split('.') 33 | if chunks[-1] in self.suffix_list: 34 | data_files.append(f) 35 | assert data_files 36 | self.data_files = data_files 37 | return data_files 38 | 39 | def clean_str(self, string): 40 | """ 41 | Tokenization/string cleaning. 42 | """ 43 | string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string) 44 | string = re.sub(r"\'s", " \'s", string) 45 | string = re.sub(r"\'ve", " \'ve", string) 46 | string = re.sub(r"n\'t", " n\'t", string) 47 | string = re.sub(r"\'re", " \'re", string) 48 | string = re.sub(r"\'d", " \'d", string) 49 | string = re.sub(r"\'ll", " \'ll", string) 50 | string = re.sub(r",", " , ", string) 51 | string = re.sub(r"!", " ! ", string) 52 | string = re.sub(r"\(", " \( ", string) 53 | string = re.sub(r"\)", " \) ", string) 54 | string = re.sub(r"\?", " \? ", string) 55 | string = re.sub(r"\s{2,}", " ", string) 56 | return string.strip().lower() 57 | 58 | def prepare_dict(self, vocab_size=10000): 59 | max_sent_len = 0 60 | c = Counter() 61 | data_files = self.get_filenames() 62 | # store the preprocessed raw text to avoid cleaning it again 63 | self.raw_text = [] 64 | for f in data_files: 65 | strings = [] 66 | with open(f, 'r') as infile: 67 | for line in infile: 68 | clean_string = self.clean_str(line) 69 | strings.append(clean_string) 70 | toks = clean_string.split() 71 | if len(toks) > max_sent_len: 72 | max_sent_len = len(toks) 73 | for t in toks: 74 | c[t] += 1 75 | self.raw_text.append(strings) 76 | total_words = len(c) 77 | assert total_words >= vocab_size 78 | word_list = [p[0] for p in c.most_common(vocab_size - 2)] 79 | word_list.insert(0, PAD_TOKEN) 80 | word_list.insert(0, UNK_TOKEN) 81 | self.word2freq = c 82 | self.word2id = dict() 83 | vocab_file = os.path.join(self.data_dir, 'vocab') 84 | with open(vocab_file, 'w') as outfile: 85 | for idx, w in enumerate(word_list): 86 | self.word2id[w] = idx 87 | outfile.write(w + '\t' + str(idx) + '\n') 88 | print '%d words found in training set. Truncate to vocabulary size %d.' % (total_words, vocab_size) 89 | print 'Dictionary saved to file %s. Max sentence length in data is %d.' 
% (vocab_file, max_sent_len) 90 | return max_sent_len 91 | 92 | def generate_id_data(self, max_sent_len=100): 93 | self.max_sent_len = max_sent_len 94 | sentence_and_label_pairs = [] 95 | for label, strings in enumerate(self.raw_text): 96 | for s in strings: 97 | toks = s.split() 98 | toks_len = len(toks) 99 | if toks_len <= max_sent_len: 100 | pad_left = (max_sent_len - toks_len) / 2 101 | pad_right = int(np.ceil((max_sent_len - toks_len) / 2.0)) 102 | else: 103 | continue 104 | toks_ids = [1 for i in range(pad_left)] + [self.word2id[t] if t in self.word2id else 0 for t in toks] + \ 105 | [1 for i in range(pad_right)] 106 | sentence_and_label_pairs.append((toks_ids, label)) 107 | return sentence_and_label_pairs 108 | 109 | def shuffle_and_split(self, sentence_and_label_pairs, test_fraction=0.1): 110 | random.seed(RANDOM_SEED) 111 | random.shuffle(sentence_and_label_pairs) 112 | self.num_examples = len(sentence_and_label_pairs) 113 | sentences, labels = zip(*sentence_and_label_pairs) 114 | test_num = int(self.num_examples * test_fraction) 115 | self.test_data = (sentences[:test_num], labels[:test_num]) 116 | self.train_data = (sentences[test_num:], labels[test_num:]) 117 | dump_to_file(os.path.join(self.data_dir, 'train.cPickle'), self.train_data) 118 | dump_to_file(os.path.join(self.data_dir, 'test.cPickle'), self.test_data) 119 | print 'Split dataset into training and test set: %d for training, %d for testing.' % \ 120 | (self.num_examples - test_num, test_num) 121 | return 122 | 123 | def prepare_data(self, vocab_size=10000, test_fraction=0.1): 124 | max_sent_lent = self.prepare_dict(vocab_size) 125 | sentence_and_label_pairs = self.generate_id_data(max_sent_lent) 126 | self.shuffle_and_split(sentence_and_label_pairs, test_fraction) 127 | return 128 | 129 | class DataLoader(object): 130 | 131 | def __init__(self, filename, batch_size=50): 132 | self._x, self._y = load_from_dump(filename) 133 | assert len(self._x) == len(self._y) 134 | self._pointer = 0 135 | self._num_examples = len(self._x) 136 | 137 | self.batch_size = batch_size 138 | self.num_batch = int(np.ceil(self._num_examples / self.batch_size)) 139 | print 'Loaded data with %d examples. %d examples per batch will be used.' 
% (self._num_examples, self.batch_size) 140 | 141 | def next_batch(self): 142 | # reset pointer 143 | if self.batch_size + self._pointer >= self._num_examples: 144 | batch_x, batch_y = self._x[self._pointer:], self._y[self._pointer:] 145 | self._pointer = (self._pointer + self.batch_size) % self._num_examples 146 | return (batch_x + self._x[:self._pointer], batch_y + self._y[:self._pointer]) 147 | self._pointer += self.batch_size 148 | return (self._x[self._pointer-self.batch_size:self._pointer], 149 | self._y[self._pointer-self.batch_size:self._pointer]) 150 | 151 | def reset_pointer(self): 152 | self._pointer = 0 153 | 154 | 155 | def dump_to_file(filename, obj): 156 | with open(filename, 'wb') as outfile: 157 | pickle.dump(obj, file=outfile) 158 | return 159 | 160 | def load_from_dump(filename): 161 | with open(filename, 'rb') as infile: 162 | obj = pickle.load(infile) 163 | return obj 164 | 165 | def _load_bin_vec(fname, vocab): 166 | """ 167 | Loads 300x1 word vecs from Google (Mikolov) word2vec 168 | """ 169 | word_vecs = {} 170 | with open(fname, "rb") as f: 171 | header = f.readline() 172 | vocab_size, layer1_size = map(int, header.split()) 173 | binary_len = np.dtype('float32').itemsize * layer1_size 174 | for line in xrange(vocab_size): 175 | word = [] 176 | while True: 177 | ch = f.read(1) 178 | if ch == ' ': 179 | word = ''.join(word) 180 | break 181 | if ch != '\n': 182 | word.append(ch) 183 | if word in vocab: 184 | word_vecs[word] = np.fromstring(f.read(binary_len), dtype='float32') 185 | else: 186 | f.read(binary_len) 187 | return (word_vecs, layer1_size) 188 | 189 | def _add_random_vec(word_vecs, vocab, emb_size=300): 190 | for word in vocab: 191 | if word not in word_vecs: 192 | word_vecs[word] = np.random.uniform(-0.25,0.25,emb_size) 193 | return word_vecs 194 | 195 | def prepare_pretrained_embedding(fname, word2id): 196 | print 'Reading pretrained word vectors from file ...' 197 | word_vecs, emb_size = _load_bin_vec(fname, word2id) 198 | word_vecs = _add_random_vec(word_vecs, word2id, emb_size) 199 | embedding = np.zeros([len(word2id), emb_size]) 200 | for w,idx in word2id.iteritems(): 201 | embedding[idx,:] = word_vecs[w] 202 | print 'Generated embeddings with shape ' + str(embedding.shape) 203 | return embedding 204 | 205 | def main(): 206 | reader = TextReader('./data/mr/', suffix_list=['neg', 'pos']) 207 | reader.prepare_data(vocab_size=15000, test_fraction=0.1) 208 | embedding = prepare_pretrained_embedding('./data/word2vec/GoogleNews-vectors-negative300.bin', reader.word2id) 209 | # dump_to_file('./data/mr/emb.cPickle', embedding) 210 | np.save('./data/mr/emb.npy', embedding) 211 | 212 | 213 | if __name__ == '__main__': 214 | main() 215 | --------------------------------------------------------------------------------
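
The core trick the model relies on, implemented in `model.py` above, is to run `tf.nn.conv2d` as a 1D convolution over word positions by making each filter span the full embedding width. The snippet below is a minimal, self-contained sketch of that idea rather than a file from this repository; the toy sizes (a vocabulary of 100, 8-dimensional embeddings, a window of 3, 10 filters) are arbitrary, and `tf.reduce_max` stands in for the `tf.nn.max_pool` call used in `model.py`.

    # Illustrative sketch only (Python 2, TensorFlow 1.x); all sizes are made up.
    import numpy as np
    import tensorflow as tf

    batch_size, sent_len, emb_size, num_kernel, k_size = 2, 7, 8, 10, 3

    ids = tf.placeholder(tf.int64, [batch_size, sent_len])
    emb = tf.get_variable('emb', [100, emb_size],
                          initializer=tf.random_uniform_initializer(-1.0, 1.0))
    # (batch, sent_len, emb_size) -> (batch, sent_len, emb_size, 1) so conv2d applies
    sent = tf.expand_dims(tf.nn.embedding_lookup(emb, ids), -1)
    # The kernel covers the full embedding width, so it only slides along the word axis
    kernel = tf.get_variable('kernel', [k_size, emb_size, 1, num_kernel])
    conv = tf.nn.conv2d(sent, kernel, strides=[1, 1, 1, 1], padding='VALID')
    # conv has shape (batch, sent_len - k_size + 1, 1, num_kernel)
    feat = tf.squeeze(tf.reduce_max(conv, axis=1), axis=[1])  # max over word positions -> (batch, num_kernel)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        out = sess.run(feat, {ids: np.random.randint(0, 100, (batch_size, sent_len)).astype(np.int64)})
        print out.shape  # (2, 10)

In `model.py`, each window size from `min_window` to `max_window` gets its own kernel like this, and the max-pooled features are concatenated before the fully-connected softmax layer.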