├── .gitignore
├── data
│   └── mr
│       ├── mr.neg
│       └── mr.pos
├── README.md
├── eval.py
├── model.py
├── train.py
└── text_input.py

/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | *vocab
3 | *.cPickle
4 | train/*
5 | *.sh
6 | *.out
7 | *.bin
8 | *.npy

--------------------------------------------------------------------------------
/data/mr/mr.neg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuhaozhang/sentence-convnet/HEAD/data/mr/mr.neg

--------------------------------------------------------------------------------
/data/mr/mr.pos:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuhaozhang/sentence-convnet/HEAD/data/mr/mr.pos

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | A TensorFlow implementation of a convolutional neural network for sentence classification
2 | =========
3 | This implementation uses TensorFlow's `tf.nn.conv2d` to perform 1D convolution over word sequences. It also supports initializing the word embeddings with the Google News word2vec pretrained vectors, which boosts accuracy on the movie review dataset from ~76% to ~81%.
4 | 
5 | The author's original Theano implementation of this model is [here](https://github.com/yoonkim/CNN_sentence). Another TensorFlow implementation, which does not support loading pretrained vectors, is [here](https://github.com/dennybritz/cnn-text-classification-tf).
6 | 
7 | ## Dependencies
8 | 
9 | - Python 2.7
10 | - NumPy
11 | - TensorFlow 1.0+
12 | 
13 | ## Data
14 | 
15 | The data in `data/mr/` are the movie review polarity data provided [here](http://www.cs.cornell.edu/people/pabo/movie-review-data/). The `data/word2vec` directory is initially empty. To use the pretrained word2vec embeddings, download the Google News pretrained vectors from [this Google Drive link](https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit) and unzip the archive into that directory; the result is a single `.bin` file (`GoogleNews-vectors-negative300.bin`, the path expected by `text_input.py`).
16 | 
17 | ## Usage
18 | 
19 | #### Preprocess the data
20 | 
21 |     python text_input.py
22 | 
23 | #### Train
24 | 
25 |     python train.py
26 | 
27 | By default, the pretrained vectors are loaded and used to initialize the embeddings. To disable this, use
28 | 
29 |     python train.py --use_pretrain False
30 | 
31 | #### Evaluate
32 | 
33 |     python eval.py
34 | 
35 | By default, evaluation is run on the test set. To evaluate on the training set, run
36 | 
37 |     python eval.py --train_data
38 | 
39 | ## References
40 | 
41 | 1. Kim, Yoon. "Convolutional neural networks for sentence classification." arXiv preprint arXiv:1408.5882 (2014). [link](http://arxiv.org/abs/1408.5882)
42 | 
43 | ## License
44 | 
45 | MIT
46 | 

--------------------------------------------------------------------------------
/eval.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 | import time
3 | import os
4 | import tensorflow as tf
5 | import numpy as np
6 | 
7 | import model
8 | import text_input
9 | 
10 | FLAGS = tf.app.flags.FLAGS
11 | 
12 | tf.app.flags.DEFINE_string('data_dir', './data/mr/', 'Directory of the data')
13 | tf.app.flags.DEFINE_string('train_dir', './train/', 'Where to read model')
14 | tf.app.flags.DEFINE_boolean('train_data', False, 'To evaluate on training data')
15 | 
16 | def evaluate():
17 |     """ Build evaluation graph and run. """
18 |     with tf.Graph().as_default():
19 |         with tf.variable_scope('cnn'):
20 |             m = model.Model(FLAGS, is_train=False)
21 |         saver = tf.train.Saver(tf.global_variables())
22 | 
23 |         # read test files
24 |         if FLAGS.train_data:
25 |             loader = text_input.DataLoader(os.path.join(FLAGS.data_dir, 'train.cPickle'), batch_size=FLAGS.batch_size)
26 |         else:
27 |             loader = text_input.DataLoader(os.path.join(FLAGS.data_dir, 'test.cPickle'), batch_size=FLAGS.batch_size)
28 |         print 'Start evaluation, %d batches needed, with %d examples per batch.' % (loader.num_batch, FLAGS.batch_size)
29 | 
30 |         true_count = 0
31 |         avg_loss = 0
32 | 
33 |         with tf.Session() as sess:
34 |             ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
35 |             if ckpt and ckpt.model_checkpoint_path:
36 |                 saver.restore(sess, ckpt.model_checkpoint_path)
37 |             else:
38 |                 raise IOError("Loading checkpoint file failed!")
39 | 
40 |             for _ in range(loader.num_batch):
41 |                 x, y = loader.next_batch()
42 |                 true_count_value, loss_value = sess.run([m.true_count_op, m.total_loss],
43 |                     feed_dict={m.inputs:x, m.labels:y})
44 |                 true_count += true_count_value
45 |                 avg_loss += loss_value
46 | 
47 |             accuracy = float(true_count) / (loader.num_batch * FLAGS.batch_size)
48 |             avg_loss = float(avg_loss) / loader.num_batch
49 |             print '%s: test_loss = %.6f, test_accuracy = %.3f' % (datetime.now(), avg_loss, accuracy)
50 | 
51 | def main(argv=None):
52 |     evaluate()
53 | 
54 | if __name__ == '__main__':
55 |     tf.app.run()
56 | 
57 | 

--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | 
3 | tf.app.flags.DEFINE_integer('batch_size', 50, 'Training batch size')
4 | tf.app.flags.DEFINE_integer('emb_size', 300, 'Size of word embeddings')
5 | tf.app.flags.DEFINE_integer('num_kernel', 100, 'Number of filters for each window size')
6 | tf.app.flags.DEFINE_integer('min_window', 3, 'Minimum size of filter window')
7 | tf.app.flags.DEFINE_integer('max_window', 5, 'Maximum size of filter window')
8 | tf.app.flags.DEFINE_integer('vocab_size', 15000, 'Vocabulary size')
9 | tf.app.flags.DEFINE_integer('num_class', 2, 'Number of classes to consider')
10 | tf.app.flags.DEFINE_integer('sent_len', 56, 'Input sentence length. This is after the padding is performed.')
11 | tf.app.flags.DEFINE_float('l2_reg', 0, 'l2 regularization weight')
12 | 
13 | def _variable_on_cpu(name, shape, initializer):
14 |     with tf.device('/cpu:0'):
15 |         var = tf.get_variable(name, shape, initializer=initializer)
16 |     return var
17 | 
18 | def _variable_with_weight_decay(name, shape, initializer, wd):
19 |     var = _variable_on_cpu(name, shape, initializer)
20 |     if wd is not None and wd != 0.:
21 |         weight_decay = tf.multiply(tf.nn.l2_loss(var), wd, name='weight_loss')
22 |     else:
23 |         weight_decay = tf.constant(0.0, dtype=tf.float32)
24 |     return var, weight_decay
25 | 
26 | class Model(object):
27 | 
28 |     def __init__(self, config, is_train=True):
29 |         self.is_train = is_train
30 |         self.emb_size = config.emb_size
31 |         self.batch_size = config.batch_size
32 |         self.num_kernel = config.num_kernel
33 |         self.min_window = config.min_window
34 |         self.max_window = config.max_window
35 |         self.vocab_size = config.vocab_size
36 |         self.num_class = config.num_class
37 |         self.sent_len = config.sent_len
38 |         self.l2_reg = config.l2_reg
39 |         if is_train:
40 |             self.optimizer = config.optimizer
41 |             self.dropout = config.dropout
42 |         self.build_graph()
43 | 
44 |     def build_graph(self):
45 |         """ Build the computation graph. """
46 |         self._inputs = tf.placeholder(dtype=tf.int64, shape=[self.batch_size, self.sent_len], name='input_x')
47 |         self._labels = tf.placeholder(dtype=tf.int64, shape=[self.batch_size], name='input_y')
48 |         losses = []
49 | 
50 |         # lookup layer
51 |         with tf.variable_scope('lookup') as scope:
52 |             self._W_emb = _variable_on_cpu(name='embedding', shape=[self.vocab_size, self.emb_size],
53 |                 initializer=tf.random_uniform_initializer(minval=-1.0, maxval=1.0))
54 |             # sent_batch is of shape: (batch_size, sent_len, emb_size, 1), in order to use conv2d
55 |             sent_batch = tf.nn.embedding_lookup(params=self._W_emb, ids=self._inputs)
56 |             sent_batch = tf.expand_dims(sent_batch, -1)
57 | 
58 |         # conv + pooling layer
59 |         with tf.variable_scope('conv') as scope:
60 |             pool_tensors = []
61 |             for k_size in range(self.min_window, self.max_window+1):
62 |                 kernel, wd = _variable_with_weight_decay(name='kernel_'+str(k_size),
63 |                     shape=[k_size, self.emb_size, 1, self.num_kernel], initializer=tf.truncated_normal_initializer(stddev=0.01), wd=self.l2_reg)
64 |                 losses.append(wd)
65 |                 conv = tf.nn.conv2d(input=sent_batch, filter=kernel, strides=[1,1,1,1], padding='VALID')
66 |                 biases = _variable_on_cpu('biases_'+str(k_size), [self.num_kernel], tf.constant_initializer(0.0))
67 |                 bias = tf.nn.bias_add(conv, biases)
68 |                 relu = tf.nn.relu(bias, name=scope.name)
69 |                 # shape of relu: [batch_size, conv_len, 1, num_kernel]
70 |                 conv_len = relu.get_shape()[1]
71 |                 pool = tf.nn.max_pool(relu, ksize=[1,conv_len,1,1], strides=[1,1,1,1], padding='VALID')
72 |                 # shape of pool: [batch_size, 1, 1, num_kernel]
73 |                 pool = tf.squeeze(pool, squeeze_dims=[1,2]) # size: [batch_size, num_kernel]
74 |                 pool_tensors.append(pool)
75 |             pool_layer = tf.concat(values=pool_tensors, axis=1, name='pool')
76 | 
77 |         # drop out layer
78 |         if self.is_train and self.dropout > 0:
79 |             pool_dropout = tf.nn.dropout(pool_layer, 1 - self.dropout)
80 |         else:
81 |             pool_dropout = pool_layer
82 | 
83 |         # fully-connected layer
84 |         pool_size = (self.max_window - self.min_window + 1) * self.num_kernel
85 |         with tf.variable_scope('fc') as scope:
86 |             W, wd = _variable_with_weight_decay('W', shape=[pool_size, self.num_class],
87 |                 initializer=tf.truncated_normal_initializer(stddev=0.05), wd=self.l2_reg)
88 | 
losses.append(wd) 89 | biases = _variable_on_cpu('biases', [self.num_class], tf.constant_initializer(0.01)) 90 | logits = tf.nn.bias_add(tf.matmul(pool_dropout, W), biases) 91 | 92 | # loss 93 | cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self._labels, logits=logits, name='cross_entropy_per_example') 94 | cross_entropy_loss = tf.reduce_mean(cross_entropy, name='cross_entropy_loss') 95 | losses.append(cross_entropy_loss) 96 | self._total_loss = tf.add_n(losses, name='total_loss') 97 | # self._total_loss = cross_entropy_loss 98 | 99 | # correct prediction count 100 | correct_prediction = tf.to_int32(tf.nn.in_top_k(logits, self._labels, 1)) 101 | self._true_count_op = tf.reduce_sum(correct_prediction) 102 | 103 | # train on a batch 104 | self._lr = tf.Variable(0.0, trainable=False) 105 | if self.is_train: 106 | if self.optimizer == 'adadelta': 107 | opt = tf.train.AdadeltaOptimizer(self._lr) 108 | elif self.optimizer == 'adagrad': 109 | opt = tf.train.AdagradOptimizer(self._lr) 110 | elif self.optimizer == 'adam': 111 | opt = tf.train.AdamOptimizer(self._lr) 112 | elif self.optimizer == 'sgd': 113 | opt = tf.train.GradientDescentOptimizer(self._lr) 114 | else: 115 | raise ValueError("Optimizer not supported.") 116 | grads = opt.compute_gradients(self._total_loss) 117 | self._train_op = opt.apply_gradients(grads) 118 | 119 | for var in tf.trainable_variables(): 120 | tf.summary.histogram(var.op.name, var) 121 | else: 122 | self._train_op = tf.no_op() 123 | 124 | return 125 | 126 | @property 127 | def inputs(self): 128 | return self._inputs 129 | 130 | @property 131 | def labels(self): 132 | return self._labels 133 | 134 | @property 135 | def lr(self): 136 | return self._lr 137 | 138 | @property 139 | def train_op(self): 140 | return self._train_op 141 | 142 | @property 143 | def total_loss(self): 144 | return self._total_loss 145 | 146 | @property 147 | def true_count_op(self): 148 | return self._true_count_op 149 | 150 | @property 151 | def W_emb(self): 152 | return self._W_emb 153 | 154 | def assign_lr(self, session, lr_value): 155 | session.run(tf.assign(self.lr, lr_value)) 156 | 157 | def assign_embedding(self, session, pretrained): 158 | session.run(tf.assign(self.W_emb, pretrained)) 159 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import time 3 | import os 4 | import sys 5 | import tensorflow as tf 6 | import numpy as np 7 | 8 | import model 9 | import text_input 10 | 11 | FLAGS = tf.app.flags.FLAGS 12 | 13 | tf.app.flags.DEFINE_string('data_dir', './data/mr/', 'Directory of the data') 14 | tf.app.flags.DEFINE_string('train_dir', './train/', 'Directory to save training checkpoint files') 15 | tf.app.flags.DEFINE_integer('num_epoch', 50, 'Number of epochs to run') 16 | tf.app.flags.DEFINE_boolean('use_pretrain', True, 'Use word2vec pretrained embeddings or not') 17 | tf.app.flags.DEFINE_boolean('log_device_placement', False, 'Whether log device information in summary') 18 | 19 | tf.app.flags.DEFINE_string('optimizer', 'adagrad', 'Optimizer to use. 
Must be one of "sgd", adagrad", "adadelta" and "adam"') 20 | tf.app.flags.DEFINE_float('init_lr', 0.01, 'Initial learning rate') 21 | tf.app.flags.DEFINE_float('lr_decay', 0.95, 'LR decay rate') 22 | tf.app.flags.DEFINE_integer('tolerance_step', 500, 'Decay the lr after loss remains unchanged for this number of steps') 23 | tf.app.flags.DEFINE_float('dropout', 0.5, 'Dropout rate. 0 is no dropout.') 24 | 25 | tf.app.flags.DEFINE_integer('log_step', 10, 'Write log to stdout after this step') 26 | tf.app.flags.DEFINE_integer('summary_step', 200, 'Write summary after this step') 27 | tf.app.flags.DEFINE_integer('save_epoch', 5, 'Save model after this epoch') 28 | 29 | def train(): 30 | # load data 31 | train_loader = text_input.DataLoader(os.path.join(FLAGS.data_dir, 'train.cPickle'), batch_size=FLAGS.batch_size) 32 | test_loader = text_input.DataLoader(os.path.join(FLAGS.data_dir, 'test.cPickle'), batch_size=FLAGS.batch_size) 33 | max_steps = train_loader.num_batch * FLAGS.num_epoch # this is just an estimated number 34 | 35 | with tf.Graph().as_default(): 36 | with tf.variable_scope('cnn', reuse=None): 37 | m = model.Model(FLAGS, is_train=True) 38 | with tf.variable_scope('cnn', reuse=True): 39 | mtest = model.Model(FLAGS, is_train=False) 40 | 41 | saver = tf.train.Saver(tf.global_variables()) 42 | save_path = os.path.join(FLAGS.train_dir, 'model.ckpt') 43 | summary_op = tf.summary.merge_all() 44 | 45 | sess = tf.Session(config=tf.ConfigProto(log_device_placement=FLAGS.log_device_placement)) 46 | summary_writer = tf.summary.FileWriter(FLAGS.train_dir, graph=sess.graph) 47 | sess.run(tf.global_variables_initializer()) 48 | 49 | if FLAGS.use_pretrain: 50 | print "Use pretrained embeddings to initialize model ..." 51 | pretrained_embedding = np.load(os.path.join(FLAGS.data_dir, 'emb.npy')) 52 | m.assign_embedding(sess, pretrained_embedding) 53 | 54 | current_lr = FLAGS.init_lr 55 | lowest_loss_value = float("inf") 56 | step_loss_ascend = 0 57 | global_step = 0 58 | 59 | def eval_once(mtest, sess, data_loader): 60 | test_loss = 0.0 61 | test_accuracy = 0 62 | for _ in xrange(data_loader.num_batch): 63 | x_batch, y_batch = data_loader.next_batch() 64 | x_batch = np.array(x_batch) 65 | loss_value, true_count_value = sess.run([mtest.total_loss, mtest.true_count_op], 66 | feed_dict={mtest.inputs: x_batch, mtest.labels: y_batch}) 67 | test_loss += loss_value 68 | test_accuracy += true_count_value 69 | test_loss /= data_loader.num_batch 70 | test_accuracy /= (1.0 * data_loader.num_batch * FLAGS.batch_size) 71 | data_loader.reset_pointer() 72 | return (test_loss, test_accuracy) 73 | 74 | # Note that this is a soft version of epoch. 75 | for epoch in xrange(FLAGS.num_epoch): 76 | train_loss = 0.0 77 | true_count_total = 0 78 | train_loader.reset_pointer() 79 | for _ in xrange(train_loader.num_batch): 80 | m.assign_lr(sess, current_lr) 81 | global_step += 1 82 | start_time = time.time() 83 | x_batch, y_batch = train_loader.next_batch() 84 | feed = {m.inputs: x_batch, m.labels: y_batch} 85 | _, loss_value, true_count_value = sess.run([m.train_op, m.total_loss, m.true_count_op], feed_dict=feed) 86 | duration = time.time() - start_time 87 | train_loss += loss_value 88 | true_count_total += true_count_value 89 | 90 | assert not np.isnan(loss_value), "Model loss is NaN." 
91 | 92 | if global_step % FLAGS.log_step == 0: 93 | examples_per_sec = FLAGS.batch_size / duration 94 | 95 | format_str = ('%s: step %d/%d (epoch %d/%d), loss = %.6f (%.1f examples/sec; %.3f sec/batch), lr: %.6f') 96 | print (format_str % (datetime.now(), global_step, max_steps, epoch+1, FLAGS.num_epoch, loss_value, 97 | examples_per_sec, duration, current_lr)) 98 | 99 | if global_step % FLAGS.summary_step == 0: 100 | summary_str = sess.run(summary_op) 101 | summary_writer.add_summary(summary_str, global_step) 102 | 103 | # decay learning rate if necessary 104 | if loss_value < lowest_loss_value: 105 | lowest_loss_value = loss_value 106 | step_loss_ascend = 0 107 | else: 108 | step_loss_ascend += 1 109 | if step_loss_ascend >= FLAGS.tolerance_step: 110 | current_lr *= FLAGS.lr_decay 111 | print '%s: step %d/%d (epoch %d/%d), LR decays to %.5f' % ((datetime.now(), global_step, max_steps, 112 | epoch+1, FLAGS.num_epoch, current_lr)) 113 | step_loss_ascend = 0 114 | 115 | # stop learning if learning rate is too low 116 | if current_lr < 1e-5: break 117 | 118 | # summary loss/accuracy after each epoch 119 | train_loss /= train_loader.num_batch 120 | train_accuracy = true_count_total * 1.0 / (train_loader.num_batch * FLAGS.batch_size) 121 | summary_writer.add_summary(_summary_for_scalar('eval/training_loss', train_loss), global_step=epoch) 122 | summary_writer.add_summary(_summary_for_scalar('eval/training_accuracy', train_accuracy), global_step=epoch) 123 | 124 | test_loss, test_accuracy = eval_once(mtest, sess, test_loader) 125 | summary_writer.add_summary(_summary_for_scalar('eval/test_loss', test_loss), global_step=epoch) 126 | summary_writer.add_summary(_summary_for_scalar('eval/test_accuracy', test_accuracy), global_step=epoch) 127 | 128 | print("Epoch %d: training_loss = %.6f, training_accuracy = %.3f" % (epoch+1, train_loss, train_accuracy)) 129 | print("Epoch %d: test_loss = %.6f, test_accuracy = %.3f" % (epoch+1, test_loss, test_accuracy)) 130 | 131 | # save after fixed epoch 132 | if epoch % FLAGS.save_epoch == 0: 133 | saver.save(sess, save_path, global_step=epoch) 134 | saver.save(sess, save_path, global_step=epoch) 135 | 136 | def _summary_for_scalar(name, value): 137 | return tf.Summary(value=[tf.Summary.Value(tag=name, simple_value=value)]) 138 | 139 | def main(argv=None): 140 | if tf.gfile.Exists(FLAGS.train_dir): 141 | tf.gfile.DeleteRecursively(FLAGS.train_dir) 142 | tf.gfile.MakeDirs(FLAGS.train_dir) 143 | train() 144 | 145 | if __name__ == '__main__': 146 | tf.app.run() 147 | -------------------------------------------------------------------------------- /text_input.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import re 4 | import random 5 | from collections import Counter 6 | import cPickle as pickle 7 | import numpy as np 8 | 9 | UNK_TOKEN = '' 10 | PAD_TOKEN = '' 11 | RANDOM_SEED = 1234 12 | 13 | # TODO: I need to clean up this preprocessing script a bit 14 | class TextReader(object): 15 | 16 | def __init__(self, data_dir, num_classes=2, suffix_list=None): 17 | self.data_dir = data_dir 18 | self.num_classes = num_classes 19 | if suffix_list: 20 | self.suffix_list = suffix_list 21 | else: 22 | self.suffix_list = [str(x) for x in range(num_classes)] 23 | self.data_files = None 24 | 25 | def get_filenames(self): 26 | if not os.path.exists(self.data_dir): 27 | sys.exit('Data directory does not exist.') 28 | data_files = [] 29 | for f in os.listdir(self.data_dir): 30 | f = os.path.join(self.data_dir, 
f) 31 | if os.path.isfile(f): 32 | chunks = f.split('.') 33 | if chunks[-1] in self.suffix_list: 34 | data_files.append(f) 35 | assert data_files 36 | self.data_files = data_files 37 | return data_files 38 | 39 | def clean_str(self, string): 40 | """ 41 | Tokenization/string cleaning. 42 | """ 43 | string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string) 44 | string = re.sub(r"\'s", " \'s", string) 45 | string = re.sub(r"\'ve", " \'ve", string) 46 | string = re.sub(r"n\'t", " n\'t", string) 47 | string = re.sub(r"\'re", " \'re", string) 48 | string = re.sub(r"\'d", " \'d", string) 49 | string = re.sub(r"\'ll", " \'ll", string) 50 | string = re.sub(r",", " , ", string) 51 | string = re.sub(r"!", " ! ", string) 52 | string = re.sub(r"\(", " \( ", string) 53 | string = re.sub(r"\)", " \) ", string) 54 | string = re.sub(r"\?", " \? ", string) 55 | string = re.sub(r"\s{2,}", " ", string) 56 | return string.strip().lower() 57 | 58 | def prepare_dict(self, vocab_size=10000): 59 | max_sent_len = 0 60 | c = Counter() 61 | data_files = self.get_filenames() 62 | # store the preprocessed raw text to avoid cleaning it again 63 | self.raw_text = [] 64 | for f in data_files: 65 | strings = [] 66 | with open(f, 'r') as infile: 67 | for line in infile: 68 | clean_string = self.clean_str(line) 69 | strings.append(clean_string) 70 | toks = clean_string.split() 71 | if len(toks) > max_sent_len: 72 | max_sent_len = len(toks) 73 | for t in toks: 74 | c[t] += 1 75 | self.raw_text.append(strings) 76 | total_words = len(c) 77 | assert total_words >= vocab_size 78 | word_list = [p[0] for p in c.most_common(vocab_size - 2)] 79 | word_list.insert(0, PAD_TOKEN) 80 | word_list.insert(0, UNK_TOKEN) 81 | self.word2freq = c 82 | self.word2id = dict() 83 | vocab_file = os.path.join(self.data_dir, 'vocab') 84 | with open(vocab_file, 'w') as outfile: 85 | for idx, w in enumerate(word_list): 86 | self.word2id[w] = idx 87 | outfile.write(w + '\t' + str(idx) + '\n') 88 | print '%d words found in training set. Truncate to vocabulary size %d.' % (total_words, vocab_size) 89 | print 'Dictionary saved to file %s. Max sentence length in data is %d.' 
% (vocab_file, max_sent_len) 90 | return max_sent_len 91 | 92 | def generate_id_data(self, max_sent_len=100): 93 | self.max_sent_len = max_sent_len 94 | sentence_and_label_pairs = [] 95 | for label, strings in enumerate(self.raw_text): 96 | for s in strings: 97 | toks = s.split() 98 | toks_len = len(toks) 99 | if toks_len <= max_sent_len: 100 | pad_left = (max_sent_len - toks_len) / 2 101 | pad_right = int(np.ceil((max_sent_len - toks_len) / 2.0)) 102 | else: 103 | continue 104 | toks_ids = [1 for i in range(pad_left)] + [self.word2id[t] if t in self.word2id else 0 for t in toks] + \ 105 | [1 for i in range(pad_right)] 106 | sentence_and_label_pairs.append((toks_ids, label)) 107 | return sentence_and_label_pairs 108 | 109 | def shuffle_and_split(self, sentence_and_label_pairs, test_fraction=0.1): 110 | random.seed(RANDOM_SEED) 111 | random.shuffle(sentence_and_label_pairs) 112 | self.num_examples = len(sentence_and_label_pairs) 113 | sentences, labels = zip(*sentence_and_label_pairs) 114 | test_num = int(self.num_examples * test_fraction) 115 | self.test_data = (sentences[:test_num], labels[:test_num]) 116 | self.train_data = (sentences[test_num:], labels[test_num:]) 117 | dump_to_file(os.path.join(self.data_dir, 'train.cPickle'), self.train_data) 118 | dump_to_file(os.path.join(self.data_dir, 'test.cPickle'), self.test_data) 119 | print 'Split dataset into training and test set: %d for training, %d for testing.' % \ 120 | (self.num_examples - test_num, test_num) 121 | return 122 | 123 | def prepare_data(self, vocab_size=10000, test_fraction=0.1): 124 | max_sent_lent = self.prepare_dict(vocab_size) 125 | sentence_and_label_pairs = self.generate_id_data(max_sent_lent) 126 | self.shuffle_and_split(sentence_and_label_pairs, test_fraction) 127 | return 128 | 129 | class DataLoader(object): 130 | 131 | def __init__(self, filename, batch_size=50): 132 | self._x, self._y = load_from_dump(filename) 133 | assert len(self._x) == len(self._y) 134 | self._pointer = 0 135 | self._num_examples = len(self._x) 136 | 137 | self.batch_size = batch_size 138 | self.num_batch = int(np.ceil(self._num_examples / self.batch_size)) 139 | print 'Loaded data with %d examples. %d examples per batch will be used.' 
% (self._num_examples, self.batch_size) 140 | 141 | def next_batch(self): 142 | # reset pointer 143 | if self.batch_size + self._pointer >= self._num_examples: 144 | batch_x, batch_y = self._x[self._pointer:], self._y[self._pointer:] 145 | self._pointer = (self._pointer + self.batch_size) % self._num_examples 146 | return (batch_x + self._x[:self._pointer], batch_y + self._y[:self._pointer]) 147 | self._pointer += self.batch_size 148 | return (self._x[self._pointer-self.batch_size:self._pointer], 149 | self._y[self._pointer-self.batch_size:self._pointer]) 150 | 151 | def reset_pointer(self): 152 | self._pointer = 0 153 | 154 | 155 | def dump_to_file(filename, obj): 156 | with open(filename, 'wb') as outfile: 157 | pickle.dump(obj, file=outfile) 158 | return 159 | 160 | def load_from_dump(filename): 161 | with open(filename, 'rb') as infile: 162 | obj = pickle.load(infile) 163 | return obj 164 | 165 | def _load_bin_vec(fname, vocab): 166 | """ 167 | Loads 300x1 word vecs from Google (Mikolov) word2vec 168 | """ 169 | word_vecs = {} 170 | with open(fname, "rb") as f: 171 | header = f.readline() 172 | vocab_size, layer1_size = map(int, header.split()) 173 | binary_len = np.dtype('float32').itemsize * layer1_size 174 | for line in xrange(vocab_size): 175 | word = [] 176 | while True: 177 | ch = f.read(1) 178 | if ch == ' ': 179 | word = ''.join(word) 180 | break 181 | if ch != '\n': 182 | word.append(ch) 183 | if word in vocab: 184 | word_vecs[word] = np.fromstring(f.read(binary_len), dtype='float32') 185 | else: 186 | f.read(binary_len) 187 | return (word_vecs, layer1_size) 188 | 189 | def _add_random_vec(word_vecs, vocab, emb_size=300): 190 | for word in vocab: 191 | if word not in word_vecs: 192 | word_vecs[word] = np.random.uniform(-0.25,0.25,emb_size) 193 | return word_vecs 194 | 195 | def prepare_pretrained_embedding(fname, word2id): 196 | print 'Reading pretrained word vectors from file ...' 197 | word_vecs, emb_size = _load_bin_vec(fname, word2id) 198 | word_vecs = _add_random_vec(word_vecs, word2id, emb_size) 199 | embedding = np.zeros([len(word2id), emb_size]) 200 | for w,idx in word2id.iteritems(): 201 | embedding[idx,:] = word_vecs[w] 202 | print 'Generated embeddings with shape ' + str(embedding.shape) 203 | return embedding 204 | 205 | def main(): 206 | reader = TextReader('./data/mr/', suffix_list=['neg', 'pos']) 207 | reader.prepare_data(vocab_size=15000, test_fraction=0.1) 208 | embedding = prepare_pretrained_embedding('./data/word2vec/GoogleNews-vectors-negative300.bin', reader.word2id) 209 | # dump_to_file('./data/mr/emb.cPickle', embedding) 210 | np.save('./data/mr/emb.npy', embedding) 211 | 212 | 213 | if __name__ == '__main__': 214 | main() 215 | --------------------------------------------------------------------------------
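
The core trick the model relies on, implemented in `model.py` above, is to run `tf.nn.conv2d` as a 1D convolution over word positions by making each filter span the full embedding width. The snippet below is a minimal, self-contained sketch of that idea rather than a file from this repository; the toy sizes (a vocabulary of 100, 8-dimensional embeddings, a window of 3, 10 filters) are arbitrary, and `tf.reduce_max` stands in for the `tf.nn.max_pool` call used in `model.py`.

    # Illustrative sketch only (Python 2, TensorFlow 1.x); all sizes are made up.
    import numpy as np
    import tensorflow as tf

    batch_size, sent_len, emb_size, num_kernel, k_size = 2, 7, 8, 10, 3

    ids = tf.placeholder(tf.int64, [batch_size, sent_len])
    emb = tf.get_variable('emb', [100, emb_size],
                          initializer=tf.random_uniform_initializer(-1.0, 1.0))
    # (batch, sent_len, emb_size) -> (batch, sent_len, emb_size, 1) so conv2d applies
    sent = tf.expand_dims(tf.nn.embedding_lookup(emb, ids), -1)
    # The kernel covers the full embedding width, so it only slides along the word axis
    kernel = tf.get_variable('kernel', [k_size, emb_size, 1, num_kernel])
    conv = tf.nn.conv2d(sent, kernel, strides=[1, 1, 1, 1], padding='VALID')
    # conv has shape (batch, sent_len - k_size + 1, 1, num_kernel)
    feat = tf.squeeze(tf.reduce_max(conv, axis=1), axis=[1])  # max over word positions -> (batch, num_kernel)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        out = sess.run(feat, {ids: np.random.randint(0, 100, (batch_size, sent_len)).astype(np.int64)})
        print out.shape  # (2, 10)

In `model.py`, each window size from `min_window` to `max_window` gets its own kernel like this, and the max-pooled features are concatenated before the fully-connected softmax layer.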