├── Selection_134.png
├── Selection_135.png
├── Selection_136.png
├── README.md
├── preprocessing.py
├── model_densenet.py
└── train.py
/Selection_134.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lethienhoa/DenseNet-NLP/HEAD/Selection_134.png
--------------------------------------------------------------------------------
/Selection_135.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lethienhoa/DenseNet-NLP/HEAD/Selection_135.png
--------------------------------------------------------------------------------
/Selection_136.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lethienhoa/DenseNet-NLP/HEAD/Selection_136.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Very Deep Convolutional Networks for Natural Language Processing in TensorFlow
2 |
3 | This is a TensorFlow implementation of the DenseNet model from the paper **Do Convolutional Networks need to be Deep for Text Classification?**. In the paper, we study the importance of depth in convolutional models for text classification, with either character or word inputs. We show on 5 standard text classification and sentiment analysis tasks that deep models indeed give better performance than shallow networks when the text input is represented as a sequence of characters. However, a simple shallow-and-wide network outperforms deep models such as DenseNet with word inputs. Our shallow word model further establishes new state-of-the-art performance on two datasets: Yelp Binary (95.9%) and Yelp Full (64.9%).
4 |
5 | **Paper:**
6 |
7 | Hoa T. Le, Christophe Cerisara, Alexandre Denis. **Do Convolutional Networks need to be Deep for Text Classification?** Association for the Advancement of Artificial Intelligence 2018 (**AAAI-18**) Workshop on Affective Content Analysis. (https://arxiv.org/abs/1707.04108)
8 |
 9 |     @article{DBLP:journals/corr/LeCD17,
10 |       author    = {Hoa T. Le and
11 |                    Christophe Cerisara and
12 |                    Alexandre Denis},
13 |       title     = {Do Convolutional Networks need to be Deep for Text Classification?},
14 |       journal   = {CoRR},
15 |       year      = {2017}
16 |     }
17 |
18 | ![](Selection_134.png)
19 | 
20 | 
21 | 
22 | **Results:**
23 |
24 | ![](Selection_135.png)
25 | 
26 | ![](Selection_136.png)
27 | 
28 | 
29 | 
30 | 
31 | 
32 | **Reference source code:**
33 | https://github.com/dennybritz/cnn-text-classification-tf
34 |
35 |
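36 | **Getting started:**
37 | 
38 | `preprocessing.py` expects the Yelp Academic Dataset reviews at `./yelp-review-dataset/yelp_academic_dataset_review.json`; training is then launched with `python train.py` (TensorFlow 1.x with `tf.contrib.slim`). As a rough sketch of how the pieces fit together, assuming the defaults in this repository (70-character alphabet, reviews padded/truncated to 144 characters):
39 | 
40 | ```python
41 | import preprocessing
42 | from model_densenet import model
43 | 
44 | # Reviews are quantized over the 70-symbol alphabet and one-hot encoded, so each
45 | # batch fed to the network has shape [batch_size, 70, 144, 1].
46 | x, y = preprocessing.load_data()                                   # needs the Yelp JSON file above
47 | x_batch, y_batch = preprocessing.get_batched_one_hot(x, y, 0, 128)
48 | cnn = model(num_classes=2)                                         # binary (negative / positive) polarity
49 | ```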
--------------------------------------------------------------------------------
/preprocessing.py:
--------------------------------------------------------------------------------
1 | # based on ideas from https://github.com/dennybritz/cnn-text-classification-tf
2 |
3 | import numpy as np
4 | import json
5 |
6 | alphabet = "abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}\n"
  7 | char_seq_length = 144  # Twitter has only 140 characters; we pad 4 blank characters to the right of each tweet to conform to the architecture of A. Conneau et al. (2016)
8 |
9 |
10 | def load_yelp(alphabet):
11 | examples = []
12 | labels = []
13 | with open('./yelp-review-dataset/yelp_academic_dataset_review.json') as f:
14 | i = 0
15 | for line in f:
16 | review = json.loads(line)
17 | stars = review["stars"]
18 | text = review["text"]
19 | if stars != 3:
20 | padded = pad_sentence(list(text.lower()))
21 | text_int8_repr = string_to_int8_conversion(padded, alphabet)
22 | if stars == 1 or stars == 2:
23 | labels.append([1, 0])
24 | examples.append(text_int8_repr)
25 | elif stars == 4 or stars == 5:
26 | labels.append([0, 1])
27 | examples.append(text_int8_repr)
28 | i += 1
29 | if i % 10000 == 0:
30 | print("Non-neutral instances processed: " + str(i))
31 | return examples, labels
32 |
 33 | def pad_sentence(char_seq, padding_char=" "):
 34 |     # Truncate long sequences and right-pad short ones with blanks to exactly char_seq_length
 35 |     new_char_seq = char_seq[:char_seq_length] + [padding_char] * max(char_seq_length - len(char_seq), 0)
 36 |     return new_char_seq
37 |
38 |
39 | def string_to_int8_conversion(char_seq, alphabet):
40 | x = np.array([alphabet.find(char) for char in char_seq], dtype=np.int8)
41 | return x
42 |
43 |
44 | def get_batched_one_hot(char_seqs_indices, labels, start_index, end_index):
45 | x_batch = char_seqs_indices[start_index:end_index]
46 | y_batch = labels[start_index:end_index]
47 | x_batch_one_hot = np.zeros(shape=[len(x_batch), len(alphabet), len(x_batch[0]), 1])
48 | for example_i, char_seq_indices in enumerate(x_batch):
49 | for char_pos_in_seq, char_seq_char_ind in enumerate(char_seq_indices):
50 | if char_seq_char_ind != -1:
51 | x_batch_one_hot[example_i][char_seq_char_ind][char_pos_in_seq][0] = 1
52 | return [x_batch_one_hot, y_batch]
53 |
54 |
55 | def load_data():
56 | examples, labels = load_yelp(alphabet)
57 | x = np.array(examples, dtype=np.int8)
58 | y = np.array(labels, dtype=np.int8)
59 | print("x_char_seq_ind=" + str(x.shape))
60 | print("y shape=" + str(y.shape))
61 | return [x, y]
62 |
63 |
64 | def batch_iter(x, y, batch_size, num_epochs, shuffle=True):
65 | """
66 | Generates a batch iterator for a dataset.
67 | """
68 | # data = np.array(data)
69 | data_size = len(x)
 70 |     num_batches_per_epoch = (data_size + batch_size - 1) // batch_size  # ceiling division avoids a trailing empty batch
71 | for epoch in range(num_epochs):
72 | print("In epoch >> " + str(epoch + 1))
73 | print("num batches per epoch is: " + str(num_batches_per_epoch))
74 | # Shuffle the data at each epoch
75 | if shuffle:
76 | shuffle_indices = np.random.permutation(np.arange(data_size))
77 | x_shuffled = x[shuffle_indices]
78 | y_shuffled = y[shuffle_indices]
79 | else:
80 | x_shuffled = x
81 | y_shuffled = y
82 | for batch_num in range(num_batches_per_epoch):
83 | start_index = batch_num * batch_size
84 | end_index = min((batch_num + 1) * batch_size, data_size)
85 | x_batch, y_batch = get_batched_one_hot(x_shuffled, y_shuffled, start_index, end_index)
86 | batch = list(zip(x_batch, y_batch))
87 | yield batch
88 |
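 89 | 
 90 | if __name__ == "__main__":
 91 |     # A small, optional smoke test of the quantization / one-hot pipeline on toy strings.
 92 |     # It does not need the Yelp dataset; the shapes below assume the defaults in this file
 93 |     # (70-character alphabet, char_seq_length = 144).
 94 |     toy_x = [string_to_int8_conversion(pad_sentence(list("great food, friendly staff!")), alphabet),
 95 |              string_to_int8_conversion(pad_sentence(list("terrible service...")), alphabet)]
 96 |     toy_y = [[0, 1], [1, 0]]
 97 |     x_one_hot, y_batch = get_batched_one_hot(np.array(toy_x, dtype=np.int8), np.array(toy_y), 0, 2)
 98 |     print("one-hot batch shape: " + str(x_one_hot.shape))  # expected: (2, 70, 144, 1)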
--------------------------------------------------------------------------------
/model_densenet.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import tensorflow.contrib.slim as slim
3 | from tensorflow.python.ops import math_ops
4 |
  5 | alphabet = "abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}\n"  # must match preprocessing.alphabet (70 characters)
  6 | sequence_max_length = 144  # must match preprocessing.char_seq_length
7 |
8 | class model():
9 | """
10 | DenseNet Model for Text Classification.
11 | """
12 |
13 | def __init__(self, num_classes, cnn_filter_size=(3,3,3,3), pooling_filter_size=(2,2,2,2), num_filters_per_size=(64,128,256,512), num_rep_block=(4,4,4,4), num_quantized_chars=len(alphabet)):
14 |
15 | def denseBlock(input_layer, num_filters_per_size_i, cnn_filter_size_i, i, num_rep_block_i):
16 | with tf.variable_scope("dense_unit_%s" % i):
17 | nodes = []
 18 |                 a = slim.conv2d(input_layer, num_filters_per_size_i, [1, cnn_filter_size_i], weights_initializer=tf.contrib.layers.variance_scaling_initializer(factor=1.0, mode='FAN_AVG', uniform=True), normalizer_fn=slim.batch_norm, normalizer_params={'is_training': self.is_training})
 19 |                 nodes.append(a)
 20 |                 print(a.get_shape())
 21 |                 for z in range(num_rep_block_i-1):
 22 |                     # each new layer takes the concatenation of all previous feature maps in the block as input
 23 |                     b = slim.conv2d(tf.concat(nodes, 3), num_filters_per_size_i, [1, cnn_filter_size_i], weights_initializer=tf.contrib.layers.variance_scaling_initializer(factor=1.0, mode='FAN_AVG', uniform=True), normalizer_fn=slim.batch_norm, normalizer_params={'is_training': self.is_training})
 24 |                     nodes.append(b)
 25 |                     print(b.get_shape())
 26 |                 print("")
 27 |                 return b
28 |
 29 |         self.input_x = tf.placeholder(tf.float32, [None, num_quantized_chars, sequence_max_length, 1], name="input_x")
 30 |         self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
 31 |         self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
 32 |         self.is_training = tf.placeholder(tf.bool, name="is_training")  # switches batch_norm between batch statistics (train) and moving averages (eval)
 33 |         # Input Dim : num_quantized_chars x sequence_max_length x 1 (70 x 144 x 1 with the defaults above)
34 |
35 | # ================ First Conv Layer ================
 36 |         h = slim.conv2d(self.input_x, num_filters_per_size[0], [num_quantized_chars, cnn_filter_size[0]], weights_initializer=tf.contrib.layers.variance_scaling_initializer(factor=1.0, mode='FAN_AVG', uniform=True), normalizer_fn=slim.batch_norm, normalizer_params={'is_training': self.is_training}, scope='conv0', padding='VALID')
 37 | 
 38 |         # Output Dim : 1 x (sequence_max_length - 2) x 64
39 |
40 | # ================ Conv Block 64, 128, 256, 512 =================
41 | for i in range(0,len(num_filters_per_size)):
42 | h = denseBlock(h, num_filters_per_size[i], cnn_filter_size[i], i, num_rep_block[i])
43 |
44 | # Transition Layer
 45 |             if i != len(num_filters_per_size) - 1:
 46 |                 h = slim.conv2d(h, num_filters_per_size[i+1], [1, cnn_filter_size[i]], weights_initializer=tf.contrib.layers.variance_scaling_initializer(factor=1.0, mode='FAN_AVG', uniform=True), normalizer_fn=slim.batch_norm, normalizer_params={'is_training': self.is_training}, scope='conv-last-%s' % i)
47 |
48 | # Max pooling 1/2
49 | h = slim.max_pool2d(h, [1,pooling_filter_size[i]], stride=pooling_filter_size[i], scope='pool_%s' % i)
 50 |             print(h.get_shape())
51 |
52 | # Output Dim : 1 x 8 x 512
53 |
54 | # ================ Layer FC ================
55 | # Output Dim : 1 x 4096
56 |
57 | # Max pooling filtersize = 8, stride = 8
58 | h = slim.max_pool2d(h, [1, 8], stride=8, scope='pool_final')
59 | h = slim.flatten(h)
 60 |         print(h.get_shape())
61 |
62 | # Global avg max pooling
63 | #h = math_ops.reduce_mean(h, [1, 2], name='pool5', keep_dims=True)
64 | #h = slim.flatten(h)
65 | #print ""
66 | #print h.get_shape()
67 |
68 | h = slim.fully_connected(h, 2048, activation_fn=None, scope='FC1')
 69 |         print(h.get_shape())
 70 |         h = slim.fully_connected(h, 2048, activation_fn=None, scope='FC2')
 71 |         print(h.get_shape())
 72 |         scores = slim.fully_connected(h, num_classes, activation_fn=None, scope='output')
 73 |         print(scores.get_shape())
74 |
75 | self.scores = scores
76 | pred1D = tf.argmax(scores, 1, name="predictions")
77 | self.predictions = pred1D
78 | y1D = tf.argmax(self.input_y, 1)
79 |
80 | # ================ Loss and Accuracy ================
81 | # CalculateMean cross-entropy loss
82 | with tf.name_scope("evaluate"):
83 | losses = tf.nn.softmax_cross_entropy_with_logits(labels=self.input_y, logits=scores)
84 | self.loss = tf.reduce_mean(losses)
85 |
86 | correct_predictions = tf.equal(pred1D, y1D)
87 | self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")
88 |
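 89 | 
 90 | if __name__ == "__main__":
 91 |     # Optional sanity check: build the graph for the binary Yelp task and print the
 92 |     # input / output tensor shapes (assumes TensorFlow 1.x with tf.contrib.slim).
 93 |     m = model(num_classes=2)
 94 |     print(m.input_x.get_shape())   # (?, 70, 144, 1)
 95 |     print(m.scores.get_shape())    # (?, 2)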
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | # based on ideas in https://github.com/dennybritz/cnn-text-classification-tf
3 |
4 | import tensorflow as tf
5 | import numpy as np
6 | import os
7 | import time
8 | import datetime
9 | import preprocessing
10 | from model_densenet import model
11 |
12 | # Parameters
13 | # ==================================================
14 |
15 | # Model Hyperparameters
16 | tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)")
17 | tf.flags.DEFINE_float("l2_reg_lambda", 0.0, "L2 regularizaion lambda (default: 0.0)")
18 |
19 | # Training parameters
20 | tf.flags.DEFINE_integer("batch_size", 128, "Batch Size (default: 128)")
 21 | tf.flags.DEFINE_integer("num_epochs", 50, "Number of training epochs (default: 50)")
 22 | tf.flags.DEFINE_integer("evaluate_every", 5000, "Evaluate model on dev set after this many steps (default: 5000)")
 23 | tf.flags.DEFINE_integer("checkpoint_every", 1000, "Save model after this many steps (default: 1000)")
24 | # Misc Parameters
25 | tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
26 | tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")
27 |
28 | FLAGS = tf.flags.FLAGS
29 | FLAGS._parse_flags()
30 | print("\nParameters:")
31 | for attr, value in sorted(FLAGS.__flags.items()):
32 | print("{}={}".format(attr.upper(), value))
33 | print("")
34 |
35 |
36 | # Data Preparation
37 | # ==================================================
38 |
39 | # Load data
40 | print("Loading data...")
41 | x, y = preprocessing.load_data()
42 | # Randomly shuffle data
43 | np.random.seed(10)
44 | shuffle_indices = np.random.permutation(np.arange(len(y)))
45 | x_shuffled = x[shuffle_indices]
46 | y_shuffled = y[shuffle_indices]
47 | # Split train/test set
48 | n_dev_samples = 200000
49 | # TODO: Create a cross validation procedure
50 | x_train, x_dev = x_shuffled[:-n_dev_samples], x_shuffled[-n_dev_samples:]
51 | y_train, y_dev = y_shuffled[:-n_dev_samples], y_shuffled[-n_dev_samples:]
52 | print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
53 |
54 |
55 | # Training
56 | # ==================================================
57 |
58 | with tf.Graph().as_default():
59 | session_conf = tf.ConfigProto(
60 | allow_soft_placement=FLAGS.allow_soft_placement,
61 | log_device_placement=FLAGS.log_device_placement)
62 | sess = tf.Session(config=session_conf)
63 | with sess.as_default():
 64 |         cnn = model(num_classes=2)  # binary (negative vs. positive) Yelp polarity labels built in preprocessing.py
65 |
66 | # Define Training procedure
 67 |         ### The batch_norm moving_mean & moving_var update ops must run alongside the training op, hence the control dependency below
 68 |         update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
 69 |         with tf.control_dependencies(update_ops):
 70 |             # Ensures that we execute the update_ops before performing the train step
 71 |             global_step = tf.Variable(0, name="global_step", trainable=False)
 72 |             optimizer = tf.train.AdamOptimizer(1e-3)
 73 |             grads_and_vars = optimizer.compute_gradients(cnn.loss)
 74 |             train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
75 |
76 | # Keep track of gradient values and sparsity (optional)
77 | grad_summaries = []
78 | for g, v in grads_and_vars:
79 | if g is not None:
 80 |                 grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
 81 |                 sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
 82 |                 grad_summaries.append(grad_hist_summary)
 83 |                 grad_summaries.append(sparsity_summary)
 84 |         grad_summaries_merged = tf.summary.merge(grad_summaries)
85 |
86 | # Output directory for models and summaries
87 | timestamp = str(int(time.time()))
88 | out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
89 | print("Writing to {}\n".format(out_dir))
90 |
91 | # Summaries for loss and accuracy
 92 |         loss_summary = tf.summary.scalar("loss", cnn.loss)
 93 |         acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)
 94 | 
 95 |         # Train Summaries
 96 |         train_summary_op = tf.summary.merge([loss_summary, acc_summary, grad_summaries_merged])
 97 |         train_summary_dir = os.path.join(out_dir, "summaries", "train")
 98 |         train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)
 99 | 
100 |         # Dev summaries
101 |         dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
102 |         dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
103 |         dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)
104 |
105 | # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
106 | checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
107 | checkpoint_prefix = os.path.join(checkpoint_dir, "model")
108 | if not os.path.exists(checkpoint_dir):
109 | os.makedirs(checkpoint_dir)
110 |         saver = tf.train.Saver(tf.global_variables())
111 |
112 | # Initialize all variables
113 |         sess.run(tf.global_variables_initializer())
114 |
115 | def train_step(x_batch, y_batch):
116 | """
117 | A single training step
118 | """
119 | feed_dict = {
120 | cnn.input_x: x_batch,
121 | cnn.input_y: y_batch,
122 | cnn.is_training: True # Update moving_mean, moving_var
123 | }
124 | _, step, summaries, loss, accuracy = sess.run(
125 | [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy],
126 | feed_dict)
127 | time_str = datetime.datetime.now().isoformat()
128 | print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
129 | train_summary_writer.add_summary(summaries, step)
130 |
131 | def dev_step(x_batch, y_batch, writer=None):
132 | """
133 | Evaluates model on a dev set
134 | """
135 | dev_size = len(x_batch)
136 | max_batch_size = 500
137 |             num_batches = dev_size // max_batch_size  # integer division; any remainder (< max_batch_size examples) is skipped
138 | acc = []
139 | losses = []
140 | print("Number of batches in dev set is " + str(num_batches))
141 | for i in range(num_batches):
142 | x_batch_dev, y_batch_dev = preprocessing.get_batched_one_hot(
143 | x_batch, y_batch, i * max_batch_size, (i + 1) * max_batch_size)
144 | feed_dict = {
145 | cnn.input_x: x_batch_dev,
146 | cnn.input_y: y_batch_dev,
147 | cnn.is_training: False # Use converged (fixed) updated moving_mean, moving_var
148 | }
149 | step, summaries, loss, accuracy = sess.run(
150 | [global_step, dev_summary_op, cnn.loss, cnn.accuracy],
151 | feed_dict)
152 | acc.append(accuracy)
153 | losses.append(loss)
154 | time_str = datetime.datetime.now().isoformat()
155 | print("batch " + str(i + 1) + " in dev >>" +
156 | " {}: loss {:g}, acc {:g}".format(time_str, loss, accuracy))
157 | if writer:
158 | writer.add_summary(summaries, step)
159 | print("\nMean accuracy=" + str(sum(acc)/len(acc)))
160 | print("Mean loss=" + str(sum(losses)/len(losses)))
161 |
162 |
163 | # Generate batches in one-hot-encoding format
164 | batches = preprocessing.batch_iter(x_train, y_train, FLAGS.batch_size, FLAGS.num_epochs)
165 | # Training loop. For each batch...
166 | for batch in batches:
167 | x_batch, y_batch = zip(*batch)
168 | train_step(x_batch, y_batch)
169 | current_step = tf.train.global_step(sess, global_step)
170 | if current_step % FLAGS.evaluate_every == 0:
171 | print("\nEvaluation:")
172 | dev_step(x_dev, y_dev, writer=dev_summary_writer)
173 | print("")
174 | if current_step % FLAGS.checkpoint_every == 0:
175 | path = saver.save(sess, checkpoint_prefix, global_step=current_step)
176 | print("Saved model checkpoint to {}\n".format(path))
177 |
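178 | # Note: to reload the most recent checkpoint later (e.g. to continue evaluation),
179 | # one option is to reuse the saver defined above:
180 | #     ckpt = tf.train.latest_checkpoint(checkpoint_dir)
181 | #     saver.restore(sess, ckpt)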
--------------------------------------------------------------------------------