├── Selection_134.png ├── Selection_135.png ├── Selection_136.png ├── README.md ├── preprocessing.py ├── model_densenet.py └── train.py /Selection_134.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lethienhoa/DenseNet-NLP/HEAD/Selection_134.png -------------------------------------------------------------------------------- /Selection_135.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lethienhoa/DenseNet-NLP/HEAD/Selection_135.png -------------------------------------------------------------------------------- /Selection_136.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lethienhoa/DenseNet-NLP/HEAD/Selection_136.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Very Deep Convolutional Networks for Natural Language Processing in Tensorflow 2 | 3 | This is the DenseNet implementation of the paper **Do Convolutional Networks need to be Deep for Text Classification ?** in Tensorflow. We study in the paper the importance of depth in convolutional models for text classification, either when character or word inputs are considered. We show on 5 standard text classification and sentiment analysis tasks that deep models indeed give better performances than shallow networks when the text input is represented as a sequence of characters. However, a simple shallow-and-wide network outperforms deep models such as DenseNet with word inputs. Our shallow word model further establishes new state-of-the-art performances on two datasets: Yelp Binary (95.9\%) and Yelp Full (64.9\%). 4 | 5 | **Paper:** 6 | 7 | Hoa T. Le, Christophe Cerisara, Alexandre Denis. **Do Convolutional Networks need to be Deep for Text Classification ?**. Association for the Advancement of Artificial Intelligence 2018 (**AAAI-18**) Workshop on Affective Content Analysis. (https://arxiv.org/abs/1707.04108) 8 | 9 | @article{DBLP:journals/corr/LeCD17, 10 | author = {Hoa T. Le and 11 | Christophe Cerisara and 12 | Alexandre Denis}, 13 | title = {Do Convolutional Networks need to be Deep for Text Classification ?}, 14 | journal = {CoRR}, 15 | year = {2017} 16 | } 17 | 18 |

19 | 20 |

21 | 22 | **Results:** 23 | 24 |

25 | 26 |

27 | 28 |

29 | 30 |

31 | 32 | **Reference Source Codes:** 33 | https://github.com/dennybritz/cnn-text-classification-tf 34 | 35 | -------------------------------------------------------------------------------- /preprocessing.py: -------------------------------------------------------------------------------- 1 | # based on ideas from https://github.com/dennybritz/cnn-text-classification-tf 2 | 3 | import numpy as np 4 | import json 5 | 6 | alphabet = "abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}\n" 7 | char_seq_length = 144 # Twitter has only 140 characters. We pad 4 blanks characters more to the right of tweets to be conformed with the architecture of A. Conneau et al (2016) 8 | 9 | 10 | def load_yelp(alphabet): 11 | examples = [] 12 | labels = [] 13 | with open('./yelp-review-dataset/yelp_academic_dataset_review.json') as f: 14 | i = 0 15 | for line in f: 16 | review = json.loads(line) 17 | stars = review["stars"] 18 | text = review["text"] 19 | if stars != 3: 20 | padded = pad_sentence(list(text.lower())) 21 | text_int8_repr = string_to_int8_conversion(padded, alphabet) 22 | if stars == 1 or stars == 2: 23 | labels.append([1, 0]) 24 | examples.append(text_int8_repr) 25 | elif stars == 4 or stars == 5: 26 | labels.append([0, 1]) 27 | examples.append(text_int8_repr) 28 | i += 1 29 | if i % 10000 == 0: 30 | print("Non-neutral instances processed: " + str(i)) 31 | return examples, labels 32 | 33 | def pad_sentence(char_seq, padding_char=" "): 34 | num_padding = char_seq_length - len(char_seq) 35 | new_char_seq = char_seq + [padding_char] * num_padding 36 | return new_char_seq 37 | 38 | 39 | def string_to_int8_conversion(char_seq, alphabet): 40 | x = np.array([alphabet.find(char) for char in char_seq], dtype=np.int8) 41 | return x 42 | 43 | 44 | def get_batched_one_hot(char_seqs_indices, labels, start_index, end_index): 45 | x_batch = char_seqs_indices[start_index:end_index] 46 | y_batch = labels[start_index:end_index] 47 | x_batch_one_hot = np.zeros(shape=[len(x_batch), len(alphabet), len(x_batch[0]), 1]) 48 | for example_i, char_seq_indices in enumerate(x_batch): 49 | for char_pos_in_seq, char_seq_char_ind in enumerate(char_seq_indices): 50 | if char_seq_char_ind != -1: 51 | x_batch_one_hot[example_i][char_seq_char_ind][char_pos_in_seq][0] = 1 52 | return [x_batch_one_hot, y_batch] 53 | 54 | 55 | def load_data(): 56 | examples, labels = load_yelp(alphabet) 57 | x = np.array(examples, dtype=np.int8) 58 | y = np.array(labels, dtype=np.int8) 59 | print("x_char_seq_ind=" + str(x.shape)) 60 | print("y shape=" + str(y.shape)) 61 | return [x, y] 62 | 63 | 64 | def batch_iter(x, y, batch_size, num_epochs, shuffle=True): 65 | """ 66 | Generates a batch iterator for a dataset. 67 | """ 68 | # data = np.array(data) 69 | data_size = len(x) 70 | num_batches_per_epoch = int(data_size/batch_size) + 1 71 | for epoch in range(num_epochs): 72 | print("In epoch >> " + str(epoch + 1)) 73 | print("num batches per epoch is: " + str(num_batches_per_epoch)) 74 | # Shuffle the data at each epoch 75 | if shuffle: 76 | shuffle_indices = np.random.permutation(np.arange(data_size)) 77 | x_shuffled = x[shuffle_indices] 78 | y_shuffled = y[shuffle_indices] 79 | else: 80 | x_shuffled = x 81 | y_shuffled = y 82 | for batch_num in range(num_batches_per_epoch): 83 | start_index = batch_num * batch_size 84 | end_index = min((batch_num + 1) * batch_size, data_size) 85 | x_batch, y_batch = get_batched_one_hot(x_shuffled, y_shuffled, start_index, end_index) 86 | batch = list(zip(x_batch, y_batch)) 87 | yield batch 88 | -------------------------------------------------------------------------------- /model_densenet.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow.contrib.slim as slim 3 | from tensorflow.python.ops import math_ops 4 | 5 | alphabet = "abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}" 6 | sequence_max_length = 1014 7 | 8 | class model(): 9 | """ 10 | DenseNet Model for Text Classification. 11 | """ 12 | 13 | def __init__(self, num_classes, cnn_filter_size=(3,3,3,3), pooling_filter_size=(2,2,2,2), num_filters_per_size=(64,128,256,512), num_rep_block=(4,4,4,4), num_quantized_chars=len(alphabet)): 14 | 15 | def denseBlock(input_layer, num_filters_per_size_i, cnn_filter_size_i, i, num_rep_block_i): 16 | with tf.variable_scope("dense_unit_%s" % i): 17 | nodes = [] 18 | a = slim.conv2d(input_layer, num_filters_per_size_i, [1, cnn_filter_size_i], weights_initializer=tf.contrib.layers.variance_scaling_initializer(factor=1.0, mode='FAN_AVG', uniform=True), normalizer_fn=slim.batch_norm) 19 | nodes.append(a) 20 | print a.get_shape() 21 | for z in range(num_rep_block_i-1): 22 | #b = slim.conv2d(tf.concat(3,nodes), num_filters_per_size_i, [1, cnn_filter_size_i], weights_initializer=tf.contrib.layers.variance_scaling_initializer(factor=1.0, mode='FAN_AVG', uniform=True), normalizer_fn=slim.batch_norm) 23 | b = slim.conv2d(tf.concat(nodes,3), num_filters_per_size_i, [1, cnn_filter_size_i], weights_initializer=tf.contrib.layers.variance_scaling_initializer(factor=1.0, mode='FAN_AVG', uniform=True), normalizer_fn=slim.batch_norm) 24 | nodes.append(b) 25 | print b.get_shape() 26 | print "" 27 | return b 28 | 29 | self.input_x = tf.placeholder(tf.float32, [None, num_quantized_chars, sequence_max_length, 1], name="input_x") 30 | self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y") 31 | self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob") 32 | 33 | # Input Dim : 70 x 176 x 1 34 | 35 | # ================ First Conv Layer ================ 36 | h = slim.conv2d(self.input_x, num_filters_per_size[0], [num_quantized_chars, cnn_filter_size[0]], weights_initializer=tf.contrib.layers.variance_scaling_initializer(factor=1.0, mode='FAN_AVG', uniform=True), normalizer_fn=slim.batch_norm, scope = 'conv0', padding='VALID') 37 | 38 | # Output Dim : 1 x 176 x 64 39 | 40 | # ================ Conv Block 64, 128, 256, 512 ================= 41 | for i in range(0,len(num_filters_per_size)): 42 | h = denseBlock(h, num_filters_per_size[i], cnn_filter_size[i], i, num_rep_block[i]) 43 | 44 | # Transition Layer 45 | if i<>len(num_filters_per_size)-1: 46 | h = slim.conv2d(h, num_filters_per_size[i+1], [1, cnn_filter_size[i]], weights_initializer=tf.contrib.layers.variance_scaling_initializer(factor=1.0, mode='FAN_AVG', uniform=True), normalizer_fn=slim.batch_norm, scope='conv-last-%s' % i) 47 | 48 | # Max pooling 1/2 49 | h = slim.max_pool2d(h, [1,pooling_filter_size[i]], stride=pooling_filter_size[i], scope='pool_%s' % i) 50 | print h.get_shape() 51 | 52 | # Output Dim : 1 x 8 x 512 53 | 54 | # ================ Layer FC ================ 55 | # Output Dim : 1 x 4096 56 | 57 | # Max pooling filtersize = 8, stride = 8 58 | h = slim.max_pool2d(h, [1, 8], stride=8, scope='pool_final') 59 | h = slim.flatten(h) 60 | print h.get_shape() 61 | 62 | # Global avg max pooling 63 | #h = math_ops.reduce_mean(h, [1, 2], name='pool5', keep_dims=True) 64 | #h = slim.flatten(h) 65 | #print "" 66 | #print h.get_shape() 67 | 68 | h = slim.fully_connected(h, 2048, activation_fn=None, scope='FC1') 69 | print h.get_shape() 70 | h = slim.fully_connected(h, 2048, activation_fn=None, scope='FC2') 71 | print h.get_shape() 72 | scores = slim.fully_connected(h, num_classes, activation_fn=None, scope='output') 73 | print scores.get_shape() 74 | 75 | self.scores = scores 76 | pred1D = tf.argmax(scores, 1, name="predictions") 77 | self.predictions = pred1D 78 | y1D = tf.argmax(self.input_y, 1) 79 | 80 | # ================ Loss and Accuracy ================ 81 | # CalculateMean cross-entropy loss 82 | with tf.name_scope("evaluate"): 83 | losses = tf.nn.softmax_cross_entropy_with_logits(labels=self.input_y, logits=scores) 84 | self.loss = tf.reduce_mean(losses) 85 | 86 | correct_predictions = tf.equal(pred1D, y1D) 87 | self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy") 88 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # based on ideas in https://github.com/dennybritz/cnn-text-classification-tf 3 | 4 | import tensorflow as tf 5 | import numpy as np 6 | import os 7 | import time 8 | import datetime 9 | import preprocessing 10 | from model_densenet import model 11 | 12 | # Parameters 13 | # ================================================== 14 | 15 | # Model Hyperparameters 16 | tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)") 17 | tf.flags.DEFINE_float("l2_reg_lambda", 0.0, "L2 regularizaion lambda (default: 0.0)") 18 | 19 | # Training parameters 20 | tf.flags.DEFINE_integer("batch_size", 128, "Batch Size (default: 128)") 21 | tf.flags.DEFINE_integer("num_epochs", 50, "Number of training epochs (default: 200)") 22 | tf.flags.DEFINE_integer("evaluate_every", 5000, "Evaluate model on dev set after this many steps (default: 100)") 23 | tf.flags.DEFINE_integer("checkpoint_every", 1000, "Save model after this many steps (default: 100)") 24 | # Misc Parameters 25 | tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement") 26 | tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") 27 | 28 | FLAGS = tf.flags.FLAGS 29 | FLAGS._parse_flags() 30 | print("\nParameters:") 31 | for attr, value in sorted(FLAGS.__flags.items()): 32 | print("{}={}".format(attr.upper(), value)) 33 | print("") 34 | 35 | 36 | # Data Preparation 37 | # ================================================== 38 | 39 | # Load data 40 | print("Loading data...") 41 | x, y = preprocessing.load_data() 42 | # Randomly shuffle data 43 | np.random.seed(10) 44 | shuffle_indices = np.random.permutation(np.arange(len(y))) 45 | x_shuffled = x[shuffle_indices] 46 | y_shuffled = y[shuffle_indices] 47 | # Split train/test set 48 | n_dev_samples = 200000 49 | # TODO: Create a cross validation procedure 50 | x_train, x_dev = x_shuffled[:-n_dev_samples], x_shuffled[-n_dev_samples:] 51 | y_train, y_dev = y_shuffled[:-n_dev_samples], y_shuffled[-n_dev_samples:] 52 | print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev))) 53 | 54 | 55 | # Training 56 | # ================================================== 57 | 58 | with tf.Graph().as_default(): 59 | session_conf = tf.ConfigProto( 60 | allow_soft_placement=FLAGS.allow_soft_placement, 61 | log_device_placement=FLAGS.log_device_placement) 62 | sess = tf.Session(config=session_conf) 63 | with sess.as_default(): 64 | cnn = model() 65 | 66 | # Define Training procedure 67 | ### To update the computation of moving_mean & moving_var, we must put it on the parent graph of minimizing loss 68 | update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) 69 | with tf.control_dependencies(update_ops): 70 | # Ensures that we execute the update_ops before performing the train 71 | global_step = tf.Variable(0, name="global_step", trainable=False) 72 | optimizer = tf.train.AdamOptimizer(1e-3) 73 | grads_and_vars = optimizer.compute_gradients(cnn.loss) 74 | train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) 75 | 76 | # Keep track of gradient values and sparsity (optional) 77 | grad_summaries = [] 78 | for g, v in grads_and_vars: 79 | if g is not None: 80 | grad_hist_summary = tf.histogram_summary("{}/grad/hist".format(v.name), g) 81 | sparsity_summary = tf.scalar_summary("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g)) 82 | grad_summaries.append(grad_hist_summary) 83 | grad_summaries.append(sparsity_summary) 84 | grad_summaries_merged = tf.merge_summary(grad_summaries) 85 | 86 | # Output directory for models and summaries 87 | timestamp = str(int(time.time())) 88 | out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp)) 89 | print("Writing to {}\n".format(out_dir)) 90 | 91 | # Summaries for loss and accuracy 92 | loss_summary = tf.scalar_summary("loss", cnn.loss) 93 | acc_summary = tf.scalar_summary("accuracy", cnn.accuracy) 94 | 95 | # Train Summaries 96 | train_summary_op = tf.merge_summary([loss_summary, acc_summary, grad_summaries_merged]) 97 | train_summary_dir = os.path.join(out_dir, "summaries", "train") 98 | train_summary_writer = tf.train.SummaryWriter(train_summary_dir, sess.graph) 99 | 100 | # Dev summaries 101 | dev_summary_op = tf.merge_summary([loss_summary, acc_summary]) 102 | dev_summary_dir = os.path.join(out_dir, "summaries", "dev") 103 | dev_summary_writer = tf.train.SummaryWriter(dev_summary_dir, sess.graph) 104 | 105 | # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it 106 | checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints")) 107 | checkpoint_prefix = os.path.join(checkpoint_dir, "model") 108 | if not os.path.exists(checkpoint_dir): 109 | os.makedirs(checkpoint_dir) 110 | saver = tf.train.Saver(tf.all_variables()) 111 | 112 | # Initialize all variables 113 | sess.run(tf.initialize_all_variables()) 114 | 115 | def train_step(x_batch, y_batch): 116 | """ 117 | A single training step 118 | """ 119 | feed_dict = { 120 | cnn.input_x: x_batch, 121 | cnn.input_y: y_batch, 122 | cnn.is_training: True # Update moving_mean, moving_var 123 | } 124 | _, step, summaries, loss, accuracy = sess.run( 125 | [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy], 126 | feed_dict) 127 | time_str = datetime.datetime.now().isoformat() 128 | print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy)) 129 | train_summary_writer.add_summary(summaries, step) 130 | 131 | def dev_step(x_batch, y_batch, writer=None): 132 | """ 133 | Evaluates model on a dev set 134 | """ 135 | dev_size = len(x_batch) 136 | max_batch_size = 500 137 | num_batches = dev_size/max_batch_size 138 | acc = [] 139 | losses = [] 140 | print("Number of batches in dev set is " + str(num_batches)) 141 | for i in range(num_batches): 142 | x_batch_dev, y_batch_dev = preprocessing.get_batched_one_hot( 143 | x_batch, y_batch, i * max_batch_size, (i + 1) * max_batch_size) 144 | feed_dict = { 145 | cnn.input_x: x_batch_dev, 146 | cnn.input_y: y_batch_dev, 147 | cnn.is_training: False # Use converged (fixed) updated moving_mean, moving_var 148 | } 149 | step, summaries, loss, accuracy = sess.run( 150 | [global_step, dev_summary_op, cnn.loss, cnn.accuracy], 151 | feed_dict) 152 | acc.append(accuracy) 153 | losses.append(loss) 154 | time_str = datetime.datetime.now().isoformat() 155 | print("batch " + str(i + 1) + " in dev >>" + 156 | " {}: loss {:g}, acc {:g}".format(time_str, loss, accuracy)) 157 | if writer: 158 | writer.add_summary(summaries, step) 159 | print("\nMean accuracy=" + str(sum(acc)/len(acc))) 160 | print("Mean loss=" + str(sum(losses)/len(losses))) 161 | 162 | 163 | # Generate batches in one-hot-encoding format 164 | batches = preprocessing.batch_iter(x_train, y_train, FLAGS.batch_size, FLAGS.num_epochs) 165 | # Training loop. For each batch... 166 | for batch in batches: 167 | x_batch, y_batch = zip(*batch) 168 | train_step(x_batch, y_batch) 169 | current_step = tf.train.global_step(sess, global_step) 170 | if current_step % FLAGS.evaluate_every == 0: 171 | print("\nEvaluation:") 172 | dev_step(x_dev, y_dev, writer=dev_summary_writer) 173 | print("") 174 | if current_step % FLAGS.checkpoint_every == 0: 175 | path = saver.save(sess, checkpoint_prefix, global_step=current_step) 176 | print("Saved model checkpoint to {}\n".format(path)) 177 | --------------------------------------------------------------------------------