├── requirements.txt
├── sample_data
│   └── sample_data.zip
├── download_glove.py
├── models
│   ├── net_utils.py
│   ├── naive_rnn.py
│   └── attention_rnn.py
├── test.py
├── data_utils.py
├── README.md
└── train.py

/requirements.txt:
--------------------------------------------------------------------------------
gensim==3.3.0
wget==3.2
nltk==3.2.5
scikit-learn==0.19.1
--------------------------------------------------------------------------------

/sample_data/sample_data.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongjun-Lee/rnn-text-classification-tf/HEAD/sample_data/sample_data.zip
--------------------------------------------------------------------------------

/download_glove.py:
--------------------------------------------------------------------------------
import wget
import os
import zipfile


glove_dir = "glove"
glove_url = "http://nlp.stanford.edu/data/glove.6B.zip"

if not os.path.exists(glove_dir):
    os.mkdir(glove_dir)

# Download GloVe vectors
wget.download(glove_url, out=glove_dir)

# Extract the GloVe archive
with zipfile.ZipFile(os.path.join(glove_dir, "glove.6B.zip"), "r") as z:
    z.extractall(glove_dir)
--------------------------------------------------------------------------------

/models/net_utils.py:
--------------------------------------------------------------------------------
from gensim.models.keyedvectors import KeyedVectors
from gensim.test.utils import get_tmpfile
from gensim.scripts.glove2word2vec import glove2word2vec
import numpy as np


def get_init_embedding(reversed_dict, embedding_size):
    print("Loading Glove vectors...")
    glove_file = "glove/glove.6B.%dd.txt" % embedding_size
    word2vec_file = get_tmpfile("word2vec_format.vec")
    glove2word2vec(glove_file, word2vec_file)
    word_vectors = KeyedVectors.load_word2vec_format(word2vec_file)

    word_vec_list = list()
    for _, word in sorted(reversed_dict.items()):
        try:
            word_vec = word_vectors.word_vec(word)
        except KeyError:
            word_vec = np.zeros([embedding_size], dtype=np.float32)

        word_vec_list.append(word_vec)

    return np.array(word_vec_list)
--------------------------------------------------------------------------------
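A minimal usage sketch of `get_init_embedding` (assuming GloVe has already been downloaded and extracted via `download_glove.py`; the tiny vocabulary below is purely illustrative and not part of the repository):

```
from models.net_utils import get_init_embedding

# Index-to-word mapping in the same form that data_utils.build_dict produces.
reversed_dict = {0: "<pad>", 1: "<unk>", 2: "the", 3: "movie"}

# Rows are aligned with the dictionary indices; words missing from GloVe
# (such as the special tokens) fall back to zero vectors.
embedding_matrix = get_init_embedding(reversed_dict, embedding_size=50)
print(embedding_matrix.shape)  # (4, 50)
```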
/test.py:
--------------------------------------------------------------------------------
import tensorflow as tf
import argparse
from data_utils import build_dict, build_dataset, batch_iter


def add_arguments(parser):
    parser.add_argument("--test_tsv", type=str, default="sample_data/test.tsv", help="Test tsv file.")
    parser.add_argument("--checkpoint_dir", type=str, default="saved_model", help="Checkpoint dir for saved model.")
    parser.add_argument("--batch_size", type=int, default=64, help="Batch size.")


parser = argparse.ArgumentParser()
add_arguments(parser)
args = parser.parse_args()

print("Loading dictionary...")
word_dict, reversed_dict, document_max_len = build_dict(args.test_tsv, is_train=False)
print("Building test dataset...")
test_x, test_y = build_dataset(args.test_tsv, word_dict, document_max_len)

checkpoint_file = tf.train.latest_checkpoint(args.checkpoint_dir)
graph = tf.Graph()
with graph.as_default():
    with tf.Session() as sess:
        saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
        saver.restore(sess, checkpoint_file)

        x = graph.get_operation_by_name("x").outputs[0]
        y = graph.get_operation_by_name("y").outputs[0]
        keep_prob = graph.get_operation_by_name("keep_prob").outputs[0]
        accuracy = graph.get_operation_by_name("accuracy/accuracy").outputs[0]

        batches = batch_iter(test_x, test_y, args.batch_size, 1)
        sum_accuracy, cnt = 0, 0
        for batch_x, batch_y in batches:
            feed_dict = {
                x: batch_x,
                y: batch_y,
                keep_prob: 1.0
            }

            accuracy_out = sess.run(accuracy, feed_dict=feed_dict)
            sum_accuracy += accuracy_out
            cnt += 1

        print("Test Accuracy : {0}".format(sum_accuracy / cnt))
--------------------------------------------------------------------------------

/data_utils.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
import re
import collections
import pickle


def clean_str(text):
    # Strip @-mentions before removing special characters; otherwise the "@"
    # is already gone and the mention pattern can never match.
    text = re.sub(r"@[A-Za-z0-9]+", " ", text)
    text = re.sub(r"[^A-Za-z0-9(),!?\'\`\"]", " ", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = text.strip().lower()

    return text


def build_dict(train_tsv, is_train=True):
    if is_train:
        df = pd.read_csv(train_tsv, sep="\t")
        sentences = df["sentence"]

        words = list()
        for sentence in sentences:
            for word in word_tokenize(clean_str(sentence)):
                words.append(word)

        word_counter = collections.Counter(words).most_common()
        word_dict = dict()
        word_dict["<pad>"] = 0
        word_dict["<unk>"] = 1
        for word, _ in word_counter:
            word_dict[word] = len(word_dict)

        with open("word_dict.pickle", "wb") as f:
            pickle.dump(word_dict, f)

    else:
        with open("word_dict.pickle", "rb") as f:
            word_dict = pickle.load(f)

    reversed_dict = dict(zip(word_dict.values(), word_dict.keys()))
    document_max_len = 20

    return word_dict, reversed_dict, document_max_len


def build_dataset(tsv, word_dict, document_max_len):
    df = pd.read_csv(tsv, sep="\t")

    x = list(map(lambda d: word_tokenize(clean_str(d)), df["sentence"]))
    x = list(map(lambda d: list(map(lambda w: word_dict.get(w, word_dict["<unk>"]), d)), x))
    x = list(map(lambda d: d[:document_max_len], x))
    x = list(map(lambda d: d + (document_max_len - len(d)) * [word_dict["<pad>"]], x))

    y = list(df["sentiment"])

    return x, y


def batch_iter(inputs, outputs, batch_size, num_epochs):
    inputs = np.array(inputs)
    outputs = np.array(outputs)

    num_batches_per_epoch = (len(inputs) - 1) // batch_size + 1
    for epoch in range(num_epochs):
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, len(inputs))
            yield inputs[start_index:end_index], outputs[start_index:end_index]
--------------------------------------------------------------------------------
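As a quick illustration of the batching helper above, the following sketch (toy data only, assuming the repository's dependencies are installed) shows that `batch_iter` simply slices the inputs into consecutive batches for the requested number of epochs:

```
from data_utils import batch_iter

toy_x = [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12], [13, 14, 15]]
toy_y = [0, 1, 0, 1, 1]

# With batch_size=2 and num_epochs=1 this yields batches of shape
# (2, 3), (2, 3) and (1, 3); the final batch is simply smaller.
for batch_x, batch_y in batch_iter(toy_x, toy_y, batch_size=2, num_epochs=1):
    print(batch_x.shape, batch_y.shape)
```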
/models/naive_rnn.py:
--------------------------------------------------------------------------------
import tensorflow as tf
from tensorflow.contrib import rnn
from models.net_utils import get_init_embedding


class NaiveRNN(object):
    def __init__(self, reversed_dict, document_max_len, num_class, args):
        self.vocabulary_size = len(reversed_dict)
        self.embedding_size = args.embedding_size
        self.num_hidden = args.num_hidden
        self.num_layers = args.num_layers
        self.learning_rate = args.learning_rate

        self.x = tf.placeholder(tf.int32, [None, document_max_len], name="x")
        self.x_len = tf.reduce_sum(tf.sign(self.x), 1)
        self.y = tf.placeholder(tf.int32, [None], name="y")
        self.keep_prob = tf.placeholder(tf.float32, [], name="keep_prob")
        self.global_step = tf.Variable(0, trainable=False)

        with tf.name_scope("embedding"):
            if args.glove:
                init_embeddings = tf.constant(get_init_embedding(reversed_dict, self.embedding_size), dtype=tf.float32)
                self.embeddings = tf.get_variable("embeddings", initializer=init_embeddings, trainable=False)
            else:
                init_embeddings = tf.random_uniform([self.vocabulary_size, self.embedding_size])
                self.embeddings = tf.get_variable("embeddings", initializer=init_embeddings, trainable=True)
            self.x_emb = tf.nn.embedding_lookup(self.embeddings, self.x)

        with tf.name_scope("birnn"):
            fw_cells = [rnn.BasicLSTMCell(self.num_hidden) for _ in range(self.num_layers)]
            bw_cells = [rnn.BasicLSTMCell(self.num_hidden) for _ in range(self.num_layers)]
            fw_cells = [rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob) for cell in fw_cells]
            bw_cells = [rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob) for cell in bw_cells]

            self.rnn_outputs, _, _ = rnn.stack_bidirectional_dynamic_rnn(
                fw_cells, bw_cells, self.x_emb, sequence_length=self.x_len, dtype=tf.float32)
            self.last_output = self.rnn_outputs[:, -1, :]

        with tf.name_scope("output"):
            self.logits = tf.contrib.slim.fully_connected(self.last_output, num_class, activation_fn=None)
            self.predictions = tf.argmax(self.logits, -1, output_type=tf.int32)

        with tf.name_scope("loss"):
            self.loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=self.y))
            self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss, global_step=self.global_step)

        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions, self.y)
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")
--------------------------------------------------------------------------------
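For orientation, the tensor shapes that flow through `NaiveRNN` are roughly as follows, writing B for the batch size, E for `--embedding_size` and H for `--num_hidden` (the bidirectional RNN concatenates forward and backward outputs):

```
x           : [B, 20]          token ids (document_max_len = 20)
x_emb       : [B, 20, E]       embedding lookup
rnn_outputs : [B, 20, 2H]      forward and backward outputs concatenated
last_output : [B, 2H]          last time step, fed to the classifier
logits      : [B, num_class]
```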
/models/attention_rnn.py:
--------------------------------------------------------------------------------
import tensorflow as tf
from tensorflow.contrib import rnn
from models.net_utils import get_init_embedding


class AttentionRNN(object):
    def __init__(self, reversed_dict, document_max_len, num_class, args):
        self.vocabulary_size = len(reversed_dict)
        self.embedding_size = args.embedding_size
        self.num_hidden = args.num_hidden
        self.num_layers = args.num_layers
        self.learning_rate = args.learning_rate

        self.x = tf.placeholder(tf.int32, [None, document_max_len], name="x")
        self.x_len = tf.reduce_sum(tf.sign(self.x), 1)
        self.y = tf.placeholder(tf.int32, [None], name="y")
        self.keep_prob = tf.placeholder(tf.float32, [], name="keep_prob")
        self.global_step = tf.Variable(0, trainable=False)

        with tf.name_scope("embedding"):
            if args.glove:
                init_embeddings = tf.constant(get_init_embedding(reversed_dict, self.embedding_size), dtype=tf.float32)
                self.embeddings = tf.get_variable("embeddings", initializer=init_embeddings, trainable=False)
            else:
                init_embeddings = tf.random_uniform([self.vocabulary_size, self.embedding_size])
                self.embeddings = tf.get_variable("embeddings", initializer=init_embeddings, trainable=True)
            self.x_emb = tf.nn.embedding_lookup(self.embeddings, self.x)

        with tf.name_scope("birnn"):
            fw_cells = [rnn.BasicLSTMCell(self.num_hidden) for _ in range(self.num_layers)]
            bw_cells = [rnn.BasicLSTMCell(self.num_hidden) for _ in range(self.num_layers)]
            fw_cells = [rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob) for cell in fw_cells]
            bw_cells = [rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob) for cell in bw_cells]

            self.rnn_outputs, _, _ = rnn.stack_bidirectional_dynamic_rnn(
                fw_cells, bw_cells, self.x_emb, sequence_length=self.x_len, dtype=tf.float32)

        with tf.name_scope("attention"):
            self.attention_score = tf.nn.softmax(tf.contrib.slim.fully_connected(self.rnn_outputs, 1))
            self.attention_out = tf.squeeze(
                tf.matmul(tf.transpose(self.rnn_outputs, perm=[0, 2, 1]), self.attention_score),
                axis=-1)

        with tf.name_scope("output"):
            self.logits = tf.contrib.slim.fully_connected(self.attention_out, num_class, activation_fn=None)
            self.predictions = tf.argmax(self.logits, -1, output_type=tf.int32)

        with tf.name_scope("loss"):
            self.loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=self.y))
            self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss, global_step=self.global_step)

        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions, self.y)
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
# text-classification-tensorflow
TensorFlow implementation of **Attention-based Bidirectional RNN** text classification, with a naive Bidirectional RNN model as a baseline.


## Requirements
- Python 3
- TensorFlow 1.x (the models rely on `tf.contrib`)
- pip install -r requirements.txt


## Usage

### Prepare Data
We use a pre-processed version of the [Twitter Sentiment Classification Data](http://help.sentiment140.com/for-students). To use the sample data (100K train / 30K test),
```
$ unzip sample_data/sample_data.zip -d sample_data
```

To use the full data (1.2M train / 0.4M test), download it from [this Google Drive link](https://drive.google.com/file/d/1aMt-6OCN_mEDlmRX4bymk5ZNEatsVXF-/view?usp=sharing).
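Both the sample and the full data are plain tab-separated files with a `sentence` and a `sentiment` column (`data_utils.py` reads them with `pd.read_csv(..., sep="\t")`). The rows below are made-up examples shown only to illustrate the format, with binary 0/1 labels as implied by `num_class = 2` in `train.py`:
```
sentence	sentiment
i love this new phone , it works great !	1
worst customer service ever , never again	0
```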
To use GloVe pre-trained embeddings, download them via
```
$ python download_glove.py
```

### Train
To train the model with the sample data,
```
$ python train.py
```
The training data is split into a train set (85%) and a validation set (15%). Every 2,000 steps, classification accuracy is measured on the validation set and the best model is saved.

To use GloVe pre-trained vectors as the initial embedding,
```
$ python train.py --glove
```

#### Additional Hyperparameters
```
$ python train.py -h
usage: train.py [-h] [--train_tsv TRAIN_TSV] [--model MODEL] [--glove]
                [--embedding_size EMBEDDING_SIZE] [--num_hidden NUM_HIDDEN]
                [--num_layers NUM_LAYERS] [--learning_rate LEARNING_RATE]
                [--batch_size BATCH_SIZE] [--num_epochs NUM_EPOCHS]
                [--keep_prob KEEP_PROB] [--checkpoint_dir CHECKPOINT_DIR]

optional arguments:
  -h, --help            show this help message and exit
  --train_tsv TRAIN_TSV
                        Train tsv file.
  --model MODEL         naive | att
  --glove               Use glove as initial word embedding.
  --embedding_size EMBEDDING_SIZE
                        Word embedding size. (For glove, use 50 | 100 | 200 | 300)
  --num_hidden NUM_HIDDEN
                        RNN Network size.
  --num_layers NUM_LAYERS
                        RNN Network depth.
  --learning_rate LEARNING_RATE
                        Learning rate.
  --batch_size BATCH_SIZE
                        Batch size.
  --num_epochs NUM_EPOCHS
                        Number of epochs.
  --keep_prob KEEP_PROB
                        Dropout keep prob.
  --checkpoint_dir CHECKPOINT_DIR
                        Checkpoint directory.
```
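For example, to train the attention model with 100-dimensional GloVe embeddings and a slightly larger network (the values below are illustrative, not tuned settings):
```
$ python train.py --model att --glove --embedding_size 100 --num_hidden 128 --num_layers 3
```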

### Test
To measure classification accuracy on the test data,
```
$ python test.py
```

To use custom data,
```
$ python test.py --test_tsv=<custom_tsv_path>
```

### Sample Test Results
Trained and tested with the [full data](https://drive.google.com/file/d/1aMt-6OCN_mEDlmRX4bymk5ZNEatsVXF-/view?usp=sharing) using default hyper-parameters:

Model | Naive | Naive(+Glove) | Attention | Attention(+Glove)
:---: | :---: | :---: | :---: | :---:
Accuracy | 0.574 | 0.578 | 0.811 | 0.820


## References
- [Dataset](http://help.sentiment140.com/for-students)
- [dennybritz/cnn-text-classification-tf](https://github.com/dennybritz/cnn-text-classification-tf)
- [Attention-Based Bidirectional Long Short-Term Memory Networks for Relation Classification](http://www.aclweb.org/anthology/P16-2034)
--------------------------------------------------------------------------------

/train.py:
--------------------------------------------------------------------------------
import argparse
import os
import tensorflow as tf
from sklearn.model_selection import train_test_split
from models.naive_rnn import NaiveRNN
from models.attention_rnn import AttentionRNN
from data_utils import build_dict, build_dataset, batch_iter


def add_arguments(parser):
    parser.add_argument("--train_tsv", type=str, default="sample_data/train.tsv", help="Train tsv file.")
    parser.add_argument("--model", type=str, default="att", help="naive | att")
    parser.add_argument("--glove", action="store_true", help="Use glove as initial word embedding.")
    parser.add_argument("--embedding_size", type=int, default=300,
                        help="Word embedding size. (For glove, use 50 | 100 | 200 | 300)")

    parser.add_argument("--num_hidden", type=int, default=100, help="RNN Network size.")
    parser.add_argument("--num_layers", type=int, default=2, help="RNN Network depth.")

    parser.add_argument("--learning_rate", type=float, default=1e-3, help="Learning rate.")
    parser.add_argument("--batch_size", type=int, default=64, help="Batch size.")
    parser.add_argument("--num_epochs", type=int, default=10, help="Number of epochs.")
    parser.add_argument("--keep_prob", type=float, default=0.8, help="Dropout keep prob.")
    parser.add_argument("--checkpoint_dir", type=str, default="saved_model", help="Checkpoint directory.")


parser = argparse.ArgumentParser()
add_arguments(parser)
args = parser.parse_args()

num_class = 2
if not os.path.exists(args.checkpoint_dir):
    os.mkdir(args.checkpoint_dir)

print("Building dictionary...")
word_dict, reversed_dict, document_max_len = build_dict(args.train_tsv)
print("Building dataset...")
x, y = build_dataset(args.train_tsv, word_dict, document_max_len)
# Split into train and validation data
train_x, valid_x, train_y, valid_y = train_test_split(x, y, test_size=0.15)


with tf.Session() as sess:
    if args.model == "naive":
        model = NaiveRNN(reversed_dict, document_max_len, num_class, args)
    elif args.model == "att":
        model = AttentionRNN(reversed_dict, document_max_len, num_class, args)
    else:
        raise NotImplementedError()

    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver(tf.global_variables())

    train_batches = batch_iter(train_x, train_y, args.batch_size, args.num_epochs)
    num_batches_per_epoch = (len(train_x) - 1) // args.batch_size + 1
    max_accuracy = 0

    for x_batch, y_batch in train_batches:
        train_feed_dict = {
            model.x: x_batch,
            model.y: y_batch,
            model.keep_prob: args.keep_prob
        }

        _, step, loss = sess.run([model.optimizer, model.global_step, model.loss], feed_dict=train_feed_dict)

        if step % 100 == 0:
            print("step {0}: loss = {1}".format(step, loss))

        if step % 2000 == 0:
            # Measure accuracy on the validation data.
            valid_batches = batch_iter(valid_x, valid_y, args.batch_size, 1)
            sum_accuracy, cnt = 0, 0

            for valid_x_batch, valid_y_batch in valid_batches:
                valid_feed_dict = {
                    model.x: valid_x_batch,
                    model.y: valid_y_batch,
                    model.keep_prob: 1.0
                }

                accuracy = sess.run(model.accuracy, feed_dict=valid_feed_dict)
                sum_accuracy += accuracy
                cnt += 1
            valid_accuracy = sum_accuracy / cnt

            print("\nValidation Accuracy = {0}\n".format(valid_accuracy))

            # Save the best model so far
            if valid_accuracy > max_accuracy:
                max_accuracy = valid_accuracy
                saver.save(sess, "{0}/{1}.ckpt".format(args.checkpoint_dir, args.model), global_step=step)
                print("Model is saved.\n")
--------------------------------------------------------------------------------