├── requirements.txt
├── sample_data
│   └── sample_data.zip
├── download_glove.py
├── models
│   ├── net_utils.py
│   ├── naive_rnn.py
│   └── attention_rnn.py
├── test.py
├── data_utils.py
├── README.md
└── train.py

/requirements.txt:
--------------------------------------------------------------------------------
gensim==3.3.0
wget==3.2
nltk==3.2.5
scikit-learn==0.19.1
--------------------------------------------------------------------------------

/sample_data/sample_data.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongjun-Lee/rnn-text-classification-tf/HEAD/sample_data/sample_data.zip
--------------------------------------------------------------------------------

/download_glove.py:
--------------------------------------------------------------------------------
import wget
import os
import zipfile


glove_dir = "glove"
glove_url = "http://nlp.stanford.edu/data/glove.6B.zip"

if not os.path.exists(glove_dir):
    os.mkdir(glove_dir)

# Download GloVe vectors
wget.download(glove_url, out=glove_dir)

# Extract the GloVe archive
with zipfile.ZipFile(os.path.join(glove_dir, "glove.6B.zip"), "r") as z:
    z.extractall(glove_dir)
--------------------------------------------------------------------------------

/models/net_utils.py:
--------------------------------------------------------------------------------
from gensim.models.keyedvectors import KeyedVectors
from gensim.test.utils import get_tmpfile
from gensim.scripts.glove2word2vec import glove2word2vec
import numpy as np


def get_init_embedding(reversed_dict, embedding_size):
    print("Loading Glove vectors...")
    glove_file = "glove/glove.6B.%dd.txt" % embedding_size
    word2vec_file = get_tmpfile("word2vec_format.vec")
    glove2word2vec(glove_file, word2vec_file)
    word_vectors = KeyedVectors.load_word2vec_format(word2vec_file)

    word_vec_list = list()
    for _, word in sorted(reversed_dict.items()):
        try:
            word_vec = word_vectors.word_vec(word)
        except KeyError:
            word_vec = np.zeros([embedding_size], dtype=np.float32)

        word_vec_list.append(word_vec)

    return np.array(word_vec_list)
--------------------------------------------------------------------------------
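A minimal usage sketch of `get_init_embedding` (assuming GloVe has already been downloaded and extracted via `download_glove.py`; the tiny vocabulary below is purely illustrative and not part of the repository):

```
from models.net_utils import get_init_embedding

# Index-to-word mapping in the same form that data_utils.build_dict produces.
reversed_dict = {0: "<pad>", 1: "<unk>", 2: "the", 3: "movie"}

# Rows are aligned with the dictionary indices; words missing from GloVe
# (such as the special tokens) fall back to zero vectors.
embedding_matrix = get_init_embedding(reversed_dict, embedding_size=50)
print(embedding_matrix.shape)  # (4, 50)
```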
/test.py:
--------------------------------------------------------------------------------
import tensorflow as tf
import argparse
from data_utils import build_dict, build_dataset, batch_iter


def add_arguments(parser):
    parser.add_argument("--test_tsv", type=str, default="sample_data/test.tsv", help="Test tsv file.")
    parser.add_argument("--checkpoint_dir", type=str, default="saved_model", help="Checkpoint dir for saved model.")
    parser.add_argument("--batch_size", type=int, default=64, help="Batch size.")


parser = argparse.ArgumentParser()
add_arguments(parser)
args = parser.parse_args()

print("Loading dictionary...")
word_dict, reversed_dict, document_max_len = build_dict(args.test_tsv, is_train=False)
print("Building test dataset...")
test_x, test_y = build_dataset(args.test_tsv, word_dict, document_max_len)

checkpoint_file = tf.train.latest_checkpoint(args.checkpoint_dir)
graph = tf.Graph()
with graph.as_default():
    with tf.Session() as sess:
        saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
        saver.restore(sess, checkpoint_file)

        x = graph.get_operation_by_name("x").outputs[0]
        y = graph.get_operation_by_name("y").outputs[0]
        keep_prob = graph.get_operation_by_name("keep_prob").outputs[0]
        accuracy = graph.get_operation_by_name("accuracy/accuracy").outputs[0]

        batches = batch_iter(test_x, test_y, args.batch_size, 1)
        sum_accuracy, cnt = 0, 0
        for batch_x, batch_y in batches:
            feed_dict = {
                x: batch_x,
                y: batch_y,
                keep_prob: 1.0
            }

            accuracy_out = sess.run(accuracy, feed_dict=feed_dict)
            sum_accuracy += accuracy_out
            cnt += 1

        print("Test Accuracy : {0}".format(sum_accuracy / cnt))
--------------------------------------------------------------------------------

/data_utils.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
import re
import collections
import pickle


def clean_str(text):
    # Strip @-mentions before removing special characters; otherwise the "@"
    # is already gone and the mention pattern can never match.
    text = re.sub(r"@[A-Za-z0-9]+", " ", text)
    text = re.sub(r"[^A-Za-z0-9(),!?\'\`\"]", " ", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = text.strip().lower()

    return text


def build_dict(train_tsv, is_train=True):
    if is_train:
        df = pd.read_csv(train_tsv, sep="\t")
        sentences = df["sentence"]

        words = list()
        for sentence in sentences:
            for word in word_tokenize(clean_str(sentence)):
                words.append(word)

        word_counter = collections.Counter(words).most_common()
        word_dict = dict()
        word_dict["<pad>"] = 0
        word_dict["<unk>"] = 1
        for word, _ in word_counter:
            word_dict[word] = len(word_dict)

        with open("word_dict.pickle", "wb") as f:
            pickle.dump(word_dict, f)

    else:
        with open("word_dict.pickle", "rb") as f:
            word_dict = pickle.load(f)

    reversed_dict = dict(zip(word_dict.values(), word_dict.keys()))
    document_max_len = 20

    return word_dict, reversed_dict, document_max_len


def build_dataset(tsv, word_dict, document_max_len):
    df = pd.read_csv(tsv, sep="\t")

    x = list(map(lambda d: word_tokenize(clean_str(d)), df["sentence"]))
    x = list(map(lambda d: list(map(lambda w: word_dict.get(w, word_dict["<unk>"]), d)), x))
    x = list(map(lambda d: d[:document_max_len], x))
    x = list(map(lambda d: d + (document_max_len - len(d)) * [word_dict["<pad>"]], x))

    y = list(df["sentiment"])

    return x, y


def batch_iter(inputs, outputs, batch_size, num_epochs):
    inputs = np.array(inputs)
    outputs = np.array(outputs)

    num_batches_per_epoch = (len(inputs) - 1) // batch_size + 1
    for epoch in range(num_epochs):
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, len(inputs))
            yield inputs[start_index:end_index], outputs[start_index:end_index]
--------------------------------------------------------------------------------
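As a quick illustration of the batching helper above, the following sketch (toy data only, assuming the repository's dependencies are installed) shows that `batch_iter` simply slices the inputs into consecutive batches for the requested number of epochs:

```
from data_utils import batch_iter

toy_x = [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12], [13, 14, 15]]
toy_y = [0, 1, 0, 1, 1]

# With batch_size=2 and num_epochs=1 this yields batches of shape
# (2, 3), (2, 3) and (1, 3); the final batch is simply smaller.
for batch_x, batch_y in batch_iter(toy_x, toy_y, batch_size=2, num_epochs=1):
    print(batch_x.shape, batch_y.shape)
```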
/models/naive_rnn.py:
--------------------------------------------------------------------------------
import tensorflow as tf
from tensorflow.contrib import rnn
from models.net_utils import get_init_embedding


class NaiveRNN(object):
    def __init__(self, reversed_dict, document_max_len, num_class, args):
        self.vocabulary_size = len(reversed_dict)
        self.embedding_size = args.embedding_size
        self.num_hidden = args.num_hidden
        self.num_layers = args.num_layers
        self.learning_rate = args.learning_rate

        self.x = tf.placeholder(tf.int32, [None, document_max_len], name="x")
        self.x_len = tf.reduce_sum(tf.sign(self.x), 1)
        self.y = tf.placeholder(tf.int32, [None], name="y")
        self.keep_prob = tf.placeholder(tf.float32, [], name="keep_prob")
        self.global_step = tf.Variable(0, trainable=False)

        with tf.name_scope("embedding"):
            if args.glove:
                init_embeddings = tf.constant(get_init_embedding(reversed_dict, self.embedding_size), dtype=tf.float32)
                self.embeddings = tf.get_variable("embeddings", initializer=init_embeddings, trainable=False)
            else:
                init_embeddings = tf.random_uniform([self.vocabulary_size, self.embedding_size])
                self.embeddings = tf.get_variable("embeddings", initializer=init_embeddings, trainable=True)
            self.x_emb = tf.nn.embedding_lookup(self.embeddings, self.x)

        with tf.name_scope("birnn"):
            fw_cells = [rnn.BasicLSTMCell(self.num_hidden) for _ in range(self.num_layers)]
            bw_cells = [rnn.BasicLSTMCell(self.num_hidden) for _ in range(self.num_layers)]
            fw_cells = [rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob) for cell in fw_cells]
            bw_cells = [rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob) for cell in bw_cells]

            self.rnn_outputs, _, _ = rnn.stack_bidirectional_dynamic_rnn(
                fw_cells, bw_cells, self.x_emb, sequence_length=self.x_len, dtype=tf.float32)
            self.last_output = self.rnn_outputs[:, -1, :]

        with tf.name_scope("output"):
            self.logits = tf.contrib.slim.fully_connected(self.last_output, num_class, activation_fn=None)
            self.predictions = tf.argmax(self.logits, -1, output_type=tf.int32)

        with tf.name_scope("loss"):
            self.loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=self.y))
            self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss, global_step=self.global_step)

        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions, self.y)
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")
--------------------------------------------------------------------------------
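For orientation, the tensor shapes that flow through `NaiveRNN` are roughly as follows, writing B for the batch size, E for `--embedding_size` and H for `--num_hidden` (the bidirectional RNN concatenates forward and backward outputs):

```
x           : [B, 20]          token ids (document_max_len = 20)
x_emb       : [B, 20, E]       embedding lookup
rnn_outputs : [B, 20, 2H]      forward and backward outputs concatenated
last_output : [B, 2H]          last time step, fed to the classifier
logits      : [B, num_class]
```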
/models/attention_rnn.py:
--------------------------------------------------------------------------------
import tensorflow as tf
from tensorflow.contrib import rnn
from models.net_utils import get_init_embedding


class AttentionRNN(object):
    def __init__(self, reversed_dict, document_max_len, num_class, args):
        self.vocabulary_size = len(reversed_dict)
        self.embedding_size = args.embedding_size
        self.num_hidden = args.num_hidden
        self.num_layers = args.num_layers
        self.learning_rate = args.learning_rate

        self.x = tf.placeholder(tf.int32, [None, document_max_len], name="x")
        self.x_len = tf.reduce_sum(tf.sign(self.x), 1)
        self.y = tf.placeholder(tf.int32, [None], name="y")
        self.keep_prob = tf.placeholder(tf.float32, [], name="keep_prob")
        self.global_step = tf.Variable(0, trainable=False)

        with tf.name_scope("embedding"):
            if args.glove:
                init_embeddings = tf.constant(get_init_embedding(reversed_dict, self.embedding_size), dtype=tf.float32)
                self.embeddings = tf.get_variable("embeddings", initializer=init_embeddings, trainable=False)
            else:
                init_embeddings = tf.random_uniform([self.vocabulary_size, self.embedding_size])
                self.embeddings = tf.get_variable("embeddings", initializer=init_embeddings, trainable=True)
            self.x_emb = tf.nn.embedding_lookup(self.embeddings, self.x)

        with tf.name_scope("birnn"):
            fw_cells = [rnn.BasicLSTMCell(self.num_hidden) for _ in range(self.num_layers)]
            bw_cells = [rnn.BasicLSTMCell(self.num_hidden) for _ in range(self.num_layers)]
            fw_cells = [rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob) for cell in fw_cells]
            bw_cells = [rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob) for cell in bw_cells]

            self.rnn_outputs, _, _ = rnn.stack_bidirectional_dynamic_rnn(
                fw_cells, bw_cells, self.x_emb, sequence_length=self.x_len, dtype=tf.float32)

        with tf.name_scope("attention"):
            self.attention_score = tf.nn.softmax(tf.contrib.slim.fully_connected(self.rnn_outputs, 1))
            self.attention_out = tf.squeeze(
                tf.matmul(tf.transpose(self.rnn_outputs, perm=[0, 2, 1]), self.attention_score),
                axis=-1)

        with tf.name_scope("output"):
            self.logits = tf.contrib.slim.fully_connected(self.attention_out, num_class, activation_fn=None)
            self.predictions = tf.argmax(self.logits, -1, output_type=tf.int32)

        with tf.name_scope("loss"):
            self.loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=self.y))
            self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss, global_step=self.global_step)

        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions, self.y)
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
# text-classification-tensorflow
TensorFlow implementation of **Attention-based Bidirectional RNN** text classification, with a naive Bidirectional RNN model as a baseline.


## Requirements
- Python 3
- TensorFlow 1.x (the models rely on `tf.contrib`)
- pip install -r requirements.txt


## Usage

### Prepare Data
We use a pre-processed version of the [Twitter Sentiment Classification Data](http://help.sentiment140.com/for-students). To use the sample data (100K train / 30K test),
```
$ unzip sample_data/sample_data.zip -d sample_data
```

To use the full data (1.2M train / 0.4M test), download it from [this Google Drive link](https://drive.google.com/file/d/1aMt-6OCN_mEDlmRX4bymk5ZNEatsVXF-/view?usp=sharing).
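Both the sample and the full data are plain tab-separated files with a `sentence` and a `sentiment` column (`data_utils.py` reads them with `pd.read_csv(..., sep="\t")`). The rows below are made-up examples shown only to illustrate the format, with binary 0/1 labels as implied by `num_class = 2` in `train.py`:
```
sentence	sentiment
i love this new phone , it works great !	1
worst customer service ever , never again	0
```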
To use GloVe pre-trained embeddings, download them via
```
$ python download_glove.py
```

### Train
To train the model with the sample data,
```
$ python train.py
```
The training data is split into a train set (85%) and a validation set (15%). Every 2,000 steps, classification accuracy is measured on the validation set and the best model is saved.

To use GloVe pre-trained vectors as the initial embedding,
```
$ python train.py --glove
```

#### Additional Hyperparameters
```
$ python train.py -h
usage: train.py [-h] [--train_tsv TRAIN_TSV] [--model MODEL] [--glove]
                [--embedding_size EMBEDDING_SIZE] [--num_hidden NUM_HIDDEN]
                [--num_layers NUM_LAYERS] [--learning_rate LEARNING_RATE]
                [--batch_size BATCH_SIZE] [--num_epochs NUM_EPOCHS]
                [--keep_prob KEEP_PROB] [--checkpoint_dir CHECKPOINT_DIR]

optional arguments:
  -h, --help            show this help message and exit
  --train_tsv TRAIN_TSV
                        Train tsv file.
  --model MODEL         naive | att
  --glove               Use glove as initial word embedding.
  --embedding_size EMBEDDING_SIZE
                        Word embedding size. (For glove, use 50 | 100 | 200 | 300)
  --num_hidden NUM_HIDDEN
                        RNN Network size.
  --num_layers NUM_LAYERS
                        RNN Network depth.
  --learning_rate LEARNING_RATE
                        Learning rate.
  --batch_size BATCH_SIZE
                        Batch size.
  --num_epochs NUM_EPOCHS
                        Number of epochs.
  --keep_prob KEEP_PROB
                        Dropout keep prob.
  --checkpoint_dir CHECKPOINT_DIR
                        Checkpoint directory.
```
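For example, to train the attention model with 100-dimensional GloVe embeddings and a slightly larger network (the values below are illustrative, not tuned settings):
```
$ python train.py --model att --glove --embedding_size 100 --num_hidden 128 --num_layers 3
```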

### Test
To measure classification accuracy on the test data,
```
$ python test.py
```

To use custom data,
```
$ python test.py --test_tsv=<custom_tsv_path>
```

### Sample Test Results
Trained and tested with the [full data](https://drive.google.com/file/d/1aMt-6OCN_mEDlmRX4bymk5ZNEatsVXF-/view?usp=sharing) using default hyper-parameters:

Model | Naive | Naive(+Glove) | Attention | Attention(+Glove)
:---: | :---: | :---: | :---: | :---:
Accuracy | 0.574 | 0.578 | 0.811 | 0.820


## References
- [Dataset](http://help.sentiment140.com/for-students)
- [dennybritz/cnn-text-classification-tf](https://github.com/dennybritz/cnn-text-classification-tf)
- [Attention-Based Bidirectional Long Short-Term Memory Networks for Relation Classification](http://www.aclweb.org/anthology/P16-2034)
--------------------------------------------------------------------------------

/train.py:
--------------------------------------------------------------------------------
import argparse
import os
import tensorflow as tf
from sklearn.model_selection import train_test_split
from models.naive_rnn import NaiveRNN
from models.attention_rnn import AttentionRNN
from data_utils import build_dict, build_dataset, batch_iter


def add_arguments(parser):
    parser.add_argument("--train_tsv", type=str, default="sample_data/train.tsv", help="Train tsv file.")
    parser.add_argument("--model", type=str, default="att", help="naive | att")
    parser.add_argument("--glove", action="store_true", help="Use glove as initial word embedding.")
    parser.add_argument("--embedding_size", type=int, default=300,
                        help="Word embedding size. (For glove, use 50 | 100 | 200 | 300)")

    parser.add_argument("--num_hidden", type=int, default=100, help="RNN Network size.")
    parser.add_argument("--num_layers", type=int, default=2, help="RNN Network depth.")

    parser.add_argument("--learning_rate", type=float, default=1e-3, help="Learning rate.")
    parser.add_argument("--batch_size", type=int, default=64, help="Batch size.")
    parser.add_argument("--num_epochs", type=int, default=10, help="Number of epochs.")
    parser.add_argument("--keep_prob", type=float, default=0.8, help="Dropout keep prob.")
    parser.add_argument("--checkpoint_dir", type=str, default="saved_model", help="Checkpoint directory.")


parser = argparse.ArgumentParser()
add_arguments(parser)
args = parser.parse_args()

num_class = 2
if not os.path.exists(args.checkpoint_dir):
    os.mkdir(args.checkpoint_dir)

print("Building dictionary...")
word_dict, reversed_dict, document_max_len = build_dict(args.train_tsv)
print("Building dataset...")
x, y = build_dataset(args.train_tsv, word_dict, document_max_len)
# Split into train and validation data
train_x, valid_x, train_y, valid_y = train_test_split(x, y, test_size=0.15)


with tf.Session() as sess:
    if args.model == "naive":
        model = NaiveRNN(reversed_dict, document_max_len, num_class, args)
    elif args.model == "att":
        model = AttentionRNN(reversed_dict, document_max_len, num_class, args)
    else:
        raise NotImplementedError()

    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver(tf.global_variables())

    train_batches = batch_iter(train_x, train_y, args.batch_size, args.num_epochs)
    num_batches_per_epoch = (len(train_x) - 1) // args.batch_size + 1
    max_accuracy = 0

    for x_batch, y_batch in train_batches:
        train_feed_dict = {
            model.x: x_batch,
            model.y: y_batch,
            model.keep_prob: args.keep_prob
        }

        _, step, loss = sess.run([model.optimizer, model.global_step, model.loss], feed_dict=train_feed_dict)

        if step % 100 == 0:
            print("step {0}: loss = {1}".format(step, loss))

        if step % 2000 == 0:
            # Measure accuracy on the validation data.
            valid_batches = batch_iter(valid_x, valid_y, args.batch_size, 1)
            sum_accuracy, cnt = 0, 0

            for valid_x_batch, valid_y_batch in valid_batches:
                valid_feed_dict = {
                    model.x: valid_x_batch,
                    model.y: valid_y_batch,
                    model.keep_prob: 1.0
                }

                accuracy = sess.run(model.accuracy, feed_dict=valid_feed_dict)
                sum_accuracy += accuracy
                cnt += 1
            valid_accuracy = sum_accuracy / cnt

            print("\nValidation Accuracy = {0}\n".format(valid_accuracy))

            # Save the best model so far
            if valid_accuracy > max_accuracy:
                max_accuracy = valid_accuracy
                saver.save(sess, "{0}/{1}.ckpt".format(args.checkpoint_dir, args.model), global_step=step)
                print("Model is saved.\n")
--------------------------------------------------------------------------------