├── README.md
├── attention.py
├── data.zip
├── embedding.zip
├── images
│   ├── char_cnn.PNG
│   ├── model_structure.png
│   ├── performance_table.png
│   └── test_loss_graph.png
├── layer.py
├── model.py
├── preprocess.py
├── train.py
└── transformer.py

/README.md:
--------------------------------------------------------------------------------
1 | # Transformer-Encoder-with-Char
2 | 1. **Transformer Encoder with Char information** for text classification
3 | 2. This code was written with reference to [carpedm20](https://github.com/carpedm20/lstm-char-cnn-tensorflow) and [DongjunLee](https://github.com/DongjunLee/transformer-tensorflow)
4 | 
5 | ## 1. Model structure
6 | ![alt text](https://github.com/MSWon/Transformer-Encoder-with-Char/blob/master/images/model_structure.png "Model")
7 | 
8 | 1. Input words are represented by **Char-CNN** and **Word2vec** embeddings concatenated together (**64 dimensions each**)
9 | 
10 | 2. A standard Transformer Encoder from [Attention is all you need](https://arxiv.org/pdf/1706.03762.pdf) is used
11 | 
12 | 3. The model is composed of **7 Transformer Encoder layers** with **4 attention heads**
13 | 
14 | 4. A **Global Average Pooling** layer with softmax is used at the end to predict the class
15 | 
16 | ## 2. Char CNN
17 | ![alt text](https://github.com/MSWon/Transformer-Encoder-with-Char/blob/master/images/char_cnn.PNG "Char CNN")
18 | 
19 | 1. Char CNN architecture from [Yoon Kim](https://arxiv.org/pdf/1508.06615.pdf)
20 | 
21 | ## 3. Prerequisites
22 | - [Tensorflow 1.8.0](https://www.tensorflow.org/)
23 | - Python 3.6
24 | 
25 | ## 4. Training
26 | 1. Clone the repository
27 | ```
28 | $ git clone https://github.com/MSWon/Transformer-Encoder-with-Char.git
29 | ```
30 | 2. Unzip **data.zip** and **embedding.zip**
31 | ```
32 | $ unzip data.zip
33 | $ unzip embedding.zip
34 | ```
35 | 3. Train with user settings (char_mode: one of char_cnn, char_lstm, no_char)
36 | ```
37 | $ python train.py --batch_size 128 --training_epochs 12 --char_mode char_cnn
38 | ```
39 | 
40 | ## 5. Experiments
41 | 
42 | ### 5-1. Datasets
43 | 
44 | 1. The **AG’s news** topic classification dataset is constructed by choosing the 4 largest classes from the original news corpus
45 | 2. The 4 classes are ‘world’, ‘sports’, ‘business’ and ‘science/technology’
46 | 3. Each class contains 30,000 training samples and 1,900 test samples
47 | 4. In total there are **120,000 training samples** and **7,600 test samples**
48 | 
49 | ### 5-2. Test loss graph
50 | ![alt text](https://github.com/MSWon/Transformer-Encoder-with-Char/blob/master/images/test_loss_graph.png "loss graph")
51 | 
52 | ### 5-3. Performance table
53 | ![alt text](https://github.com/MSWon/Transformer-Encoder-with-Char/blob/master/images/performance_table.png "table")
54 | 
--------------------------------------------------------------------------------
/attention.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sun Jan 20 18:53:17 2019
4 | 
5 | @author: jbk48
6 | """
7 | 
8 | 
9 | import numpy as np
10 | import tensorflow as tf
11 | from keras.layers.pooling import GlobalAveragePooling2D, GlobalMaxPooling2D
12 | 
13 | 
14 | __all__ = [
15 |     "positional_encoding", "Attention"
16 | ]
17 | 
18 | def positional_encoding(dim, sentence_length, dtype=tf.float32):
19 |     pos_enc = np.array([[pos / np.power(10000., 2. * (i // 2) / dim) for i in range(dim)] for pos in range(sentence_length)])  # [seq_len, d_model]
20 |     # Apply sine to even columns (2i) and cosine to odd columns (2i+1).
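    # i.e. PE(pos, 2i) = sin(pos / 10000^(2i/dim)) and PE(pos, 2i+1) = cos(pos / 10000^(2i/dim)),
    # as in "Attention is all you need" (Vaswani et al., 2017).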
21 | pos_enc[:, 0::2] = np.sin(pos_enc[:, 0::2]) # dim 2i 22 | pos_enc[:, 1::2] = np.cos(pos_enc[:, 1::2]) # dim 2i+1 23 | return tf.convert_to_tensor(pos_enc, dtype=dtype) 24 | 25 | 26 | class Attention: 27 | 28 | def __init__(self, 29 | num_heads=1, 30 | masked=False, 31 | linear_key_dim=50, 32 | linear_value_dim=50, 33 | model_dim=100, 34 | dropout=0.2, 35 | batch_size=128): 36 | 37 | assert linear_key_dim % num_heads == 0 38 | assert linear_value_dim % num_heads == 0 39 | 40 | self.num_heads = num_heads 41 | self.masked = masked 42 | self.linear_key_dim = linear_key_dim 43 | self.linear_value_dim = linear_value_dim 44 | self.model_dim = model_dim 45 | self.dropout = dropout 46 | self.batch_size = batch_size 47 | 48 | def multi_head(self, q, k, v, seq_len): 49 | q, k, v = self._linear_projection(q, k, v) 50 | qs, ks, vs = self._split_heads(q, k, v) 51 | outputs = self._scaled_dot_product(qs, ks, vs, seq_len) 52 | output = self._concat_heads(outputs) 53 | output = tf.layers.dense(output, self.model_dim) 54 | 55 | return tf.nn.dropout(output, 1.0 - self.dropout) 56 | 57 | def classifier_head(self, q, k, v, seq_len): 58 | q, k, v = self._linear_projection(q, k, v) 59 | qs, ks, vs = self._split_heads(q, k, v) 60 | outputs = self._scaled_dot_product(qs, ks, vs, seq_len) 61 | output = self._GlobalAverage_heads(outputs) 62 | 63 | return output 64 | 65 | def _GlobalAverage_heads(self, outputs): 66 | outputs = tf.transpose(outputs, [0, 3, 2, 1]) # [batch_size, dim, max_seq_len, num_heads] 67 | outputs = GlobalAveragePooling2D()(outputs) 68 | return outputs 69 | 70 | def _GlobalMax_heads(self, outputs): 71 | outputs = tf.transpose(outputs, [0, 3, 2, 1]) # [batch_size, dim, max_seq_len, num_heads] 72 | outputs = GlobalMaxPooling2D()(outputs) 73 | return outputs 74 | 75 | def _linear_projection(self, q, k, v): 76 | q = tf.layers.dense(q, self.linear_key_dim, use_bias=False) 77 | k = tf.layers.dense(k, self.linear_key_dim, use_bias=False) 78 | v = tf.layers.dense(v, self.linear_value_dim, use_bias=False) 79 | return q, k, v 80 | 81 | def _split_heads(self, q, k, v): 82 | 83 | def split_last_dimension_then_transpose(tensor, num_heads, dim): ## dim = num_head * project_dim 84 | t_shape = tensor.get_shape().as_list() 85 | tensor = tf.reshape(tensor, [-1] + t_shape[1:-1] + [num_heads, dim // num_heads]) 86 | return tf.transpose(tensor, [0, 2, 1, 3]) # [batch_size, num_heads, max_seq_len, dim] 87 | 88 | qs = split_last_dimension_then_transpose(q, self.num_heads, self.linear_key_dim) 89 | ks = split_last_dimension_then_transpose(k, self.num_heads, self.linear_key_dim) 90 | vs = split_last_dimension_then_transpose(v, self.num_heads, self.linear_value_dim) 91 | 92 | return qs, ks, vs 93 | 94 | def _scaled_dot_product(self, qs, ks, vs, seq_len): 95 | ## qs, ks, vs : [batch_size, num_heads, max_seq_len, dim] 96 | key_dim_per_head = self.linear_key_dim // self.num_heads 97 | o1 = tf.matmul(qs, ks, transpose_b=True) 98 | o2 = o1 / (key_dim_per_head**0.5) ## [batch_size, num_heads, max_seq_len, max_seq_len] 99 | 100 | if self.masked: ## mask score matrix to max_seq_len 101 | row_vector = tf.range(0,o2.shape[2],1) ## [, max_seq_len] 102 | matrix = tf.cast(tf.expand_dims(seq_len,-1), tf.int32) ## [batch_size, 1] 103 | 104 | t = tf.cast(row_vector < matrix, tf.float32) ## [batch_size, max_seq_len] 105 | t = tf.expand_dims(t, -1) ## [batch_size, max_seq_len, 1] 106 | masks = t * tf.transpose(t, [0,2,1]) ## [batch_size, max_seq_len, max_seq_len] 107 | masks = tf.tile(tf.expand_dims(masks, 1), [1, int(o2.shape[1]), 1, 
1]) ## [batch_size, num_heads, max_seq_len, max_seq_len] 108 | 109 | paddings = tf.ones_like(masks) * -1e9 110 | o2 = tf.where(tf.equal(masks, 0), paddings, o2) 111 | 112 | o3 = tf.nn.softmax(o2) 113 | return tf.matmul(o3, vs) 114 | 115 | def _concat_heads(self, outputs): 116 | 117 | def transpose_then_concat_last_two_dimenstion(tensor): 118 | tensor = tf.transpose(tensor, [0, 2, 1, 3]) # [batch_size, max_seq_len, num_heads, dim] 119 | t_shape = tensor.get_shape().as_list() 120 | num_heads, dim = t_shape[-2:] 121 | return tf.reshape(tensor, [-1] + t_shape[1:-2] + [num_heads * dim]) # [batch_size, max_seq_len, num_heads*dim] 122 | 123 | return transpose_then_concat_last_two_dimenstion(outputs) 124 | -------------------------------------------------------------------------------- /data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSWon/Transformer-Encoder-with-Char/991cd782842efbf23ff99a594d00fe4188b8195d/data.zip -------------------------------------------------------------------------------- /embedding.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSWon/Transformer-Encoder-with-Char/991cd782842efbf23ff99a594d00fe4188b8195d/embedding.zip -------------------------------------------------------------------------------- /images/char_cnn.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSWon/Transformer-Encoder-with-Char/991cd782842efbf23ff99a594d00fe4188b8195d/images/char_cnn.PNG -------------------------------------------------------------------------------- /images/model_structure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSWon/Transformer-Encoder-with-Char/991cd782842efbf23ff99a594d00fe4188b8195d/images/model_structure.png -------------------------------------------------------------------------------- /images/performance_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSWon/Transformer-Encoder-with-Char/991cd782842efbf23ff99a594d00fe4188b8195d/images/performance_table.png -------------------------------------------------------------------------------- /images/test_loss_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSWon/Transformer-Encoder-with-Char/991cd782842efbf23ff99a594d00fe4188b8195d/images/test_loss_graph.png -------------------------------------------------------------------------------- /layer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Jan 21 16:37:55 2019 4 | 5 | @author: jbk48 6 | """ 7 | 8 | import tensorflow as tf 9 | import numpy as np 10 | 11 | class LayerNormalization(tf.layers.Layer): 12 | """Applies layer normalization.""" 13 | 14 | def __init__(self, hidden_size): 15 | super(LayerNormalization, self).__init__() 16 | self.hidden_size = hidden_size 17 | 18 | def build(self, _): 19 | self.scale = tf.get_variable("layer_norm_scale", [self.hidden_size], 20 | initializer=tf.ones_initializer()) 21 | self.bias = tf.get_variable("layer_norm_bias", [self.hidden_size], 22 | initializer=tf.zeros_initializer()) 23 | self.built = True 24 | 25 | def call(self, x, epsilon=1e-6): 26 | mean = tf.reduce_mean(x, axis=[-1], keepdims=True) 27 | 
variance = tf.reduce_mean(tf.square(x - mean), axis=[-1], keepdims=True) 28 | norm_x = (x - mean) * tf.rsqrt(variance + epsilon) 29 | return norm_x * self.scale + self.bias 30 | 31 | 32 | def gelu(x): 33 | """Gaussian Error Linear Unit. 34 | 35 | This is a smoother version of the RELU. 36 | Original paper: https://arxiv.org/abs/1606.08415 37 | Args: 38 | x: float Tensor to perform activation. 39 | 40 | Returns: 41 | `x` with the GELU activation applied. 42 | """ 43 | cdf = 0.5 * (1.0 + tf.tanh( 44 | (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) 45 | return x * cdf 46 | 47 | 48 | 49 | class FFN: 50 | """FFN class (Position-wise Feed-Forward Networks)""" 51 | 52 | def __init__(self, 53 | w1_dim=200, 54 | w2_dim=100, 55 | dropout=0.1): 56 | 57 | self.w1_dim = w1_dim 58 | self.w2_dim = w2_dim 59 | self.dropout = dropout 60 | 61 | def dense_relu_dense(self, inputs): 62 | output = tf.layers.dense(inputs, self.w1_dim, activation=tf.nn.relu) 63 | output =tf.layers.dense(output, self.w2_dim) 64 | 65 | return tf.nn.dropout(output, 1.0 - self.dropout) 66 | 67 | def dense_gelu_dense(self, inputs): 68 | output = tf.layers.dense(inputs, self.w1_dim, activation=gelu) 69 | output =tf.layers.dense(output, self.w2_dim) 70 | 71 | return tf.nn.dropout(output, 1.0 - self.dropout) 72 | 73 | def conv_relu_conv(self): 74 | raise NotImplementedError("i will implement it!") 75 | 76 | 77 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Mar 2 12:55:38 2019 4 | 5 | @author: jbk48 6 | """ 7 | 8 | import numpy as np 9 | import tensorflow as tf 10 | import transformer 11 | import os 12 | import datetime 13 | import preprocess 14 | import pandas as pd 15 | from attention import positional_encoding 16 | 17 | config = tf.ConfigProto() 18 | config.gpu_options.allow_growth = True 19 | 20 | class Model: 21 | 22 | def __init__(self, word_dim, char_dim, max_sent_len, max_char_len, learning_rate, num_train_steps): 23 | 24 | self.word_dim = word_dim 25 | self.char_dim = char_dim 26 | self.max_sent_len = max_sent_len 27 | self.max_char_len = max_char_len 28 | self.learning_rate = learning_rate 29 | self.num_train_steps = num_train_steps 30 | 31 | ## Preprocess data 32 | self.prepro = preprocess.Preprocess(self.char_dim, self.max_sent_len, self.max_char_len) 33 | self.train_X, self.train_seq_length, self.train_Y, self.test_X, self.test_seq_length, self.test_Y = self.prepro.load_data("./train.csv", "./test.csv", self.max_sent_len) 34 | self.word_embedding, self.char_embedding = self.prepro.prepare_embedding(self.char_dim) 35 | self.train_X, self.train_X_char, self.train_X_char_len, self.train_Y = self.prepro.prepare_data(self.train_X, self.train_Y, "train") 36 | self.test_X, self.test_X_char, self.test_X_char_len, self.test_Y = self.prepro.prepare_data(self.test_X, self.test_Y, "test") 37 | 38 | ## Placeholders 39 | self.word_input = tf.placeholder(tf.int32, shape = [None, max_sent_len], name = 'word') 40 | self.char_input = tf.placeholder(tf.int32, shape = [None, max_sent_len, max_char_len], name = 'char') 41 | self.label = tf.placeholder(tf.int32, shape = [None], name = 'label') 42 | self.seq_len = tf.placeholder(tf.int32, shape = [None]) 43 | self.char_len = tf.placeholder(tf.int32, [None, max_sent_len]) 44 | self.dropout = tf.placeholder(tf.float32, shape = ()) 45 | 46 | def train(self, batch_size, training_epochs, char_mode): 47 | 48 | 
self.batch_size = batch_size 49 | loss, optimizer, logits = self.build_model(self.word_input, self.char_input, self.label, self.seq_len, 50 | self.char_len, self.num_train_steps, char_mode) 51 | accuracy = self.get_accuracy(logits, self.label) 52 | 53 | ## Training 54 | init = tf.global_variables_initializer() 55 | 56 | num_train_batch = int(len(self.train_X) / self.batch_size) 57 | num_test_batch = int(len(self.test_X) / self.batch_size) 58 | print("Start training!") 59 | 60 | modelpath = "./transformer_ag_news_{}/".format(char_mode) 61 | modelName = "transformer_ag_news_{}.ckpt".format(char_mode) 62 | saver = tf.train.Saver() 63 | 64 | train_acc_list = [] 65 | train_loss_list = [] 66 | test_acc_list = [] 67 | test_loss_list = [] 68 | 69 | with tf.Session(config = config) as sess: 70 | 71 | start_time = datetime.datetime.now() 72 | sess.run(init) 73 | if(not os.path.exists(modelpath)): 74 | os.mkdir(modelpath) 75 | ckpt = tf.train.get_checkpoint_state(modelpath) 76 | if(ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path)): 77 | self.load_char_embedding(modelpath + "char_embedding_{}.npy".format(char_mode)) 78 | saver.restore(sess, modelpath + modelName) 79 | print("Model loaded!") 80 | 81 | for epoch in range(training_epochs): 82 | 83 | train_acc, train_loss = 0., 0. 84 | self.train_X, self.train_X_char, self.train_X_char_len, self.train_Y = self.shuffle(self.train_X, 85 | self.train_X_char, 86 | self.train_X_char_len, 87 | self.train_Y) 88 | for step in range(num_train_batch): 89 | if(step == 0): 90 | mode = "init" 91 | else: 92 | mode = None 93 | train_batch, train_batch_char, train_batch_char_len, train_batch_Y, train_batch_seq_len = get_batch(self.train_X, 94 | self.train_X_char, 95 | self.train_X_char_len, 96 | self.train_Y, 97 | self.train_seq_length, 98 | self.batch_size, 99 | mode) 100 | feed_dict_train = {self.word_input: train_batch, self.char_input : train_batch_char, self.label: train_batch_Y, 101 | self.seq_len: train_batch_seq_len, self.char_len: train_batch_char_len, self.dropout : 0.2} 102 | 103 | char_embedding_matrix = sess.run(self.prepro.clear_char_embedding_padding, feed_dict = feed_dict_train) ## clear 0 index to 0 vector 104 | _, train_batch_loss = sess.run([optimizer,loss], feed_dict = feed_dict_train) 105 | 106 | train_loss += train_batch_loss / num_train_batch 107 | train_batch_acc = sess.run(accuracy , feed_dict = feed_dict_train) 108 | train_acc += train_batch_acc / num_train_batch 109 | print("epoch : {:02d} step : {:04d} loss = {:.6f} accuracy= {:.6f}".format(epoch+1, step+1, train_batch_loss, train_batch_acc)) 110 | 111 | test_acc, test_loss = 0. , 0. 
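# After every training epoch the full test set is evaluated; the dropout placeholder is fed as 0.0 below, so dropout is disabled during evaluation.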
112 | print("Now for test data\nCould take few minutes") 113 | for step in range(num_test_batch): 114 | if(step == 0): 115 | mode = "init" 116 | else: 117 | mode = None 118 | test_batch, test_batch_char, test_batch_char_len, test_batch_Y, test_batch_seq_len = get_batch(self.test_X, 119 | self.test_X_char, 120 | self.test_X_char_len, 121 | self.test_Y, 122 | self.test_seq_length, 123 | self.batch_size, 124 | mode) 125 | feed_dict_test = {self.word_input: test_batch, self.char_input: test_batch_char, self.label: test_batch_Y, 126 | self.seq_len: test_batch_seq_len, self.char_len: test_batch_char_len, self.dropout : 0.0} 127 | # Compute average loss 128 | test_batch_loss = sess.run(loss, feed_dict = feed_dict_test) 129 | test_loss += test_batch_loss / num_test_batch 130 | 131 | test_batch_acc = sess.run(accuracy , feed_dict = feed_dict_test) 132 | test_acc += test_batch_acc / num_test_batch 133 | 134 | print(" Loss = {:.6f} Accuracy = {:.6f}".format(train_loss, train_acc)) 135 | print(" Loss = {:.6f} Accuracy = {:.6f}".format(test_loss, test_acc)) 136 | train_loss_list.append(train_loss) 137 | train_acc_list.append(train_acc) 138 | test_loss_list.append(test_loss) 139 | test_acc_list.append(test_acc) 140 | np.save(modelpath + "char_embedding_{}.npy".format(char_mode), char_embedding_matrix) 141 | 142 | train_loss = pd.DataFrame({"train_loss":train_loss_list}) 143 | train_acc = pd.DataFrame({"train_acc":train_acc_list}) 144 | test_loss = pd.DataFrame({"test_loss":test_loss_list}) 145 | test_acc = pd.DataFrame({"test_acc":test_acc_list}) 146 | df = pd.concat([train_loss,train_acc,test_loss,test_acc], axis = 1) 147 | df.to_csv("./results_{}.csv".format(char_mode), sep =",", index=False) 148 | elapsed_time = datetime.datetime.now() - start_time 149 | print("{}".format(elapsed_time)) 150 | save_path = saver.save(sess, modelpath + modelName) 151 | print ('save_path',save_path) 152 | 153 | def char_lstm(self, inputs, char_len, lstm_units, dropout, last=True, scope="char_lstm"): 154 | ## inputs : [batch_size, max_sent_len, max_char_len, dim] 155 | def _build_single_cell(lstm_units, keep_prob): 156 | cell = tf.contrib.rnn.LayerNormBasicLSTMCell(lstm_units) 157 | cell = tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob=1.0-keep_prob, output_keep_prob=1.0-keep_prob) 158 | return cell 159 | char_len = tf.reshape(char_len, [-1]) 160 | max_sent_len = int(inputs.shape[1]) 161 | max_char_len = int(inputs.shape[2]) 162 | embedding_size = int(inputs.shape[3]) 163 | inputs = tf.reshape(inputs,[-1,max_char_len,embedding_size]) ## [batch_size*max_sent_len, max_char_len, dim] 164 | 165 | with tf.variable_scope("shared_" + scope): 166 | lstm_cell = _build_single_cell(lstm_units, dropout) 167 | 168 | with tf.variable_scope("birnn-lstm_" + scope): 169 | _output = tf.nn.bidirectional_dynamic_rnn(lstm_cell, lstm_cell, dtype=tf.float32, 170 | inputs = inputs, sequence_length = char_len, scope="rnn_" + scope) 171 | if last: 172 | _, ((_, output_fw), (_, output_bw)) = _output 173 | outputs = tf.concat([output_fw, output_bw], axis=1) 174 | outputs = tf.reshape(outputs, shape=[-1, max_sent_len, 2 * lstm_units]) 175 | else: 176 | (output_fw, output_bw), _ = _output 177 | outputs = tf.concat([output_fw, output_bw], axis=2) 178 | outputs = tf.reshape(outputs, shape=[-1, 2 * lstm_units]) 179 | 180 | outputs = tf.layers.dense(outputs, self.word_dim) 181 | return outputs 182 | 183 | def char_cnn(self, input_, kernels, kernel_features, scope='char_cnn'): 184 | ''' 185 | :input: input float tensor of shape [batch_size, 
max_sent_len, max_word_len, char_embed_size] 186 | :kernel_features: array of kernel feature sizes (parallel to kernels) 187 | ''' 188 | assert len(kernels) == len(kernel_features), 'Kernel and Features must have the same size' 189 | 190 | max_sent_len = input_.get_shape()[1] 191 | max_word_len = input_.get_shape()[2] 192 | char_embed_size = input_.get_shape()[3] 193 | 194 | input_ = tf.reshape(input_, [-1, max_word_len, char_embed_size]) 195 | 196 | input_ = tf.expand_dims(input_, 1) # input_: [batch_size*max_sent_len, 1, max_word_len, char_embed_size] 197 | 198 | layers = [] 199 | with tf.variable_scope(scope): 200 | for kernel_size, kernel_feature_size in zip(kernels, kernel_features): 201 | reduced_length = max_word_len - kernel_size + 1 202 | 203 | # [batch_size*max_sent_len, 1, reduced_length, kernel_feature_size] 204 | conv = self.conv2d(input_, kernel_feature_size, 1, kernel_size, name="kernel_%d" % kernel_size) 205 | 206 | # [batch_size*max_sent_len, 1, 1, kernel_feature_size] 207 | pool = tf.nn.max_pool(tf.tanh(conv), [1, 1, reduced_length, 1], [1, 1, 1, 1], 'VALID') 208 | 209 | layers.append(tf.squeeze(pool, [1, 2])) 210 | 211 | if len(kernels) > 1: 212 | output = tf.concat(layers, 1) # [batch_size*max_sent_len, sum(kernel_features)] 213 | else: 214 | output = layers[0] 215 | 216 | # [batch_size, max_sent_len, sum(kernel_features)] 217 | output = self.highway(output, output.get_shape()[-1], num_layers = 1) 218 | output = tf.reshape(output, (-1, max_sent_len, sum(kernel_features))) 219 | output = tf.layers.dense(output, self.word_dim, activation = None) ## projection layer 220 | 221 | return output 222 | 223 | def conv2d(self, input_, output_dim, k_h, k_w, name="conv2d"): 224 | with tf.variable_scope(name): 225 | w = tf.get_variable('w', [k_h, k_w, input_.get_shape()[-1], output_dim]) 226 | b = tf.get_variable('b', [output_dim]) 227 | 228 | return tf.nn.conv2d(input_, w, strides=[1, 1, 1, 1], padding='VALID') + b 229 | 230 | 231 | def highway(self, input_, size, num_layers=1, scope='Highway'): 232 | """Highway Network (cf. http://arxiv.org/abs/1505.00387). 233 | t = sigmoid(Wy + b) 234 | z = t * g(Wy + b) + (1 - t) * y 235 | where g is nonlinearity, t is transform gate, and (1 - t) is carry gate. 236 | """ 237 | with tf.variable_scope(scope): 238 | for idx in range(num_layers): 239 | g = tf.nn.relu(tf.layers.dense(input_, size, name='highway_lin_%d' % idx)) 240 | 241 | t = tf.sigmoid(tf.layers.dense(input_, size, name='highway_gate_%d' % idx)) 242 | 243 | output = t * g + (1. - t) * input_ 244 | input_ = output 245 | 246 | return output 247 | 248 | def build_parameter(self,num_layers, num_heads, linear_key_dim, linear_value_dim, model_dim, ffn_dim, n_class): 249 | 250 | self.num_layers=num_layers 251 | self.num_heads=num_heads 252 | self.linear_key_dim=linear_key_dim 253 | self.linear_value_dim=linear_value_dim 254 | self.model_dim=model_dim 255 | self.ffn_dim=ffn_dim 256 | self.n_class=n_class 257 | 258 | def build_model(self, word_inputs, char_inputs, labels, seq_len, char_len, num_train_steps, char_mode): 259 | print("Building model!") 260 | if(char_mode == "no_char"): 261 | self.model_dim /= 2 262 | 263 | # Implements linear decay of the learning rate. 
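# With power=1.0 and end_learning_rate=0.0, tf.train.polynomial_decay below reduces the rate linearly: lr_t = learning_rate * (1 - global_step / num_train_steps).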
264 | global_step = tf.Variable(0, trainable=False) 265 | learning_rate = tf.train.polynomial_decay( 266 | self.learning_rate, 267 | global_step, 268 | num_train_steps, 269 | end_learning_rate=0.0, 270 | power=1.0, 271 | cycle=False) 272 | 273 | encoder = transformer.Encoder(num_layers=self.num_layers, 274 | num_heads=self.num_heads, 275 | linear_key_dim=self.linear_key_dim, 276 | linear_value_dim=self.linear_value_dim, 277 | model_dim=self.model_dim, 278 | ffn_dim=self.ffn_dim, 279 | dropout=self.dropout, 280 | n_class=self.n_class, 281 | batch_size=self.batch_size) 282 | encoder_emb = self.build_embed(word_inputs, char_inputs, char_len, char_mode) 283 | encoder_outputs = encoder.build(encoder_emb, seq_len) 284 | 285 | loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits = encoder_outputs , labels = labels)) # Softmax loss 286 | optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss, global_step=global_step) # Adam Optimizer 287 | 288 | return loss, optimizer, encoder_outputs 289 | 290 | def build_embed(self, word_inputs, char_inputs, char_len, char_mode): 291 | 292 | # Positional Encoding 293 | with tf.variable_scope("positional-encoding"): 294 | positional_encoded = positional_encoding(self.word_dim, 295 | self.max_sent_len) 296 | 297 | 298 | position_inputs = tf.tile(tf.range(0, self.max_sent_len), [self.batch_size]) 299 | position_inputs = tf.reshape(position_inputs, [self.batch_size, self.max_sent_len]) # batch_size x [0, 1, 2, ..., n] 300 | encoded_inputs = tf.add(tf.nn.embedding_lookup(self.word_embedding, word_inputs), 301 | tf.nn.embedding_lookup(positional_encoded, position_inputs)) 302 | 303 | 304 | if(char_mode == "char_cnn"): 305 | char_inputs = tf.nn.embedding_lookup(self.char_embedding, char_inputs) 306 | kernels = [ 1, 2, 3, 4, 5, 6] 307 | kernel_features = [25, 50, 75, 100, 125, 150] 308 | char_inputs = self.char_cnn(char_inputs, kernels, kernel_features, scope='char_cnn') 309 | final_outputs = tf.concat([encoded_inputs,char_inputs], axis=2) 310 | elif(char_mode == "char_lstm"): 311 | char_inputs = tf.nn.embedding_lookup(self.char_embedding, char_inputs) 312 | char_inputs = self.char_lstm(char_inputs, char_len, self.word_dim, self.dropout, last=True, scope="char_lstm") 313 | final_outputs = tf.concat([encoded_inputs,char_inputs], axis=2) 314 | elif(char_mode == "no_char"): 315 | final_outputs = encoded_inputs 316 | 317 | return final_outputs 318 | 319 | def get_accuracy(self, logits, label): 320 | pred = tf.cast(tf.argmax(logits, 1), tf.int32) 321 | correct_pred = tf.equal(pred, label) 322 | accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32)) 323 | return accuracy 324 | 325 | def shuffle(self, train_X, train_X_char, train_X_char_len, train_Y): 326 | mask = np.random.permutation(len(train_X)) 327 | train_X = train_X[mask] 328 | train_X_char = train_X_char[mask] 329 | train_X_char_len = train_X_char_len[mask] 330 | train_Y = train_Y[mask] 331 | return train_X, train_X_char, train_X_char_len, train_Y 332 | 333 | def load_char_embedding(self, filename): 334 | print("Char embedding loaded!") 335 | self.char_embedding = np.load(filename) 336 | 337 | 338 | step = 0 339 | 340 | def get_batch(train_X, train_X_char, train_X_char_len, train_Y, seq_length, batch_size, mode = None): 341 | global step 342 | if(mode =="init"): 343 | step = 0 344 | train_batch_X = train_X[step*batch_size : (step+1)*batch_size] 345 | train_batch_X_char = train_X_char[step*batch_size : (step+1)*batch_size] 346 | train_batch_X_char_len = 
train_X_char_len[step*batch_size : (step+1)*batch_size] 347 | train_batch_Y = train_Y[step*batch_size : (step+1)*batch_size] 348 | train_batch_X_seq_len = seq_length[step*batch_size : (step+1)*batch_size] 349 | step += 1 350 | return train_batch_X, train_batch_X_char, train_batch_X_char_len, train_batch_Y, train_batch_X_seq_len 351 | -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Aug 20 21:13:33 2018 4 | 5 | @author: jbk48 6 | """ 7 | 8 | import pandas as pd 9 | import numpy as np 10 | import os 11 | import re 12 | import tensorflow as tf 13 | import pickle 14 | from itertools import chain 15 | from keras.preprocessing.sequence import pad_sequences 16 | from nltk import tokenize 17 | from nltk.corpus import stopwords 18 | from sklearn.preprocessing import LabelBinarizer 19 | 20 | class Preprocess(): 21 | 22 | def __init__(self, char_dim, max_sent_len, max_char_len): 23 | self.char_dim = char_dim 24 | self.max_sent_len = max_sent_len 25 | self.max_char_len = max_char_len 26 | self.stop = set(stopwords.words('english')) 27 | self.stop.update(['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}', ""]) 28 | 29 | 30 | def load_data(self, train_filename, test_filename, max_len): 31 | print("Making corpus!\nCould take few minutes!") 32 | corpus, labels = self.read_data(train_filename) 33 | self.train_X, self.train_seq_length, self.train_Y = self.clean_text(corpus, labels) 34 | corpus, labels = self.read_data(test_filename) 35 | self.test_X, self.test_seq_length, self.test_Y = self.clean_text(corpus, labels) 36 | print("Tokenize done!") 37 | return self.train_X, self.train_seq_length, self.train_Y, self.test_X, self.test_seq_length, self.test_Y 38 | 39 | def read_data(self, filename): 40 | 41 | data = pd.read_csv(filename) 42 | labels = data.iloc[:,0] 43 | corpus = data.iloc[:,2] 44 | encoder = LabelBinarizer() 45 | encoder.fit(labels) 46 | labels = encoder.transform(labels) 47 | labels = np.array([np.argmax(x) for x in labels]) 48 | return corpus, labels 49 | 50 | def clean_text(self, corpus, labels): 51 | tokens = [] 52 | index_list = [] 53 | seq_len = [] 54 | index = 0 55 | for sent in corpus: 56 | text = re.sub('
', ' ', sent) 57 | text = re.sub('[^a-zA-Z]', ' ', sent) 58 | t = [token for token in tokenize.word_tokenize(text) if not token in self.stop and len(token)>1 and len(token)<=20] 59 | 60 | if(len(t) > self.max_sent_len): 61 | t = t[0:self.max_sent_len] 62 | 63 | if(len(t) > 10): 64 | seq_len.append(len(t)) 65 | t = t + [''] * (self.max_sent_len - len(t)) ## pad with max_len 66 | tokens.append(t) 67 | index_list.append(index) 68 | index += 1 69 | 70 | labels = labels[index_list] 71 | return tokens, seq_len, labels 72 | 73 | def prepare_embedding(self, char_dim): 74 | self.get_word_embedding() ## Get pretrained word embedding 75 | tokens = self.train_X + self.test_X 76 | self.get_char_list(tokens) ## build char dict 77 | self.get_char_embedding(char_dim, len(self.char_list)) ## Get char embedding 78 | return self.word_embedding, self.char_embedding 79 | 80 | def prepare_data(self, input_X, input_Y, mode): 81 | ## Data -> index 82 | input_X_index = self.convert2index(input_X, "UNK") 83 | input_X_char, input_X_char_len = self.sent2char(input_X, mode) 84 | input_X_index = np.array(input_X_index) 85 | input_Y = np.array(input_Y) 86 | return input_X_index, input_X_char, input_X_char_len, input_Y 87 | 88 | def get_word_embedding(self, filename = "./polyglot-en.pkl"): 89 | print("Getting polyglot embeddings!") 90 | words, vector = pd.read_pickle(filename) ## polyglot-en.pkl 91 | words = [''] + list(words) ## add PAD ID 92 | vector = np.append(np.zeros((1,64)),vector,axis=0) 93 | self.vocabulary = {word:index for index,word in enumerate(words)} 94 | self.reverse_vocabulary = dict(zip(self.vocabulary.values(), self.vocabulary.keys())) 95 | self.index2vec = vector 96 | self.word_embedding = tf.get_variable(name="word_embedding", shape=vector.shape, initializer=tf.constant_initializer(vector), trainable=True) 97 | 98 | def convert2index(self, doc, unk = "UNK"): 99 | word_index = [] 100 | for sent in doc: 101 | sub = [] 102 | for word in sent: 103 | if(word in self.vocabulary): 104 | index = self.vocabulary[word] 105 | sub.append(index) 106 | else: 107 | if(unk == "UNK"): 108 | unk_index = self.vocabulary[""] 109 | sub.append(unk_index) 110 | word_index.append(sub) 111 | return word_index 112 | 113 | def get_char_list(self,tokens): 114 | if os.path.exists("./char_list.csv"): 115 | char_data = pd.read_csv("./char_list.csv", sep = ",", encoding='CP949') 116 | char = list(char_data.iloc[:,1]) 117 | print("char_list loaded!") 118 | else: 119 | t = [] 120 | for token in tokens: 121 | t += token 122 | t = np.array(t) 123 | s = [list(set(chain.from_iterable(elements))) for elements in t] 124 | s = np.array(s).flatten() 125 | char = list(set(chain.from_iterable(s))) 126 | char = sorted(char) 127 | char = [""] + char 128 | c = pd.DataFrame(char) 129 | c.to_csv("./char_list.csv", sep = ",") 130 | print("char_list saved!") 131 | 132 | self.char_list = char 133 | self.char_dict = {char:index for index, char in enumerate(self.char_list)} 134 | 135 | 136 | def sent2char(self, inputs, train = "train"): ## inputs : [batch_size, max_sent_len] 137 | 138 | if os.path.exists("./sent2char_{}.pkl".format(train)): 139 | with open("./sent2char_{}.pkl".format(train), 'rb') as f: 140 | outputs,char_len = pickle.load(f) 141 | else: 142 | char_len, outputs = [], [] 143 | for sent in inputs: 144 | sub_char_len, sub_outputs = [], [] 145 | for word in sent: 146 | if word == "": 147 | sub_char_len.append(0) 148 | sub_outputs.append([0]*self.max_char_len) 149 | else: 150 | if(len(word) > self.max_char_len): 151 | word = 
word[:self.max_char_len] 152 | sub_char_len.append(len(word)) 153 | sub_outputs.append([self.char_dict[char] for char in word]) 154 | outputs.append(pad_sequences(sub_outputs, maxlen = self.max_char_len, padding = "post")) 155 | char_len.append(sub_char_len) 156 | 157 | outputs = np.array(outputs) 158 | char_len = np.array(char_len) 159 | results = (outputs,char_len) 160 | with open("./sent2char_{}.pkl".format(train), 'wb') as f: 161 | pickle.dump(results , f) 162 | 163 | return outputs,char_len 164 | 165 | def get_char_embedding(self, embedding_size, vocab_size): 166 | self.char_embedding = tf.get_variable('char_embedding', [vocab_size, embedding_size]) 167 | self.clear_char_embedding_padding = tf.scatter_update(self.char_embedding, [0], tf.constant(0.0, shape=[1, embedding_size])) 168 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri May 3 14:27:21 2019 4 | 5 | @author: jbk48 6 | """ 7 | 8 | import model 9 | import tensorflow as tf 10 | 11 | if __name__ == '__main__': 12 | 13 | flags = tf.app.flags 14 | FLAGS = flags.FLAGS 15 | 16 | ## Model parameter 17 | flags.DEFINE_integer('word_dim', 64, 'dimension of word vector') 18 | flags.DEFINE_integer('char_dim', 15, 'dimension of character vector') 19 | flags.DEFINE_integer('max_sent_len', 100, 'max length of words of sentences') 20 | flags.DEFINE_integer('max_char_len', 16, 'max length of characters of words') 21 | flags.DEFINE_float('learning_rate', 0.0001, 'initial learning rate') 22 | flags.DEFINE_integer('num_train_steps', 20000, 'number of training steps for learning rate decay') 23 | flags.DEFINE_integer('batch_size', 128, 'number of batch size') 24 | flags.DEFINE_integer('training_epochs', 12, 'number of training epochs') 25 | ## Transformer-Encoder parameter 26 | flags.DEFINE_integer('num_layers', 7, 'number of layers of transformer encoders') 27 | flags.DEFINE_integer('num_heads', 4, 'number of heads of transformer encoders') 28 | flags.DEFINE_integer('linear_key_dim', 4*32, 'dimension of') 29 | flags.DEFINE_integer('linear_value_dim', 4*32, 'dimension of') 30 | flags.DEFINE_integer('model_dim', 64*2, 'output dimension of transformer encoder') 31 | flags.DEFINE_integer('ffn_dim', 64*2, 'dimension of feed forward network') 32 | flags.DEFINE_integer('n_class', 4, 'number of output class') 33 | flags.DEFINE_string('char_mode', 'char_cnn', 'mode of character embedding') 34 | 35 | print('========================') 36 | for key in FLAGS.__flags.keys(): 37 | print('{} : {}'.format(key, getattr(FLAGS, key))) 38 | print('========================') 39 | ## Build model 40 | t_model = model.Model(FLAGS.word_dim, FLAGS.char_dim, FLAGS.max_sent_len, FLAGS.max_char_len, 41 | FLAGS.learning_rate, FLAGS.num_train_steps) 42 | t_model.build_parameter(FLAGS.num_layers, FLAGS.num_heads, FLAGS.linear_key_dim, FLAGS.linear_value_dim, 43 | FLAGS.model_dim, FLAGS.ffn_dim, FLAGS.n_class) 44 | 45 | ## Train model 46 | t_model.train(FLAGS.batch_size, FLAGS.training_epochs, FLAGS.char_mode) 47 | 48 | -------------------------------------------------------------------------------- /transformer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Jan 21 16:33:17 2019 4 | 5 | @author: jbk48 6 | """ 7 | 8 | import tensorflow as tf 9 | 10 | from attention import Attention 11 | from layer import FFN, 
LayerNormalization 12 | 13 | class Encoder: 14 | 15 | def __init__(self, 16 | num_layers=6, 17 | num_heads=8, 18 | linear_key_dim=32*8, 19 | linear_value_dim=32*8, 20 | model_dim=64, 21 | ffn_dim=64, 22 | dropout=0.2, 23 | n_class=4, 24 | batch_size=128): 25 | 26 | self.num_layers = num_layers 27 | self.num_heads = num_heads 28 | self.linear_key_dim = linear_key_dim 29 | self.linear_value_dim = linear_value_dim 30 | self.model_dim = model_dim 31 | self.ffn_dim = ffn_dim 32 | self.dropout = dropout 33 | self.n_class = n_class 34 | self.batch_size = batch_size 35 | self.layer_norm = LayerNormalization(self.model_dim) 36 | 37 | def build(self, encoder_inputs, seq_len): 38 | o1 = tf.identity(encoder_inputs) 39 | 40 | for i in range(1, self.num_layers+1): 41 | with tf.variable_scope("layer-{}".format(i)): 42 | o2 = self._add_and_norm(o1, self._self_attention(q=o1, 43 | k=o1, 44 | v=o1, 45 | seq_len=seq_len), num=1) 46 | o3 = self._add_and_norm(o2, self._positional_feed_forward(o2), num=2) 47 | o1 = tf.identity(o3) 48 | 49 | with tf.variable_scope("GlobalAveragePooling-layer"): 50 | o3 = self._pooling_layer(q=o1, k=o1, v=o1, seq_len =seq_len) 51 | 52 | return o3 53 | 54 | 55 | def _pooling_layer(self, q, k, v, seq_len): 56 | with tf.variable_scope("self-attention"): 57 | attention = Attention(num_heads=self.num_heads, 58 | masked=True, 59 | linear_key_dim=self.linear_key_dim, 60 | linear_value_dim=self.linear_value_dim, 61 | model_dim=self.model_dim, 62 | dropout=self.dropout, 63 | batch_size=self.batch_size) 64 | return attention.classifier_head(q, k, v, seq_len) 65 | 66 | def _self_attention(self, q, k, v, seq_len): 67 | with tf.variable_scope("self-attention"): 68 | attention = Attention(num_heads=self.num_heads, 69 | masked=True, 70 | linear_key_dim=self.linear_key_dim, 71 | linear_value_dim=self.linear_value_dim, 72 | model_dim=self.model_dim, 73 | dropout=self.dropout, 74 | batch_size=self.batch_size) 75 | return attention.multi_head(q, k, v, seq_len) 76 | 77 | def _add_and_norm(self, x, sub_layer_x, num=0): 78 | with tf.variable_scope("add-and-norm-{}".format(num)): 79 | return self.layer_norm(tf.add(x, sub_layer_x)) # with Residual connection 80 | 81 | def _positional_feed_forward(self, output): 82 | with tf.variable_scope("feed-forward"): 83 | ffn = FFN(w1_dim=self.ffn_dim, 84 | w2_dim=self.model_dim, 85 | dropout=self.dropout) 86 | return ffn.dense_gelu_dense(output) 87 | --------------------------------------------------------------------------------
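
Note: the sequence-length padding mask built inside `Attention._scaled_dot_product` (the `row_vector < matrix` outer-product trick) can be hard to follow from the TF1 ops alone. The standalone NumPy sketch below is not part of the repository; it uses assumed toy values (`max_seq_len`, `seq_len`) purely to illustrate the same construction.
```python
import numpy as np

# Toy values, assumed for illustration only.
max_seq_len = 4
seq_len = np.array([2, 3])                     # true lengths of two example sentences

row_vector = np.arange(max_seq_len)            # [0, 1, 2, 3]
t = (row_vector < seq_len[:, None]).astype(np.float32)  # [batch, max_seq_len]; 1 = real token
masks = t[:, :, None] * t[:, None, :]          # [batch, max_seq_len, max_seq_len] outer product

scores = np.random.randn(2, max_seq_len, max_seq_len)   # stand-in for q.k^T / sqrt(d_head)
scores = np.where(masks == 0, -1e9, scores)    # padded positions get a large negative score
probs = np.exp(scores - scores.max(axis=-1, keepdims=True))
probs /= probs.sum(axis=-1, keepdims=True)     # softmax: ~0 attention weight on padded positions
print(probs[0].round(3))
```
In the TensorFlow code the same mask is additionally tiled across the `num_heads` dimension before being applied to the `[batch_size, num_heads, max_seq_len, max_seq_len]` score tensor.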