├── .gitignore
├── README.md
├── cnn_model.py
├── cnn_rnn_model.py
├── config.py
├── data
│   ├── predict_first.csv
│   └── train_first.csv
├── data_helper.py
├── predict.py
├── preprocess.py
├── rnn_cnn_model.py
├── rnn_model.py
├── train.py
└── utils
    ├── __init__.py
    ├── config.py
    ├── log.py
    ├── model_helper.py
    └── nlp_util.py

/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 |
49 | # Translations
50 | *.mo
51 | *.pot
52 |
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 |
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 |
61 | # Scrapy stuff:
62 | .scrapy
63 |
64 | # Sphinx documentation
65 | docs/_build/
66 |
67 | # PyBuilder
68 | target/
69 |
70 | # Jupyter Notebook
71 | .ipynb_checkpoints
72 |
73 | # pyenv
74 | .python-version
75 |
76 | # celery beat schedule file
77 | celerybeat-schedule
78 |
79 | # SageMath parsed files
80 | *.sage.py
81 |
82 | # dotenv
83 | .env
84 |
85 | # virtualenv
86 | .venv
87 | venv/
88 | ENV/
89 |
90 | # Spyder project settings
91 | .spyderproject
92 | .spyproject
93 |
94 | # Rope project settings
95 | .ropeproject
96 |
97 | # mkdocs documentation
98 | /site
99 |
100 | # mypy
101 | .mypy_cache/
102 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Text Classify
2 |
3 | This code comes from a competition on text classification. It implements the common RNN, CNN, and CNN-RNN models; the code has been tidied up here for easy reference.
4 |
5 | ## Model training steps
6 |
7 | ### Environment
8 | tensorflow == 1.3
9 | python == 2.7
10 |
11 | ### Data preprocessing
12 |
13 | - Tokenization
14 | Since the dataset is large, the text is tokenized ahead of time, which avoids repeating the tokenization cost during later training and hyper-parameter tuning.
15 |
16 | - Word-vector training
17 | Word vectors are trained for the downstream models, and the embedding matrix is saved.
18 |
19 | Run: `python preprocess.py`
20 |
21 | ### Training modes
22 |
23 | There are three training modes: single, multi, and kfold; the default is single.
24 | - single trains the model once;
25 | - multi trains the model several times on randomly drawn splits and averages the results;
26 | - kfold trains with k-fold cross-validation and averages the fold results.
27 |
28 | Run: `python train.py --mode=single`
--------------------------------------------------------------------------------
/cnn_model.py:
--------------------------------------------------------------------------------
1 | #!/bin/env python
2 | #-*- encoding: utf-8 -*-
3 |
4 |
5 | import os
6 | import time
7 | import numpy as np
8 | import tensorflow as tf
9 | from utils import model_helper
10 | import config
11 |
12 |
13 | class ModelParas(object):
14 |     embedding_size = config.embedding_size
15 |     cell_num_units = 256
16 |     num_layers = 1
17 |     batch_size = 64
18 |     cnn_dropout = 0.0
19 |     rnn_dropout = 0.0
20 |     learning_rate = 0.01
21 |     decay = 0.99
22 |     lrshrink = 5
23 |     uniform_init_scale = 0.04
24 |     clip_gradient_norm = 5.0
25 |     filter_sizes = [3, 4, 5]
26 |     l2_reg_lambda = 0.0
27 |     max_pool_size = 4
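    # filter_sizes are the n-gram widths of the convolution kernels; max_pool_size
    # is the pooling window and stride over the time axis, so a batch whose longest
    # sentence has L tokens is reduced to ceil(L / max_pool_size) time steps before
    # the pooled features are handed to the bidirectional LSTM below.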
28 |     num_filters = 32
29 |     epochs = 20
30 |
31 |
32 | class Model(object):
33 |
34 |
35 |     def __init__(self, paras, sess, mode, emb_matrix):
36 |         self.paras = paras
37 |         self.sess = sess
38 |         self.mode = mode
39 |
40 |         # Model variable
41 |         with tf.device('/cpu:0'):
42 |             self.embeddings = tf.get_variable(
43 |                 name = 'embeddings',
44 |                 shape = emb_matrix.shape,
45 |                 dtype = tf.float32,
46 |                 initializer = tf.constant_initializer(emb_matrix))
47 |             self.global_step = tf.get_variable(
48 |                 name = 'global_step',
49 |                 dtype = tf.int32,
50 |                 initializer = 1,
51 |                 trainable = False)
52 |
53 |         self._build_graph()
54 |
55 |
56 |     def _create_placeholder(self):
57 |         self.lr = tf.placeholder(tf.float32, [], name = 'learning_rate')
58 |         self.sents = tf.placeholder(tf.int32, [None, None], name = 'sents')
59 |         with tf.device('/cpu:0'):
60 |             self.emb_sents = tf.nn.embedding_lookup(
61 |                 self.embeddings, self.sents)
62 |         # Expand dimension to meet the input requirement of 2d-conv
63 |         self.emb_expand = tf.expand_dims(self.emb_sents, -1)
64 |         self.sent_lengths = tf.placeholder(tf.int32, [None], name = 'sent_lengths')
65 |         self.pad = tf.placeholder(tf.float32, [None, 1, self.paras.embedding_size, 1], name='pad')
66 |         self.labels = tf.placeholder(tf.int32, [None], name = 'labels')
67 |
68 |
69 |     def _inference(self):
70 |         # Convolution network
71 |         with tf.name_scope('cnn'):
72 |
73 |             # After conv and pooling, the time dimension shrinks to ceil(max_length / max_pool_size)
74 |             max_length = tf.reduce_max(self.sent_lengths)
75 |             div_value = tf.div(tf.cast(max_length, tf.float32), self.paras.max_pool_size)
76 |             reduced_size = tf.cast(tf.ceil(div_value), tf.int32)
77 |
78 |             pooled_concat = []
79 |             for i, filter_size in enumerate(self.paras.filter_sizes):
80 |                 with tf.name_scope('conv-pool-%s' % filter_size):
81 |                     # Pad with zeros so the conv output keeps the same time dimension as the input
82 |                     # shape is: [batch_size, sent_length, emb_size, channel]
83 |                     num_prio = (filter_size - 1) // 2
84 |                     num_post = (filter_size - 1) - num_prio
85 |                     pad_prio = tf.concat([self.pad] * num_prio, 1)
86 |                     pad_post = tf.concat([self.pad] * num_post, 1)
87 |                     emb_pad = tf.concat([pad_prio, self.emb_expand, pad_post], 1)
88 |                     # Prepare filter for conv
89 |                     filter_ = tf.get_variable(
90 |                         name = 'filter-%s' % filter_size,
91 |                         shape = [filter_size, self.paras.embedding_size, 1, self.paras.num_filters])
92 |                     # conv: [batch_size, sent_length, 1, num_filters]
93 |                     conv = tf.nn.conv2d(
94 |                         input = emb_pad,
95 |                         filter = filter_,
96 |                         strides = [1, 1, 1, 1],
97 |                         padding = 'VALID',
98 |                         name = 'conv')
99 |                     # Bias
100 |                     b = tf.get_variable(
101 |                         name = 'bias-%s' % filter_size,
102 |                         shape = [self.paras.num_filters])
103 |                     h = tf.nn.relu(tf.nn.bias_add(conv, b))
104 |                     # Max pooling over the outputs
105 |                     pooled = tf.nn.max_pool(
106 |                         value = h,
107 |                         ksize = [1, self.paras.max_pool_size, 1, 1],
108 |                         strides = [1, self.paras.max_pool_size, 1, 1],
109 |                         padding ='SAME',
110 |                         name ='pool')
111 |                     pooled = tf.reshape(pooled, [-1, reduced_size, self.paras.num_filters])
112 |                     pooled_concat.append(pooled)
113 |             # pooled_concat: [batch_size, reduced_size, filter_sizes * num_filters]
114 |             pooled_concat = tf.concat(pooled_concat, 2)
115 |             if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
116 |                 pooled_concat = tf.nn.dropout(pooled_concat, 1.0 - self.paras.cnn_dropout)
117 |
118 |         # RNN network
119 |         with tf.name_scope('rnn'):
120 |             cells_fw = model_helper.create_rnn_cell(
121 |                 'lstm',
122 |                 self.paras.cell_num_units,
123 |                 self.paras.num_layers,
124 |                 self.paras.rnn_dropout,
125 |                 self.mode)
126 |             cells_bw = model_helper.create_rnn_cell(
127 |                 'lstm',
128 |                 self.paras.cell_num_units,
129 |                 self.paras.num_layers,
130 |                 self.paras.rnn_dropout,
131 |                 self.mode)
132 |             outputs, output_states = tf.nn.bidirectional_dynamic_rnn(
133 |                 cells_fw,
134 |                 cells_bw,
135 |                 inputs = pooled_concat,
136 |                 dtype = tf.float32)
137 |             # states_fw: (batch_size, reduced_size, cell_size)
138 |             states_fw, states_bw = outputs
139 |             concat_states = tf.concat([states_fw, states_bw], axis = 2)
140 |             # sent_states: (batch_size, 2 * cell_size)
141 |             self.sent_states = tf.reduce_max(concat_states, axis = 1)
142 |
143 |         with tf.name_scope('classify'):
144 |             hidden1 = tf.contrib.layers.fully_connected(
145 |                 inputs = self.sent_states,
146 |                 num_outputs = 512)
147 |             hidden2 = tf.contrib.layers.fully_connected(
148 |                 inputs = hidden1,
149 |                 num_outputs = 5)
150 |             self.predicts = tf.reduce_max(tf.contrib.layers.fully_connected(
151 |                 inputs = hidden2,
152 |                 activation_fn = None,
153 |                 num_outputs = 1), axis = 1)
154 |             self.mse = tf.reduce_mean(tf.cast(
155 |                 tf.squared_difference(
156 |                     self.labels,
157 |                     tf.cast(tf.round(self.predicts), tf.int32)),
158 |                 tf.float32))
159 |
160 |         with tf.name_scope('accuracy'):
161 |             correct_prediction = tf.equal(self.labels,
162 |                 tf.cast(tf.round(self.predicts), tf.int32))
163 |             self.accuracy = tf.reduce_mean(tf.cast(
164 |                 correct_prediction, tf.float32))
165 |
166 |
167 |     def _create_loss(self):
168 |         with tf.name_scope('loss'):
169 |             self.loss = tf.reduce_mean(
170 |                 tf.losses.mean_squared_error(
171 |                     labels = tf.cast(self.labels, tf.float32),
172 |                     predictions = self.predicts))
173 |
174 |
175 |     def _create_optimizer(self):
176 |         with tf.name_scope('optimizer'):
177 |             self.optimizer = tf.contrib.layers.optimize_loss(
178 |                 loss = self.loss,
179 |                 global_step = self.global_step,
180 |                 learning_rate = self.lr,
181 |                 optimizer = 'SGD',
182 |                 clip_gradients = self.paras.clip_gradient_norm)
183 |
184 |
185 |     def _create_summary(self):
186 |         log_path = os.path.join(config.model_path, 'tensorboard')
187 |         self.train_writer = tf.summary.FileWriter(
188 |             os.path.join(log_path, 'train'), self.sess.graph)
189 |         self.test_writer = tf.summary.FileWriter(
190 |             os.path.join(log_path, 'test'), self.sess.graph)
191 |         with tf.name_scope('summary') as scope:
192 |             tf.summary.scalar('loss', self.loss)
193 |             tf.summary.scalar('accuracy', self.accuracy)
194 |
195 |
196 |     def _build_graph(self):
197 |         self._create_placeholder()
198 |         self._inference()
199 |         self._create_loss()
200 |         self._create_optimizer()
201 |         self._create_summary()
202 |         print 'Build graph done'
203 |
204 |
205 | def test():
206 |     from data_helper import Helper
207 |     sess = tf.Session()
208 |     paras = ModelParas()
209 |     emb_matrix = Helper.get_emb_matrix()
210 |     Model(paras, sess, tf.contrib.learn.ModeKeys.TRAIN, emb_matrix)
211 |
212 |
213 | if __name__ == '__main__':
214 |     pass
215 |
--------------------------------------------------------------------------------
/cnn_rnn_model.py:
--------------------------------------------------------------------------------
1 | #!/bin/env python
2 | #-*- encoding: utf-8 -*-
3 |
4 |
5 | import os
6 | import time
7 | import numpy as np
8 | import tensorflow as tf
9 | from utils import model_helper
10 | import config
11 |
12 |
13 | class ModelParas(object):
14 |     embedding_size = config.embedding_size
15 |     batch_size = 64
16 |     sequence_length = None
17 |     learning_rate = 0.01
18 |     decay = 0.99
19 |     lrshrink = 5
20 |     uniform_init_scale = 0.04
21 |     clip_gradient_norm = 5.0
22 |     l2_reg_lambda = 0.0
23 |     nclasses = 5
24 |     epochs = 20
25 |
26 |     # CNN
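    # In this model the CNN runs first: each filter convolves over the padded word
    # embeddings and is max-pooled over time with max_pool_size, and the pooled
    # feature sequence is then fed to a bidirectional LSTM; rnn_cnn_model.py applies
    # the two blocks in the opposite order.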
27 | cnn_dropout = 0.5 28 | filter_sizes = [3, 4, 5] 29 | max_pool_size = 2 30 | num_filters = 128 31 | 32 | # RNN 33 | rnn_dropout = 0.0 34 | cell_num_units = 256 35 | num_layers = 1 36 | 37 | 38 | class Model(object): 39 | 40 | 41 | def __init__(self, paras, sess, mode, emb_matrix): 42 | self.paras = paras 43 | self.sess = sess 44 | self.mode = mode 45 | self.emb_matrix = emb_matrix 46 | self._build_graph() 47 | 48 | 49 | def _create_placeholder(self): 50 | self.sents = tf.placeholder(tf.int32, [None, None], name = 'sents') 51 | self.sent_lengths = tf.placeholder(tf.int32, [None], name = 'sent_lengths') 52 | self.pad = tf.placeholder(tf.float32, [None, 1, self.paras.embedding_size, 1], name='pad') 53 | self.labels = tf.placeholder(tf.int32, [None], name = 'labels') 54 | self.lr = tf.placeholder(tf.float32, [], name = 'learning_rate') 55 | 56 | 57 | def _create_variable(self): 58 | # Model variable 59 | with tf.device('/cpu:0'): 60 | self.embeddings = tf.get_variable( 61 | name = 'embeddings', 62 | shape = self.emb_matrix.shape, 63 | dtype = tf.float32, 64 | initializer = tf.constant_initializer(self.emb_matrix)) 65 | self.global_step = tf.get_variable( 66 | name = 'global_step', 67 | dtype = tf.int32, 68 | initializer = 1, 69 | trainable = False) 70 | 71 | 72 | def _inference(self): 73 | with tf.device('/cpu:0'): 74 | self.emb_sents = tf.nn.embedding_lookup( 75 | self.embeddings, self.sents) 76 | # Expand dimension so meet input requirement of 2d-conv 77 | self.emb_expand = tf.expand_dims(self.emb_sents, -1) 78 | 79 | # Convolution network 80 | with tf.name_scope('cnn'): 81 | # After conv and pooling, 82 | max_length = tf.reduce_max(self.sent_lengths) 83 | div_value = tf.div(tf.cast(max_length, tf.float32), self.paras.max_pool_size) 84 | reduced_size = tf.cast(tf.ceil(div_value), tf.int32) 85 | pooled_concat = [] 86 | for filter_size in self.paras.filter_sizes: 87 | with tf.name_scope('conv-pool-%s' % filter_size): 88 | # Padding zero to keep conv output has same dimention as input 89 | # shape is : [batch_size, sent_length, emb_size, channel] 90 | num_prio = (filter_size - 1) // 2 91 | num_post = (filter_size - 1) - num_prio 92 | pad_prio = tf.concat([self.pad] * num_prio, 1) 93 | pad_post = tf.concat([self.pad] * num_post, 1) 94 | emb_pad = tf.concat([pad_prio, self.emb_expand, pad_post], 1) 95 | # Prepare filter for conv 96 | filter_ = tf.get_variable( 97 | name = 'filter-%s' % filter_size, 98 | shape = [filter_size, self.paras.embedding_size, 1, self.paras.num_filters]) 99 | # conv: [batch_size, sent_length, 1, num_filters] 100 | conv = tf.nn.conv2d( 101 | input = emb_pad, 102 | filter = filter_, 103 | strides = [1, 1, 1, 1], 104 | padding = 'VALID', 105 | name = 'conv') 106 | # Bias 107 | b = tf.get_variable( 108 | name = 'bias-%s' % filter_size, 109 | shape = [self.paras.num_filters]) 110 | h = tf.nn.relu(tf.nn.bias_add(conv, b)) 111 | # Max pooling over the outputs 112 | pooled = tf.nn.max_pool( 113 | value = h, 114 | ksize = [1, self.paras.max_pool_size, 1, 1], 115 | strides = [1, self.paras.max_pool_size, 1, 1], 116 | padding ='SAME', 117 | name ='pool') 118 | pooled = tf.reshape(pooled, [-1, reduced_size, self.paras.num_filters]) 119 | pooled_concat.append(pooled) 120 | # pooled_concat: (batch_size, reduced_size, filter_sizes * num_filters) 121 | self.pooled_concat = tf.concat(pooled_concat, 2) 122 | if self.mode == tf.contrib.learn.ModeKeys.TRAIN: 123 | self.pooled_concat = tf.nn.dropout(self.pooled_concat, 1.0 - self.paras.cnn_dropout) 124 | 125 | # RNN network 126 | with 
tf.name_scope('rnn'): 127 | cells_fw = model_helper.create_rnn_cell( 128 | 'lstm', 129 | self.paras.cell_num_units, 130 | self.paras.num_layers, 131 | self.paras.rnn_dropout, 132 | self.mode) 133 | cells_bw = model_helper.create_rnn_cell( 134 | 'lstm', 135 | self.paras.cell_num_units, 136 | self.paras.num_layers, 137 | self.paras.rnn_dropout, 138 | self.mode) 139 | outputs, output_states = tf.nn.bidirectional_dynamic_rnn( 140 | cells_fw, 141 | cells_bw, 142 | inputs = self.pooled_concat, 143 | dtype = tf.float32) 144 | # states_fw: (batch_size, reduced_size, cell_size) 145 | states_fw, states_bw = outputs 146 | concat_states = tf.concat([states_fw, states_bw], axis = 2) 147 | # sent_states: (batch_size, 2 * cell_size) 148 | self.sent_states = tf.reduce_max(concat_states, axis = 1) 149 | 150 | with tf.name_scope('classify'): 151 | hidden1 = tf.contrib.layers.fully_connected( 152 | inputs = self.sent_states, 153 | num_outputs = 512) 154 | hidden2 = tf.contrib.layers.fully_connected( 155 | inputs = hidden1, 156 | num_outputs = 5) 157 | self.predicts = tf.reduce_max(tf.contrib.layers.fully_connected( 158 | inputs = hidden2, 159 | activation_fn = None, 160 | num_outputs = 1), axis = 1) 161 | self.mse = tf.reduce_mean(tf.cast( 162 | tf.squared_difference( 163 | self.labels, 164 | tf.cast(tf.round(self.predicts), tf.int32)), 165 | tf.float32)) 166 | 167 | with tf.name_scope('accuracy'): 168 | correct_prediction = tf.equal(self.labels, 169 | tf.cast(tf.round(self.predicts), tf.int32)) 170 | self.accuracy = tf.reduce_mean(tf.cast( 171 | correct_prediction, tf.float32)) 172 | 173 | 174 | def _create_loss(self): 175 | with tf.name_scope('loss'): 176 | self.loss = tf.reduce_mean( 177 | tf.losses.mean_squared_error( 178 | labels = tf.cast(self.labels, tf.float32), 179 | predictions = self.predicts)) 180 | 181 | 182 | def _create_optimizer(self): 183 | with tf.name_scope('optimizer'): 184 | self.optimizer = tf.contrib.layers.optimize_loss( 185 | loss = self.loss, 186 | global_step = self.global_step, 187 | learning_rate = self.lr, 188 | optimizer = 'SGD', 189 | clip_gradients = self.paras.clip_gradient_norm) 190 | 191 | 192 | def _create_summary(self): 193 | log_path = os.path.join(config.model_path, 'tensorboard') 194 | self.train_writer = tf.summary.FileWriter( 195 | os.path.join(log_path, 'train'), self.sess.graph) 196 | self.test_writer = tf.summary.FileWriter( 197 | os.path.join(log_path, 'test'), self.sess.graph) 198 | with tf.name_scope('summary') as scope: 199 | tf.summary.scalar('loss', self.loss) 200 | tf.summary.scalar('accuracy', self.accuracy) 201 | 202 | 203 | def _build_graph(self): 204 | self._create_variable() 205 | self._create_placeholder() 206 | self._inference() 207 | self._create_loss() 208 | self._create_optimizer() 209 | self._create_summary() 210 | print 'Build graph done' 211 | 212 | 213 | def test(): 214 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # so the IDs match nvidia-smi 215 | os.environ["CUDA_VISIBLE_DEVICES"] = "" # "0, 1" for multiple 216 | from data_helper import Helper 217 | sess = tf.Session() 218 | paras = ModelParas() 219 | emb_matrix = Helper.get_emb_matrix() 220 | Model(paras, sess, tf.contrib.learn.ModeKeys.TRAIN, emb_matrix) 221 | 222 | 223 | if __name__ == '__main__': 224 | test() 225 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | #-*- encoding: utf-8 -*- 3 | 4 | 5 | # Data 6 | raw_train_fpath = 
'./data/train_first.csv' 7 | raw_predict_fpath = './data/predict_first.csv' 8 | train_fpath = './data/train.txt' 9 | predict_fpath = './data/predict.txt' 10 | 11 | 12 | # Word2vec 13 | embedding_size = 300 14 | word2vec_fpath = './model/word2vec/w2v_win1_d%d.model' % embedding_size 15 | emb_matrix_fpath = './model/word2vec/emb_matrix_d%d.npy' % embedding_size 16 | word2id_fpath = './model/word2vec/word2id.txt' 17 | 18 | 19 | # Model path 20 | model_path = './model/m0' 21 | 22 | 23 | # Result path 24 | result_path = './data/result.csv' 25 | -------------------------------------------------------------------------------- /data_helper.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | 3 | 4 | import codecs 5 | import numpy as np 6 | from utils.nlp_util import NlpUtil 7 | import config 8 | 9 | 10 | class Helper(object): 11 | 12 | 13 | @classmethod 14 | def init(cls): 15 | pass 16 | 17 | 18 | @classmethod 19 | def sort_by_length(cls, sents, labels): 20 | len_array = np.array([len(s) for s in sents]) 21 | len_perm = len_array.argsort() 22 | sents = sents[len_perm] 23 | labels = labels[len_perm] 24 | return sents, labels 25 | 26 | 27 | @classmethod 28 | def get_data(cls, is_train_data = True, partition = None, 29 | sort_flag = True, rand_seed = None): 30 | if rand_seed is not None: 31 | np.random.seed(rand_seed) 32 | 33 | word2id = {} 34 | with codecs.open(config.word2id_fpath, 'r', 'utf-8') as in_f: 35 | for line in in_f: 36 | word, id_ = line.rstrip().split('\t') 37 | word2id[word] = int(id_) 38 | 39 | def split_text(text): 40 | ret = [word2id[w] for w in text.split('|') if w in word2id] 41 | return ret 42 | 43 | if is_train_data: 44 | # Return data for training 45 | if partition is None: 46 | partition = [0.8, 0.1, 0.1] 47 | partition = [0.0] + [sum(partition[:id_+1]) for id_ in range(3)] 48 | with codecs.open(config.train_fpath, 'r', 'utf-8') as in_f: 49 | train_corpus = [line.strip().split('\t') for line in in_f] 50 | train_data = [split_text(item[1]) for item in train_corpus] 51 | labels = np.array([int(item[2]) for item in train_corpus], 52 | dtype = np.int32) - 1 53 | train_length = len(train_data) 54 | perm = np.random.permutation(train_length) 55 | train_data = np.array(train_data)[perm] 56 | labels = labels[perm] 57 | train, dev, test = {}, {}, {} 58 | data_type = ['train', 'dev', 'test'] 59 | part = np.array(partition) * train_length 60 | part = part.astype(np.int32) 61 | for id_, type_ in enumerate(data_type): 62 | sents_ = train_data[part[id_] : part[id_+1]] 63 | labels_ = labels[part[id_] : part[id_+1]] 64 | if sort_flag is True: 65 | sents_, labels_ = cls.sort_by_length(sents_, labels_) 66 | eval(type_)['sents'] = sents_ 67 | eval(type_)['labels'] = labels_ 68 | # print len(train['sents']), len(dev['sents']), len(test['sents']) 69 | # print '|'.join(map(str, test['sents'][-1])), test['labels'][-1] 70 | return train, dev, test 71 | else: 72 | # Return data for prediction 73 | with codecs.open(config.predict_fpath, 'r', 'utf-8') as in_f: 74 | predict_corpus = [line.strip().split('\t') for line in in_f] 75 | predict_ids = np.array([item[0] for item in predict_corpus]) 76 | predict = np.array([split_text(item[1]) for item in predict_corpus]) 77 | if sort_flag: 78 | predict, predict_ids = cls.sort_by_length(predict, predict_ids) 79 | return predict_ids, predict 80 | 81 | 82 | @classmethod 83 | def get_batch(cls, batch, sequence_length = None): 84 | if sequence_length: 85 | lengths = np.array([len(x[:sequence_length]) for x in 
batch]) 86 | else: 87 | lengths = np.array([len(x) for x in batch]) 88 | max_len = np.max(lengths) 89 | batch_len = len(batch) 90 | embed = np.zeros((batch_len, max_len), np.int32) 91 | for i in xrange(batch_len): 92 | for j in xrange(lengths[i]): 93 | embed[i, j] = batch[i][j] 94 | return embed, lengths 95 | 96 | 97 | @classmethod 98 | def get_emb_matrix(cls): 99 | emb_matrix = np.load(config.emb_matrix_fpath) 100 | print 'Load embedding matrix success' 101 | return emb_matrix 102 | 103 | 104 | def test(): 105 | train, dev, test = Helper.get_data(is_train_data = True, 106 | sort_flag = False, 107 | rand_seed = 1234) 108 | 109 | print train['sents'][:3] 110 | batch = Helper.get_batch(train['sents'][:3]) 111 | print batch 112 | Helper.get_emb_matrix() 113 | 114 | 115 | if __name__ == '__main__': 116 | test() 117 | 118 | -------------------------------------------------------------------------------- /predict.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | #-*- encoding: utf-8 -*- 3 | 4 | 5 | import os 6 | import time 7 | import math 8 | import numpy as np 9 | from collections import defaultdict, Counter 10 | import codecs 11 | import tensorflow as tf 12 | from rnn_model import Model, ModelParas 13 | from data_helper import Helper 14 | from utils.log import logger 15 | import config 16 | 17 | 18 | def load_model(mode): 19 | tf.reset_default_graph() 20 | paras = ModelParas() 21 | sess = tf.Session() 22 | save_path = os.path.join(config.model_path, 'model/model.ckpt') 23 | emb_matrix = Helper.get_emb_matrix() 24 | with tf.variable_scope('Model'): 25 | model = Model(paras, sess, mode, emb_matrix) 26 | saver = tf.train.Saver() 27 | saver.restore(model.sess, save_path) 28 | return model 29 | 30 | 31 | def predict(save_path): 32 | model = load_model(mode = tf.contrib.learn.ModeKeys.EVAL) 33 | predict_ids, predict = Helper.get_data(is_train_data = False) 34 | batch_size = model.paras.batch_size 35 | steps = int(math.ceil(len(predict_ids) * 1.0 / batch_size)) 36 | with codecs.open(save_path, 'w', 'utf-8') as out_f: 37 | for step in xrange(steps): 38 | begin = step * batch_size 39 | end = (step + 1) * batch_size 40 | ids = predict_ids[begin: end] 41 | batch_sents, batch_lengths = Helper.get_batch( 42 | predict[begin: end], model.paras.sequence_length) 43 | feed_dict = { 44 | model.sents: batch_sents, 45 | model.sent_lengths: batch_lengths} 46 | res = model.sess.run(model.predicts, feed_dict) 47 | ids = ids.tolist() 48 | res = res.tolist() 49 | msgs = predict[begin: end].tolist() 50 | for id_, val, msg in zip(ids, res, msgs): 51 | out_f.write('%s,%f\n' % (id_, val + 1)) 52 | del model, predict_ids, predict 53 | print 'Predict done' 54 | 55 | 56 | def fine_tune_result(): 57 | ratio = np.array([0.00587, 0.00973, 0.09389, 0.28954, 0.60097], np.float32) 58 | part = np.array([np.sum(ratio[:i]) for i in range(6)]) * 30000 59 | part[-1] = 30000 60 | part = part.astype(np.int32) 61 | print part 62 | with codecs.open(config.result_path, 'r', 'utf-8') as in_f, \ 63 | codecs.open('fine_tune.csv', 'w', 'utf-8') as out_f: 64 | id_score_list = [] 65 | for line in in_f: 66 | id_, score = line.rstrip().split(',') 67 | id_score_list.append((id_, float(score))) 68 | id_score_list.sort(key = lambda x: x[1]) 69 | for index, item in enumerate(id_score_list): 70 | for i in range(5): 71 | if part[i] <= index < part[i + 1]: 72 | out_f.write('%s,%d\n' % (item[0], i + 1)) 73 | break 74 | print 'Fine tune result done' 75 | 76 | 77 | def _get_vote_value(array): 78 | 
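    # Majority vote over the ensemble results: each model's score is rounded to an
    # integer class and the most frequent class is returned; ties are broken by
    # whichever tied class comes last in the counter's (arbitrary) dict order.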
array = [int(np.round(x)) for x in array] 79 | cnt_dict = Counter(array) 80 | max_v = max([v for k, v in cnt_dict.items()]) 81 | for k, v in cnt_dict.items()[::-1]: 82 | if v == max_v: 83 | return k 84 | 85 | 86 | def _get_mean_value(array): 87 | return np.mean(array) 88 | 89 | 90 | def fuse_result(fuse_mode = 'mean'): 91 | id2result = defaultdict(list) 92 | total_score = 0.0 93 | file_count = 0 94 | for file_ in os.listdir(config.model_path): 95 | if not file_.startswith('result'): 96 | continue 97 | file_count += 1 98 | file_ = os.path.join(config.model_path, file_) 99 | with codecs.open(file_, 'r', 'utf-8') as in_f: 100 | score = float(file_.split('_')[1]) 101 | total_score += score 102 | for line in in_f: 103 | id_, kind_ = line.strip().split(',') 104 | tuple_ = (float(kind_), score) 105 | id2result[id_].append(tuple_) 106 | with codecs.open(config.result_path, 'w', 'utf-8') as out_f: 107 | for id_, list_ in id2result.iteritems(): 108 | array = [kind_ for kind_, score_ in list_] 109 | if fuse_mode == 'mean': 110 | fuse_kind = _get_mean_value(array) 111 | else: 112 | fuse_kind = _get_vote_value(array) 113 | out_f.write('%s,%f\n' % (id_, fuse_kind)) 114 | print id2result['16866b2f-c7e5-319d-b47b-cc9317812bc9'] 115 | print 'Fuse result done' 116 | 117 | 118 | if __name__ == '__main__': 119 | #predict(config.result_path) 120 | fuse_result() 121 | -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | #-*- encoding: utf-8 -*- 3 | 4 | 5 | import numpy as np 6 | import codecs 7 | from utils.nlp_util import NlpUtil 8 | import config 9 | 10 | 11 | def tokenize_corpus(corpus_fpath, save_fpath, is_train_data = True): 12 | 13 | def precess_line(line, is_train_data = True): 14 | try: 15 | line = line.strip() 16 | if is_train_data: 17 | line, flag = line.rsplit(',', 1) 18 | id_, text = line.split(',', 1) 19 | text = text.replace('|', ' ') 20 | text = text.replace('\t', ' ') 21 | text = '|'.join([''] + NlpUtil.tokenize(text, True) + ['']) 22 | #text = '|'.join(NlpUtil.tokenize(text, True)) 23 | return ('\t'.join([id_, text, flag]) + '\n' if is_train_data 24 | else '\t'.join([id_, text]) + '\n') 25 | except Exception as e: 26 | print ('line=%s, errmsg=%s', line, e) 27 | 28 | with codecs.open(corpus_fpath, 'r', 'utf-8') as in_f, \ 29 | codecs.open(save_fpath, 'w', 'utf-8') as out_f: 30 | in_f.readline() 31 | for line in in_f: 32 | out_f.write(precess_line(line, is_train_data)) 33 | print 'Tokenize done' 34 | 35 | 36 | def _get_corpus(): 37 | corpus = [] 38 | for file_ in [config.train_fpath, config.predict_fpath]: 39 | with codecs.open(file_, 'r', 'utf-8') as in_f: 40 | corpus_tmp = [line.strip().split('\t')[1].split('|') 41 | for line in in_f] 42 | corpus.extend(corpus_tmp) 43 | print 'Get corpus done, length is %d' % len(corpus) 44 | return corpus 45 | 46 | 47 | def build_emb_matrix(corpus): 48 | corpus_ = [] 49 | _ = map(lambda x: corpus_.extend(x), corpus) 50 | word2id = NlpUtil.build_word2id(corpus_) 51 | word2vec = NlpUtil.load_word2vec(config.word2vec_fpath) 52 | emb_matrix = NlpUtil.build_emb_matrix(word2vec, 53 | config.embedding_size, word2id) 54 | np.save(config.emb_matrix_fpath, emb_matrix) 55 | with codecs.open(config.word2id_fpath, 'w', 'utf-8') as out_f: 56 | out_f.write('\n'.join(['%s\t%d' % (k, v) for k, v in word2id.iteritems()])) 57 | print 'Build emb_matrix done' 58 | 59 | 60 | if __name__ == '__main__': 61 | # Tokenize data 62 | 
tokenize_corpus(config.raw_train_fpath, config.train_fpath, 63 | is_train_data = True) 64 | tokenize_corpus(config.raw_predict_fpath, config.predict_fpath, 65 | is_train_data = False) 66 | corpus = _get_corpus() 67 | 68 | # Train word2vec 69 | NlpUtil.train_word2vec(corpus, './model/word2vec') 70 | 71 | # Build emb matrix 72 | build_emb_matrix(corpus) 73 | -------------------------------------------------------------------------------- /rnn_cnn_model.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | #-*- encoding: utf-8 -*- 3 | 4 | 5 | import os 6 | import time 7 | import tensorflow as tf 8 | from utils import model_helper 9 | import config 10 | 11 | 12 | class ModelParas(object): 13 | embedding_size = config.embedding_size 14 | batch_size = 64 15 | sequence_length = 70 16 | learning_rate = 0.01 17 | decay = 0.99 18 | lrshrink = 5 19 | uniform_init_scale = 0.04 20 | clip_gradient_norm = 5.0 21 | l2_reg_lambda = 0.0 22 | nclasses = 5 23 | epochs = 20 24 | 25 | # RNN 26 | cell_num_units = 256 27 | num_layers = 1 28 | rnn_dropout = 0.0 29 | 30 | # CNN 31 | filter_sizes = [3, 4, 5] 32 | num_filters = 32 33 | cnn_dropout = 0.0 34 | 35 | 36 | class Model(object): 37 | 38 | 39 | def __init__(self, paras, sess, mode, emb_matrix): 40 | self.paras = paras 41 | self.sess = sess 42 | self.mode = mode 43 | self.emb_matrix = emb_matrix 44 | self._build_graph() 45 | 46 | 47 | def _create_placeholder(self): 48 | self.lr = tf.placeholder(tf.float32, [], name = 'learning_rate') 49 | self.sents = tf.placeholder(tf.int32, [None, None], name = 'sents') 50 | self.sent_lengths = tf.placeholder(tf.int32, [None], name = 'sent_lengths') 51 | self.labels = tf.placeholder(tf.int32, [None], name = 'labels') 52 | 53 | 54 | def _create_variable(self): 55 | with tf.device('/cpu:0'): 56 | self.embeddings = tf.get_variable( 57 | name = 'embeddings', 58 | shape = self.emb_matrix.shape, 59 | dtype = tf.float32, 60 | initializer = tf.constant_initializer(self.emb_matrix)) 61 | self.global_step = tf.get_variable( 62 | name = 'global_step', 63 | dtype = tf.int32, 64 | initializer = 1, 65 | trainable = False) 66 | self.num_filters_total = self.paras.num_filters * len(self.paras.filter_sizes) 67 | self.w_projection = tf.get_variable( 68 | name = 'w_projection', 69 | shape = [self.num_filters_total, self.paras.nclasses]) 70 | self.b_projection = tf.get_variable( 71 | name = 'b_projection', 72 | shape = [self.paras.nclasses]) 73 | self.l2_loss = tf.constant(0.0) 74 | 75 | 76 | def _inference(self): 77 | paras = self.paras 78 | with tf.device('/cpu:0'): 79 | self.emb_sents = tf.nn.embedding_lookup( 80 | self.embeddings, self.sents) 81 | 82 | # RNN network 83 | with tf.name_scope('RNN'): 84 | cells_fw = model_helper.create_rnn_cell( 85 | 'lstm', 86 | paras.cell_num_units, 87 | paras.num_layers, 88 | paras.rnn_dropout, 89 | self.mode) 90 | cells_bw = model_helper.create_rnn_cell( 91 | 'lstm', 92 | paras.cell_num_units, 93 | paras.num_layers, 94 | paras.rnn_dropout, 95 | self.mode) 96 | outputs, output_states = tf.nn.bidirectional_dynamic_rnn( 97 | cells_fw, 98 | cells_bw, 99 | inputs = self.emb_sents, 100 | sequence_length = self.sent_lengths, 101 | dtype = tf.float32) 102 | # states_fw: (batch_size, sent_len, cell_size) 103 | states_fw, states_bw = outputs 104 | # concat_states: (batch_size, sent_len, cell_size * 2) 105 | concat_states = tf.concat([states_fw, states_bw], axis = 2) 106 | # rnn_states_expand: (batch_size, sent_len, cell_size * 2, 1) 107 | 
self.rnn_states_expand = tf.expand_dims(concat_states, -1)
108 |
109 |         # CNN network
110 |         with tf.name_scope('CNN'):
111 |             pooled_concat = []
112 |             for filter_size in paras.filter_sizes:
113 |                 with tf.name_scope('conv-pool-%s' % filter_size):
114 |                     # filter: (filter_size, cell_size * 2, 1, num_filters)
115 |                     filter_ = tf.get_variable(
116 |                         name = 'filter-%s' % filter_size,
117 |                         shape = [filter_size, paras.cell_num_units * 2, 1, paras.num_filters])
118 |                     # conv: (batch_size, sequence_length - filter_size + 1, 1, num_filters)
119 |                     conv = tf.nn.conv2d(
120 |                         input = self.rnn_states_expand,
121 |                         filter = filter_,
122 |                         strides = [1, 1, 1, 1],
123 |                         padding = 'VALID',
124 |                         name = 'conv')
125 |                     # bias: (num_filters,)
126 |                     b = tf.get_variable(
127 |                         name = 'bias-%s' % filter_size,
128 |                         shape = [paras.num_filters])
129 |                     h = tf.nn.relu(tf.nn.bias_add(conv, b))
130 |                     # pooled: (batch_size, 1, 1, num_filters)
131 |                     pooled = tf.nn.max_pool(
132 |                         value = h,
133 |                         ksize = [1, paras.sequence_length - filter_size + 1, 1, 1],
134 |                         strides = [1, 1, 1, 1],
135 |                         padding ='VALID',
136 |                         name ='pool')
137 |                     pooled_concat.append(pooled)
138 |             # h_pool: (batch_size, 1, 1, num_filters_total)
139 |             h_pool = tf.concat(pooled_concat, 3)
140 |             # h_pool_flat: (batch_size, num_filters_total)
141 |             self.h_pool_flat = tf.reshape(h_pool, [-1, self.num_filters_total])
142 |             # dropout
143 |             if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
144 |                 self.h_pool_flat = tf.nn.dropout(self.h_pool_flat, 1.0 - paras.cnn_dropout)
145 |
146 |         with tf.name_scope('classify'):
147 |             # logits: (batch_size, n_classes)
148 |             logits = tf.nn.xw_plus_b(self.h_pool_flat, self.w_projection, self.b_projection, 'logits')
149 |             # predicts: (batch_size,)
150 |             self.predicts = tf.reduce_max(tf.contrib.layers.fully_connected(
151 |                 inputs = logits,
152 |                 activation_fn = None,
153 |                 num_outputs = 1), axis = 1)
154 |             self.mse = tf.reduce_mean(tf.cast(
155 |                 tf.squared_difference(
156 |                     self.labels,
157 |                     tf.cast(tf.round(self.predicts), tf.int32)),
158 |                 tf.float32))
159 |
160 |         with tf.name_scope('accuracy'):
161 |             correct_prediction = tf.equal(self.labels,
162 |                 tf.cast(tf.round(self.predicts), tf.int32))
163 |             self.accuracy = tf.reduce_mean(tf.cast(
164 |                 correct_prediction, tf.float32))
165 |
166 |
167 |     def _create_loss(self):
168 |         with tf.name_scope('loss'):
169 |             self.loss = tf.reduce_mean(
170 |                 tf.losses.mean_squared_error(
171 |                     labels = tf.cast(self.labels, tf.float32),
172 |                     predictions = self.predicts))
173 |             # Add l2 loss reg
174 |             self.l2_loss += tf.nn.l2_loss(self.w_projection)
175 |             self.l2_loss += tf.nn.l2_loss(self.b_projection)
176 |             self.loss += self.l2_loss * self.paras.l2_reg_lambda
177 |
178 |
179 |     def _create_optimizer(self):
180 |         self.optimizer = tf.contrib.layers.optimize_loss(
181 |             loss = self.loss,
182 |             global_step = self.global_step,
183 |             learning_rate = self.lr,
184 |             optimizer = 'SGD',
185 |             clip_gradients = self.paras.clip_gradient_norm)
186 |
187 |
188 |     def _create_summary(self):
189 |         log_path = os.path.join(config.model_path, 'tensorboard')
190 |         self.train_writer = tf.summary.FileWriter(
191 |             os.path.join(log_path, 'train'), self.sess.graph)
192 |         self.test_writer = tf.summary.FileWriter(
193 |             os.path.join(log_path, 'test'), self.sess.graph)
194 |         with tf.name_scope('summaries') as scope:
195 |             tf.summary.scalar('loss', self.loss)
196 |             tf.summary.scalar('accuracy', self.accuracy)
197 |
198 |
199 |     def _build_graph(self):
200 |         self._create_placeholder()
201 |         self._create_variable()
202 |         self._inference()
203 |         self._create_loss()
204 |         self._create_optimizer()
205 |
self._create_summary() 206 | print 'Build graph done' 207 | 208 | 209 | def test(): 210 | from data_helper import Helper 211 | sess = tf.Session() 212 | paras = ModelParas() 213 | emb_matrix = Helper.get_emb_matrix() 214 | Model(paras, sess, tf.contrib.learn.ModeKeys.TRAIN, emb_matrix) 215 | 216 | 217 | if __name__ == '__main__': 218 | test() 219 | -------------------------------------------------------------------------------- /rnn_model.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | #-*- encoding: utf-8 -*- 3 | 4 | 5 | import os 6 | import time 7 | import numpy as np 8 | import tensorflow as tf 9 | from utils import model_helper 10 | import config 11 | 12 | 13 | class ModelParas(object): 14 | embedding_size = config.embedding_size 15 | sequence_length = None 16 | cell_num_units = 512 17 | num_layers = 1 18 | batch_size = 64 19 | dropout = 0.0 20 | learning_rate = 0.01 21 | decay = 0.99 22 | lrshrink = 5 23 | uniform_init_scale = 0.04 24 | clip_gradient_norm = 5.0 25 | epochs = 20 26 | 27 | 28 | class Model(object): 29 | 30 | 31 | def __init__(self, paras, sess, mode, emb_matrix): 32 | self.paras = paras 33 | self.sess = sess 34 | self.mode = mode 35 | 36 | # Model variable 37 | with tf.device('/cpu:0'): 38 | self.embeddings = tf.get_variable( 39 | name = 'embeddings', 40 | shape = emb_matrix.shape, 41 | dtype = tf.float32, 42 | initializer = tf.constant_initializer(emb_matrix)) 43 | self.global_step = tf.get_variable( 44 | name = 'global_step', 45 | dtype = tf.int32, 46 | initializer = 1, 47 | trainable = False) 48 | 49 | self._build_graph() 50 | 51 | 52 | def _create_placeholder(self): 53 | self.lr = tf.placeholder(tf.float32, [], name = 'learning_rate') 54 | self.sents = tf.placeholder(tf.int32, [None, None], name = 'sents') 55 | with tf.device('/cpu:0'): 56 | self.emb_sents = tf.nn.embedding_lookup( 57 | self.embeddings, self.sents) 58 | self.sent_lengths = tf.placeholder(tf.int32, [None], name = 'sent_lengths') 59 | self.labels = tf.placeholder(tf.int32, [None], name = 'labels') 60 | 61 | 62 | def _inference(self): 63 | with tf.variable_scope('encoder') as varscope: 64 | cells_fw = model_helper.create_rnn_cell( 65 | 'lstm', 66 | self.paras.cell_num_units, 67 | self.paras.num_layers, 68 | self.paras.dropout, 69 | self.mode) 70 | cells_bw = model_helper.create_rnn_cell( 71 | 'lstm', 72 | self.paras.cell_num_units, 73 | self.paras.num_layers, 74 | self.paras.dropout, 75 | self.mode) 76 | outputs, output_states = tf.nn.bidirectional_dynamic_rnn( 77 | cells_fw, 78 | cells_bw, 79 | inputs = self.emb_sents, 80 | sequence_length = self.sent_lengths, 81 | dtype = tf.float32, 82 | scope = varscope) 83 | # states_fw: (batch_size, sent_len, cell_size) 84 | states_fw, states_bw = outputs 85 | concat_states = tf.concat([states_fw, states_bw], axis = 2) 86 | # sent_states: (batch_size, 2 * cell_size) 87 | self.sent_states = tf.reduce_max(concat_states, axis = 1) 88 | 89 | with tf.variable_scope('classify_layer') as varscope: 90 | hidden1 = tf.contrib.layers.fully_connected( 91 | inputs = self.sent_states, 92 | num_outputs = 512) 93 | hidden2 = tf.contrib.layers.fully_connected( 94 | inputs = hidden1, 95 | num_outputs = 5) 96 | self.predicts = tf.reduce_max(tf.contrib.layers.fully_connected( 97 | inputs = hidden2, 98 | activation_fn = None, 99 | num_outputs = 1), axis = 1) 100 | self.mse = tf.reduce_mean(tf.cast( 101 | tf.squared_difference( 102 | self.labels, 103 | tf.cast(tf.round(self.predicts), tf.int32)), 104 | tf.float32)) 105 | 
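        # The head above emits one unbounded score per sentence (the last
        # fully-connected layer has a single output and reduce_max squeezes the
        # final axis), so the rating label is treated as a regression target; the
        # accuracy and MSE ops compare the rounded score with the integer label.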
106 |         with tf.variable_scope('accuracy') as varscope:
107 |             correct_prediction = tf.equal(self.labels,
108 |                 tf.cast(tf.round(self.predicts), tf.int32))
109 |             self.accuracy = tf.reduce_mean(tf.cast(
110 |                 correct_prediction, tf.float32))
111 |
112 |
113 |     def _create_loss(self):
114 |         with tf.variable_scope('loss') as varscope:
115 |             self.loss = tf.reduce_mean(
116 |                 tf.losses.mean_squared_error(
117 |                     labels = tf.cast(self.labels, tf.float32),
118 |                     predictions = self.predicts))
119 |
120 |
121 |     def _create_optimizer(self):
122 |         self.optimizer = tf.contrib.layers.optimize_loss(
123 |             loss = self.loss,
124 |             global_step = self.global_step,
125 |             learning_rate = self.lr,
126 |             optimizer = 'SGD',
127 |             clip_gradients = self.paras.clip_gradient_norm)
128 |
129 |
130 |     def _create_summary(self):
131 |         log_path = os.path.join(config.model_path, 'tensorboard')
132 |         self.train_writer = tf.summary.FileWriter(
133 |             os.path.join(log_path, 'train'), self.sess.graph)
134 |         self.test_writer = tf.summary.FileWriter(
135 |             os.path.join(log_path, 'test'), self.sess.graph)
136 |         with tf.name_scope('summaries') as scope:
137 |             tf.summary.scalar('loss', self.loss)
138 |             tf.summary.scalar('accuracy', self.accuracy)
139 |
140 |
141 |     def _build_graph(self):
142 |         self._create_placeholder()
143 |         self._inference()
144 |         self._create_loss()
145 |         self._create_optimizer()
146 |         self._create_summary()
147 |         print 'Build graph done'
148 |
149 |
150 | def test():
151 |     from data_helper import Helper
152 |     sess = tf.Session()
153 |     paras = ModelParas()
154 |     emb_matrix = Helper.get_emb_matrix()
155 |     Model(paras, sess, tf.contrib.learn.ModeKeys.TRAIN, emb_matrix)
156 |
157 |
158 | if __name__ == '__main__':
159 |     pass
160 |
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
1 | #!/bin/env python
2 | #-*- encoding: utf-8 -*-
3 |
4 |
5 | import os
6 | import time
7 | import math
8 | import codecs
9 | import numpy as np
10 | from sklearn.model_selection import KFold
11 | import tensorflow as tf
12 | from rnn_model import Model, ModelParas
13 | from data_helper import Helper
14 | from predict import predict, fuse_result
15 | from utils.log import logger
16 | import config
17 |
18 |
19 | tf.flags.DEFINE_string('model', 'rnn', 'select model, default is rnn')
20 | tf.flags.DEFINE_string('mode', 'single', 'single, multi or kfold, default is single')
21 | flags = tf.flags.FLAGS
22 |
23 |
24 | def run_epoch(model, input_data):
25 |     start_time = time.time()
26 |     paras = model.paras
27 |     average_loss, average_acc, average_mse = 0.0, 0.0, 0.0
28 |     sents, labels = input_data['sents'], input_data['labels']
29 |     data_length = len(sents)
30 |     if data_length == 0:
31 |         return None
32 |     steps = int(math.ceil(data_length * 1.0 / paras.batch_size))
33 |
34 |     for step in xrange(steps):
35 |         begin = step * paras.batch_size
36 |         end = (step + 1) * paras.batch_size
37 |         batch_sents, batch_lengths = Helper.get_batch(
38 |             sents[begin: end], paras.sequence_length)
39 |         batch_labels = labels[begin: end]
40 |         feed_dict = {
41 |             model.sents: batch_sents,
42 |             model.sent_lengths: batch_lengths,
43 |             model.labels: batch_labels.T,
44 |             model.lr: paras.learning_rate}
45 |         if flags.model == 'cnn_rnn':
46 |             feed_dict[model.pad] = np.zeros((
47 |                 len(labels[begin: end]), 1, paras.embedding_size, 1))
48 |         fetches = {
49 |             'b_loss': model.loss,
50 |             'b_acc': model.accuracy,
51 |             'global_step': model.global_step,
52 |             'b_mse': model.mse,
53 |         }
54 |         if model.mode ==
tf.contrib.learn.ModeKeys.TRAIN: 55 | fetches['optimizer'] = model.optimizer 56 | vals = model.sess.run(fetches, feed_dict) 57 | b_loss, b_acc, b_mse, global_step = ( 58 | vals['b_loss'], vals['b_acc'], 59 | vals['b_mse'], vals['global_step']) 60 | b_score = 1.0 / (1.0 + np.sqrt(b_mse)) 61 | average_loss += b_loss 62 | average_acc += b_acc 63 | average_mse += b_mse 64 | if (model.mode == tf.contrib.learn.ModeKeys.TRAIN and global_step % 10 == 0): 65 | logger.debug('step=%d, b_loss=%.4f, b_acc=%.4f, b_mse=%.4f, b_score=%.4f', 66 | global_step, b_loss, b_acc, b_mse, b_score) 67 | 68 | average_loss /= steps 69 | average_acc /= steps 70 | average_mse /= steps 71 | rmse_score = 1.0 / (1.0 + np.sqrt(average_mse)) 72 | logger.debug('average_loss=%.4f, average_acc=%.4f, average_mse=%.4f, rmse_score=%.4f', 73 | average_loss, average_acc, average_mse, rmse_score) 74 | return rmse_score, global_step 75 | 76 | 77 | def train(train_data, valid_data, test_data, emb_matrix): 78 | """Train the model""" 79 | start_time = time.time() 80 | paras = ModelParas() 81 | tf.reset_default_graph() 82 | sess = tf.Session() 83 | # Init initialzer 84 | uniform_initializer = tf.random_uniform_initializer( 85 | minval = -paras.uniform_init_scale, 86 | maxval = paras.uniform_init_scale) 87 | # Define model for train and evaluate 88 | with tf.name_scope('train'): 89 | with tf.variable_scope('Model', reuse = None, 90 | initializer = uniform_initializer): 91 | model_train = Model(paras, 92 | sess, 93 | tf.contrib.learn.ModeKeys.TRAIN, 94 | emb_matrix) 95 | with tf.name_scope('valid'): 96 | with tf.variable_scope('Model', reuse = True, 97 | initializer = uniform_initializer): 98 | model_eval = Model(paras, 99 | sess, 100 | tf.contrib.learn.ModeKeys.EVAL, 101 | emb_matrix) 102 | # Model Train 103 | init_op = tf.global_variables_initializer() 104 | sess.run(init_op) 105 | best_score = -np.inf 106 | saver = tf.train.Saver() 107 | save_path = os.path.join(config.model_path, 'model/model.ckpt') 108 | for epoch in xrange(paras.epochs): 109 | logger.debug('>>> Epoch %d, learning_rate=%.4f', 110 | epoch, paras.learning_rate) 111 | run_epoch(model_train, train_data) 112 | logger.debug('>>> Running Valid') 113 | score, global_step = run_epoch(model_eval, valid_data) 114 | if score > best_score: 115 | best_score = score 116 | saver.save(sess, save_path) 117 | logger.debug('Score improved, save model to %s', save_path) 118 | else: 119 | saver.restore(sess, save_path) 120 | logger.debug('Score not improved, load previous best model') 121 | logger.debug('Epoch %d done, time=%.4f minutes', 122 | epoch, (time.time() - start_time) / 60) 123 | logger.debug('>>> Running Test') 124 | run_epoch(model_eval, test_data) 125 | del model_train 126 | del model_eval 127 | logger.debug('Predict result') 128 | predict(save_path = os.path.join(config.model_path, 129 | 'result_%f' % best_score)) 130 | 131 | 132 | def tmp_predict(model, save_path): 133 | predict_ids, predict = Helper.get_data(is_train_data = False) 134 | batch_size = model.paras.batch_size 135 | steps = int(math.ceil(len(predict_ids) * 1.0 / batch_size)) 136 | with codecs.open(save_path, 'w', 'utf-8') as out_f: 137 | for step in xrange(steps): 138 | begin = step * batch_size 139 | end = (step + 1) * batch_size 140 | ids = predict_ids[begin: end] 141 | batch_sents, batch_lengths = Helper.get_batch( 142 | predict[begin: end], model.paras.sequence_length) 143 | feed_dict = { 144 | model.sents: batch_sents, 145 | model.sent_lengths: batch_lengths} 146 | res = model.sess.run(model.predicts, 
feed_dict) 147 | ids = ids.tolist() 148 | res = res.tolist() 149 | msgs = predict[begin: end].tolist() 150 | for id_, val, msg in zip(ids, res, msgs): 151 | out_f.write('%s,%f\n' % (id_, val)) 152 | del predict_ids, predict 153 | print 'Predict done' 154 | 155 | 156 | 157 | def main(_): 158 | start_time = time.time() 159 | logger.info('Train begin...') 160 | emb_matrix = Helper.get_emb_matrix() 161 | if flags.mode == 'single': 162 | train_data, valid_data, test_data = Helper.get_data( 163 | is_train_data = True, partition = [0.8, 0.2], rand_seed = 666) 164 | train(train_data, valid_data, test_data, emb_matrix) 165 | elif flags.mode == 'multi': 166 | for i in range(10): 167 | print '>>> Multi %d' % i 168 | train_data, valid_data, test_data = Helper.get_data( 169 | is_train_data = True, partition = [0.8, 0.2], rand_seed = None) 170 | train(train_data, valid_data, test_data, emb_matrix) 171 | fuse_result() 172 | elif flags.mode == 'kfold': 173 | data_, _, _ = Helper.get_data( 174 | is_train_data = True, partition = [1.0], sort_flag = False) 175 | sents, labels = data_['sents'], data_['labels'] 176 | kf = KFold(n_splits = 10, shuffle = True, random_state = 123) 177 | train_data, test_data = {}, {} 178 | cnt = 1 179 | for train_index, test_index in kf.split(sents): 180 | print '>>> KFold %d' % cnt 181 | cnt += 1 182 | train_data['sents'] = sents[train_index] 183 | train_data['labels'] = labels[train_index] 184 | test_data['sents'] = sents[test_index] 185 | test_data['labels'] = labels[test_index] 186 | train_data['sents'], train_data['labels'] = Helper.sort_by_length( 187 | train_data['sents'], train_data['labels']) 188 | test_data['sents'], test_data['labels'] = Helper.sort_by_length( 189 | test_data['sents'], test_data['labels']) 190 | train(train_data, test_data, _, emb_matrix) 191 | fuse_result() 192 | else: 193 | raise ValueError('Train mode must be `single | multi | kfold` !') 194 | logger.info('Train done, time=%.4f hours' % ((time.time() - start_time) / 3600)) 195 | 196 | 197 | if __name__ == '__main__': 198 | log_path = './log/train.log' 199 | if os.path.exists(log_path): 200 | os.remove(log_path) 201 | logger.start(log_path, name = __name__) 202 | model_path = config.model_path 203 | if tf.gfile.Exists(model_path): 204 | tf.gfile.DeleteRecursively(model_path) 205 | logger.debug('Remove old model folder.') 206 | tf.app.run() 207 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Dikea/Text-Classification/135b5dec09fcd065b88aa4fdb037607aa8340565/utils/__init__.py -------------------------------------------------------------------------------- /utils/config.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | #-*- encoding: utf-8 -*- 3 | 4 | 5 | stop_word_set = set([u',', u',', u'。', u'.', u'…', u'·', 6 | u'“', u'”', u'"', u'\'', u'(', u')', u'(', u')', u'~', 7 | u'~', u'、', u'\\', u'/', u':', u':', u';', u';', u'!', 8 | u'!', u'?', u'?', u'×', u'=', u'<', u'>', u'[', u']', u'$', 9 | u'@', u'-', u'_', u'│', u'|', u'↑', u'┬', 10 | ]) 11 | -------------------------------------------------------------------------------- /utils/log.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | #-*- encoding: utf-8 -*- 3 | 4 | 5 | import logging 6 | import logging.handlers 7 | import os 8 | import os.path 9 | 10 | 11 | class 
Logger(object): 12 | _inst = None 13 | _level_dict = { 14 | 'CRITICAL': logging.CRITICAL, 15 | 'ERROR': logging.ERROR, 16 | 'WARNING': logging.WARNING, 17 | 'INFO': logging.INFO, 18 | 'DEBUG': logging.DEBUG, 19 | 'NOTSET': logging.NOTSET, 20 | } 21 | 22 | @classmethod 23 | def start(cls, log_path, name = None, level = None): 24 | if cls._inst is not None: 25 | return cls._inst 26 | 27 | fpath = '/'.join(log_path.split('/')[0 : -1]) 28 | if False == os.path.exists(fpath): 29 | os.mkdir(fpath) 30 | fmt = '[%(levelname)s] %(asctime)s, pid=%(process)d, src=%(filename)s:%(lineno)d, %(message)s' 31 | datefmt = '%Y-%m-%d %H:%M:%S' 32 | cls._inst = logging.getLogger(name) 33 | log_level = Logger._level_dict[level] if level else 'DEBUG' 34 | cls._inst.setLevel(log_level) 35 | 36 | handler = logging.handlers.RotatingFileHandler( 37 | log_path, maxBytes = 500 * (1<<20), backupCount = 8) 38 | fmtter = logging.Formatter(fmt, datefmt) 39 | handler.setFormatter(fmtter) 40 | 41 | cls._inst.addHandler(handler) 42 | 43 | @classmethod 44 | def get(cls): 45 | return cls._inst 46 | 47 | @classmethod 48 | def info(cls, *args): 49 | return cls._inst.info(*args) 50 | 51 | @classmethod 52 | def debug(cls, *args): 53 | return cls._inst.debug(*args) 54 | 55 | @classmethod 56 | def warn(cls, *args): 57 | return cls._inst.warn(*args) 58 | 59 | 60 | global logger 61 | logger = Logger 62 | 63 | -------------------------------------------------------------------------------- /utils/model_helper.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | #-*- encoding: utf-8 -*- 3 | 4 | 5 | import tensorflow as tf 6 | 7 | 8 | def create_rnn_cell(unit_type, num_units, num_layers, 9 | dropout, mode, forget_bias = 1.0): 10 | """Create multi-layer RNN cell.""" 11 | cell_list = [] 12 | for i in range(num_layers): 13 | single_cell = _single_cell( 14 | unit_type = unit_type, 15 | num_units = num_units, 16 | forget_bias = forget_bias, 17 | dropout = dropout, 18 | mode = mode) 19 | cell_list.append(single_cell) 20 | if len(cell_list) == 1: 21 | return cell_list[0] 22 | else: 23 | return tf.contrib.rnn.MultiRNNCell(cell_list) 24 | 25 | 26 | def _single_cell(unit_type, num_units, dropout, mode, forget_bias = 1.0): 27 | """Create an instance of a single RNN cell.""" 28 | # Dropout (equal 1 - keep_prob) is set to 0 during eval and infer 29 | dropout = dropout if mode == tf.contrib.learn.ModeKeys.TRAIN else 0.0 30 | 31 | if unit_type == 'lstm': 32 | single_cell = tf.contrib.rnn.BasicLSTMCell( 33 | num_units, 34 | forget_bias = forget_bias) 35 | 36 | if dropout > 0.0: 37 | single_cell = tf.contrib.rnn.DropoutWrapper( 38 | cell = single_cell, 39 | input_keep_prob = (1.0 - dropout)) 40 | 41 | return single_cell 42 | 43 | 44 | def save_model(save_path, sess, inputs, outputs): 45 | """Save model""" 46 | if tf.gfile.Exists(save_path): 47 | tf.gfile.DeleteRecursively(save_path) 48 | builder = tf.saved_model.builder.SavedModelBuilder(save_path) 49 | inputs_ = {k: tf.saved_model.utils.build_tensor_info(v) 50 | for k, v in inputs.iteritems()} 51 | outputs_ = {k: tf.saved_model.utils.build_tensor_info(v) 52 | for k, v in outputs.iteritems()} 53 | signature = tf.saved_model.signature_def_utils.build_signature_def( 54 | inputs_, outputs_, 'signature_') 55 | builder.add_meta_graph_and_variables(sess, ['saved_model'], 56 | signature_def_map = {'signature': signature}) 57 | builder.save() 58 | 59 | 60 | def get_model_tensor(save_path, sess, inputs_fields, outpus_fields): 61 | """Load model""" 62 | 
meta_graph_def = tf.saved_model.loader.load(sess, 63 | ['saved_model'], save_path) 64 | signature = meta_graph_def.signature_def 65 | 66 | 67 | -------------------------------------------------------------------------------- /utils/nlp_util.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | #-*- encoding: utf-8 -*- 3 | 4 | 5 | import os 6 | import time 7 | import jieba 8 | import codecs 9 | import collections 10 | import numpy as np 11 | from gensim import models 12 | import config 13 | 14 | 15 | class NlpUtil(object): 16 | 17 | 18 | @classmethod 19 | def tokenize(cls, text, filter_stop_word = False): 20 | if not isinstance(text, unicode): 21 | return [str(text)] 22 | tokens = jieba.lcut(text) 23 | if filter_stop_word: 24 | stop_word_set = config.stop_word_set 25 | tokens = filter(lambda w: w not in stop_word_set, tokens) 26 | return tokens 27 | 28 | 29 | @classmethod 30 | def train_word2vec(cls, corpus, wv_fpath = ''): 31 | time_s = time.time() 32 | vec_size = 300 33 | win_size = 1 34 | print ('begin to train model...') 35 | w2v_model = models.word2vec.Word2Vec(corpus, 36 | size = vec_size, 37 | window = win_size, 38 | min_count = 2, 39 | workers = 4, 40 | sg = 1, 41 | negative = 15, 42 | iter = 7) 43 | w2v_model.train(corpus, total_examples = len(corpus), epochs = w2v_model.iter) 44 | save_fpath = os.path.join(wv_fpath, 45 | 'w2v_win%s_d%s.model' % (win_size, vec_size)) 46 | w2v_model.save(save_fpath) 47 | print ('save model success, model_path=%s, time=%.4f sec.' 48 | % (save_fpath, time.time() - time_s)) 49 | 50 | 51 | @classmethod 52 | def load_word2vec(cls, w2v_fpath): 53 | w2v_model = models.word2vec.Word2Vec.load(w2v_fpath) 54 | print 'load word2vec success' 55 | wv = w2v_model.wv 56 | del w2v_model 57 | return wv 58 | 59 | 60 | @classmethod 61 | def build_word2id(cls, corpus): 62 | """Convert corpus from word to id 63 | Args: 64 | corpus: a list of all words 65 | 66 | Returns: 67 | word_to_id: a dict of word to id 68 | """ 69 | counter = collections.Counter(corpus) 70 | count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0])) 71 | words, _ = list(zip(*count_pairs)) 72 | word2id = dict(zip(words, range(1, len(words) + 1))) 73 | return word2id 74 | 75 | 76 | @classmethod 77 | def build_emb_matrix(cls, word2vec, emb_size, word2id, 78 | init_scale = 0.25, norm_flag = False): 79 | vocab_size = len(word2id) 80 | emb_matrix = np.zeros((vocab_size + 1, emb_size), np.float32) 81 | for w, id_ in word2id.iteritems(): 82 | if w in word2vec: 83 | emb_matrix[id_] = word2vec[w] 84 | else: 85 | emb_matrix[id_] = np.random.uniform( 86 | -init_scale, init_scale, emb_size) 87 | return emb_matrix 88 | 89 | 90 | def test(): 91 | # Test tokenize 92 | print '|'.join(NlpUtil.tokenize(u'天气很好')).encode('utf-8') 93 | 94 | ''' 95 | # Test word2vec 96 | wv = NlpUtil.load_word2vec('./model/word2vec/w2v_win1_d128.model') 97 | print wv[u'天气'] 98 | print '|'.join([x[0] for x in wv.most_similar(positive = [u'天气'])]).encode('utf-8') 99 | ''' 100 | 101 | 102 | 103 | if __name__ == '__main__': 104 | test() 105 | --------------------------------------------------------------------------------
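A minimal usage sketch (not part of the repository, added for illustration). It shows how the artifacts written by preprocess.py are consumed downstream: word2id.txt maps each token to an integer id, and the saved embedding matrix is indexed by those ids, mirroring what data_helper.Helper does when it loads the tokenized corpus. The example sentence is hypothetical.

import codecs
import numpy as np
import config

# Rebuild the token -> id mapping written by preprocess.py (one "word\tid" per line)
word2id = {}
with codecs.open(config.word2id_fpath, 'r', 'utf-8') as in_f:
    for line in in_f:
        word, id_ = line.rstrip().split('\t')
        word2id[word] = int(id_)

# emb_matrix has shape (vocab_size + 1, embedding_size); row 0 stays zero and is
# what the padding ids produced by Helper.get_batch point to
emb_matrix = np.load(config.emb_matrix_fpath)

# A hypothetical sentence in the '|'-joined token format produced by preprocess.py
tokens = u'天气|很|好'.split('|')
ids = [word2id[w] for w in tokens if w in word2id]  # out-of-vocabulary tokens are dropped
vectors = emb_matrix[ids]                           # (len(ids), embedding_size) float32 array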