├── README.md
├── data
│   ├── cn_data.txt
│   ├── data.csv
│   ├── neg.txt
│   ├── pos.txt
│   └── word_embedding_300_new.txt
├── data_prepare.py
├── data_tools.py
├── dnn_model.py
└── train.py

/README.md:
--------------------------------------------------------------------------------
# lstm_text_clasification
A basic multi-layer LSTM RNN model that can do binary or multi-class classification of English and Chinese text.

### Runtime environment
- python 3.6

### Dependencies
- tensorflow 1.6 or later

- pandas

- pyhanlp (required only when processing Chinese data; a properly installed and configured JDK is needed)

### Word vectors
- The English word vectors in the data folder are 300-dimensional, but their vocabulary is very small and they are for testing only. A full set can be downloaded from http://nlp.stanford.edu/data/glove.6B.zip and used as a replacement.
- For Chinese word vectors, see https://github.com/Embedding/Chinese-Word-Vectors

### Features
- A basic multi-layer LSTM RNN model that can do binary or multi-class classification of English and Chinese text
- Saves the best model seen during training for later testing. (The model of every epoch can also be saved; just uncomment the corresponding code in the train function of dnn_model.py)
- Early stopping
- Writes logs, including the computation graph as well as loss, train_accuracy and dev_accuracy, which can be viewed with TensorBoard. (Meta logs can also be written by uncommenting the corresponding code in the train function of dnn_model.py, but the log files become huge and training slows down noticeably, so enable them only when necessary)

### Extending
- To tune hyperparameters, edit the hyperparameter section of train.py
- If the data format changes, edit data_prepare.py so that it reads the new format
- If the model changes, edit dnn_model.py: modify or add your own layers and wire them into the build method
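
For reference, `load_from_csv` in data_prepare.py expects the text in a `content` column and the label in a `sentiment` column. A minimal `data.csv` for the default English binary setup might therefore look like the following (illustrative values only; the repo only prescribes the two column names, not the rows shown here):

```csv
content,sentiment
"this movie is great , i loved it",1
"boring and far too long",0
```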
--------------------------------------------------------------------------------
/data_prepare.py:
--------------------------------------------------------------------------------
# coding: utf-8
"""
@author Liuchen
2018
"""
import re
import pandas as pd
import logging
logger = logging.getLogger('main.data_prepare')


def load_from_csv(csv_file, delimiter=',', lang='EN'):
    """
    Load data from a csv file (comma-separated by default)
    :param csv_file: csv file
    :param delimiter: csv field delimiter
    :param lang: text language, EN for English text
    :return: tuple, (x list, y list)
    """
    logger.info("loading data from csv file ... ")
    data = pd.read_csv(csv_file, encoding="utf-8", delimiter=delimiter)

    data = data.drop(data[data.sentiment == 3].index)  # drop weibo posts with conflicting positive/negative sentiment ---- Chinese data only
    data = data.drop(data[data.sentiment == 2].index)  # drop weibo posts whose emotion is surprise ---- Chinese data only
    # data.loc[data.sentiment == 2, 'sentiment'] = -1  # treat surprise as negative sentiment ---- Chinese data only

    texts = data["content"].values
    texts = [preprocess_text(text, lang=lang) for text in texts]
    labels = data["sentiment"].values
    return texts, labels


def load_from_class_files(files):
    """
    Load data from several class files, one class per file; the number of files is the number of classes
    :param files: list of class files
    :return: tuple, (x list, y list)
    """
    logger.info("loading data from many text files ...")
    texts = []
    labels = []
    for fid, file in enumerate(files):
        f = open(file, encoding="utf-8")
        file_data = f.readlines()
        texts.extend(file_data)
        f.close()
        labels.extend([fid] * len(file_data))
    return texts, labels


def preprocess_text(text, lang="EN"):
    """
    Preprocess (stock-comment) text data
    :param text: String. the raw text
    :return: String. the processed text
    """
    if lang == 'EN':  # English data
        return clean_englisth(text)
    else:             # Chinese data
        return clean_chinese(text)


def clean_chinese(text):
    """
    Chinese text cleaning; nothing for now
    """
    return text


def clean_englisth(text):
    """
    English text cleaning
    """
    text = re.sub(r"[^A-Za-z0-9(),!?\'\`\.]", " ", text)  # remove special characters
    text = re.sub(r"\'s", " \'s", text)    # 's  -> space + 's
    text = re.sub(r"\'ve", " \'ve", text)  # 've -> space + 've
    text = re.sub(r"n\'t", " n't", text)   # n't -> space + n't
    text = re.sub(r"\'re", " \'re", text)  # 're -> space + 're
    text = re.sub(r"\'d", " \'d", text)    # 'd  -> space + 'd
    text = re.sub(r"\'ll", " \'ll", text)  # 'll -> space + 'll
    text = re.sub(r",", " , ", text)       # punctuation -> space + punctuation + space
    text = re.sub(r"\.", " . ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\(", " ( ", text)
    text = re.sub(r"\)", " ) ", text)
    text = re.sub(r"\?", " ? ", text)
    text = re.sub(r"\s{2,}", " ", text)    # collapse two or more whitespace characters into one
    return text.strip().lower()


if __name__ == "__main__":
    x, y = load_from_class_files(['data/pos.txt', 'data/neg.txt'])
    for i in range(min(10000, len(x))):
        print(x[i], y[i])
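
As a quick illustration of what the English cleaning does (not part of the repository, just an example of the expected behaviour): contractions such as "n't" are split off, punctuation is surrounded by spaces, and the result is lower-cased.

```python
from data_prepare import clean_englisth

print(clean_englisth("I don't like it... Really?!"))
# -> "i do n't like it . . . really ? !"
```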
--------------------------------------------------------------------------------
/data_tools.py:
--------------------------------------------------------------------------------
# coding: utf-8
"""
@author Liuchen
2018
"""
import numpy as np
from collections import Counter
try:
    from pyhanlp import HanLP as hanlp
except Exception:
    pass
import logging
logger = logging.getLogger('main.data_tools')


def load_embedding(embedding_file):
    """
    Load word embeddings; return the dictionary and the embedding matrix
    :param embedding_file: word embedding file
    :return: tuple, (word dictionary, embedding matrix)
    """
    logger.info('loading word dict and word embedding...')
    with open(embedding_file, encoding='utf-8') as f:
        lines = f.readlines()
        embedding_tuple = [tuple(line.strip().split(' ', 1)) for line in lines]
        embedding_tuple = [(t[0].strip().lower(), list(map(float, t[1].split()))) for t in embedding_tuple]
    embedding_matrix = []
    embedding_dim = len(embedding_tuple[0][1])
    embedding_matrix.append([0] * embedding_dim)  # the first row is all zeros and represents out-of-vocabulary words
    word_dict = dict()
    word_dict[''] = 0  # the empty string represents out-of-vocabulary words
    word_id = 1
    for word, embedding in embedding_tuple:
        if word_dict.get(word) is None:
            word_dict[word] = word_id
            word_id += 1
            embedding_matrix.append(embedding)
    return word_dict, np.asarray(embedding_matrix, dtype=np.float32)


def drop_empty_texts(texts, labels):
    """
    Remove comments whose sentence is empty after preprocessing
    :param texts: list of texts in id form
    :param labels: label data
    :return: tuple of arrays. non-empty sentence list, corresponding label list
    """
    logger.info("clear empty sentences ...")
    non_zero_idx = [id_ for id_, text in enumerate(texts) if len(text) != 0]
    texts_non_zero = np.array([texts[id_] for id_ in non_zero_idx])
    labels_non_zero = np.array([labels[id_] for id_ in non_zero_idx])
    return texts_non_zero, labels_non_zero


def make_dictionary_by_text(words_list):
    """
    Build the dictionaries (used when pre-trained word embeddings are not used)
    :param words_list: list; the word sequence of all data
    :return: tuple; two dictionaries, word to int and int to word
    """
    logger.info("make dictionary by text ...")
    word_counts = Counter(words_list)
    sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)
    id_to_word = {id_: word for id_, word in enumerate(sorted_vocab, 1)}
    word_to_id = {word: id_ for id_, word in id_to_word.items()}
    word_to_id[''] = 0
    id_to_word[0] = ''
    return word_to_id, id_to_word


def segment(text):
    '''
    Segment a Chinese sentence into words with HanLP
    '''
    try:
        seg_result = hanlp.segment(text)
        return [term.word for term in seg_result]
    except Exception:
        return text.split()
        # return ""


def sentences2wordlists(sentence_list, lang='EN'):
    """
    Split sentences into word lists
    :param sentence_list: list of sentences
    :return: list of word lists
    """
    logger.info("word cutting ...")
    word_list_s = []
    for sentence in sentence_list:
        if lang == 'EN':  # English tokenization
            word_list = sentence.split()
        else:             # Chinese word segmentation
            word_list = segment(sentence)
        word_list_s.append(word_list)
    return word_list_s


def wordlists2idlists(word_list_s, word_to_id):
    """
    Convert a list of word lists into a list of id lists
    :param word_list_s: list of word lists
    :param word_to_id: dictionary
    :return: list of int lists. sentences in id form
    """
    logger.info("convert word list to id list ...")
    sent_id_list = []
    for word_list in word_list_s:
        sent_id_list.append([word_to_id.get(word, 0) for word in word_list])
    return np.array(sent_id_list)


def labels2onehot(labels, class_num=None, class_labels=None):
    """
    Generate one-hot sentiment labels. Either class_num or class_labels must be given.
    :param labels: list; the labels of the data
    :param class_num: int; total number of classes
    :param class_labels: list; class labels, e.g. [0, 1] or ['a', 'b']
    :return: numpy array.
    """
    if class_num is None and class_labels is None:
        raise Exception("Either parameter class_num or class_labels must be given! -- by lic")
    if class_labels is not None:
        class_num = len(class_labels)

    def label2onehot(label_):
        if class_labels is None:
            label_index = label_
        else:
            label_index = class_labels.index(label_)
        onehot_label = [0] * class_num
        onehot_label[label_index] = 1
        return onehot_label

    return np.array([label2onehot(label_) for label_ in labels])


def dataset_padding(text_ids, sent_len):
    """
    Left-pad sentence id lists with zeros
    :param text_ids: sentences in id form
    :param sent_len: int, maximum sentence length
    :return: numpy array. the padded sentences
    """
    logger.info("padding dataset ...")
    textids_padded = np.zeros((len(text_ids), sent_len), dtype=int)
    for i, row in enumerate(text_ids):
        textids_padded[i, -len(row):] = np.array(row)[:sent_len]

    return np.array(textids_padded)


def dataset_split(texts, labels, train_percent, random_seed=None):
    """
    Split into train, dev and test sets: the train set takes train_percent of the data,
    and the dev and test sets each take half of the remaining (1 - train_percent)
    :param texts: dataset x
    :param labels: dataset labels
    :param train_percent: proportion of the training set
    :return: (train_x, train_y, val_x, val_y, test_x, test_y)
    """
    logger.info("split dataset ...")
    # check that x and y have the same length
    assert len(texts) == len(labels)
    # shuffle the data
    if random_seed:
        np.random.seed(random_seed)
    shuf_idx = np.random.permutation(len(texts))
    texts_shuf = np.array(texts)[shuf_idx]
    labels_shuf = np.array(labels)[shuf_idx]

    # split the data
    split_idx = int(len(texts_shuf) * train_percent)
    train_x, val_x = texts_shuf[:split_idx], texts_shuf[split_idx:]
    train_y, val_y = labels_shuf[:split_idx], labels_shuf[split_idx:]

    test_idx = int(len(val_x) * 0.5)
    val_x, test_x = val_x[:test_idx], val_x[test_idx:]
    val_y, test_y = val_y[:test_idx], val_y[test_idx:]

    return train_x, train_y, val_x, val_y, test_x, test_y


def make_batches(x, y, batch_size=100, shuffle=True):
    """
    Split the data into training batches
    :param x: training data
    :param y: training labels
    :param batch_size: int, batch size
    :return: a generator of x and y batches
    """
    if shuffle:
        shuf_idx = np.random.permutation(len(x))
        x = np.array(x)[shuf_idx]
        y = np.array(y)[shuf_idx]
    n_batches = len(x)//batch_size
    x, y = x[:n_batches*batch_size], y[:n_batches*batch_size]
    for id_ in range(0, len(x), batch_size):
        yield x[id_:id_+batch_size], y[id_:id_+batch_size]


if __name__ == "__main__":
    print("Start")
    l = [[2, 3, 4, 5, 2, 2],
         [3, 4, 2, 5, 23, 3, 2, 4, 21, 2, 2],
         [3, 4, 2, 4, 24, 2, 4, 22]]
    print(dataset_padding(l, 20))
    print('OK')
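
A toy end-to-end use of these helpers, showing how the pieces fit together (a sketch with made-up sentences and labels):

```python
from itertools import chain
import data_tools as tools

sentences = ["a great movie", "boring and far too long"]
word_lists = tools.sentences2wordlists(sentences, lang='EN')                     # tokenize
w2i, i2w = tools.make_dictionary_by_text(list(chain.from_iterable(word_lists)))  # build dictionaries
ids = tools.wordlists2idlists(word_lists, w2i)                                   # words -> ids
labels = tools.labels2onehot([1, 0], class_num=2)                                # one-hot labels
padded = tools.dataset_padding(ids, sent_len=10)                                 # left-pad with zeros
print(padded.shape, labels.shape)                                                # (2, 10) (2, 2)
```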
--------------------------------------------------------------------------------
/dnn_model.py:
--------------------------------------------------------------------------------
# coding: utf-8
"""
@author Liuchen
2018
"""
import tensorflow as tf
import data_tools as dt
import numpy as np
import os
import time
import logging
logger = logging.getLogger('main.dnn_model')


class DNNModel:
    def __init__(self, class_num, embed_dim, rnn_dims, vocab_size=None, embed_matrix=None,
                 isBiRNN=True, fc_size=500, max_sent_len=200, refine=False):
        self.class_num = class_num        # number of classes
        self.embed_dim = embed_dim        # word embedding dimension
        self.rnn_dims = rnn_dims          # hidden sizes of the RNN layers; several layers are possible
        if vocab_size is None and embed_matrix is None:  # either the vocabulary size or the embedding matrix must be given
            raise Exception("One of vocab_size and embed_matrix must be given!")
        self.vocab_size = vocab_size      # vocabulary size
        self.embed_matrix = embed_matrix  # word embedding matrix
        self.isBiRNN = isBiRNN            # whether to use a bidirectional RNN
        self.fc_size = fc_size            # size of the fully connected layer
        self.max_sent_len = max_sent_len  # maximum sentence length
        self.refine = refine              # whether to fine-tune the word embeddings

        # ---- placeholder parameters
        self.learning_rate = tf.placeholder_with_default(0.01, shape=(), name='learning_rate')  # learning rate
        self.keep_prob = tf.placeholder_with_default(
            1.0, shape=(), name='keep_prob')  # dropout keep probability
        self.l2reg = tf.placeholder_with_default(0.0, shape=(), name='L2reg')  # L2 regularization coefficient

    def inputs_layer(self):
        """
        Input layer
        """
        with tf.name_scope('input_layer'):
            self.inputs = tf.placeholder(tf.int32, [None, self.max_sent_len], name='inputs')  # placeholder for input x
            self.labels = tf.placeholder(tf.int32, [None, self.class_num], name='labels')     # placeholder for input y
        return self.inputs

    def embedding_layer(self, inputs_):
        """
        Word embedding layer
        """
        with tf.name_scope("embedding_layer"):
            if self.embed_matrix is None:  # no pre-trained word embeddings
                embedding = tf.Variable(tf.random_uniform((self.vocab_size, self.embed_dim), -1, 1), name="embedding")
            else:                          # pre-trained word embeddings available
                embedding = tf.Variable(self.embed_matrix, trainable=self.refine, name="embedding")
            embed = tf.nn.embedding_lookup(embedding, inputs_)
        return embed

    def rnn_layer(self, embed):
        """
        RNN layer
        """
        with tf.name_scope("rnn_layer"):
            embed = tf.nn.dropout(embed, keep_prob=self.keep_prob)  # dropout
            # --- possible RNN cells
            # tf.contrib.rnn.BasicRNNCell(size)
            # tf.contrib.rnn.BasicLSTMCell(size)
            # tf.contrib.rnn.LSTMCell(size)
            # tf.contrib.rnn.GRUCell(size, activation=tf.nn.relu)
            # tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell(size)
            # tf.contrib.cudnn_rnn.CudnnCompatibleGRUCell(size)

            if not self.isBiRNN:
                lstms = [tf.contrib.rnn.BasicLSTMCell(size) for size in self.rnn_dims]
                drops = [tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=self.keep_prob) for lstm in lstms]
                cell = tf.contrib.rnn.MultiRNNCell(drops)  # stack several LSTM layers
                lstm_outputs, _ = tf.nn.dynamic_rnn(cell, embed, dtype=tf.float32)
                # lstm_outputs -> batch_size * max_len * n_hidden
            else:
                lstms_l = [tf.contrib.rnn.BasicLSTMCell(size) for size in self.rnn_dims]
                lstms_r = [tf.contrib.rnn.BasicLSTMCell(size) for size in self.rnn_dims]
                drops_l = [tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=self.keep_prob) for lstm in lstms_l]
                drops_r = [tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=self.keep_prob) for lstm in lstms_r]
                cell_l = tf.contrib.rnn.MultiRNNCell(drops_l)
                cell_r = tf.contrib.rnn.MultiRNNCell(drops_r)
                outputs, _ = tf.nn.bidirectional_dynamic_rnn(  # bidirectional LSTM
                    cell_l,  # forward LSTM cell
                    cell_r,  # backward LSTM cell
                    inputs=embed,
                    dtype=tf.float32,
                )  # outputs -> batch_size * max_len * n_hidden; state (final state, a tuple of h and c) -> batch_size * n_hidden
                lstm_outputs = tf.concat(outputs, -1)  # concatenate the outputs of the two directions
            outputs = lstm_outputs[:, -1]  # keep only the output of the last time step of each sequence

        return outputs

    def fc_layer(self, inputs):
        """
        Fully connected layer
        """
        # initializer = tf.contrib.layers.xavier_initializer()  # xavier initialization, currently unused
        with tf.name_scope("fc_layer"):
            inputs = tf.nn.dropout(inputs, keep_prob=self.keep_prob, name='drop_out')  # dropout
            # outputs = tf.contrib.layers.fully_connected(inputs, self.fc_size, activation_fn=tf.nn.relu)
            outputs = tf.layers.dense(inputs, self.fc_size, activation=tf.nn.relu)
        return outputs

    def output_layer(self, inputs):
        """
        Output layer
        """
        with tf.name_scope("output_layer"):
            # note: tf.layers.dropout only applies dropout when training=True is passed; with the default it is a no-op
            inputs = tf.layers.dropout(inputs, rate=1 - self.keep_prob)
            outputs = tf.layers.dense(inputs, self.class_num, activation=None)
            # outputs = tf.contrib.layers.fully_connected(inputs, self.class_num, activation_fn=None)
        return outputs

    def set_loss(self):
        """
        Loss function
        """
        # softmax cross-entropy loss
        with tf.name_scope("loss_scope"):
            reg_loss = tf.contrib.layers.apply_regularization(  # L2 regularization
                tf.contrib.layers.l2_regularizer(self.l2reg),
                tf.trainable_variables()
            )
            self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=self.predictions, labels=self.labels)) + reg_loss  # ---GLOBAL--- loss

    def set_accuracy(self):
        """
        Accuracy
        """
        with tf.name_scope("accuracy_scope"):
            correct_pred = tf.equal(tf.argmax(self.predictions, axis=1), tf.argmax(self.labels, axis=1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))  # ---GLOBAL--- accuracy

    def set_optimizer(self):
        """
        Optimizer
        """
        with tf.name_scope("optimizer"):
            # --- possible optimizers
            # self.optimizer = tf.train.AdadeltaOptimizer(self.learning_rate).minimize(self.loss)
            self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)
            # self.optimizer = tf.train.AdagradOptimizer(self.learning_rate).minimize(self.loss)
            # self.optimizer = tf.train.MomentumOptimizer(self.learning_rate, 0.9).minimize(self.loss)
            # self.optimizer = tf.train.GradientDescentOptimizer(self.learning_rate).minimize(self.loss)
            # self.optimizer = tf.train.RMSPropOptimizer(self.learning_rate).minimize(self.loss)

    def build(self):
        """
        Build the DNN model
        """
        inputs = self.inputs_layer()
        embedding = self.embedding_layer(inputs)
        rnn_outputs = self.rnn_layer(embedding)
        fc_outputs = self.fc_layer(rnn_outputs)
        self.predictions = self.output_layer(fc_outputs)
        self.set_loss()
        self.set_optimizer()
        self.set_accuracy()


def train(dnn_model, learning_rate, train_x, train_y, dev_x, dev_y, max_epochs, batch_size, keep_prob, l2reg,
          show_step=10, checkpoint_path="./checkpoints", model_name=None, no_improve=5):
    """
    Train and validate
    :param dnn_model: the computation-graph model
    :param learning_rate: learning rate
    :param train_x: training data
    :param train_y: training labels
    :param dev_x: validation data
    :param dev_y: validation labels
    :param max_epochs: maximum number of epochs
    :param batch_size: minibatch size
    :param keep_prob: dropout keep probability
    :param l2reg: L2 regularization coefficient
    :param show_step: how many steps between two printed training results
    :param checkpoint_path: where models are saved
    :param model_name: name (folder name) of the saved model
    :param no_improve: early stopping; stop if dev_loss has not decreased for no_improve consecutive epochs
    """
    # path of the best model
    if model_name is None:
        model_name = str(time.time()).replace('.', '')[:11]
    best_model_path = checkpoint_path + '/best/' + model_name
    if not os.path.exists(best_model_path):  # create the model directory if it does not exist
        os.makedirs(best_model_path)
    saver = tf.train.Saver()

    with tf.Session() as sess:
        # Train summaries
        train_loss = tf.summary.scalar("train_loss", dnn_model.loss)
        train_acc = tf.summary.scalar("train_acc", dnn_model.accuracy)
        train_summary_op = tf.summary.merge([train_loss, train_acc])
        train_summary_writer = tf.summary.FileWriter('./log/train', sess.graph)

        # Dev summary writer
        dev_summary_writer = tf.summary.FileWriter('./log/dev', sess.graph)

        # meta logs
        # run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)  # for meta logs **1
        # run_metadata = tf.RunMetadata()  # for meta logs **2

        sess.run(tf.global_variables_initializer())
        n_batches = len(train_x)//batch_size
        step = 0
        best_dev_acc = 0             # best validation accuracy
        min_dev_loss = float('inf')  # minimum validation loss

        no_improve_num = 0           # number of epochs in which the best dev loss did not decrease
        for e in range(max_epochs):
            for id_, (x, y) in enumerate(dt.make_batches(train_x, train_y, batch_size), 1):
                step += 1
                feed = {
                    dnn_model.inputs: x,
                    dnn_model.labels: y,
                    dnn_model.learning_rate: learning_rate,
                    dnn_model.keep_prob: keep_prob,
                    dnn_model.l2reg: l2reg
                }
                train_loss, _, train_acc, train_summary = sess.run(
                    [dnn_model.loss, dnn_model.optimizer, dnn_model.accuracy, train_summary_op],
                    feed_dict=feed,
                    # options=run_options,        # for meta logs - **3
                    # run_metadata=run_metadata,  # for meta logs - **4
                )

                train_summary_writer.add_summary(train_summary, step)  # write logs
                # --- write meta logs; note that the log files get very large: to enable them, uncomment lines **1, **2, **3, **4 and **5
                # train_summary_writer.add_run_metadata(run_metadata, 'batch%03d' % step)  # for meta logs - **5

                if show_step > 0 and step % show_step == 0:
                    info = "Epoch {}/{} ".format(e+1, max_epochs) + \
                        " - Batch {}/{} ".format(id_+1, n_batches) + \
                        " - Loss {:.5f} ".format(train_loss) + \
                        " - Acc {:.5f}".format(train_acc)
                    logger.info(info)

            # validation at the end of every epoch ---
            dev_acc_s = []
            dev_loss_s = []
            for xx, yy in dt.make_batches(dev_x, dev_y, batch_size):
                feed = {
                    dnn_model.inputs: xx,
                    dnn_model.labels: yy,
                    dnn_model.keep_prob: 1,
                }
                dev_batch_loss, dev_batch_acc = sess.run([dnn_model.loss, dnn_model.accuracy], feed_dict=feed)
                dev_acc_s.append(dev_batch_acc)
                dev_loss_s.append(dev_batch_loss)

            dev_acc = np.mean(dev_acc_s)    # mean dev acc
            dev_loss = np.mean(dev_loss_s)  # mean dev loss

            # --- dev logs
            dev_summary = tf.Summary()
            dev_summary.value.add(tag="dev_loss", simple_value=dev_loss)
            dev_summary.value.add(tag="dev_acc", simple_value=dev_acc)
            dev_summary_writer.add_summary(dev_summary, step)

            info = "|Epoch {}/{}\t".format(e+1, max_epochs) + \
                "|Train-Loss| {:.5f}\t".format(train_loss) + \
                "|Dev-Loss| {:.5f}\t".format(dev_loss) + \
                "|Train-Acc| {:.5f}\t".format(np.mean(train_acc)) + \
                "|Dev-Acc| {:.5f}".format(dev_acc)
            logger.info(info)

            # save the best model
            if best_dev_acc < dev_acc:
                best_dev_acc = dev_acc
                saver.save(sess, best_model_path + "/best_model.ckpt")
            # save the model of every epoch
            # saver.save(sess, best_model_path + "model.ckpt", global_step=e)

            # track the minimum dev_loss
            if min_dev_loss > dev_loss:
                min_dev_loss = dev_loss
                no_improve_num = 0
            else:
                no_improve_num += 1

            # early stopping
            if no_improve_num == no_improve:
                break

        logger.info("** The best dev accuracy: {:.5f}".format(best_dev_acc))

    # Return the minimum validation loss. The saved model is the one with the best dev accuracy,
    # which is not necessarily the model with the minimum loss, but the value returned here is the minimum loss.
    return min_dev_loss


def test(dnn_model, test_x, test_y, batch_size, model_dir="./checkpoints/best"):
    """
    Evaluate using the best saved model
    :param test_x: test data
    :param test_y: test labels
    :param batch_size: batch size
    :param dnn_model: the original dnn model
    :param model_dir: directory in which the trained models are stored
    """
    best_folder = max([d for d in os.listdir(model_dir) if d.isdigit()])  # latest timestamp-named model folder
    best_model_dir = model_dir + '/' + best_folder
    saver = tf.train.Saver()
    test_acc = []
    with tf.Session() as sess:
        saver.restore(sess, tf.train.latest_checkpoint(best_model_dir))
        for _, (x, y) in enumerate(dt.make_batches(test_x, test_y, batch_size), 1):
            feed = {
                dnn_model.inputs: x,
                dnn_model.labels: y,
                dnn_model.keep_prob: 1,
            }
            batch_acc = sess.run([dnn_model.accuracy], feed_dict=feed)
            test_acc.append(batch_acc)
        logger.info("** Test Accuracy: {:.5f}".format(np.mean(test_acc)))
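
For running the saved best model on new, unlabeled text, a rough inference sketch is shown below. Everything in it is an assumption for illustration: the hyperparameters must match those used during training, `vocab_to_int` must be the very dictionary built at training time, `MODEL_DIR` stands for the concrete `./checkpoints/best/<model_name>` folder written by `train`, and `new_sentences` is hypothetical input.

```python
import numpy as np
import tensorflow as tf
import data_tools as dt
import dnn_model as dm

MODEL_DIR = "./checkpoints/best/<model_name>"  # hypothetical: folder created by train()
new_sentences = ["what a wonderful film"]      # hypothetical unlabeled input
# vocab_to_int: the dictionary produced by make_dictionary_by_text / load_embedding at training time

# rebuild the graph with the same hyperparameters that were used for training
model = dm.DNNModel(class_num=2, embed_dim=200, rnn_dims=[256],
                    vocab_size=len(vocab_to_int), fc_size=500, max_sent_len=60)
model.build()

# preprocess exactly as during training: tokenize, map to ids, left-pad
ids = dt.wordlists2idlists(dt.sentences2wordlists(new_sentences, lang='EN'), vocab_to_int)
padded = dt.dataset_padding(ids, sent_len=60)

saver = tf.train.Saver()
with tf.Session() as sess:
    saver.restore(sess, tf.train.latest_checkpoint(MODEL_DIR))
    logits = sess.run(model.predictions,
                      feed_dict={model.inputs: padded, model.keep_prob: 1.0})
    print(np.argmax(logits, axis=1))  # predicted class ids
```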
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
# coding: utf-8
"""
@author Liuchen
2018
"""

import data_tools as tools
import dnn_model as dm
import data_prepare as dp
from itertools import chain
import logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(message)s')
logger = logging.getLogger('main')

# ================== step0: hyperparameters =================
learning_rate = 0.001  # learning rate
batch_size = 64        # mini-batch size
keep_prob = 0.5        # dropout keep probability
l2reg = 0.0            # L2 regularization coefficient
refine = True          # whether the embedding matrix is trained as well
lstm_sizes = [256]     # hidden size of each LSTM layer
fc_size = 500          # size of the fully connected layer

embed_size = 200       # word embedding dimension
max_epochs = 50        # maximum number of passes over the data

# ---- other parameters
max_sent_len = 60      # maximum sentence length
class_num = 2          # number of classes
lang = 'EN'            # text language: EN for English, CN for Chinese
train_percent = 0.8    # proportion of training data
show_step = 20         # print a result every show_step batches; 0 disables printing
data_path = './data/'  # data directory

# ================== step1: data preparation =================
## a. load data from a csv file
texts, labels = dp.load_from_csv(data_path + "data.csv")
## b. load data from a tab-separated csv file
# texts, labels = dp.load_from_csv(data_path + "cn_data.txt", delimiter='\t', lang=lang)
## c. load data from one file per sentiment class
# texts, labels = dp.load_from_class_files([data_path + 'pos.txt', data_path + 'neg.txt'])

# --- tokenization (English is split on spaces, Chinese is segmented with hanlp)
texts = tools.sentences2wordlists(texts, lang=lang)
logger.info('max sentence len: ' + str(max([len(text) for text in texts])))

# --- build the dictionary
## a. build the dictionary from the text -- without pre-trained word vectors
vocab_to_int, int_to_vocab = tools.make_dictionary_by_text(list(chain.from_iterable(texts)))
embedding_matrix = None  # set the embedding matrix to None

## b. build the dictionary from pre-trained word vectors
# vocab_to_int, embedding_matrix = tools.load_embedding(data_path + "word_embedding_300_new.txt")  # English word vectors
# vocab_to_int, embedding_matrix = tools.load_embedding(data_path + "glove.6B.200d.txt")  # English word vectors
# vocab_to_int, embedding_matrix = tools.load_embedding(data_path + "sgns.weibo.word.txt")  # Chinese word vectors

logger.info(f"dictionary length: {len(vocab_to_int)}")

# --- use the dictionary to turn text sentences into id lists
texts = tools.wordlists2idlists(texts, vocab_to_int)
# --- remove data whose text is empty after preprocessing
texts, labels = tools.drop_empty_texts(texts, labels)
# --- convert the class labels into one-hot vectors
labels = tools.labels2onehot(labels, class_num)
# --- left-pad every sentence shorter than the maximum length with zeros
texts = tools.dataset_padding(texts, sent_len=max_sent_len)
# --- split the data into training (80%), dev (10%) and test (10%) sets
train_x, train_y, val_x, val_y, test_x, test_y = tools.dataset_split(texts, labels, train_percent=train_percent)

# ================== step2: build the model =================
vocab_size = len(vocab_to_int)  # the dictionary already contains id 0 for padding / unknown words
model = dm.DNNModel(class_num=class_num, embed_dim=embed_size, rnn_dims=lstm_sizes, vocab_size=vocab_size,
                    embed_matrix=embedding_matrix, fc_size=fc_size, max_sent_len=max_sent_len, refine=refine,
                    )
model.build()


# ================== step3: training =================
min_dev_loss = dm.train(
    model,
    learning_rate,
    train_x,
    train_y,
    val_x,
    val_y,
    max_epochs,
    batch_size,
    keep_prob,
    l2reg,
    show_step=show_step
)
logger.info(f' ** The minimum dev_loss is {min_dev_loss}')

# ================== step4: testing =================
dm.test(model, test_x, test_y, batch_size)
--------------------------------------------------------------------------------
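
Assuming the dependencies listed in the README are installed and the data files are in place, training is started with `python train.py`; the summaries written to ./log/train and ./log/dev can then be inspected with `tensorboard --logdir ./log`.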