├── 0-pre.py
├── Doc2Vec.py
├── README.md
├── comments.csv
├── embeding.py
├── neg.xls
├── pos.xls
├── pred.py
└── sum.xls

/0-pre.py:
--------------------------------------------------------------------------------
import tensorflow as tf


class LSTM_CNN_Model(object):

    def __init__(self, config, is_training=True):

        self.keep_prob = config.keep_prob
        self.batch_size = 64  # fixed batch size; zero_state and the reshapes below assume it

        num_step = config.num_step
        self.input_data = tf.placeholder(tf.int32, [None, num_step])
        self.target = tf.placeholder(tf.int64, [None])
        self.mask_x = tf.placeholder(tf.float32, [num_step, None])

        class_num = config.class_num
        hidden_neural_size = config.hidden_neural_size
        vocabulary_size = config.vocabulary_size
        embed_dim = config.embed_dim
        hidden_layer_num = config.hidden_layer_num

        # build the LSTM network; each layer gets its own cell object so layers do not share weights
        def make_cell():
            cell = tf.contrib.rnn.BasicLSTMCell(hidden_neural_size, forget_bias=0.0, state_is_tuple=True)
            if self.keep_prob < 1:
                cell = tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob)
            return cell

        cell = tf.contrib.rnn.MultiRNNCell([make_cell() for _ in range(hidden_layer_num)], state_is_tuple=True)

        self._initial_state = cell.zero_state(self.batch_size, tf.float32)

        # embedding layer
        with tf.device("/cpu:0"), tf.name_scope("embedding_layer"):
            embedding = tf.get_variable("embedding", [vocabulary_size, embed_dim], dtype=tf.float32)
            inputs = tf.nn.embedding_lookup(embedding, self.input_data)

        if self.keep_prob < 1:
            inputs = tf.nn.dropout(inputs, self.keep_prob)

        out_put = []
        state = self._initial_state
        with tf.variable_scope("LSTM_layer"):
            for time_step in range(num_step):
                if time_step > 0:
                    tf.get_variable_scope().reuse_variables()
                (cell_output, state) = cell(inputs[:, time_step, :], state)
                out_put.append(cell_output)

        # stack to [num_step, batch, hidden] and zero out the padded positions
        out_put = tf.stack(out_put) * self.mask_x[:, :, None]

        with tf.name_scope("Conv_layer"):
            # [num_step, batch, hidden] -> [batch, hidden, num_step, 1]
            out_put = tf.transpose(out_put, [1, 2, 0])
            out_put = tf.reshape(out_put, [self.batch_size, hidden_neural_size, num_step, -1])

            # the filter spans the whole hidden dimension and a window of 5 time steps
            W_conv = tf.get_variable(name="conv_w",
                                     initializer=tf.truncated_normal(shape=[hidden_neural_size, 5, 1, 200], stddev=0.1))
            B_conv = tf.get_variable(name="conv_b", initializer=tf.constant(0.1, shape=[200]))

            conv_out_width = num_step - 5 + 1  # width after the VALID convolution over the time axis
            conv_output = tf.nn.relu(tf.nn.conv2d(out_put, W_conv, strides=[1, 1, 1, 1], padding='VALID') + B_conv)
            conv_output = tf.reshape(conv_output, [self.batch_size, conv_out_width, 200, 1])
            max_pool_out = tf.nn.max_pool(conv_output, ksize=[1, conv_out_width, 1, 1],
                                          strides=[1, 1, 1, 1], padding='VALID')
            max_pool_out = tf.reshape(max_pool_out, [self.batch_size, 200])

        with tf.name_scope("Softmax_layer_and_output"):
            softmax_w = tf.get_variable("softmax_w", [200, class_num], dtype=tf.float32)
            softmax_b = tf.get_variable("softmax_b", [class_num], dtype=tf.float32)
            self.logits = tf.matmul(max_pool_out, softmax_w) + softmax_b

        with tf.name_scope("loss"):
            self.loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=self.target)
            self.cost = tf.reduce_mean(self.loss)

        with tf.name_scope("accuracy"):
            self.prediction = tf.argmax(self.logits, 1)
            correct_prediction = tf.equal(self.prediction, self.target)
            self.correct_num = tf.reduce_sum(tf.cast(correct_prediction, tf.float32))
            self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")

        # add summary
        loss_summary = tf.summary.scalar("loss", self.cost)
        # add summary
        accuracy_summary = tf.summary.scalar("accuracy_summary", self.accuracy)

        if not is_training:
            return

        self.global_step = tf.Variable(0, dtype=tf.int32, name="global_step", trainable=False)
        self.lr = tf.Variable(tf.constant(0.8), dtype=tf.float32, trainable=False)

        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                          config.max_grad_norm)

        # Keep track of gradient values and sparsity (optional)
        grad_summaries = []
        for g, v in zip(grads, tvars):
            if g is not None:
                grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
                sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                grad_summaries.append(grad_hist_summary)
                grad_summaries.append(sparsity_summary)
        self.grad_summaries_merged = tf.summary.merge(grad_summaries)

        self.summary = tf.summary.merge([loss_summary, accuracy_summary, self.grad_summaries_merged])

        optimizer = tf.train.GradientDescentOptimizer(self.lr)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=self.global_step)

        self.new_lr = tf.placeholder(tf.float32, shape=[], name="new_learning_rate")
        self._lr_update = tf.assign(self.lr, self.new_lr)

    def assign_new_lr(self, session, lr_value):
        session.run(self._lr_update, feed_dict={self.new_lr: lr_value})

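Below is a minimal usage sketch (not part of the repository) showing how this graph might be built and stepped once under TensorFlow 1.x. The Config class and all of its values are illustrative assumptions; the only hard requirement baked into the model above is the batch size of 64.

import numpy as np
import tensorflow as tf

class Config(object):
    keep_prob = 0.5
    num_step = 40             # assumed sequence length
    class_num = 2
    hidden_neural_size = 600  # assumed LSTM size
    vocabulary_size = 50000   # assumed vocabulary size
    embed_dim = 128           # assumed embedding size
    hidden_layer_num = 1
    max_grad_norm = 5

config = Config()
model = LSTM_CNN_Model(config, is_training=True)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    x = np.random.randint(0, config.vocabulary_size, size=(64, config.num_step))  # fake word ids
    y = np.random.randint(0, config.class_num, size=(64,))                        # fake labels
    mask = np.ones((config.num_step, 64), dtype=np.float32)                       # no padding
    _, cost = sess.run([model.train_op, model.cost],
                       feed_dict={model.input_data: x, model.target: y, model.mask_x: mask})
    print(cost)
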
--------------------------------------------------------------------------------
/Doc2Vec.py:
--------------------------------------------------------------------------------
import pandas as pd  # import pandas
import numpy as np   # import numpy
import jieba         # import the jieba Chinese word segmenter
import xlrd
from keras.preprocessing import sequence
from keras.optimizers import SGD, RMSprop, Adagrad
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU


neg = pd.read_excel('neg.xls', header=None)
pos = pd.read_excel('pos.xls', header=None)            # load the training corpora
pos['mark'] = 1
neg['mark'] = 0                                        # label the training corpora
pn = pd.concat([pos, neg], ignore_index=True)          # merge the corpora
neglen = len(neg)
poslen = len(pos)                                      # corpus sizes

cw = lambda x: list(jieba.cut(x))                      # word segmentation helper
pn['words'] = pn[0].apply(cw)

comment = pd.read_excel('sum.xls')                     # load the review texts
# comment = pd.read_csv('a.csv', encoding='utf-8')
comment = comment[comment['rateContent'].notnull()]    # keep non-empty reviews only
comment['words'] = comment['rateContent'].apply(cw)    # segment the reviews

d2v_train = pd.concat([pn['words'], comment['words']], ignore_index=True)

w = []                                                 # collect every token in one list
for i in d2v_train:
    w.extend(i)

word_dict = pd.DataFrame(pd.Series(w).value_counts())  # word frequency table
del w, d2v_train
word_dict['id'] = list(range(1, len(word_dict) + 1))

get_sent = lambda x: list(word_dict['id'][x])
pn['sent'] = pn['words'].apply(get_sent)               # quite slow; see the faster sketch after this file

maxlen = 50

print("Pad sequences (samples x time)")
pn['sent'] = list(sequence.pad_sequences(pn['sent'], maxlen=maxlen))

x = np.array(list(pn['sent']))[::2]    # training set
y = np.array(list(pn['mark']))[::2]
xt = np.array(list(pn['sent']))[1::2]  # test set
yt = np.array(list(pn['mark']))[1::2]
xa = np.array(list(pn['sent']))        # full set
ya = np.array(list(pn['mark']))

print('Build model...')
model = Sequential()
model.add(Embedding(len(word_dict) + 1, 256))
model.add(LSTM(128))   # originally LSTM(256, 128), but LSTM takes a single size argument, so it became 128; try a GRU instead, for fun
model.add(Dropout(0.5))
model.add(Dense(1))    # likewise, Dense takes a single size argument (originally 128, 1)
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam')

model.fit(x, y, batch_size=16, epochs=1)   # training takes several hours

classes = model.predict_classes(xt)
acc = np.mean(classes.flatten() == yt)     # test-set accuracy
print('Test accuracy:', acc)
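The get_sent mapping above is flagged as slow; here is a small equivalent sketch (not in the repository) that replaces the label-based pandas lookup with a plain dict lookup, assuming the word_dict and pn frames built in Doc2Vec.py:

# Faster equivalent of get_sent: one dict lookup per word instead of
# label-based pandas indexing for every sentence.
id_of = word_dict['id'].to_dict()                                   # word -> integer id
pn['sent'] = pn['words'].apply(lambda ws: [id_of[w] for w in ws])
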
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Text_sentiment_classification
Sentiment classification of Chinese text with an LSTM model
## Introduction
Build a neural network (an LSTM, plus an LSTM+CNN variant in 0-pre.py) and train it on a fixed corpus to classify the sentiment of text.
## How it works
### Chinese word segmentation
Segment the text directly with the jieba tokenizer.
### Word mapping
Map each text into a high-dimensional vector with doc2num.
### Building the network
Train the LSTM model for 30 epochs.
### Prediction
Predictions fall into two classes: positive and negative.
## Notes
### Prediction output
[[1]] denotes a positive text
[[0]] denotes a negative text
### Data files
neg.xls, pos.xls and sum.xls hold the corpora used for training and evaluation.
--------------------------------------------------------------------------------
/embeding.py:
--------------------------------------------------------------------------------
# -*- coding:utf-8 -*-

'''
Word embedding experiment.
On a GTX 960 it takes about 18 s per epoch;
after 30 epochs the training accuracy is 98.41% and the test accuracy is 89.03%.
Do not use too much dropout, otherwise too much information is lost.
'''

import numpy as np
import pandas as pd
import jieba

pos = pd.read_excel('pos.xls', header=None)
pos['label'] = 1
neg = pd.read_excel('neg.xls', header=None)
neg['label'] = 0
all_ = pd.concat([pos, neg], ignore_index=True)
all_['words'] = all_[0].apply(lambda s: list(jieba.cut(s)))  # segment with jieba

maxlen = 100    # truncate each text to this many tokens
min_count = 5   # drop words that occur fewer times than this; the simplest form of dimensionality reduction

content = []
for i in all_['words']:
    content.extend(i)

abc = pd.Series(content).value_counts()
abc = abc[abc >= min_count]
abc[:] = list(range(1, len(abc) + 1))
abc[''] = 0     # empty string used for padding
word_set = set(abc.index)

def doc2num(s, maxlen):
    s = [i for i in s if i in word_set]
    s = s[:maxlen] + [''] * max(0, maxlen - len(s))
    return list(abc[s])

all_['doc2num'] = all_['words'].apply(lambda s: doc2num(s, maxlen))

# shuffle the data manually
idx = list(range(len(all_)))
np.random.shuffle(idx)
all_ = all_.loc[idx]

# arrange the data the way Keras expects it
x = np.array(list(all_['doc2num']))
y = np.array(list(all_['label']))
y = y.reshape((-1, 1))  # reshape the labels

from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Embedding
from keras.layers import LSTM

# build the model
model = Sequential()
model.add(Embedding(len(abc), 256, input_length=maxlen))
model.add(LSTM(128))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

batch_size = 128
train_num = 15000

model.fit(x[:train_num], y[:train_num], batch_size=batch_size, epochs=30)

print(model.evaluate(x[train_num:], y[train_num:], batch_size=batch_size))

def predict_one(s):  # predict a single sentence
    s = np.array(doc2num(list(jieba.cut(s)), maxlen))
    s = s.reshape((1, s.shape[0]))
    return model.predict_classes(s, verbose=0)[0][0]
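pred.py below expects a trained model at ./model/model.h5, but embeding.py as written never saves one. A minimal sketch of the missing step, assuming it is appended to embeding.py after training (the ./model path is taken from pred.py):

import os

os.makedirs('./model', exist_ok=True)  # create the directory pred.py expects
model.save('./model/model.h5')         # Keras HDF5 format; requires h5py
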
--------------------------------------------------------------------------------
/neg.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jaschenn/Text_sentiment_classification/445ed4d672107a41dcab07e3621da7e292ea7806/neg.xls
--------------------------------------------------------------------------------
/pos.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jaschenn/Text_sentiment_classification/445ed4d672107a41dcab07e3621da7e292ea7806/pos.xls
--------------------------------------------------------------------------------
/pred.py:
--------------------------------------------------------------------------------
import keras
import numpy as np
import jieba
import pandas as pd

model = keras.models.load_model("./model/model.h5")

gooddata = list(jieba.cut("热水器已安装上了性价比不错从发货到安装一切顺利 卖家态度很好,物流快 顺丰不是盖的 安装师傅态度和服务也不错 安装材料花了98元"
                          "不知道是不是首次用的原因 不过保温效果不错 今天还很热 很不错的网购 等用过几次再来追评 "
                          " 热水器已安装上了性价比不错从发货到安装一切顺利 卖家态度很好,物流快 顺丰不是盖的 安装师傅态度和服务也不错 安装材料花了98元"
                          "不知道是不是首次用的原因 不过保温效果不错 今天还很热 很不错的网购 等用过几次再来追评 "))

baddata = list(jieba.cut("人多,昏暗,难闻,不好玩。是我见过最差的海洋馆,据说建设年代久远了。里面奶瓶喂鱼25元,小孩小孩。"))

min_count = 1
max_len = 100

def doc2num(s, maxlen):
    # Caveat: this rebuilds the word-to-id table from gooddata alone, so the ids do not
    # match the vocabulary the model was trained on in embeding.py; a sketch that reuses
    # the training vocabulary follows at the end of this document.
    content = []
    for i in gooddata:
        content.extend(i)

    abc = pd.Series(content).value_counts()
    abc = abc[abc >= min_count]
    abc[:] = list(range(1, len(abc) + 1))
    abc[''] = 0  # empty string used for padding
    word_set = set(abc.index)
    s = [i for i in s if i in word_set]
    s = s[:maxlen] + [''] * max(0, maxlen - len(s))
    return list(abc[s])

gooddata = doc2num(gooddata, max_len)
gooddata = np.reshape(gooddata, (1, 100))
baddata = doc2num(baddata, max_len)
baddata = np.reshape(baddata, (1, 100))
print(model.predict_classes(gooddata))
print(model.predict_classes(baddata))
--------------------------------------------------------------------------------
/sum.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jaschenn/Text_sentiment_classification/445ed4d672107a41dcab07e3621da7e292ea7806/sum.xls
--------------------------------------------------------------------------------
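As noted above, doc2num in pred.py derives its word-to-id table from the input sentence itself, so the ids it produces do not correspond to the vocabulary the model was trained with in embeding.py. One way to close that gap is sketched below; it assumes embeding.py is additionally extended to pickle its abc mapping to a hypothetical vocab.pkl file, which is not part of the repository.

# Hypothetical addition to embeding.py, after abc is built:
#     import pickle
#     with open('vocab.pkl', 'wb') as f:
#         pickle.dump(abc, f)

import pickle
import numpy as np
import jieba
import keras

with open('vocab.pkl', 'rb') as f:   # hypothetical file written by embeding.py
    abc = pickle.load(f)             # pandas Series mapping word -> id, with '' -> 0
word_set = set(abc.index)
model = keras.models.load_model('./model/model.h5')

def doc2num(words, maxlen=100):
    # map a segmented sentence onto the ids the model was trained with
    words = [w for w in words if w in word_set]
    words = words[:maxlen] + [''] * max(0, maxlen - len(words))
    return np.array(list(abc[words])).reshape((1, maxlen))

print(model.predict_classes(doc2num(list(jieba.cut("人多,昏暗,难闻,不好玩。")))))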