├── 0-pre.py
├── Doc2Vec.py
├── README.md
├── comments.csv
├── embeding.py
├── neg.xls
├── pos.xls
├── pred.py
└── sum.xls

/0-pre.py:
--------------------------------------------------------------------------------
import tensorflow as tf


class LSTM_CNN_Model(object):

    def __init__(self, config, is_training=True):

        self.keep_prob = config.keep_prob
        self.batch_size = 64  # fixed batch size; zero_state and the reshapes below assume it

        num_step = config.num_step
        self.input_data = tf.placeholder(tf.int32, [None, num_step])
        self.target = tf.placeholder(tf.int64, [None])
        self.mask_x = tf.placeholder(tf.float32, [num_step, None])

        class_num = config.class_num
        hidden_neural_size = config.hidden_neural_size
        vocabulary_size = config.vocabulary_size
        embed_dim = config.embed_dim
        hidden_layer_num = config.hidden_layer_num

        # build the LSTM network; each layer gets its own cell object so layers do not share weights
        def make_cell():
            cell = tf.contrib.rnn.BasicLSTMCell(hidden_neural_size, forget_bias=0.0, state_is_tuple=True)
            if self.keep_prob < 1:
                cell = tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob)
            return cell

        cell = tf.contrib.rnn.MultiRNNCell([make_cell() for _ in range(hidden_layer_num)], state_is_tuple=True)

        self._initial_state = cell.zero_state(self.batch_size, tf.float32)

        # embedding layer
        with tf.device("/cpu:0"), tf.name_scope("embedding_layer"):
            embedding = tf.get_variable("embedding", [vocabulary_size, embed_dim], dtype=tf.float32)
            inputs = tf.nn.embedding_lookup(embedding, self.input_data)

        if self.keep_prob < 1:
            inputs = tf.nn.dropout(inputs, self.keep_prob)

        out_put = []
        state = self._initial_state
        with tf.variable_scope("LSTM_layer"):
            for time_step in range(num_step):
                if time_step > 0:
                    tf.get_variable_scope().reuse_variables()
                (cell_output, state) = cell(inputs[:, time_step, :], state)
                out_put.append(cell_output)

        # stack to [num_step, batch, hidden] and zero out the padded positions
        out_put = tf.stack(out_put) * self.mask_x[:, :, None]

        with tf.name_scope("Conv_layer"):
            # [num_step, batch, hidden] -> [batch, hidden, num_step, 1]
            out_put = tf.transpose(out_put, [1, 2, 0])
            out_put = tf.reshape(out_put, [self.batch_size, hidden_neural_size, num_step, -1])

            # the filter spans the whole hidden dimension and a window of 5 time steps
            W_conv = tf.get_variable(name="conv_w",
                                     initializer=tf.truncated_normal(shape=[hidden_neural_size, 5, 1, 200], stddev=0.1))
            B_conv = tf.get_variable(name="conv_b", initializer=tf.constant(0.1, shape=[200]))

            conv_out_width = num_step - 5 + 1  # width after the VALID convolution over the time axis
            conv_output = tf.nn.relu(tf.nn.conv2d(out_put, W_conv, strides=[1, 1, 1, 1], padding='VALID') + B_conv)
            conv_output = tf.reshape(conv_output, [self.batch_size, conv_out_width, 200, 1])
            max_pool_out = tf.nn.max_pool(conv_output, ksize=[1, conv_out_width, 1, 1],
                                          strides=[1, 1, 1, 1], padding='VALID')
            max_pool_out = tf.reshape(max_pool_out, [self.batch_size, 200])

        with tf.name_scope("Softmax_layer_and_output"):
            softmax_w = tf.get_variable("softmax_w", [200, class_num], dtype=tf.float32)
            softmax_b = tf.get_variable("softmax_b", [class_num], dtype=tf.float32)
            self.logits = tf.matmul(max_pool_out, softmax_w) + softmax_b

        with tf.name_scope("loss"):
            self.loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=self.target)
            self.cost = tf.reduce_mean(self.loss)

        with tf.name_scope("accuracy"):
            self.prediction = tf.argmax(self.logits, 1)
            correct_prediction = tf.equal(self.prediction, self.target)
            self.correct_num = tf.reduce_sum(tf.cast(correct_prediction, tf.float32))
            self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")

        # add summary
        loss_summary = tf.summary.scalar("loss", self.cost)
        # add summary
        accuracy_summary = tf.summary.scalar("accuracy_summary", self.accuracy)

        if not is_training:
            return

        self.global_step = tf.Variable(0, dtype=tf.int32, name="global_step", trainable=False)
        self.lr = tf.Variable(tf.constant(0.8), dtype=tf.float32, trainable=False)

        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                          config.max_grad_norm)

        # Keep track of gradient values and sparsity (optional)
        grad_summaries = []
        for g, v in zip(grads, tvars):
            if g is not None:
                grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
                sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                grad_summaries.append(grad_hist_summary)
                grad_summaries.append(sparsity_summary)
        self.grad_summaries_merged = tf.summary.merge(grad_summaries)

        self.summary = tf.summary.merge([loss_summary, accuracy_summary, self.grad_summaries_merged])

        optimizer = tf.train.GradientDescentOptimizer(self.lr)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=self.global_step)

        self.new_lr = tf.placeholder(tf.float32, shape=[], name="new_learning_rate")
        self._lr_update = tf.assign(self.lr, self.new_lr)

    def assign_new_lr(self, session, lr_value):
        session.run(self._lr_update, feed_dict={self.new_lr: lr_value})

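Below is a minimal usage sketch (not part of the repository) showing how this graph might be built and stepped once under TensorFlow 1.x. The Config class and all of its values are illustrative assumptions; the only hard requirement baked into the model above is the batch size of 64.

import numpy as np
import tensorflow as tf

class Config(object):
    keep_prob = 0.5
    num_step = 40             # assumed sequence length
    class_num = 2
    hidden_neural_size = 600  # assumed LSTM size
    vocabulary_size = 50000   # assumed vocabulary size
    embed_dim = 128           # assumed embedding size
    hidden_layer_num = 1
    max_grad_norm = 5

config = Config()
model = LSTM_CNN_Model(config, is_training=True)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    x = np.random.randint(0, config.vocabulary_size, size=(64, config.num_step))  # fake word ids
    y = np.random.randint(0, config.class_num, size=(64,))                        # fake labels
    mask = np.ones((config.num_step, 64), dtype=np.float32)                       # no padding
    _, cost = sess.run([model.train_op, model.cost],
                       feed_dict={model.input_data: x, model.target: y, model.mask_x: mask})
    print(cost)
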
--------------------------------------------------------------------------------
/Doc2Vec.py:
--------------------------------------------------------------------------------
import pandas as pd  # import pandas
import numpy as np   # import numpy
import jieba         # import the jieba Chinese word segmenter
import xlrd
from keras.preprocessing import sequence
from keras.optimizers import SGD, RMSprop, Adagrad
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU


neg = pd.read_excel('neg.xls', header=None)
pos = pd.read_excel('pos.xls', header=None)            # load the training corpora
pos['mark'] = 1
neg['mark'] = 0                                        # label the training corpora
pn = pd.concat([pos, neg], ignore_index=True)          # merge the corpora
neglen = len(neg)
poslen = len(pos)                                      # corpus sizes

cw = lambda x: list(jieba.cut(x))                      # word segmentation helper
pn['words'] = pn[0].apply(cw)

comment = pd.read_excel('sum.xls')                     # load the review texts
# comment = pd.read_csv('a.csv', encoding='utf-8')
comment = comment[comment['rateContent'].notnull()]    # keep non-empty reviews only
comment['words'] = comment['rateContent'].apply(cw)    # segment the reviews

d2v_train = pd.concat([pn['words'], comment['words']], ignore_index=True)

w = []                                                 # collect every token in one list
for i in d2v_train:
    w.extend(i)

word_dict = pd.DataFrame(pd.Series(w).value_counts())  # word frequency table
del w, d2v_train
word_dict['id'] = list(range(1, len(word_dict) + 1))

get_sent = lambda x: list(word_dict['id'][x])
pn['sent'] = pn['words'].apply(get_sent)               # quite slow; see the faster sketch after this file

maxlen = 50

print("Pad sequences (samples x time)")
pn['sent'] = list(sequence.pad_sequences(pn['sent'], maxlen=maxlen))

x = np.array(list(pn['sent']))[::2]    # training set
y = np.array(list(pn['mark']))[::2]
xt = np.array(list(pn['sent']))[1::2]  # test set
yt = np.array(list(pn['mark']))[1::2]
xa = np.array(list(pn['sent']))        # full set
ya = np.array(list(pn['mark']))

print('Build model...')
model = Sequential()
model.add(Embedding(len(word_dict) + 1, 256))
model.add(LSTM(128))   # originally LSTM(256, 128), but LSTM takes a single size argument, so it became 128; try a GRU instead, for fun
model.add(Dropout(0.5))
model.add(Dense(1))    # likewise, Dense takes a single size argument (originally 128, 1)
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam')

model.fit(x, y, batch_size=16, epochs=1)   # training takes several hours

classes = model.predict_classes(xt)
acc = np.mean(classes.flatten() == yt)     # test-set accuracy
print('Test accuracy:', acc)
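The get_sent mapping above is flagged as slow; here is a small equivalent sketch (not in the repository) that replaces the label-based pandas lookup with a plain dict lookup, assuming the word_dict and pn frames built in Doc2Vec.py:

# Faster equivalent of get_sent: one dict lookup per word instead of
# label-based pandas indexing for every sentence.
id_of = word_dict['id'].to_dict()                                   # word -> integer id
pn['sent'] = pn['words'].apply(lambda ws: [id_of[w] for w in ws])
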
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Text_sentiment_classification
Sentiment classification of Chinese text with an LSTM model
## Introduction
Build a neural network (an LSTM, plus an LSTM+CNN variant in 0-pre.py) and train it on a fixed corpus to classify the sentiment of text.
## How it works
### Chinese word segmentation
Segment the text directly with the jieba tokenizer.
### Word mapping
Map each text into a high-dimensional vector with doc2num.
### Building the network
Train the LSTM model for 30 epochs.
### Prediction
Predictions fall into two classes: positive and negative.
## Notes
### Prediction output
[[1]] denotes a positive text
[[0]] denotes a negative text
### Data files
neg.xls, pos.xls and sum.xls hold the corpora used for training and evaluation.
--------------------------------------------------------------------------------
/embeding.py:
--------------------------------------------------------------------------------
# -*- coding:utf-8 -*-

'''
Word embedding experiment.
On a GTX 960 it takes about 18 s per epoch;
after 30 epochs the training accuracy is 98.41% and the test accuracy is 89.03%.
Do not use too much dropout, otherwise too much information is lost.
'''

import numpy as np
import pandas as pd
import jieba

pos = pd.read_excel('pos.xls', header=None)
pos['label'] = 1
neg = pd.read_excel('neg.xls', header=None)
neg['label'] = 0
all_ = pd.concat([pos, neg], ignore_index=True)
all_['words'] = all_[0].apply(lambda s: list(jieba.cut(s)))  # segment with jieba

maxlen = 100    # truncate each text to this many tokens
min_count = 5   # drop words that occur fewer times than this; the simplest form of dimensionality reduction

content = []
for i in all_['words']:
    content.extend(i)

abc = pd.Series(content).value_counts()
abc = abc[abc >= min_count]
abc[:] = list(range(1, len(abc) + 1))
abc[''] = 0     # empty string used for padding
word_set = set(abc.index)

def doc2num(s, maxlen):
    s = [i for i in s if i in word_set]
    s = s[:maxlen] + [''] * max(0, maxlen - len(s))
    return list(abc[s])

all_['doc2num'] = all_['words'].apply(lambda s: doc2num(s, maxlen))

# shuffle the data manually
idx = list(range(len(all_)))
np.random.shuffle(idx)
all_ = all_.loc[idx]

# arrange the data the way Keras expects it
x = np.array(list(all_['doc2num']))
y = np.array(list(all_['label']))
y = y.reshape((-1, 1))  # reshape the labels

from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Embedding
from keras.layers import LSTM

# build the model
model = Sequential()
model.add(Embedding(len(abc), 256, input_length=maxlen))
model.add(LSTM(128))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

batch_size = 128
train_num = 15000

model.fit(x[:train_num], y[:train_num], batch_size=batch_size, epochs=30)

print(model.evaluate(x[train_num:], y[train_num:], batch_size=batch_size))

def predict_one(s):  # predict a single sentence
    s = np.array(doc2num(list(jieba.cut(s)), maxlen))
    s = s.reshape((1, s.shape[0]))
    return model.predict_classes(s, verbose=0)[0][0]
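pred.py below expects a trained model at ./model/model.h5, but embeding.py as written never saves one. A minimal sketch of the missing step, assuming it is appended to embeding.py after training (the ./model path is taken from pred.py):

import os

os.makedirs('./model', exist_ok=True)  # create the directory pred.py expects
model.save('./model/model.h5')         # Keras HDF5 format; requires h5py
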
--------------------------------------------------------------------------------
/neg.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jaschenn/Text_sentiment_classification/445ed4d672107a41dcab07e3621da7e292ea7806/neg.xls
--------------------------------------------------------------------------------
/pos.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jaschenn/Text_sentiment_classification/445ed4d672107a41dcab07e3621da7e292ea7806/pos.xls
--------------------------------------------------------------------------------
/pred.py:
--------------------------------------------------------------------------------
import keras
import numpy as np
import jieba
import pandas as pd

model = keras.models.load_model("./model/model.h5")

gooddata = list(jieba.cut("热水器已安装上了性价比不错从发货到安装一切顺利 卖家态度很好,物流快 顺丰不是盖的 安装师傅态度和服务也不错 安装材料花了98元"
                          "不知道是不是首次用的原因 不过保温效果不错 今天还很热 很不错的网购 等用过几次再来追评 "
                          " 热水器已安装上了性价比不错从发货到安装一切顺利 卖家态度很好,物流快 顺丰不是盖的 安装师傅态度和服务也不错 安装材料花了98元"
                          "不知道是不是首次用的原因 不过保温效果不错 今天还很热 很不错的网购 等用过几次再来追评 "))

baddata = list(jieba.cut("人多,昏暗,难闻,不好玩。是我见过最差的海洋馆,据说建设年代久远了。里面奶瓶喂鱼25元,小孩小孩。"))

min_count = 1
max_len = 100

def doc2num(s, maxlen):
    # Caveat: this rebuilds the word-to-id table from gooddata alone, so the ids do not
    # match the vocabulary the model was trained on in embeding.py; a sketch that reuses
    # the training vocabulary follows at the end of this document.
    content = []
    for i in gooddata:
        content.extend(i)

    abc = pd.Series(content).value_counts()
    abc = abc[abc >= min_count]
    abc[:] = list(range(1, len(abc) + 1))
    abc[''] = 0  # empty string used for padding
    word_set = set(abc.index)
    s = [i for i in s if i in word_set]
    s = s[:maxlen] + [''] * max(0, maxlen - len(s))
    return list(abc[s])

gooddata = doc2num(gooddata, max_len)
gooddata = np.reshape(gooddata, (1, 100))
baddata = doc2num(baddata, max_len)
baddata = np.reshape(baddata, (1, 100))
print(model.predict_classes(gooddata))
print(model.predict_classes(baddata))
--------------------------------------------------------------------------------
/sum.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jaschenn/Text_sentiment_classification/445ed4d672107a41dcab07e3621da7e292ea7806/sum.xls
--------------------------------------------------------------------------------
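As noted above, doc2num in pred.py derives its word-to-id table from the input sentence itself, so the ids it produces do not correspond to the vocabulary the model was trained with in embeding.py. One way to close that gap is sketched below; it assumes embeding.py is additionally extended to pickle its abc mapping to a hypothetical vocab.pkl file, which is not part of the repository.

# Hypothetical addition to embeding.py, after abc is built:
#     import pickle
#     with open('vocab.pkl', 'wb') as f:
#         pickle.dump(abc, f)

import pickle
import numpy as np
import jieba
import keras

with open('vocab.pkl', 'rb') as f:   # hypothetical file written by embeding.py
    abc = pickle.load(f)             # pandas Series mapping word -> id, with '' -> 0
word_set = set(abc.index)
model = keras.models.load_model('./model/model.h5')

def doc2num(words, maxlen=100):
    # map a segmented sentence onto the ids the model was trained with
    words = [w for w in words if w in word_set]
    words = words[:maxlen] + [''] * max(0, maxlen - len(words))
    return np.array(list(abc[words])).reshape((1, maxlen))

print(model.predict_classes(doc2num(list(jieba.cut("人多,昏暗,难闻,不好玩。")))))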