├── ATENTION_LSTM_model
│   ├── ATENTION_LSTM.h5
│   └── log
│       └── events.out.tfevents.1534540449.������MC
├── Attention_layer.py
├── Classifier_model_train.py
├── Config.py
├── MLP_model
│   ├── MLP.h5
│   └── log
│       └── events.out.tfevents.1534538333.������MC
├── Predict.py
├── README.md
├── Utils.py
└── data
    └── mpk
        ├── neg.xls
        └── pos.xls

--------------------------------------------------------------------------------
/ATENTION_LSTM_model/ATENTION_LSTM.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guimaxing/sentiment-analysis-MLP-BiRNN-BiGRU-Attention-Model/d448434e8c46a49627822ee12b5970ba1a17421e/ATENTION_LSTM_model/ATENTION_LSTM.h5
--------------------------------------------------------------------------------
/ATENTION_LSTM_model/log/events.out.tfevents.1534540449.������MC:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guimaxing/sentiment-analysis-MLP-BiRNN-BiGRU-Attention-Model/d448434e8c46a49627822ee12b5970ba1a17421e/ATENTION_LSTM_model/log/events.out.tfevents.1534540449.������MC
--------------------------------------------------------------------------------
/Attention_layer.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Fri Aug 17 12:39:26 2018
4 | @author: Moc
5 | """
6 | 
7 | from keras import backend as K
8 | from keras.engine.topology import Layer
9 | #from keras import initializations, regularizers, constraints
10 | from keras import initializers, regularizers, constraints
11 | 
12 | class Attention_layer(Layer):
13 |     """
14 |     Attention operation, with a context/query vector, for temporal data.
15 |     Supports Masking.
16 |     Follows the work of Yang et al. [https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf]
17 |     "Hierarchical Attention Networks for Document Classification"
18 |     by using a context vector to assist the attention
19 |     # Input shape
20 |         3D tensor with shape: `(samples, steps, features)`.
21 |     # Output shape
22 |         2D tensor with shape: `(samples, features)`.
23 |     :param kwargs:
24 |     Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
25 |     The dimensions are inferred based on the output shape of the RNN.
26 |     Example:
27 |         model.add(LSTM(64, return_sequences=True))
28 |         model.add(Attention_layer())
29 |     """
30 | 
31 |     def __init__(self,
32 |                  W_regularizer=None, b_regularizer=None,
33 |                  W_constraint=None, b_constraint=None,
34 |                  bias=True, **kwargs):
35 | 
36 |         self.supports_masking = True
37 |         self.init = initializers.get('glorot_uniform')
38 | 
39 |         self.W_regularizer = regularizers.get(W_regularizer)
40 |         self.b_regularizer = regularizers.get(b_regularizer)
41 | 
42 |         self.W_constraint = constraints.get(W_constraint)
43 |         self.b_constraint = constraints.get(b_constraint)
44 | 
45 |         self.bias = bias
46 |         super(Attention_layer, self).__init__(**kwargs)
47 | 
48 |     def build(self, input_shape):
49 |         # print('input_shape',input_shape)
50 |         assert len(input_shape) == 3
51 |         # assert acts as raise-if-not: the layer expects 3D input (samples, steps, features)
52 |         self.W = self.add_weight((input_shape[-1], input_shape[-1],),
53 |                                  initializer=self.init,
54 |                                  name='{}_W'.format(self.name),
55 |                                  regularizer=self.W_regularizer,
56 |                                  constraint=self.W_constraint)
57 |         if self.bias:
58 |             self.b = self.add_weight((input_shape[-1],),
59 |                                      initializer='zero',
60 |                                      name='{}_b'.format(self.name),
61 |                                      regularizer=self.b_regularizer,
62 |                                      constraint=self.b_constraint)
63 | 
64 |         super(Attention_layer, self).build(input_shape)
65 | 
66 |     def compute_mask(self, input, input_mask=None):
67 |         # do not pass the mask to the next layers
68 |         return None
69 | 
70 |     def call(self, x, mask=None):
71 |         uit = K.dot(x, self.W)
72 | 
73 |         if self.bias:
74 |             uit += self.b
75 | 
76 |         uit = K.tanh(uit)
77 | 
78 |         a = K.exp(uit)
79 | 
80 |         # apply mask after the exp. will be re-normalized next
81 |         if mask is not None:
82 |             # Cast the mask to floatX to avoid float64 upcasting in theano
83 |             a *= K.cast(mask, K.floatx())
84 | 
85 |         # in some cases especially in the early stages of training the sum may be almost zero
86 |         # and this results in NaN's. A workaround is to add a very small positive number to the sum.
87 |         # a /= K.cast(K.sum(a, axis=1, keepdims=True), K.floatx())
88 |         a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
89 |         weighted_input = x * a
90 |         return K.sum(weighted_input, axis=1)
91 | 
92 |     def compute_output_shape(self, input_shape):
93 |         return input_shape[0], input_shape[-1]
94 | 
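
For intuition, the forward pass above (in the unmasked case) can be reproduced in a few lines of NumPy: each timestep is scored with tanh(x·W + b), the scores are exponentiated and normalized over the time axis, and the output is the weighted sum of the inputs. The sketch below is illustrative only, with made-up shapes, and is not part of the repository.

import numpy as np

def attention_forward(x, W, b):
    # mirrors Attention_layer.call for the unmasked case
    uit = np.tanh(np.matmul(x, W) + b)                  # (samples, steps, features)
    a = np.exp(uit)
    a = a / (np.sum(a, axis=1, keepdims=True) + 1e-7)   # normalize over the time axis
    return np.sum(x * a, axis=1)                        # (samples, features)

x = np.random.rand(2, 5, 4)   # 2 samples, 5 timesteps, 4 features (hypothetical)
W = np.random.rand(4, 4)
b = np.zeros(4)
print(attention_forward(x, W, b).shape)                 # -> (2, 4)
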
--------------------------------------------------------------------------------
/Classifier_model_train.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sat Aug 18 13:29:25 2018
4 | @author: Moc
5 | """
6 | 
7 | import os
8 | import jieba
9 | import numpy as np
10 | import pandas as pd
11 | from keras.preprocessing.text import Tokenizer
12 | from keras.preprocessing.sequence import pad_sequences
13 | from keras.utils.np_utils import to_categorical
14 | 
15 | from keras.layers import Masking
16 | from keras.layers import Dense, Input, Flatten
17 | from keras.layers import Conv1D, GlobalMaxPooling1D, Embedding, Dropout, LSTM, GRU, Bidirectional
18 | from keras.models import Sequential, Model
19 | 
20 | from keras.callbacks import TensorBoard
21 | from keras.callbacks import ModelCheckpoint
22 | 
23 | from Config import epochs,batch_size,choice
24 | from Config import SENTENCE_NUM,MAX_SEQUENCE_LENGTH,MAX_NB_WORDS,EMBEDDING_DIM,VALIDATION_SPLIT
25 | from Utils import model_select
26 | 
27 | # Load the training files (positive and negative reviews)
28 | def loadfile():
29 |     neg=pd.read_excel('./data/mpk/neg.xls',header=None,index_col=None)
30 |     pos=pd.read_excel('./data/mpk/pos.xls',header=None,index_col=None)
31 | 
32 |     combined=np.concatenate((pos[0], neg[0]))
33 |     y = np.concatenate((np.ones(len(pos),dtype=int), np.zeros(len(neg),dtype=int)))
34 | 
35 |     return combined,y
36 | 
37 | # Tokenize each sentence with jieba and strip newline characters
38 | def split_sentence(text):
39 |     text = [jieba.lcut(document.replace('\n', '')) for document in text]
40 |     return text
41 | 
42 | 
43 | # Read the pre-trained word vectors and build a word -> vector dict
44 | def embedding_dict():
45 |     embeddings_index = {}
46 |     f = open('./data/zhwiki_2017_03.sg_50d.word2vec',encoding='utf-8')
47 |     for line in f:
48 |         values = line.split()
49 |         word = values[0]
50 |         coefs = np.asarray(values[1:], dtype='float32')
51 |         embeddings_index[word] = coefs
52 |     f.close()
53 |     print('Total %s word vectors.' % len(embeddings_index))
54 |     return embeddings_index
55 | 
56 | 
57 | # Tokenize the corpus and pad every sequence to a fixed length
58 | def data_pad(texts):
59 |     tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
60 |     tokenizer.fit_on_texts(texts)
61 |     sequences = tokenizer.texts_to_sequences(texts)
62 |     word_index = tokenizer.word_index
63 |     data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
64 |     return data,word_index
65 | 
66 | 
67 | # Build the shuffled training / validation split
68 | def data_classifier():
69 |     combined,y = loadfile()
70 |     texts = split_sentence(combined)
71 |     labels = y
72 |     labels = to_categorical(np.asarray(labels))
73 |     print('Shape of data tensor:', len(texts))
74 |     print('Shape of label tensor:', len(labels))
75 | 
76 |     data,word_index = data_pad(texts)
77 | 
78 |     indices = np.arange(data.shape[0])
79 |     np.random.shuffle(indices)
80 |     data = data[indices]
81 |     labels = labels[indices]
82 | 
83 |     nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
84 |     x_train = data[:-nb_validation_samples]
85 |     y_train = labels[:-nb_validation_samples]
86 |     x_val = data[-nb_validation_samples:]
87 |     y_val = labels[-nb_validation_samples:]
88 |     return x_train,y_train,x_val,y_val,word_index
89 | 
90 | # Pick the model named by `choice`, plus its log and checkpoint paths
91 | def model_(word_index,embeddings_index,choice):
92 |     M = model_select()
93 |     log_dir = './{}_model/log'.format(choice)
94 |     filepath = './{}_model/{}.h5'.format(choice,choice)
95 |     if choice == 'BIGRU':
96 |         model = M.BIGRU_model(word_index,embeddings_index)
97 |     elif choice == 'BILSTM':
98 |         model = M.BILSTM_model(word_index,embeddings_index)
99 |     elif choice == 'MLP':
100 |         model = M.MLP_model(word_index,embeddings_index)
101 |     elif choice == 'ATENTION_LSTM':
102 |         model = M.ATENTION_LSTM_model(word_index,embeddings_index)
103 |     elif choice == 'ATTENTION_GRU':
104 |         model = M.ATTENTION_GRU_model(word_index,embeddings_index)
105 |     else:
106 |         print('The selected model is not available; choose one of BIGRU/BILSTM/MLP/ATENTION_LSTM/ATTENTION_GRU')
107 |         # To abort the program here, uncomment os._exit(0)
108 |         # os._exit(0)
109 |     return log_dir,filepath,model
110 | 
111 | def mdk(dir_):
112 |     if not os.path.exists(dir_):
113 |         os.makedirs(dir_)
114 | 
115 | # Build the model and run training
116 | def train(epochs,batch_size,choice):
117 |     x_train,y_train,x_val,y_val,word_index = data_classifier()
118 |     embeddings_index = embedding_dict()
119 |     log_dir,filepath,model = model_(word_index,embeddings_index,choice)
120 |     print(log_dir, filepath)
121 |     mdk(log_dir)
122 | 
123 |     print('Training and validation set number of positive and negative reviews')
124 |     print(y_train.sum(axis=0))
125 |     print(y_val.sum(axis=0))
126 | 
127 |     tensorboard = TensorBoard(log_dir=log_dir)
128 |     # Keep only the best checkpoint, judged by validation accuracy
129 |     checkpoint = ModelCheckpoint(filepath=filepath,monitor='val_acc',mode='max',save_best_only=True)
130 | 
131 |     callback_lists=[tensorboard,checkpoint]
132 |     model.fit(x_train, y_train, validation_data=(x_val, y_val),
133 |               epochs=epochs, batch_size=batch_size, verbose=1, callbacks=callback_lists)
134 | 
135 |     # Evaluate on the validation set
136 |     score = model.evaluate(x_val, y_val, batch_size=batch_size)
137 |     print('loss: {} acc: {}'.format(score[0], score[1]))
138 | 
139 | 
140 | if __name__ == '__main__':
141 |     epochs = epochs
142 |     batch_size = batch_size
143 |     choice = choice
144 |     train(epochs,batch_size,choice)
145 | 
146 | 
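
One caveat: the Tokenizer fitted in data_pad() is never saved, while Predict.py later fits a fresh Tokenizer on each input string, so the word indices at prediction time do not match the ones the model was trained on. A common remedy is to pickle the fitted tokenizer alongside the model. The helper below is a minimal sketch of that idea; the file name tokenizer.pkl and the helper names are choices made here, not part of the repository.

import pickle

def save_tokenizer(tokenizer, path='./data/tokenizer.pkl'):
    # call this right after tokenizer.fit_on_texts(texts) in data_pad()
    with open(path, 'wb') as f:
        pickle.dump(tokenizer, f)

def load_tokenizer(path='./data/tokenizer.pkl'):
    with open(path, 'rb') as f:
        return pickle.load(f)
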
--------------------------------------------------------------------------------
/Config.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sat Aug 18 13:29:25 2018
4 | @author: Moc
5 | """
6 | 
7 | # Data and model hyper-parameters
8 | SENTENCE_NUM = 21105
9 | MAX_SEQUENCE_LENGTH = 1000
10 | MAX_NB_WORDS = 20000
11 | EMBEDDING_DIM = 50
12 | VALIDATION_SPLIT = 0.2
13 | 
14 | 
15 | # Training
16 | epochs = 10
17 | batch_size = 50
18 | #choice : BIGRU/BILSTM/MLP/ATENTION_LSTM/ATTENTION_GRU
19 | choice = 'BILSTM'
20 | 
21 | 
22 | # Prediction
23 | model_file = './MLP_model/MLP.h5'
24 | #model_file = './BILSTM_model/BILSTM.h5'
25 | #model_file = './ATENTION_LSTM_model/ATENTION_LSTM.h5'
26 | # Sample Chinese reviews used by Predict.py (negative, positive, negative)
27 | string_list = ['跟想象中差太多,我自己买了100多的配件,你们太夸张了,太不满意了',
28 |                '天气很好,非常开心',
29 |                '在这家店买的东西质量很差,一点诚信都没有,不会再光顾了']
30 | 
31 | 
--------------------------------------------------------------------------------
/MLP_model/MLP.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guimaxing/sentiment-analysis-MLP-BiRNN-BiGRU-Attention-Model/d448434e8c46a49627822ee12b5970ba1a17421e/MLP_model/MLP.h5
--------------------------------------------------------------------------------
/MLP_model/log/events.out.tfevents.1534538333.������MC:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guimaxing/sentiment-analysis-MLP-BiRNN-BiGRU-Attention-Model/d448434e8c46a49627822ee12b5970ba1a17421e/MLP_model/log/events.out.tfevents.1534538333.������MC
--------------------------------------------------------------------------------
/Predict.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sat Aug 18 13:29:25 2018
4 | @author: Moc
5 | """
6 | 
7 | import jieba
8 | from keras.preprocessing.text import Tokenizer
9 | from keras.preprocessing.sequence import pad_sequences
10 | from keras.models import load_model
11 | from Attention_layer import Attention_layer
12 | from Config import SENTENCE_NUM,MAX_SEQUENCE_LENGTH,MAX_NB_WORDS,EMBEDDING_DIM,VALIDATION_SPLIT
13 | from Config import model_file,string_list
14 | 
15 | # Tokenize each sentence with jieba and strip newline characters
16 | def split_sentence(text):
17 |     text = [jieba.lcut(document.replace('\n', '')) for document in text]
18 |     return text
19 | 
20 | # Predict the class probabilities for a single string
21 | def predict_result(model, string):
22 |     tx = [string]
23 |     txs = split_sentence(tx)
24 |     tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
25 |     tokenizer.fit_on_texts(txs)
26 |     sequences = tokenizer.texts_to_sequences(txs)
27 |     # word_index = tokenizer.word_index
28 |     data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
29 |     result = model.predict(data)
30 |     result_0 = result[0][0]
31 |     result_1 = result[0][1]
32 |     return result_0, result_1
33 | 
34 | if __name__ == '__main__':
35 |     """
36 |     Loading a model that contains a custom loss or a custom layer
37 |     tips:
38 |     load_model provides a custom_objects argument, which must be supplied when loading.
39 | 
40 |     If the custom loss function is named cosloss, load the model like this:
41 |     from * import cosloss
42 |     model = load_model(model_file, {'cosloss':cosloss})
43 | 
44 |     If the custom layer is named Attention_layer, load the model like this:
45 |     from Attention_layer import Attention_layer
46 |     model = load_model(model_file,{'Attention_layer':Attention_layer})
47 |     """
48 | 
49 |     model_file = model_file
50 |     string_list = string_list
51 | 
52 |     print('loading model......')
53 |     model = load_model(model_file,{'Attention_layer':Attention_layer})
54 |     # model.compile(loss='categorical_crossentropy',
55 |     #               optimizer='rmsprop',metrics=['acc'])
56 |     # model.summary()
57 |     print('--------------------------------')
58 |     print('Prediction results')
59 | 
60 |     for i, string in enumerate(string_list):
61 |         result_0, result_1 = predict_result(model, string)
62 |         if result_0 > result_1:
63 |             print('Text {} is predicted as class 0 (negative) with probability {}'.format(i+1, result_0))
64 |         else:
65 |             print('Text {} is predicted as class 1 (positive) with probability {}'.format(i+1, result_1))
66 | 
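
If the tokenizer persisted during training (see the sketch after Classifier_model_train.py) is available, prediction can reuse it instead of re-fitting on each input string, which keeps the word indices consistent with training. A minimal sketch under that assumption; the pickle path and the function name are hypothetical, not part of the repository.

import jieba
import pickle
from keras.preprocessing.sequence import pad_sequences
from Config import MAX_SEQUENCE_LENGTH

def predict_with_saved_tokenizer(model, string, tokenizer_path='./data/tokenizer.pkl'):
    # variant of predict_result that reuses the tokenizer fitted on the training corpus
    with open(tokenizer_path, 'rb') as f:
        tokenizer = pickle.load(f)
    words = jieba.lcut(string.replace('\n', ''))
    sequences = tokenizer.texts_to_sequences([words])
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    result = model.predict(data)
    return result[0][0], result[0][1]
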
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | zh_project
2 | 
3 | Word-vector download:
4 | https://pan.baidu.com/s/1sKO_MZqzcGccJm86NrKEtw (extraction code: q0en)
5 | Put the downloaded file in the data folder.
6 | 
7 | 
8 | Files
9 | Utils: model selection
10 | Config: training and prediction parameters
11 | Attention_layer: custom attention layer
12 | Predict: prediction
13 | Classifier_model_train: model training
14 | 
15 | 
16 | Folders
17 | data: training data and word vectors
18 | *_model: where trained models are saved
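
For orientation, training can also be driven programmatically instead of editing choice in Config.py and running python Classifier_model_train.py. The snippet below is a sketch that assumes the word-vector file from the download link above is already in ./data/.

from Classifier_model_train import train

# trains the chosen architecture and writes ./ATENTION_LSTM_model/ATENTION_LSTM.h5
train(epochs=10, batch_size=50, choice='ATENTION_LSTM')
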
--------------------------------------------------------------------------------
/Utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sat Aug 18 13:29:25 2018
4 | @author: Moc
5 | """
6 | import numpy as np
7 | import pandas as pd
8 | from keras.preprocessing.text import Tokenizer
9 | from keras.preprocessing.sequence import pad_sequences
10 | from keras.utils.np_utils import to_categorical
11 | 
12 | from keras.layers import Masking
13 | from keras.layers import Dense, Input, Flatten
14 | from keras.layers import Conv1D, GlobalMaxPooling1D, Embedding, Dropout, LSTM, GRU, Bidirectional
15 | from keras.models import Sequential, Model
16 | from Attention_layer import Attention_layer
17 | 
18 | from keras.callbacks import TensorBoard
19 | from keras.callbacks import ModelCheckpoint
20 | 
21 | from Config import SENTENCE_NUM,MAX_SEQUENCE_LENGTH,MAX_NB_WORDS,EMBEDDING_DIM,VALIDATION_SPLIT
22 | 
23 | class model_select():
24 |     # Embedding layer initialised from the pre-trained word vectors
25 |     def emb_model_layer(self,word_index,embeddings_index):
26 |         embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
27 |         for word, i in word_index.items():
28 |             embedding_vector = embeddings_index.get(word)
29 |             if embedding_vector is not None:
30 |                 # words not found in the embedding index keep their random initialisation
31 |                 embedding_matrix[i] = embedding_vector
32 |         print ('Length of embedding_matrix:', embedding_matrix.shape[0])
33 |         embedding_layer = Embedding(len(word_index) + 1,
34 |                                     EMBEDDING_DIM,
35 |                                     weights=[embedding_matrix],
36 |                                     mask_zero=False,
37 |                                     input_length=MAX_SEQUENCE_LENGTH,
38 |                                     trainable=False)
39 |         return embedding_layer
40 | 
41 |     def BIGRU_model(self,word_index,embeddings_index):
42 |         print('Building the BIGRU model')
43 |         embedding_layer = self.emb_model_layer(word_index,embeddings_index)
44 |         sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
45 |         embedded_sequences = embedding_layer(sequence_input)
46 |         l_gru = Bidirectional(GRU(100, return_sequences=False))(embedded_sequences)
47 |         dense_1 = Dense(100,activation='tanh')(l_gru)
48 |         dense_2 = Dense(2, activation='softmax')(dense_1)
49 | 
50 |         model = Model(sequence_input, dense_2)
51 | 
52 |         model.compile(loss='categorical_crossentropy',
53 |                       optimizer='rmsprop',
54 |                       metrics=['acc'])
55 | 
56 |         model.summary()
57 |         return model
58 | 
59 |     def BILSTM_model(self,word_index,embeddings_index):
60 |         print('Building the BILSTM model')
61 |         embedding_layer = self.emb_model_layer(word_index,embeddings_index)
62 |         sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
63 |         embedded_sequences = embedding_layer(sequence_input)
64 |         l_lstm = Bidirectional(LSTM(100, return_sequences=False))(embedded_sequences)
65 |         dense_1 = Dense(100,activation='tanh')(l_lstm)
66 |         dense_2 = Dense(2, activation='softmax')(dense_1)
67 | 
68 |         model = Model(sequence_input, dense_2)
69 | 
70 |         model.compile(loss='categorical_crossentropy',
71 |                       optimizer='rmsprop',
72 |                       metrics=['acc'])
73 | 
74 |         model.summary()
75 |         return model
76 | 
77 |     def MLP_model(self,word_index,embeddings_index):
78 |         print('Building the MLP model')
79 |         embedding_layer = self.emb_model_layer(word_index,embeddings_index)
80 |         sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
81 |         embedded_sequences = embedding_layer(sequence_input)
82 |         dense_1 = Dense(100,activation='tanh')(embedded_sequences)
83 |         max_pooling = GlobalMaxPooling1D()(dense_1)
84 |         dense_2 = Dense(2, activation='softmax')(max_pooling)
85 | 
86 |         model = Model(sequence_input, dense_2)
87 | 
88 |         model.compile(loss='categorical_crossentropy',
89 |                       optimizer='rmsprop',
90 |                       metrics=['acc'])
91 | 
92 |         model.summary()
93 |         return model
94 | 
95 | 
96 |     def ATENTION_LSTM_model(self,word_index,embeddings_index):
97 |         print('Building the ATENTION_LSTM model')
98 |         embedding_layer = self.emb_model_layer(word_index,embeddings_index)
99 |         sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
100 |         embedded_sequences = embedding_layer(sequence_input)
101 |         l_lstm = Bidirectional(LSTM(100, return_sequences=True))(embedded_sequences)
102 |         l_att = Attention_layer()(l_lstm)
103 |         dense_1 = Dense(100,activation='tanh')(l_att)
104 |         dense_2 = Dense(2, activation='softmax')(dense_1)
105 |         model = Model(sequence_input, dense_2)
106 |         model.compile(loss='categorical_crossentropy',
107 |                       optimizer='rmsprop',
108 |                       metrics=['acc'])
109 | 
110 |         model.summary()
111 |         return model
112 | 
113 |     def ATTENTION_GRU_model(self,word_index,embeddings_index):
114 |         print('Building the ATTENTION_GRU model')
115 |         embedding_layer = self.emb_model_layer(word_index,embeddings_index)
116 |         sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
117 |         embedded_sequences = embedding_layer(sequence_input)
118 |         l_gru = Bidirectional(GRU(100, return_sequences=True))(embedded_sequences)
119 |         l_att = Attention_layer()(l_gru)
120 |         dense_1 = Dense(100,activation='tanh')(l_att)
121 |         dense_2 = Dense(2, activation='softmax')(dense_1)
122 |         model = Model(sequence_input, dense_2)
123 |         model.compile(loss='categorical_crossentropy',
124 |                       optimizer='rmsprop',
125 |                       metrics=['acc'])
126 | 
127 |         model.summary()
128 |         return model
--------------------------------------------------------------------------------
/data/mpk/neg.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guimaxing/sentiment-analysis-MLP-BiRNN-BiGRU-Attention-Model/d448434e8c46a49627822ee12b5970ba1a17421e/data/mpk/neg.xls
--------------------------------------------------------------------------------
/data/mpk/pos.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guimaxing/sentiment-analysis-MLP-BiRNN-BiGRU-Attention-Model/d448434e8c46a49627822ee12b5970ba1a17421e/data/mpk/pos.xls
--------------------------------------------------------------------------------