├── .gitignore
├── README.md
├── bayes_sentiment.py
├── cnn_sentiment.py
├── data
│   ├── test.txt
│   └── train.txt
├── decsiontree_sentiment.py
├── knn_sentiment.py
├── lstm_sentiment.py
├── mlp_sentiment.py
├── model
│   ├── sentiment_bayes_model.m
│   ├── sentiment_cnn_model.h5
│   ├── sentiment_decisiontree_model.m
│   ├── sentiment_knn_model.m
│   ├── sentiment_lstm_model.h5
│   ├── sentiment_mlp_model.h5
│   └── sentiment_svm_model.m
└── svm_sentiment.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
/embedding/*.bin
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# LearningBasedSentiment
**Sentiment Classifier** based on traditional machine learning methods (e.g. Bayes, SVM, DecisionTree, KNN) and deep learning methods (MLP, CNN, RNN/LSTM).

## Requirements
All code in this project is implemented in [Python 3.6+](https://www.python.org/downloads/).
All essential packages are listed in `requirements.txt`; you can install them with
`pip install -r requirements.txt -i https://pypi.douban.com/simple/`
[Anaconda](https://docs.anaconda.com/anaconda/) or [virtualenv + virtualenvwrapper](http://www.jianshu.com/p/44ab75fbaef2) is strongly recommended for managing your Python environments.

# Preprocessing
1. Corpus
   Movie reviews, training set: 20,000 samples (10,000 positive, 10,000 negative)
   Movie reviews, test set: 6,000 samples (3,000 positive, 3,000 negative)
2. Corpus processing
   Sentences are segmented into words with jieba
3. Input vectorization
   Inputs are vectorized with the pre-trained word_vector.bin file
   The traditional machine learning algorithms take an N-dimensional vector as input, obtained by summing and averaging the word vectors of a sentence
   The CNN/RNN deep learning algorithms take an N*M matrix as input, built by looking up the vector of each word in the sentence

# Training and comparison (accuracy)

| Algorithm | Accuracy |
| --- | --- |
| DecisionTree | 0.6907302434144715 |
| Bayes | 0.7437479159719906 |
| KNN (n=14) | 0.7909303101033678 |
| SVM | 0.8302767589196399 |
| MLP (20 epochs) | 0.8359 |
| CNN (20 epochs) | 0.8376 |
| LSTM (20 epochs) | 0.8505 |
--------------------------------------------------------------------------------
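To make the preprocessing steps above concrete, here is a minimal sketch of turning one raw review into the averaged sentence vector that the classical-ML scripts below consume. It mirrors their `rep_sentencevector` helper; the jieba segmentation call and the sample review string are illustrative, while the 200-dimension size and the `./embedding/word_vector.bin` path come from the scripts themselves.

```python
import gensim
import jieba
import numpy as np

# Same loading call the scripts use: word2vec text format, 200-dim vectors.
wv = gensim.models.KeyedVectors.load_word2vec_format(
    './embedding/word_vector.bin', binary=False)

def sentence_vector(raw_review):
    # Step 2: segment the raw review into words with jieba.
    words = list(jieba.cut(raw_review))
    # Step 3: sum the word vectors, skipping out-of-vocabulary words,
    # then average over the token count (as rep_sentencevector does).
    vec = np.zeros(200)
    for word in words:
        if word in wv:
            vec += wv[word]
    return vec / max(len(words), 1)

print(sentence_vector('这件衣服真的太好看了').shape)  # (200,)
```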
/bayes_sentiment.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# coding: utf-8
# File: bayes_sentiment.py
# Author: lhy
# Date: 18-3-20
'''
function: sentiment classification based on word vectors + traditional machine learning
'''
import gensim
import numpy as np
from sklearn.externals import joblib

VECTOR_DIR = './embedding/word_vector.bin'  # word-vector model file
model = gensim.models.KeyedVectors.load_word2vec_format(VECTOR_DIR, binary=False)

'''Look up word vectors to build the sentence representation'''
def rep_sentencevector(sentence):
    '''Represent the sentence vector by summing and averaging its word vectors'''
    word_list = [word for word in sentence.split(' ')]
    embedding_dim = 200
    embedding_matrix = np.zeros(embedding_dim)
    for index, word in enumerate(word_list):
        try:
            embedding_matrix += model[word]
        except KeyError:  # out-of-vocabulary word: skip it
            pass

    return embedding_matrix / len(word_list)

'''Build the training and test data'''
def build_traindata():
    X_train = list()
    Y_train = list()
    X_test = list()
    Y_test = list()
    for line in open('./data/train.txt'):
        line = line.strip().split('\t')
        sent_vector = rep_sentencevector(line[-1])
        X_train.append(sent_vector)
        if line[0] == '1':
            Y_train.append(1)
        else:
            Y_train.append(0)

    for line in open('./data/test.txt'):
        line = line.strip().split('\t')
        sent_vector = rep_sentencevector(line[-1])
        X_test.append(sent_vector)
        if line[0] == '1':
            Y_test.append(1)
        else:
            Y_test.append(0)

    return np.array(X_train), np.array(Y_train), np.array(X_test), np.array(Y_test)

'''Train a Bayes (GaussianNB) classifier'''
def train_bayes(X_train, Y_train):
    from sklearn.naive_bayes import GaussianNB
    model = GaussianNB()
    model.fit(X_train, Y_train)
    joblib.dump(model, './model/sentiment_bayes_model.m')

'''Evaluate the Bayes classifier'''
def evaluate_bayes(model_filepath, X_test, Y_test):
    model = joblib.load(model_filepath)
    Y_predict = list()
    Y_test = list(Y_test)
    right = 0
    for sent in X_test:
        Y_predict.append(model.predict(sent.reshape(1, -1))[0])
    for index in range(len(Y_predict)):
        if int(Y_predict[index]) == int(Y_test[index]):
            right += 1
    score = right / len(Y_predict)
    print('model accuracy is: {0}'.format(score))  # 0.7437479159719906
    return score

'''Test on real-world examples'''
def predict_bayes(model_filepath):
    model = joblib.load(model_filepath)
    sentence1 = '这个 电视 真 尼玛 垃圾 , 老子 再也 不买 了'
    sentence2 = '这件 衣服 真的 太 好看 了 ! 好想 买 啊 '
    rep_sen1 = np.array(rep_sentencevector(sentence1)).reshape(1, -1)
    rep_sen2 = np.array(rep_sentencevector(sentence2)).reshape(1, -1)
    print('sentence1', model.predict(rep_sen1))
    print('sentence2', model.predict(rep_sen2))
    '''
    sentence1 [1]
    sentence2 [1] --> misclassified
    '''


if __name__ == '__main__':
    X_train, Y_train, X_test, Y_test = build_traindata()
    model_filepath = './model/sentiment_bayes_model.m'
    #print(X_train.shape, Y_train.shape)
    #print(X_test.shape, Y_test.shape)
    #train_bayes(X_train, Y_train)
    evaluate_bayes(model_filepath, X_test, Y_test)
    #predict_bayes(model_filepath)
--------------------------------------------------------------------------------
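The manual scoring loop in `evaluate_bayes` (repeated in the decision-tree, KNN, and SVM scripts) can be collapsed into a couple of lines with scikit-learn's `accuracy_score`; a minimal equivalent sketch, assuming `X_test` and `Y_test` come from `build_traindata()` above:

```python
from sklearn.externals import joblib
from sklearn.metrics import accuracy_score

# X_test, Y_test as returned by build_traindata()
model = joblib.load('./model/sentiment_bayes_model.m')
# predict() accepts the whole (n_samples, 200) matrix at once,
# so no per-sentence reshape loop is needed.
Y_predict = model.predict(X_test)
print('model accuracy is:', accuracy_score(Y_test, Y_predict))
```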
/cnn_sentiment.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# coding: utf-8
# File: cnn_sentiment.py
# Author: lhy
# Date: 18-3-19

import gensim
import numpy as np
from keras.models import load_model

VECTOR_DIR = './embedding/word_vector.bin'  # word-vector model file
model = gensim.models.KeyedVectors.load_word2vec_format(VECTOR_DIR, binary=False)

'''Look up word vectors to build the sentence representation'''
def rep_sentencevector(sentence):
    word_list = [word for word in sentence.split(' ')]
    max_words = 100
    embedding_dim = 200
    embedding_matrix = np.zeros((max_words, embedding_dim))
    for index, word in enumerate(word_list):
        try:
            embedding_matrix[index] = model[word]
        except (KeyError, IndexError):  # OOV word, or sentence longer than max_words
            pass

    return embedding_matrix


'''Build the training and test data'''
def build_traindata():
    X_train = list()
    Y_train = list()
    X_test = list()
    Y_test = list()

    for line in open('./data/train.txt'):
        line = line.strip().split('\t')
        sent_vector = rep_sentencevector(line[-1])
        X_train.append(sent_vector)
        if line[0] == '1':
            Y_train.append([0, 1])
        else:
            Y_train.append([1, 0])

    for line in open('./data/test.txt'):
        line = line.strip().split('\t')
        sent_vector = rep_sentencevector(line[-1])
        X_test.append(sent_vector)
        if line[0] == '1':
            Y_test.append([0, 1])
        else:
            Y_test.append([1, 0])

    return np.array(X_train), np.array(Y_train), np.array(X_test), np.array(Y_test)


'''Train a four-layer CNN for 20 epochs'''
def train_cnn(X_train, Y_train, X_test, Y_test):
    from keras.models import Sequential
    from keras.layers import Dense, Dropout
    from keras.layers import Conv1D, GlobalAveragePooling1D, MaxPooling1D
    # Build a Sequential model
    model = Sequential()
    # input_shape = (100, 200): 100 words per sentence, 200-dim word vectors
    model.add(Conv1D(64, 3, activation='relu', input_shape=(100, 200)))
    model.add(Conv1D(64, 3, activation='relu'))
    model.add(MaxPooling1D(3))
    model.add(Conv1D(128, 3, activation='relu'))
    model.add(Conv1D(128, 3, activation='relu'))
    model.add(GlobalAveragePooling1D())
    model.add(Dropout(0.5))
    model.add(Dense(2, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])

    model.fit(X_train, Y_train, batch_size=100, epochs=20, validation_data=(X_test, Y_test))
    model.save('./model/sentiment_cnn_model.h5')

    '''
    epoch  1 [==============================] - 13s 664us/step - loss: 0.4868 - acc: 0.7645 - val_loss: 0.3897 - val_acc: 0.8234
    epoch  5 [==============================] - 13s 633us/step - loss: 0.2923 - acc: 0.8794 - val_loss: 0.3376 - val_acc: 0.8527
    epoch 10 [==============================] - 12s 601us/step - loss: 0.1337 - acc: 0.9482 - val_loss: 0.5124 - val_acc: 0.8284
    epoch 15 [==============================] - 13s 631us/step - loss: 0.0729 - acc: 0.9789 - val_loss: 0.8681 - val_acc: 0.8325
    epoch 20 [==============================] - 13s 632us/step - loss: 0.0484 - acc: 0.9873 - val_loss: 1.0889 - val_acc: 0.8376
    '''

'''Real-world test'''
def predict_cnn(model_filepath):
    model = load_model(model_filepath)
    sentence = '这个 电视 真 尼玛 垃圾 , 老子 再也 不买 了'  # [[2.3127215e-04 0.9977249]]
    #sentence = '这件 衣服 真的 太 好看 了 ! 好想 买 啊 '  # [[0.9936581 0.00627225]]
    sentence_vector = np.array([rep_sentencevector(sentence)])
    print(sentence_vector)
    print('test after load: ', model.predict(sentence_vector))


if __name__ == '__main__':
    # X_train, Y_train, X_test, Y_test = build_traindata()
    model_filepath = './model/sentiment_cnn_model.h5'
    # print(X_train.shape, Y_train.shape)
    # print(X_test.shape, Y_test.shape)
    # train_cnn(X_train, Y_train, X_test, Y_test)
    predict_cnn(model_filepath)
--------------------------------------------------------------------------------
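`predict_cnn` prints the raw two-way scores; per `build_traindata`, position 0 corresponds to label '0' (one-hot [1, 0]) and position 1 to label '1' ([0, 1]). A small sketch of turning those scores into a class index with `np.argmax`, assuming the saved model file and the `rep_sentencevector` helper defined in cnn_sentiment.py:

```python
import numpy as np
from keras.models import load_model

# rep_sentencevector as defined in cnn_sentiment.py above
model = load_model('./model/sentiment_cnn_model.h5')
sentence = '这个 电视 真 尼玛 垃圾 , 老子 再也 不买 了'
# Shape (1, 100, 200): one sentence, 100 words, 200-dim word vectors.
scores = model.predict(np.array([rep_sentencevector(sentence)]))
# argmax picks the larger of the two one-hot positions; the scores
# commented in predict_cnn give class index 1 for this sentence.
label = int(np.argmax(scores, axis=1)[0])
print(label)
```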
/decsiontree_sentiment.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# coding: utf-8
# File: decsiontree_sentiment.py
# Author: lhy
# Date: 18-3-20

import gensim
import numpy as np
from sklearn.externals import joblib

VECTOR_DIR = './embedding/word_vector.bin'  # word-vector model file
model = gensim.models.KeyedVectors.load_word2vec_format(VECTOR_DIR, binary=False)

'''Look up word vectors to build the sentence representation, summed and averaged'''
def rep_sentencevector(sentence):
    '''Represent the sentence vector by summing and averaging its word vectors'''
    word_list = [word for word in sentence.split(' ')]
    embedding_dim = 200
    embedding_matrix = np.zeros(embedding_dim)
    for index, word in enumerate(word_list):
        try:
            embedding_matrix += model[word]
        except KeyError:  # out-of-vocabulary word: skip it
            pass

    return embedding_matrix / len(word_list)

'''Build the training and test data'''
def build_traindata():
    X_train = list()
    Y_train = list()
    X_test = list()
    Y_test = list()
    for line in open('./data/train.txt'):
        line = line.strip().split('\t')
        sent_vector = rep_sentencevector(line[-1])
        X_train.append(sent_vector)
        if line[0] == '1':
            Y_train.append(1)
        else:
            Y_train.append(0)

    for line in open('./data/test.txt'):
        line = line.strip().split('\t')
        sent_vector = rep_sentencevector(line[-1])
        X_test.append(sent_vector)
        if line[0] == '1':
            Y_test.append(1)
        else:
            Y_test.append(0)

    return np.array(X_train), np.array(Y_train), np.array(X_test), np.array(Y_test)

'''Train a DecisionTree classifier with default parameters'''
def train_decisiontree(X_train, Y_train):
    from sklearn import tree
    model = tree.DecisionTreeClassifier()
    model.fit(X_train, Y_train)
    joblib.dump(model, './model/sentiment_decisiontree_model.m')

'''Evaluate the DecisionTree classifier'''
def evaluate_decisiontree(model_filepath, X_test, Y_test):
    model = joblib.load(model_filepath)
    Y_predict = list()
    Y_test = list(Y_test)
    right = 0
    for sent in X_test:
        Y_predict.append(model.predict(sent.reshape(1, -1))[0])
    for index in range(len(Y_predict)):
        if int(Y_predict[index]) == int(Y_test[index]):
            right += 1
    score = right / len(Y_predict)
    print('model accuracy is: {0}'.format(score))  # 0.6907302434144715
    return score

'''Test on real-world examples'''
def predict_decisiontree(model_filepath):
    model = joblib.load(model_filepath)
    sentence1 = '这个 电视 真 尼玛 垃圾 , 老子 再也 不买 了'
    sentence2 = '这件 衣服 真的 太 好看 了 ! 好想 买 啊 '
    rep_sen1 = np.array(rep_sentencevector(sentence1)).reshape(1, -1)
    rep_sen2 = np.array(rep_sentencevector(sentence2)).reshape(1, -1)
    print('sentence1', model.predict(rep_sen1))  # sentence1 [0]
    print('sentence2', model.predict(rep_sen2))  # sentence2 [0]

if __name__ == '__main__':
    X_train, Y_train, X_test, Y_test = build_traindata()
    model_filepath = './model/sentiment_decisiontree_model.m'
    print(X_train.shape, Y_train.shape)
    print(X_test.shape, Y_test.shape)
    train_decisiontree(X_train, Y_train)
    evaluate_decisiontree(model_filepath, X_test, Y_test)
    predict_decisiontree(model_filepath)
--------------------------------------------------------------------------------
/knn_sentiment.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# coding: utf-8
# File: knn_sentiment.py
# Author: lhy
# Date: 18-3-20

import gensim
import numpy as np
from sklearn.externals import joblib

VECTOR_DIR = './embedding/word_vector.bin'  # word-vector model file
model = gensim.models.KeyedVectors.load_word2vec_format(VECTOR_DIR, binary=False)

'''Look up word vectors to build the sentence representation'''
def rep_sentencevector(sentence):
    '''Represent the sentence vector by summing and averaging its word vectors'''
    word_list = [word for word in sentence.split(' ')]
    embedding_dim = 200
    embedding_matrix = np.zeros(embedding_dim)
    for index, word in enumerate(word_list):
        try:
            embedding_matrix += model[word]
        except KeyError:  # out-of-vocabulary word: skip it
            pass

    return embedding_matrix / len(word_list)

'''Build the training and test data'''
def build_traindata():
    X_train = list()
    Y_train = list()
    X_test = list()
    Y_test = list()
    for line in open('./data/train.txt'):
        line = line.strip().split('\t')
        sent_vector = rep_sentencevector(line[-1])
        X_train.append(sent_vector)
        if line[0] == '1':
            Y_train.append(1)
        else:
            Y_train.append(0)

    for line in open('./data/test.txt'):
        line = line.strip().split('\t')
        sent_vector = rep_sentencevector(line[-1])
        X_test.append(sent_vector)
        if line[0] == '1':
            Y_test.append(1)
        else:
            Y_test.append(0)

    return np.array(X_train), np.array(Y_train), np.array(X_test), np.array(Y_test)

'''Train a KNN classifier'''
def train_knn(X_train, Y_train, X_test, Y_test):
    from sklearn.neighbors import KNeighborsClassifier
    '''
    for x in range(1, 15):
        model = KNeighborsClassifier(n_neighbors=x)
        model.fit(X_train, Y_train)
        preds = model.predict(X_test)
        num = 0
        preds = preds.tolist()
        for i, pred in enumerate(preds):
            if int(pred) == int(Y_test[i]):
                num += 1
        print('K= ' + str(x) + ', precision_score:' + str(float(num) / len(preds)))

    *****************result****************
    K= 1, precision_score:0.7169056352117372
    K= 2, precision_score:0.7189063021007003
    K= 3, precision_score:0.7600866955651884
    K= 4, precision_score:0.7519173057685895
    K= 5, precision_score:0.764754918306102
    K= 6, precision_score:0.7709236412137379
    K= 7, precision_score:0.7724241413804601
    K= 8, precision_score:0.7784261420473492
    K= 9, precision_score:0.7804268089363121
    K= 10, precision_score:0.7814271423807936
    K= 11, precision_score:0.7829276425475158
    K= 12, precision_score:0.7869289763254418
    K= 13, precision_score:0.7829276425475158
    K= 14, precision_score:0.7909303101033678
    '''
    # Train KNN with K=14, the best value found in the sweep above
    model = KNeighborsClassifier(n_neighbors=14)
    model.fit(X_train, Y_train)
    joblib.dump(model, './model/sentiment_knn_model.m')

'''Evaluate the KNN classifier'''
def evaluate_knn(model_filepath, X_test, Y_test):
    model = joblib.load(model_filepath)
    Y_predict = list()
    Y_test = list(Y_test)
    right = 0
    for sent in X_test:
        Y_predict.append(model.predict(sent.reshape(1, -1))[0])
    for index in range(len(Y_predict)):
        if int(Y_predict[index]) == int(Y_test[index]):
            right += 1
    score = right / len(Y_predict)
    print('model accuracy is: {0}'.format(score))  # 0.7909303101033678
    return score

'''Test on real-world examples'''
def predict_knn(model_filepath):
    model = joblib.load(model_filepath)
    sentence1 = '这个 电视 真 尼玛 垃圾 , 老子 再也 不买 了'
    sentence2 = '这件 衣服 真的 太 好看 了 ! 好想 买 啊 '
    rep_sen1 = np.array(rep_sentencevector(sentence1)).reshape(1, -1)
    rep_sen2 = np.array(rep_sentencevector(sentence2)).reshape(1, -1)
    print('sentence1', model.predict(rep_sen1))  # [1]
    print('sentence2', model.predict(rep_sen2))  # [0]

if __name__ == '__main__':
    X_train, Y_train, X_test, Y_test = build_traindata()
    model_filepath = './model/sentiment_knn_model.m'
    print(X_train.shape, Y_train.shape)
    print(X_test.shape, Y_test.shape)
    train_knn(X_train, Y_train, X_test, Y_test)
    evaluate_knn(model_filepath, X_test, Y_test)
    predict_knn(model_filepath)
--------------------------------------------------------------------------------
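The commented-out loop in `train_knn` sweeps K by hand; scikit-learn's `GridSearchCV` expresses the same search more compactly. A sketch under the assumption that `X_train`/`Y_train` come from `build_traindata()`; note it scores by cross-validation on the training set rather than on test.txt, so its numbers will not match the sweep results above exactly:

```python
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# X_train, Y_train as returned by build_traindata()
search = GridSearchCV(KNeighborsClassifier(),
                      param_grid={'n_neighbors': list(range(1, 15))},
                      cv=3)
search.fit(X_train, Y_train)
print(search.best_params_, search.best_score_)
```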
/lstm_sentiment.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# coding: utf-8
# File: lstm_sentiment.py
# Author: lhy
# Date: 18-3-19
import gensim
import numpy as np
from keras.models import load_model

VECTOR_DIR = './embedding/word_vector.bin'  # word-vector model file
model = gensim.models.KeyedVectors.load_word2vec_format(VECTOR_DIR, binary=False)

'''Look up word vectors to build the sentence representation'''
def rep_sentencevector(sentence):
    word_list = [word for word in sentence.split(' ')]
    max_words = 100
    embedding_dim = 200
    embedding_matrix = np.zeros((max_words, embedding_dim))
    for index, word in enumerate(word_list):
        try:
            embedding_matrix[index] = model[word]
        except (KeyError, IndexError):  # OOV word, or sentence longer than max_words
            pass

    return embedding_matrix

'''Build the training and test data'''
def build_traindata():
    X_train = list()
    Y_train = list()
    X_test = list()
    Y_test = list()

    for line in open('./data/train.txt'):
        line = line.strip().split('\t')
        sent_vector = rep_sentencevector(line[-1])
        X_train.append(sent_vector)
        if line[0] == '1':
            Y_train.append([0, 1])
        else:
            Y_train.append([1, 0])

    for line in open('./data/test.txt'):
        line = line.strip().split('\t')
        sent_vector = rep_sentencevector(line[-1])
        X_test.append(sent_vector)
        if line[0] == '1':
            Y_test.append([0, 1])
        else:
            Y_test.append([1, 0])

    return np.array(X_train), np.array(Y_train), np.array(X_test), np.array(Y_test)

'''Train a three-layer LSTM for 20 epochs'''
def train_lstm(X_train, Y_train, X_test, Y_test):
    from keras.models import Sequential
    from keras.layers import LSTM, Dense
    data_dim = 200   # word-vector dimension
    timesteps = 100  # sequence length
    # expected input data shape: (batch_size, timesteps, data_dim)
    model = Sequential()
    model.add(LSTM(32, return_sequences=True,
                   input_shape=(timesteps, data_dim)))  # returns a sequence of vectors of dimension 32
    model.add(LSTM(32, return_sequences=True))  # returns a sequence of vectors of dimension 32
    model.add(LSTM(32))  # returns a single vector of dimension 32
    model.add(Dense(2, activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])

    model.fit(X_train, Y_train, batch_size=100, epochs=20, validation_data=(X_test, Y_test))
    model.save('./model/sentiment_lstm_model.h5')
    '''
    epoch  1 [==============================] - 41s 2ms/step - loss: 0.5384 - acc: 0.7142 - val_loss: 0.4223 - val_acc: 0.8281
    epoch  5 [==============================] - 38s 2ms/step - loss: 0.2885 - acc: 0.8904 - val_loss: 0.3618 - val_acc: 0.8531
    epoch 10 [==============================] - 40s 2ms/step - loss: 0.1965 - acc: 0.9357 - val_loss: 0.3815 - val_acc: 0.8515
    epoch 15 [==============================] - 39s 2ms/step - loss: 0.1420 - acc: 0.9577 - val_loss: 0.5172 - val_acc: 0.8501
    epoch 20 [==============================] - 37s 2ms/step - loss: 0.1055 - acc: 0.9729 - val_loss: 0.5309 - val_acc: 0.8505
    '''

'''Real-world test'''
def predict_lstm(model_filepath):
    model = load_model(model_filepath)
    sentence = '这个 电视 真 尼玛 垃圾 , 老子 再也 不买 了'  # [[0.01477097 0.98522896]]
    #sentence = '这件 衣服 真的 太 好看 了 ! 好想 买 啊 '  # [[0.9843225 0.01567744]]
    sentence_vector = np.array([rep_sentencevector(sentence)])
    print(sentence_vector)
    print('test after load: ', model.predict(sentence_vector))


if __name__ == '__main__':
    # X_train, Y_train, X_test, Y_test = build_traindata()
    model_filepath = './model/sentiment_lstm_model.h5'
    # print(X_train.shape, Y_train.shape)
    # print(X_test.shape, Y_test.shape)
    # train_lstm(X_train, Y_train, X_test, Y_test)
    predict_lstm(model_filepath)
--------------------------------------------------------------------------------
/mlp_sentiment.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# coding: utf-8
# File: mlp_sentiment.py
# Author: lhy
# Date: 18-3-20

import gensim
import numpy as np
from keras.models import load_model

VECTOR_DIR = './embedding/word_vector.bin'  # word-vector model file
model = gensim.models.KeyedVectors.load_word2vec_format(VECTOR_DIR, binary=False)

'''Look up word vectors to build the sentence representation'''
def rep_sentencevector(sentence):
    '''Represent the sentence vector by summing and averaging its word vectors'''
    word_list = [word for word in sentence.split(' ')]
    embedding_dim = 200
    embedding_matrix = np.zeros(embedding_dim)
    for index, word in enumerate(word_list):
        try:
            embedding_matrix += model[word]
        except KeyError:  # out-of-vocabulary word: skip it
            pass

    return embedding_matrix / len(word_list)

'''Build the training and test data'''
def build_traindata():
    X_train = list()
    Y_train = list()
    X_test = list()
    Y_test = list()
    for line in open('./data/train.txt'):
        line = line.strip().split('\t')
        sent_vector = rep_sentencevector(line[-1])
        X_train.append(sent_vector)
        if line[0] == '1':
            Y_train.append([0, 1])
        else:
            Y_train.append([1, 0])

    for line in open('./data/test.txt'):
        line = line.strip().split('\t')
        sent_vector = rep_sentencevector(line[-1])
        X_test.append(sent_vector)
        if line[0] == '1':
            Y_test.append([0, 1])
        else:
            Y_test.append([1, 0])

    return np.array(X_train), np.array(Y_train), np.array(X_test), np.array(Y_test)

'''Train a three-layer MLP for 20 epochs'''
def train_mlp(X_train, Y_train, X_test, Y_test):
    from keras.models import Sequential
    from keras.layers import Dense, Dropout
    model = Sequential()
    model.add(Dense(64, input_dim=200, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(2, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    model.fit(X_train, Y_train, batch_size=100, epochs=20, validation_data=(X_test, Y_test))
    model.save('./model/sentiment_mlp_model.h5')
    '''
    epoch  1 [==============================] - 1s 25us/step - loss: 1.7893 - acc: 0.6168 - val_loss: 0.5576 - val_acc: 0.7076
    epoch  5 [==============================] - 0s 19us/step - loss: 0.4499 - acc: 0.7987 - val_loss: 0.4056 - val_acc: 0.8204
    epoch 10 [==============================] - 0s 17us/step - loss: 0.4043 - acc: 0.8274 - val_loss: 0.4016 - val_acc: 0.8341
    epoch 15 [==============================] - 0s 17us/step - loss: 0.3815 - acc: 0.8397 - val_loss: 0.3821 - val_acc: 0.8345
    epoch 20 [==============================] - 0s 17us/step - loss: 0.3746 - acc: 0.8432 - val_loss: 0.3842 - val_acc: 0.8359
    '''

'''Real-world test'''
def predict_mlp(model_filepath):
    model = load_model(model_filepath)
    sentence1 = '这个 电视 真 尼玛 垃圾 , 老子 再也 不买 了'  # [[0.0942708 0.9058427]]
    sentence2 = '这件 衣服 真的 太 好看 了 ! 好想 买 啊 '  # [[0.6489922 0.34993422]]
    sentence_vector1 = np.array([rep_sentencevector(sentence1)])
    sentence_vector2 = np.array([rep_sentencevector(sentence2)])
    print('test after load: ', model.predict(sentence_vector1))
    print('test after load: ', model.predict(sentence_vector2))


if __name__ == '__main__':
    #X_train, Y_train, X_test, Y_test = build_traindata()
    model_filepath = './model/sentiment_mlp_model.h5'
    #print(X_train.shape, Y_train.shape)
    #print(X_test.shape, Y_test.shape)
    #train_mlp(X_train, Y_train, X_test, Y_test)
    predict_mlp(model_filepath)
--------------------------------------------------------------------------------
/model/sentiment_bayes_model.m:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/liuhuanyong/SentenceSentimentClassifier/856a89e6f0f0fa5e459e202875e8bb8fe02ed388/model/sentiment_bayes_model.m
--------------------------------------------------------------------------------
/model/sentiment_cnn_model.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/liuhuanyong/SentenceSentimentClassifier/856a89e6f0f0fa5e459e202875e8bb8fe02ed388/model/sentiment_cnn_model.h5
--------------------------------------------------------------------------------
/model/sentiment_decisiontree_model.m:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/liuhuanyong/SentenceSentimentClassifier/856a89e6f0f0fa5e459e202875e8bb8fe02ed388/model/sentiment_decisiontree_model.m
--------------------------------------------------------------------------------
/model/sentiment_knn_model.m:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/liuhuanyong/SentenceSentimentClassifier/856a89e6f0f0fa5e459e202875e8bb8fe02ed388/model/sentiment_knn_model.m
--------------------------------------------------------------------------------
/model/sentiment_lstm_model.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/liuhuanyong/SentenceSentimentClassifier/856a89e6f0f0fa5e459e202875e8bb8fe02ed388/model/sentiment_lstm_model.h5
--------------------------------------------------------------------------------
/model/sentiment_mlp_model.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/liuhuanyong/SentenceSentimentClassifier/856a89e6f0f0fa5e459e202875e8bb8fe02ed388/model/sentiment_mlp_model.h5
--------------------------------------------------------------------------------
/model/sentiment_svm_model.m:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/liuhuanyong/SentenceSentimentClassifier/856a89e6f0f0fa5e459e202875e8bb8fe02ed388/model/sentiment_svm_model.m
--------------------------------------------------------------------------------
/svm_sentiment.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# coding: utf-8
# File: svm_sentiment.py
# Author: lhy
# Date: 18-3-20

import gensim
import numpy as np
from sklearn.externals import joblib

VECTOR_DIR = './embedding/word_vector.bin'  # word-vector model file
model = gensim.models.KeyedVectors.load_word2vec_format(VECTOR_DIR, binary=False)

'''Look up word vectors to build the sentence representation, summed and averaged'''
def rep_sentencevector(sentence):
    '''Represent the sentence vector by summing and averaging its word vectors'''
    word_list = [word for word in sentence.split(' ')]
    embedding_dim = 200
    embedding_matrix = np.zeros(embedding_dim)
    for index, word in enumerate(word_list):
        try:
            embedding_matrix += model[word]
        except KeyError:  # out-of-vocabulary word: skip it
            pass

    return embedding_matrix / len(word_list)

'''Build the training and test data'''
def build_traindata():
    X_train = list()
    Y_train = list()
    X_test = list()
    Y_test = list()
    for line in open('./data/train.txt'):
        line = line.strip().split('\t')
        sent_vector = rep_sentencevector(line[-1])
        X_train.append(sent_vector)
        if line[0] == '1':
            Y_train.append(1)
        else:
            Y_train.append(0)

    for line in open('./data/test.txt'):
        line = line.strip().split('\t')
        sent_vector = rep_sentencevector(line[-1])
        X_test.append(sent_vector)
        if line[0] == '1':
            Y_test.append(1)
        else:
            Y_test.append(0)

    return np.array(X_train), np.array(Y_train), np.array(X_test), np.array(Y_test)

'''Train an SVM classifier, using SVC with a linear kernel'''
def train_svm(X_train, Y_train):
    from sklearn.svm import SVC
    model = SVC(kernel='linear')
    model.fit(X_train, Y_train)
    joblib.dump(model, './model/sentiment_svm_model.m')

'''Evaluate the SVM classifier'''
def evaluate_svm(model_filepath, X_test, Y_test):
    model = joblib.load(model_filepath)
    Y_predict = list()
    Y_test = list(Y_test)
    right = 0
    for sent in X_test:
        Y_predict.append(model.predict(sent.reshape(1, -1))[0])
    for index in range(len(Y_predict)):
        if int(Y_predict[index]) == int(Y_test[index]):
            right += 1
    score = right / len(Y_predict)
    print('model accuracy is: {0}'.format(score))  # 0.8302767589196399
    return score

'''Test on real-world examples'''
def predict_svm(model_filepath):
    model = joblib.load(model_filepath)
    sentence1 = '这个 电视 真 尼玛 垃圾 , 老子 再也 不买 了'
    sentence2 = '这件 衣服 真的 太 好看 了 ! 好想 买 啊 '
    rep_sen1 = np.array(rep_sentencevector(sentence1)).reshape(1, -1)
    rep_sen2 = np.array(rep_sentencevector(sentence2)).reshape(1, -1)
    print('sentence1', model.predict(rep_sen1))  # sentence1 [1]
    print('sentence2', model.predict(rep_sen2))  # sentence2 [0]

if __name__ == '__main__':
    X_train, Y_train, X_test, Y_test = build_traindata()
    model_filepath = './model/sentiment_svm_model.m'
    print(X_train.shape, Y_train.shape)
    print(X_test.shape, Y_test.shape)
    train_svm(X_train, Y_train)
    evaluate_svm(model_filepath, X_test, Y_test)
    predict_svm(model_filepath)
--------------------------------------------------------------------------------