├── .gitignore
├── README.md
├── bayes_sentiment.py
├── cnn_sentiment.py
├── data
│   ├── test.txt
│   └── train.txt
├── decsiontree_sentiment.py
├── knn_sentiment.py
├── lstm_sentiment.py
├── mlp_sentiment.py
├── model
│   ├── sentiment_bayes_model.m
│   ├── sentiment_cnn_model.h5
│   ├── sentiment_decisiontree_model.m
│   ├── sentiment_knn_model.m
│   ├── sentiment_lstm_model.h5
│   ├── sentiment_mlp_model.h5
│   └── sentiment_svm_model.m
└── svm_sentiment.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
/embedding/*.bin
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# LearningBasedSentiment
**Sentiment Classifier** based on traditional machine learning methods (e.g. Bayes, SVM, DecisionTree, KNN) and deep learning methods (MLP, CNN, RNN/LSTM).

## Requirements
All code in this project is implemented in [Python 3.6+](https://www.python.org/downloads/).
All essential packages are listed in `requirements.txt`; you can install them with
`pip install -r requirements.txt -i https://pypi.douban.com/simple/`
[Anaconda](https://docs.anaconda.com/anaconda/) or [virtualenv + virtualenvwrapper](http://www.jianshu.com/p/44ab75fbaef2) is strongly recommended for managing your Python environments.

# Preprocessing
1. Corpus
   Movie reviews, training set: 20,000 samples (10,000 positive, 10,000 negative)
   Movie reviews, test set: 6,000 samples (3,000 positive, 3,000 negative)
2. Corpus processing
   Sentences are segmented into words with jieba
3. Input vectorization
   Inputs are vectorized with the pre-trained word_vector.bin file
   The traditional machine learning algorithms take an N-dimensional vector as input, obtained by summing and averaging the word vectors of a sentence
   The CNN/RNN deep learning algorithms take an N*M matrix as input, built by looking up the vector of each word in the sentence

# Training and comparison (accuracy)

| Algorithm | Accuracy |
| --- | --- |
| DecisionTree | 0.6907302434144715 |
| Bayes | 0.7437479159719906 |
| KNN (n=14) | 0.7909303101033678 |
| SVM | 0.8302767589196399 |
| MLP (20 epochs) | 0.8359 |
| CNN (20 epochs) | 0.8376 |
| LSTM (20 epochs) | 0.8505 |
--------------------------------------------------------------------------------
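To make the preprocessing steps above concrete, here is a minimal sketch of turning one raw review into the averaged sentence vector that the classical-ML scripts below consume. It mirrors their `rep_sentencevector` helper; the jieba segmentation call and the sample review string are illustrative, while the 200-dimension size and the `./embedding/word_vector.bin` path come from the scripts themselves.

```python
import gensim
import jieba
import numpy as np

# Same loading call the scripts use: word2vec text format, 200-dim vectors.
wv = gensim.models.KeyedVectors.load_word2vec_format(
    './embedding/word_vector.bin', binary=False)

def sentence_vector(raw_review):
    # Step 2: segment the raw review into words with jieba.
    words = list(jieba.cut(raw_review))
    # Step 3: sum the word vectors, skipping out-of-vocabulary words,
    # then average over the token count (as rep_sentencevector does).
    vec = np.zeros(200)
    for word in words:
        if word in wv:
            vec += wv[word]
    return vec / max(len(words), 1)

print(sentence_vector('这件衣服真的太好看了').shape)  # (200,)
```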
/bayes_sentiment.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# coding: utf-8
# File: bayes_sentiment.py
# Author: lhy
# Date: 18-3-20
'''
function: sentiment classification based on word vectors + traditional machine learning
'''
import gensim
import numpy as np
from sklearn.externals import joblib

VECTOR_DIR = './embedding/word_vector.bin'  # word-vector model file
model = gensim.models.KeyedVectors.load_word2vec_format(VECTOR_DIR, binary=False)

'''Look up word vectors to build the sentence representation'''
def rep_sentencevector(sentence):
    '''Represent the sentence vector by summing and averaging its word vectors'''
    word_list = [word for word in sentence.split(' ')]
    embedding_dim = 200
    embedding_matrix = np.zeros(embedding_dim)
    for index, word in enumerate(word_list):
        try:
            embedding_matrix += model[word]
        except KeyError:  # out-of-vocabulary word: skip it
            pass

    return embedding_matrix / len(word_list)

'''Build the training and test data'''
def build_traindata():
    X_train = list()
    Y_train = list()
    X_test = list()
    Y_test = list()
    for line in open('./data/train.txt'):
        line = line.strip().split('\t')
        sent_vector = rep_sentencevector(line[-1])
        X_train.append(sent_vector)
        if line[0] == '1':
            Y_train.append(1)
        else:
            Y_train.append(0)

    for line in open('./data/test.txt'):
        line = line.strip().split('\t')
        sent_vector = rep_sentencevector(line[-1])
        X_test.append(sent_vector)
        if line[0] == '1':
            Y_test.append(1)
        else:
            Y_test.append(0)

    return np.array(X_train), np.array(Y_train), np.array(X_test), np.array(Y_test)

'''Train a Bayes (GaussianNB) classifier'''
def train_bayes(X_train, Y_train):
    from sklearn.naive_bayes import GaussianNB
    model = GaussianNB()
    model.fit(X_train, Y_train)
    joblib.dump(model, './model/sentiment_bayes_model.m')

'''Evaluate the Bayes classifier'''
def evaluate_bayes(model_filepath, X_test, Y_test):
    model = joblib.load(model_filepath)
    Y_predict = list()
    Y_test = list(Y_test)
    right = 0
    for sent in X_test:
        Y_predict.append(model.predict(sent.reshape(1, -1))[0])
    for index in range(len(Y_predict)):
        if int(Y_predict[index]) == int(Y_test[index]):
            right += 1
    score = right / len(Y_predict)
    print('model accuracy is: {0}'.format(score))  # 0.7437479159719906
    return score

'''Test on real-world examples'''
def predict_bayes(model_filepath):
    model = joblib.load(model_filepath)
    sentence1 = '这个 电视 真 尼玛 垃圾 , 老子 再也 不买 了'
    sentence2 = '这件 衣服 真的 太 好看 了 ! 好想 买 啊 '
    rep_sen1 = np.array(rep_sentencevector(sentence1)).reshape(1, -1)
    rep_sen2 = np.array(rep_sentencevector(sentence2)).reshape(1, -1)
    print('sentence1', model.predict(rep_sen1))
    print('sentence2', model.predict(rep_sen2))
    '''
    sentence1 [1]
    sentence2 [1] --> misclassified
    '''


if __name__ == '__main__':
    X_train, Y_train, X_test, Y_test = build_traindata()
    model_filepath = './model/sentiment_bayes_model.m'
    #print(X_train.shape, Y_train.shape)
    #print(X_test.shape, Y_test.shape)
    #train_bayes(X_train, Y_train)
    evaluate_bayes(model_filepath, X_test, Y_test)
    #predict_bayes(model_filepath)
--------------------------------------------------------------------------------
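The manual scoring loop in `evaluate_bayes` (repeated in the decision-tree, KNN, and SVM scripts) can be collapsed into a couple of lines with scikit-learn's `accuracy_score`; a minimal equivalent sketch, assuming `X_test` and `Y_test` come from `build_traindata()` above:

```python
from sklearn.externals import joblib
from sklearn.metrics import accuracy_score

# X_test, Y_test as returned by build_traindata()
model = joblib.load('./model/sentiment_bayes_model.m')
# predict() accepts the whole (n_samples, 200) matrix at once,
# so no per-sentence reshape loop is needed.
Y_predict = model.predict(X_test)
print('model accuracy is:', accuracy_score(Y_test, Y_predict))
```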
/cnn_sentiment.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# coding: utf-8
# File: cnn_sentiment.py
# Author: lhy
# Date: 18-3-19

import gensim
import numpy as np
from keras.models import load_model

VECTOR_DIR = './embedding/word_vector.bin'  # word-vector model file
model = gensim.models.KeyedVectors.load_word2vec_format(VECTOR_DIR, binary=False)

'''Look up word vectors to build the sentence representation'''
def rep_sentencevector(sentence):
    word_list = [word for word in sentence.split(' ')]
    max_words = 100
    embedding_dim = 200
    embedding_matrix = np.zeros((max_words, embedding_dim))
    for index, word in enumerate(word_list):
        try:
            embedding_matrix[index] = model[word]
        except (KeyError, IndexError):  # OOV word, or sentence longer than max_words
            pass

    return embedding_matrix


'''Build the training and test data'''
def build_traindata():
    X_train = list()
    Y_train = list()
    X_test = list()
    Y_test = list()

    for line in open('./data/train.txt'):
        line = line.strip().split('\t')
        sent_vector = rep_sentencevector(line[-1])
        X_train.append(sent_vector)
        if line[0] == '1':
            Y_train.append([0, 1])
        else:
            Y_train.append([1, 0])

    for line in open('./data/test.txt'):
        line = line.strip().split('\t')
        sent_vector = rep_sentencevector(line[-1])
        X_test.append(sent_vector)
        if line[0] == '1':
            Y_test.append([0, 1])
        else:
            Y_test.append([1, 0])

    return np.array(X_train), np.array(Y_train), np.array(X_test), np.array(Y_test)


'''Train a four-layer CNN for 20 epochs'''
def train_cnn(X_train, Y_train, X_test, Y_test):
    from keras.models import Sequential
    from keras.layers import Dense, Dropout
    from keras.layers import Conv1D, GlobalAveragePooling1D, MaxPooling1D
    # Build a Sequential model
    model = Sequential()
    # input_shape = (100, 200): 100 words per sentence, 200-dim word vectors
    model.add(Conv1D(64, 3, activation='relu', input_shape=(100, 200)))
    model.add(Conv1D(64, 3, activation='relu'))
    model.add(MaxPooling1D(3))
    model.add(Conv1D(128, 3, activation='relu'))
    model.add(Conv1D(128, 3, activation='relu'))
    model.add(GlobalAveragePooling1D())
    model.add(Dropout(0.5))
    model.add(Dense(2, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])

    model.fit(X_train, Y_train, batch_size=100, epochs=20, validation_data=(X_test, Y_test))
    model.save('./model/sentiment_cnn_model.h5')

    '''
    epoch  1 [==============================] - 13s 664us/step - loss: 0.4868 - acc: 0.7645 - val_loss: 0.3897 - val_acc: 0.8234
    epoch  5 [==============================] - 13s 633us/step - loss: 0.2923 - acc: 0.8794 - val_loss: 0.3376 - val_acc: 0.8527
    epoch 10 [==============================] - 12s 601us/step - loss: 0.1337 - acc: 0.9482 - val_loss: 0.5124 - val_acc: 0.8284
    epoch 15 [==============================] - 13s 631us/step - loss: 0.0729 - acc: 0.9789 - val_loss: 0.8681 - val_acc: 0.8325
    epoch 20 [==============================] - 13s 632us/step - loss: 0.0484 - acc: 0.9873 - val_loss: 1.0889 - val_acc: 0.8376
    '''

'''Real-world test'''
def predict_cnn(model_filepath):
    model = load_model(model_filepath)
    sentence = '这个 电视 真 尼玛 垃圾 , 老子 再也 不买 了'  # [[2.3127215e-04 0.9977249]]
    #sentence = '这件 衣服 真的 太 好看 了 ! 好想 买 啊 '  # [[0.9936581 0.00627225]]
    sentence_vector = np.array([rep_sentencevector(sentence)])
    print(sentence_vector)
    print('test after load: ', model.predict(sentence_vector))


if __name__ == '__main__':
    # X_train, Y_train, X_test, Y_test = build_traindata()
    model_filepath = './model/sentiment_cnn_model.h5'
    # print(X_train.shape, Y_train.shape)
    # print(X_test.shape, Y_test.shape)
    # train_cnn(X_train, Y_train, X_test, Y_test)
    predict_cnn(model_filepath)
--------------------------------------------------------------------------------
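`predict_cnn` prints the raw two-way scores; per `build_traindata`, position 0 corresponds to label '0' (one-hot [1, 0]) and position 1 to label '1' ([0, 1]). A small sketch of turning those scores into a class index with `np.argmax`, assuming the saved model file and the `rep_sentencevector` helper defined in cnn_sentiment.py:

```python
import numpy as np
from keras.models import load_model

# rep_sentencevector as defined in cnn_sentiment.py above
model = load_model('./model/sentiment_cnn_model.h5')
sentence = '这个 电视 真 尼玛 垃圾 , 老子 再也 不买 了'
# Shape (1, 100, 200): one sentence, 100 words, 200-dim word vectors.
scores = model.predict(np.array([rep_sentencevector(sentence)]))
# argmax picks the larger of the two one-hot positions; the scores
# commented in predict_cnn give class index 1 for this sentence.
label = int(np.argmax(scores, axis=1)[0])
print(label)
```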
/decsiontree_sentiment.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# coding: utf-8
# File: decsiontree_sentiment.py
# Author: lhy
# Date: 18-3-20

import gensim
import numpy as np
from sklearn.externals import joblib

VECTOR_DIR = './embedding/word_vector.bin'  # word-vector model file
model = gensim.models.KeyedVectors.load_word2vec_format(VECTOR_DIR, binary=False)

'''Look up word vectors to build the sentence representation, summed and averaged'''
def rep_sentencevector(sentence):
    '''Represent the sentence vector by summing and averaging its word vectors'''
    word_list = [word for word in sentence.split(' ')]
    embedding_dim = 200
    embedding_matrix = np.zeros(embedding_dim)
    for index, word in enumerate(word_list):
        try:
            embedding_matrix += model[word]
        except KeyError:  # out-of-vocabulary word: skip it
            pass

    return embedding_matrix / len(word_list)

'''Build the training and test data'''
def build_traindata():
    X_train = list()
    Y_train = list()
    X_test = list()
    Y_test = list()
    for line in open('./data/train.txt'):
        line = line.strip().split('\t')
        sent_vector = rep_sentencevector(line[-1])
        X_train.append(sent_vector)
        if line[0] == '1':
            Y_train.append(1)
        else:
            Y_train.append(0)

    for line in open('./data/test.txt'):
        line = line.strip().split('\t')
        sent_vector = rep_sentencevector(line[-1])
        X_test.append(sent_vector)
        if line[0] == '1':
            Y_test.append(1)
        else:
            Y_test.append(0)

    return np.array(X_train), np.array(Y_train), np.array(X_test), np.array(Y_test)

'''Train a DecisionTree classifier with default parameters'''
def train_decisiontree(X_train, Y_train):
    from sklearn import tree
    model = tree.DecisionTreeClassifier()
    model.fit(X_train, Y_train)
    joblib.dump(model, './model/sentiment_decisiontree_model.m')

'''Evaluate the DecisionTree classifier'''
def evaluate_decisiontree(model_filepath, X_test, Y_test):
    model = joblib.load(model_filepath)
    Y_predict = list()
    Y_test = list(Y_test)
    right = 0
    for sent in X_test:
        Y_predict.append(model.predict(sent.reshape(1, -1))[0])
    for index in range(len(Y_predict)):
        if int(Y_predict[index]) == int(Y_test[index]):
            right += 1
    score = right / len(Y_predict)
    print('model accuracy is: {0}'.format(score))  # 0.6907302434144715
    return score

'''Test on real-world examples'''
def predict_decisiontree(model_filepath):
    model = joblib.load(model_filepath)
    sentence1 = '这个 电视 真 尼玛 垃圾 , 老子 再也 不买 了'
    sentence2 = '这件 衣服 真的 太 好看 了 ! 好想 买 啊 '
    rep_sen1 = np.array(rep_sentencevector(sentence1)).reshape(1, -1)
    rep_sen2 = np.array(rep_sentencevector(sentence2)).reshape(1, -1)
    print('sentence1', model.predict(rep_sen1))  # sentence1 [0]
    print('sentence2', model.predict(rep_sen2))  # sentence2 [0]

if __name__ == '__main__':
    X_train, Y_train, X_test, Y_test = build_traindata()
    model_filepath = './model/sentiment_decisiontree_model.m'
    print(X_train.shape, Y_train.shape)
    print(X_test.shape, Y_test.shape)
    train_decisiontree(X_train, Y_train)
    evaluate_decisiontree(model_filepath, X_test, Y_test)
    predict_decisiontree(model_filepath)
--------------------------------------------------------------------------------
/knn_sentiment.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# coding: utf-8
# File: knn_sentiment.py
# Author: lhy
# Date: 18-3-20

import gensim
import numpy as np
from sklearn.externals import joblib

VECTOR_DIR = './embedding/word_vector.bin'  # word-vector model file
model = gensim.models.KeyedVectors.load_word2vec_format(VECTOR_DIR, binary=False)

'''Look up word vectors to build the sentence representation'''
def rep_sentencevector(sentence):
    '''Represent the sentence vector by summing and averaging its word vectors'''
    word_list = [word for word in sentence.split(' ')]
    embedding_dim = 200
    embedding_matrix = np.zeros(embedding_dim)
    for index, word in enumerate(word_list):
        try:
            embedding_matrix += model[word]
        except KeyError:  # out-of-vocabulary word: skip it
            pass

    return embedding_matrix / len(word_list)

'''Build the training and test data'''
def build_traindata():
    X_train = list()
    Y_train = list()
    X_test = list()
    Y_test = list()
    for line in open('./data/train.txt'):
        line = line.strip().split('\t')
        sent_vector = rep_sentencevector(line[-1])
        X_train.append(sent_vector)
        if line[0] == '1':
            Y_train.append(1)
        else:
            Y_train.append(0)

    for line in open('./data/test.txt'):
        line = line.strip().split('\t')
        sent_vector = rep_sentencevector(line[-1])
        X_test.append(sent_vector)
        if line[0] == '1':
            Y_test.append(1)
        else:
            Y_test.append(0)

    return np.array(X_train), np.array(Y_train), np.array(X_test), np.array(Y_test)

'''Train a KNN classifier'''
def train_knn(X_train, Y_train, X_test, Y_test):
    from sklearn.neighbors import KNeighborsClassifier
    '''
    for x in range(1, 15):
        model = KNeighborsClassifier(n_neighbors=x)
        model.fit(X_train, Y_train)
        preds = model.predict(X_test)
        num = 0
        preds = preds.tolist()
        for i, pred in enumerate(preds):
            if int(pred) == int(Y_test[i]):
                num += 1
        print('K= ' + str(x) + ', precision_score:' + str(float(num) / len(preds)))

    *****************result****************
    K= 1, precision_score:0.7169056352117372
    K= 2, precision_score:0.7189063021007003
    K= 3, precision_score:0.7600866955651884
    K= 4, precision_score:0.7519173057685895
    K= 5, precision_score:0.764754918306102
    K= 6, precision_score:0.7709236412137379
    K= 7, precision_score:0.7724241413804601
    K= 8, precision_score:0.7784261420473492
    K= 9, precision_score:0.7804268089363121
    K= 10, precision_score:0.7814271423807936
    K= 11, precision_score:0.7829276425475158
    K= 12, precision_score:0.7869289763254418
    K= 13, precision_score:0.7829276425475158
    K= 14, precision_score:0.7909303101033678
    '''
    # Train KNN with K=14, the best value found in the sweep above
    model = KNeighborsClassifier(n_neighbors=14)
    model.fit(X_train, Y_train)
    joblib.dump(model, './model/sentiment_knn_model.m')

'''Evaluate the KNN classifier'''
def evaluate_knn(model_filepath, X_test, Y_test):
    model = joblib.load(model_filepath)
    Y_predict = list()
    Y_test = list(Y_test)
    right = 0
    for sent in X_test:
        Y_predict.append(model.predict(sent.reshape(1, -1))[0])
    for index in range(len(Y_predict)):
        if int(Y_predict[index]) == int(Y_test[index]):
            right += 1
    score = right / len(Y_predict)
    print('model accuracy is: {0}'.format(score))  # 0.7909303101033678
    return score

'''Test on real-world examples'''
def predict_knn(model_filepath):
    model = joblib.load(model_filepath)
    sentence1 = '这个 电视 真 尼玛 垃圾 , 老子 再也 不买 了'
    sentence2 = '这件 衣服 真的 太 好看 了 ! 好想 买 啊 '
    rep_sen1 = np.array(rep_sentencevector(sentence1)).reshape(1, -1)
    rep_sen2 = np.array(rep_sentencevector(sentence2)).reshape(1, -1)
    print('sentence1', model.predict(rep_sen1))  # [1]
    print('sentence2', model.predict(rep_sen2))  # [0]

if __name__ == '__main__':
    X_train, Y_train, X_test, Y_test = build_traindata()
    model_filepath = './model/sentiment_knn_model.m'
    print(X_train.shape, Y_train.shape)
    print(X_test.shape, Y_test.shape)
    train_knn(X_train, Y_train, X_test, Y_test)
    evaluate_knn(model_filepath, X_test, Y_test)
    predict_knn(model_filepath)
--------------------------------------------------------------------------------
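The commented-out loop in `train_knn` sweeps K by hand; scikit-learn's `GridSearchCV` expresses the same search more compactly. A sketch under the assumption that `X_train`/`Y_train` come from `build_traindata()`; note it scores by cross-validation on the training set rather than on test.txt, so its numbers will not match the sweep results above exactly:

```python
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# X_train, Y_train as returned by build_traindata()
search = GridSearchCV(KNeighborsClassifier(),
                      param_grid={'n_neighbors': list(range(1, 15))},
                      cv=3)
search.fit(X_train, Y_train)
print(search.best_params_, search.best_score_)
```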
/lstm_sentiment.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# coding: utf-8
# File: lstm_sentiment.py
# Author: lhy
# Date: 18-3-19
import gensim
import numpy as np
from keras.models import load_model

VECTOR_DIR = './embedding/word_vector.bin'  # word-vector model file
model = gensim.models.KeyedVectors.load_word2vec_format(VECTOR_DIR, binary=False)

'''Look up word vectors to build the sentence representation'''
def rep_sentencevector(sentence):
    word_list = [word for word in sentence.split(' ')]
    max_words = 100
    embedding_dim = 200
    embedding_matrix = np.zeros((max_words, embedding_dim))
    for index, word in enumerate(word_list):
        try:
            embedding_matrix[index] = model[word]
        except (KeyError, IndexError):  # OOV word, or sentence longer than max_words
            pass

    return embedding_matrix

'''Build the training and test data'''
def build_traindata():
    X_train = list()
    Y_train = list()
    X_test = list()
    Y_test = list()

    for line in open('./data/train.txt'):
        line = line.strip().split('\t')
        sent_vector = rep_sentencevector(line[-1])
        X_train.append(sent_vector)
        if line[0] == '1':
            Y_train.append([0, 1])
        else:
            Y_train.append([1, 0])

    for line in open('./data/test.txt'):
        line = line.strip().split('\t')
        sent_vector = rep_sentencevector(line[-1])
        X_test.append(sent_vector)
        if line[0] == '1':
            Y_test.append([0, 1])
        else:
            Y_test.append([1, 0])

    return np.array(X_train), np.array(Y_train), np.array(X_test), np.array(Y_test)

'''Train a three-layer LSTM for 20 epochs'''
def train_lstm(X_train, Y_train, X_test, Y_test):
    from keras.models import Sequential
    from keras.layers import LSTM, Dense
    data_dim = 200   # word-vector dimension
    timesteps = 100  # sequence length
    # expected input data shape: (batch_size, timesteps, data_dim)
    model = Sequential()
    model.add(LSTM(32, return_sequences=True,
                   input_shape=(timesteps, data_dim)))  # returns a sequence of vectors of dimension 32
    model.add(LSTM(32, return_sequences=True))  # returns a sequence of vectors of dimension 32
    model.add(LSTM(32))  # returns a single vector of dimension 32
    model.add(Dense(2, activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])

    model.fit(X_train, Y_train, batch_size=100, epochs=20, validation_data=(X_test, Y_test))
    model.save('./model/sentiment_lstm_model.h5')
    '''
    epoch  1 [==============================] - 41s 2ms/step - loss: 0.5384 - acc: 0.7142 - val_loss: 0.4223 - val_acc: 0.8281
    epoch  5 [==============================] - 38s 2ms/step - loss: 0.2885 - acc: 0.8904 - val_loss: 0.3618 - val_acc: 0.8531
    epoch 10 [==============================] - 40s 2ms/step - loss: 0.1965 - acc: 0.9357 - val_loss: 0.3815 - val_acc: 0.8515
    epoch 15 [==============================] - 39s 2ms/step - loss: 0.1420 - acc: 0.9577 - val_loss: 0.5172 - val_acc: 0.8501
    epoch 20 [==============================] - 37s 2ms/step - loss: 0.1055 - acc: 0.9729 - val_loss: 0.5309 - val_acc: 0.8505
    '''

'''Real-world test'''
def predict_lstm(model_filepath):
    model = load_model(model_filepath)
    sentence = '这个 电视 真 尼玛 垃圾 , 老子 再也 不买 了'  # [[0.01477097 0.98522896]]
    #sentence = '这件 衣服 真的 太 好看 了 ! 好想 买 啊 '  # [[0.9843225 0.01567744]]
    sentence_vector = np.array([rep_sentencevector(sentence)])
    print(sentence_vector)
    print('test after load: ', model.predict(sentence_vector))


if __name__ == '__main__':
    # X_train, Y_train, X_test, Y_test = build_traindata()
    model_filepath = './model/sentiment_lstm_model.h5'
    # print(X_train.shape, Y_train.shape)
    # print(X_test.shape, Y_test.shape)
    # train_lstm(X_train, Y_train, X_test, Y_test)
    predict_lstm(model_filepath)
--------------------------------------------------------------------------------
/mlp_sentiment.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# coding: utf-8
# File: mlp_sentiment.py
# Author: lhy
# Date: 18-3-20

import gensim
import numpy as np
from keras.models import load_model

VECTOR_DIR = './embedding/word_vector.bin'  # word-vector model file
model = gensim.models.KeyedVectors.load_word2vec_format(VECTOR_DIR, binary=False)

'''Look up word vectors to build the sentence representation'''
def rep_sentencevector(sentence):
    '''Represent the sentence vector by summing and averaging its word vectors'''
    word_list = [word for word in sentence.split(' ')]
    embedding_dim = 200
    embedding_matrix = np.zeros(embedding_dim)
    for index, word in enumerate(word_list):
        try:
            embedding_matrix += model[word]
        except KeyError:  # out-of-vocabulary word: skip it
            pass

    return embedding_matrix / len(word_list)

'''Build the training and test data'''
def build_traindata():
    X_train = list()
    Y_train = list()
    X_test = list()
    Y_test = list()
    for line in open('./data/train.txt'):
        line = line.strip().split('\t')
        sent_vector = rep_sentencevector(line[-1])
        X_train.append(sent_vector)
        if line[0] == '1':
            Y_train.append([0, 1])
        else:
            Y_train.append([1, 0])

    for line in open('./data/test.txt'):
        line = line.strip().split('\t')
        sent_vector = rep_sentencevector(line[-1])
        X_test.append(sent_vector)
        if line[0] == '1':
            Y_test.append([0, 1])
        else:
            Y_test.append([1, 0])

    return np.array(X_train), np.array(Y_train), np.array(X_test), np.array(Y_test)

'''Train a three-layer MLP for 20 epochs'''
def train_mlp(X_train, Y_train, X_test, Y_test):
    from keras.models import Sequential
    from keras.layers import Dense, Dropout
    model = Sequential()
    model.add(Dense(64, input_dim=200, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(2, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    model.fit(X_train, Y_train, batch_size=100, epochs=20, validation_data=(X_test, Y_test))
    model.save('./model/sentiment_mlp_model.h5')
    '''
    epoch  1 [==============================] - 1s 25us/step - loss: 1.7893 - acc: 0.6168 - val_loss: 0.5576 - val_acc: 0.7076
    epoch  5 [==============================] - 0s 19us/step - loss: 0.4499 - acc: 0.7987 - val_loss: 0.4056 - val_acc: 0.8204
    epoch 10 [==============================] - 0s 17us/step - loss: 0.4043 - acc: 0.8274 - val_loss: 0.4016 - val_acc: 0.8341
    epoch 15 [==============================] - 0s 17us/step - loss: 0.3815 - acc: 0.8397 - val_loss: 0.3821 - val_acc: 0.8345
    epoch 20 [==============================] - 0s 17us/step - loss: 0.3746 - acc: 0.8432 - val_loss: 0.3842 - val_acc: 0.8359
    '''

'''Real-world test'''
def predict_mlp(model_filepath):
    model = load_model(model_filepath)
    sentence1 = '这个 电视 真 尼玛 垃圾 , 老子 再也 不买 了'  # [[0.0942708 0.9058427]]
    sentence2 = '这件 衣服 真的 太 好看 了 ! 好想 买 啊 '  # [[0.6489922 0.34993422]]
    sentence_vector1 = np.array([rep_sentencevector(sentence1)])
    sentence_vector2 = np.array([rep_sentencevector(sentence2)])
    print('test after load: ', model.predict(sentence_vector1))
    print('test after load: ', model.predict(sentence_vector2))


if __name__ == '__main__':
    #X_train, Y_train, X_test, Y_test = build_traindata()
    model_filepath = './model/sentiment_mlp_model.h5'
    #print(X_train.shape, Y_train.shape)
    #print(X_test.shape, Y_test.shape)
    #train_mlp(X_train, Y_train, X_test, Y_test)
    predict_mlp(model_filepath)
--------------------------------------------------------------------------------
/model/sentiment_bayes_model.m:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/liuhuanyong/SentenceSentimentClassifier/856a89e6f0f0fa5e459e202875e8bb8fe02ed388/model/sentiment_bayes_model.m
--------------------------------------------------------------------------------
/model/sentiment_cnn_model.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/liuhuanyong/SentenceSentimentClassifier/856a89e6f0f0fa5e459e202875e8bb8fe02ed388/model/sentiment_cnn_model.h5
--------------------------------------------------------------------------------
/model/sentiment_decisiontree_model.m:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/liuhuanyong/SentenceSentimentClassifier/856a89e6f0f0fa5e459e202875e8bb8fe02ed388/model/sentiment_decisiontree_model.m
--------------------------------------------------------------------------------
/model/sentiment_knn_model.m:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/liuhuanyong/SentenceSentimentClassifier/856a89e6f0f0fa5e459e202875e8bb8fe02ed388/model/sentiment_knn_model.m
--------------------------------------------------------------------------------
/model/sentiment_lstm_model.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/liuhuanyong/SentenceSentimentClassifier/856a89e6f0f0fa5e459e202875e8bb8fe02ed388/model/sentiment_lstm_model.h5
--------------------------------------------------------------------------------
/model/sentiment_mlp_model.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/liuhuanyong/SentenceSentimentClassifier/856a89e6f0f0fa5e459e202875e8bb8fe02ed388/model/sentiment_mlp_model.h5
--------------------------------------------------------------------------------
/model/sentiment_svm_model.m:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/liuhuanyong/SentenceSentimentClassifier/856a89e6f0f0fa5e459e202875e8bb8fe02ed388/model/sentiment_svm_model.m
--------------------------------------------------------------------------------
/svm_sentiment.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# coding: utf-8
# File: svm_sentiment.py
# Author: lhy
# Date: 18-3-20

import gensim
import numpy as np
from sklearn.externals import joblib

VECTOR_DIR = './embedding/word_vector.bin'  # word-vector model file
model = gensim.models.KeyedVectors.load_word2vec_format(VECTOR_DIR, binary=False)

'''Look up word vectors to build the sentence representation, summed and averaged'''
def rep_sentencevector(sentence):
    '''Represent the sentence vector by summing and averaging its word vectors'''
    word_list = [word for word in sentence.split(' ')]
    embedding_dim = 200
    embedding_matrix = np.zeros(embedding_dim)
    for index, word in enumerate(word_list):
        try:
            embedding_matrix += model[word]
        except KeyError:  # out-of-vocabulary word: skip it
            pass

    return embedding_matrix / len(word_list)

'''Build the training and test data'''
def build_traindata():
    X_train = list()
    Y_train = list()
    X_test = list()
    Y_test = list()
    for line in open('./data/train.txt'):
        line = line.strip().split('\t')
        sent_vector = rep_sentencevector(line[-1])
        X_train.append(sent_vector)
        if line[0] == '1':
            Y_train.append(1)
        else:
            Y_train.append(0)

    for line in open('./data/test.txt'):
        line = line.strip().split('\t')
        sent_vector = rep_sentencevector(line[-1])
        X_test.append(sent_vector)
        if line[0] == '1':
            Y_test.append(1)
        else:
            Y_test.append(0)

    return np.array(X_train), np.array(Y_train), np.array(X_test), np.array(Y_test)

'''Train an SVM classifier, using SVC with a linear kernel'''
def train_svm(X_train, Y_train):
    from sklearn.svm import SVC
    model = SVC(kernel='linear')
    model.fit(X_train, Y_train)
    joblib.dump(model, './model/sentiment_svm_model.m')

'''Evaluate the SVM classifier'''
def evaluate_svm(model_filepath, X_test, Y_test):
    model = joblib.load(model_filepath)
    Y_predict = list()
    Y_test = list(Y_test)
    right = 0
    for sent in X_test:
        Y_predict.append(model.predict(sent.reshape(1, -1))[0])
    for index in range(len(Y_predict)):
        if int(Y_predict[index]) == int(Y_test[index]):
            right += 1
    score = right / len(Y_predict)
    print('model accuracy is: {0}'.format(score))  # 0.8302767589196399
    return score

'''Test on real-world examples'''
def predict_svm(model_filepath):
    model = joblib.load(model_filepath)
    sentence1 = '这个 电视 真 尼玛 垃圾 , 老子 再也 不买 了'
    sentence2 = '这件 衣服 真的 太 好看 了 ! 好想 买 啊 '
    rep_sen1 = np.array(rep_sentencevector(sentence1)).reshape(1, -1)
    rep_sen2 = np.array(rep_sentencevector(sentence2)).reshape(1, -1)
    print('sentence1', model.predict(rep_sen1))  # sentence1 [1]
    print('sentence2', model.predict(rep_sen2))  # sentence2 [0]

if __name__ == '__main__':
    X_train, Y_train, X_test, Y_test = build_traindata()
    model_filepath = './model/sentiment_svm_model.m'
    print(X_train.shape, Y_train.shape)
    print(X_test.shape, Y_test.shape)
    train_svm(X_train, Y_train)
    evaluate_svm(model_filepath, X_test, Y_test)
    predict_svm(model_filepath)
--------------------------------------------------------------------------------