├── .gitignore ├── LICENSE ├── README.md ├── data ├── neg.csv ├── neutral.csv └── pos.csv ├── keras_sentiment_analysis_v1.py ├── keras_sentiment_analysis_v2.py ├── keras_sentiment_analysis_v3.py ├── keras_sentiment_analysis_v4.py ├── keras_sentiment_analysis_v5.py └── sentiment_analysis_ml.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Sentiment_Analysis_cnn_lstm_cnnlstm
2 | Sentiment analysis with CNN, LSTM, CNN_LSTM, TextCNN, Bi_LSTM and traditional machine-learning algorithms. Reference: https://github.com/Edward1Chou/SentimentAnalysis
3 | 
4 | 1. keras_sentiment_analysis_v1.py: LSTM
5 | 2. keras_sentiment_analysis_v2.py: CNN_LSTM
6 | 3. keras_sentiment_analysis_v3.py: CNN
7 | 4. keras_sentiment_analysis_v4.py: TextCNN
8 | 5. keras_sentiment_analysis_v5.py: Bi_LSTM
9 | 6. sentiment_analysis_ml.py: traditional machine-learning algorithms
10 | 
--------------------------------------------------------------------------------
/keras_sentiment_analysis_v1.py:
--------------------------------------------------------------------------------
1 | # _*_ coding:utf-8 _*_
2 | 
3 | '''
4 | @Author: Ruan Yang
5 | @Date: 2018.12.9
6 | @Purpose: 文本情感分析(positive,negative,neutral)
7 | @Reference: https://github.com/Edward1Chou/SentimentAnalysis
8 | @算法:LSTM
9 | @需要有事先标注好的数据集
10 | @positive: [1,0,0]
11 | @neutral: [0,1,0]
12 | @negative:[0,0,1]
13 | '''
14 | 
15 | import codecs
16 | import jieba
17 | 
18 | datapaths=r"C:\Users\RY\Desktop\SentimentAnalysis-master\data\\"
19 | 
20 | positive_data=[]
21 | y_positive=[]
22 | neutral_data=[]
23 | y_neutral=[]
24 | negative_data=[]
25 | y_negative=[]
26 | 
27 | print("#------------------------------------------------------#")
28 | print("加载数据集")
29 | with codecs.open(datapaths+"pos.csv","r","utf-8") as f1,\
30 | codecs.open(datapaths+"neutral.csv","r","utf-8") as f2,\
31 | codecs.open(datapaths+"neg.csv","r","utf-8") as f3:
32 | for line in f1:
33 | positive_data.append(" ".join(i for i in jieba.lcut(line.strip(),cut_all=False)))
34 | y_positive.append([1,0,0])
35 | for line in f2:
36 | neutral_data.append(" ".join(i for i in jieba.lcut(line.strip(),cut_all=False)))
37 | y_neutral.append([0,1,0])
38 | for line in f3:
39 | negative_data.append(" ".join(i for i in jieba.lcut(line.strip(),cut_all=False)))
40 | y_negative.append([0,0,1])
41 | 
42 | print("positive data:{}".format(len(positive_data)))
43 | print("neutral data:{}".format(len(neutral_data)))
44 | print("negative data:{}".format(len(negative_data)))
45 | 
46 | x_text=positive_data+neutral_data+negative_data
47 | y_label=y_positive+y_neutral+y_negative
48 | print("#------------------------------------------------------#")
49 | print("\n")
50 | 
51 | from tensorflow.contrib import learn
52 | import tensorflow as tf
53 | import numpy as np
54 | import collections
55 | 
56 | max_document_length=200
57 | min_frequency=1
58 | 
59 | 
60 | vocab = learn.preprocessing.VocabularyProcessor(max_document_length,min_frequency, tokenizer_fn=list)
61 | x = np.array(list(vocab.fit_transform(x_text)))
62 | vocab_dict = collections.OrderedDict(vocab.vocabulary_._mapping)
63 | 
64 | with codecs.open(r"C:\Users\RY\Desktop\vocabulary.txt","w","utf-8") as f:
65 | for key,value in vocab_dict.items():
66 | f.write("{} {}\n".format(key,value))
67 | 
68 | print("#----------------------------------------------------------#")
69 | print("\n")
70 | 
71 | print("#----------------------------------------------------------#")
72 | print("数据混洗")
73 | np.random.seed(10)
74 | y=np.array(y_label)
75 | shuffle_indices = np.random.permutation(np.arange(len(y)))
76 | x_shuffled = x[shuffle_indices]
77 | y_shuffled = y[shuffle_indices]
78 | 
79 | test_sample_percentage=0.2
80 | test_sample_index = -1 * int(test_sample_percentage * float(len(y)))
81 | x_train, x_test = 
x_shuffled[:test_sample_index], x_shuffled[test_sample_index:] 82 | y_train, y_test = y_shuffled[:test_sample_index], y_shuffled[test_sample_index:] 83 | 84 | train_positive_label=0 85 | train_neutral_label=0 86 | train_negative_label=0 87 | test_positive_label=0 88 | test_neutral_label=0 89 | test_negative_label=0 90 | 91 | for i in range(len(y_train)): 92 | if y_train[i,0] == 1: 93 | train_positive_label += 1 94 | elif y_train[i,1] == 1: 95 | train_neutral_label += 1 96 | else: 97 | train_negative_label += 1 98 | 99 | for i in range(len(y_test)): 100 | if y_test[i,0] == 1: 101 | test_positive_label += 1 102 | elif y_test[i,1] == 1: 103 | test_neutral_label += 1 104 | else: 105 | test_negative_label += 1 106 | 107 | print("训练集中 positive 样本个数:{}".format(train_positive_label)) 108 | print("训练集中 neutral 样本个数:{}".format(train_neutral_label)) 109 | print("训练集中 negative 样本个数:{}".format(train_negative_label)) 110 | print("测试集中 positive 样本个数:{}".format(test_positive_label)) 111 | print("测试集中 neutral 样本个数:{}".format(test_neutral_label)) 112 | print("测试集中 negative 样本个数:{}".format(test_negative_label)) 113 | 114 | print("#----------------------------------------------------------#") 115 | print("\n") 116 | 117 | print("#----------------------------------------------------------#") 118 | print("读取預训练词向量矩阵") 119 | 120 | pretrainpath=r"E:\中科大MS\預训练模型\\" 121 | 122 | embedding_index={} 123 | 124 | with codecs.open(pretrainpath+"sgns.wiki.bigram","r","utf-8") as f: 125 | line=f.readline() 126 | nwords=int(line.strip().split(" ")[0]) 127 | ndims=int(line.strip().split(" ")[1]) 128 | for line in f: 129 | values=line.split() 130 | words=values[0] 131 | coefs=np.asarray(values[1:],dtype="float32") 132 | embedding_index[words]=coefs 133 | 134 | print("預训练模型中Token总数:{} = {}".format(nwords,len(embedding_index))) 135 | print("預训练模型的维度:{}".format(ndims)) 136 | print("#----------------------------------------------------------#") 137 | print("\n") 138 | 139 | print("#----------------------------------------------------------#") 140 | print("将vocabulary中的 index-word 对应关系映射到 index-word vector形式") 141 | 142 | embedding_matrix=[] 143 | notfoundword=0 144 | 145 | for word in vocab_dict.keys(): 146 | if word in embedding_index.keys(): 147 | embedding_matrix.append(embedding_index[word]) 148 | else: 149 | notfoundword += 1 150 | embedding_matrix.append(np.random.uniform(-1,1,size=ndims)) 151 | 152 | embedding_matrix=np.array(embedding_matrix,dtype=np.float32) # 必须使用 np.float32 153 | print("词汇表中未找到单词个数:{}".format(notfoundword)) 154 | print("#----------------------------------------------------------#") 155 | print("\n") 156 | 157 | print("#---------------------------------------------------#") 158 | print("Build model .................") 159 | print("NN structure .......") 160 | print("Embedding layer --- LSTM layer --- Dense layer") 161 | print("#---------------------------------------------------#") 162 | print("\n") 163 | 164 | from keras.models import Sequential 165 | from keras.layers import Dense, Dropout, Activation 166 | from keras.layers import Embedding 167 | from keras.layers import Conv1D, GlobalMaxPooling1D 168 | from keras.layers import LSTM 169 | 170 | batch_size=64 171 | max_sentence_length=200 172 | embedding_dims=ndims 173 | dropout=0.2 174 | recurrent_dropout=0.2 175 | num_classes=3 176 | epochs=2 177 | 178 | # 定义网络结构 179 | model=Sequential() 180 | model.add(Embedding(len(vocab_dict), 181 | embedding_dims, 182 | weights=[embedding_matrix], 183 | input_length=max_sentence_length, 184 | trainable=False)) 
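# The Embedding layer above turns each padded sequence of max_sentence_length word ids into a
# (max_sentence_length, embedding_dims) matrix; weights=[embedding_matrix] initialises it from the
# pretrained vectors loaded earlier, and trainable=False keeps those vectors frozen during training.
# The layers added below complete the classifier: Dropout, an LSTM encoder, and a Dense output.
# One caveat: the Dense output below uses a sigmoid activation together with categorical_crossentropy;
# for three mutually exclusive one-hot classes, a softmax output is the more usual pairing, e.g.
#     model.add(Dense(num_classes, activation="softmax"))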
185 | model.add(Dropout(dropout)) 186 | model.add(LSTM(128,dropout=dropout,recurrent_dropout=recurrent_dropout)) 187 | model.add(Dense(num_classes,activation="sigmoid")) 188 | 189 | # 模型编译 190 | 191 | model.compile(loss="categorical_crossentropy",optimizer="adam",metrics=["accuracy"]) 192 | 193 | print("#---------------------------------------------------#") 194 | print("Train ....................") 195 | print("#---------------------------------------------------#") 196 | print("\n") 197 | 198 | model.fit(x_train,y_train,batch_size=batch_size,epochs=epochs,validation_data=(x_test,y_test)) 199 | 200 | # 训练得分和准确度 201 | 202 | score,acc=model.evaluate(x_test,y_test,batch_size=batch_size) 203 | 204 | print("#---------------------------------------------------#") 205 | print("预测得分:{}".format(score)) 206 | print("预测准确率:{}".format(acc)) 207 | print("#---------------------------------------------------#") 208 | print("\n") 209 | 210 | # 模型预测 211 | 212 | predictions=model.predict(x_test) 213 | 214 | print("#---------------------------------------------------#") 215 | print("测试集的预测结果,对每个类有一个得分/概率,取值大对应的类别") 216 | print(predictions) 217 | print("#---------------------------------------------------#") 218 | print("\n") 219 | 220 | # 模型预测类别 221 | 222 | predict_class=model.predict_classes(x_test) 223 | 224 | print("#---------------------------------------------------#") 225 | print("测试集的预测类别") 226 | print(predict_class) 227 | print("#---------------------------------------------------#") 228 | print("\n") 229 | 230 | # 模型保存 231 | 232 | model.save(r"C:\Users\RY\Desktop\sentiment_analysis_lstm.h5") 233 | 234 | print("#---------------------------------------------------#") 235 | print("保存模型") 236 | print("#---------------------------------------------------#") 237 | print("\n") 238 | 239 | # 模型总结 240 | 241 | print("#---------------------------------------------------#") 242 | print("输出模型总结") 243 | print(model.summary()) 244 | print("#---------------------------------------------------#") 245 | print("\n") 246 | 247 | # 模型的配置文件 248 | 249 | config=model.get_config() 250 | 251 | print("#---------------------------------------------------#") 252 | print("输出模型配置信息") 253 | print(config) 254 | print("#---------------------------------------------------#") 255 | print("\n") 256 | -------------------------------------------------------------------------------- /keras_sentiment_analysis_v2.py: -------------------------------------------------------------------------------- 1 | # _*_ coding:utf-8 _*_ 2 | 3 | ''' 4 | @Author: Ruan Yang 5 | @Date: 2018.12.9 6 | @Purpose: 文本情感分析(positive,negative,neutral) 7 | @Reference: https://github.com/Edward1Chou/SentimentAnalysis 8 | @算法:CNN-LSTM 9 | @需要有事先标准好的数据集 10 | @positive: [1,0,0] 11 | @neutral: [0,1,0] 12 | @negative:[0,0,1] 13 | ''' 14 | 15 | import codecs 16 | import jieba 17 | 18 | datapaths=r"C:\Users\RY\Desktop\SentimentAnalysis-master\data\\" 19 | 20 | positive_data=[] 21 | y_positive=[] 22 | neutral_data=[] 23 | y_neutral=[] 24 | negative_data=[] 25 | y_negative=[] 26 | 27 | print("#------------------------------------------------------#") 28 | print("加载数据集") 29 | with codecs.open(datapaths+"pos.csv","r","utf-8") as f1,\ 30 | codecs.open(datapaths+"neutral.csv","r","utf-8") as f2,\ 31 | codecs.open(datapaths+"neg.csv","r","utf-8") as f3: 32 | for line in f1: 33 | positive_data.append(" ".join(i for i in jieba.lcut(line.strip(),cut_all=False))) 34 | y_positive.append([1,0,0]) 35 | for line in f2: 36 | neutral_data.append(" ".join(i for i in 
jieba.lcut(line.strip(),cut_all=False))) 37 | y_neutral.append([0,1,0]) 38 | for line in f3: 39 | negative_data.append(" ".join(i for i in jieba.lcut(line.strip(),cut_all=False))) 40 | y_negative.append([0,0,1]) 41 | 42 | print("positive data:{}".format(len(positive_data))) 43 | print("neutral data:{}".format(len(neutral_data))) 44 | print("negative data:{}".format(len(negative_data))) 45 | 46 | x_text=positive_data+neutral_data+negative_data 47 | y_label=y_positive+y_neutral+y_negative 48 | print("#------------------------------------------------------#") 49 | print("\n") 50 | 51 | from tensorflow.contrib import learn 52 | import tensorflow as tf 53 | import numpy as np 54 | import collections 55 | 56 | max_document_length=200 57 | min_frequency=1 58 | 59 | 60 | vocab = learn.preprocessing.VocabularyProcessor(max_document_length,min_frequency, tokenizer_fn=list) 61 | x = np.array(list(vocab.fit_transform(x_text))) 62 | vocab_dict = collections.OrderedDict(vocab.vocabulary_._mapping) 63 | 64 | with codecs.open(r"C:\Users\RY\Desktop\vocabulary.txt","w","utf-8") as f: 65 | for key,value in vocab_dict.items(): 66 | f.write("{} {}\n".format(key,value)) 67 | 68 | print("#----------------------------------------------------------#") 69 | print("\n") 70 | 71 | print("#----------------------------------------------------------#") 72 | print("数据混洗") 73 | np.random.seed(10) 74 | y=np.array(y_label) 75 | shuffle_indices = np.random.permutation(np.arange(len(y))) 76 | x_shuffled = x[shuffle_indices] 77 | y_shuffled = y[shuffle_indices] 78 | 79 | test_sample_percentage=0.2 80 | test_sample_index = -1 * int(test_sample_percentage * float(len(y))) 81 | x_train, x_test = x_shuffled[:test_sample_index], x_shuffled[test_sample_index:] 82 | y_train, y_test = y_shuffled[:test_sample_index], y_shuffled[test_sample_index:] 83 | 84 | train_positive_label=0 85 | train_neutral_label=0 86 | train_negative_label=0 87 | test_positive_label=0 88 | test_neutral_label=0 89 | test_negative_label=0 90 | 91 | for i in range(len(y_train)): 92 | if y_train[i,0] == 1: 93 | train_positive_label += 1 94 | elif y_train[i,1] == 1: 95 | train_neutral_label += 1 96 | else: 97 | train_negative_label += 1 98 | 99 | for i in range(len(y_test)): 100 | if y_test[i,0] == 1: 101 | test_positive_label += 1 102 | elif y_test[i,1] == 1: 103 | test_neutral_label += 1 104 | else: 105 | test_negative_label += 1 106 | 107 | print("训练集中 positive 样本个数:{}".format(train_positive_label)) 108 | print("训练集中 neutral 样本个数:{}".format(train_neutral_label)) 109 | print("训练集中 negative 样本个数:{}".format(train_negative_label)) 110 | print("测试集中 positive 样本个数:{}".format(test_positive_label)) 111 | print("测试集中 neutral 样本个数:{}".format(test_neutral_label)) 112 | print("测试集中 negative 样本个数:{}".format(test_negative_label)) 113 | 114 | print("#----------------------------------------------------------#") 115 | print("\n") 116 | 117 | print("#----------------------------------------------------------#") 118 | print("读取預训练词向量矩阵") 119 | 120 | pretrainpath=r"E:\中科大MS\預训练模型\\" 121 | 122 | embedding_index={} 123 | 124 | with codecs.open(pretrainpath+"sgns.wiki.bigram","r","utf-8") as f: 125 | line=f.readline() 126 | nwords=int(line.strip().split(" ")[0]) 127 | ndims=int(line.strip().split(" ")[1]) 128 | for line in f: 129 | values=line.split() 130 | words=values[0] 131 | coefs=np.asarray(values[1:],dtype="float32") 132 | embedding_index[words]=coefs 133 | 134 | print("預训练模型中Token总数:{} = {}".format(nwords,len(embedding_index))) 135 | print("預训练模型的维度:{}".format(ndims)) 136 | 
print("#----------------------------------------------------------#") 137 | print("\n") 138 | 139 | print("#----------------------------------------------------------#") 140 | print("将vocabulary中的 index-word 对应关系映射到 index-word vector形式") 141 | 142 | embedding_matrix=[] 143 | notfoundword=0 144 | 145 | for word in vocab_dict.keys(): 146 | if word in embedding_index.keys(): 147 | embedding_matrix.append(embedding_index[word]) 148 | else: 149 | notfoundword += 1 150 | embedding_matrix.append(np.random.uniform(-1,1,size=ndims)) 151 | 152 | embedding_matrix=np.array(embedding_matrix,dtype=np.float32) # 必须使用 np.float32 153 | print("词汇表中未找到单词个数:{}".format(notfoundword)) 154 | print("#----------------------------------------------------------#") 155 | print("\n") 156 | 157 | print("#---------------------------------------------------#") 158 | print("Build model .................") 159 | print("NN structure .......") 160 | print("Embedding layer --- CNN layer --- LSTM layer --- Dense layer") 161 | print("#---------------------------------------------------#") 162 | print("\n") 163 | 164 | from keras.models import Sequential 165 | from keras.layers import Dense, Dropout, Activation 166 | from keras.layers import Embedding 167 | from keras.layers import Conv1D, GlobalMaxPooling1D,MaxPooling1D 168 | from keras.layers import LSTM 169 | 170 | batch_size=64 171 | max_sentence_length=200 172 | lstm_output_size=128 173 | embedding_dims=ndims 174 | filters = 250 175 | kernel_size = 3 176 | dropout=0.2 177 | recurrent_dropout=0.2 178 | num_classes=3 179 | epochs=2 180 | 181 | # 定义网络结构 182 | model=Sequential() 183 | model.add(Embedding(len(vocab_dict), 184 | embedding_dims, 185 | weights=[embedding_matrix], 186 | input_length=max_sentence_length, 187 | trainable=False)) 188 | model.add(Dropout(dropout)) 189 | model.add(Conv1D(filters,kernel_size,padding="valid",activation="relu",strides=1)) 190 | model.add(MaxPooling1D()) 191 | model.add(LSTM(lstm_output_size)) 192 | model.add(Dense(num_classes)) 193 | model.add(Activation("sigmoid")) 194 | 195 | # 模型编译 196 | 197 | model.compile(loss="categorical_crossentropy",optimizer="adam",metrics=["accuracy"]) 198 | 199 | print("#---------------------------------------------------#") 200 | print("Train ....................") 201 | print("#---------------------------------------------------#") 202 | print("\n") 203 | 204 | model.fit(x_train,y_train,batch_size=batch_size,epochs=epochs,validation_data=(x_test,y_test)) 205 | 206 | # 训练得分和准确度 207 | 208 | score,acc=model.evaluate(x_test,y_test,batch_size=batch_size) 209 | 210 | print("#---------------------------------------------------#") 211 | print("预测得分:{}".format(score)) 212 | print("预测准确率:{}".format(acc)) 213 | print("#---------------------------------------------------#") 214 | print("\n") 215 | 216 | # 模型预测 217 | 218 | predictions=model.predict(x_test) 219 | 220 | print("#---------------------------------------------------#") 221 | print("测试集的预测结果,对每个类有一个得分/概率,取值大对应的类别") 222 | print(predictions) 223 | print("#---------------------------------------------------#") 224 | print("\n") 225 | 226 | # 模型预测类别 227 | 228 | predict_class=model.predict_classes(x_test) 229 | 230 | print("#---------------------------------------------------#") 231 | print("测试集的预测类别") 232 | print(predict_class) 233 | print("#---------------------------------------------------#") 234 | print("\n") 235 | 236 | # 模型保存 237 | 238 | model.save(r"C:\Users\RY\Desktop\sentiment_analysis_lstm.h5") 239 | 240 | 
print("#---------------------------------------------------#") 241 | print("保存模型") 242 | print("#---------------------------------------------------#") 243 | print("\n") 244 | 245 | # 模型总结 246 | 247 | print("#---------------------------------------------------#") 248 | print("输出模型总结") 249 | print(model.summary()) 250 | print("#---------------------------------------------------#") 251 | print("\n") 252 | 253 | # 模型的配置文件 254 | 255 | config=model.get_config() 256 | 257 | print("#---------------------------------------------------#") 258 | print("输出模型配置信息") 259 | print(config) 260 | print("#---------------------------------------------------#") 261 | print("\n") 262 | -------------------------------------------------------------------------------- /keras_sentiment_analysis_v3.py: -------------------------------------------------------------------------------- 1 | # _*_ coding:utf-8 _*_ 2 | 3 | ''' 4 | @Author: Ruan Yang 5 | @Date: 2018.12.9 6 | @Purpose: 文本情感分析(positive,negative,neutral) 7 | @Reference: https://github.com/Edward1Chou/SentimentAnalysis 8 | @算法:CNN 9 | @需要有事先标准好的数据集 10 | @positive: [1,0,0] 11 | @neutral: [0,1,0] 12 | @negative:[0,0,1] 13 | ''' 14 | 15 | import codecs 16 | import jieba 17 | 18 | datapaths=r"C:\Users\RY\Desktop\SentimentAnalysis-master\data\\" 19 | 20 | positive_data=[] 21 | y_positive=[] 22 | neutral_data=[] 23 | y_neutral=[] 24 | negative_data=[] 25 | y_negative=[] 26 | 27 | print("#------------------------------------------------------#") 28 | print("加载数据集") 29 | with codecs.open(datapaths+"pos.csv","r","utf-8") as f1,\ 30 | codecs.open(datapaths+"neutral.csv","r","utf-8") as f2,\ 31 | codecs.open(datapaths+"neg.csv","r","utf-8") as f3: 32 | for line in f1: 33 | positive_data.append(" ".join(i for i in jieba.lcut(line.strip(),cut_all=False))) 34 | y_positive.append([1,0,0]) 35 | for line in f2: 36 | neutral_data.append(" ".join(i for i in jieba.lcut(line.strip(),cut_all=False))) 37 | y_neutral.append([0,1,0]) 38 | for line in f3: 39 | negative_data.append(" ".join(i for i in jieba.lcut(line.strip(),cut_all=False))) 40 | y_negative.append([0,0,1]) 41 | 42 | print("positive data:{}".format(len(positive_data))) 43 | print("neutral data:{}".format(len(neutral_data))) 44 | print("negative data:{}".format(len(negative_data))) 45 | 46 | x_text=positive_data+neutral_data+negative_data 47 | y_label=y_positive+y_neutral+y_negative 48 | print("#------------------------------------------------------#") 49 | print("\n") 50 | 51 | from tensorflow.contrib import learn 52 | import tensorflow as tf 53 | import numpy as np 54 | import collections 55 | 56 | max_document_length=200 57 | min_frequency=1 58 | 59 | 60 | vocab = learn.preprocessing.VocabularyProcessor(max_document_length,min_frequency, tokenizer_fn=list) 61 | x = np.array(list(vocab.fit_transform(x_text))) 62 | vocab_dict = collections.OrderedDict(vocab.vocabulary_._mapping) 63 | 64 | with codecs.open(r"C:\Users\RY\Desktop\vocabulary.txt","w","utf-8") as f: 65 | for key,value in vocab_dict.items(): 66 | f.write("{} {}\n".format(key,value)) 67 | 68 | print("#----------------------------------------------------------#") 69 | print("\n") 70 | 71 | print("#----------------------------------------------------------#") 72 | print("数据混洗") 73 | np.random.seed(10) 74 | y=np.array(y_label) 75 | shuffle_indices = np.random.permutation(np.arange(len(y))) 76 | x_shuffled = x[shuffle_indices] 77 | y_shuffled = y[shuffle_indices] 78 | 79 | test_sample_percentage=0.2 80 | test_sample_index = -1 * int(test_sample_percentage * 
float(len(y))) 81 | x_train, x_test = x_shuffled[:test_sample_index], x_shuffled[test_sample_index:] 82 | y_train, y_test = y_shuffled[:test_sample_index], y_shuffled[test_sample_index:] 83 | 84 | train_positive_label=0 85 | train_neutral_label=0 86 | train_negative_label=0 87 | test_positive_label=0 88 | test_neutral_label=0 89 | test_negative_label=0 90 | 91 | for i in range(len(y_train)): 92 | if y_train[i,0] == 1: 93 | train_positive_label += 1 94 | elif y_train[i,1] == 1: 95 | train_neutral_label += 1 96 | else: 97 | train_negative_label += 1 98 | 99 | for i in range(len(y_test)): 100 | if y_test[i,0] == 1: 101 | test_positive_label += 1 102 | elif y_test[i,1] == 1: 103 | test_neutral_label += 1 104 | else: 105 | test_negative_label += 1 106 | 107 | print("训练集中 positive 样本个数:{}".format(train_positive_label)) 108 | print("训练集中 neutral 样本个数:{}".format(train_neutral_label)) 109 | print("训练集中 negative 样本个数:{}".format(train_negative_label)) 110 | print("测试集中 positive 样本个数:{}".format(test_positive_label)) 111 | print("测试集中 neutral 样本个数:{}".format(test_neutral_label)) 112 | print("测试集中 negative 样本个数:{}".format(test_negative_label)) 113 | 114 | print("#----------------------------------------------------------#") 115 | print("\n") 116 | 117 | print("#----------------------------------------------------------#") 118 | print("读取預训练词向量矩阵") 119 | 120 | pretrainpath=r"E:\中科大MS\預训练模型\\" 121 | 122 | embedding_index={} 123 | 124 | with codecs.open(pretrainpath+"sgns.wiki.bigram","r","utf-8") as f: 125 | line=f.readline() 126 | nwords=int(line.strip().split(" ")[0]) 127 | ndims=int(line.strip().split(" ")[1]) 128 | for line in f: 129 | values=line.split() 130 | words=values[0] 131 | coefs=np.asarray(values[1:],dtype="float32") 132 | embedding_index[words]=coefs 133 | 134 | print("預训练模型中Token总数:{} = {}".format(nwords,len(embedding_index))) 135 | print("預训练模型的维度:{}".format(ndims)) 136 | print("#----------------------------------------------------------#") 137 | print("\n") 138 | 139 | print("#----------------------------------------------------------#") 140 | print("将vocabulary中的 index-word 对应关系映射到 index-word vector形式") 141 | 142 | embedding_matrix=[] 143 | notfoundword=0 144 | 145 | for word in vocab_dict.keys(): 146 | if word in embedding_index.keys(): 147 | embedding_matrix.append(embedding_index[word]) 148 | else: 149 | notfoundword += 1 150 | embedding_matrix.append(np.random.uniform(-1,1,size=ndims)) 151 | 152 | embedding_matrix=np.array(embedding_matrix,dtype=np.float32) # 必须使用 np.float32 153 | print("词汇表中未找到单词个数:{}".format(notfoundword)) 154 | print("#----------------------------------------------------------#") 155 | print("\n") 156 | 157 | print("#---------------------------------------------------#") 158 | print("Build model .................") 159 | print("NN structure .......") 160 | print("Embedding layer --- CNN layer --- Dense layer --- Dense layer") 161 | print("#---------------------------------------------------#") 162 | print("\n") 163 | 164 | from keras.models import Sequential 165 | from keras.layers import Dense, Dropout, Activation 166 | from keras.layers import Embedding 167 | from keras.layers import Conv1D, GlobalMaxPooling1D,MaxPooling1D 168 | 169 | batch_size=64 170 | max_sentence_length=200 171 | embedding_dims=ndims 172 | filters = 250 173 | kernel_size = 3 174 | hidden_dims = 250 175 | dropout=0.2 176 | recurrent_dropout=0.2 177 | num_classes=3 178 | epochs=2 179 | 180 | # 定义网络结构 181 | model = Sequential() 182 | model.add(Embedding(len(vocab_dict), 183 | embedding_dims, 
184 | weights=[embedding_matrix], 185 | input_length=max_sentence_length, 186 | trainable=False)) 187 | model.add(Dropout(dropout)) 188 | model.add(Conv1D(filters, 189 | kernel_size, 190 | padding='valid', 191 | activation='relu', 192 | strides=1)) 193 | model.add(GlobalMaxPooling1D()) 194 | model.add(Dense(hidden_dims)) 195 | model.add(Dropout(dropout)) 196 | model.add(Activation('relu')) 197 | model.add(Dense(num_classes)) 198 | model.add(Activation('sigmoid')) 199 | 200 | # 模型编译 201 | 202 | model.compile(loss="categorical_crossentropy",optimizer="adam",metrics=["accuracy"]) 203 | 204 | print("#---------------------------------------------------#") 205 | print("Train ....................") 206 | print("#---------------------------------------------------#") 207 | print("\n") 208 | 209 | model.fit(x_train,y_train,batch_size=batch_size,epochs=epochs,validation_data=(x_test,y_test)) 210 | 211 | # 训练得分和准确度 212 | 213 | score,acc=model.evaluate(x_test,y_test,batch_size=batch_size) 214 | 215 | print("#---------------------------------------------------#") 216 | print("预测得分:{}".format(score)) 217 | print("预测准确率:{}".format(acc)) 218 | print("#---------------------------------------------------#") 219 | print("\n") 220 | 221 | # 模型预测 222 | 223 | predictions=model.predict(x_test) 224 | 225 | print("#---------------------------------------------------#") 226 | print("测试集的预测结果,对每个类有一个得分/概率,取值大对应的类别") 227 | print(predictions) 228 | print("#---------------------------------------------------#") 229 | print("\n") 230 | 231 | # 模型预测类别 232 | 233 | predict_class=model.predict_classes(x_test) 234 | 235 | print("#---------------------------------------------------#") 236 | print("测试集的预测类别") 237 | print(predict_class) 238 | print("#---------------------------------------------------#") 239 | print("\n") 240 | 241 | # 模型保存 242 | 243 | model.save(r"C:\Users\RY\Desktop\sentiment_analysis_lstm.h5") 244 | 245 | print("#---------------------------------------------------#") 246 | print("保存模型") 247 | print("#---------------------------------------------------#") 248 | print("\n") 249 | 250 | # 模型总结 251 | 252 | print("#---------------------------------------------------#") 253 | print("输出模型总结") 254 | print(model.summary()) 255 | print("#---------------------------------------------------#") 256 | print("\n") 257 | 258 | # 模型的配置文件 259 | 260 | config=model.get_config() 261 | 262 | print("#---------------------------------------------------#") 263 | print("输出模型配置信息") 264 | print(config) 265 | print("#---------------------------------------------------#") 266 | print("\n") 267 | -------------------------------------------------------------------------------- /keras_sentiment_analysis_v4.py: -------------------------------------------------------------------------------- 1 | # _*_ coding:utf-8 _*_ 2 | 3 | ''' 4 | @Author: Ruan Yang 5 | @Date: 2018.12.15 6 | @Purpose: 基于 keras 构建 textcnn 算法 7 | @Reference: https://www.cnblogs.com/bymo/p/9675654.html 8 | ''' 9 | 10 | import logging 11 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 12 | 13 | from keras import Input 14 | from keras.layers import Conv1D,MaxPool1D,Dense,Flatten,concatenate,Embedding 15 | from keras.models import Model 16 | 17 | import codecs 18 | import jieba 19 | 20 | datapaths=r"C:\Users\RY\Desktop\情感分析\SentimentAnalysis-master\data" 21 | 22 | positive_data=[] 23 | y_positive=[] 24 | neutral_data=[] 25 | y_neutral=[] 26 | negative_data=[] 27 | y_negative=[] 28 | 29 | 
print("#------------------------------------------------------#") 30 | print("加载数据集") 31 | with codecs.open(datapaths+"\\pos.csv","r","utf-8") as f1,\ 32 | codecs.open(datapaths+"\\neutral.csv","r","utf-8") as f2,\ 33 | codecs.open(datapaths+"\\neg.csv","r","utf-8") as f3: 34 | for line in f1: 35 | positive_data.append(" ".join(i for i in jieba.lcut(line.strip(),cut_all=False))) 36 | y_positive.append([1,0,0]) 37 | for line in f2: 38 | neutral_data.append(" ".join(i for i in jieba.lcut(line.strip(),cut_all=False))) 39 | y_neutral.append([0,1,0]) 40 | for line in f3: 41 | negative_data.append(" ".join(i for i in jieba.lcut(line.strip(),cut_all=False))) 42 | y_negative.append([0,0,1]) 43 | 44 | print("positive data:{}".format(len(positive_data))) 45 | print("neutral data:{}".format(len(neutral_data))) 46 | print("negative data:{}".format(len(negative_data))) 47 | 48 | x_text=positive_data+neutral_data+negative_data 49 | y_label=y_positive+y_neutral+y_negative 50 | print("#------------------------------------------------------#") 51 | print("\n") 52 | 53 | from tensorflow.contrib import learn 54 | import tensorflow as tf 55 | import numpy as np 56 | import collections 57 | 58 | max_document_length=200 59 | min_frequency=1 60 | 61 | 62 | vocab = learn.preprocessing.VocabularyProcessor(max_document_length,min_frequency, tokenizer_fn=list) 63 | x = np.array(list(vocab.fit_transform(x_text))) 64 | vocab_dict = collections.OrderedDict(vocab.vocabulary_._mapping) 65 | 66 | with codecs.open(r"C:\Users\RY\Desktop\vocabulary.txt","w","utf-8") as f: 67 | for key,value in vocab_dict.items(): 68 | f.write("{} {}\n".format(key,value)) 69 | 70 | print("#----------------------------------------------------------#") 71 | print("\n") 72 | 73 | print("#----------------------------------------------------------#") 74 | print("数据混洗") 75 | np.random.seed(10) 76 | y=np.array(y_label) 77 | shuffle_indices = np.random.permutation(np.arange(len(y))) 78 | x_shuffled = x[shuffle_indices] 79 | y_shuffled = y[shuffle_indices] 80 | 81 | test_sample_percentage=0.2 82 | test_sample_index = -1 * int(test_sample_percentage * float(len(y))) 83 | x_train, x_test = x_shuffled[:test_sample_index], x_shuffled[test_sample_index:] 84 | y_train, y_test = y_shuffled[:test_sample_index], y_shuffled[test_sample_index:] 85 | 86 | train_positive_label=0 87 | train_neutral_label=0 88 | train_negative_label=0 89 | test_positive_label=0 90 | test_neutral_label=0 91 | test_negative_label=0 92 | 93 | for i in range(len(y_train)): 94 | if y_train[i,0] == 1: 95 | train_positive_label += 1 96 | elif y_train[i,1] == 1: 97 | train_neutral_label += 1 98 | else: 99 | train_negative_label += 1 100 | 101 | for i in range(len(y_test)): 102 | if y_test[i,0] == 1: 103 | test_positive_label += 1 104 | elif y_test[i,1] == 1: 105 | test_neutral_label += 1 106 | else: 107 | test_negative_label += 1 108 | 109 | print("训练集中 positive 样本个数:{}".format(train_positive_label)) 110 | print("训练集中 neutral 样本个数:{}".format(train_neutral_label)) 111 | print("训练集中 negative 样本个数:{}".format(train_negative_label)) 112 | print("测试集中 positive 样本个数:{}".format(test_positive_label)) 113 | print("测试集中 neutral 样本个数:{}".format(test_neutral_label)) 114 | print("测试集中 negative 样本个数:{}".format(test_negative_label)) 115 | 116 | print("#----------------------------------------------------------#") 117 | print("\n") 118 | 119 | print("#----------------------------------------------------------#") 120 | print("读取預训练词向量矩阵") 121 | 122 | pretrainpath=r"E:\中科大MS\預训练模型\\" 123 | 124 | 
embedding_index={} 125 | 126 | with codecs.open(pretrainpath+"sgns.wiki.bigram","r","utf-8") as f: 127 | line=f.readline() 128 | nwords=int(line.strip().split(" ")[0]) 129 | ndims=int(line.strip().split(" ")[1]) 130 | for line in f: 131 | values=line.split() 132 | words=values[0] 133 | coefs=np.asarray(values[1:],dtype="float32") 134 | embedding_index[words]=coefs 135 | 136 | print("預训练模型中Token总数:{} = {}".format(nwords,len(embedding_index))) 137 | print("預训练模型的维度:{}".format(ndims)) 138 | print("#----------------------------------------------------------#") 139 | print("\n") 140 | 141 | print("#----------------------------------------------------------#") 142 | print("将vocabulary中的 index-word 对应关系映射到 index-word vector形式") 143 | 144 | embedding_matrix=[] 145 | notfoundword=0 146 | 147 | for word in vocab_dict.keys(): 148 | if word in embedding_index.keys(): 149 | embedding_matrix.append(embedding_index[word]) 150 | else: 151 | notfoundword += 1 152 | embedding_matrix.append(np.random.uniform(-1,1,size=ndims)) 153 | 154 | embedding_matrix=np.array(embedding_matrix,dtype=np.float32) # 必须使用 np.float32 155 | print("词汇表中未找到单词个数:{}".format(notfoundword)) 156 | print("#----------------------------------------------------------#") 157 | print("\n") 158 | 159 | # 定义 textcnn 函数 160 | 161 | def textcnn(filters,max_sequence_length,max_token_num,embedding_dim,num_classes,embedding_matrix=None): 162 | ''' 163 | TextCNN Network Structure 164 | 1. Embedding layers 165 | 2. Convolution layer 166 | 3. max-pooling 167 | 4. softmax layer 168 | 169 | max_sequence_length: 输入句子的最大长度 170 | max_token_num: 这个是指最大的 token 个数,感觉上是针对 英文字符 说的 171 | embedding_dim:嵌入矩阵的维度 172 | num_classes: 输出类别个数,二分类就设定为 2 173 | embedding_matrix=None:这个的意思是是否适用 嵌入矩阵 174 | ''' 175 | x_input=Input(shape=(max_sequence_length,)) 176 | logging.info("x_input.shape:{}".format(str(x_input.shape))) 177 | 178 | if embedding_matrix is None: 179 | x_emb=Embedding(input_dim=max_token_num,output_dim=embedding_dim,input_length=max_sequence_length)(x_input) 180 | else: 181 | x_emb=Embedding(input_dim=max_token_num,output_dim=embedding_dim,input_length=max_sequence_length,\ 182 | weights=[embedding_matrix],trainable=False)(x_input) 183 | 184 | logging.info("x_emb.shape:{}".format(str(x_emb.shape))) 185 | 186 | pool_output=[] 187 | kernel_sizes=[2,3,4] 188 | for kernel_size in kernel_sizes: 189 | c=Conv1D(filters=filters,kernel_size=kernel_size,strides=1)(x_emb) 190 | #p=MaxPool1D(pool_size=int(c.shape[1]))(c) 191 | p=MaxPool1D(max_sequence_length-kernel_size+1)(c) 192 | pool_output.append(p) 193 | logging.info("kernel_size:{} \t c.shape:{} \t p.shape:{}".format(kernel_size,str(c.shape),str(p.shape))) 194 | #pool_output = concatenate([p for p in pool_output]) 195 | pool_output = concatenate(pool_output,axis=1) 196 | x_flatten=Flatten()(pool_output) # (?,6) 197 | y=Dense(num_classes,activation="softmax")(x_flatten) # (?,2) 198 | 199 | logging.info("y.shape:{}\n".format(str(y.shape))) 200 | 201 | model=Model([x_input],outputs=[y]) 202 | model.summary() 203 | 204 | return model 205 | 206 | # 定义模型超参数 207 | 208 | filters=128 209 | max_sequence_length=max_document_length 210 | max_token_num=len(vocab_dict) 211 | embedding_dim=ndims 212 | num_classes=3 213 | embedding_matrix=embedding_matrix 214 | batch_size=64 215 | epochs=2 216 | 217 | # 获得模型 218 | model=textcnn(filters,max_sequence_length,max_token_num,embedding_dim,num_classes,embedding_matrix=embedding_matrix) 219 | model.compile(loss="categorical_crossentropy",optimizer="adam",metrics=["accuracy"]) 220 | 221 
| # 模型训练
222 | model.fit(x_train,y_train,batch_size=batch_size,epochs=epochs,validation_data=(x_test,y_test))
223 | 
224 | # 训练得分和准确度
225 | 
226 | score,acc=model.evaluate(x_test,y_test,batch_size=batch_size)
227 | 
228 | print("#---------------------------------------------------#")
229 | print("预测得分:{}".format(score))
230 | print("预测准确率:{}".format(acc))
231 | print("#---------------------------------------------------#")
232 | print("\n")
233 | 
234 | # 模型预测
235 | 
236 | predictions=model.predict(x_test)
237 | 
238 | print("#---------------------------------------------------#")
239 | print("测试集的预测结果,对每个类有一个得分/概率,取值大对应的类别")
240 | print(predictions)
241 | print("#---------------------------------------------------#")
242 | print("\n")
243 | 
244 | # 模型预测类别
245 | # A functional-API Model has no predict_classes(); take the argmax of the predicted probabilities instead
246 | predict_class=np.argmax(predictions,axis=1)
247 | 
248 | print("#---------------------------------------------------#")
249 | print("测试集的预测类别")
250 | print(predict_class)
251 | print("#---------------------------------------------------#")
252 | print("\n")
253 | 
254 | # 模型保存
255 | 
256 | model.save(r"C:\Users\RY\Desktop\sentiment_analysis_textcnn.h5")
257 | 
258 | print("#---------------------------------------------------#")
259 | print("保存模型")
260 | print("#---------------------------------------------------#")
261 | print("\n")
262 | 
263 | # 模型总结
264 | 
265 | print("#---------------------------------------------------#")
266 | print("输出模型总结")
267 | print(model.summary())
268 | print("#---------------------------------------------------#")
269 | print("\n")
270 | 
271 | # 模型的配置文件
272 | 
273 | config=model.get_config()
274 | 
275 | print("#---------------------------------------------------#")
276 | print("输出模型配置信息")
277 | print(config)
278 | print("#---------------------------------------------------#")
279 | print("\n")
280 | 
281 | # 模型的画图和图片保存
282 | 
283 | import matplotlib.pyplot as plt
284 | import matplotlib.image as mpimg
285 | from keras.utils import plot_model
286 | 
287 | plot_model(model,to_file="model.jpg",show_shapes=True)
288 | lena=mpimg.imread("model.jpg")
289 | lena.shape
290 | plt.imshow(lena)
291 | plt.axis("off")
292 | plt.show()
--------------------------------------------------------------------------------
/keras_sentiment_analysis_v5.py:
--------------------------------------------------------------------------------
1 | # _*_ coding:utf-8 _*_
2 | 
3 | '''
4 | @Author: Ruan Yang
5 | @Date: 2018.12.9
6 | @Purpose: 文本情感分析(positive,negative,neutral)
7 | @Reference: https://github.com/Edward1Chou/SentimentAnalysis
8 | @算法:Bi_LSTM
9 | @需要有事先标注好的数据集
10 | @positive: [1,0,0]
11 | @neutral: [0,1,0]
12 | @negative:[0,0,1]
13 | '''
14 | import logging
15 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
16 | 
17 | import codecs
18 | import jieba
19 | 
20 | datapaths=r"C:\Users\RY\Desktop\情感分析\SentimentAnalysis-master\data\\"
21 | 
22 | positive_data=[]
23 | y_positive=[]
24 | neutral_data=[]
25 | y_neutral=[]
26 | negative_data=[]
27 | y_negative=[]
28 | 
29 | print("#------------------------------------------------------#")
30 | print("加载数据集")
31 | with codecs.open(datapaths+"pos.csv","r","utf-8") as f1,\
32 | codecs.open(datapaths+"neutral.csv","r","utf-8") as f2,\
33 | codecs.open(datapaths+"neg.csv","r","utf-8") as f3:
34 | for line in f1:
35 | positive_data.append(" ".join(i for i in jieba.lcut(line.strip(),cut_all=False)))
36 | y_positive.append([1,0,0])
37 | for line in f2:
38 | neutral_data.append(" ".join(i for i in jieba.lcut(line.strip(),cut_all=False)))
39 | 
y_neutral.append([0,1,0]) 40 | for line in f3: 41 | negative_data.append(" ".join(i for i in jieba.lcut(line.strip(),cut_all=False))) 42 | y_negative.append([0,0,1]) 43 | 44 | print("positive data:{}".format(len(positive_data))) 45 | print("neutral data:{}".format(len(neutral_data))) 46 | print("negative data:{}".format(len(negative_data))) 47 | 48 | x_text=positive_data+neutral_data+negative_data 49 | y_label=y_positive+y_neutral+y_negative 50 | print("#------------------------------------------------------#") 51 | print("\n") 52 | 53 | from tensorflow.contrib import learn 54 | import tensorflow as tf 55 | import numpy as np 56 | import collections 57 | 58 | max_document_length=200 59 | min_frequency=1 60 | 61 | 62 | vocab = learn.preprocessing.VocabularyProcessor(max_document_length,min_frequency, tokenizer_fn=list) 63 | x = np.array(list(vocab.fit_transform(x_text))) 64 | vocab_dict = collections.OrderedDict(vocab.vocabulary_._mapping) 65 | 66 | with codecs.open(r"C:\Users\RY\Desktop\vocabulary.txt","w","utf-8") as f: 67 | for key,value in vocab_dict.items(): 68 | f.write("{} {}\n".format(key,value)) 69 | 70 | print("#----------------------------------------------------------#") 71 | print("\n") 72 | 73 | print("#----------------------------------------------------------#") 74 | print("数据混洗") 75 | np.random.seed(10) 76 | y=np.array(y_label) 77 | shuffle_indices = np.random.permutation(np.arange(len(y))) 78 | x_shuffled = x[shuffle_indices] 79 | y_shuffled = y[shuffle_indices] 80 | 81 | test_sample_percentage=0.2 82 | test_sample_index = -1 * int(test_sample_percentage * float(len(y))) 83 | x_train, x_test = x_shuffled[:test_sample_index], x_shuffled[test_sample_index:] 84 | y_train, y_test = y_shuffled[:test_sample_index], y_shuffled[test_sample_index:] 85 | 86 | train_positive_label=0 87 | train_neutral_label=0 88 | train_negative_label=0 89 | test_positive_label=0 90 | test_neutral_label=0 91 | test_negative_label=0 92 | 93 | for i in range(len(y_train)): 94 | if y_train[i,0] == 1: 95 | train_positive_label += 1 96 | elif y_train[i,1] == 1: 97 | train_neutral_label += 1 98 | else: 99 | train_negative_label += 1 100 | 101 | for i in range(len(y_test)): 102 | if y_test[i,0] == 1: 103 | test_positive_label += 1 104 | elif y_test[i,1] == 1: 105 | test_neutral_label += 1 106 | else: 107 | test_negative_label += 1 108 | 109 | print("训练集中 positive 样本个数:{}".format(train_positive_label)) 110 | print("训练集中 neutral 样本个数:{}".format(train_neutral_label)) 111 | print("训练集中 negative 样本个数:{}".format(train_negative_label)) 112 | print("测试集中 positive 样本个数:{}".format(test_positive_label)) 113 | print("测试集中 neutral 样本个数:{}".format(test_neutral_label)) 114 | print("测试集中 negative 样本个数:{}".format(test_negative_label)) 115 | 116 | print("#----------------------------------------------------------#") 117 | print("\n") 118 | 119 | print("#----------------------------------------------------------#") 120 | print("读取預训练词向量矩阵") 121 | 122 | pretrainpath=r"E:\中科大MS\預训练模型\\" 123 | 124 | embedding_index={} 125 | 126 | with codecs.open(pretrainpath+"sgns.wiki.bigram","r","utf-8") as f: 127 | line=f.readline() 128 | nwords=int(line.strip().split(" ")[0]) 129 | ndims=int(line.strip().split(" ")[1]) 130 | for line in f: 131 | values=line.split() 132 | words=values[0] 133 | coefs=np.asarray(values[1:],dtype="float32") 134 | embedding_index[words]=coefs 135 | 136 | print("預训练模型中Token总数:{} = {}".format(nwords,len(embedding_index))) 137 | print("預训练模型的维度:{}".format(ndims)) 138 | 
print("#----------------------------------------------------------#") 139 | print("\n") 140 | 141 | print("#----------------------------------------------------------#") 142 | print("将vocabulary中的 index-word 对应关系映射到 index-word vector形式") 143 | 144 | embedding_matrix=[] 145 | notfoundword=0 146 | 147 | for word in vocab_dict.keys(): 148 | if word in embedding_index.keys(): 149 | embedding_matrix.append(embedding_index[word]) 150 | else: 151 | notfoundword += 1 152 | embedding_matrix.append(np.random.uniform(-1,1,size=ndims)) 153 | 154 | embedding_matrix=np.array(embedding_matrix,dtype=np.float32) # 必须使用 np.float32 155 | print("词汇表中未找到单词个数:{}".format(notfoundword)) 156 | print("#----------------------------------------------------------#") 157 | print("\n") 158 | 159 | print("#---------------------------------------------------#") 160 | print("Build model .................") 161 | print("NN structure .......") 162 | print("Embedding layer --- Bi_LSTM layer --- Dense layer") 163 | print("#---------------------------------------------------#") 164 | print("\n") 165 | 166 | from keras.models import Sequential 167 | from keras.layers import Dense, Dropout, Activation 168 | from keras.layers import Embedding,Bidirectional 169 | from keras.layers import Conv1D, GlobalMaxPooling1D 170 | from keras.layers import LSTM 171 | 172 | batch_size=64 173 | max_sentence_length=200 174 | embedding_dims=ndims 175 | dropout=0.2 176 | recurrent_dropout=0.2 177 | num_classes=3 178 | epochs=2 179 | 180 | # 定义网络结构 181 | model=Sequential() 182 | model.add(Embedding(len(vocab_dict), 183 | embedding_dims, 184 | weights=[embedding_matrix], 185 | input_length=max_sentence_length, 186 | trainable=False)) 187 | model.add(Dropout(dropout)) 188 | model.add(Bidirectional(LSTM(64))) 189 | model.add(Dropout(dropout)) 190 | model.add(Dense(num_classes,activation="sigmoid")) 191 | 192 | # 模型编译 193 | 194 | model.compile(loss="categorical_crossentropy",optimizer="adam",metrics=["accuracy"]) 195 | 196 | print("#---------------------------------------------------#") 197 | print("Train ....................") 198 | print("#---------------------------------------------------#") 199 | print("\n") 200 | 201 | model.fit(x_train,y_train,batch_size=batch_size,epochs=epochs,validation_data=(x_test,y_test)) 202 | 203 | # 训练得分和准确度 204 | 205 | score,acc=model.evaluate(x_test,y_test,batch_size=batch_size) 206 | 207 | print("#---------------------------------------------------#") 208 | print("预测得分:{}".format(score)) 209 | print("预测准确率:{}".format(acc)) 210 | print("#---------------------------------------------------#") 211 | print("\n") 212 | 213 | # 模型预测 214 | 215 | predictions=model.predict(x_test) 216 | 217 | print("#---------------------------------------------------#") 218 | print("测试集的预测结果,对每个类有一个得分/概率,取值大对应的类别") 219 | print(predictions) 220 | print("#---------------------------------------------------#") 221 | print("\n") 222 | 223 | # 模型预测类别 224 | 225 | predict_class=model.predict_classes(x_test) 226 | 227 | print("#---------------------------------------------------#") 228 | print("测试集的预测类别") 229 | print(predict_class) 230 | print("#---------------------------------------------------#") 231 | print("\n") 232 | 233 | # 模型保存 234 | 235 | model.save(r"C:\Users\RY\Desktop\sentiment_analysis_lstm.h5") 236 | 237 | print("#---------------------------------------------------#") 238 | print("保存模型") 239 | print("#---------------------------------------------------#") 240 | print("\n") 241 | 242 | # 模型总结 243 | 244 | 
print("#---------------------------------------------------#") 245 | print("输出模型总结") 246 | print(model.summary()) 247 | print("#---------------------------------------------------#") 248 | print("\n") 249 | 250 | # 模型的配置文件 251 | 252 | config=model.get_config() 253 | 254 | print("#---------------------------------------------------#") 255 | print("输出模型配置信息") 256 | print(config) 257 | print("#---------------------------------------------------#") 258 | print("\n") 259 | -------------------------------------------------------------------------------- /sentiment_analysis_ml.py: -------------------------------------------------------------------------------- 1 | # _*_ coding:utf-8 _*_ 2 | 3 | ''' 4 | @Author: Ruan Yang 5 | @Date: 2018.12.16 6 | @Purpose: 使用传统的机器学习的方法进行文本情感分析 7 | ''' 8 | 9 | import codecs 10 | import jieba 11 | import numpy as np 12 | 13 | from gensim.models.word2vec import Word2Vec 14 | from sklearn.externals import joblib 15 | from sklearn.svm import SVC 16 | from sklearn.naive_bayes import GaussianNB 17 | from sklearn.tree import DecisionTreeClassifier 18 | from sklearn.ensemble import RandomForestClassifier 19 | from sklearn.ensemble import ExtraTreesClassifier 20 | from sklearn.ensemble import GradientBoostingClassifier 21 | from sklearn.neural_network import MLPClassifier 22 | from sklearn import neighbors 23 | from sklearn.linear_model import LogisticRegression 24 | from sklearn.linear_model import SGDClassifier 25 | 26 | datapaths=r"C:\Users\RY\Desktop\情感分析\SentimentAnalysis-master\data\\" 27 | storedpaths=r"C:\Users\RY\Desktop\\" 28 | 29 | positive_data=[] 30 | y_positive=[] 31 | neutral_data=[] 32 | y_neutral=[] 33 | negative_data=[] 34 | y_negative=[] 35 | 36 | print("#------------------------------------------------------#") 37 | print("加载数据集") 38 | with codecs.open(datapaths+"pos.csv","r","utf-8") as f1,\ 39 | codecs.open(datapaths+"neutral.csv","r","utf-8") as f2,\ 40 | codecs.open(datapaths+"neg.csv","r","utf-8") as f3: 41 | for line in f1: 42 | positive_data.append(" ".join(i for i in jieba.lcut(line.strip(),cut_all=False))) 43 | #y_positive.append([1,0,0]) 44 | y_positive.append([0]) 45 | for line in f2: 46 | neutral_data.append(" ".join(i for i in jieba.lcut(line.strip(),cut_all=False))) 47 | #y_neutral.append([0,1,0]) 48 | y_neutral.append([1]) 49 | for line in f3: 50 | negative_data.append(" ".join(i for i in jieba.lcut(line.strip(),cut_all=False))) 51 | #y_negative.append([0,0,1]) 52 | y_negative.append([2]) 53 | 54 | print("positive data:{}".format(len(positive_data))) 55 | print("neutral data:{}".format(len(neutral_data))) 56 | print("negative data:{}".format(len(negative_data))) 57 | 58 | x_text=positive_data+neutral_data+negative_data 59 | y_label=y_positive+y_neutral+y_negative 60 | print("#------------------------------------------------------#") 61 | print("\n") 62 | 63 | # 数据集混洗 64 | 65 | shuffle_indices = np.random.permutation(np.arange(len(y_label))) 66 | train_test_percent=0.2 67 | 68 | x_train=[] 69 | x_test=[] 70 | y_train=[] 71 | y_test=[] 72 | 73 | for i in shuffle_indices[:-(int(len(shuffle_indices)*train_test_percent))]: 74 | x_train.append(x_text[i]) 75 | y_train.append(y_label[i]) 76 | 77 | for i in shuffle_indices[-(int(len(shuffle_indices)*train_test_percent)):]: 78 | x_test.append(x_text[i]) 79 | y_test.append(y_label[i]) 80 | 81 | x_train_pos=0 82 | x_train_neu=0 83 | x_train_neg=0 84 | 85 | x_test_pos=0 86 | x_test_neu=0 87 | x_test_neg=0 88 | 89 | for i in y_train: 90 | if i[0] == 0: 91 | x_train_pos += 1 92 | elif i[0] == 1: 93 | 
x_train_neu += 1 94 | else: 95 | x_train_neg += 1 96 | 97 | for i in y_test: 98 | if i[0] == 0: 99 | x_test_pos += 1 100 | elif i[0] == 1: 101 | x_test_neu += 1 102 | else: 103 | x_test_neg += 1 104 | 105 | print("#------------------------------------------------------#") 106 | print("保存标签数据") 107 | np.save(storedpaths+"y_train.npy",np.array(y_train)) 108 | np.save(storedpaths+"y_test.npy",np.array(y_test)) 109 | print("训练集总数:{}".format(len(x_train))) 110 | print("训练集正样本:{}".format(x_train_pos)) 111 | print("训练集中性样本:{}".format(x_train_neu)) 112 | print("训练集负样本:{}".format(x_train_neg)) 113 | print("测试集总数:{}".format(len(x_test))) 114 | print("测试集正样本:{}".format(x_test_pos)) 115 | print("测试集中性样本:{}".format(x_test_neu)) 116 | print("测试集负样本:{}".format(x_test_neg)) 117 | print("#------------------------------------------------------#") 118 | print("\n") 119 | 120 | 121 | #对每个句子的所有词向量取均值 122 | # text 需要是切完词的 词列表 123 | # size 一般是词向量的维度 124 | # word_vector_model: 训练好的词向量模型 (一般使用 gensim 中的 WordVector 进行词向量训练) 125 | # 或者是直接加载训练好的模型 126 | 127 | def buildWordVector(text,size,word_vector_model): 128 | vec = np.zeros(size).reshape((1, size)) 129 | count = 0. 130 | for word in text: 131 | try: 132 | vec += word_vector_model[word].reshape((1, size)) 133 | count += 1. 134 | except KeyError: 135 | continue 136 | if count != 0: 137 | vec /= count 138 | return vec 139 | 140 | # 计算词向量 141 | 142 | def get_train_vecs(x_train,x_test,n_dim): 143 | ''' 144 | x_train: 训练集 145 | x_test: 测试集 146 | n_dim: 训练词向量的维度 147 | ''' 148 | n_dim=n_dim 149 | # 初始化模型和生成词汇表 150 | all_text=x_train+x_test 151 | text_w2v=Word2Vec(size=n_dim,min_count=5,workers=1) 152 | text_w2v.build_vocab(all_text) 153 | text_w2v.train(all_text,total_examples=text_w2v.corpus_count,epochs=5) 154 | 155 | # 分别得到训练集和测试集文本的词向量合集,这个数据集就很大了 156 | 157 | train_vecs=np.concatenate([buildWordVector(text,n_dim,text_w2v) for text in x_train]) 158 | np.save(storedpaths+"train_vecs.npy",train_vecs) 159 | print("训练集数据的词向量维度:{}".format(train_vecs.shape)) 160 | 161 | test_vecs=np.concatenate([buildWordVector(text,n_dim,text_w2v) for text in x_test]) 162 | np.save(storedpaths+"test_vecs.npy",test_vecs) 163 | print("测试集数据的词向量维度:{}".format(test_vecs.shape)) 164 | 165 | # 保存词向量 166 | text_w2v.save(storedpaths+"w2v_model.pkl") 167 | 168 | # 加载向量化的文本和标签 169 | 170 | def get_data(): 171 | train_vecs=np.load(storedpaths+'train_vecs.npy') 172 | y_train=np.load(storedpaths+'y_train.npy') 173 | test_vecs=np.load(storedpaths+'test_vecs.npy') 174 | y_test=np.load(storedpaths+'y_test.npy') 175 | return train_vecs,y_train,test_vecs,y_test 176 | 177 | # 训练svm模型 178 | 179 | def svm_train(train_vecs,y_train,test_vecs,y_test): 180 | clf=SVC(kernel='rbf',verbose=True) 181 | clf.fit(train_vecs,y_train) 182 | joblib.dump(clf,storedpaths+'model.pkl') 183 | test_scores=clf.score(test_vecs,y_test) 184 | return test_scores 185 | 186 | # 训练朴素贝叶斯模型 187 | 188 | def NB_train(train_vecs,y_train,test_vecs,y_test): 189 | gnb = GaussianNB() 190 | gnb.fit(train_vecs,y_train) 191 | joblib.dump(gnb,storedpaths+'model_gnb.pkl') 192 | test_scores=gnb.score(test_vecs,y_test) 193 | return test_scores 194 | 195 | # 训练决策树模型 196 | 197 | def decision_tree(train_vecs,y_train,test_vecs,y_test): 198 | clf=DecisionTreeClassifier(max_depth=10, min_samples_split=2,random_state=0) 199 | clf.fit(train_vecs,y_train) 200 | joblib.dump(clf,storedpaths+'model_dtree.pkl') 201 | test_scores=clf.score(test_vecs,y_test) 202 | return test_scores 203 | 204 | # 训练随机森林算法 205 | 206 | def 
random_forest(train_vecs,y_train,test_vecs,y_test): 207 | clf = RandomForestClassifier(n_estimators=10, max_depth=10,min_samples_split=2,n_jobs=1,random_state=0) 208 | clf.fit(train_vecs,y_train) 209 | joblib.dump(clf,storedpaths+'model_randomforest.pkl') 210 | test_scores=clf.score(test_vecs,y_test) 211 | return test_scores 212 | 213 | # 训练 ExtraTreesClassifier 分类算法 214 | 215 | def extract_tree(train_vecs,y_train,test_vecs,y_test): 216 | clf = ExtraTreesClassifier(n_estimators=10, max_depth=10,min_samples_split=2,n_jobs=1,random_state=0) 217 | clf.fit(train_vecs,y_train) 218 | joblib.dump(clf,storedpaths+'model_extracttree.pkl') 219 | test_scores=clf.score(test_vecs,y_test) 220 | return test_scores 221 | 222 | # 训练 GBDT 分类算法 223 | 224 | def gbdt_classifier(train_vecs,y_train,test_vecs,y_test): 225 | clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=10,random_state=0) 226 | clf.fit(train_vecs,y_train) 227 | joblib.dump(clf,storedpaths+'model_gbdt.pkl') 228 | test_scores=clf.score(test_vecs,y_test) 229 | return test_scores 230 | 231 | # 训练近邻分类算法 232 | 233 | def nn_classifier(n_neighbors,train_vecs,y_train,test_vecs,y_test): 234 | clf = neighbors.KNeighborsClassifier(n_neighbors, weights='uniform') 235 | clf.fit(train_vecs,y_train) 236 | joblib.dump(clf,storedpaths+'model_nn.pkl') 237 | test_scores=clf.score(test_vecs,y_test) 238 | return test_scores 239 | 240 | # 训练 LogisticRegression 分类算法 241 | 242 | def LR_classifier(train_vecs,y_train,test_vecs,y_test): 243 | clf = LogisticRegression(C=50. / len(y_train),multi_class='multinomial',\ 244 | penalty='l1', solver='saga', tol=0.1) 245 | clf.fit(train_vecs,y_train) 246 | joblib.dump(clf,storedpaths+'model_lr.pkl') 247 | test_scores=clf.score(test_vecs,y_test) 248 | return test_scores 249 | 250 | # 训练 随机梯度下降 分类算法 251 | 252 | def SGD_classifier(train_vecs,y_train,test_vecs,y_test): 253 | clf = SGDClassifier(alpha=0.001, max_iter=100) 254 | clf.fit(train_vecs,y_train) 255 | joblib.dump(clf,storedpaths+'model_sgd.pkl') 256 | test_scores=clf.score(test_vecs,y_test) 257 | return test_scores 258 | 259 | # 训练多层感知机分类算法 260 | 261 | def MP_classifier(train_vecs,y_train,test_vecs,y_test): 262 | clf = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5, 2), random_state=1) 263 | clf.fit(train_vecs,y_train) 264 | joblib.dump(clf,storedpaths+'model_mp.pkl') 265 | test_scores=clf.score(test_vecs,y_test) 266 | return test_scores 267 | 268 | 269 | # 得到待预测单个句子的词向量 270 | # 预先进行分词操作 271 | 272 | def get_predict_vecs(string,n_dim,w2v_model_path): 273 | ''' 274 | string: 输入的句子 275 | n_dim: 词向量维度 276 | w2v_model_path: 預训练词向量的模型路径 277 | ''' 278 | n_dim = n_dim 279 | text_w2v = Word2Vec.load(w2v_model_path) 280 | words=[i for i in jieba.cut(string,cut_all=False)] 281 | train_vecs = buildWordVector(words, n_dim,text_w2v) 282 | 283 | return train_vecs 284 | 285 | # 调用训练模型进行预测 286 | 287 | def svm_predict(string,trainmodelpath): 288 | words_vecs=get_predict_vecs(string) 289 | clf=joblib.load(trainmodelpath) 290 | result=clf.predict(words_vecs) 291 | 292 | return result 293 | 294 | # Train model 295 | 296 | n_dim=300 297 | n_neighbors=10 298 | #get_train_vecs(x_train,x_test,n_dim) 299 | 300 | 301 | train_vecs,y_train,test_vecs,y_test=get_data() 302 | test_scores=svm_train(train_vecs,y_train,test_vecs,y_test) 303 | print("#----------------------------------------#") 304 | print("SVM测试集测试得分:{}".format(test_scores)) 305 | print("#----------------------------------------#") 306 | test_scores=NB_train(train_vecs,y_train,test_vecs,y_test) 307 
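# Two suggested fixes for the helpers above (sketches, not in the original script):
# 1. get_train_vecs() feeds space-joined strings to Word2Vec, so gensim iterates them
#    character by character; to train on the jieba tokens, pass token lists instead:
#        all_text = [sentence.split() for sentence in x_train + x_test]
#    (and split each sentence the same way before averaging in buildWordVector).
# 2. svm_predict() calls get_predict_vecs(string) with a single argument, but that
#    function also needs the vector dimension and the word2vec model path, e.g.
#        def svm_predict(string, trainmodelpath, n_dim, w2v_model_path):
#            words_vecs = get_predict_vecs(string, n_dim, w2v_model_path)
#            clf = joblib.load(trainmodelpath)
#            return clf.predict(words_vecs)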
| print("#----------------------------------------#") 308 | print("NB测试集测试得分:{}".format(test_scores)) 309 | print("#----------------------------------------#") 310 | test_scores=nn_classifier(n_neighbors,train_vecs,y_train,test_vecs,y_test) 311 | print("#----------------------------------------#") 312 | print("NN测试集测试得分:{}".format(test_scores)) 313 | print("#----------------------------------------#") 314 | test_scores=LR_classifier(train_vecs,y_train,test_vecs,y_test) 315 | print("#----------------------------------------#") 316 | print("LR测试集测试得分:{}".format(test_scores)) 317 | print("#----------------------------------------#") 318 | test_scores=SGD_classifier(train_vecs,y_train,test_vecs,y_test) 319 | print("#----------------------------------------#") 320 | print("SGD测试集测试得分:{}".format(test_scores)) 321 | print("#----------------------------------------#") 322 | test_scores=decision_tree(train_vecs,y_train,test_vecs,y_test) 323 | print("#----------------------------------------#") 324 | print("TREE测试集测试得分:{}".format(test_scores)) 325 | print("#----------------------------------------#") 326 | test_scores=random_forest(train_vecs,y_train,test_vecs,y_test) 327 | print("#----------------------------------------#") 328 | print("Random_Forest测试集测试得分:{}".format(test_scores)) 329 | print("#----------------------------------------#") 330 | test_scores=extract_tree(train_vecs,y_train,test_vecs,y_test) 331 | print("#----------------------------------------#") 332 | print("Extract_Tree测试集测试得分:{}".format(test_scores)) 333 | print("#----------------------------------------#") 334 | test_scores=gbdt_classifier(train_vecs,y_train,test_vecs,y_test) 335 | print("#----------------------------------------#") 336 | print("GBDT_Tree测试集测试得分:{}".format(test_scores)) 337 | print("#----------------------------------------#") 338 | test_scores=MP_classifier(train_vecs,y_train,test_vecs,y_test) 339 | print("#----------------------------------------#") 340 | print("MP测试集测试得分:{}".format(test_scores)) 341 | print("#----------------------------------------#") --------------------------------------------------------------------------------