├── .gitignore ├── LICENSE ├── README.md ├── data ├── neg.csv ├── neutral.csv └── pos.csv ├── keras_sentiment_analysis_v1.py ├── keras_sentiment_analysis_v2.py ├── keras_sentiment_analysis_v3.py ├── keras_sentiment_analysis_v4.py ├── keras_sentiment_analysis_v5.py └── sentiment_analysis_ml.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Sentiment_Analysis_cnn_lstm_cnnlstm
2 | Sentiment analysis with CNN, LSTM, CNN_LSTM, TextCNN, Bi_LSTM and traditional machine-learning algorithms. Reference: https://github.com/Edward1Chou/SentimentAnalysis
3 | 
4 | 1. keras_sentiment_analysis_v1.py: LSTM
5 | 2. keras_sentiment_analysis_v2.py: CNN_LSTM
6 | 3. keras_sentiment_analysis_v3.py: CNN
7 | 4. keras_sentiment_analysis_v4.py: TextCNN
8 | 5. keras_sentiment_analysis_v5.py: Bi_LSTM
9 | 6. sentiment_analysis_ml.py: traditional machine-learning algorithms
10 | 
--------------------------------------------------------------------------------
/keras_sentiment_analysis_v1.py:
--------------------------------------------------------------------------------
1 | # _*_ coding:utf-8 _*_
2 | 
3 | '''
4 | @Author: Ruan Yang
5 | @Date: 2018.12.9
6 | @Purpose: 文本情感分析(positive,negative,neutral)
7 | @Reference: https://github.com/Edward1Chou/SentimentAnalysis
8 | @算法:LSTM
9 | @需要有事先标注好的数据集
10 | @positive: [1,0,0]
11 | @neutral: [0,1,0]
12 | @negative:[0,0,1]
13 | '''
14 | 
15 | import codecs
16 | import jieba
17 | 
18 | datapaths=r"C:\Users\RY\Desktop\SentimentAnalysis-master\data\\"
19 | 
20 | positive_data=[]
21 | y_positive=[]
22 | neutral_data=[]
23 | y_neutral=[]
24 | negative_data=[]
25 | y_negative=[]
26 | 
27 | print("#------------------------------------------------------#")
28 | print("加载数据集")
29 | with codecs.open(datapaths+"pos.csv","r","utf-8") as f1,\
30 | codecs.open(datapaths+"neutral.csv","r","utf-8") as f2,\
31 | codecs.open(datapaths+"neg.csv","r","utf-8") as f3:
32 | for line in f1:
33 | positive_data.append(" ".join(i for i in jieba.lcut(line.strip(),cut_all=False)))
34 | y_positive.append([1,0,0])
35 | for line in f2:
36 | neutral_data.append(" ".join(i for i in jieba.lcut(line.strip(),cut_all=False)))
37 | y_neutral.append([0,1,0])
38 | for line in f3:
39 | negative_data.append(" ".join(i for i in jieba.lcut(line.strip(),cut_all=False)))
40 | y_negative.append([0,0,1])
41 | 
42 | print("positive data:{}".format(len(positive_data)))
43 | print("neutral data:{}".format(len(neutral_data)))
44 | print("negative data:{}".format(len(negative_data)))
45 | 
46 | x_text=positive_data+neutral_data+negative_data
47 | y_label=y_positive+y_neutral+y_negative
48 | print("#------------------------------------------------------#")
49 | print("\n")
50 | 
51 | from tensorflow.contrib import learn
52 | import tensorflow as tf
53 | import numpy as np
54 | import collections
55 | 
56 | max_document_length=200
57 | min_frequency=1
58 | 
59 | 
60 | vocab = learn.preprocessing.VocabularyProcessor(max_document_length,min_frequency, tokenizer_fn=list)
61 | x = np.array(list(vocab.fit_transform(x_text)))
62 | vocab_dict = collections.OrderedDict(vocab.vocabulary_._mapping)
63 | 
64 | with codecs.open(r"C:\Users\RY\Desktop\vocabulary.txt","w","utf-8") as f:
65 | for key,value in vocab_dict.items():
66 | f.write("{} {}\n".format(key,value))
67 | 
68 | print("#----------------------------------------------------------#")
69 | print("\n")
70 | 
71 | print("#----------------------------------------------------------#")
72 | print("数据混洗")
73 | np.random.seed(10)
74 | y=np.array(y_label)
75 | shuffle_indices = np.random.permutation(np.arange(len(y)))
76 | x_shuffled = x[shuffle_indices]
77 | y_shuffled = y[shuffle_indices]
78 | 
79 | test_sample_percentage=0.2
80 | test_sample_index = -1 * int(test_sample_percentage * float(len(y)))
81 | x_train, x_test = 
x_shuffled[:test_sample_index], x_shuffled[test_sample_index:] 82 | y_train, y_test = y_shuffled[:test_sample_index], y_shuffled[test_sample_index:] 83 | 84 | train_positive_label=0 85 | train_neutral_label=0 86 | train_negative_label=0 87 | test_positive_label=0 88 | test_neutral_label=0 89 | test_negative_label=0 90 | 91 | for i in range(len(y_train)): 92 | if y_train[i,0] == 1: 93 | train_positive_label += 1 94 | elif y_train[i,1] == 1: 95 | train_neutral_label += 1 96 | else: 97 | train_negative_label += 1 98 | 99 | for i in range(len(y_test)): 100 | if y_test[i,0] == 1: 101 | test_positive_label += 1 102 | elif y_test[i,1] == 1: 103 | test_neutral_label += 1 104 | else: 105 | test_negative_label += 1 106 | 107 | print("训练集中 positive 样本个数:{}".format(train_positive_label)) 108 | print("训练集中 neutral 样本个数:{}".format(train_neutral_label)) 109 | print("训练集中 negative 样本个数:{}".format(train_negative_label)) 110 | print("测试集中 positive 样本个数:{}".format(test_positive_label)) 111 | print("测试集中 neutral 样本个数:{}".format(test_neutral_label)) 112 | print("测试集中 negative 样本个数:{}".format(test_negative_label)) 113 | 114 | print("#----------------------------------------------------------#") 115 | print("\n") 116 | 117 | print("#----------------------------------------------------------#") 118 | print("读取預训练词向量矩阵") 119 | 120 | pretrainpath=r"E:\中科大MS\預训练模型\\" 121 | 122 | embedding_index={} 123 | 124 | with codecs.open(pretrainpath+"sgns.wiki.bigram","r","utf-8") as f: 125 | line=f.readline() 126 | nwords=int(line.strip().split(" ")[0]) 127 | ndims=int(line.strip().split(" ")[1]) 128 | for line in f: 129 | values=line.split() 130 | words=values[0] 131 | coefs=np.asarray(values[1:],dtype="float32") 132 | embedding_index[words]=coefs 133 | 134 | print("預训练模型中Token总数:{} = {}".format(nwords,len(embedding_index))) 135 | print("預训练模型的维度:{}".format(ndims)) 136 | print("#----------------------------------------------------------#") 137 | print("\n") 138 | 139 | print("#----------------------------------------------------------#") 140 | print("将vocabulary中的 index-word 对应关系映射到 index-word vector形式") 141 | 142 | embedding_matrix=[] 143 | notfoundword=0 144 | 145 | for word in vocab_dict.keys(): 146 | if word in embedding_index.keys(): 147 | embedding_matrix.append(embedding_index[word]) 148 | else: 149 | notfoundword += 1 150 | embedding_matrix.append(np.random.uniform(-1,1,size=ndims)) 151 | 152 | embedding_matrix=np.array(embedding_matrix,dtype=np.float32) # 必须使用 np.float32 153 | print("词汇表中未找到单词个数:{}".format(notfoundword)) 154 | print("#----------------------------------------------------------#") 155 | print("\n") 156 | 157 | print("#---------------------------------------------------#") 158 | print("Build model .................") 159 | print("NN structure .......") 160 | print("Embedding layer --- LSTM layer --- Dense layer") 161 | print("#---------------------------------------------------#") 162 | print("\n") 163 | 164 | from keras.models import Sequential 165 | from keras.layers import Dense, Dropout, Activation 166 | from keras.layers import Embedding 167 | from keras.layers import Conv1D, GlobalMaxPooling1D 168 | from keras.layers import LSTM 169 | 170 | batch_size=64 171 | max_sentence_length=200 172 | embedding_dims=ndims 173 | dropout=0.2 174 | recurrent_dropout=0.2 175 | num_classes=3 176 | epochs=2 177 | 178 | # 定义网络结构 179 | model=Sequential() 180 | model.add(Embedding(len(vocab_dict), 181 | embedding_dims, 182 | weights=[embedding_matrix], 183 | input_length=max_sentence_length, 184 | trainable=False)) 
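# The Embedding layer above turns each padded sequence of max_sentence_length word ids into a
# (max_sentence_length, embedding_dims) matrix; weights=[embedding_matrix] initialises it from the
# pretrained vectors loaded earlier, and trainable=False keeps those vectors frozen during training.
# The layers added below complete the classifier: Dropout, an LSTM encoder, and a Dense output.
# One caveat: the Dense output below uses a sigmoid activation together with categorical_crossentropy;
# for three mutually exclusive one-hot classes, a softmax output is the more usual pairing, e.g.
#     model.add(Dense(num_classes, activation="softmax"))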
185 | model.add(Dropout(dropout)) 186 | model.add(LSTM(128,dropout=dropout,recurrent_dropout=recurrent_dropout)) 187 | model.add(Dense(num_classes,activation="sigmoid")) 188 | 189 | # 模型编译 190 | 191 | model.compile(loss="categorical_crossentropy",optimizer="adam",metrics=["accuracy"]) 192 | 193 | print("#---------------------------------------------------#") 194 | print("Train ....................") 195 | print("#---------------------------------------------------#") 196 | print("\n") 197 | 198 | model.fit(x_train,y_train,batch_size=batch_size,epochs=epochs,validation_data=(x_test,y_test)) 199 | 200 | # 训练得分和准确度 201 | 202 | score,acc=model.evaluate(x_test,y_test,batch_size=batch_size) 203 | 204 | print("#---------------------------------------------------#") 205 | print("预测得分:{}".format(score)) 206 | print("预测准确率:{}".format(acc)) 207 | print("#---------------------------------------------------#") 208 | print("\n") 209 | 210 | # 模型预测 211 | 212 | predictions=model.predict(x_test) 213 | 214 | print("#---------------------------------------------------#") 215 | print("测试集的预测结果,对每个类有一个得分/概率,取值大对应的类别") 216 | print(predictions) 217 | print("#---------------------------------------------------#") 218 | print("\n") 219 | 220 | # 模型预测类别 221 | 222 | predict_class=model.predict_classes(x_test) 223 | 224 | print("#---------------------------------------------------#") 225 | print("测试集的预测类别") 226 | print(predict_class) 227 | print("#---------------------------------------------------#") 228 | print("\n") 229 | 230 | # 模型保存 231 | 232 | model.save(r"C:\Users\RY\Desktop\sentiment_analysis_lstm.h5") 233 | 234 | print("#---------------------------------------------------#") 235 | print("保存模型") 236 | print("#---------------------------------------------------#") 237 | print("\n") 238 | 239 | # 模型总结 240 | 241 | print("#---------------------------------------------------#") 242 | print("输出模型总结") 243 | print(model.summary()) 244 | print("#---------------------------------------------------#") 245 | print("\n") 246 | 247 | # 模型的配置文件 248 | 249 | config=model.get_config() 250 | 251 | print("#---------------------------------------------------#") 252 | print("输出模型配置信息") 253 | print(config) 254 | print("#---------------------------------------------------#") 255 | print("\n") 256 | -------------------------------------------------------------------------------- /keras_sentiment_analysis_v2.py: -------------------------------------------------------------------------------- 1 | # _*_ coding:utf-8 _*_ 2 | 3 | ''' 4 | @Author: Ruan Yang 5 | @Date: 2018.12.9 6 | @Purpose: 文本情感分析(positive,negative,neutral) 7 | @Reference: https://github.com/Edward1Chou/SentimentAnalysis 8 | @算法:CNN-LSTM 9 | @需要有事先标准好的数据集 10 | @positive: [1,0,0] 11 | @neutral: [0,1,0] 12 | @negative:[0,0,1] 13 | ''' 14 | 15 | import codecs 16 | import jieba 17 | 18 | datapaths=r"C:\Users\RY\Desktop\SentimentAnalysis-master\data\\" 19 | 20 | positive_data=[] 21 | y_positive=[] 22 | neutral_data=[] 23 | y_neutral=[] 24 | negative_data=[] 25 | y_negative=[] 26 | 27 | print("#------------------------------------------------------#") 28 | print("加载数据集") 29 | with codecs.open(datapaths+"pos.csv","r","utf-8") as f1,\ 30 | codecs.open(datapaths+"neutral.csv","r","utf-8") as f2,\ 31 | codecs.open(datapaths+"neg.csv","r","utf-8") as f3: 32 | for line in f1: 33 | positive_data.append(" ".join(i for i in jieba.lcut(line.strip(),cut_all=False))) 34 | y_positive.append([1,0,0]) 35 | for line in f2: 36 | neutral_data.append(" ".join(i for i in 
jieba.lcut(line.strip(),cut_all=False))) 37 | y_neutral.append([0,1,0]) 38 | for line in f3: 39 | negative_data.append(" ".join(i for i in jieba.lcut(line.strip(),cut_all=False))) 40 | y_negative.append([0,0,1]) 41 | 42 | print("positive data:{}".format(len(positive_data))) 43 | print("neutral data:{}".format(len(neutral_data))) 44 | print("negative data:{}".format(len(negative_data))) 45 | 46 | x_text=positive_data+neutral_data+negative_data 47 | y_label=y_positive+y_neutral+y_negative 48 | print("#------------------------------------------------------#") 49 | print("\n") 50 | 51 | from tensorflow.contrib import learn 52 | import tensorflow as tf 53 | import numpy as np 54 | import collections 55 | 56 | max_document_length=200 57 | min_frequency=1 58 | 59 | 60 | vocab = learn.preprocessing.VocabularyProcessor(max_document_length,min_frequency, tokenizer_fn=list) 61 | x = np.array(list(vocab.fit_transform(x_text))) 62 | vocab_dict = collections.OrderedDict(vocab.vocabulary_._mapping) 63 | 64 | with codecs.open(r"C:\Users\RY\Desktop\vocabulary.txt","w","utf-8") as f: 65 | for key,value in vocab_dict.items(): 66 | f.write("{} {}\n".format(key,value)) 67 | 68 | print("#----------------------------------------------------------#") 69 | print("\n") 70 | 71 | print("#----------------------------------------------------------#") 72 | print("数据混洗") 73 | np.random.seed(10) 74 | y=np.array(y_label) 75 | shuffle_indices = np.random.permutation(np.arange(len(y))) 76 | x_shuffled = x[shuffle_indices] 77 | y_shuffled = y[shuffle_indices] 78 | 79 | test_sample_percentage=0.2 80 | test_sample_index = -1 * int(test_sample_percentage * float(len(y))) 81 | x_train, x_test = x_shuffled[:test_sample_index], x_shuffled[test_sample_index:] 82 | y_train, y_test = y_shuffled[:test_sample_index], y_shuffled[test_sample_index:] 83 | 84 | train_positive_label=0 85 | train_neutral_label=0 86 | train_negative_label=0 87 | test_positive_label=0 88 | test_neutral_label=0 89 | test_negative_label=0 90 | 91 | for i in range(len(y_train)): 92 | if y_train[i,0] == 1: 93 | train_positive_label += 1 94 | elif y_train[i,1] == 1: 95 | train_neutral_label += 1 96 | else: 97 | train_negative_label += 1 98 | 99 | for i in range(len(y_test)): 100 | if y_test[i,0] == 1: 101 | test_positive_label += 1 102 | elif y_test[i,1] == 1: 103 | test_neutral_label += 1 104 | else: 105 | test_negative_label += 1 106 | 107 | print("训练集中 positive 样本个数:{}".format(train_positive_label)) 108 | print("训练集中 neutral 样本个数:{}".format(train_neutral_label)) 109 | print("训练集中 negative 样本个数:{}".format(train_negative_label)) 110 | print("测试集中 positive 样本个数:{}".format(test_positive_label)) 111 | print("测试集中 neutral 样本个数:{}".format(test_neutral_label)) 112 | print("测试集中 negative 样本个数:{}".format(test_negative_label)) 113 | 114 | print("#----------------------------------------------------------#") 115 | print("\n") 116 | 117 | print("#----------------------------------------------------------#") 118 | print("读取預训练词向量矩阵") 119 | 120 | pretrainpath=r"E:\中科大MS\預训练模型\\" 121 | 122 | embedding_index={} 123 | 124 | with codecs.open(pretrainpath+"sgns.wiki.bigram","r","utf-8") as f: 125 | line=f.readline() 126 | nwords=int(line.strip().split(" ")[0]) 127 | ndims=int(line.strip().split(" ")[1]) 128 | for line in f: 129 | values=line.split() 130 | words=values[0] 131 | coefs=np.asarray(values[1:],dtype="float32") 132 | embedding_index[words]=coefs 133 | 134 | print("預训练模型中Token总数:{} = {}".format(nwords,len(embedding_index))) 135 | print("預训练模型的维度:{}".format(ndims)) 136 | 
print("#----------------------------------------------------------#") 137 | print("\n") 138 | 139 | print("#----------------------------------------------------------#") 140 | print("将vocabulary中的 index-word 对应关系映射到 index-word vector形式") 141 | 142 | embedding_matrix=[] 143 | notfoundword=0 144 | 145 | for word in vocab_dict.keys(): 146 | if word in embedding_index.keys(): 147 | embedding_matrix.append(embedding_index[word]) 148 | else: 149 | notfoundword += 1 150 | embedding_matrix.append(np.random.uniform(-1,1,size=ndims)) 151 | 152 | embedding_matrix=np.array(embedding_matrix,dtype=np.float32) # 必须使用 np.float32 153 | print("词汇表中未找到单词个数:{}".format(notfoundword)) 154 | print("#----------------------------------------------------------#") 155 | print("\n") 156 | 157 | print("#---------------------------------------------------#") 158 | print("Build model .................") 159 | print("NN structure .......") 160 | print("Embedding layer --- CNN layer --- LSTM layer --- Dense layer") 161 | print("#---------------------------------------------------#") 162 | print("\n") 163 | 164 | from keras.models import Sequential 165 | from keras.layers import Dense, Dropout, Activation 166 | from keras.layers import Embedding 167 | from keras.layers import Conv1D, GlobalMaxPooling1D,MaxPooling1D 168 | from keras.layers import LSTM 169 | 170 | batch_size=64 171 | max_sentence_length=200 172 | lstm_output_size=128 173 | embedding_dims=ndims 174 | filters = 250 175 | kernel_size = 3 176 | dropout=0.2 177 | recurrent_dropout=0.2 178 | num_classes=3 179 | epochs=2 180 | 181 | # 定义网络结构 182 | model=Sequential() 183 | model.add(Embedding(len(vocab_dict), 184 | embedding_dims, 185 | weights=[embedding_matrix], 186 | input_length=max_sentence_length, 187 | trainable=False)) 188 | model.add(Dropout(dropout)) 189 | model.add(Conv1D(filters,kernel_size,padding="valid",activation="relu",strides=1)) 190 | model.add(MaxPooling1D()) 191 | model.add(LSTM(lstm_output_size)) 192 | model.add(Dense(num_classes)) 193 | model.add(Activation("sigmoid")) 194 | 195 | # 模型编译 196 | 197 | model.compile(loss="categorical_crossentropy",optimizer="adam",metrics=["accuracy"]) 198 | 199 | print("#---------------------------------------------------#") 200 | print("Train ....................") 201 | print("#---------------------------------------------------#") 202 | print("\n") 203 | 204 | model.fit(x_train,y_train,batch_size=batch_size,epochs=epochs,validation_data=(x_test,y_test)) 205 | 206 | # 训练得分和准确度 207 | 208 | score,acc=model.evaluate(x_test,y_test,batch_size=batch_size) 209 | 210 | print("#---------------------------------------------------#") 211 | print("预测得分:{}".format(score)) 212 | print("预测准确率:{}".format(acc)) 213 | print("#---------------------------------------------------#") 214 | print("\n") 215 | 216 | # 模型预测 217 | 218 | predictions=model.predict(x_test) 219 | 220 | print("#---------------------------------------------------#") 221 | print("测试集的预测结果,对每个类有一个得分/概率,取值大对应的类别") 222 | print(predictions) 223 | print("#---------------------------------------------------#") 224 | print("\n") 225 | 226 | # 模型预测类别 227 | 228 | predict_class=model.predict_classes(x_test) 229 | 230 | print("#---------------------------------------------------#") 231 | print("测试集的预测类别") 232 | print(predict_class) 233 | print("#---------------------------------------------------#") 234 | print("\n") 235 | 236 | # 模型保存 237 | 238 | model.save(r"C:\Users\RY\Desktop\sentiment_analysis_lstm.h5") 239 | 240 | 
print("#---------------------------------------------------#") 241 | print("保存模型") 242 | print("#---------------------------------------------------#") 243 | print("\n") 244 | 245 | # 模型总结 246 | 247 | print("#---------------------------------------------------#") 248 | print("输出模型总结") 249 | print(model.summary()) 250 | print("#---------------------------------------------------#") 251 | print("\n") 252 | 253 | # 模型的配置文件 254 | 255 | config=model.get_config() 256 | 257 | print("#---------------------------------------------------#") 258 | print("输出模型配置信息") 259 | print(config) 260 | print("#---------------------------------------------------#") 261 | print("\n") 262 | -------------------------------------------------------------------------------- /keras_sentiment_analysis_v3.py: -------------------------------------------------------------------------------- 1 | # _*_ coding:utf-8 _*_ 2 | 3 | ''' 4 | @Author: Ruan Yang 5 | @Date: 2018.12.9 6 | @Purpose: 文本情感分析(positive,negative,neutral) 7 | @Reference: https://github.com/Edward1Chou/SentimentAnalysis 8 | @算法:CNN 9 | @需要有事先标准好的数据集 10 | @positive: [1,0,0] 11 | @neutral: [0,1,0] 12 | @negative:[0,0,1] 13 | ''' 14 | 15 | import codecs 16 | import jieba 17 | 18 | datapaths=r"C:\Users\RY\Desktop\SentimentAnalysis-master\data\\" 19 | 20 | positive_data=[] 21 | y_positive=[] 22 | neutral_data=[] 23 | y_neutral=[] 24 | negative_data=[] 25 | y_negative=[] 26 | 27 | print("#------------------------------------------------------#") 28 | print("加载数据集") 29 | with codecs.open(datapaths+"pos.csv","r","utf-8") as f1,\ 30 | codecs.open(datapaths+"neutral.csv","r","utf-8") as f2,\ 31 | codecs.open(datapaths+"neg.csv","r","utf-8") as f3: 32 | for line in f1: 33 | positive_data.append(" ".join(i for i in jieba.lcut(line.strip(),cut_all=False))) 34 | y_positive.append([1,0,0]) 35 | for line in f2: 36 | neutral_data.append(" ".join(i for i in jieba.lcut(line.strip(),cut_all=False))) 37 | y_neutral.append([0,1,0]) 38 | for line in f3: 39 | negative_data.append(" ".join(i for i in jieba.lcut(line.strip(),cut_all=False))) 40 | y_negative.append([0,0,1]) 41 | 42 | print("positive data:{}".format(len(positive_data))) 43 | print("neutral data:{}".format(len(neutral_data))) 44 | print("negative data:{}".format(len(negative_data))) 45 | 46 | x_text=positive_data+neutral_data+negative_data 47 | y_label=y_positive+y_neutral+y_negative 48 | print("#------------------------------------------------------#") 49 | print("\n") 50 | 51 | from tensorflow.contrib import learn 52 | import tensorflow as tf 53 | import numpy as np 54 | import collections 55 | 56 | max_document_length=200 57 | min_frequency=1 58 | 59 | 60 | vocab = learn.preprocessing.VocabularyProcessor(max_document_length,min_frequency, tokenizer_fn=list) 61 | x = np.array(list(vocab.fit_transform(x_text))) 62 | vocab_dict = collections.OrderedDict(vocab.vocabulary_._mapping) 63 | 64 | with codecs.open(r"C:\Users\RY\Desktop\vocabulary.txt","w","utf-8") as f: 65 | for key,value in vocab_dict.items(): 66 | f.write("{} {}\n".format(key,value)) 67 | 68 | print("#----------------------------------------------------------#") 69 | print("\n") 70 | 71 | print("#----------------------------------------------------------#") 72 | print("数据混洗") 73 | np.random.seed(10) 74 | y=np.array(y_label) 75 | shuffle_indices = np.random.permutation(np.arange(len(y))) 76 | x_shuffled = x[shuffle_indices] 77 | y_shuffled = y[shuffle_indices] 78 | 79 | test_sample_percentage=0.2 80 | test_sample_index = -1 * int(test_sample_percentage * 
float(len(y))) 81 | x_train, x_test = x_shuffled[:test_sample_index], x_shuffled[test_sample_index:] 82 | y_train, y_test = y_shuffled[:test_sample_index], y_shuffled[test_sample_index:] 83 | 84 | train_positive_label=0 85 | train_neutral_label=0 86 | train_negative_label=0 87 | test_positive_label=0 88 | test_neutral_label=0 89 | test_negative_label=0 90 | 91 | for i in range(len(y_train)): 92 | if y_train[i,0] == 1: 93 | train_positive_label += 1 94 | elif y_train[i,1] == 1: 95 | train_neutral_label += 1 96 | else: 97 | train_negative_label += 1 98 | 99 | for i in range(len(y_test)): 100 | if y_test[i,0] == 1: 101 | test_positive_label += 1 102 | elif y_test[i,1] == 1: 103 | test_neutral_label += 1 104 | else: 105 | test_negative_label += 1 106 | 107 | print("训练集中 positive 样本个数:{}".format(train_positive_label)) 108 | print("训练集中 neutral 样本个数:{}".format(train_neutral_label)) 109 | print("训练集中 negative 样本个数:{}".format(train_negative_label)) 110 | print("测试集中 positive 样本个数:{}".format(test_positive_label)) 111 | print("测试集中 neutral 样本个数:{}".format(test_neutral_label)) 112 | print("测试集中 negative 样本个数:{}".format(test_negative_label)) 113 | 114 | print("#----------------------------------------------------------#") 115 | print("\n") 116 | 117 | print("#----------------------------------------------------------#") 118 | print("读取預训练词向量矩阵") 119 | 120 | pretrainpath=r"E:\中科大MS\預训练模型\\" 121 | 122 | embedding_index={} 123 | 124 | with codecs.open(pretrainpath+"sgns.wiki.bigram","r","utf-8") as f: 125 | line=f.readline() 126 | nwords=int(line.strip().split(" ")[0]) 127 | ndims=int(line.strip().split(" ")[1]) 128 | for line in f: 129 | values=line.split() 130 | words=values[0] 131 | coefs=np.asarray(values[1:],dtype="float32") 132 | embedding_index[words]=coefs 133 | 134 | print("預训练模型中Token总数:{} = {}".format(nwords,len(embedding_index))) 135 | print("預训练模型的维度:{}".format(ndims)) 136 | print("#----------------------------------------------------------#") 137 | print("\n") 138 | 139 | print("#----------------------------------------------------------#") 140 | print("将vocabulary中的 index-word 对应关系映射到 index-word vector形式") 141 | 142 | embedding_matrix=[] 143 | notfoundword=0 144 | 145 | for word in vocab_dict.keys(): 146 | if word in embedding_index.keys(): 147 | embedding_matrix.append(embedding_index[word]) 148 | else: 149 | notfoundword += 1 150 | embedding_matrix.append(np.random.uniform(-1,1,size=ndims)) 151 | 152 | embedding_matrix=np.array(embedding_matrix,dtype=np.float32) # 必须使用 np.float32 153 | print("词汇表中未找到单词个数:{}".format(notfoundword)) 154 | print("#----------------------------------------------------------#") 155 | print("\n") 156 | 157 | print("#---------------------------------------------------#") 158 | print("Build model .................") 159 | print("NN structure .......") 160 | print("Embedding layer --- CNN layer --- Dense layer --- Dense layer") 161 | print("#---------------------------------------------------#") 162 | print("\n") 163 | 164 | from keras.models import Sequential 165 | from keras.layers import Dense, Dropout, Activation 166 | from keras.layers import Embedding 167 | from keras.layers import Conv1D, GlobalMaxPooling1D,MaxPooling1D 168 | 169 | batch_size=64 170 | max_sentence_length=200 171 | embedding_dims=ndims 172 | filters = 250 173 | kernel_size = 3 174 | hidden_dims = 250 175 | dropout=0.2 176 | recurrent_dropout=0.2 177 | num_classes=3 178 | epochs=2 179 | 180 | # 定义网络结构 181 | model = Sequential() 182 | model.add(Embedding(len(vocab_dict), 183 | embedding_dims, 
184 | weights=[embedding_matrix], 185 | input_length=max_sentence_length, 186 | trainable=False)) 187 | model.add(Dropout(dropout)) 188 | model.add(Conv1D(filters, 189 | kernel_size, 190 | padding='valid', 191 | activation='relu', 192 | strides=1)) 193 | model.add(GlobalMaxPooling1D()) 194 | model.add(Dense(hidden_dims)) 195 | model.add(Dropout(dropout)) 196 | model.add(Activation('relu')) 197 | model.add(Dense(num_classes)) 198 | model.add(Activation('sigmoid')) 199 | 200 | # 模型编译 201 | 202 | model.compile(loss="categorical_crossentropy",optimizer="adam",metrics=["accuracy"]) 203 | 204 | print("#---------------------------------------------------#") 205 | print("Train ....................") 206 | print("#---------------------------------------------------#") 207 | print("\n") 208 | 209 | model.fit(x_train,y_train,batch_size=batch_size,epochs=epochs,validation_data=(x_test,y_test)) 210 | 211 | # 训练得分和准确度 212 | 213 | score,acc=model.evaluate(x_test,y_test,batch_size=batch_size) 214 | 215 | print("#---------------------------------------------------#") 216 | print("预测得分:{}".format(score)) 217 | print("预测准确率:{}".format(acc)) 218 | print("#---------------------------------------------------#") 219 | print("\n") 220 | 221 | # 模型预测 222 | 223 | predictions=model.predict(x_test) 224 | 225 | print("#---------------------------------------------------#") 226 | print("测试集的预测结果,对每个类有一个得分/概率,取值大对应的类别") 227 | print(predictions) 228 | print("#---------------------------------------------------#") 229 | print("\n") 230 | 231 | # 模型预测类别 232 | 233 | predict_class=model.predict_classes(x_test) 234 | 235 | print("#---------------------------------------------------#") 236 | print("测试集的预测类别") 237 | print(predict_class) 238 | print("#---------------------------------------------------#") 239 | print("\n") 240 | 241 | # 模型保存 242 | 243 | model.save(r"C:\Users\RY\Desktop\sentiment_analysis_lstm.h5") 244 | 245 | print("#---------------------------------------------------#") 246 | print("保存模型") 247 | print("#---------------------------------------------------#") 248 | print("\n") 249 | 250 | # 模型总结 251 | 252 | print("#---------------------------------------------------#") 253 | print("输出模型总结") 254 | print(model.summary()) 255 | print("#---------------------------------------------------#") 256 | print("\n") 257 | 258 | # 模型的配置文件 259 | 260 | config=model.get_config() 261 | 262 | print("#---------------------------------------------------#") 263 | print("输出模型配置信息") 264 | print(config) 265 | print("#---------------------------------------------------#") 266 | print("\n") 267 | -------------------------------------------------------------------------------- /keras_sentiment_analysis_v4.py: -------------------------------------------------------------------------------- 1 | # _*_ coding:utf-8 _*_ 2 | 3 | ''' 4 | @Author: Ruan Yang 5 | @Date: 2018.12.15 6 | @Purpose: 基于 keras 构建 textcnn 算法 7 | @Reference: https://www.cnblogs.com/bymo/p/9675654.html 8 | ''' 9 | 10 | import logging 11 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 12 | 13 | from keras import Input 14 | from keras.layers import Conv1D,MaxPool1D,Dense,Flatten,concatenate,Embedding 15 | from keras.models import Model 16 | 17 | import codecs 18 | import jieba 19 | 20 | datapaths=r"C:\Users\RY\Desktop\情感分析\SentimentAnalysis-master\data" 21 | 22 | positive_data=[] 23 | y_positive=[] 24 | neutral_data=[] 25 | y_neutral=[] 26 | negative_data=[] 27 | y_negative=[] 28 | 29 | 
print("#------------------------------------------------------#") 30 | print("加载数据集") 31 | with codecs.open(datapaths+"\\pos.csv","r","utf-8") as f1,\ 32 | codecs.open(datapaths+"\\neutral.csv","r","utf-8") as f2,\ 33 | codecs.open(datapaths+"\\neg.csv","r","utf-8") as f3: 34 | for line in f1: 35 | positive_data.append(" ".join(i for i in jieba.lcut(line.strip(),cut_all=False))) 36 | y_positive.append([1,0,0]) 37 | for line in f2: 38 | neutral_data.append(" ".join(i for i in jieba.lcut(line.strip(),cut_all=False))) 39 | y_neutral.append([0,1,0]) 40 | for line in f3: 41 | negative_data.append(" ".join(i for i in jieba.lcut(line.strip(),cut_all=False))) 42 | y_negative.append([0,0,1]) 43 | 44 | print("positive data:{}".format(len(positive_data))) 45 | print("neutral data:{}".format(len(neutral_data))) 46 | print("negative data:{}".format(len(negative_data))) 47 | 48 | x_text=positive_data+neutral_data+negative_data 49 | y_label=y_positive+y_neutral+y_negative 50 | print("#------------------------------------------------------#") 51 | print("\n") 52 | 53 | from tensorflow.contrib import learn 54 | import tensorflow as tf 55 | import numpy as np 56 | import collections 57 | 58 | max_document_length=200 59 | min_frequency=1 60 | 61 | 62 | vocab = learn.preprocessing.VocabularyProcessor(max_document_length,min_frequency, tokenizer_fn=list) 63 | x = np.array(list(vocab.fit_transform(x_text))) 64 | vocab_dict = collections.OrderedDict(vocab.vocabulary_._mapping) 65 | 66 | with codecs.open(r"C:\Users\RY\Desktop\vocabulary.txt","w","utf-8") as f: 67 | for key,value in vocab_dict.items(): 68 | f.write("{} {}\n".format(key,value)) 69 | 70 | print("#----------------------------------------------------------#") 71 | print("\n") 72 | 73 | print("#----------------------------------------------------------#") 74 | print("数据混洗") 75 | np.random.seed(10) 76 | y=np.array(y_label) 77 | shuffle_indices = np.random.permutation(np.arange(len(y))) 78 | x_shuffled = x[shuffle_indices] 79 | y_shuffled = y[shuffle_indices] 80 | 81 | test_sample_percentage=0.2 82 | test_sample_index = -1 * int(test_sample_percentage * float(len(y))) 83 | x_train, x_test = x_shuffled[:test_sample_index], x_shuffled[test_sample_index:] 84 | y_train, y_test = y_shuffled[:test_sample_index], y_shuffled[test_sample_index:] 85 | 86 | train_positive_label=0 87 | train_neutral_label=0 88 | train_negative_label=0 89 | test_positive_label=0 90 | test_neutral_label=0 91 | test_negative_label=0 92 | 93 | for i in range(len(y_train)): 94 | if y_train[i,0] == 1: 95 | train_positive_label += 1 96 | elif y_train[i,1] == 1: 97 | train_neutral_label += 1 98 | else: 99 | train_negative_label += 1 100 | 101 | for i in range(len(y_test)): 102 | if y_test[i,0] == 1: 103 | test_positive_label += 1 104 | elif y_test[i,1] == 1: 105 | test_neutral_label += 1 106 | else: 107 | test_negative_label += 1 108 | 109 | print("训练集中 positive 样本个数:{}".format(train_positive_label)) 110 | print("训练集中 neutral 样本个数:{}".format(train_neutral_label)) 111 | print("训练集中 negative 样本个数:{}".format(train_negative_label)) 112 | print("测试集中 positive 样本个数:{}".format(test_positive_label)) 113 | print("测试集中 neutral 样本个数:{}".format(test_neutral_label)) 114 | print("测试集中 negative 样本个数:{}".format(test_negative_label)) 115 | 116 | print("#----------------------------------------------------------#") 117 | print("\n") 118 | 119 | print("#----------------------------------------------------------#") 120 | print("读取預训练词向量矩阵") 121 | 122 | pretrainpath=r"E:\中科大MS\預训练模型\\" 123 | 124 | 
embedding_index={} 125 | 126 | with codecs.open(pretrainpath+"sgns.wiki.bigram","r","utf-8") as f: 127 | line=f.readline() 128 | nwords=int(line.strip().split(" ")[0]) 129 | ndims=int(line.strip().split(" ")[1]) 130 | for line in f: 131 | values=line.split() 132 | words=values[0] 133 | coefs=np.asarray(values[1:],dtype="float32") 134 | embedding_index[words]=coefs 135 | 136 | print("預训练模型中Token总数:{} = {}".format(nwords,len(embedding_index))) 137 | print("預训练模型的维度:{}".format(ndims)) 138 | print("#----------------------------------------------------------#") 139 | print("\n") 140 | 141 | print("#----------------------------------------------------------#") 142 | print("将vocabulary中的 index-word 对应关系映射到 index-word vector形式") 143 | 144 | embedding_matrix=[] 145 | notfoundword=0 146 | 147 | for word in vocab_dict.keys(): 148 | if word in embedding_index.keys(): 149 | embedding_matrix.append(embedding_index[word]) 150 | else: 151 | notfoundword += 1 152 | embedding_matrix.append(np.random.uniform(-1,1,size=ndims)) 153 | 154 | embedding_matrix=np.array(embedding_matrix,dtype=np.float32) # 必须使用 np.float32 155 | print("词汇表中未找到单词个数:{}".format(notfoundword)) 156 | print("#----------------------------------------------------------#") 157 | print("\n") 158 | 159 | # 定义 textcnn 函数 160 | 161 | def textcnn(filters,max_sequence_length,max_token_num,embedding_dim,num_classes,embedding_matrix=None): 162 | ''' 163 | TextCNN Network Structure 164 | 1. Embedding layers 165 | 2. Convolution layer 166 | 3. max-pooling 167 | 4. softmax layer 168 | 169 | max_sequence_length: 输入句子的最大长度 170 | max_token_num: 这个是指最大的 token 个数,感觉上是针对 英文字符 说的 171 | embedding_dim:嵌入矩阵的维度 172 | num_classes: 输出类别个数,二分类就设定为 2 173 | embedding_matrix=None:这个的意思是是否适用 嵌入矩阵 174 | ''' 175 | x_input=Input(shape=(max_sequence_length,)) 176 | logging.info("x_input.shape:{}".format(str(x_input.shape))) 177 | 178 | if embedding_matrix is None: 179 | x_emb=Embedding(input_dim=max_token_num,output_dim=embedding_dim,input_length=max_sequence_length)(x_input) 180 | else: 181 | x_emb=Embedding(input_dim=max_token_num,output_dim=embedding_dim,input_length=max_sequence_length,\ 182 | weights=[embedding_matrix],trainable=False)(x_input) 183 | 184 | logging.info("x_emb.shape:{}".format(str(x_emb.shape))) 185 | 186 | pool_output=[] 187 | kernel_sizes=[2,3,4] 188 | for kernel_size in kernel_sizes: 189 | c=Conv1D(filters=filters,kernel_size=kernel_size,strides=1)(x_emb) 190 | #p=MaxPool1D(pool_size=int(c.shape[1]))(c) 191 | p=MaxPool1D(max_sequence_length-kernel_size+1)(c) 192 | pool_output.append(p) 193 | logging.info("kernel_size:{} \t c.shape:{} \t p.shape:{}".format(kernel_size,str(c.shape),str(p.shape))) 194 | #pool_output = concatenate([p for p in pool_output]) 195 | pool_output = concatenate(pool_output,axis=1) 196 | x_flatten=Flatten()(pool_output) # (?,6) 197 | y=Dense(num_classes,activation="softmax")(x_flatten) # (?,2) 198 | 199 | logging.info("y.shape:{}\n".format(str(y.shape))) 200 | 201 | model=Model([x_input],outputs=[y]) 202 | model.summary() 203 | 204 | return model 205 | 206 | # 定义模型超参数 207 | 208 | filters=128 209 | max_sequence_length=max_document_length 210 | max_token_num=len(vocab_dict) 211 | embedding_dim=ndims 212 | num_classes=3 213 | embedding_matrix=embedding_matrix 214 | batch_size=64 215 | epochs=2 216 | 217 | # 获得模型 218 | model=textcnn(filters,max_sequence_length,max_token_num,embedding_dim,num_classes,embedding_matrix=embedding_matrix) 219 | model.compile(loss="categorical_crossentropy",optimizer="adam",metrics=["accuracy"]) 220 | 221 
| # 模型训练
222 | model.fit(x_train,y_train,batch_size=batch_size,epochs=epochs,validation_data=(x_test,y_test))
223 | 
224 | # 训练得分和准确度
225 | 
226 | score,acc=model.evaluate(x_test,y_test,batch_size=batch_size)
227 | 
228 | print("#---------------------------------------------------#")
229 | print("预测得分:{}".format(score))
230 | print("预测准确率:{}".format(acc))
231 | print("#---------------------------------------------------#")
232 | print("\n")
233 | 
234 | # 模型预测
235 | 
236 | predictions=model.predict(x_test)
237 | 
238 | print("#---------------------------------------------------#")
239 | print("测试集的预测结果,对每个类有一个得分/概率,取值大对应的类别")
240 | print(predictions)
241 | print("#---------------------------------------------------#")
242 | print("\n")
243 | 
244 | # 模型预测类别
245 | # A functional-API Model has no predict_classes(); take the argmax of the predicted probabilities instead
246 | predict_class=np.argmax(predictions,axis=1)
247 | 
248 | print("#---------------------------------------------------#")
249 | print("测试集的预测类别")
250 | print(predict_class)
251 | print("#---------------------------------------------------#")
252 | print("\n")
253 | 
254 | # 模型保存
255 | 
256 | model.save(r"C:\Users\RY\Desktop\sentiment_analysis_textcnn.h5")
257 | 
258 | print("#---------------------------------------------------#")
259 | print("保存模型")
260 | print("#---------------------------------------------------#")
261 | print("\n")
262 | 
263 | # 模型总结
264 | 
265 | print("#---------------------------------------------------#")
266 | print("输出模型总结")
267 | print(model.summary())
268 | print("#---------------------------------------------------#")
269 | print("\n")
270 | 
271 | # 模型的配置文件
272 | 
273 | config=model.get_config()
274 | 
275 | print("#---------------------------------------------------#")
276 | print("输出模型配置信息")
277 | print(config)
278 | print("#---------------------------------------------------#")
279 | print("\n")
280 | 
281 | # 模型的画图和图片保存
282 | 
283 | import matplotlib.pyplot as plt
284 | import matplotlib.image as mpimg
285 | from keras.utils import plot_model
286 | 
287 | plot_model(model,to_file="model.jpg",show_shapes=True)
288 | lena=mpimg.imread("model.jpg")
289 | lena.shape
290 | plt.imshow(lena)
291 | plt.axis("off")
292 | plt.show()
--------------------------------------------------------------------------------
/keras_sentiment_analysis_v5.py:
--------------------------------------------------------------------------------
1 | # _*_ coding:utf-8 _*_
2 | 
3 | '''
4 | @Author: Ruan Yang
5 | @Date: 2018.12.9
6 | @Purpose: 文本情感分析(positive,negative,neutral)
7 | @Reference: https://github.com/Edward1Chou/SentimentAnalysis
8 | @算法:Bi_LSTM
9 | @需要有事先标注好的数据集
10 | @positive: [1,0,0]
11 | @neutral: [0,1,0]
12 | @negative:[0,0,1]
13 | '''
14 | import logging
15 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
16 | 
17 | import codecs
18 | import jieba
19 | 
20 | datapaths=r"C:\Users\RY\Desktop\情感分析\SentimentAnalysis-master\data\\"
21 | 
22 | positive_data=[]
23 | y_positive=[]
24 | neutral_data=[]
25 | y_neutral=[]
26 | negative_data=[]
27 | y_negative=[]
28 | 
29 | print("#------------------------------------------------------#")
30 | print("加载数据集")
31 | with codecs.open(datapaths+"pos.csv","r","utf-8") as f1,\
32 | codecs.open(datapaths+"neutral.csv","r","utf-8") as f2,\
33 | codecs.open(datapaths+"neg.csv","r","utf-8") as f3:
34 | for line in f1:
35 | positive_data.append(" ".join(i for i in jieba.lcut(line.strip(),cut_all=False)))
36 | y_positive.append([1,0,0])
37 | for line in f2:
38 | neutral_data.append(" ".join(i for i in jieba.lcut(line.strip(),cut_all=False)))
39 | 
y_neutral.append([0,1,0]) 40 | for line in f3: 41 | negative_data.append(" ".join(i for i in jieba.lcut(line.strip(),cut_all=False))) 42 | y_negative.append([0,0,1]) 43 | 44 | print("positive data:{}".format(len(positive_data))) 45 | print("neutral data:{}".format(len(neutral_data))) 46 | print("negative data:{}".format(len(negative_data))) 47 | 48 | x_text=positive_data+neutral_data+negative_data 49 | y_label=y_positive+y_neutral+y_negative 50 | print("#------------------------------------------------------#") 51 | print("\n") 52 | 53 | from tensorflow.contrib import learn 54 | import tensorflow as tf 55 | import numpy as np 56 | import collections 57 | 58 | max_document_length=200 59 | min_frequency=1 60 | 61 | 62 | vocab = learn.preprocessing.VocabularyProcessor(max_document_length,min_frequency, tokenizer_fn=list) 63 | x = np.array(list(vocab.fit_transform(x_text))) 64 | vocab_dict = collections.OrderedDict(vocab.vocabulary_._mapping) 65 | 66 | with codecs.open(r"C:\Users\RY\Desktop\vocabulary.txt","w","utf-8") as f: 67 | for key,value in vocab_dict.items(): 68 | f.write("{} {}\n".format(key,value)) 69 | 70 | print("#----------------------------------------------------------#") 71 | print("\n") 72 | 73 | print("#----------------------------------------------------------#") 74 | print("数据混洗") 75 | np.random.seed(10) 76 | y=np.array(y_label) 77 | shuffle_indices = np.random.permutation(np.arange(len(y))) 78 | x_shuffled = x[shuffle_indices] 79 | y_shuffled = y[shuffle_indices] 80 | 81 | test_sample_percentage=0.2 82 | test_sample_index = -1 * int(test_sample_percentage * float(len(y))) 83 | x_train, x_test = x_shuffled[:test_sample_index], x_shuffled[test_sample_index:] 84 | y_train, y_test = y_shuffled[:test_sample_index], y_shuffled[test_sample_index:] 85 | 86 | train_positive_label=0 87 | train_neutral_label=0 88 | train_negative_label=0 89 | test_positive_label=0 90 | test_neutral_label=0 91 | test_negative_label=0 92 | 93 | for i in range(len(y_train)): 94 | if y_train[i,0] == 1: 95 | train_positive_label += 1 96 | elif y_train[i,1] == 1: 97 | train_neutral_label += 1 98 | else: 99 | train_negative_label += 1 100 | 101 | for i in range(len(y_test)): 102 | if y_test[i,0] == 1: 103 | test_positive_label += 1 104 | elif y_test[i,1] == 1: 105 | test_neutral_label += 1 106 | else: 107 | test_negative_label += 1 108 | 109 | print("训练集中 positive 样本个数:{}".format(train_positive_label)) 110 | print("训练集中 neutral 样本个数:{}".format(train_neutral_label)) 111 | print("训练集中 negative 样本个数:{}".format(train_negative_label)) 112 | print("测试集中 positive 样本个数:{}".format(test_positive_label)) 113 | print("测试集中 neutral 样本个数:{}".format(test_neutral_label)) 114 | print("测试集中 negative 样本个数:{}".format(test_negative_label)) 115 | 116 | print("#----------------------------------------------------------#") 117 | print("\n") 118 | 119 | print("#----------------------------------------------------------#") 120 | print("读取預训练词向量矩阵") 121 | 122 | pretrainpath=r"E:\中科大MS\預训练模型\\" 123 | 124 | embedding_index={} 125 | 126 | with codecs.open(pretrainpath+"sgns.wiki.bigram","r","utf-8") as f: 127 | line=f.readline() 128 | nwords=int(line.strip().split(" ")[0]) 129 | ndims=int(line.strip().split(" ")[1]) 130 | for line in f: 131 | values=line.split() 132 | words=values[0] 133 | coefs=np.asarray(values[1:],dtype="float32") 134 | embedding_index[words]=coefs 135 | 136 | print("預训练模型中Token总数:{} = {}".format(nwords,len(embedding_index))) 137 | print("預训练模型的维度:{}".format(ndims)) 138 | 
print("#----------------------------------------------------------#") 139 | print("\n") 140 | 141 | print("#----------------------------------------------------------#") 142 | print("将vocabulary中的 index-word 对应关系映射到 index-word vector形式") 143 | 144 | embedding_matrix=[] 145 | notfoundword=0 146 | 147 | for word in vocab_dict.keys(): 148 | if word in embedding_index.keys(): 149 | embedding_matrix.append(embedding_index[word]) 150 | else: 151 | notfoundword += 1 152 | embedding_matrix.append(np.random.uniform(-1,1,size=ndims)) 153 | 154 | embedding_matrix=np.array(embedding_matrix,dtype=np.float32) # 必须使用 np.float32 155 | print("词汇表中未找到单词个数:{}".format(notfoundword)) 156 | print("#----------------------------------------------------------#") 157 | print("\n") 158 | 159 | print("#---------------------------------------------------#") 160 | print("Build model .................") 161 | print("NN structure .......") 162 | print("Embedding layer --- Bi_LSTM layer --- Dense layer") 163 | print("#---------------------------------------------------#") 164 | print("\n") 165 | 166 | from keras.models import Sequential 167 | from keras.layers import Dense, Dropout, Activation 168 | from keras.layers import Embedding,Bidirectional 169 | from keras.layers import Conv1D, GlobalMaxPooling1D 170 | from keras.layers import LSTM 171 | 172 | batch_size=64 173 | max_sentence_length=200 174 | embedding_dims=ndims 175 | dropout=0.2 176 | recurrent_dropout=0.2 177 | num_classes=3 178 | epochs=2 179 | 180 | # 定义网络结构 181 | model=Sequential() 182 | model.add(Embedding(len(vocab_dict), 183 | embedding_dims, 184 | weights=[embedding_matrix], 185 | input_length=max_sentence_length, 186 | trainable=False)) 187 | model.add(Dropout(dropout)) 188 | model.add(Bidirectional(LSTM(64))) 189 | model.add(Dropout(dropout)) 190 | model.add(Dense(num_classes,activation="sigmoid")) 191 | 192 | # 模型编译 193 | 194 | model.compile(loss="categorical_crossentropy",optimizer="adam",metrics=["accuracy"]) 195 | 196 | print("#---------------------------------------------------#") 197 | print("Train ....................") 198 | print("#---------------------------------------------------#") 199 | print("\n") 200 | 201 | model.fit(x_train,y_train,batch_size=batch_size,epochs=epochs,validation_data=(x_test,y_test)) 202 | 203 | # 训练得分和准确度 204 | 205 | score,acc=model.evaluate(x_test,y_test,batch_size=batch_size) 206 | 207 | print("#---------------------------------------------------#") 208 | print("预测得分:{}".format(score)) 209 | print("预测准确率:{}".format(acc)) 210 | print("#---------------------------------------------------#") 211 | print("\n") 212 | 213 | # 模型预测 214 | 215 | predictions=model.predict(x_test) 216 | 217 | print("#---------------------------------------------------#") 218 | print("测试集的预测结果,对每个类有一个得分/概率,取值大对应的类别") 219 | print(predictions) 220 | print("#---------------------------------------------------#") 221 | print("\n") 222 | 223 | # 模型预测类别 224 | 225 | predict_class=model.predict_classes(x_test) 226 | 227 | print("#---------------------------------------------------#") 228 | print("测试集的预测类别") 229 | print(predict_class) 230 | print("#---------------------------------------------------#") 231 | print("\n") 232 | 233 | # 模型保存 234 | 235 | model.save(r"C:\Users\RY\Desktop\sentiment_analysis_lstm.h5") 236 | 237 | print("#---------------------------------------------------#") 238 | print("保存模型") 239 | print("#---------------------------------------------------#") 240 | print("\n") 241 | 242 | # 模型总结 243 | 244 | 
print("#---------------------------------------------------#") 245 | print("输出模型总结") 246 | print(model.summary()) 247 | print("#---------------------------------------------------#") 248 | print("\n") 249 | 250 | # 模型的配置文件 251 | 252 | config=model.get_config() 253 | 254 | print("#---------------------------------------------------#") 255 | print("输出模型配置信息") 256 | print(config) 257 | print("#---------------------------------------------------#") 258 | print("\n") 259 | -------------------------------------------------------------------------------- /sentiment_analysis_ml.py: -------------------------------------------------------------------------------- 1 | # _*_ coding:utf-8 _*_ 2 | 3 | ''' 4 | @Author: Ruan Yang 5 | @Date: 2018.12.16 6 | @Purpose: 使用传统的机器学习的方法进行文本情感分析 7 | ''' 8 | 9 | import codecs 10 | import jieba 11 | import numpy as np 12 | 13 | from gensim.models.word2vec import Word2Vec 14 | from sklearn.externals import joblib 15 | from sklearn.svm import SVC 16 | from sklearn.naive_bayes import GaussianNB 17 | from sklearn.tree import DecisionTreeClassifier 18 | from sklearn.ensemble import RandomForestClassifier 19 | from sklearn.ensemble import ExtraTreesClassifier 20 | from sklearn.ensemble import GradientBoostingClassifier 21 | from sklearn.neural_network import MLPClassifier 22 | from sklearn import neighbors 23 | from sklearn.linear_model import LogisticRegression 24 | from sklearn.linear_model import SGDClassifier 25 | 26 | datapaths=r"C:\Users\RY\Desktop\情感分析\SentimentAnalysis-master\data\\" 27 | storedpaths=r"C:\Users\RY\Desktop\\" 28 | 29 | positive_data=[] 30 | y_positive=[] 31 | neutral_data=[] 32 | y_neutral=[] 33 | negative_data=[] 34 | y_negative=[] 35 | 36 | print("#------------------------------------------------------#") 37 | print("加载数据集") 38 | with codecs.open(datapaths+"pos.csv","r","utf-8") as f1,\ 39 | codecs.open(datapaths+"neutral.csv","r","utf-8") as f2,\ 40 | codecs.open(datapaths+"neg.csv","r","utf-8") as f3: 41 | for line in f1: 42 | positive_data.append(" ".join(i for i in jieba.lcut(line.strip(),cut_all=False))) 43 | #y_positive.append([1,0,0]) 44 | y_positive.append([0]) 45 | for line in f2: 46 | neutral_data.append(" ".join(i for i in jieba.lcut(line.strip(),cut_all=False))) 47 | #y_neutral.append([0,1,0]) 48 | y_neutral.append([1]) 49 | for line in f3: 50 | negative_data.append(" ".join(i for i in jieba.lcut(line.strip(),cut_all=False))) 51 | #y_negative.append([0,0,1]) 52 | y_negative.append([2]) 53 | 54 | print("positive data:{}".format(len(positive_data))) 55 | print("neutral data:{}".format(len(neutral_data))) 56 | print("negative data:{}".format(len(negative_data))) 57 | 58 | x_text=positive_data+neutral_data+negative_data 59 | y_label=y_positive+y_neutral+y_negative 60 | print("#------------------------------------------------------#") 61 | print("\n") 62 | 63 | # 数据集混洗 64 | 65 | shuffle_indices = np.random.permutation(np.arange(len(y_label))) 66 | train_test_percent=0.2 67 | 68 | x_train=[] 69 | x_test=[] 70 | y_train=[] 71 | y_test=[] 72 | 73 | for i in shuffle_indices[:-(int(len(shuffle_indices)*train_test_percent))]: 74 | x_train.append(x_text[i]) 75 | y_train.append(y_label[i]) 76 | 77 | for i in shuffle_indices[-(int(len(shuffle_indices)*train_test_percent)):]: 78 | x_test.append(x_text[i]) 79 | y_test.append(y_label[i]) 80 | 81 | x_train_pos=0 82 | x_train_neu=0 83 | x_train_neg=0 84 | 85 | x_test_pos=0 86 | x_test_neu=0 87 | x_test_neg=0 88 | 89 | for i in y_train: 90 | if i[0] == 0: 91 | x_train_pos += 1 92 | elif i[0] == 1: 93 | 
x_train_neu += 1 94 | else: 95 | x_train_neg += 1 96 | 97 | for i in y_test: 98 | if i[0] == 0: 99 | x_test_pos += 1 100 | elif i[0] == 1: 101 | x_test_neu += 1 102 | else: 103 | x_test_neg += 1 104 | 105 | print("#------------------------------------------------------#") 106 | print("保存标签数据") 107 | np.save(storedpaths+"y_train.npy",np.array(y_train)) 108 | np.save(storedpaths+"y_test.npy",np.array(y_test)) 109 | print("训练集总数:{}".format(len(x_train))) 110 | print("训练集正样本:{}".format(x_train_pos)) 111 | print("训练集中性样本:{}".format(x_train_neu)) 112 | print("训练集负样本:{}".format(x_train_neg)) 113 | print("测试集总数:{}".format(len(x_test))) 114 | print("测试集正样本:{}".format(x_test_pos)) 115 | print("测试集中性样本:{}".format(x_test_neu)) 116 | print("测试集负样本:{}".format(x_test_neg)) 117 | print("#------------------------------------------------------#") 118 | print("\n") 119 | 120 | 121 | #对每个句子的所有词向量取均值 122 | # text 需要是切完词的 词列表 123 | # size 一般是词向量的维度 124 | # word_vector_model: 训练好的词向量模型 (一般使用 gensim 中的 WordVector 进行词向量训练) 125 | # 或者是直接加载训练好的模型 126 | 127 | def buildWordVector(text,size,word_vector_model): 128 | vec = np.zeros(size).reshape((1, size)) 129 | count = 0. 130 | for word in text: 131 | try: 132 | vec += word_vector_model[word].reshape((1, size)) 133 | count += 1. 134 | except KeyError: 135 | continue 136 | if count != 0: 137 | vec /= count 138 | return vec 139 | 140 | # 计算词向量 141 | 142 | def get_train_vecs(x_train,x_test,n_dim): 143 | ''' 144 | x_train: 训练集 145 | x_test: 测试集 146 | n_dim: 训练词向量的维度 147 | ''' 148 | n_dim=n_dim 149 | # 初始化模型和生成词汇表 150 | all_text=x_train+x_test 151 | text_w2v=Word2Vec(size=n_dim,min_count=5,workers=1) 152 | text_w2v.build_vocab(all_text) 153 | text_w2v.train(all_text,total_examples=text_w2v.corpus_count,epochs=5) 154 | 155 | # 分别得到训练集和测试集文本的词向量合集,这个数据集就很大了 156 | 157 | train_vecs=np.concatenate([buildWordVector(text,n_dim,text_w2v) for text in x_train]) 158 | np.save(storedpaths+"train_vecs.npy",train_vecs) 159 | print("训练集数据的词向量维度:{}".format(train_vecs.shape)) 160 | 161 | test_vecs=np.concatenate([buildWordVector(text,n_dim,text_w2v) for text in x_test]) 162 | np.save(storedpaths+"test_vecs.npy",test_vecs) 163 | print("测试集数据的词向量维度:{}".format(test_vecs.shape)) 164 | 165 | # 保存词向量 166 | text_w2v.save(storedpaths+"w2v_model.pkl") 167 | 168 | # 加载向量化的文本和标签 169 | 170 | def get_data(): 171 | train_vecs=np.load(storedpaths+'train_vecs.npy') 172 | y_train=np.load(storedpaths+'y_train.npy') 173 | test_vecs=np.load(storedpaths+'test_vecs.npy') 174 | y_test=np.load(storedpaths+'y_test.npy') 175 | return train_vecs,y_train,test_vecs,y_test 176 | 177 | # 训练svm模型 178 | 179 | def svm_train(train_vecs,y_train,test_vecs,y_test): 180 | clf=SVC(kernel='rbf',verbose=True) 181 | clf.fit(train_vecs,y_train) 182 | joblib.dump(clf,storedpaths+'model.pkl') 183 | test_scores=clf.score(test_vecs,y_test) 184 | return test_scores 185 | 186 | # 训练朴素贝叶斯模型 187 | 188 | def NB_train(train_vecs,y_train,test_vecs,y_test): 189 | gnb = GaussianNB() 190 | gnb.fit(train_vecs,y_train) 191 | joblib.dump(gnb,storedpaths+'model_gnb.pkl') 192 | test_scores=gnb.score(test_vecs,y_test) 193 | return test_scores 194 | 195 | # 训练决策树模型 196 | 197 | def decision_tree(train_vecs,y_train,test_vecs,y_test): 198 | clf=DecisionTreeClassifier(max_depth=10, min_samples_split=2,random_state=0) 199 | clf.fit(train_vecs,y_train) 200 | joblib.dump(clf,storedpaths+'model_dtree.pkl') 201 | test_scores=clf.score(test_vecs,y_test) 202 | return test_scores 203 | 204 | # 训练随机森林算法 205 | 206 | def 
random_forest(train_vecs,y_train,test_vecs,y_test): 207 | clf = RandomForestClassifier(n_estimators=10, max_depth=10,min_samples_split=2,n_jobs=1,random_state=0) 208 | clf.fit(train_vecs,y_train) 209 | joblib.dump(clf,storedpaths+'model_randomforest.pkl') 210 | test_scores=clf.score(test_vecs,y_test) 211 | return test_scores 212 | 213 | # 训练 ExtraTreesClassifier 分类算法 214 | 215 | def extract_tree(train_vecs,y_train,test_vecs,y_test): 216 | clf = ExtraTreesClassifier(n_estimators=10, max_depth=10,min_samples_split=2,n_jobs=1,random_state=0) 217 | clf.fit(train_vecs,y_train) 218 | joblib.dump(clf,storedpaths+'model_extracttree.pkl') 219 | test_scores=clf.score(test_vecs,y_test) 220 | return test_scores 221 | 222 | # 训练 GBDT 分类算法 223 | 224 | def gbdt_classifier(train_vecs,y_train,test_vecs,y_test): 225 | clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=10,random_state=0) 226 | clf.fit(train_vecs,y_train) 227 | joblib.dump(clf,storedpaths+'model_gbdt.pkl') 228 | test_scores=clf.score(test_vecs,y_test) 229 | return test_scores 230 | 231 | # 训练近邻分类算法 232 | 233 | def nn_classifier(n_neighbors,train_vecs,y_train,test_vecs,y_test): 234 | clf = neighbors.KNeighborsClassifier(n_neighbors, weights='uniform') 235 | clf.fit(train_vecs,y_train) 236 | joblib.dump(clf,storedpaths+'model_nn.pkl') 237 | test_scores=clf.score(test_vecs,y_test) 238 | return test_scores 239 | 240 | # 训练 LogisticRegression 分类算法 241 | 242 | def LR_classifier(train_vecs,y_train,test_vecs,y_test): 243 | clf = LogisticRegression(C=50. / len(y_train),multi_class='multinomial',\ 244 | penalty='l1', solver='saga', tol=0.1) 245 | clf.fit(train_vecs,y_train) 246 | joblib.dump(clf,storedpaths+'model_lr.pkl') 247 | test_scores=clf.score(test_vecs,y_test) 248 | return test_scores 249 | 250 | # 训练 随机梯度下降 分类算法 251 | 252 | def SGD_classifier(train_vecs,y_train,test_vecs,y_test): 253 | clf = SGDClassifier(alpha=0.001, max_iter=100) 254 | clf.fit(train_vecs,y_train) 255 | joblib.dump(clf,storedpaths+'model_sgd.pkl') 256 | test_scores=clf.score(test_vecs,y_test) 257 | return test_scores 258 | 259 | # 训练多层感知机分类算法 260 | 261 | def MP_classifier(train_vecs,y_train,test_vecs,y_test): 262 | clf = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5, 2), random_state=1) 263 | clf.fit(train_vecs,y_train) 264 | joblib.dump(clf,storedpaths+'model_mp.pkl') 265 | test_scores=clf.score(test_vecs,y_test) 266 | return test_scores 267 | 268 | 269 | # 得到待预测单个句子的词向量 270 | # 预先进行分词操作 271 | 272 | def get_predict_vecs(string,n_dim,w2v_model_path): 273 | ''' 274 | string: 输入的句子 275 | n_dim: 词向量维度 276 | w2v_model_path: 預训练词向量的模型路径 277 | ''' 278 | n_dim = n_dim 279 | text_w2v = Word2Vec.load(w2v_model_path) 280 | words=[i for i in jieba.cut(string,cut_all=False)] 281 | train_vecs = buildWordVector(words, n_dim,text_w2v) 282 | 283 | return train_vecs 284 | 285 | # 调用训练模型进行预测 286 | 287 | def svm_predict(string,trainmodelpath): 288 | words_vecs=get_predict_vecs(string) 289 | clf=joblib.load(trainmodelpath) 290 | result=clf.predict(words_vecs) 291 | 292 | return result 293 | 294 | # Train model 295 | 296 | n_dim=300 297 | n_neighbors=10 298 | #get_train_vecs(x_train,x_test,n_dim) 299 | 300 | 301 | train_vecs,y_train,test_vecs,y_test=get_data() 302 | test_scores=svm_train(train_vecs,y_train,test_vecs,y_test) 303 | print("#----------------------------------------#") 304 | print("SVM测试集测试得分:{}".format(test_scores)) 305 | print("#----------------------------------------#") 306 | test_scores=NB_train(train_vecs,y_train,test_vecs,y_test) 307 
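# Two suggested fixes for the helpers above (sketches, not in the original script):
# 1. get_train_vecs() feeds space-joined strings to Word2Vec, so gensim iterates them
#    character by character; to train on the jieba tokens, pass token lists instead:
#        all_text = [sentence.split() for sentence in x_train + x_test]
#    (and split each sentence the same way before averaging in buildWordVector).
# 2. svm_predict() calls get_predict_vecs(string) with a single argument, but that
#    function also needs the vector dimension and the word2vec model path, e.g.
#        def svm_predict(string, trainmodelpath, n_dim, w2v_model_path):
#            words_vecs = get_predict_vecs(string, n_dim, w2v_model_path)
#            clf = joblib.load(trainmodelpath)
#            return clf.predict(words_vecs)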
| print("#----------------------------------------#") 308 | print("NB测试集测试得分:{}".format(test_scores)) 309 | print("#----------------------------------------#") 310 | test_scores=nn_classifier(n_neighbors,train_vecs,y_train,test_vecs,y_test) 311 | print("#----------------------------------------#") 312 | print("NN测试集测试得分:{}".format(test_scores)) 313 | print("#----------------------------------------#") 314 | test_scores=LR_classifier(train_vecs,y_train,test_vecs,y_test) 315 | print("#----------------------------------------#") 316 | print("LR测试集测试得分:{}".format(test_scores)) 317 | print("#----------------------------------------#") 318 | test_scores=SGD_classifier(train_vecs,y_train,test_vecs,y_test) 319 | print("#----------------------------------------#") 320 | print("SGD测试集测试得分:{}".format(test_scores)) 321 | print("#----------------------------------------#") 322 | test_scores=decision_tree(train_vecs,y_train,test_vecs,y_test) 323 | print("#----------------------------------------#") 324 | print("TREE测试集测试得分:{}".format(test_scores)) 325 | print("#----------------------------------------#") 326 | test_scores=random_forest(train_vecs,y_train,test_vecs,y_test) 327 | print("#----------------------------------------#") 328 | print("Random_Forest测试集测试得分:{}".format(test_scores)) 329 | print("#----------------------------------------#") 330 | test_scores=extract_tree(train_vecs,y_train,test_vecs,y_test) 331 | print("#----------------------------------------#") 332 | print("Extract_Tree测试集测试得分:{}".format(test_scores)) 333 | print("#----------------------------------------#") 334 | test_scores=gbdt_classifier(train_vecs,y_train,test_vecs,y_test) 335 | print("#----------------------------------------#") 336 | print("GBDT_Tree测试集测试得分:{}".format(test_scores)) 337 | print("#----------------------------------------#") 338 | test_scores=MP_classifier(train_vecs,y_train,test_vecs,y_test) 339 | print("#----------------------------------------#") 340 | print("MP测试集测试得分:{}".format(test_scores)) 341 | print("#----------------------------------------#") --------------------------------------------------------------------------------