├── .idea
│   ├── Taobao_SentimentAnalysis.iml
│   ├── encodings.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── vcs.xml
│   └── workspace.xml
├── LICENSE
├── README.md
├── assets
│   ├── fig.png
│   ├── res.png
│   ├── screenshot.png
│   └── word_cloud.png
├── comment_data.csv
├── processed_comment_data.csv
├── requirements.txt
├── result.csv
├── sentiment_analysis.py
├── taobao_spider.py
└── word_cloud.py
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 ElecRex
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 😼😽😻JD_SentimentAnalysis🤟
2 | 
3 | 
4 | ## Introduction
5 |
6 | Scrapes roughly 10 product reviews for each of 100 randomly selected books on JD.com (about 1,000 reviews in total),
7 | runs NLP sentiment analysis and simple descriptive statistics on them, and generates a word cloud of high-frequency terms from the reviews.
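
End to end, the pipeline looks roughly like this (an illustrative sketch, not part of the repository; it assumes the trained snownlp sentiment model described under Details is already in place):

```python
# Illustrative end-to-end run (not from the repo's own docs); it strings together
# the modules below in the order the project uses them.
from taobao_spider import Spider
import sentiment_analysis

Spider().main()                                              # scrape reviews  -> comment_data.csv
sentiment_analysis.processed_data('processed_comment_data')  # clean the data  -> processed_comment_data.csv
sentiment_analysis.test('processed_comment_data', 'result')  # classify        -> result.csv
sentiment_analysis.data_virtualization()                     # bar chart       -> assets/fig.png
sentiment_analysis.word_cloud_show()                         # word cloud      -> assets/word_cloud.png
```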
8 |
9 | ## Download
10 |
11 | ```bash
12 | $ git clone https://github.com/ElecRex/Taobao_SentimentAnalysis.git
13 | $ cd Taobao_SentimentAnalysis
14 | ```
15 |
16 | ## Requirements
17 |
18 | - python 3.7
19 | - requests==2.20.0
20 | - lxml==4.2.5
21 | - pandas==0.23.4
22 | - snownlp==0.12.3
23 | - matplotlib==2.2.2
24 | - jieba==0.39
25 | - wordcloud==1.5.0
26 |
27 | ## Details | Sentiment Analysis Workflow
28 |
29 | ### 1. Using snownlp
30 |
31 | - Locate the installed ```snownlp``` package inside ```site-packages```, as shown below (a one-line check is sketched after the screenshot):
32 | - ![snownlp inside site-packages](assets/screenshot.png)
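
If you'd rather not hunt for the directory by hand, a one-line check (an illustrative snippet, not part of this repository) prints where ```snownlp``` is installed:

```python
# Illustrative snippet (not part of the repo): print the snownlp install directory.
import os
import snownlp

print(os.path.dirname(snownlp.__file__))  # e.g. .../site-packages/snownlp
```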
33 |
34 | ### 2. Building simple word segmentation and POS tagging
35 |
36 | - A sample of the Chinese word segmentation output (a short usage sketch follows the sample):
37 |
38 | ```
39 | 新/b 华/m 社/e 北/b 京/e 1/b 2/m 月/e 3/b 1/m 日/e 电/s 忠/b 诚/e 的/s 共/b 产/m 主/m 义/e 战/b 士/e ,/s 久/b 经/m 考/m 验/e 的/s 无/b 产/m 阶/m 级/e 革/b 命/m 家/e ,/s 我/b 党/e 党/b 务/e 工/b 作/e 和/s 统/b 一/m 战/m 线/e 工/b 作/e 的/s 杰/b 出/e 领/b 导/m 人/e ,/s 原/s 中/b 共/m 中/m 央/e 顾/b 问/e 委/b 员/m 会/e 常/b 务/e 委/b 员/m 会/e 委/b 员/e ,/s 中/b 国/e 人/b 民/e 政/b 治/e 协/b 商/e 会/b 议/e 第/b 四/e 、/s 五/s 、/s 六/s 届/s 全/b 国/e 委/b 员/m 会/e 副/s 主/b 席/e 刘/s 澜/b 涛/e 同/b 志/e ,/s 因/s 病/s 医/b 治/e 无/b 效/e ,/s 于/s 1/b 9/m 9/m 7/m 年/e 1/b 2/m 月/e 3/b 1/m 日/e 1/b 0/m 时/e 4/b 4/m 分/e 在/s 北/b 京/e 逝/b 世/e ,/s 终/b 年/e 8/b 8/e 岁/s 。/s
40 | 根/b 据/e 刘/s 澜/b 涛/e 同/b 志/e 生/b 前/e 遗/b 愿/e 和/s 家/b 属/e 的/s 意/b 见/e ,/s 刘/s 澜/b 涛/e 同/b 志/e 的/s 丧/b 事/e 从/b 简/e ,/s 不/s 举/b 行/e
41 | ```
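
The tags in the sample above follow the character-level B/M/E/S scheme (begin / middle / end of a word, or a single-character word). For everyday use, segmentation and POS tagging are available directly on a ```SnowNLP``` object; a minimal illustrative sketch (not part of the repository):

```python
# Illustrative usage (not part of the repo): segmentation and POS tagging via snownlp.
from snownlp import SnowNLP

s = SnowNLP(u'这本书的印刷质量很好')
print(s.words)       # word segmentation as a list of tokens
print(list(s.tags))  # (word, part-of-speech) pairs
```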
42 |
43 | ### 3. Locate ```neg.txt``` and ```pos.txt```, the negative and positive sentiment datasets
44 |
45 | - We train on a public e-commerce shopping-review dataset; the trained model is
46 | saved automatically to ```sentiment.marshal```, as sketched below.
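
A minimal training sketch, mirroring ```train()``` in ```sentiment_analysis.py``` (it assumes ```neg.txt``` and ```pos.txt```, one review per line, sit in the working directory):

```python
# Minimal sketch mirroring train() in sentiment_analysis.py; assumes neg.txt and
# pos.txt (one review per line) are available in the working directory.
from snownlp import sentiment

sentiment.train('neg.txt', 'pos.txt')  # retrain the Naive Bayes sentiment model
sentiment.save('sentiment.marshal')    # write the trained model to disk
```

For snownlp to load the new model, its model path (the ```data_path``` constant in ```snownlp/sentiment/__init__.py```) has to point at the saved file, which is what the next step adjusts.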
47 |
48 | ### 4. Modify and rebuild the Bayes classifier
49 |
50 | - Point the ```sentiment.marshal``` path at the newly trained model and run the Bayes classifier over the review data.
51 | A score in ```[0, 0.4]``` is labeled dislike (negative review), ```[0.4, 0.6]``` neutral (average review), and ```[0.6, 1.0]``` like (positive review):
52 | ```python
53 | with open(f'{filename}.csv', 'r', encoding='gbk') as fr:
54 |     for line in fr.readlines():
55 |         s = snownlp.SnowNLP(line)
56 |         if s.sentiments > 0.6:
57 |             res = '喜欢'
58 |             res_list.append(1)
59 |         elif s.sentiments < 0.4:
60 |             res = '不喜欢'
61 |             res_list.append(-1)
62 |         else:
63 |             res = '一般'
64 |             res_list.append(0)
65 |         sent_dict = {
66 |             '情感分析结果': s.sentiments,
67 |             '评价倾向': res,
68 |             '商品评论': line
69 |         }
70 | ```
71 | The final results are saved to ```result.csv```; a quick way to inspect them is sketched below.
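
For a quick sanity check, ```result.csv``` can be summarized with pandas (an illustrative snippet, not part of the repository; the column names are the ones written by ```test()``` in ```sentiment_analysis.py```):

```python
# Illustrative snippet (not part of the repo): summarize result.csv as written by test().
import pandas as pd

df = pd.read_csv('result.csv', encoding='gbk')
print(df['评价倾向'].value_counts())  # reviews per label: 喜欢 / 一般 / 不喜欢
print(df['情感分析结果'].mean())      # average SnowNLP sentiment score
```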
72 |
73 | ## Results | Sample Output
74 |
75 | - A sample of the scraped book review data:
76 |
77 | ```text
78 | 宣传的书本厚度严重缩水!!
79 | "很好!孩子喜欢!大
80 | 赞一个"
81 | 很好
82 | 首先,快递师傅的工作热情和态度很好,其次东西包装完好。先给好评。不错。
83 | 一本字典大小,买来当作参考书用,喜欢。随时查。有些东西书店买不到,网上可以买到。希望自己能用的着。工作路还长,一切希望能走的更远,更好。2017,加油。
84 | 方便携带,平时翻翻,随时学习。
85 | 不错,书店找不到,还是网店方便,好好学习天天向上
86 | 很好啊,质量很好,上班几年了,有空时多学习一下!
87 | 封皮颜色看起来怪怪的,喜欢原有的第七版的设计
88 | 像字典那么大,内容比较全面。
89 | 剛收到,小哥送貨快,挺滿意的,
90 | 物流很快,正品行货,满意!
91 | 物流太速度了,买这本书就是因为小,带着方便
92 | 打包一看封面掉漆,沾贴不正,象不是正规厂家出版的,内容有待确认。
93 | 双11给宝宝买了太多的书,满减过后的价格非常美丽,印刷的质量也很好,小家伙现在每晚都要一个故事才肯乖乖睡觉。
94 | 东西好的没话说,下次还买,一如既往的支持 东西好的没话说,下次还买,一如既往的支持 东西好的没话说,下次还买,一如既往的支持
95 | 给娃囤的,6个月娃只会啃,没有什么耐心听故事,长牙阶段太闹腾。
96 | 内容经典,娃爱不释手,要讲好多遍,活动期购买超级划算
97 | 活动真多啊,又买了好多好多,绘本不嫌多。
98 | "金羽毛这系列书都不错,适合大一点的小朋友,先买来收藏着,搞活动买也十分优
99 | 惠,包装的很好!"
100 | 一两个月就要购入很多本 绘本选择很多
101 | 给小孩买的 陪着小孩一起阅读 希望他也爱上读书
102 | 还不错吧,值得购买。。。。。。。。。
103 | ```
104 | - A sample of the sentiment analysis output:
105 | ![Sample of the sentiment analysis results](assets/res.png)
106 |
107 | - Visualization of the sentiment analysis results for the ~1,000 book reviews:
108 | ![Sentiment analysis bar chart](assets/fig.png)
109 |
110 | - Word cloud of high-frequency terms across the ~1,000 reviews:
111 | ![Word cloud of review terms](assets/word_cloud.png)
112 |
113 | ## License
114 |
115 | - MIT license
116 |
117 | ## Author
118 |
119 | - Zijun Xi @2019-9-28
120 |
--------------------------------------------------------------------------------
/assets/fig.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zjxi/JD_SentimentAnalysis/64bdd85789774d586b5bdebae8bbfc0c3dea74d3/assets/fig.png
--------------------------------------------------------------------------------
/assets/res.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zjxi/JD_SentimentAnalysis/64bdd85789774d586b5bdebae8bbfc0c3dea74d3/assets/res.png
--------------------------------------------------------------------------------
/assets/screenshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zjxi/JD_SentimentAnalysis/64bdd85789774d586b5bdebae8bbfc0c3dea74d3/assets/screenshot.png
--------------------------------------------------------------------------------
/assets/word_cloud.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zjxi/JD_SentimentAnalysis/64bdd85789774d586b5bdebae8bbfc0c3dea74d3/assets/word_cloud.png
--------------------------------------------------------------------------------
/comment_data.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zjxi/JD_SentimentAnalysis/64bdd85789774d586b5bdebae8bbfc0c3dea74d3/comment_data.csv
--------------------------------------------------------------------------------
/processed_comment_data.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zjxi/JD_SentimentAnalysis/64bdd85789774d586b5bdebae8bbfc0c3dea74d3/processed_comment_data.csv
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | requests==2.20.0
2 | lxml==4.2.5
3 | pandas==0.23.4
4 | snownlp==0.12.3
5 | matplotlib==2.2.2
6 | jieba==0.39
7 | wordcloud==1.5.0
8 |
--------------------------------------------------------------------------------
/result.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zjxi/JD_SentimentAnalysis/64bdd85789774d586b5bdebae8bbfc0c3dea74d3/result.csv
--------------------------------------------------------------------------------
/sentiment_analysis.py:
--------------------------------------------------------------------------------
1 | from snownlp import sentiment
2 | import pandas as pd
3 | import snownlp
4 | import matplotlib.pyplot as plt
5 | from matplotlib.font_manager import FontProperties
6 | from word_cloud import word_cloud_creation, word_cloud_implementation, word_cloud_settings
7 |
8 | """
9 | Written by Zijun Xi
10 | Date: 2019/9/28
11 | """
12 |
13 |
14 | def read_csv():
15 |     '''Read the scraped review data file.'''
16 |     comment_data = pd.read_csv('comment_data.csv', encoding='gbk',
17 |                                sep='\n', index_col=None)
18 |     return comment_data
19 |
20 |
21 | def clean_data(data):
22 |     '''Basic data cleaning.'''
23 |     df = data.dropna()  # drop rows with missing data (NaN)
24 |     df = pd.DataFrame(df.iloc[:, 0].unique())  # de-duplicate reviews
25 |     return df
26 |     # print('rows after cleaning:', len(df))
27 |
28 |
29 | def clean_repeat_word(raw_str, reverse=False):
30 |     '''Remove characters repeated within a review.'''
31 |     if reverse:
32 |         raw_str = raw_str[::-1]
33 |     res_str = ''
34 |     for i in raw_str:
35 |         if i not in res_str:
36 |             res_str += i
37 |     if reverse:
38 |         res_str = res_str[::-1]
39 |     return res_str
40 |
41 |
42 | def processed_data(filename):
43 |     '''Clean the data and save it.'''
44 |     df = clean_data(read_csv())
45 |     ser1 = df.iloc[:, 0].apply(clean_repeat_word)
46 |     df2 = pd.DataFrame(ser1.apply(clean_repeat_word, reverse=True))
47 |     df2.to_csv(f'{filename}.csv', encoding='gbk', index_label=None, index=None)
48 |
49 |
50 | def train():
51 |     '''Train on the positive and negative review corpora and save the model.'''
52 |     sentiment.train('neg.txt', 'pos.txt')
53 |     sentiment.save('sentiment.marshal')
54 |
55 |
56 | sentiment_list = []  # per-review result dicts
57 |
58 | res_list = []  # numeric labels: 1 like, 0 neutral, -1 dislike
59 |
60 |
61 | def test(filename, to_filename):
62 |     '''Run sentiment analysis over the product reviews and save the results.'''
63 |     with open(f'{filename}.csv', 'r', encoding='gbk') as fr:
64 |         for line in fr.readlines():
65 |             s = snownlp.SnowNLP(line)
66 |             if s.sentiments > 0.6:
67 |                 res = '喜欢'
68 |                 res_list.append(1)
69 |             elif s.sentiments < 0.4:
70 |                 res = '不喜欢'
71 |                 res_list.append(-1)
72 |             else:
73 |                 res = '一般'
74 |                 res_list.append(0)
75 |             sent_dict = {
76 |                 '情感分析结果': s.sentiments,
77 |                 '评价倾向': res,
78 |                 '商品评论': line.replace('\n', '')
79 |             }
80 |             sentiment_list.append(sent_dict)
81 |             print(sent_dict)
82 |     df = pd.DataFrame(sentiment_list)
83 |     df.to_csv(f'{to_filename}.csv', index=None, encoding='gbk',
84 |               index_label=None, mode='w')
85 |
86 |
87 | def data_virtualization():
88 |     '''Visualize the results as a bar chart.'''
89 |     font = FontProperties(fname=r"C:\Windows\Fonts\simhei.ttf", size=14)
90 |     likes = len([i for i in res_list if i == 1])
91 |     common = len([i for i in res_list if i == 0])
92 |     unlikes = len([i for i in res_list if i == -1])
93 |
94 |     plt.bar([1], [likes], label='like')
95 |     plt.bar([3], [common], label='common')
96 |     plt.bar([5], [unlikes], label='unlike')
97 |
98 |     plt.legend()
99 |     plt.xlabel('result')
100 |     plt.ylabel('value')
101 |     plt.title(u'商品评论情感分析结果-条形图', fontproperties=font)
102 |     plt.savefig(f'./assets/fig.png')
103 |     plt.show()
104 |
105 |
106 | def word_cloud_show():
107 |     '''Turn the reviews into a word cloud of high-frequency terms.'''
108 |     wl = word_cloud_creation('processed_comment_data.csv')
109 |     wc = word_cloud_settings()
110 |     word_cloud_implementation(wl, wc)
111 |
112 |
113 | def main():
114 |     # processed_data('processed_comment_data')
115 |     # train()  # train on the positive/negative review corpora
116 |     test('processed_comment_data', 'result')
117 |
118 |     # data_virtualization()  # visualize the results
119 |     # word_cloud_show()  # high-frequency word cloud
120 |
121 |
122 | if __name__ == '__main__':
123 |     main()
124 |
125 |
126 |
127 |
--------------------------------------------------------------------------------
/taobao_spider.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from lxml import etree
3 | import time
4 | import random
5 | import pandas as pd
6 | """
7 | Written by Zijun Xi
8 | Date: 2019/9/27
9 | """
10 |
11 |
12 | class Spider:
13 |     '''Crawler for ~10 reviews from each of 100 randomly selected JD book listings.'''
14 |     def __init__(self):
15 |         self.url = None
16 |         self.cookie = 'shshshfpa=1dec07d1-31c1-d395-b409-c19ab27d8de0-1531147936; shshshfpb=2a0eeb4d693334d7e85fb1f8d3f3f4a605b43769f7c217e0621b61bd57; __jdu=1553345717454455955504; mt_xid=V2_52007VwMWVl9aV14ZSR9ZAWIGFlZVXFZeHkwpVAdnBUBVXwtODRlMH0AAZAAWTg1dAF0DTkoIDWcDQFNbWwJSL0oYXA17AhpOXV5DWhhCHFsOZQciUG1YYlMfTx1ZAGQHEmJeX1s%3D; areaId=7; ipLoc-djd=7-420-45534-0; PCSYCityID=CN_410000_410200_410202; user-key=851d3dca-aaa0-47a5-a3d1-b4cba8ea7744; cn=0; unpl=V2_ZzNtbUEAFhF8DRRdfxhaB2JTFFURUREQd18TVitMCw1kCxoJclRCFX0UR1xnGloUZAEZX0dcQxVFCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zQwBBQHcJFF0uSgwDYgcaDhFTQEJ2XBVQL0oMDDdRFAhyZ0AVRQhHZHsdWAdlBhZbQlFGEXANQlBzHVgBZgYibUVncyV8D0VXfhlsBFcCIh8WC0ASdwFOUzYZWAFlARdZRFdFEHENQ1B%2fEVgBYwIXbUNnQA%3d%3d; __jdv=76161171|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_2fd584c850734a79b7c43fd3adf9299e|1569598452338; shshshfp=e4692fc09e869eb8a131f9313f907646; __jda=122270672.1553345717454455955504.1553345717.1568709832.1569595855.6; __jdc=122270672; 3AB9D23F7A4B3C9B=6442ZN2AYTQNOA3LG2FNGXFN2RC4IKKFPMBAILZO4XTZ5WF2FIXMLCOGE7W6FUTZKDAQCQTIDOJPYVHFMWGGNPOAAE; shshshsID=acb4d916af2f63ae802589a494d0ac43_58_1569599865122; __jdb=122270672.63.1553345717454455955504|6.1569595855'
17 |         self.data = []
18 |         self.COUNT = 0
19 |
20 |     def request(self, url):
21 |         '''Send the HTTP request.'''
22 |         headers = {
23 |             'cookie': self.cookie,
24 |             'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/'
25 |                           '537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36',
26 |             'upgrade-insecure-requests': '1',
27 |             'sec-fetch-mode': 'navigate',
28 |             'sec-fetch-site': 'none',
29 |             'sec-fetch-user': '?1',
30 |         }
31 |         # proxy = {
32 |         #     'https': '125.123.124.207:9999'
33 |         # }
34 |         resp = requests.get(url, headers=headers)
35 |         resp.encoding = 'gbk'
36 |         return resp.text
37 |
38 |     @staticmethod
39 |     def random_id_generator():
40 |         '''Randomly generate candidate book ids.'''
41 |         ids = []
42 |         while True:
43 |             rand_id = random.randint(12000001, 12999999)
44 |             if rand_id not in ids:
45 |                 ids.append(rand_id)
46 |             if len(ids) == 10000:
47 |                 break
48 |         return ids
49 |
50 |     @staticmethod
51 |     def clean_comment(comment: str):
52 |         '''Strip redundant phrases from a review.'''
53 |         comment = comment.replace('使用心得:', '').replace('\xa0', '')
54 |         return comment
55 |
56 |     def parse(self, text):
57 |         '''Parse the page and extract the review data.'''
58 |         html = etree.HTML(text)
59 |         divs = html.xpath("//div[@id='hidcomment']/div[@class='item']")
60 |         for div in divs:
61 |             self.COUNT += 1
62 |             try:
63 |                 comment = div.xpath(".//div[@class='comment-content']/text()")[0]
64 |             except Exception as e:
65 |                 print(e)
66 |                 comment = ''
67 |             info = {
68 |                 '商品评论': self.clean_comment(comment)
69 |             }
70 |             self.data.append(info)
71 |             print(info)
72 |
73 |     def to_csv(self, filename):
74 |         '''Save the collected data to a CSV file (relative path).'''
75 |         df = pd.DataFrame(self.data)
76 |         df.to_csv(f'{filename}.csv', index_label=None, index=None, encoding='gbk')
77 |
78 |     def main(self):
79 |         ids = self.random_id_generator()
80 |         for i, id in enumerate(ids):
81 |             try:
82 |                 print(f'-----正在爬取第{i + 1}本图书:{id}的数据-----')
83 |                 url = f'https://item.jd.com/{id}.html'
84 |                 text = self.request(url)
85 |                 self.parse(text)
86 |                 print('**************************************')
87 |                 print(f'*** 当前爬取到的评论总数为:{self.COUNT} ***')
88 |                 print('**************************************')
89 |
90 |                 if self.COUNT >= 1000:  # stop once ~1,000 reviews have been collected
91 |                     break
92 |                 time.sleep(random.randint(2, 5))
93 |             except Exception as ex:
94 |                 self.to_csv('comment_data')
95 |                 print(ex)
96 |         self.to_csv('comment_data')
97 |
98 |
99 | if __name__ == '__main__':
100 |     spi = Spider()
101 |     spi.main()
102 |
103 |
--------------------------------------------------------------------------------
/word_cloud.py:
--------------------------------------------------------------------------------
1 | from wordcloud import WordCloud
2 | import jieba
3 | import matplotlib.pyplot as plt
4 | """
5 | Written by Zijun Xi
6 | Date: 2019/9/28
7 | """
8 |
9 |
10 | def word_cloud_creation(filename):
11 |     '''Read the reviews and segment them for the word cloud.'''
12 |     text = open(filename, encoding='gbk').read()
13 |     word_list = jieba.cut(text, cut_all=True)
14 |     wl = ' '.join(word_list)
15 |     return wl
16 |
17 |
18 | def word_cloud_settings():
19 |     '''Configure the word cloud.'''
20 |     wc = WordCloud(
21 |         background_color='white',
22 |         max_words=2000,
23 |         max_font_size=100,
24 |         height=1200,
25 |         width=1500,
26 |         random_state=30,
27 |         font_path=r'C:\Windows\Fonts\simfang.ttf'
28 |     )
29 |     return wc
30 |
31 |
32 | def word_cloud_implementation(wl, wc):
33 |     '''Generate, save, and display the word cloud.'''
34 |     my_words = wc.generate(wl)
35 |     plt.imshow(my_words)
36 |     plt.axis('off')
37 |     wc.to_file(f'./assets/word_cloud.png')
38 |     plt.show()
39 |
--------------------------------------------------------------------------------