├── Crawler
│   ├── .gitignore
│   ├── .idea
│   │   ├── .name
│   │   ├── Crawler.iml
│   │   ├── dataSources.local.xml
│   │   ├── dataSources.xml
│   │   ├── dataSources
│   │   │   └── 2097b77a-0349-4758-8855-8b770b9e50b1.xml
│   │   ├── deployment.xml
│   │   ├── misc.xml
│   │   ├── modules.xml
│   │   ├── vcs.xml
│   │   └── workspace.xml
│   ├── Crawler
│   │   ├── .idea
│   │   │   ├── ImageSpider.iml
│   │   │   ├── deployment.xml
│   │   │   ├── misc.xml
│   │   │   ├── modules.xml
│   │   │   ├── other.xml
│   │   │   ├── vcs.xml
│   │   │   └── workspace.xml
│   │   ├── __init__.py
│   │   ├── commands
│   │   │   ├── __init__.py
│   │   │   └── crawlall.py
│   │   ├── expand_package
│   │   │   ├── Comment.py
│   │   │   ├── DBcontrol.py
│   │   │   ├── Sent_Dict
│   │   │   │   ├── __init__.py
│   │   │   │   ├── negative.txt
│   │   │   │   ├── positive.txt
│   │   │   │   ├── 否定词.txt
│   │   │   │   └── 程度级别词语.txt
│   │   │   ├── WordCloud.py
│   │   │   ├── __init__.py
│   │   │   ├── makebeautifulSoup.py
│   │   │   ├── negative.txt
│   │   │   ├── picDownloadScript.py
│   │   │   ├── positive.txt
│   │   │   ├── senti_dict.py
│   │   │   ├── senti_dict_class.py
│   │   │   ├── 否定词.txt
│   │   │   └── 程度级别词语.txt
│   │   ├── items.py
│   │   ├── middlewares.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   └── spiders
│   │       ├── __init__.py
│   │       ├── spider_expends.py
│   │       ├── tengxu.py
│   │       ├── wangyi.py
│   │       └── xinlang.py
│   ├── TengxunMain.py
│   ├── TogetherCrawl.py
│   ├── WangyiMain.py
│   ├── XinlangMain.py
│   ├── desktop.ini
│   ├── scrapy.cfg
│   ├── setup.py
│   └── togetherCrawl_scheduling.py
├── README.md
└── xinlanggundong
    ├── .idea
    │   ├── deployment.xml
    │   ├── encodings.xml
    │   ├── misc.xml
    │   ├── modules.xml
    │   ├── vcs.xml
    │   ├── workspace.xml
    │   └── xinlanggundong.iml
    ├── README.md
    ├── ViewData.ipynb
    ├── lastday.csv
    ├── main.py
    ├── output(utf8).csv
    ├── scrapy.cfg
    └── xinlanggundong
        ├── __init__.py
        ├── __pycache__
        │   ├── __init__.cpython-36.pyc
        │   ├── items.cpython-36.pyc
        │   ├── middlewares.cpython-36.pyc
        │   └── settings.cpython-36.pyc
        ├── items.py
        ├── middlewares.py
        ├── pipelines.py
        ├── settings.py
        └── spiders
            ├── __init__.py
            ├── __pycache__
            │   ├── __init__.cpython-36.pyc
            │   └── xinlangspider.cpython-36.pyc
            └── xinlangspider.py
/Crawler/.gitignore:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/realzhengyiming/newsSpier_scrapy/2852f63981f764bfc0f2c733d52f104a3bf3c9e1/Crawler/.gitignore
--------------------------------------------------------------------------------
/Crawler/.idea/.name:
--------------------------------------------------------------------------------
1 | Crawler
--------------------------------------------------------------------------------
/Crawler/.idea/Crawler.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Crawler/.idea/dataSources.local.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Crawler/.idea/dataSources.xml:
--------------------------------------------------------------------------------
5 | mysql
6 | true
7 | com.mysql.jdbc.Driver
8 | jdbc:mysql://localhost:3306/flask
--------------------------------------------------------------------------------
/Crawler/.idea/deployment.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Crawler/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Crawler/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Crawler/.idea/vcs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Crawler/Crawler/.idea/ImageSpider.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Crawler/Crawler/.idea/deployment.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Crawler/Crawler/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Crawler/Crawler/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Crawler/Crawler/.idea/other.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Crawler/Crawler/.idea/vcs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Crawler/Crawler/.idea/workspace.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Crawler/Crawler/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/realzhengyiming/newsSpier_scrapy/2852f63981f764bfc0f2c733d52f104a3bf3c9e1/Crawler/Crawler/__init__.py
--------------------------------------------------------------------------------
/Crawler/Crawler/commands/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/realzhengyiming/newsSpier_scrapy/2852f63981f764bfc0f2c733d52f104a3bf3c9e1/Crawler/Crawler/commands/__init__.py
--------------------------------------------------------------------------------
/Crawler/Crawler/commands/crawlall.py:
--------------------------------------------------------------------------------
1 | from scrapy.commands import ScrapyCommand
2 |
3 | from Crawler.expand_package.Comment import CommentCrawl
4 | from Crawler.expand_package.DBcontrol import DB
5 |
6 |
7 | class Command(ScrapyCommand):
8 | requires_project = True
9 |
10 | def syntax(self):
11 | return '[options]'
12 |
13 | def short_desc(self):
14 | return 'Runs all of the spiders'
15 |
16 | def run(self, args, opts):
17 | spider_list = self.crawler_process.spiders.list()
18 | for name in spider_list:
19 | self.crawler_process.crawl(name, **opts.__dict__)
20 | self.crawler_process.start()
21 | print("三大站点的新闻正文爬取完毕了!")
22 | # todo first classify rows from the tengxun table into the django tables
23 | # todo classification also has to compute sentiment scores and insert them
24 | print("正在进行它新闻分类和情感得分分数录入...")
25 | dbtool = DB()
26 | dbtool.classifyDB()
27 |
28 | print("正在进行腾讯新闻评论的爬取...")
29 | commentC = CommentCrawl()
30 | commentC.getCommentMain() # the comment topic is derived from the url, which can be re-assembled
31 | print("今天爬虫任务完成!")
32 |
--------------------------------------------------------------------------------
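crawlall is a custom Scrapy command: it starts every registered spider in one CrawlerProcess and then runs the database classification and the Tencent comment crawl. Scrapy only discovers such commands when COMMANDS_MODULE points at the commands package; settings.py is not inlined in this dump, so the registration below is an assumption, and the programmatic runner is only a sketch of what TogetherCrawl.py / togetherCrawl_scheduling.py (also not inlined) presumably do.

# Assumed registration (Crawler/settings.py):
#   COMMANDS_MODULE = 'Crawler.commands'
# With that in place the command is invoked from the project root with:
#   scrapy crawlall
#
# Sketch of the equivalent programmatic "run every spider" flow:
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

def run_all_spiders():
    process = CrawlerProcess(get_project_settings())
    for name in process.spider_loader.list():   # tengxu, wangyi, xinlang, ...
        process.crawl(name)
    process.start()                              # blocks until all spiders finish

if __name__ == '__main__':
    run_all_spiders()
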
/Crawler/Crawler/expand_package/Comment.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Class that collects the comments of crawled news articles
4 | # todo the comment crawling around here still looks flaky
5 | import time
6 |
7 | import emoji
8 |
9 | # from DBcontrol import DB
10 | # from makebeautifulSoup import makeBS
11 |
12 | # from NewsSenti.tengxun.DBcontrol import DB
13 | # from NewsSenti.tengxun.makebeautifulSoup import makeBS
14 | from Crawler.expand_package.DBcontrol import DB
15 | from Crawler.expand_package.makebeautifulSoup import makeBS
16 |
17 |
18 | class CommentCrawl(object):
19 | def __init__(self):
20 | self.dbHelper = DB()
21 |
22 | def changTimeToDate(self,dateString):
23 | timeStamp = dateString
24 | timeArray = time.localtime(timeStamp)
25 | print(timeArray)
26 | otherStyleTime = time.strftime("%Y-%m-%d", timeArray)
27 | # print(otherStyleTime)
28 | return otherStyleTime
29 |
30 |
31 | def getNewsIdAndUrl(self): #提取出新闻的id和url
32 | # dbHelper = DB()
33 | themeWord = ['car','technology','home','entertainment','house','finance','sports'] #类别新闻
34 | resultDic = {}
35 | sqlHead = "select News_id,url from newssentimentanalysis_"
36 | sqlTail = "news where Mcontent='未提取'" # 记得更新了状态后要修改成已提取
37 | # 插入
38 | for theme in themeWord:
39 | print(sqlHead+theme+sqlTail)
40 | resultDic[theme] = self.dbHelper.__query__(sqlHead+theme+sqlTail)# 查询
41 | return resultDic #返回格式{'car':[{'id':xx,'url':xx},.....,'home'...]
42 |
43 | def getAwriteCommentJson(self,id,url,theme): #这个是评论专用的请求返回成字典的,theme 是方便找到表然后更新状态的。
44 | sqlHead = "update newssentimentanalysis_"
45 | sqlTail = "news set Mcontent='%s' where url='%s'" # update the comment-status field of the matching row; does this only need to handle Tencent urls?
46 |
47 | sql = sqlHead+theme+sqlTail % ("已提取",url) #这个是更新状态用的sql
48 |
49 | # 异常时一般是没有评论
50 | sqlERROR = sqlHead+theme+sqlTail % ("无评论",url) # 如果发现没有
51 |
52 |
53 | time.sleep(0.5)
54 | cooker = makeBS()
55 | commentRawUrl = "http://coral.qq.com/article/"
56 | cmt_id = cooker.getCmt_id(url) #去掉空格
57 | if cmt_id==None:
58 | return False # 没有找到的话,那就是没评论啊
59 | if cmt_id.find("'")!=-1:
60 | cmt_id = cmt_id.replace("'","")
61 | else :
62 | cmt_id = cmt_id.strip()
63 |
64 | #这个用来拼接用到。
65 | try:
66 | allUrl = commentRawUrl + str(cmt_id) + "/comment/#"
67 | print(allUrl)
68 | responseDic = cooker.makeBSjson(allUrl)
69 | commentList = responseDic['data']['commentid'] # todo 不知道怎么回事调用不到这个评论的。
70 | # print(commentList)
71 | from pprint import pprint
72 | for comment in commentList:
73 | pprint(type(comment['id']))
74 | print(comment['id'])
75 | comment['content'] = emoji.demojize(comment['content']) #过滤emoji
76 | comment['userinfo']['nick'] = emoji.demojize(comment['userinfo']['nick'])
77 | comment['time']=self.changTimeToDate(comment['time']) #时间戳改成日期字符串
78 | print("新闻id "+ str(id))
79 | print("新闻的url是 "+ url)
80 | if self.dbHelper.classifyDBComment(url=url,id=id,comment=comment) : #评论直接插入django表内的数据库,并且更新新闻评论状态.
81 | print("更新成功")
82 | self.dbHelper.__query__(sql) # 这儿设置更新里面新闻的状态。
83 | else:
84 | print("更新失败")
85 | self.dbHelper.__query__(sqlERROR) # 这儿设置更新里面新闻的状态。
86 | print("已经成功更新此条新闻 "+url+" "+theme)
87 | print("")
88 | return True
89 | #-----------------------这儿可以合成sql语句的话就可以执行插入的操作了。-----------------------
90 | # 通过url来合成插入的sql语句,DBcontrol的方法中来做这些东西
91 | except Exception as e:
92 | print("此条可能无评论,正在跳过")
93 | # 这儿需要插入无评论才可以。 todo
94 | # self.dbHelper.__query__(sqlERROR) # 失败的话,更新成失败
95 | print(sqlERROR) #更新成
96 | print(e)
97 | return False
98 |
99 |
100 | def getCommentMain(self): # 这儿应该是提取出所有为提取的新闻,然后还要记得更新状态
101 | resultDic = self.getNewsIdAndUrl() # 返回的是拼装好的含主题的list
102 | # from pprint import pprint
103 | # pprint(resultDic)
104 |
105 | resultList = []
106 | count = 0
107 | for theme in resultDic:
108 | print("现在是",theme)
109 | for oneNews in resultDic[theme]:
110 | count+=1 # count processed items so we can pause periodically
111 | if count%100==0: # every 100 items
112 | time.sleep(15) # pause for 15 seconds
113 |
114 | print(oneNews) #已经提取出来了
115 | print("获得commentjson")
116 | # 分类----------------------------------------更新原来的状态.----------------------------------------
117 | sql = ""
118 | sql2=""
119 | sqlHead = "update newssentimentanalysis_"
120 | # 'update newssentimentanalysis_homenews set Mcontent="无评论" where News_id=1'
121 | sqlTail = "news set Mcontent = '已提取' where News_id={}"
122 | sqlTailErr = "news set Mcontent = '无评论' where News_id={}"
123 |
124 | # 插入正文得分的sql
125 |
126 | # 这句就是更新tengxun表中的数据,用id
127 |
128 | if oneNews['url'].find('auto') != -1 or oneNews['url'].find('car') != -1 : # 找到这个就是汽车,中间是表名
129 | sql = sqlHead + "car" + sqlTail
130 | sql2 = sqlHead + "car" + sqlTailErr
131 | pass
132 | elif oneNews['url'].find('tech') != -1: # 找到这个就是科技
133 | sql = sqlHead + "technology" + sqlTail
134 | sql2 = sqlHead + "technology" + sqlTailErr
135 |
136 | pass
137 | elif oneNews['url'].find('news') != -1: # 找到这个就是默认新闻
138 | sql = sqlHead + "home" + sqlTail
139 | sql2 = sqlHead + "home" + sqlTailErr
140 |
141 | pass
142 | elif oneNews['url'].find('ent') != -1: # 找到这个就是娱乐
143 | sql = sqlHead + "entertainment" + sqlTail
144 | sql2 = sqlHead + "entertainment" + sqlTailErr
145 |
146 | pass
147 | elif oneNews['url'].find('house') != -1: # 找到这个就是房产
148 | sql = sqlHead + "house" + sqlTail
149 | sql2 = sqlHead + "house" + sqlTailErr
150 |
151 | pass
152 | elif oneNews['url'].find('finance') != -1: # 找到这个就是经济
153 | sql = sqlHead + "finance" + sqlTail
154 | sql2 = sqlHead + "finance" + sqlTailErr
155 |
156 | pass
157 | elif oneNews['url'].find('sports') != -1: # 找到这个就是运动
158 | sql = sqlHead + "sports" + sqlTail
159 | sql2 = sqlHead + "sports" + sqlTailErr
160 |
161 | pass
162 | else:
163 | print("这边这种是网易的情况-归为默认新闻home中去")
164 |
165 | sql = sqlHead + "home" + sqlTail
166 | sql2 = sqlHead + "home" + sqlTailErr
167 |
168 | print(theme) # 分类
169 | if self.getAwriteCommentJson(id=oneNews['News_id'],url=oneNews['url'],theme=theme): #逐条插入,进行,这个不需要返回
170 | print("提取出评论")
171 | print(sql.format(oneNews['News_id']))
172 | self.dbHelper.__query__(sql.format(oneNews['News_id']))
173 |
174 | else:
175 | print("cmt_id 提取失败")
176 | print(sql2.format(oneNews['News_id']))
177 | self.dbHelper.__query__(sql2.format(oneNews['News_id']))
178 | print("更新无评论")
179 |
180 | print()
181 |
182 | # resultList.append(oneNews) # 添加进入
183 | print("finish comments crawl!")
184 |
185 | if __name__ == '__main__':
186 | commentC = CommentCrawl()
187 | # print(commentC.getNewsIdAndUrl())
188 | # print(commentC.getCommentJson("http:////sports.qq.com//a//20190315//000008.htm",55)) #测试单个
189 | commentC.getCommentMain() #测试主题从url中提取,url又可以合成。
190 |
191 |
--------------------------------------------------------------------------------
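CommentCrawl works in two steps: makeBS.getCmt_id pulls the numeric cmt_id out of the article page, then the comment list is requested from http://coral.qq.com/article/<cmt_id>/comment/# and read out of data -> commentid in the returned JSON. Below is a minimal standalone sketch of that fetch step, with the field names taken from the code above; the behaviour of the coral.qq.com endpoint itself is an assumption, not documented in this repo.

import time
import requests

def fetch_comments(cmt_id, pause=0.5):
    time.sleep(pause)  # mirror the 0.5 s politeness delay used in Comment.py
    url = "http://coral.qq.com/article/{}/comment/#".format(cmt_id)
    resp = requests.get(url, timeout=30)
    data = resp.json()
    comments = data.get('data', {}).get('commentid', [])
    for c in comments:
        # each entry is expected to carry id, content, time (unix timestamp) and userinfo['nick']
        print(c.get('id'), c.get('time'), str(c.get('content', ''))[:40])
    return comments
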
/Crawler/Crawler/expand_package/DBcontrol.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Plain (non-ORM) database helpers with simple connection pooling
4 | # 2018/9/8: switched to a connection pool for the MySQL connections
5 | # Requires pymysql and DBUtils; install them if missing
6 | # All query results are returned as dictionaries
7 | # Using a pool avoids re-creating a fresh connection for every operation
8 |
9 | import traceback
10 | from datetime import date, timedelta
11 | import emoji
12 | import pymysql as pymysql
13 | import time
14 | from DBUtils.PooledDB import PooledDB
15 |
16 | # The connection settings are loaded from the project settings (originally a config.py in the same directory)
17 | # mysqlInfo should look like the dict below; the values can also be inlined directly in __init__
18 | # mysqlInfo = {
19 | # "host": '127.0.0.1',
20 | # "user": 'root',
21 | # "passwd": '123456',
22 | # "db": 'test', # now the same database for everything
23 | # "port": 3306,
24 | # "charset": 'utf8' # database connection settings
25 | # }
26 | # from .senti_dict_class import Senti_dict_class
27 | from Crawler.expand_package.senti_dict import Senti_Text
28 | from Crawler.settings import mysqlInfo
29 |
30 |
31 | class DB:
32 |
33 | __pool = None #这个也是静态的属性
34 |
35 | def __init__(self):
36 | # 构造函数,创建数据库连接、游标,默认创建一个对象就获得一个连接,用完后就关闭就可以了
37 | self.coon = DB.getmysqlconn() #这个是默认创建出来的东西
38 | self.cur = self.coon.cursor(cursor=pymysql.cursors.DictCursor)
39 |
40 | # 数据库连接池连接
41 | @staticmethod # 这个是静态的方法可以直接调用的
42 | def getmysqlconn(): # 从连接池里面获得一个连接
43 | if DB.__pool is None: # create the pool once and cache it on the class
44 | DB.__pool = PooledDB(creator=pymysql, mincached=2, maxcached=20, host=mysqlInfo['host'],
45 | user=mysqlInfo['user'], passwd=mysqlInfo['passwd'], db=mysqlInfo['db'],
46 | port=mysqlInfo['port'], charset=mysqlInfo['charset'])
47 | # print(DB.__pool)
48 | return DB.__pool.connection()
49 | # 释放资源
50 |
51 | def dispose(self): #这儿只能断默认初始化的那个连接
52 | self.coon.close()
53 | self.cur.close()
54 |
55 | # ---------------- 这儿开始写方法-----------------------
56 | def ifExists(self,webTitle):
57 | coon = DB.getmysqlconn() # 每次都默认获得一个新连接来进行相关的操作
58 | cur = coon.cursor(cursor=pymysql.cursors.DictCursor)
59 | sql = "SELECT * FROM tengxun WHERE title='%s'and urlState='True';"
60 | #因为这儿没有加上try,catch,所以出问题
61 | try:
62 | cur.execute(sql%(webTitle))
63 | except Exception as e:
64 | print(e)
65 | print("函数ifExists出问题了,你检查一下")
66 | print(sql%(webTitle))
67 | rowNumber = cur.rowcount
68 | if rowNumber>0:
69 | return True
70 | else:
71 | return False
72 |
73 |
74 | # ------- 下面可以日常的直接编写操作数据库的代码---------------
75 |
76 |
77 | def __query__(self,sql): # 自定义查询,返回字典的类型
78 | coon =DB.getmysqlconn() # 每次都默认获得一个新连接来进行相关的操作
79 | cur = coon.cursor(cursor=pymysql.cursors.DictCursor) # 这儿这个选项是设置返回结果为字典的类型,如果默认的话,那就是列表i
80 | # ----- 标准的查询模块 ---下面就是执行的部分
81 | try:
82 | cur.execute(sql)
83 | URLs = cur.fetchall() # 返回数据的列表,可以设置返回的是字典
84 | # -----
85 | print(sql)
86 | print(cur.rowcount)
87 | coon.commit()
88 |
89 |
90 | return URLs
91 | except Exception as e:
92 | print(e)
93 | coon.rollback()
94 | finally:
95 | cur.close()
96 | coon.close()
97 |
98 |
99 |
100 | # 更新部分的例子,sql语句不同而已
101 | def updateById(self,id):
102 | coon =DB.getmysqlconn() # 每次都默认获得一个新连接来进行相关的操作
103 | cur = coon.cursor(cursor=pymysql.cursors.DictCursor)
104 |
105 | sql = "update tengxun set hadmix='True' where id = %d;" % int(id) #就只是更新一下相应的url的状态就可以了
106 | try:
107 | cur.execute(sql)
108 | # 提交
109 | coon.commit()
110 | except Exception as e:
111 | # 错误回滚
112 | print("更新出错")
113 | print(e)
114 | coon.rollback()
115 | finally:
116 | coon.commit() #提交这个事务
117 | cur.close()
118 | coon.close()
119 |
120 |
121 | # 插入的例子
122 | def insert_testtable(self,value): # test helper: insert a value into testtable (kept under its own name so it does not shadow the tengxun insert below)
123 | coon =DB.getmysqlconn() # 每次都默认获得一个新连接来进行相关的操作
124 | cur = coon.cursor(cursor=pymysql.cursors.DictCursor)
125 | sql = "insert into testtable (value) values(%s)"
126 | try:
127 | cur.execute(sql,value) # 这样来直接把值替换进行就可以,注意类型
128 | # 提交
129 | coon.commit()
130 | except Exception as e:
131 | # 错误回滚
132 | print(sql)
133 | print(e)
134 | coon.rollback()
135 | finally:
136 | coon.commit() #提交这个事务
137 | cur.close()
138 | coon.close()
139 |
140 |
141 | def insert(self,value): #这个是把网址先存到里面去url,这儿的意思是插入tengxun那个表
142 | coon =DB.getmysqlconn() # 每次都默认获得一个新连接来进行相关的操作
143 | cur = coon.cursor(cursor=pymysql.cursors.DictCursor)
144 | sql = "insert into tengxun (url) values(%s)"
145 | try:
146 | cur.execute(sql,value) # 这样来直接把值替换进行就可以,注意类型
147 | # 提交
148 | coon.commit()
149 | except Exception as e:
150 | # 错误回滚
151 | print(sql)
152 | print(e)
153 | coon.rollback()
154 | finally:
155 | coon.commit() #提交这个事务
156 | cur.close()
157 | coon.close()
158 |
159 |
160 | # 更新的例子 todo 加上插入数据库的操作。把一个item传进来把 , 这个是可以统一使用的。
161 | def insertItem(self,item):
162 | '''
163 | url = scrapy.Field()
164 | urlState = scrapy.Field()
165 | title = scrapy.Field()
166 | Hcontent = scrapy.Field()
167 | Tcontent = scrapy.Field()
168 | Acontent = scrapy.Field()
169 | newdate = scrapy.Field()
170 | fromWhere = scrapy.Field()
171 | :param item: 默认item是[] 列表内的,哪怕是一个元素也也是一样的。
172 | :return:
173 | '''
174 | coon =DB.getmysqlconn() # 每次都默认获得一个新连接来进行相关的操作
175 | cur = coon.cursor(cursor=pymysql.cursors.DictCursor)
176 | sql = "insert into tengxun (url,urlState,title,Hcontent,Tcontent,Acontent,newdate,fromWhere)" \
177 | " values('{0}','{1}','{2}','{3}','{4}','{5}','{6}','{7}')".format(
178 | item['url'][0],item['urlState'][0],item['title'][0],item['Hcontent'][0],item['Tcontent'][0],
179 | item['Acontent'][0],item['newdate'][0],item['fromWhere'][0])
180 |
181 | try:
182 | print(sql)
183 | cur.execute(sql) # 这样来直接把值替换进行就可以,注意类型
184 | # 提交
185 | coon.commit()
186 | print("插入数据库tengxun成功")
187 | except Exception as e:
188 | # 错误回滚
189 | print(sql)
190 | print(e)
191 | coon.rollback()
192 | # time.sleep(30)
193 | finally:
194 | coon.commit() #提交这个事务
195 | cur.close()
196 | coon.close()
197 |
198 | # ----------------------------------评论的数据库分类插入,传入新闻的url和id,commentDic <聚合的dic>
199 | def classifyDBComment(self,url,id,comment):
200 | print("开始分类整理") #
201 | # print(comment['id'])
202 | sql = "" #评论正文插入 m nbvcbv
203 | sqlHead = "insert into newssentimentanalysis_"
204 | sqlTail = "comment (NikeName,Comment,Date,News_id_id) values (%s,%s,%s,%s)"
205 |
206 | # 插入评论得分的sql
207 | sql2 = ""
208 | sql2Tail = "analysis_comment(Pos_Score,Neg_score,Sentiment,Comment_id_id,Date) values (%s,%s,%s,last_insert_id(),%s)" # 这个我也知道
209 |
210 | # 这句就是更新新闻表中的数据,用id newssentimentanalysis_carcomment
211 | sqlNews = ""
212 | sqlNewsHead = "update newssentimentanalysis_"
213 | sqlNewsTail = "news SET Mcontent='已提取' where News_id=%s" #id是数字
214 |
215 | # 插入正文得
216 | # updateSql = "update tengxun SET hadmix='True' where id='%s' " #Mcontent,这个字段用来“未提取”-》“已提取
217 |
218 | if url.find('auto') != -1: # car news (the middle segment is the table name)
219 | sql = sqlHead + "car" + sqlTail
220 | sql2 = sqlHead + "car" + sql2Tail
221 | sqlNews =sqlNewsHead+ "car"+ sqlNewsTail
222 | pass
223 | elif url.find('tech') != -1: # technology
224 | sql = sqlHead + "technology" + sqlTail
225 | sql2 = sqlHead + "technology" + sql2Tail
226 | sqlNews =sqlNewsHead+ "technology"+ sqlNewsTail
227 |
228 | elif url.find('news') != -1: # default news
229 | sql = sqlHead + "home" + sqlTail
230 | sql2 = sqlHead + "home" + sql2Tail
231 | sqlNews =sqlNewsHead+ "home"+ sqlNewsTail
232 |
233 |
234 | elif url.find('ent') != -1: # entertainment
235 | sql = sqlHead + "entertainment" + sqlTail
236 | sql2 = sqlHead + "entertainment" + sql2Tail
237 | sqlNews =sqlNewsHead+ "entertainment"+ sqlNewsTail
238 |
239 | elif url.find('house') != -1: # real estate
240 | sql = sqlHead + "house" + sqlTail
241 | sql2 = sqlHead + "house" + sql2Tail
242 | sqlNews =sqlNewsHead+ "house"+ sqlNewsTail
243 |
244 | elif url.find('finance') != -1: # finance
245 | sql = sqlHead + "finance" + sqlTail
246 | sql2 = sqlHead + "finance" + sql2Tail
247 | sqlNews =sqlNewsHead+ "finance"+ sqlNewsTail
248 |
249 | elif url.find('sports') != -1: # sports
250 | sql = sqlHead + "sports" + sqlTail
251 | sql2 = sqlHead + "sports" + sql2Tail
252 | sqlNews =sqlNewsHead+ "sports"+ sqlNewsTail
253 |
254 | else:
255 | pass # could not classify; sql stays empty here
256 |
257 | # --------------------------------获取得分----------------------------------
258 | # print(type(comment['id']))
259 | print(comment['content'])
260 | print(emoji.demojize(comment['userinfo']['nick']))
261 |
262 | print(url,str(id)) # 这儿也是没有做异常处理的。
263 |
264 | # senti_counter = Senti_dict_class()
265 | # pos_score, neg_score, SentiResult = .Senti_Text(text)
266 | pos_score, neg_score, SentiResult = Senti_Text(comment['content']) # 这个是纯文本部分
267 | # pos_score, neg_score, SentiResult = Senti_Text(comment['content']) # 这个是纯文本部分
268 | if SentiResult.find("[")!=-1:
269 | SentiResult = SentiResult.replace("[","")
270 | if SentiResult.find("]")!=-1:
271 | SentiResult = SentiResult.replace("]","")
272 | print(SentiResult)
273 | # 中立的情况好像是返回直接是0
274 | print(pos_score)
275 | # ---------------------------这边开始数据库插入相关操作-----------------------------
276 | coon = DB.getmysqlconn() # 每次都默认获得一个新连接来进行相关的操作
277 | cur = coon.cursor(cursor=pymysql.cursors.DictCursor)
278 |
279 | try:
280 | cur.execute(sql, (
281 | comment['userinfo']['nick'], comment['content'],comment['time'], id)) # 插入指定的表(分类)
282 |
283 |
284 | cur.execute(sql2, (
285 | pos_score, neg_score, SentiResult,comment['time'])) # 插入评分 ,加上了日期了 todo获得评分
286 | # print(sqlNews % int(id))
287 | id = str(id)
288 |
289 | cur.execute(sqlNews, (id)) # 更新新闻的 Mcontent,这个是可以工作的啊
290 |
291 | coon.commit()
292 | return True
293 | # time.sleep()
294 | except Exception as e:
295 | print(pos_score)
296 | print(neg_score)
297 | print(SentiResult)
298 | # print(Tcontent)
299 | # 错误回滚
300 | print("事务回滚,跳过插入")
301 | # print(rowDic['id'])
302 | print(sql, (
303 | comment['userinfo']['nick'], comment['content'],comment['time'], id))
304 |
305 | print(id)
306 | print(type(id))
307 | print(sqlNews % (id))
308 |
309 |
310 | print(e)
311 | coon.rollback()
312 | traceback.print_exc()
313 | return False # 提取评论失败的都不管.
314 | finally:
315 | coon.commit() # 提交这个事务
316 | cur.close()
317 | coon.close()
318 | print("这条新闻的评论写入完毕")
319 |
320 | # 把tengxun表中的数据,计算评分,并且分类到django表中去
321 | def classifyDB(self): #
322 | resultDic = self.__query__( # todo 测试部分
323 | "select id,url,title,urlState,Hcontent,Mcontent,Tcontent,Acontent,newdate,fromWhere from tengxun where urlState='True' and hadmix='False'")
324 | print("开始分类整理")
325 | for rowDic in resultDic:
326 | # 插入分类新闻主表的sql
327 | sql = ""
328 | sqlHead = "insert into newssentimentanalysis_"
329 | sqlTail = "news (url,Title,UrlState,Hcontent,Mcontent,Tcontent,Acontent,Date,fromWhere) values (%s,%s,%s,%s,%s,%s,%s,%s,%s)"
330 |
331 | # 插入正文得分的sql
332 | sql2 = ""
333 | sql2Tail = "analysis_news(Pos_Score,Neg_score,Sentiment,News_id_id,Date) values (%s,%s,%s,last_insert_id(),%s)" # 这个是sql的
334 |
335 | # 这句就是更新tengxun表中的数据,用id
336 | updateSql = "update tengxun SET hadmix='True' where id='%s' " # 这个是分类用的数据.
337 |
338 | if rowDic['url'].find('auto') != -1: # car news (the middle segment is the table name)
339 | sql = sqlHead + "car" + sqlTail
340 | sql2 = sqlHead + "car" + sql2Tail
341 | pass
342 | elif rowDic['url'].find('tech') != -1: # technology
343 | sql = sqlHead + "technology" + sqlTail
344 | sql2 = sqlHead + "technology" + sql2Tail
345 |
346 | pass
347 | elif rowDic['url'].find('news') != -1: # default news
348 | sql = sqlHead + "home" + sqlTail
349 | sql2 = sqlHead + "home" + sql2Tail
350 |
351 | pass
352 | elif rowDic['url'].find('ent') != -1: # entertainment
353 | sql = sqlHead + "entertainment" + sqlTail
354 | sql2 = sqlHead + "entertainment" + sql2Tail
355 |
356 | pass
357 | elif rowDic['url'].find('house') != -1: # real estate
358 | sql = sqlHead + "house" + sqlTail
359 | sql2 = sqlHead + "house" + sql2Tail
360 |
361 | pass
362 | elif rowDic['url'].find('finance') != -1: # finance
363 | sql = sqlHead + "finance" + sqlTail
364 | sql2 = sqlHead + "finance" + sql2Tail
365 |
366 | pass
367 | elif rowDic['url'].find('sports') != -1: # sports
368 | sql = sqlHead + "sports" + sqlTail
369 | sql2 = sqlHead + "sports" + sql2Tail
370 |
371 | pass
372 | else:
373 | print("这边这种是网易的情况-归为默认新闻home中去")
374 |
375 | sql = sqlHead + "home" + sqlTail
376 | sql2 = sqlHead + "home" + sql2Tail
377 |
378 | pass # unclassified URLs (e.g. wangyi) fall back to the default home table
379 |
380 | # --------------------------------获取得分----------------------------------
381 | print("Tcontent长度")
382 | print(len(rowDic['Tcontent']))
383 | pos_score, neg_score, SentiResult = "", "", ""
384 |
385 | # senti_counter = Senti_dict_class()
386 | pos_score, neg_score, SentiResult = Senti_Text(rowDic['Tcontent'])
387 | # pos_score, neg_score, SentiResult = senti_counter.Senti_Text(rowDic['Tcontent']) # 这个是纯文本部分
388 |
389 | # todo 进行分数写入和的部分
390 |
391 |
392 | # pos_score, neg_score, SentiResult = Senti_Sentence(rowDic['Tcontent']) #这个是纯文本部分
393 |
394 | print("分类时候写入分数检查")
395 | print()
396 |
397 | # print(rowDic['Tcontent'])
398 | # print()
399 | print(sql % (
400 | rowDic['url'], rowDic['title'], True, rowDic['Hcontent'], '未提取', rowDic['Tcontent'], rowDic['Acontent'],
401 | rowDic['newdate'], rowDic['fromWhere']
402 | ))
403 | print(pos_score)
404 | print(neg_score)
405 | print(SentiResult)
406 |
407 | # ---------------------------这边开始数据库插入相关操作-----------------------------
408 |
409 | coon = DB.getmysqlconn() # 每次都默认获得一个新连接来进行相关的操作
410 | cur = coon.cursor(cursor=pymysql.cursors.DictCursor)
411 | # print(rowDic['url'])
412 | # print(rowDic['title'])
413 | # print(rowDic['Hcontent'])
414 | # print('未提取')
415 | # print(rowDic['Tcontent'])
416 | # print(rowDic['Acontent'])
417 | # print(rowDic['newdate'])
418 | # print(rowDic['fromWhere'])
419 |
420 | # print((sql %(
421 | # rowDic['url'],rowDic['title'],"True",rowDic['Hcontent'],'未提取',rowDic['Tcontent'],rowDic['Acontent'],rowDic['newdate'],rowDic['fromWhere']
422 | # )
423 | # ))
424 |
425 | try: # 三个一起操作,很多麻烦事情的。可以,这样操作也是可以的。
426 | cur.execute(sql,
427 | (
428 | rowDic['url'], rowDic['title'], True, rowDic['Hcontent'], '未提取', rowDic['Tcontent'],
429 | rowDic['Acontent'], rowDic['newdate'], rowDic['fromWhere']
430 | )
431 | ) # 插入指定的表(分类)
432 |
433 | print("插入成功才用得上这个的把。") # 无法提取到这个的。在写一次查询把。
434 | # print(cur.lastrowid()) # 上一个插入的id是,还真是有,那就直接返回过来就可以了
435 | # print(type(cur.lastrowid())) # 上一个插入的id是,还真是有,那就直接返回过来就可以了
436 |
437 | cur.execute(sql2, (pos_score, neg_score, SentiResult, rowDic['newdate'])) # 插入评分 todo获得评分
438 | cur.execute(updateSql, (rowDic['id'])) # 更新tengxun hadmix,这个是可以工作的啊
439 | # 提交
440 | coon.commit()
441 |
442 | except Exception as e:
443 | # 错误回滚
444 | print("事务回滚,跳过插入")
445 | # print(rowDic['id'])
446 | # print(sql%(rowDic['url'],rowDic['title'],True,rowDic['Hcontent'],'未使用',rowDic['Tcontent'],rowDic['Acontent'],rowDic['newdate'],rowDic['fromWhere']))
447 | print(e)
448 | coon.rollback()
449 | traceback.print_exc()
450 |
451 | finally:
452 | # print("插入成功")
453 | coon.commit() # 提交这个事务
454 | cur.close()
455 | coon.close()
456 | print("今天的量分完了")
457 |
458 |
459 |
460 |
461 | if __name__ == "__main__": # 下面都是用来测试用的。
462 |
463 | chak = DB()
464 | # chak.classifyDB()
465 | # chak. 测试用调用
466 | chak.__query__("update newssentimentanalysis_carnews set Mcontent = '无评论' where News_id=4")
467 |
468 |
469 |
470 | print("DB finish!")
471 |
472 |
473 |
474 |
475 |
--------------------------------------------------------------------------------
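DB pulls its connection parameters from mysqlInfo in Crawler/settings.py (the commented-out dict near the top of the file shows the expected keys) and hands every call a pooled pymysql connection with a DictCursor. A short usage sketch, assuming such a mysqlInfo exists and the tengxun table is present:

from Crawler.expand_package.DBcontrol import DB

db = DB()
# __query__ returns a list of dicts because the cursor is a DictCursor
rows = db.__query__("select id, url, title from tengxun where urlState='True' limit 5")
for row in rows or []:          # __query__ returns None if the statement failed
    print(row['id'], row['title'])
db.dispose()                    # close the connection and cursor opened in __init__
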
/Crawler/Crawler/expand_package/Sent_Dict/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/realzhengyiming/newsSpier_scrapy/2852f63981f764bfc0f2c733d52f104a3bf3c9e1/Crawler/Crawler/expand_package/Sent_Dict/__init__.py
--------------------------------------------------------------------------------
/Crawler/Crawler/expand_package/Sent_Dict/否定词.txt:
--------------------------------------------------------------------------------
1 | 不
2 | 不是
3 | 不能
4 | 不可
5 | 没有
6 | 不要
7 | 别
8 | 没
9 | 无
10 | 莫
11 | 未
12 | 勿
13 | 休
14 | 甭
15 | 非
--------------------------------------------------------------------------------
/Crawler/Crawler/expand_package/Sent_Dict/程度级别词语.txt:
--------------------------------------------------------------------------------
1 | extreme
2 | 百分之百
3 | 倍加
4 | 备至
5 | 不得了
6 | 不堪
7 | 不可开交
8 | 不亦乐乎
9 | 不折不扣
10 | 彻头彻尾
11 | 充分
12 | 到头
13 | 地地道道
14 | 非常
15 | 极
16 | 极度
17 | 极端
18 | 极其
19 | 极为
20 | 截然
21 | 尽
22 | 惊人地
23 | 绝
24 | 绝顶
25 | 绝对
26 | 绝对化
27 | 刻骨
28 | 酷
29 | 满
30 | 满贯
31 | 满心
32 | 莫大
33 | 奇
34 | 入骨
35 | 甚为
36 | 十二分
37 | 十分
38 | 十足
39 | 死
40 | 滔天
41 | 痛
42 | 透
43 | 完全
44 | 完完全全
45 | 万
46 | 万般
47 | 万分
48 | 万万
49 | 无比
50 | 无度
51 | 无可估量
52 | 无以复加
53 | 无以伦比
54 | 要命
55 | 要死
56 | 已极
57 | 已甚
58 | 异常
59 | 逾常
60 | 贼
61 | 之极
62 | 之至
63 | 至极
64 | 卓绝
65 | 最为
66 | 佼佼
67 | 郅
68 | 綦
69 | 齁
70 | 最
71 | very
72 | 不为过
73 | 超
74 | 超额
75 | 超外差
76 | 超微结构
77 | 超物质
78 | 出头
79 | 多
80 | 浮
81 | 过
82 | 过度
83 | 过分
84 | 过火
85 | 过劲
86 | 过了头
87 | 过猛
88 | 过热
89 | 过甚
90 | 过头
91 | 过于
92 | 过逾
93 | 何止
94 | 何啻
95 | 开外
96 | 苦
97 | 老
98 | 偏
99 | 强
100 | 溢
101 | 忒
102 | 不过
103 | 不少
104 | 不胜
105 | 惨
106 | 沉
107 | 沉沉
108 | 出奇
109 | 大为
110 | 多
111 | 多多
112 | 多加
113 | 多么
114 | 分外
115 | 格外
116 | 够瞧的
117 | 够呛
118 | 好
119 | 好不
120 | 何等
121 | 很
122 | 很是
123 | 坏
124 | 可
125 | 老
126 | 老大
127 | 良
128 | 颇
129 | 颇为
130 | 甚
131 | 实在
132 | 太
133 | 太甚
134 | 特
135 | 特别
136 | 尤
137 | 尤其
138 | 尤为
139 | 尤以
140 | 远
141 | 着实
142 | 曷
143 | 碜
144 | more
145 | 大不了
146 | 多
147 | 更
148 | 比较
149 | 更加
150 | 更进一步
151 | 更为
152 | 还
153 | 还要
154 | 较
155 | 较比
156 | 较为
157 | 进一步
158 | 那般
159 | 那么
160 | 那样
161 | 强
162 | 如斯
163 | 益
164 | 益发
165 | 尤甚
166 | 逾
167 | 愈
168 | 愈 ... 愈
169 | 愈发
170 | 愈加
171 | 愈来愈
172 | 愈益
173 | 远远
174 | 越 ... 越
175 | 越发
176 | 越加
177 | 越来越
178 | 越是
179 | 这般
180 | 这样
181 | 足
182 | 足足
183 | ish
184 | 点点滴滴
185 | 多多少少
186 | 怪
187 | 好生
188 | 还
189 | 或多或少
190 | 略
191 | 略加
192 | 略略
193 | 略微
194 | 略为
195 | 蛮
196 | 稍
197 | 稍稍
198 | 稍微
199 | 稍为
200 | 稍许
201 | 挺
202 | 未免
203 | 相当
204 | 些
205 | 些微
206 | 些小
207 | 一点
208 | 一点儿
209 | 一些
210 | 有点
211 | 有点儿
212 | 有些
213 | 半点
214 | 不大
215 | 不丁点儿
216 | 不甚
217 | 不怎么
218 | 聊
219 | 没怎么
220 | 轻度
221 | 弱
222 | 丝毫
223 | 微
224 | 相对
225 | last
226 |
--------------------------------------------------------------------------------
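程度级别词语.txt is a flat word list in which the English tokens extreme, very, more, ish and last act as section markers; senti_dict.py slices between them to give degree adverbs the weights 4.0, 3.0, 2.0 and 0.5. A small sketch of that slicing, assuming degree_word is the file read in as a list of stripped lines:

def split_degree_bands(degree_word):
    # returns {weight: [words]} using the marker tokens as band boundaries
    return {
        4.0: degree_word[degree_word.index('extreme') + 1: degree_word.index('very')],
        3.0: degree_word[degree_word.index('very') + 1: degree_word.index('more')],
        2.0: degree_word[degree_word.index('more') + 1: degree_word.index('ish')],
        0.5: degree_word[degree_word.index('ish') + 1: degree_word.index('last')],
    }
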
/Crawler/Crawler/expand_package/WordCloud.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*
2 |
3 | from wordcloud import WordCloud
4 | import PIL.Image as image
5 | import numpy as np
6 | import jieba
7 | import datetime
8 | import os
9 | import time
10 |
11 | from Crawler.settings import IMAGES_STORE, WORDCLOUD_STORE
12 |
13 | Yesterday = (datetime.datetime.now()-datetime.timedelta(days=2)).strftime('%Y-%m-%d')#昨天
14 |
15 | def trans_CN(text):
16 | #中文要进行分词,不像英文自动有空格
17 | wordlist = jieba.cut(text)
18 | result = ' '.join(wordlist)
19 | return result
20 |
21 |
22 | def Gen_WordCloud(text,Newsid):
23 | # input: text, the article body; Newsid, the article id
24 | # output: the path of the generated word-cloud image
25 | text = trans_CN(text)#分词
26 | #mask = np.array(image.open('./static/images/cloud.png'))#如果要把词云形状弄成特定图形要用该语句
27 | wordcloud = WordCloud(
28 | #mask=mask,
29 | font_path = "C:\Windows\Fonts\simhei.ttf", #加载中文字体
30 | background_color='white', #背景色
31 | max_words=2000,#允许最大词汇
32 | #max_font_size=60 #最大号字体
33 | ).generate(text)
34 |
35 | image_produce = wordcloud.to_image()
36 | name = str(Newsid)+".png" # build the image file name
37 | # path = "../../static/images/WordCloud/" #保存文件夹
38 | path = WORDCLOUD_STORE
39 | if not os.path.exists(path):
40 | os.makedirs(path)
41 | save_path =path+name #保存的完整路径 这个地址也是创建到爬虫项目的外面,刚好,目录结构不变的情况下。
42 | print(save_path)
43 | wordcloud.to_file(save_path) #保存词云
44 | img_path=save_path # path of the saved word cloud, to be handed to the front-end <img> tag
45 | #print("save to :",save_path)
46 | #image_produce.show()
47 | print("生成词云成功了!")
48 | return img_path
49 |
50 | if __name__=="__main__":
51 | Newsid="shitshit"
52 | text='近日,上汽大通官方公布了全新MPV车型G20的最新官图,从此次公布的官图中不难看出,大通G20在外形轮廓上沿用了家族式设计。大灯采用了全LED光源,造型极具科技感。内饰中控区采用了悬浮式设计,营造出了更多的储物空间。据悉,大通G20将在2019上海车展期间正式亮相。从官图细节中可以看出,大通G20的前脸设计相比G10车型焕然一新。不规则形状的大灯和硕大的进气格栅相连接,其大灯内部结构也更加复杂,采用全LED光源。侧面轮廓上,大通G20采用了悬浮式车窗设计。尾灯同样采用全LED光源,两侧尾灯之间采用镀铬条相连,尾部采用字母logo居中的形式,而非图形logo。内饰部分,厂方着重强调了悬浮式中控设计。从官图中可以看出,大通G20采用了旋钮式换挡操作,换挡旋钮四周集成了众多驾驶辅助功能,视觉效果上具备更强的科技感。而悬浮式设计则为底部营造了更大的储物空间,便于放置乘客带上车的手包或其它物品。目前,官方暂未透露新车将会搭载哪款动力总成。根据推测,大通G20有望搭载2.0T汽油发动机和1.9T柴油发动机,预计在2019年上海车展期间正式亮相。'
53 | Gen_WordCloud(text,Newsid)
--------------------------------------------------------------------------------
/Crawler/Crawler/expand_package/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/realzhengyiming/newsSpier_scrapy/2852f63981f764bfc0f2c733d52f104a3bf3c9e1/Crawler/Crawler/expand_package/__init__.py
--------------------------------------------------------------------------------
/Crawler/Crawler/expand_package/makebeautifulSoup.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*
2 |
3 | #coding=utf-8
4 | import random
5 | import time
6 | import json
7 | import chardet
8 | import requests
9 | import retrying
10 | from bs4 import BeautifulSoup
11 |
12 | class makeBS:
13 | @retrying.retry(stop_max_attempt_number=4) # retry up to 4 times; add wait_fixed (ms) if a pause between attempts is wanted
14 | def mobiResponse(self,requestURL): #这个留着吧
15 | print(requestURL)
16 | my_headers = [ # 这边为了得到直接的手机端的页面代码返回,直接使用手机ua
17 | # 'Mozilla/5.0 (Linux; Android 7.1.1; MI 6 Build/NMF26X; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043508 Safari/537.36 MicroMessenger/6.5.13.1100 NetType/WIFI Language/zh_CN',
18 | # 'Mozilla/5.0 (Linux; Android 7.1.1; MI 6 Build/NMF26X) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 Mobile Safari/537.36 Maxthon/3047',
19 | 'Mozilla/5.0 (Linux; Android 8.0.0; Pixel 2 XL Build/OPD1.170816.004) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Mobile Safari/537.36',
20 | # 'Mozilla/5.0 (Linux; U; Android 7.0; zh-cn; STF-AL00 Build/HUAWEISTF-AL00) AppleWebKit/537.36 (KHTML, like Gecko)Version/4.0 Chrome/37.0.0.0 MQQBrowser/7.9 Mobile Safari/537.36',
21 | # 'Mozilla/5.0 (Linux; U; Android 6.0.1; zh-CN; SM-C7000 Build/MMB29M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/40.0.2214.89 UCBrowser/11.6.2.948 Mobile Safari/537.36',
22 | # 'Mozilla/5.0 (Linux; Android 7.0; STF-AL10 Build/HUAWEISTF-AL10; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043508 Safari/537.36 V1_AND_SQ_7.2.0_730_YYB_D QQ/7.2.0.3270 NetType/4G WebP/0.3.0 Pixel/1080'
23 | ]
24 | headers = {"User-Agent": random.choice(my_headers), 'Referer':requestURL} # 默认值
25 | try:
26 | rawhtml = requests.get(requestURL, headers=headers, allow_redirects=True, #跳转怎么是false
27 | timeout=30) # 一般提取文本的话,那就用text,如果是文件就content
28 | # print(rawhtml.headers)
29 | # rawhtml.encoding = "GBK" ##gbk>gb2312 使用这种方式尚且还有乱码的情况,部分乱码,那就是gbk可以修复
30 | # print(chardet.detect(rawhtml.content)['encoding'])
31 | if ("GB2312" == chardet.detect(rawhtml.content)['encoding']):
32 | rawhtml.encoding = "gbk"
33 | else:
34 | rawhtml.encoding = chardet.detect(rawhtml.content)['encoding'] # 这样应该就可以直接默认来编码了
35 | if rawhtml.status_code == 504:
36 | print(504)
37 | return
38 | print(rawhtml.url)
39 | print("状态码" + str(rawhtml.status_code))
40 | html = rawhtml.text
41 | return html #返回了这个网页的html 文档,然后再解析一次就可以了
42 | except Exception as e:
43 | print(e)
44 | return
45 |
46 | def makesoup(self,url): # 这儿是按页来打开的
47 | if url==None:
48 | return
49 | my_headers = [
50 | 'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30',
51 | 'Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0',
52 | 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.2; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)',
53 | 'Opera/9.80 (Windows NT 5.1; U; zh-cn) Presto/2.9.168 Version/11.50',
54 | 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1',
55 | 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)']
56 | headers = {"User-Agent": random.choice(my_headers)} #默认值
57 | if(url.find("ifeng.com")!=-1): #是凤凰的网址的话
58 | print("fenghuangNews")
59 | headers = {"User-Agent": random.choice(my_headers),
60 | "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
61 | 'Accept-Encoding':'gbk, utf-8',
62 | 'Accept-Language': 'zh-CN,zh;q=0.9',
63 | }
64 | if (url.find(".qq.com")!=-1):
65 | print("qqnews")
66 | headers = {
67 | "User-Agent": random.choice(my_headers),
68 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
69 | 'Accept-Encoding': 'gbk, utf-8',
70 | 'Accept-Language': 'zh-CN,zh;q=0.9',
71 | 'referer':url
72 | }
73 | """
74 | 获取网站的soup对象,#看看还能不能增加代理的东西,进来
75 | 有两个请求头的自定义,但是,为什么要分开来呢
76 | """
77 | # headers = {"User-Agent": random.choice(my_headers)}
78 | soup = None
79 | address = "http://223.203.0.14:8000" #默就是用了代理,怎么还是失败呢
80 | # address = None #访问页面的这个要使用代理才可以
81 | proxies = {'http': address, "https": address} # , 'https': 'http://localhost:8888',这儿现在就是没用代理的情况下
82 | try:
83 | rawhtml = requests.get(url, headers=headers, allow_redirects=True,timeout=60) #一般提取文本的话,那就用text,如果是文件就content
84 | if url.find("ifeng")!=-1:
85 | # print(chardet.detect(rawhtml.content)['encoding']) 经常性的检测错误
86 | print(chardet.detect(rawhtml.content)['encoding'])
87 | if ("GB2312"== chardet.detect(rawhtml.content)['encoding']):
88 | rawhtml.encoding = "gbk"
89 | else:
90 | rawhtml.encoding = "utf-8" # 这样应该就可以直接默认来编码了
91 | else:
92 | rawhtml.encoding = chardet.detect(rawhtml.content)['encoding'] #这样应该就可以直接默认来编码了
93 | # print(rawhtml.status_code)
94 | # print(rawhtml.headers)
95 | soup = BeautifulSoup(rawhtml.text, 'lxml')
96 | return soup
97 | except Exception as e: #如果超时的话就变成这样子
98 | print(e)
99 | # print(rawhtml.status_code)
100 | return soup #没有的话就是返回空的在这儿的None
101 |
102 | def makeBSjson(self,url):
103 | my_headers = [
104 | 'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30',
105 | 'Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0',
106 | 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.2; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)',
107 | 'Opera/9.80 (Windows NT 5.1; U; zh-cn) Presto/2.9.168 Version/11.50',
108 | 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1',
109 | 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)']
110 | headers = {"User-Agent": random.choice(my_headers)} # 默认值
111 | try:
112 | r = requests.get(url, headers=headers, allow_redirects=True,timeout=60) # 一般提取文本的话,那就用text,如果是文件就content
113 | json_response = r.content.decode() # 获取r的文本 就是一个json字符串
114 | # 将json字符串转换成dic字典对象
115 | dict_json = json.loads(json_response)
116 | return dict_json
117 | except Exception as e:
118 | print(e)
119 |
120 | #------------------------------2019-新增关于解析腾讯评论的请求--------------------------------------------------
121 | def getCmt_id(self,url):
122 | try:
123 | url = url.replace("//",'/') #使用的时候再转换,因为数据库里面的是四条杠的。
124 | response = requests.get(url)
125 | html = response.content
126 | bs = BeautifulSoup(html,'lxml')
127 | # pprint(BS)
128 | for i in bs.find_all("script"): # 这儿那么多可以换成正则表达式来找出这一大长串的数字。
129 | if i.text.find("cmt_id") != -1:
130 | # print(i.text) # 都是数字的话,那就把连续的数字都提取出来好了。
131 | for object in i.text.split(";"):
132 | if object.find("cmt_id") != -1:
133 | cmt_id = object.split("=")[1] # 这样会不会很危险呢。。。直接运行js代码。
134 | return(cmt_id)
135 | # print(object)
136 | return #如果没有找到的话,那就返回None
137 | except Exception as e:
138 | print(e)
139 | return
140 |
141 |
142 |
143 |
144 |
145 |
146 | if __name__ == "__main__": #这个就是url的东西
147 | url = "https://pl.ifeng.com/a/20181010/60101359_0.shtml"
148 | cooker = makeBS()
149 | html = cooker.makesoup(url)
150 |
151 |
152 |
153 |
--------------------------------------------------------------------------------
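The retry decorator on mobiResponse only limits the number of attempts, so failed requests are retried immediately. If a pause between attempts is wanted, the retrying library accepts wait_fixed (in milliseconds); a minimal sketch, not part of the original code:

import retrying
import requests

@retrying.retry(stop_max_attempt_number=4, wait_fixed=2000)  # 4 attempts, 2 s apart
def fetch(url):
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()   # raising here is what triggers the next retry
    return resp.text
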
/Crawler/Crawler/expand_package/picDownloadScript.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*
2 |
3 | # Download images into a target directory (used for the images embedded in the news pages)
4 | import hashlib
5 | import os
6 | import time
7 | import traceback
8 | import requests ##导入requests
9 |
10 | # from config import downloadPath
11 | from Crawler.settings import IMAGES_STORE # 这个是自己定义好的配置文件,一般可以放在相同目录下,可以直接访问。
12 |
13 | class Download:
14 | def __init__(self, path): # 先设置好下载的路径
15 | if (path == None):
16 | self.path = IMAGES_STORE # 这边也直接使用默认使用配置文件的地址
17 | print("是 None") # 每次管道也是还是重新生成的一个啊
18 | else:
19 | self.path = path
20 |
21 | def makeMd5(self,url):
22 | obj = hashlib.md5()
23 | obj.update(bytes(url,encoding="utf-8"))
24 | return obj.hexdigest()
25 |
26 | def downloadImg(self, img_url, imgName, referer, now_date): # returns the relative path of the saved image, or None on failure
27 | time.sleep(0.5)
28 | '''
29 | img_url: download URL of the image
30 | imgName: file name to save the image under
31 | referer: sent with the request to get past Referer-based anti-crawling checks
32 | now_date: sub-folder (a date string) under the configured path to download into; free to change for testing
33 | when imgName is None a unique md5 of the image url is used, similar to scrapy's image pipeline
34 | '''
35 | headers = {
36 | 'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
37 | 'Referer': referer} ##浏览器请求头(大部分网站没有这个请求头会报错、请务必加上哦)
38 | try:
39 | # int(shit) # todo 图片可以不下载了
40 | img = requests.get(img_url, headers=headers)
41 | # print(img)
42 | # print(self.path)
43 | if (False == os.path.exists(os.path.join(self.path, now_date))): # 不存在这个目录的话
44 | os.makedirs(self.path + '/' + now_date)
45 | if imgName==None: # 不设置的话,默认就是md5
46 | imgName = self.makeMd5(img_url) #改良过后通过url来生成唯一的md5的
47 | dPath = os.path.join(self.path, now_date, imgName + '.jpg') # imgName传进来不需要带时间
48 | # print(dPath)
49 | print("图片的文件名 " + dPath)
50 | f = open(dPath, 'ab')
51 | f.write(img.content)
52 | f.close()
53 | # print("下载成功")
54 | return os.path.join( now_date, imgName + '.jpg') # 返回相对路径
55 | except Exception as e:
56 | print(img_url)
57 | print(e)
58 | traceback.print_exc()
59 |
60 |
61 | if __name__ == "__main__":
62 | # 局部测试代码
63 | imgUrl = "http://inews.gtimg.com/newsapp_match/0/5403685404/0"
64 | downloadTool = Download(None) # todo 这儿有一个问题就是,这个图片的下载地址网页部分是带地址的,所以,最好的是网页部分不需要要再加上地址的文件夹,统一使用
65 | path = downloadTool.downloadImg(img_url="http://img1.gtimg.com/datalib_img//18-07-03/a/fda81a84eb06919ba40782c45ebbc28d.jpg" ,
66 | imgName = None,
67 | referer = None,
68 | now_date = "20190505") # 这个是下面新建力的文件夹
69 | print(path)
--------------------------------------------------------------------------------
/Crawler/Crawler/expand_package/senti_dict.py:
--------------------------------------------------------------------------------
1 | #-*- coding:utf8 -*-
2 |
3 |
4 |
5 | import jieba
6 | import numpy as np
7 | import re
8 | # open a lexicon file and return its entries as a list
9 | def open_dict(Dict = 'name', path=r'./Sent_Dict/'):
10 | path = os.path.join(path, '%s.txt' % Dict)
11 | dictionary = open(path, 'r', encoding='utf-8')
12 | dict = []
13 | for word in dictionary:
14 | word = word.strip('\n')
15 | dict.append(word)
16 | return dict
17 |
18 | def judgeodd(num):
19 | if (num % 2) == 0:
20 | return 'even'
21 | else:
22 | return 'odd'
23 |
24 |
25 |
26 |
27 | def open_file_as_text(filename):
28 | dict = []
29 | with open(filename, encoding='utf-8') as f:
30 | # print(f.read())
31 | dict = f.readlines()
32 | new = []
33 | for word in dict:
34 | new.append(word.replace("\n", ""))
35 | print(new)
36 | return new
37 |
38 |
39 | import os
40 | current_path = os.path.dirname(__file__)
41 | deny_word = open_dict(Dict = '否定词', path= current_path)
42 | posdict = open_dict(Dict = 'positive', path= current_path)
43 | negdict = open_dict(Dict = 'negative', path= current_path)
44 | degree_word = open_dict(Dict = '程度级别词语', path= current_path)
45 |
46 | # deny_word = open_file_as_text(current_path+'否定词.txt')
47 | # posdict = open_file_as_text(current_path+'positive.txt')
48 | # negdict = open_file_as_text(current_path+'negative.txt')
49 | # degree_word = open_file_as_text(current_path+'程度级别词语.txt')
50 |
51 | mostdict = degree_word[degree_word.index('extreme')+1 : degree_word.index('very')]#权重4,即在情感词前乘以4
52 | verydict = degree_word[degree_word.index('very')+1 : degree_word.index('more')]#权重3
53 | moredict = degree_word[degree_word.index('more')+1 : degree_word.index('ish')]#权重2
54 | ishdict = degree_word[degree_word.index('ish')+1 : degree_word.index('last')]#权重0.5
55 |
56 |
57 | import jieba
58 | def sentiment_score_list(dataset):
59 | seg_sentence = dataset.split(' 。')
60 | count1 = []
61 | count2 = []
62 | for sen in seg_sentence: #循环遍历每一个评论
63 | segtmp = jieba.lcut(sen, cut_all=False) #把句子进行分词,以列表的形式返回
64 | i = 0 #记录扫描到的词的位置
65 | a = 0 #记录情感词的位置
66 | poscount = 0 #积极词的第一次分值
67 | poscount2 = 0 #积极词反转后的分值
68 | poscount3 = 0 #积极词的最后分值(包括叹号的分值)
69 | negcount = 0
70 | negcount2 = 0
71 | negcount3 = 0
72 | for word in segtmp:
73 | if word in posdict: # 判断词语是否是情感词
74 | poscount += 1
75 | c = 0
76 | for w in segtmp[a:i]: # 扫描情感词前的程度词
77 | if w in mostdict:
78 | poscount *= 4.0
79 | elif w in verydict:
80 | poscount *= 3.0
81 | elif w in moredict:
82 | poscount *= 2.0
83 | elif w in ishdict:
84 | poscount *= 0.5
85 | elif w in deny_word:
86 | c += 1
87 | if judgeodd(c) == 'odd': # 扫描情感词前的否定词数
88 | poscount *= -1.0
89 | poscount2 += poscount
90 | poscount = 0
91 | poscount3 = poscount + poscount2 + poscount3
92 | poscount2 = 0
93 | else:
94 | poscount3 = poscount + poscount2 + poscount3
95 | poscount = 0
96 | a = i + 1 # 情感词的位置变化
97 |
98 | elif word in negdict: # 消极情感的分析,与上面一致
99 | negcount += 1
100 | d = 0
101 | for w in segtmp[a:i]:
102 | if w in mostdict:
103 | negcount *= 4.0
104 | elif w in verydict:
105 | negcount *= 3.0
106 | elif w in moredict:
107 | negcount *= 2.0
108 | elif w in ishdict:
109 | negcount *= 0.5
110 | elif w in deny_word: # count negation words, mirroring the positive branch above
111 | d += 1
112 | if judgeodd(d) == 'odd':
113 | negcount *= -1.0
114 | negcount2 += negcount
115 | negcount = 0
116 | negcount3 = negcount + negcount2 + negcount3
117 | negcount2 = 0
118 | else:
119 | negcount3 = negcount + negcount2 + negcount3
120 | negcount = 0
121 | a = i + 1
122 | elif word == '!' or word == '!': ##判断句子是否有感叹号
123 | for w2 in segtmp[::-1]: # 扫描感叹号前的情感词,发现后权值+2,然后退出循环
124 | if w2 in posdict or w2 in negdict:
125 | poscount3 += 2
126 | negcount3 += 2
127 | break
128 | i += 1 # 扫描词位置前移
129 |
130 |
131 | # 以下是防止出现负数的情况
132 | pos_count = 0
133 | neg_count = 0
134 | if poscount3 < 0 and negcount3 > 0:
135 | neg_count += negcount3 - poscount3
136 | pos_count = 0
137 | elif negcount3 < 0 and poscount3 > 0:
138 | pos_count = poscount3 - negcount3
139 | neg_count = 0
140 | elif poscount3 < 0 and negcount3 < 0:
141 | neg_count = -poscount3
142 | pos_count = -negcount3
143 | else:
144 | pos_count = poscount3
145 | neg_count = negcount3
146 |
147 | count1.append([pos_count, neg_count])
148 | count2.append(count1)
149 | count1 = []
150 |
151 | return count2
152 |
153 | def sentiment_score(senti_score_list):
154 | score = []
155 | for review in senti_score_list:
156 | score_array = np.array(review)
157 | Pos = np.sum(score_array[:, 0])
158 | Neg = np.sum(score_array[:, 1])
159 | AvgPos = np.mean(score_array[:, 0])
160 | AvgPos = float('%.1f'%AvgPos)
161 | AvgNeg = np.mean(score_array[:, 1])
162 | AvgNeg = float('%.1f'%AvgNeg)
163 | StdPos = np.std(score_array[:, 0])
164 | StdPos = float('%.1f'%StdPos)
165 | StdNeg = np.std(score_array[:, 1])
166 | StdNeg = float('%.1f'%StdNeg)
167 | score.append([Pos, Neg, AvgPos, AvgNeg, StdPos, StdNeg])
168 | return score
169 |
170 |
171 | def Senti_Sentence(word):
172 | if word == '':
173 | return 0,0,'NEU'
174 | else:
175 | result = sentiment_score(sentiment_score_list(str(word))) # 情感分析
176 | pos_score = result[0][0]
177 | neg_score = result[0][1]
178 | if pos_score == neg_score:
179 | SentiResult='NEU'
180 | elif pos_score > neg_score:
181 | SentiResult='POS'
182 | else:
183 | SentiResult='NEG'
184 | #print(pos_score,neg_score,SentiResult)
185 | return float(pos_score),float(neg_score),SentiResult
186 |
187 | def Senti_Text(text):
188 | if text == '':
189 | return 0,0,'NEU'
190 | else:
191 | text = str(text)
192 | seg_sentence = re.split('。|!|?|……|,',text)
193 | print(seg_sentence)
194 | pos_sum=0
195 | neg_sum=0
196 | sen_num=0
197 | for sentence in seg_sentence:
198 | if sentence != '':
199 | pos,neg,res=Senti_Sentence(sentence)
200 | pos_sum+=pos
201 | neg_sum+=neg
202 | sen_num+=1
203 | else:
204 | pass
205 | print('句子数:',sen_num)
206 | try:
207 | pos_score = pos_sum/sen_num
208 | neg_score = neg_sum/sen_num
209 | if pos_score == neg_score:
210 | SentiResult='NEU'
211 | elif pos_score > neg_score:
212 | SentiResult='POS'
213 | else:
214 | SentiResult='NEG'
215 | #print(pos_score,neg_score,SentiResult)
216 | return float(pos_score),float(neg_score),SentiResult
217 | except Exception as e : #
218 | print(e)
219 | return 0,0,'NEU'
220 |
221 |
222 |
223 | if __name__=="__main__":
224 | #data = '你就是个王八蛋,混账玩意!你们的手机真不好用!非常生气,我非常郁闷!!!!'
225 | #data2= '我好开心啊,非常非常非常高兴!今天我得了一百分,我很兴奋开心,愉快,开心'
226 | text='腾讯汽车 站]编辑从深圳市大兴观澜丰田了解到,卡罗拉双擎最高优惠0.30万元,促销时间为2019年03月01日--2019年03月03日, 欢迎有意向的朋友到店试乘试驾。卡罗拉双擎外观卡罗拉双擎内饰卡罗拉双擎细节版权声明:本文系腾讯汽车独家稿件,版权为腾讯汽车所有。文章内的价格为编辑在车市第一线真实采集到的当日价格,由于汽车价格变化莫测,同时此价格只是个体经销商的行为,所以价格仅供参考使用。'
227 | # print(sentiment_score_list(data))
228 | # print(sentiment_score(sentiment_score_list(data)))
229 | #print(sentiment_score(sentiment_score_list(data2)))
230 |
231 | # current_path = os.path.dirname(__file__)
232 |
233 | print("当前的路径是")
234 | # print(current_path)
235 | pos_score,neg_score,SentiResult=Senti_Text(text)
236 | print( pos_score,neg_score,SentiResult)
237 |
238 |
239 |
240 |
241 |
--------------------------------------------------------------------------------
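Senti_Text splits the text into clauses, scores each clause by looking words up in positive.txt / negative.txt, multiplies by any preceding degree word (weight 4 / 3 / 2 / 0.5), flips the sign for an odd number of preceding negations, and finally averages the clause scores. A short usage sketch; the concrete numbers depend on positive.txt and negative.txt, which are linked but not inlined in this dump:

from Crawler.expand_package.senti_dict import Senti_Text

pos, neg, label = Senti_Text("这款车非常好,但是售后服务不太让人满意。")
print(pos, neg, label)   # label is one of 'POS', 'NEG', 'NEU'
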
/Crawler/Crawler/expand_package/senti_dict_class.py:
--------------------------------------------------------------------------------
1 | #-*- coding:utf8 -*-
2 | import jieba
3 | import numpy as np
4 | import re
5 | import jieba
6 | #打开词典文件,返回列表
7 |
8 |
9 | class Senti_dict_class:
10 | def __init__(self):
11 | pass
12 |
13 | # self.deny_word = self.open_dict(Dict='否定词', path=r'./Sent_Dict/')
14 | # self.posdict = self.open_dict(Dict='positive', path=r'./Sent_Dict/')
15 | # self.negdict = self.open_dict(Dict='negative', path=r'./Sent_Dict/')
16 | # self.degree_word = self.open_dict(Dict='程度级别词语', path=r'./Sent_Dict/')
17 |
18 | self.deny_word = self.open_file_as_text('./Sent_Dict/否定词.txt')
19 | self.posdict = self.open_file_as_text('./Sent_Dict/positive.txt')
20 | self.negdict = self.open_file_as_text('./Sent_Dict/negative.txt')
21 | self.degree_word = self.open_file_as_text('./Sent_Dict/程度级别词语.txt')
22 |
23 |
24 | self.mostdict = self.degree_word[self.degree_word.index('extreme') + 1: self.degree_word.index('very')] # 权重4,即在情感词前乘以4
25 | self.verydict = self.degree_word[self.degree_word.index('very') + 1: self.degree_word.index('more')] # 权重3
26 | self.moredict = self.degree_word[self.degree_word.index('more') + 1: self.degree_word.index('ish')] # 权重2
27 | self.ishdict = self.degree_word[self.degree_word.index('ish') + 1: self.degree_word.index('last')] # 权重0.5
28 |
29 |
30 | def open_file_as_text(self,filename):
31 | dict = []
32 | with open(filename, encoding='utf-8') as f:
33 | # print(f.read())
34 | dict=f.readlines()
35 | new = []
36 | for word in dict:
37 | new.append(word.replace("\n",""))
38 | print(new)
39 | return new
40 |
41 | def open_dict(self,Dict = 'name', path=r'Sent_Dict/'):
42 | path = path + '%s.txt' % Dict
43 | dictionary = open(path, 'r', encoding='utf-8')
44 | dict = []
45 | for word in dictionary:
46 | word = word.strip('\n')
47 | dict.append(word)
48 | return dict
49 |
50 | def judgeodd(self,num):
51 | if (num % 2) == 0:
52 | return 'even'
53 | else:
54 | return 'odd'
55 |
56 |
57 |
58 |
59 |
60 |
61 | def sentiment_score_list(self,dataset):
62 | seg_sentence = dataset.split(' 。')
63 | count1 = []
64 | count2 = []
65 | for sen in seg_sentence: #循环遍历每一个评论
66 | segtmp = jieba.lcut(sen, cut_all=False) #把句子进行分词,以列表的形式返回
67 | i = 0 #记录扫描到的词的位置
68 | a = 0 #记录情感词的位置
69 | poscount = 0 #积极词的第一次分值
70 | poscount2 = 0 #积极词反转后的分值
71 | poscount3 = 0 #积极词的最后分值(包括叹号的分值)
72 | negcount = 0
73 | negcount2 = 0
74 | negcount3 = 0
75 | for word in segtmp:
76 | if word in self.posdict: # 判断词语是否是情感词
77 | poscount += 1
78 | c = 0
79 | for w in segtmp[a:i]: # 扫描情感词前的程度词
80 | if w in self.mostdict:
81 | poscount *= 4.0
82 | elif w in self.verydict:
83 | poscount *= 3.0
84 | elif w in self.moredict:
85 | poscount *= 2.0
86 | elif w in self.ishdict:
87 | poscount *= 0.5
88 | elif w in self.deny_word:
89 | c += 1
90 | if self.judgeodd(c) == 'odd': # 扫描情感词前的否定词数
91 | poscount *= -1.0
92 | poscount2 += poscount
93 | poscount = 0
94 | poscount3 = poscount + poscount2 + poscount3
95 | poscount2 = 0
96 | else:
97 | poscount3 = poscount + poscount2 + poscount3
98 | poscount = 0
99 | a = i + 1 # 情感词的位置变化
100 |
101 | elif word in self.negdict: # 消极情感的分析,与上面一致
102 | negcount += 1
103 | d = 0
104 | for w in segtmp[a:i]:
105 | if w in self.mostdict:
106 | negcount *= 4.0
107 | elif w in self.verydict:
108 | negcount *= 3.0
109 | elif w in self.moredict:
110 | negcount *= 2.0
111 | elif w in self.ishdict:
112 | negcount *= 0.5
113 | elif w in self.deny_word:  # count negation words before the sentiment word, mirroring the positive branch
114 | d += 1
115 | if self.judgeodd(d) == 'odd':
116 | negcount *= -1.0
117 | negcount2 += negcount
118 | negcount = 0
119 | negcount3 = negcount + negcount2 + negcount3
120 | negcount2 = 0
121 | else:
122 | negcount3 = negcount + negcount2 + negcount3
123 | negcount = 0
124 | a = i + 1
125 | elif word == '!' or word == '!': ##判断句子是否有感叹号
126 | for w2 in segtmp[::-1]: # 扫描感叹号前的情感词,发现后权值+2,然后退出循环
127 | if w2 in self.posdict or w2 in self.negdict:  # test the word against both dictionaries
128 | poscount3 += 2
129 | negcount3 += 2
130 | break
131 | i += 1 # 扫描词位置前移
132 |
133 |
134 | # 以下是防止出现负数的情况
135 | pos_count = 0
136 | neg_count = 0
137 | if poscount3 < 0 and negcount3 > 0:
138 | neg_count += negcount3 - poscount3
139 | pos_count = 0
140 | elif negcount3 < 0 and poscount3 > 0:
141 | pos_count = poscount3 - negcount3
142 | neg_count = 0
143 | elif poscount3 < 0 and negcount3 < 0:
144 | neg_count = -poscount3
145 | pos_count = -negcount3
146 | else:
147 | pos_count = poscount3
148 | neg_count = negcount3
149 |
150 | count1.append([pos_count, neg_count])
151 | count2.append(count1)
152 | count1 = []
153 |
154 | return count2
155 |
156 | def sentiment_score(self,senti_score_list):
157 | score = []
158 | for review in senti_score_list:
159 | score_array = np.array(review)
160 | Pos = np.sum(score_array[:, 0])
161 | Neg = np.sum(score_array[:, 1])
162 | AvgPos = np.mean(score_array[:, 0])
163 | AvgPos = float('%.1f'%AvgPos)
164 | AvgNeg = np.mean(score_array[:, 1])
165 | AvgNeg = float('%.1f'%AvgNeg)
166 | StdPos = np.std(score_array[:, 0])
167 | StdPos = float('%.1f'%StdPos)
168 | StdNeg = np.std(score_array[:, 1])
169 | StdNeg = float('%.1f'%StdNeg)
170 | score.append([Pos, Neg, AvgPos, AvgNeg, StdPos, StdNeg])
171 | return score
172 |
173 |
174 | def Senti_Sentence(self,word):
175 | if word == '':
176 | return 0,0,'NEU'
177 | else:
178 | result = self.sentiment_score(self.sentiment_score_list(str(word))) # 情感分析
179 | pos_score = result[0][0]
180 | neg_score = result[0][1]
181 | if pos_score == neg_score:
182 | SentiResult='NEU'
183 | elif pos_score > neg_score:
184 | SentiResult='POS'
185 | else:
186 | SentiResult='NEG'
187 | #print(pos_score,neg_score,SentiResult)
188 | return float(pos_score),float(neg_score),SentiResult
189 |
190 | def Senti_Text(self,text):
191 | if text == '':
192 | return 0,0,'NEU'
193 | else:
194 | text = str(text)
195 | seg_sentence = re.split('。|!|?|……|,',text)
196 | print(seg_sentence)
197 | pos_sum=0
198 | neg_sum=0
199 | sen_num=0
200 | for sentence in seg_sentence:
201 | if sentence != '':
202 | pos,neg,res=self.Senti_Sentence(sentence)
203 | pos_sum+=pos
204 | neg_sum+=neg
205 | sen_num+=1
206 | else:
207 | pass
208 | print('句子数:',sen_num)
209 | try:
210 | pos_score = pos_sum/sen_num
211 | neg_score = neg_sum/sen_num
212 | if pos_score == neg_score:
213 | SentiResult='NEU'
214 | elif pos_score > neg_score:
215 | SentiResult='POS'
216 | else:
217 | SentiResult='NEG'
218 | #print(pos_score,neg_score,SentiResult)
219 | return float(pos_score),float(neg_score),SentiResult
220 |         except Exception as e:  # e.g. ZeroDivisionError when sen_num is 0
221 | print(e)
222 | return 0,0,'NEU'
223 |
224 |
225 |
226 | if __name__=="__main__":
227 | #data = '你就是个王八蛋,混账玩意!你们的手机真不好用!非常生气,我非常郁闷!!!!'
228 | #data2= '我好开心啊,非常非常非常高兴!今天我得了一百分,我很兴奋开心,愉快,开心'
229 | text='腾讯汽车 站]编辑从深圳市大兴观澜丰田了解到,卡罗拉双擎最高优惠0.30万元,促销时间为2019年03月01日--2019年03月03日, 欢迎有意向的朋友到店试乘试驾。卡罗拉双擎外观卡罗拉双擎内饰卡罗拉双擎细节版权声明:本文系腾讯汽车独家稿件,版权为腾讯汽车所有。文章内的价格为编辑在车市第一线真实采集到的当日价格,由于汽车价格变化莫测,同时此价格只是个体经销商的行为,所以价格仅供参考使用。'
230 | # print(sentiment_score_list(data))
231 | # print(sentiment_score(sentiment_score_list(data)))
232 | #print(sentiment_score(sentiment_score_list(data2)))
233 | senti_counter = Senti_dict_class()
234 | pos_score,neg_score,SentiResult=senti_counter.Senti_Text(text)
235 | print( pos_score,neg_score,SentiResult)
236 |
237 | # senti_counter.open_file_as_text("Sent_Dict/否定词.txt")
238 |
239 |
--------------------------------------------------------------------------------
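
A minimal usage sketch of `Senti_dict_class` above. The constructor loads the four dictionaries from the relative paths `./Sent_Dict/...`, so this assumes the working directory is `Crawler/Crawler/expand_package/`; the sample sentences are placeholders.

```python
# Assumes the current working directory is Crawler/Crawler/expand_package/,
# otherwise the relative './Sent_Dict/...' paths in __init__ will not resolve.
from senti_dict_class import Senti_dict_class

analyzer = Senti_dict_class()

# Per-sentence scoring: returns (positive_score, negative_score, 'POS' | 'NEG' | 'NEU').
print(analyzer.Senti_Sentence("今天我得了一百分,我很开心"))

# Whole-text scoring: splits on Chinese sentence punctuation and averages the per-sentence scores.
pos, neg, label = analyzer.Senti_Text("服务很好,非常满意。价格也不贵。")
print(pos, neg, label)
```
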
/Crawler/Crawler/expand_package/否定词.txt:
--------------------------------------------------------------------------------
1 | 不
2 | 不是
3 | 不能
4 | 不可
5 | 没有
6 | 不要
7 | 别
8 | 没
9 | 无
10 | 莫
11 | 未
12 | 勿
13 | 休
14 | 甭
15 | 非
--------------------------------------------------------------------------------
/Crawler/Crawler/expand_package/程度级别词语.txt:
--------------------------------------------------------------------------------
1 | extreme
2 | 百分之百
3 | 倍加
4 | 备至
5 | 不得了
6 | 不堪
7 | 不可开交
8 | 不亦乐乎
9 | 不折不扣
10 | 彻头彻尾
11 | 充分
12 | 到头
13 | 地地道道
14 | 非常
15 | 极
16 | 极度
17 | 极端
18 | 极其
19 | 极为
20 | 截然
21 | 尽
22 | 惊人地
23 | 绝
24 | 绝顶
25 | 绝对
26 | 绝对化
27 | 刻骨
28 | 酷
29 | 满
30 | 满贯
31 | 满心
32 | 莫大
33 | 奇
34 | 入骨
35 | 甚为
36 | 十二分
37 | 十分
38 | 十足
39 | 死
40 | 滔天
41 | 痛
42 | 透
43 | 完全
44 | 完完全全
45 | 万
46 | 万般
47 | 万分
48 | 万万
49 | 无比
50 | 无度
51 | 无可估量
52 | 无以复加
53 | 无以伦比
54 | 要命
55 | 要死
56 | 已极
57 | 已甚
58 | 异常
59 | 逾常
60 | 贼
61 | 之极
62 | 之至
63 | 至极
64 | 卓绝
65 | 最为
66 | 佼佼
67 | 郅
68 | 綦
69 | 齁
70 | 最
71 | very
72 | 不为过
73 | 超
74 | 超额
75 | 超外差
76 | 超微结构
77 | 超物质
78 | 出头
79 | 多
80 | 浮
81 | 过
82 | 过度
83 | 过分
84 | 过火
85 | 过劲
86 | 过了头
87 | 过猛
88 | 过热
89 | 过甚
90 | 过头
91 | 过于
92 | 过逾
93 | 何止
94 | 何啻
95 | 开外
96 | 苦
97 | 老
98 | 偏
99 | 强
100 | 溢
101 | 忒
102 | 不过
103 | 不少
104 | 不胜
105 | 惨
106 | 沉
107 | 沉沉
108 | 出奇
109 | 大为
110 | 多
111 | 多多
112 | 多加
113 | 多么
114 | 分外
115 | 格外
116 | 够瞧的
117 | 够呛
118 | 好
119 | 好不
120 | 何等
121 | 很
122 | 很是
123 | 坏
124 | 可
125 | 老
126 | 老大
127 | 良
128 | 颇
129 | 颇为
130 | 甚
131 | 实在
132 | 太
133 | 太甚
134 | 特
135 | 特别
136 | 尤
137 | 尤其
138 | 尤为
139 | 尤以
140 | 远
141 | 着实
142 | 曷
143 | 碜
144 | more
145 | 大不了
146 | 多
147 | 更
148 | 比较
149 | 更加
150 | 更进一步
151 | 更为
152 | 还
153 | 还要
154 | 较
155 | 较比
156 | 较为
157 | 进一步
158 | 那般
159 | 那么
160 | 那样
161 | 强
162 | 如斯
163 | 益
164 | 益发
165 | 尤甚
166 | 逾
167 | 愈
168 | 愈 ... 愈
169 | 愈发
170 | 愈加
171 | 愈来愈
172 | 愈益
173 | 远远
174 | 越 ... 越
175 | 越发
176 | 越加
177 | 越来越
178 | 越是
179 | 这般
180 | 这样
181 | 足
182 | 足足
183 | ish
184 | 点点滴滴
185 | 多多少少
186 | 怪
187 | 好生
188 | 还
189 | 或多或少
190 | 略
191 | 略加
192 | 略略
193 | 略微
194 | 略为
195 | 蛮
196 | 稍
197 | 稍稍
198 | 稍微
199 | 稍为
200 | 稍许
201 | 挺
202 | 未免
203 | 相当
204 | 些
205 | 些微
206 | 些小
207 | 一点
208 | 一点儿
209 | 一些
210 | 有点
211 | 有点儿
212 | 有些
213 | 半点
214 | 不大
215 | 不丁点儿
216 | 不甚
217 | 不怎么
218 | 聊
219 | 没怎么
220 | 轻度
221 | 弱
222 | 丝毫
223 | 微
224 | 相对
225 | last
226 |
--------------------------------------------------------------------------------
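
The English marker lines `extreme` / `very` / `more` / `ish` / `last` in the file above are not degree words themselves; `Senti_dict_class.__init__` uses them as section delimiters to slice the list into weight tiers (×4.0, ×3.0, ×2.0, ×0.5). A small sketch of that slicing on a shortened toy list (the words shown all appear in the real file):

```python
# Toy, shortened version of 程度级别词语.txt: marker lines followed by their words.
degree_word = ['extreme', '非常', '极其',
               'very', '很', '特别',
               'more', '更', '较',
               'ish', '稍微', '有点',
               'last']

mostdict = degree_word[degree_word.index('extreme') + 1: degree_word.index('very')]  # weight 4.0
verydict = degree_word[degree_word.index('very') + 1: degree_word.index('more')]     # weight 3.0
moredict = degree_word[degree_word.index('more') + 1: degree_word.index('ish')]      # weight 2.0
ishdict = degree_word[degree_word.index('ish') + 1: degree_word.index('last')]       # weight 0.5

print(mostdict, verydict, moredict, ishdict)
# ['非常', '极其'] ['很', '特别'] ['更', '较'] ['稍微', '有点']
```
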
/Crawler/Crawler/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | # class ImagespiderItem(scrapy.Item):
12 | # # define the fields for your item here like:
13 | # # name = scrapy.Field()
14 | # imgurl = scrapy.Field()
15 | # image_path = scrapy.Field()
16 | # pass
17 | #
18 | #
19 | # class TextItem(scrapy.Item):
20 | # textTitle = scrapy.Field() # title
21 | # #content = scrapy.Field() # 这个是文章的正文。
22 | #
23 | #
24 | # class simpleP(scrapy.Item):
25 | # simpleP = scrapy.Field() # 这个是单独的句子 KEYI
26 |
27 | # ------------这儿开始时新闻的。------------------ ,新浪的。
28 | class Image(scrapy.Item):
29 | src = scrapy.Field()
30 | path = scrapy.Field()
31 | title = scrapy.Field() # 或者说文件夹的名字。
32 | imagePath = scrapy.Field()
33 |
34 |
35 |
36 | class NewsContent(scrapy.Item): # 这个是具体的,图片也可以增回家一个字段把。
37 | url = scrapy.Field()
38 | title = scrapy.Field()
39 | Pcontent = scrapy.Field()
40 | timestamp = scrapy.Field()
41 | newsDate = scrapy.Field()
42 | imageUrls = scrapy.Field() # 可以调用原来的生成
43 | imagePath = scrapy.Field() # 保存在来相对位置
44 |
45 | # ----------- Item shared by the three news spiders, written into the tengxun table; for now mainly these four fields
46 | class News(scrapy.Item):
47 | '''
48 | title    = the article title
49 | Hcontent = the first paragraph of the article as HTML (may be only an image); not used much yet
50 | Tcontent = the full article as plain text
51 | Acontent = the full article as HTML
52 | '''
53 |
54 | url = scrapy.Field()
55 | urlState = scrapy.Field()
56 | title = scrapy.Field()
57 | Hcontent = scrapy.Field()
58 | Tcontent = scrapy.Field()
59 | Acontent = scrapy.Field()
60 | newdate = scrapy.Field()
61 | fromWhere = scrapy.Field()
62 |
63 |
64 |
65 |
66 |
67 |
68 |
--------------------------------------------------------------------------------
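
A small sketch of how the `News` item behaves when populated. The spiders in this project fill it through an `ItemLoader`, which stores every `add_value` result as a list (hence the `item['title'][0]`-style indexing used in the pipeline), while direct assignment keeps plain scalars. The URL below is a placeholder and the snippet assumes the `Crawler` package is importable.

```python
from scrapy.loader import ItemLoader
from Crawler.items import News  # assumes the Crawler project root is on sys.path

# Direct assignment: values stay exactly as given.
n = News()
n['title'] = 'some headline'
n['url'] = 'https://news.example.com/a/1.html'  # placeholder URL for illustration

# ItemLoader (as the spiders do it): every value ends up wrapped in a list.
loader = ItemLoader(item=News())
loader.add_value('title', 'some headline')
loader.add_value('url', 'https://news.example.com/a/1.html')
item = loader.load_item()
print(item['title'])  # ['some headline']
```
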
/Crawler/Crawler/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 | import random
8 |
9 | from scrapy import signals
10 |
11 | from scrapy.conf import settings # 这儿需要注意导入
12 |
13 | from requests_html import HTMLSession
14 |
15 |
16 |
17 |
18 | class ImagespiderDownloaderMiddleware(object):
19 | # Not all methods need to be defined. If a method is not defined,
20 | # scrapy acts as if the downloader middleware does not modify the
21 | # passed objects.
22 | @classmethod
23 | def from_crawler(cls, crawler):
24 | # This method is used by Scrapy to create your spiders.
25 | s = cls()
26 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
27 | return s
28 |
29 |
30 | def process_request(self,request,spider): # 后期可以改成使用requests-html的版本把,比较新。
31 | print("使用自定义请求")
32 | print(spider.name)
33 | ua = random.choice( settings["USER_AGENT_LIST"] )
34 | print(ua)
35 | request.headers['User-Agent'] = ua # 提取到的ua随机设置给请求
36 |
37 | # referer = "https://gczfl01.com" # 这个先闭
38 | # if referer:
39 | # request.headers['referer'] = referer
40 | # 设置代理,需要使用的时候使用,并且记得settings中设置,或者维护的代理池中提取(数据库)
41 | # proxy = random.choice( settings["PROXY"] )
42 | # request.meta['proxy'] = proxy
43 |
44 |
45 |
46 |
47 |
48 | pass
49 |
50 | def process_response(self, request, response, spider):
51 | # Called with the response returned from the downloader.
52 |
53 | # Must either;
54 | # - return a Response object
55 | # - return a Request object
56 | # - or raise IgnoreRequest
57 | return response
58 |
59 | def process_exception(self, request, exception, spider):
60 | # Called when a download handler or a process_request()
61 | # (from other downloader middleware) raises an exception.
62 |
63 | # Must either:
64 | # - return None: continue processing this exception
65 | # - return a Response object: stops process_exception() chain
66 | # - return a Request object: stops process_exception() chain
67 | pass
68 |
69 | def spider_opened(self, spider):
70 | spider.logger.info('Spider opened: %s' % spider.name)
71 |
--------------------------------------------------------------------------------
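
`from scrapy.conf import settings` is deprecated and removed in newer Scrapy releases. A minimal sketch (not the project's code) of the usual alternative: let `from_crawler` read the settings from the crawler object and inject them into the middleware. `RandomUserAgentMiddleware` is a hypothetical name used only for illustration; the real middleware above keeps its own structure.

```python
import random

from scrapy import signals


class RandomUserAgentMiddleware(object):  # hypothetical name, illustration only
    def __init__(self, user_agent_list):
        self.user_agent_list = user_agent_list

    @classmethod
    def from_crawler(cls, crawler):
        # crawler.settings exposes USER_AGENT_LIST from settings.py without scrapy.conf.
        mw = cls(crawler.settings.getlist("USER_AGENT_LIST"))
        crawler.signals.connect(mw.spider_opened, signal=signals.spider_opened)
        return mw

    def process_request(self, request, spider):
        # Pick a random UA for every outgoing request.
        request.headers['User-Agent'] = random.choice(self.user_agent_list)

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
```
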
/Crawler/Crawler/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | import time
8 | from datetime import date, timedelta
9 |
10 | from bs4 import BeautifulSoup
11 |
12 | from Crawler.expand_package.WordCloud import Gen_WordCloud
13 | from Crawler.items import News # 这样导入才可以认识的啊
14 | from Crawler.items import Image,NewsContent # 这样导入才可以认识的啊
15 | from Crawler.expand_package.picDownloadScript import Download
16 | from Crawler.settings import CRAWL_DELAY_DAY, DOWMLOAD_IMG_IN_ACONTENT, MAKE_WORDCLOUD_STORE
17 |
18 | from Crawler.expand_package.DBcontrol import DB
19 |
20 |
21 | class newsPipeline(object): # 自己写的这个处理图片的管道,统一一个管道处理就可以了吗。
22 | def __init__(self):
23 | # 一个管道的下载的部分是一样的。
24 | self.downloadTool = Download(None) # setting 中设置默认的地址。
25 | # todo 增加mysql数据库组件也是一样的。
26 | self.crawlDelayDay = CRAWL_DELAY_DAY # 默认是爬取昨天的
27 | self.db = DB()
28 |
29 | def makeDateFolder(self): #
30 | crawl_date = (date.today() + timedelta(days=-self.crawlDelayDay)).strftime("%Y%m%d") # 昨天日期
31 | return crawl_date
32 |
33 | # def setFileTitle(self, title):
34 | # fileName = re.sub('[\/:*?"<>|]', '-', title) # 去掉非法字符
35 | # return fileName
36 |
37 |
38 |
39 |
40 | def downloadAndChangeImgPath(self,html_have_img,newsDate) -> list :
41 | '''
42 | :param html_have_img: the article body HTML that contains <img> tags
43 | :param newsDate: the article date (used as the folder name for the downloaded images)
44 | :return: a one-element list with the HTML, where each img src is rewritten to its local download path
45 | '''
46 | print("正在下载正文中")
47 | soup = BeautifulSoup( html_have_img , 'lxml')
48 | for img in soup.find_all("img"):
49 | tempSrc = img['src']
50 | if tempSrc.find("http:") == -1: # 默认可能漏掉了这部分的
51 | tempSrc = "http:" + tempSrc
52 | # time.sleep(1)
53 | fixedSrc = self.downloadTool.downloadImg(
54 | img_url=tempSrc,
55 | imgName=None,
56 | referer=None, now_date=newsDate) # 这个是下面新建力的文件夹,默认都是延迟一天的。
57 | img['src'] = fixedSrc # 这个地址放回去
58 | # 下载,返回path,然后修改。
59 | print(img['src'])
60 | print("图片下载并且修改src完成。")
61 | return [str(soup.extract()).replace("'", '"')]
62 |
63 |
64 | def fillter_Acontent(self,Acontent): # clean the Acontent, stripping any style or script tags
65 | soup = BeautifulSoup(Acontent, 'lxml')
66 | [s.extract() for s in soup("style")]
67 | [s.extract() for s in soup("script")]
68 | return str(soup)
69 |
70 | def process_item(self, item, spider):
71 | if isinstance(item, NewsContent):
72 | print("管道进来了!")
73 | if "imageUrls" in item: # 有图片才下载图片 这边的item还可以修改吗
74 | if len(item['imageUrls']) != 0:
75 | print(item['imageUrls'])
76 | downPath = []
77 | for url in item['imageUrls']:
78 | tempPath = self.downloadTool.downloadImg(
79 | img_url=url,
80 | imgName=None,
81 | referer=None, now_date=self.makeDateFolder()) # 这个是下面新建力的文件夹
82 | downPath.append(tempPath) # 这个也是一个list 下载地址的。
83 |
84 | downPathList=""
85 | # todo: these paths are meant to be written back into the item (e.g. item['imagePath'])
86 | for path in downPath: # one entry per downloaded path
87 | downPathList = downPathList + "{}\n".format(path)
88 | print(downPathList)
89 | # item['imagePath']
90 | else:
91 | print("这个item没有图片")
92 | print(item['url'])
93 | # 返回item
94 | self.db.insert(item['url'])
95 | return item
96 |
97 | elif isinstance(item,Image):
98 | if "src" in item: # 有图片才下载图片 这边的item还可以修改吗
99 | if len(item['src']) != 0:
100 | print(item['src'])
101 | downPath = []
102 | print(item['title'])
103 | for url in item['src']:
104 | tempPath = self.downloadTool.downloadImg(
105 | img_url=url,
106 | imgName=None,
107 | referer=None, now_date=(item['title'][0])) # 这个是下面新建力的文件夹
108 | downPath.append(tempPath)
109 | item['imagePath'] = downPath
110 | return item
111 | pass
112 |
113 | elif isinstance(item, News): # 这儿是 新闻爬虫的。
114 | print("正在处理item")
115 | # 下载图片还有修改Acontent中的img
116 |
117 | if item['Acontent'][0].find("img") != -1 and DOWMLOAD_IMG_IN_ACONTENT: # 发现纯文本的这里面有图片。才执行这个下载图片
118 | print("新闻中有图片,正在本地化处理......")
119 | print(item['url'])
120 | # 这儿注释掉,暂时不用,节省空间。下载图片不下载
121 | item['Acontent'] = self.downloadAndChangeImgPath(item['Acontent'][0],item['newdate'][0]) # 插入数据库,需要把’变成”,下载失败的就没有本地化
122 | print("正在插入数据库")
123 | if item['Acontent'][0]!="": # 这儿是填充Tcontent 纯文本字段
124 | # 用bs4
125 | item['Acontent']= [self.fillter_Acontent(item['Acontent'][0])] # 先过滤一下Acontent中奇怪的标签。
126 | item['Tcontent'] =[ "".join(BeautifulSoup(item['Acontent'][0], 'lxml').text)]
127 | self.db.insertItem(item)
128 | print("插入成功数据库")
129 | if item['Tcontent'][0]!="" and MAKE_WORDCLOUD_STORE: # 不是空文本的情况下是可以生成词云图的。
130 | # 把url生成唯一的md5作为词云的文件名
131 | # 前台调用只需要用这个方法生成一下md5就行了,也是唯一的值。 前端需要注意这儿!
132 | Gen_WordCloud(Newsid=self.downloadTool.makeMd5(item['url'][0]) , text=item['Tcontent'][0])
133 | # time.sleep(60)
134 | else: # 没有词云,那就只能用默认的了。
135 | pass
136 | print("为无文本新闻")
137 | print(item['url'][0])
138 | return item
139 | pass
140 |
141 |
142 | else:
143 | print("判断这个不是管道。")
144 | print(item)
145 | return item
146 |
147 |
148 |
149 |
--------------------------------------------------------------------------------
/Crawler/Crawler/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for Crawler project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'CrawlerCrawler'
13 |
14 | SPIDER_MODULES = ['Crawler.spiders']
15 | NEWSPIDER_MODULE = 'Crawler.spiders'
16 |
17 | #图片存储位置
18 | # IMAGES_STORE = 'D:\Crawler\yuhangyuan'
19 | IMAGES_STORE = "../static/images/" # 相对路径,生成到外面的Crawl项目名字外面,所以crawl放在djongo项目内一层即可
20 | DOWMLOAD_IMG_IN_ACONTENT = False # 这个是自定义的,设定是否进行图片本地化操作。True ,False两个设定
21 |
22 | # 词云的生成和上面图片的相对路径必须这样明显的不同,暂时不解
23 | WORDCLOUD_STORE = "../static/images/WordCloud/" # 相对路径,生成到外面的Crawl项目名字外面
24 | MAKE_WORDCLOUD_STORE = True # 开关生成词云
25 |
26 | #启动图片下载中间件
27 | ITEM_PIPELINES = {
28 | # 'Crawler.pipelines.TextPipeline': 300,
29 | 'Crawler.pipelines.newsPipeline': 300, # 先下载图片,后提取文本的意思
30 | }
31 | # IMAGES_STORE = "file/image"
32 | # IMAGES_URLS_FILED= 'imgurl' # 这个暂时好像没什么用,直接结合自己的图片下载模块进来也是可以的把。
33 |
34 | CRAWL_DELAY_DAY = 1
35 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
36 | #USER_AGENT = 'Crawler (+http://www.yourdomain.com)'
37 |
38 | mysqlInfo = {
39 | "host": '127.0.0.1',
40 | "user": 'root',
41 | "passwd": '123456',
42 | "db": 'newssenti', #改同一个数据库了。
43 | "port": 3306,
44 | "charset": 'utf8' #这个是数据库的配置文件
45 | }
46 |
47 | CRAWLALL_RUN_TIME = "00:01" # 24小时制
48 |
49 | COMMANDS_MODULE = 'Crawler.commands' # 配置爬取所有爬虫命令的。
50 | # Obey robots.txt rules
51 | ROBOTSTXT_OBEY = False
52 |
53 | USER_AGENT_LIST = [
54 | 'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30',
55 | 'Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0',
56 | 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.2; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)',
57 | 'Opera/9.80 (Windows NT 5.1; U; zh-cn) Presto/2.9.168 Version/11.50',
58 | 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1',
59 | 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)'
60 | ]
61 |
62 |
63 |
64 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
65 | #CONCURRENT_REQUESTS = 32
66 |
67 | # Configure a delay for requests for the same website (default: 0)
68 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
69 | # See also autothrottle settings and docs
70 | DOWNLOAD_DELAY = 1 # 下载延时
71 | # The download delay setting will honor only one of:
72 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
73 | #CONCURRENT_REQUESTS_PER_IP = 16
74 |
75 | # Disable cookies (enabled by default)
76 | #COOKIES_ENABLED = False
77 |
78 | # Disable Telnet Console (enabled by default)
79 | #TELNETCONSOLE_ENABLED = False
80 |
81 | # Override the default request headers:
82 | #DEFAULT_REQUEST_HEADERS = {
83 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
84 | # 'Accept-Language': 'en',
85 | #}
86 |
87 | # Enable or disable spider middlewares
88 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
89 | # SPIDER_MIDDLEWARES = {
90 | # }
91 |
92 | # Enable or disable downloader middlewares
93 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
94 | DOWNLOADER_MIDDLEWARES = {
95 | 'Crawler.middlewares.ImagespiderDownloaderMiddleware': 543,
96 | 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None, # 默认的是500需要,像这种这样就是可以关掉
97 | }
98 |
99 | # Enable or disable extensions
100 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
101 | #EXTENSIONS = {
102 | # 'scrapy.extensions.telnet.TelnetConsole': None,
103 | #}
104 |
105 | # Configure item pipelines
106 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
107 | #ITEM_PIPELINES = {
108 | # 'Crawler.pipelines.ImagespiderPipeline': 300,
109 | #}
110 |
111 | # Enable and configure the AutoThrottle extension (disabled by default)
112 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
113 | #AUTOTHROTTLE_ENABLED = True
114 | # The initial download delay
115 | #AUTOTHROTTLE_START_DELAY = 5
116 | # The maximum download delay to be set in case of high latencies
117 | #AUTOTHROTTLE_MAX_DELAY = 60
118 | # The average number of requests Scrapy should be sending in parallel to
119 | # each remote server
120 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
121 | # Enable showing throttling stats for every response received:
122 | #AUTOTHROTTLE_DEBUG = False
123 |
124 | # Enable and configure HTTP caching (disabled by default)
125 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
126 | #HTTPCACHE_ENABLED = True
127 | #HTTPCACHE_EXPIRATION_SECS = 0
128 | #HTTPCACHE_DIR = 'httpcache'
129 | #HTTPCACHE_IGNORE_HTTP_CODES = []
130 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
131 |
--------------------------------------------------------------------------------
/Crawler/Crawler/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/Crawler/Crawler/spiders/spider_expends.py:
--------------------------------------------------------------------------------
1 | import random
2 | from pprint import pprint
3 |
4 | import chardet
5 | import requests
6 | from datetime import date, timedelta
7 | from Crawler.settings import CRAWL_DELAY_DAY
8 |
9 |
10 | class TengxunExpend:
11 |
12 | def returnThemeCode(self, theme): # 这个是有用的,用来组合主题代码url的
13 | ent_Theme = 1537876288634
14 | sport_Theme = 1537877689177
15 | finance_Theme = 1537878365483
16 | tech_Theme = 1537879684280
17 | auto_Theme = 1537887032223
18 | house_Theme = 1537887128904
19 | news_Theme = 1537874915062
20 | if theme == 'news':
21 | return news_Theme
22 | if theme == 'ent':
23 | return ent_Theme
24 | if theme == 'sports':
25 | return sport_Theme
26 | if theme == 'tech':
27 | return tech_Theme
28 | if theme == 'auto':
29 | return auto_Theme
30 | if theme == 'house':
31 | return house_Theme
32 | if theme == 'finance':
33 | return finance_Theme
34 |
35 | def getThemeUrl(self, theme, today, pageNumber):
36 | rawUrl = "http://roll.news.qq.com/interface/cpcroll.php"
37 | rawReferer = '.qq.com/articleList/rolls/' # 'http://news 前面还有这个东西
38 | print(theme)
39 | print(today)
40 | print(pageNumber)
41 |
42 | my_headers = [
43 | 'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30',
44 | 'Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0',
45 | 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.2; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)',
46 | 'Opera/9.80 (Windows NT 5.1; U; zh-cn) Presto/2.9.168 Version/11.50',
47 | 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1',
48 | 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)']
49 | headers = {"User-Agent": random.choice(my_headers), 'Referer': 'http://' + theme + rawReferer} # 默认值
50 | rawUrl = rawUrl + "?callback=rollback&mode=1&cata=&_=" + str(
51 | self.returnThemeCode(theme)) + "&site=" + theme + "&page=" + str(pageNumber) + "&date=" + today
52 | print(rawUrl)
53 | try:
54 | rawhtml = requests.get(rawUrl, headers=headers, allow_redirects=False,
55 | timeout=30) # 一般提取文本的话,那就用text,如果是文件就content
56 | rawhtml.encoding = chardet.detect(rawhtml.content)['encoding']
57 | # print(rawhtml.url)
58 | print("状态码" + str(rawhtml.status_code))
59 | if rawhtml.status_code == 504:
60 | print(504)
61 | return []  # empty list so the caller's len() check still works
62 | print("页面的读取结果为")
63 | # print(rawhtml.text)
64 | if rawhtml.text.find('rollback') == 0:
65 | jsonString = rawhtml.text.split("rollback")[1] # 把js提取出来就可以了
66 | else:
67 | jsonString = rawhtml.text
68 | print(jsonString)
69 | dicData = eval(jsonString)
70 | print(type(jsonString))
71 | print(jsonString)
72 | # print(dicData['data']['article_info'])
73 | print(len(dicData['data']['article_info']))
74 | if dicData['data'] == "":
75 | print("超过了最大页数了,跳出了就可以了")
76 | return []  # past the last page; empty list so the caller's len() check still works
77 | urllist = []
78 | for one in dicData['data']['article_info']:
79 | # print(one['url'])
80 | print(one['url'].replace("\\", "/")) # 还需要检查一下这个和之前的那种野蛮是不是一样的
81 | urllist.append(one['url'].replace("\\", "/"))
82 | return urllist
83 | except Exception as e:
84 | # print(e)
85 | return []
86 |
87 | def pageUrlMain(self, date=(date.today() + timedelta(days=-CRAWL_DELAY_DAY)).strftime("%Y-%m-%d") ): # 写入url进入数据库,并且写入分类
88 | resultUrlDic = {} # 写入数据库使用这个
89 | tempList = []
90 | themeList = ['news', 'ent', 'tech', 'auto', 'house', 'finance', 'sports'] # 一共有7个主题,其实不止这7个的
91 | for theme in themeList:
92 | print("第一个主题是")
93 | tempDList = []
94 | for i in range(1, 12): # 一般是10页就很多的了。10页以内
95 | print("第" + str(i) + "页")
96 | responseList = self.getThemeUrl(theme, date, i)
97 | if len(responseList) == 0:
98 | print("最大页数为" + str(i - 1) + "页")
99 | break
100 | else:
101 | tempList = tempList + responseList
102 | tempDList += responseList
103 | resultUrlDic[theme] = tempDList
104 | print(resultUrlDic)
105 | tempList = set(tempList)
106 | count = 0
107 | print("列表的url数量有:" + str(len(tempList)))
108 | for key in resultUrlDic:
109 | count += len(resultUrlDic[key])
110 | print("url总共有" + str(count))
111 |
112 | print("这个是PageUrls内的提取到的url")
113 | # pprint(resultUrlDic)
114 | print(len(resultUrlDic))
115 |
116 | print("这个开始是list类型的结果")
117 | # print(tempList)
118 |
119 | pprint(tempList)
120 |
121 |
122 | # self.dbhelper.saveDicToMysql(resultUrlDic,date,"tengxun") #参数,字典结果集,时间,分类,这儿是不需要写的。
123 | return list(tempList) # 直接这儿去重后
124 |
125 |
126 | class WangyiExpend: # 这个是网易爬虫需要获得新闻页面的拓展的部分,直接构造成start_urls,再来做别的操作。
127 | def getRollUrlList(self,date=(date.today() + timedelta(days=-CRAWL_DELAY_DAY)).strftime("%Y-%m-%d") ): #这个打开会是手机端的东西 #又重写了一遍了这个东西
128 | rollLatest = "http://news.163.com/latest/" #这个就是默认新闻
129 | requestURL ="http://news.163.com/special/0001220O/news_json.js?0.3699326344116929"
130 |
131 | my_headers = [ #这边为了得到直接的手机端的页面代码返回,直接使用手机ua
132 | 'Mozilla/5.0 (Linux; Android 7.1.1; MI 6 Build/NMF26X; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043508 Safari/537.36 MicroMessenger/6.5.13.1100 NetType/WIFI Language/zh_CN',
133 | 'Mozilla/5.0 (Linux; Android 7.1.1; MI 6 Build/NMF26X) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 Mobile Safari/537.36 Maxthon/3047',
134 | # 'Mozilla/5.0 (iPhone 84; CPU iPhone OS 10_3_3 like Mac OS X) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.0 MQQBrowser/7.8.0 Mobile/14G60 Safari/8536.25 MttCustomUA/2 QBWebViewType/1 WKType/1',
135 | 'Mozilla/5.0 (Linux; U; Android 7.0; zh-cn; STF-AL00 Build/HUAWEISTF-AL00) AppleWebKit/537.36 (KHTML, like Gecko)Version/4.0 Chrome/37.0.0.0 MQQBrowser/7.9 Mobile Safari/537.36',
136 | 'Mozilla/5.0 (Linux; U; Android 6.0.1; zh-CN; SM-C7000 Build/MMB29M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/40.0.2214.89 UCBrowser/11.6.2.948 Mobile Safari/537.36',
137 | 'Mozilla/5.0 (Linux; Android 7.0; STF-AL10 Build/HUAWEISTF-AL10; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043508 Safari/537.36 V1_AND_SQ_7.2.0_730_YYB_D QQ/7.2.0.3270 NetType/4G WebP/0.3.0 Pixel/1080']
138 |
139 | headers = {"User-Agent": random.choice(my_headers), 'Referer': "http://news.163.com/latest/"} # 默认值
140 |
141 | try:
142 | rawhtml = requests.get(requestURL, headers=headers, allow_redirects=False,
143 | timeout=30) # 一般提取文本的话,那就用text,如果是文件就content
144 | rawhtml.encoding = "GBK" ##gbk>gb2312 使用这种方式尚且还有乱码的情况,部分乱码,那就是gbk可以修复
145 | # print(chardet.detect(rawhtml.content)['encoding'])
146 | if rawhtml.status_code == 504:
147 | print(504)
148 | return []  # empty list so the spider can still iterate over the result
149 | # print(rawhtml.url)
150 | print("状态码" + str(rawhtml.status_code))
151 | # print("页面的读取结果为")
152 | html = rawhtml.text
153 |
154 | result10=[]
155 | if html.find('"news":')!=-1:
156 | rawjsonString = html.split('"news":')[1].replace("};","")
157 | jsDic = eval("("+rawjsonString+")")
158 | for i in jsDic:
159 | if len(i)!=0:
160 | for content in i:
161 | if content['p'].split(" ")[0]==date: #这个是今天的
162 | url = content['l']
163 | if url.find("photoview")==-1: #不是图片的写入这儿
164 | result10.append(content['l'])
165 | else:
166 | pass
167 |
168 | # print("插入了"+str(len(result10)))
169 | print(result10)
170 | # self.saveListToMysql(result10, date) # todo 这儿做了注释,不写入数据库,方便进行测试/
171 |
172 | return result10 #这个是返回前一天的所有的url链接放在这儿,大概200条以内,又变少了啊
173 | except Exception as e:
174 | print(e)
175 | return []  # return an empty list on failure so the spider can still iterate
176 | if __name__ == '__main__':
177 | # 腾讯的获得新闻列表的模块测试
178 | # tengxun_expend =TengxunExpend()
179 | # tengxun_expend.pageUrlMain()
180 |
181 | # 网易的获得新闻的列表的模块测试
182 | wangyi_expend =WangyiExpend()
183 | print(wangyi_expend.getRollUrlList()) # 默认都是获得昨天的新闻。
--------------------------------------------------------------------------------
/Crawler/Crawler/spiders/tengxu.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # 这儿改成是腾讯的就可以了。
3 | import traceback
4 | import scrapy
5 | from bs4 import BeautifulSoup
6 | from scrapy.loader import ItemLoader
7 | import time
8 | from Crawler.spiders.spider_expends import TengxunExpend
9 | from ..items import News
10 |
11 |
12 | class TengxunSpider(scrapy.Spider) : # 这边是没什么问题的了
13 | name = 'tengxun'
14 | allowed_domains = ["qq.com"]
15 | start_urls = [
16 | # 'http://roll.news.qq.com/'
17 | # 'https://news.qq.com/a/20190513/007759.htm', 测试用个案网页。
18 | # 'https://news.qq.com/a/20190512/004148.htm',
19 | # 'https://news.qq.com/a/20190514/000037.htm',
20 | # 'https://news.qq.com/a/20190513/005746.htm'
21 | ]
22 |
23 | count = 1
24 |
25 | def close(spider, reason):
26 | print("腾讯的爬虫爬完了。")
27 | # 这儿重写一下,我只写页面的具体内容的解析就可以了。
28 | def start_requests(self):
29 | tengxun_expend = TengxunExpend()
30 | self.start_urls = tengxun_expend.pageUrlMain() # 测试暂时改了
31 | for url in self.start_urls:
32 | print()
33 | print(url)
34 | yield scrapy.Request(url, dont_filter=False)
35 | # # 这里重写爬虫入口方法,将dont_filter设置为false
36 | # # 是为了让起始url放入srcapy.Request请求url池中,对起始url也做去重处理
37 | # # 一次是分页数据里检索到的第一页
38 |
39 |
40 | def parse(self, response): # 每一页的都在这儿了。
41 | main = response.xpath("//*[@class='Cnt-Main-Article-QQ']")[0]
42 | print(main) # xpath object
43 | title, Hcontent, Tcontent, Acontent = "", "", "", "" # 最后一个参数好像没什么用
44 | try:
45 | title = response.xpath("//head/title/text()").extract_first()
46 |
47 | newdate = response.xpath("//span[@class='a_time']/text()").extract_first().split(" ")[0]
48 | lenP = main.xpath("p")
49 | print(len(lenP))
50 | if len(lenP) > 2: # 为2的好像是纯视频的,还有一个文字描述的这种。
51 | Hcontent = lenP[0].extract()
52 |
53 | for p in main.xpath("p"):
54 | simpleP = p.extract()
55 | Acontent += simpleP
56 |
57 | # Tcontent = "".join(BeautifulSoup(Acontent, 'lxml').text)
58 | # print(title)
59 | # print()
60 | # print(Acontent)
61 | # print()
62 | # print(Tcontent)
63 | # print()
64 | # print(Hcontent)
65 | # print()
66 | newsloader = ItemLoader(item=News(), response=response) # 但是使用这种方法插入进去的都会是list。
67 | newsloader.add_value('title', title)
68 | newsloader.add_value('Acontent', Acontent)
69 | # newsloader.add_value('Tcontent', Tcontent) # 统一管道进行处理
70 | newsloader.add_value('Hcontent', Hcontent)
71 | newsloader.add_value('url', response.url)
72 | newsloader.add_value('urlState', "True")
73 | newsloader.add_value('fromWhere', "tengxun")
74 | newsloader.add_value("newdate",newdate)
75 |
76 | yield newsloader.load_item()
77 | print(newsloader.load_item())
78 | # time.sleep(180)
79 |
80 | else:
81 | print("这个为纯视频的新闻,无文本,正在跳过。")
82 |
83 | except Exception as e:
84 | print(e)
85 | traceback.print_exc() # 貌似这个,一个错
86 |
87 |
88 |
--------------------------------------------------------------------------------
/Crawler/Crawler/spiders/wangyi.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # NetEase (wangyi) news spider, adapted from the Tengxun spider.
3 | import traceback
4 | from datetime import timedelta,date
5 | import scrapy
6 | import time
7 | from bs4 import BeautifulSoup
8 | from scrapy.loader import ItemLoader
9 |
10 | from Crawler.items import News
11 | from Crawler.settings import CRAWL_DELAY_DAY
12 | from Crawler.spiders.spider_expends import WangyiExpend
13 |
14 |
15 | class WangyiSpider(scrapy.Spider) : # 这边是没什么问题的了
16 | name = 'wangyi'
17 | allowed_domains = ["163.com"] #
18 | start_urls = [
19 | # 'https://news.163.com/19/0514/04/EF4400KC0001899N.html',
20 | # 'https://news.163.com/19/0514/06/EF49KV8A00018AOR.html'
21 |
22 | ]
23 |
24 | count = 1
25 |
26 | def close(spider, reason):
27 | print("网易的爬虫爬完了。")
28 | # 这儿重写一下,我只写页面的具体内容的解析就可以了。
29 |
30 | def start_requests(self):
31 | wangyi_expend = WangyiExpend()
32 | self.start_urls = wangyi_expend.getRollUrlList() # 默认都是获得昨天的新闻。
33 | for url in self.start_urls:
34 | # print()
35 | # print(url)
36 | yield scrapy.Request(url, dont_filter=False)
37 |
38 |
39 |
40 | def parse(self, response): # 每一页的都在这儿了。
41 | throwSrcPart = (date.today() + timedelta(days=-CRAWL_DELAY_DAY)).strftime("%Y/%m/%d") # settings里面有
42 | print(throwSrcPart)
43 |
44 | title, Hcontent, Tcontent, Acontent = "", "", "", "" # 最后一个参数好像没什么用
45 | try:
46 | title = response.xpath("//head/title/text()").extract_first()
47 | mainP = response.xpath("//div[@class='post_text']")[0]
48 | # print(mainP.extract())
49 | for p in mainP.xpath("p"):
50 | pp = p.xpath("img/@src").extract()
51 | # print(p)
52 | if len(pp) !=0 : # 找到有图片
53 | # print("找到图片")
54 | # print(pp[0])
55 | if pp[0].find(throwSrcPart)!=-1:
56 | print(pp[0])
57 | print("丢弃这个p")
58 | else:
59 | Acontent += p.extract()
60 |
61 | else:
62 | Acontent += p.extract()
63 |
64 | # time.sleep(60)
65 | # print(Acontent)
66 | lastDayDate = (date.today() + timedelta(days=-CRAWL_DELAY_DAY)).strftime("%Y-%m-%d") # settings里面有
67 | tempAcontent = BeautifulSoup(Acontent, 'lxml')
68 | # Tcontent = "".join(tempAcontent.text)
69 |
70 | lenP = tempAcontent.find_all("p")
71 | print(len(lenP))
72 | if len(lenP) > 2: # 为2的好像是纯视频的,还有一个文字描述的这种。
73 | Hcontent = str(lenP[0])
74 | print("Hcontent")
75 | print(Hcontent.replace(r'\n',""))
76 |
77 |
78 | # print(title)
79 | # print()
80 | # print(Acontent)
81 | # print()
82 | # print(Tcontent)
83 | # print()
84 | # print(Hcontent)
85 | # print()
86 | newsloader = ItemLoader(item=News(), response=response) # 但是使用这种方法插入进去的都会是list。
87 | newsloader.add_value('title', title)
88 | newsloader.add_value('Acontent', Acontent)
89 | # newsloader.add_value('Tcontent', Tcontent) # 这个字段统一给管道进行处理
90 | newsloader.add_value('Hcontent', Hcontent)
91 | newsloader.add_value('url', response.url)
92 | newsloader.add_value('urlState', "True")
93 | newsloader.add_value('fromWhere', "wangyi")
94 | newsloader.add_value("newdate",lastDayDate)
95 |
96 | yield newsloader.load_item()
97 | print(newsloader.load_item())
98 | # time.sleep(180)
99 |
100 | # else:
101 | # print("这个为纯视频的新闻,无文本,正在跳过。")
102 |
103 | except Exception as e:
104 | print(e)
105 | traceback.print_exc() # 貌似这个,一个错
106 |
107 |
108 |
--------------------------------------------------------------------------------
/Crawler/Crawler/spiders/xinlang.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # todo 会卡住,这个问题怎么解决
3 | import json
4 | from datetime import timedelta,date
5 | import pysnooper # debug 用的包
6 | import scrapy
7 | from bs4 import BeautifulSoup
8 | from scrapy.loader import ItemLoader
9 | import time
10 | from Crawler.settings import CRAWL_DELAY_DAY
11 | from ..items import News
12 |
13 |
14 | class XinlangSpider(scrapy.Spider) :
15 | name = 'xinlang'
16 | # 爬取的域名,不会超出这个顶级域名
17 | allowed_domains = ['sina.com'] # 可以设置成不过滤吗。
18 | start_urls = [
19 | ]
20 |
21 | count = 1
22 | # {}占位符,用于字符串替换,将获取到的/text/page/1格式内容替换成完整url 这个是新浪新闻的。滚动新闻的页面
23 | host_url = 'https://feed.mix.sina.com.cn/api/roll/get?pageid=153&lid=2509&k=&num=50&page={}'
24 |
25 | def close(spider, reason):
26 | print("网易的爬虫爬完了。") #发邮件之类的。
27 |
28 |
29 |
30 | def start_requests(self):
31 | for num in range(1,100): # 这儿是看爬取多少页的。一般56*100=5600
32 | print(self.host_url.format(num))
33 | self.start_urls.append(self.host_url.format(num))
34 | for url in self.start_urls:
35 | yield scrapy.Request(url, dont_filter=False)
36 | # # 这里重写爬虫入口方法,将dont_filter设置为false
37 | # # 是为了让起始url放入srcapy.Request请求url池中,对起始url也做去重处理
38 | # # 否则会爬取到两次 https://www.qiushibaike.com/text/,一次是起始url
39 | # # 一次是分页数据里检索到的第一页
40 | def parse(self, response):
41 | # itemloader
42 | '''
43 | Only yesterday's news is kept here. This extracts the article URL of every entry in the JSON roll page (two levels deep: list page, then detail page).
44 | url = scrapy.Field()
45 | urlState = scrapy.Field()
46 | title = scrapy.Field()
47 | Hcontent = scrapy.Field()
48 | Tcontent = scrapy.Field()
49 | Acontent = scrapy.Field()
50 | newdate = scrapy.Field()
51 | fromWhere = scrapy.Field()
52 | :param response:
53 | :return:
54 | '''
55 | allDic = json.loads(response.body)
56 | # print(allDic)
57 | print(type(allDic))
58 | for one in allDic['result']['data']:
59 | itemloader = ItemLoader(item=News(), response=response )
60 | timeStamp = one['intime']
61 | timeArray = time.localtime(int(timeStamp))
62 | newsDatetemp = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
63 | newsDate = newsDatetemp.split(" ")[0]
64 | print(newsDate)
65 |
66 | url = ""
67 | if "url" in one:
68 | # print("有url的")
69 | url = one["url"]
70 | pass # 有就直接提取这个
71 | elif "urls" in one:
72 | print("没有url")
73 | tempUrl = one["urls"][0]
74 | url = tempUrl.replace("\/","/")
75 |
76 | print()
77 | # 添加进item
78 | lastDayDate = (date.today() + timedelta(days=-CRAWL_DELAY_DAY)).strftime("%Y-%m-%d") # settings里面有
79 | if newsDate ==lastDayDate: # 只取出昨天的新闻。特指只选择昨天的新闻,这样才对把
80 | itemloader.add_value('url',url) # 这儿我发现了,有些是没有这个字段的
81 | itemloader.add_value('title', one['title'])
82 | itemloader.add_value('newdate', newsDate)
83 | resultItem = itemloader.load_item() # item 也是可以传过去的,传过去继续填充。
84 | yield scrapy.Request(url=resultItem['url'][0],callback=self.newsContent,dont_filter=True,meta={"lastItem":resultItem})
85 | else:
86 | print("不是昨天的新闻,正在选择性跳过")
87 |
88 |
89 | # 这边是解析详情页的部分。
90 | @pysnooper.snoop() #这样就可以debug了
91 | def newsContent(self,response):
92 | title, Hcontent, Tcontent, Acontent = "", "", "", "" # 最后一个参数好像没什么用
93 | lastItem = response.meta.get("lastItem",None) # 这样就可以避免不行的。
94 |
95 | # 这边这个开始是划分句子,用html代码就可以,为了提取首段
96 | contentlist = []
97 | for allp in response.xpath("//div[@class='article']"): # //div[@class='article'] ,要取这下面的所有的文本对吧
98 | for p in allp.xpath("p"):
99 | print(p.extract())
100 | contentlist.append(p.extract())
101 | # contentlist.append(p.xpath("string(.)").extract_first().strip()) # 换用这种后呢,会不会就不会再发生那种事情了。
102 | print()
103 | print("全文中句子的数量有那么多{}".format(len(contentlist)))
104 | print(contentlist)
105 | if len(contentlist) > 0: # 是否是没有纯文本的新闻的处理写在管道里面就好了。
106 | print(contentlist[0]) # 取第一个作为首段的东西
107 | Hcontent = contentlist[0]
108 |
109 | # print("新闻的正文内容在这里。")
110 | Acontent = response.xpath("//div[@class='article']").extract_first() # 这个就是str
111 | # Tcontent = "".join(BeautifulSoup(Acontent, 'lxml').text)
112 | if Tcontent=="":
113 | print(Tcontent)
114 | print(Acontent)
115 | print("可能为图片新闻")
116 | print(response.url)
117 | # time.sleep(10)
118 |
119 | newsloader = ItemLoader(item=News(), response=response) # 但是使用这种方法插入进去的都会是list。
120 | newsloader.add_value('title', lastItem['title'][0])
121 | newsloader.add_value('Acontent', Acontent)
122 | # newsloader.add_value('Tcontent', Tcontent) # 统一有管道进行处理
123 | newsloader.add_value('Hcontent', Hcontent)
124 | newsloader.add_value('url', response.url)
125 | newsloader.add_value('urlState', "True")
126 | newsloader.add_value('fromWhere', "xinlang")
127 | newsloader.add_value("newdate", lastItem['newdate'][0])
128 |
129 |
130 | yield newsloader.load_item() # 这个扔给管道就可以了。
131 | print(newsloader.load_item())
132 | # time.sleep(60)
133 |
134 |
135 |
136 |
--------------------------------------------------------------------------------
/Crawler/TengxunMain.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 |
3 |
4 | if __name__ == '__main__':
5 | cmdline.execute("scrapy crawl tengxun".split())
6 |
--------------------------------------------------------------------------------
/Crawler/TogetherCrawl.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 | # 同时启动所有的爬虫进行爬取工作。
3 |
4 | if __name__ == '__main__':
5 | cmdline.execute("scrapy crawlall".split())
6 |
--------------------------------------------------------------------------------
/Crawler/WangyiMain.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 |
3 |
4 | if __name__ == '__main__':
5 | cmdline.execute("scrapy crawl wangyi".split())
6 |
--------------------------------------------------------------------------------
/Crawler/XinlangMain.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 |
3 |
4 | if __name__ == '__main__':
5 | cmdline.execute("scrapy crawl xinlang".split())
6 |
7 | # print("哈哈哈")
8 |
9 | # todo The Sina spider somehow also pulls in today's news; I only want yesterday's so the data stays tidy.
10 | # todo Tomorrow: finish sorting the news into the six category tables and the comments into the other six tables, then merge.
11 | # todo Do the filtering uniformly in the pipeline (the spiders can skip it), and also strip style/script tags there with bs4.
--------------------------------------------------------------------------------
/Crawler/desktop.ini:
--------------------------------------------------------------------------------
1 | [.ShellClassInfo]
2 | IconResource=C:\WINDOWS\System32\SHELL32.dll,27
3 | [ViewState]
4 | Mode=
5 | Vid=
6 | FolderType=Generic
7 |
--------------------------------------------------------------------------------
/Crawler/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = Crawler.settings
8 |
9 | [deploy]
10 | url = http://localhost:8088/
11 | project = Crawler
12 | username = demo
13 | password = 123456
14 |
15 |
--------------------------------------------------------------------------------
/Crawler/setup.py:
--------------------------------------------------------------------------------
1 | # Automatically created by: shub deploy
2 |
3 | from setuptools import setup, find_packages
4 |
5 | setup(
6 | name = 'project',
7 | version = '1.0',
8 | packages = find_packages(),
9 | entry_points = {'scrapy': ['settings = Crawler.settings']},
10 | )
11 |
--------------------------------------------------------------------------------
/Crawler/togetherCrawl_scheduling.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*
2 |
3 | import datetime
4 | import multiprocessing
5 | import os
6 | import schedule
7 | import time
8 | from scrapy import cmdline
9 | # 同时启动所有的爬虫进行爬取工作。
10 | from Crawler.settings import CRAWLALL_RUN_TIME
11 |
12 |
13 | def worker_1(interval):
14 | print ("开始所有爬虫工作")
15 | cmdline.execute("scrapy crawlall".split())
16 |
17 |
18 |
19 |
20 | class AutoRunAtTime: #这儿只是一个线程的
21 | def job(self,name): #这个是主线程把
22 | print("正在爬取今天的新闻内容")
23 | print('这里是进程: %s 父进程ID:%s' % (os.getpid(), os.getppid()))
24 | p1 = multiprocessing.Process(target=worker_1, args=(6,))
25 | # p3 = multiprocessing.Process(target=worker_3, args=(4,))
26 |
27 | p1.daemon = True
28 | # p2.daemon = True
29 |
30 | p1.start()
31 | # p2.start()
32 | # p3.start()
33 | print("The number of CPU is:" + str(multiprocessing.cpu_count()))
34 | for p in multiprocessing.active_children():
35 | print("child p.name:" + p.name + "\tp.id" + str(p.pid))
36 |
37 | p1.join()
38 | # p2.join()
39 |
40 |
41 | def startAutoRun(self,timeSet): #24小时制的时间输入,传入一个时间的字符串
42 | name = "scrapy_news"
43 | schedule.every().day.at(timeSet).do(self.job, name) # 应该也是24小时制的,记得 “输入24小时制的时间字符串
44 | while True:
45 | schedule.run_pending()
46 | # print("等待下一次...")
47 | time.sleep(1)
48 |
49 |
50 | if __name__=="__main__":
51 | autoRun = AutoRunAtTime()
52 | print(time.strftime('%Y.%m.%d', time.localtime(time.time())))
53 | print("现在的时间是")
54 | print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
55 | autoRun.startAutoRun(CRAWLALL_RUN_TIME) #测试直接这儿写运行时间比较方便
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # newsSpider scrapy
2 | As the name suggests, this is a news crawler. It is where I practice writing spiders with the Scrapy framework, and it is updated as I learn. It currently crawls Sina (xinlang) news; if it helps you too, a small star would be encouraging 😝
3 |
4 | ## xinlanggundong (a partial practice project 🙃 the complete version is below)
5 | This is the test project for the Sina rolling-news page.
6 |
7 | ## Crawler (fairly complete 😀 )
8 | A fairly complete Scrapy project that crawls Tencent (tengxun), NetEase (wangyi) and Sina (xinlang) news.
9 | ### Features:
10 | + Can run on a server with scheduled crawls
11 | + The pipeline writes into a self-defined MySQL schema; enable, disable or customise it in settings
12 | + The three spiders under the spiders/ folder can crawl all three sites together
13 | + Downloads the article text and images from Tencent, NetEase and Sina news locally
14 | + The pipeline also generates a word-cloud image for every article
15 |
16 | ### Usage:
17 | + 1. Install a Scrapy environment (conda is recommended)
18 | + 2. git clone https://github.com/realzhengyiming/newsSpier_scrapy.git
19 | + 3. ```cd Crawler```
20 | + 4. ```python TogetherCrawl.py``` runs all three spiders at once (Tencent, NetEase, Sina)
21 | + 5. ```python togetherCrawl_scheduling.py``` runs all three spiders on a schedule (set the time in settings.py first)
22 | +    set ``` CRAWLALL_RUN_TIME="XX:XX" ``` in settings.py (24-hour clock)
23 | +    for a scheduled run on Linux you can use ```nohup python togetherCrawl_scheduling.py```
24 | + 6. ```python TengxunMain.py``` crawls only Tencent news
25 | + 7. ```python WangyiMain.py``` crawls only NetEase news
26 | + 8. ```python XinlangMain.py``` crawls only Sina news
27 | + 9. A custom command is provided, so ```scrapy crawlall``` runs the three spiders together (the default ```scrapy crawl tengxun``` style also works)
28 |
29 | ### More settings (see the sketch after this file for a condensed view)
30 | + To download the images inside articles, set it in settings.py, e.g. ``` IMAGES_STORE = "../static/images/" ``` (a relative path is used here)
31 | + ```DOWMLOAD_IMG_IN_ACONTENT = False ``` turns localisation of article images on or off
32 | + Word-cloud generation is switched in settings.py:
33 | + ``` MAKE_WORDCLOUD_STORE = True ``` word clouds are on by default
34 | + ``` WORDCLOUD_STORE = "../static/images/WordCloud/" ``` sets where the word clouds are written; by default a relative path one level outside the project
35 |
36 | ### Note 🎃
37 | This project is part of a larger project, a Django + Scrapy news sentiment-analysis platform; this Scrapy part does the crawling and database insertion.
38 |
39 | The pipeline therefore does quite a lot: besides localising images and article text it also generates word clouds, and it even runs a simple word-frequency sentiment analysis before writing to the database.
40 | Adapt it to your needs and strip out the features you do not want.
41 |
42 |
43 |
44 | # todo
45 | Things still to do while practising.
46 | + Set the User-Agent (👌)
47 | + Proxies, similar to the UA handling (👌)
48 | + Set a request delay (👌)
49 | +   DOWNLOAD_DELAY in settings (👌)
50 | + Rewrite requests using selenium + headless Chrome for dynamic crawling (👌)
51 | + Extract the next page and keep crawling (👌)
52 | + Can crawl the Sina rolling-news page (defaults to the previous day, text only) (👌)
53 | + Can crawl the Sina rolling-news page (defaults to the previous day, text plus images) (👌)
54 | + Set the referer before downloading images when using Chrome + selenium
55 | + If that does not work well, consider requests-html, a newer library that can render dynamic pages
56 | + Custom download of media images (👌)
57 | + Write the data into MongoDB or another database such as MySQL (👌)
58 |
--------------------------------------------------------------------------------
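
For quick reference, the switches described in the README's "More settings" section, condensed into one block; the names and default values below are taken directly from `Crawler/Crawler/settings.py` above, nothing here is new.

```python
# Condensed view of the custom toggles in Crawler/Crawler/settings.py (defaults shown).
IMAGES_STORE = "../static/images/"               # where downloaded article images are stored (relative path)
DOWMLOAD_IMG_IN_ACONTENT = False                 # True/False: localise the images embedded in article HTML
WORDCLOUD_STORE = "../static/images/WordCloud/"  # where per-article word-cloud images are written
MAKE_WORDCLOUD_STORE = True                      # switch word-cloud generation on or off
CRAWLALL_RUN_TIME = "00:01"                      # 24-hour clock, used by togetherCrawl_scheduling.py
CRAWL_DELAY_DAY = 1                              # the spiders fetch news from this many days back
```
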
/xinlanggundong/.idea/deployment.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
--------------------------------------------------------------------------------
/xinlanggundong/.idea/encodings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/xinlanggundong/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/xinlanggundong/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/xinlanggundong/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/xinlanggundong/.idea/workspace.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
132 |
133 |
134 |
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 |
146 |
147 |
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 |
157 |
158 |
159 |
160 |
161 |
162 |
163 |
164 |
165 |
166 |
167 |
168 |
169 |
170 |
171 |
172 |
173 |
174 |
175 |
176 |
177 |
178 |
179 |
180 |
181 |
182 |
183 |
184 |
185 |
186 |
187 |
188 |
189 |
190 |
191 |
192 |
193 |
194 |
195 |
196 |
197 |
198 |
199 |
200 |
201 |
202 |
203 | de
204 | robot
205 | 50
206 | 看他的朋友圈就懂了
207 | count
208 | 提取url中
209 |
210 |
211 |
212 |
213 |
214 |
215 |
216 |
217 |
218 |
219 |
220 |
221 |
222 |
223 |
224 |
225 |
226 |
227 |
228 |
229 |
230 |
231 | true
232 | DEFINITION_ORDER
233 |
234 |
235 |
236 |
237 |
238 |
239 |
240 |
241 |
242 |
243 |
244 |
245 |
246 |
247 |
248 |
249 |
250 |
251 |
252 |
253 |
254 |
255 |
256 |
257 |
258 |
259 |
260 |
261 |
262 |
263 |
264 |
265 |
266 |
267 |
268 |
269 |
270 |
271 |
272 |
273 |
274 |
275 |
276 |
277 |
278 |
279 |
280 |
281 |
282 |
283 |
284 |
285 |
286 |
287 |
288 |
289 |
290 |
291 |
292 |
293 |
294 |
295 |
296 |
297 |
298 |
299 |
300 |
301 |
302 |
303 |
304 |
305 |
306 |
307 |
308 |
309 |
310 |
311 |
312 |
313 |
314 |
315 |
316 |
317 |
318 |
319 |
320 |
321 |
322 |
323 |
324 |
325 |
326 |
327 |
328 |
329 |
330 |
331 |
332 |
333 |
334 |
335 |
336 |
337 |
338 |
339 |
340 |
341 |
342 |
343 |
344 |
345 |
346 |
347 |
348 |
349 |
350 |
351 |
352 |
353 |
354 |
355 |
356 |
357 |
358 |
359 |
360 |
361 |
362 |
363 |
364 |
365 |
366 |
367 |
368 |
369 |
370 |
371 |
372 |
373 |
374 |
375 | project
376 |
377 |
378 |
379 |
380 |
381 |
382 |
383 |
384 |
385 |
386 |
387 |
388 |
389 |
390 |
391 |
392 |
393 |
394 |
395 |
396 |
397 |
398 |
399 |
400 | 1555981071936
401 |
402 |
403 | 1555981071936
404 |
405 |
406 |
407 |
408 |
409 |
410 |
411 |
412 |
413 |
414 |
415 |
416 |
417 |
418 |
419 |
420 |
421 |
422 |
423 |
424 |
425 |
426 |
427 |
428 |
429 |
430 |
431 |
432 |
433 |
434 |
435 |
436 |
437 |
438 |
439 |
440 |
441 |
442 |
443 |
444 |
445 |
446 |
447 |
448 |
449 |
450 |
451 |
452 |
453 |
454 |
455 |
456 |
457 |
458 |
459 |
460 |
461 |
462 |
463 |
464 |
465 |
466 |
467 |
468 |
469 |
470 |
471 |
472 |
473 |
474 |
475 |
476 |
477 |
478 |
479 |
480 |
481 |
482 |
483 |
484 |
485 |
486 |
487 |
488 |
489 |
490 |
491 |
492 |
493 |
494 |
495 |
496 |
497 |
498 |
499 |
500 |
501 |
502 |
503 |
504 |
505 |
506 |
507 |
508 |
509 |
510 |
511 |
512 |
513 |
514 |
515 |
516 |
517 |
518 |
519 |
520 |
521 |
522 |
523 |
524 |
525 |
526 |
527 |
528 |
529 |
530 |
531 |
532 |
533 |
534 |
535 |
536 |
537 |
538 |
539 |
540 |
541 |
542 |
543 |
544 |
545 |
546 |
547 |
548 |
549 |
550 |
551 |
552 |
553 |
554 |
555 |
556 |
557 |
558 |
559 |
560 |
561 |
562 |
563 |
564 |
565 |
566 |
567 |
568 |
569 |
570 |
571 |
572 |
573 |
574 |
575 |
576 |
577 |
578 |
579 |
580 |
581 |
582 |
--------------------------------------------------------------------------------
/xinlanggundong/.idea/xinlanggundong.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/xinlanggundong/README.md:
--------------------------------------------------------------------------------
1 | # Scrapy spider test for crawling Sina rolling news
2 | 2019-05-01
3 | It can now accurately crawl all of the previous day's news (set up this way so that everything from yesterday can be fetched at 1 a.m. the next day); this is an incremental crawler.
4 |
5 |
6 |
7 | # todo
8 | Things still to do.
9 | + Set the User-Agent (👌)
10 | + Proxies (👌), similar to the UA handling
11 | + Set a request delay (👌)
12 | +   DOWNLOAD_DELAY in settings
13 | + Can crawl the text version of all of the previous day's news
14 | + Most crawlable pages share the same layout (👌)
15 | + Found another layout; not many pages use it, so either add a parser or drop them
16 | + Rewrite requests using selenium + headless Chrome for dynamic crawling
17 | + Extract the next page and keep crawling
18 | + Download media images
19 | + Write the data into MongoDB
--------------------------------------------------------------------------------
/xinlanggundong/ViewData.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "import numpy as np"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 5,
16 | "metadata": {},
17 | "outputs": [],
18 | "source": [
19 | "df = pd.read_csv(\"lastday.csv\",encoding=\"gb18030\")"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 7,
25 | "metadata": {},
26 | "outputs": [
27 | {
28 | "name": "stdout",
29 | "output_type": "stream",
30 | "text": [
31 | "\n",
32 | "RangeIndex: 2227 entries, 0 to 2226\n",
33 | "Data columns (total 4 columns):\n",
34 | "Pcontent 2210 non-null object\n",
35 | "newsDate 2227 non-null object\n",
36 | "title 2227 non-null object\n",
37 | "url 2227 non-null object\n",
38 | "dtypes: object(4)\n",
39 | "memory usage: 69.7+ KB\n"
40 | ]
41 | }
42 | ],
43 | "source": [
44 | "df.info()"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 8,
50 | "metadata": {},
51 | "outputs": [
52 | {
53 | "data": {
54 | "text/html": [
55 | "<!-- rendered DataFrame HTML omitted: the HTML markup was stripped during extraction; the equivalent text/plain output follows -->\n"
118 | ],
119 | "text/plain": [
120 | " Pcontent newsDate \\\n",
121 | "0 印巴空战过去已经整整两个月了,由于各种各样的原因,2月27日发生在南亚上空的那场空战,于外界... 2019-04-30 \n",
122 | "1 节前重磅!科创板两融规则来了,券源扩大、借入人需符合4条件、证金公司可出借自有证券…来看十大... 2019-04-30 \n",
123 | "2 蚂蚁金服清空趣店股份,从合作终止到“被动”荣升二股东,双方早生嫌隙?趣店的前六大股东中,只剩... 2019-04-30 \n",
124 | "3 苹果将在周二盘后发布其2019财年2季度财报,今年以来持续性反弹的公司股价将面临考验。苹果公... 2019-04-30 \n",
125 | "4 原标题:助神舟、嫦娥任务成功的功勋舰远望2号落户地披露:江苏江阴新华社消息,圆满完成40多年... 2019-04-30 \n",
126 | "\n",
127 | " title \\\n",
128 | "0 “枭龙”击落印度苏-30MKI证据确凿?照片系人为捏造 \n",
129 | "1 科创板两融规则来了 来看十大关键点 \n",
130 | "2 蚂蚁金服清空趣店股份 双方或早生嫌隙? \n",
131 | "3 苹果财报看点:iPhone、服务业、5G手机一个都不能少 \n",
132 | "4 助攻嫦娥任务的功勋舰落户地披露:江苏江阴 \n",
133 | "\n",
134 | " url \n",
135 | "0 https://mil.news.sina.com.cn/jssd/2019-04-30/d... \n",
136 | "1 https://finance.sina.com.cn/stock/kechuangban/... \n",
137 | "2 https://finance.sina.com.cn/roll/2019-04-30/do... \n",
138 | "3 https://finance.sina.com.cn/stock/usstock/c/20... \n",
139 | "4 https://news.sina.com.cn/o/2019-04-30/doc-ihvh... "
140 | ]
141 | },
142 | "execution_count": 8,
143 | "metadata": {},
144 | "output_type": "execute_result"
145 | }
146 | ],
147 | "source": [
148 | "df.head()"
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": 10,
154 | "metadata": {},
155 | "outputs": [
156 | {
157 | "data": {
158 | "text/plain": [
159 | "2019-04-30 2226\n",
160 | "newsDate 1\n",
161 | "Name: newsDate, dtype: int64"
162 | ]
163 | },
164 | "execution_count": 10,
165 | "metadata": {},
166 | "output_type": "execute_result"
167 | }
168 | ],
169 | "source": [
170 | "df.newsDate.value_counts()"
171 | ]
172 | },
173 | {
174 | "cell_type": "code",
175 | "execution_count": 13,
176 | "metadata": {},
177 | "outputs": [
178 | {
179 | "data": {
180 | "text/html": [
181 | "<!-- rendered DataFrame HTML omitted: the HTML markup was stripped during extraction; the equivalent text/plain output follows -->\n"
216 | ],
217 | "text/plain": [
218 | " Pcontent newsDate title url\n",
219 | "10 Pcontent newsDate title url"
220 | ]
221 | },
222 | "execution_count": 13,
223 | "metadata": {},
224 | "output_type": "execute_result"
225 | }
226 | ],
227 | "source": [
228 | "df[df.newsDate!=\"2019-04-30\"]"
229 | ]
230 | },
231 | {
232 | "cell_type": "code",
233 | "execution_count": 18,
234 | "metadata": {},
235 | "outputs": [
236 | {
237 | "data": {
238 | "text/html": [
239 | "<!-- rendered DataFrame HTML omitted: the HTML markup was stripped during extraction; the equivalent text/plain output follows -->\n"
386 | ],
387 | "text/plain": [
388 | " Pcontent newsDate title url\n",
389 | "420 NaN 2019-04-30 贾跃亭被立案调查,中国股民该如何索赔? http://tech.sina.com.cn/csj/2019-04-30/doc-ihvhiewr9161860.shtml\n",
390 | "692 NaN 2019-04-30 融资13亿后突然死亡!被苹果点赞的明星创业公司倒闭 http://tech.sina.com.cn/csj/2019-04-30/doc-ihvhiqax5927259.shtml\n",
391 | "695 NaN 2019-04-30 揭秘腾讯最有权力的50人:他们掌握新科技帝国权杖 http://tech.sina.com.cn/csj/2019-04-30/doc-ihvhiewr9118737.shtml\n",
392 | "823 NaN 2019-04-30 硅谷20年沉浮亲历者:我见过狂欢 也见过狂欢后的下坠 http://tech.sina.com.cn/csj/2019-04-30/doc-ihvhiqax5909104.shtml\n",
393 | "1024 NaN 2019-04-30 人造鸡蛋来袭?这家公司拯救一万亿鸡蛋供求 http://tech.sina.com.cn/csj/2019-04-30/doc-ihvhiewr9129488.shtml\n",
394 | "1133 NaN 2019-04-30 创始人要怎么讲故事? http://tech.sina.com.cn/csj/2019-04-30/doc-ihvhiewr9107168.shtml\n",
395 | "1142 NaN 2019-04-30 瑞幸不是咖啡店:上市型创业的登峰造极 http://tech.sina.com.cn/csj/2019-04-30/doc-ihvhiewr9084907.shtml\n",
396 | "1186 NaN 2019-04-30 Uber和滴滴背后迥异的“悬崖式“监管 http://tech.sina.com.cn/csj/2019-04-30/doc-ihvhiqax5892377.shtml\n",
397 | "1293 NaN 2019-04-30 暴风冯鑫:成也“风口”,败也“风口” http://tech.sina.com.cn/csj/2019-04-30/doc-ihvhiewr9053993.shtml\n",
398 | "1317 NaN 2019-04-30 中关村“金三角”往事 http://tech.sina.com.cn/csj/2019-04-30/doc-ihvhiewr9049349.shtml\n",
399 | "1362 NaN 2019-04-30 起底身份倒卖产业:那些被公开叫卖的人生 http://tech.sina.com.cn/csj/2019-04-30/doc-ihvhiewr9053334.shtml\n",
400 | "1385 NaN 2019-04-30 翻了翻乐视早年的财报,它曾经真的有过梦想 http://tech.sina.com.cn/csj/2019-04-30/doc-ihvhiqax5866502.shtml\n",
401 | "1408 NaN 2019-04-30 和高通和解后 苹果5G负责人离职、部门重大重组 https://tech.sina.com.cn/roll/2019-04-30/doc-ihvhiqax5881451.shtml\n",
402 | "1517 NaN 2019-04-30 90颗钻石加持:苹果双折叠屏iPhone Z渲染图曝光 https://tech.sina.com.cn/mobile/n/n/2019-04-30/doc-ihvhiewr9043759.shtml\n",
403 | "1578 NaN 2019-04-30 比赛博朋克还朋克的生物黑客 http://tech.sina.com.cn/csj/2019-04-30/doc-ihvhiewr9061108.shtml\n",
404 | "1584 NaN 2019-04-30 苹果发布iOS 12.3新测试版:提升速度、稳定性 https://tech.sina.com.cn/mobile/n/n/2019-04-30/doc-ihvhiqax5856721.shtml\n",
405 | "1589 NaN 2019-04-30 Spotify付费订户达到1亿:比苹果多一倍 https://tech.sina.com.cn/i/2019-04-30/doc-ihvhiqax5858395.shtml"
406 | ]
407 | },
408 | "execution_count": 18,
409 | "metadata": {},
410 | "output_type": "execute_result"
411 | }
412 | ],
413 | "source": [
414 | "pd.set_option('max_colwidth',200)\n",
415 | "\n",
416 | "df[df.Pcontent.isnull()] # these all come from Sina opinion-column (新浪专栏) pages; a fairly small fraction"
417 | ]
418 | },
419 | {
420 | "cell_type": "code",
421 | "execution_count": 27,
422 | "metadata": {},
423 | "outputs": [],
424 | "source": [
425 | "df = df[['newsDate','title','url','Pcontent']]\n",
426 | "df.to_csv(\"output.csv\",index=0)"
427 | ]
428 | },
429 | {
430 | "cell_type": "code",
431 | "execution_count": 28,
432 | "metadata": {},
433 | "outputs": [
434 | {
435 | "name": "stdout",
436 | "output_type": "stream",
437 | "text": [
438 | "\n",
439 | "RangeIndex: 2227 entries, 0 to 2226\n",
440 | "Data columns (total 4 columns):\n",
441 | "newsDate 2227 non-null object\n",
442 | "title 2227 non-null object\n",
443 | "url 2227 non-null object\n",
444 | "Pcontent 2210 non-null object\n",
445 | "dtypes: object(4)\n",
446 | "memory usage: 69.7+ KB\n"
447 | ]
448 | }
449 | ],
450 | "source": [
451 | "dff = pd.read_csv(\"output.csv\")\n",
452 | "dff.info()"
453 | ]
454 | },
455 | {
456 | "cell_type": "code",
457 | "execution_count": null,
458 | "metadata": {},
459 | "outputs": [],
460 | "source": []
461 | }
462 | ],
463 | "metadata": {
464 | "kernelspec": {
465 | "display_name": "Python 3",
466 | "language": "python",
467 | "name": "python3"
468 | },
469 | "language_info": {
470 | "codemirror_mode": {
471 | "name": "ipython",
472 | "version": 3
473 | },
474 | "file_extension": ".py",
475 | "mimetype": "text/x-python",
476 | "name": "python",
477 | "nbconvert_exporter": "python",
478 | "pygments_lexer": "ipython3",
479 | "version": "3.7.0"
480 | }
481 | },
482 | "nbformat": 4,
483 | "nbformat_minor": 2
484 | }
485 |
--------------------------------------------------------------------------------
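The notebook above surfaces two data-quality issues in lastday.csv: one stray row that merely repeats the CSV header (index 10) and seventeen rows with an empty Pcontent (Sina column pages). A small pandas cleanup sketch along those lines, offered as an illustration rather than as part of the notebook; the output filename is an arbitrary choice.

```python
# Hypothetical cleanup for lastday.csv based on what the notebook found:
# a repeated header row and rows whose Pcontent is empty.
import pandas as pd

df = pd.read_csv("lastday.csv", encoding="gb18030")

# Drop the stray row that just repeats the column names.
df = df[df.newsDate != "newsDate"]

# Drop articles whose body text could not be extracted (Sina column pages).
df = df.dropna(subset=["Pcontent"])

# Reorder the columns as the notebook does and write a UTF-8 copy.
df = df[["newsDate", "title", "url", "Pcontent"]]
df.to_csv("output_clean.csv", index=False, encoding="utf-8")
```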
/xinlanggundong/lastday.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/realzhengyiming/newsSpier_scrapy/2852f63981f764bfc0f2c733d52f104a3bf3c9e1/xinlanggundong/lastday.csv
--------------------------------------------------------------------------------
/xinlanggundong/main.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 |
3 | cmdline.execute("scrapy crawl news".split())  # the spider is named "news" in spiders/xinlangspider.py, not "xinlangspider"
--------------------------------------------------------------------------------
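main.py above only launches the crawl. To also produce a CSV like the lastday.csv read by ViewData.ipynb, Scrapy's feed-export option can be appended to the same command; whether the original file was generated this way is an assumption. A sketch:

```python
# Run the "news" spider and export the scraped items to a CSV feed.
from scrapy import cmdline

cmdline.execute("scrapy crawl news -o lastday.csv".split())
```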
/xinlanggundong/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = xinlanggundong.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = xinlanggundong
12 |
--------------------------------------------------------------------------------
/xinlanggundong/xinlanggundong/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/realzhengyiming/newsSpier_scrapy/2852f63981f764bfc0f2c733d52f104a3bf3c9e1/xinlanggundong/xinlanggundong/__init__.py
--------------------------------------------------------------------------------
/xinlanggundong/xinlanggundong/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/realzhengyiming/newsSpier_scrapy/2852f63981f764bfc0f2c733d52f104a3bf3c9e1/xinlanggundong/xinlanggundong/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/xinlanggundong/xinlanggundong/__pycache__/items.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/realzhengyiming/newsSpier_scrapy/2852f63981f764bfc0f2c733d52f104a3bf3c9e1/xinlanggundong/xinlanggundong/__pycache__/items.cpython-36.pyc
--------------------------------------------------------------------------------
/xinlanggundong/xinlanggundong/__pycache__/middlewares.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/realzhengyiming/newsSpier_scrapy/2852f63981f764bfc0f2c733d52f104a3bf3c9e1/xinlanggundong/xinlanggundong/__pycache__/middlewares.cpython-36.pyc
--------------------------------------------------------------------------------
/xinlanggundong/xinlanggundong/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/realzhengyiming/newsSpier_scrapy/2852f63981f764bfc0f2c733d52f104a3bf3c9e1/xinlanggundong/xinlanggundong/__pycache__/settings.cpython-36.pyc
--------------------------------------------------------------------------------
/xinlanggundong/xinlanggundong/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class XinlanggundongItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | newsUrl = scrapy.Field() # news URL
15 | title = scrapy.Field() # news title
16 | dateTime = scrapy.Field() # news date
17 | pass
18 |
19 |
20 |
21 | # ---- The items below were added later for the news crawl; currently targeting Sina news
22 | class News(scrapy.Item):
23 | url = scrapy.Field()
24 | title = scrapy.Field()
25 | timestamp = scrapy.Field()
26 | newsDate = scrapy.Field()
27 |
28 |
29 | class NewsContent(scrapy.Item):
30 | url = scrapy.Field()
31 | title = scrapy.Field()
32 | Pcontent = scrapy.Field()
33 | newsDate = scrapy.Field()
34 |
--------------------------------------------------------------------------------
/xinlanggundong/xinlanggundong/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 | import random
8 | import time
9 | import scrapy
10 | from scrapy import signals
11 |
12 | from scrapy.conf import settings # note this import (scrapy.conf is deprecated; newer Scrapy exposes settings via crawler.settings or spider.settings)
13 | from selenium import webdriver
14 | from selenium.webdriver.chrome.options import Options
15 |
16 | # # todo
17 | # class SeleniumMiddleware(object):
18 | # """
19 | # Use selenium to fetch dynamically rendered page content
20 | # """
21 | # def process_request(self, request, spider):
22 | # print("Handling the request with selenium")
23 | # chrome_options = Options()
24 | # chrome_options.add_argument('--headless') # headless Chrome mode
25 | # chrome_options.add_argument('--disable-gpu')
26 | # chrome_options.add_argument('--no-sandbox')
27 | # # Point to the Chrome driver; a chromedriver matching the installed Chrome version is required. todo
28 | # self.driver = webdriver.Chrome(chrome_options=chrome_options,
29 | # executable_path=r'D:\chromedriver')
30 | # if request.url: # only fetch when there is a url
31 | # self.driver.get(request.url)
32 | # time.sleep(1) # manual throttling
33 | # html = self.driver.page_source
34 | # self.driver.quit()
35 | # print("Returning the rendered response")
36 | # # print(html) # can stay commented out for now
37 | # return scrapy.http.HtmlResponse(url=request.url, body=html.encode('utf-8'), encoding='utf-8',
38 | # request=request)
39 |
40 |
41 | class XinlanggundongSpiderMiddleware(object):
42 |
43 | # 1. Static UA rotation from the settings list; enabling this method is enough
44 | def process_request(self, request, spider):
45 | # Added by hand on top of the generated template: pick a random UA and set it
46 | print("Now using --> a user-defined UA")
47 | ua = random.choice(settings["USER_AGENT_LIST"])
48 | print(ua)
49 | request.headers['User-Agent'] = ua # attach the randomly chosen UA to the request
50 |
51 | # Proxy support: enable when needed; remember to define PROXY in settings, or pull from a maintained proxy pool (database)
52 | # proxy = random.choice(settings["PROXY"])
53 | # request.meta['proxy'] = proxy
54 |
55 |
56 | # def process_request(self, request, spider):
57 | # referer = request.url
58 | # if referer:
59 | # request.headers['referer'] = referer # how does Chrome set the referer?
60 |
61 |
62 | # 2. Chrome-based requests; the key is finding a chromedriver that matches the installed Chrome version.
63 | # def process_request(self, request, spider):
64 | # print("Issuing the request through Chrome")
65 | # print("Now using --> Chrome to fetch the page")
66 |
67 | # chrome_options = Options()
68 | # chrome_options.add_argument('--headless') # headless Chrome mode
69 | # chrome_options.add_argument('--disable-gpu')
70 | # chrome_options.add_argument('--no-sandbox')
71 | # # path to the Chrome driver
72 | # driver = webdriver.Chrome(chrome_options=chrome_options,
73 | # executable_path=r'D:/chromedriver')
74 | # if request.url: # only fetch when there is a url
75 | # driver.get(request.url)
76 | # time.sleep(0.8) # manual throttling
77 | # html = driver.page_source
78 | # driver.quit()
79 | # # print("Parsed request here")
80 | # # print(html) # can stay commented out for now
81 | # return scrapy.http.HtmlResponse(url=request.url, body=html.encode('utf-8'), encoding='utf-8',
82 | # request=request)
83 |
84 |
85 | @classmethod
86 | def from_crawler(cls, crawler):
87 | # This method is used by Scrapy to create your spiders.
88 | s = cls()
89 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
90 | return s
91 |
92 | def process_spider_input(self, response, spider):
93 | # Called for each response that goes through the spider
94 | # middleware and into the spider.
95 |
96 | # Should return None or raise an exception.
97 | return None
98 |
99 | def process_spider_output(self, response, result, spider):
100 | # Called with the results returned from the Spider, after
101 | # it has processed the response.
102 |
103 | # Must return an iterable of Request, dict or Item objects.
104 | for i in result:
105 | yield i
106 |
107 | def process_spider_exception(self, response, exception, spider):
108 | # Called when a spider or process_spider_input() method
109 | # (from other spider middleware) raises an exception.
110 |
111 | # Should return either None or an iterable of Response, dict
112 | # or Item objects.
113 | pass
114 |
115 | def process_start_requests(self, start_requests, spider):
116 | # Called with the start requests of the spider, and works
117 | # similarly to the process_spider_output() method, except
118 | # that it doesn’t have a response associated.
119 |
120 | # Must return only requests (not items).
121 | for r in start_requests:
122 | yield r
123 |
124 | def spider_opened(self, spider):
125 | spider.logger.info('Spider opened: %s' % spider.name)
126 |
--------------------------------------------------------------------------------
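The commented-out Selenium code in the middleware above starts and quits a browser for every single request. A leaner variant is sketched below: it keeps one shared headless Chrome instance and closes it when the spider closes. It is an illustration only and assumes Selenium 4 (which can locate a matching chromedriver by itself); it is not code from this repository.

```python
# Sketch of a Selenium downloader middleware with a single shared browser.
from scrapy import signals
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


class SeleniumMiddleware(object):
    @classmethod
    def from_crawler(cls, crawler):
        mw = cls()
        # Close the browser together with the spider instead of once per request.
        crawler.signals.connect(mw.spider_closed, signal=signals.spider_closed)
        return mw

    def __init__(self):
        opts = Options()
        opts.add_argument("--headless")
        opts.add_argument("--disable-gpu")
        opts.add_argument("--no-sandbox")
        self.driver = webdriver.Chrome(options=opts)  # Selenium 4 resolves the driver path itself

    def spider_closed(self, spider):
        self.driver.quit()

    def process_request(self, request, spider):
        # Render the page in the browser and hand the resulting HTML back to Scrapy.
        self.driver.get(request.url)
        html = self.driver.page_source
        return HtmlResponse(url=request.url, body=html.encode("utf-8"),
                            encoding="utf-8", request=request)
```

Like the existing middleware, it would be registered under DOWNLOADER_MIDDLEWARES in settings.py.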
/xinlanggundong/xinlanggundong/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 |
9 | # Returning the item here passes it on to the remaining pipelines.
10 |
11 | class XinlanggundongPipeline(object):
12 | def process_item(self, item, spider):
13 | return item
14 |
--------------------------------------------------------------------------------
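ViewData.ipynb shows that some crawled items (Sina column pages) end up with an empty Pcontent. A small pipeline sketch that drops such items before export is shown below; it is an illustration, not project code.

```python
# Hypothetical pipeline: discard items whose article body is empty.
from scrapy.exceptions import DropItem


class DropEmptyContentPipeline(object):
    def process_item(self, item, spider):
        pcontent = item.get("Pcontent")
        # ItemLoader stores values as lists; treat a missing field or blank text as empty.
        if not pcontent or not "".join(pcontent).strip():
            raise DropItem("empty Pcontent: %s" % item.get("url"))
        return item
```

It would be enabled the same way as XinlanggundongPipeline, via ITEM_PIPELINES in settings.py.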
/xinlanggundong/xinlanggundong/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for xinlanggundong project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'xinlanggundong'
13 |
14 | SPIDER_MODULES = ['xinlanggundong.spiders']
15 | NEWSPIDER_MODULE = 'xinlanggundong.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'xinlanggundong (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = False
23 |
24 | # Desktop user-agent pool
25 | USER_AGENT_LIST = [
26 | 'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30',
27 | 'Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0',
28 | 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.2; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)',
29 | 'Opera/9.80 (Windows NT 5.1; U; zh-cn) Presto/2.9.168 Version/11.50',
30 | 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1',
31 | 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)'
32 | ]
33 |
34 |
35 |
36 | # Mobile user-agents
37 | my_headers = [ # use mobile UAs here so the server returns the mobile page markup directly
38 | # 'Mozilla/5.0 (Linux; Android 7.1.1; MI 6 Build/NMF26X; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043508 Safari/537.36 MicroMessenger/6.5.13.1100 NetType/WIFI Language/zh_CN',
39 | # 'Mozilla/5.0 (Linux; Android 7.1.1; MI 6 Build/NMF26X) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 Mobile Safari/537.36 Maxthon/3047',
40 | 'Mozilla/5.0 (Linux; Android 8.0.0; Pixel 2 XL Build/OPD1.170816.004) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Mobile Safari/537.36',
41 | # 'Mozilla/5.0 (Linux; U; Android 7.0; zh-cn; STF-AL00 Build/HUAWEISTF-AL00) AppleWebKit/537.36 (KHTML, like Gecko)Version/4.0 Chrome/37.0.0.0 MQQBrowser/7.9 Mobile Safari/537.36',
42 | # 'Mozilla/5.0 (Linux; U; Android 6.0.1; zh-CN; SM-C7000 Build/MMB29M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/40.0.2214.89 UCBrowser/11.6.2.948 Mobile Safari/537.36',
43 | # 'Mozilla/5.0 (Linux; Android 7.0; STF-AL10 Build/HUAWEISTF-AL10; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043508 Safari/537.36 V1_AND_SQ_7.2.0_730_YYB_D QQ/7.2.0.3270 NetType/4G WebP/0.3.0 Pixel/1080'
44 | ]
45 |
46 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
47 | # CONCURRENT_REQUESTS = 32
48 |
49 | # Configure a delay for requests for the same website (default: 0)
50 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
51 | # See also autothrottle settings and docs
52 | # Download delay between requests.
53 | DOWNLOAD_DELAY = 0.5 # 0.5 or 0.8 is typical; anything much slower hurts throughput
54 | # The download delay setting will honor only one of:
55 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
56 | #CONCURRENT_REQUESTS_PER_IP = 16
57 |
58 | # Disable cookies (enabled by default)
59 | #COOKIES_ENABLED = False
60 |
61 | # Disable Telnet Console (enabled by default)
62 | #TELNETCONSOLE_ENABLED = False
63 |
64 | # Override the default request headers:
65 | #DEFAULT_REQUEST_HEADERS = {
66 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
67 | # 'Accept-Language': 'en',
68 | #}
69 |
70 | # Enable or disable spider middlewares
71 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
72 | #SPIDER_MIDDLEWARES = {
73 | # 'xinlanggundong.middlewares.XinlanggundongSpiderMiddleware': 543,
74 | #}
75 |
76 | FEED_EXPORT_ENCODING = "utf-8" # better to set gbk or utf-8 explicitly here instead of relying on the default
77 |
78 | # Enable or disable downloader middlewares
79 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
80 | DOWNLOADER_MIDDLEWARES = {
81 | # 'xinlanggundong.middlewares.MyCustomDownloaderMiddleware': 543,
82 | 'xinlanggundong.middlewares.XinlanggundongSpiderMiddleware': 543, # plug in our own middleware here (despite its name it acts as a downloader middleware, via process_request)
83 | 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None, # default priority is 500; None disables the built-in UA middleware (path updated from the old scrapy.contrib location)
84 | }
85 |
86 | # Enable or disable extensions
87 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
88 | #EXTENSIONS = {
89 | # 'scrapy.extensions.telnet.TelnetConsole': None,
90 | #}
91 |
92 | # Configure item pipelines
93 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
94 | #ITEM_PIPELINES = {
95 | # 'xinlanggundong.pipelines.XinlanggundongPipeline': 300,
96 | #}
97 |
98 | # Enable and configure the AutoThrottle extension (disabled by default)
99 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
100 | #AUTOTHROTTLE_ENABLED = True
101 | # The initial download delay
102 | #AUTOTHROTTLE_START_DELAY = 5
103 | # The maximum download delay to be set in case of high latencies
104 | #AUTOTHROTTLE_MAX_DELAY = 60
105 | # The average number of requests Scrapy should be sending in parallel to
106 | # each remote server
107 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
108 | # Enable showing throttling stats for every response received:
109 | #AUTOTHROTTLE_DEBUG = False
110 |
111 | # Enable and configure HTTP caching (disabled by default)
112 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
113 | #HTTPCACHE_ENABLED = True
114 | #HTTPCACHE_EXPIRATION_SECS = 0
115 | #HTTPCACHE_DIR = 'httpcache'
116 | #HTTPCACHE_IGNORE_HTTP_CODES = []
117 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
118 |
--------------------------------------------------------------------------------
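The commented-out proxy rotation in middlewares.py reads settings["PROXY"], but this settings file never defines that list. A minimal example of what the entry could look like (the addresses are placeholders, not working proxies):

```python
# Hypothetical PROXY pool for the commented-out rotation in middlewares.py.
# Replace the placeholder addresses with real proxies, or load them from a database.
PROXY = [
    "http://127.0.0.1:8888",
    "http://127.0.0.1:8889",
]

# In the middleware, one entry would then be chosen per request:
#   request.meta['proxy'] = random.choice(settings["PROXY"])
```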
/xinlanggundong/xinlanggundong/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/xinlanggundong/xinlanggundong/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/realzhengyiming/newsSpier_scrapy/2852f63981f764bfc0f2c733d52f104a3bf3c9e1/xinlanggundong/xinlanggundong/spiders/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/xinlanggundong/xinlanggundong/spiders/__pycache__/xinlangspider.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/realzhengyiming/newsSpier_scrapy/2852f63981f764bfc0f2c733d52f104a3bf3c9e1/xinlanggundong/xinlanggundong/spiders/__pycache__/xinlangspider.cpython-36.pyc
--------------------------------------------------------------------------------
/xinlanggundong/xinlanggundong/spiders/xinlangspider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import json
3 | from datetime import date, timedelta
4 |
5 | import pysnooper # package used for debugging
6 | import scrapy
7 | from scrapy.loader import ItemLoader
8 | import time
9 |
10 | from ..items import News
11 | from ..items import NewsContent
12 |
13 |
14 |
15 | class XinlangspiderSpider(scrapy.Spider):
16 | name = 'news'
17 | # Allowed domain: the crawl will not leave this domain
18 | allowed_domains = ['sina.com.cn'] # the target hosts are *.sina.com.cn; could offsite filtering be turned off instead?
19 | start_urls = [
20 | ]
21 |
22 | count = 1
23 | # {} placeholder for the page number, filled in with format(); this is the Sina rolling-news list API
24 | host_url = 'https://feed.mix.sina.com.cn/api/roll/get?pageid=153&lid=2509&k=&num=50&page={}'
25 |
26 | def start_requests(self):
27 | for num in range(1, 60): # how many list pages to crawl: 59 pages * 50 items per page is roughly 3000 entries, plenty for one day of Sina news
28 | print(self.host_url.format(num))
29 | self.start_urls.append(self.host_url.format(num))
30 | for url in self.start_urls: # the first level is crawled breadth-first
31 | yield scrapy.Request(url, dont_filter=False)
32 | # # The spider entry point is overridden here and dont_filter is set to False
33 | # # so the start urls also go through scrapy.Request's dedup filter;
34 | # # otherwise https://www.qiushibaike.com/text/ would be crawled twice: once as the start url
35 | # # and once as the first page found in the pagination data
36 |
37 |
38 | def parse(self, response): # every list page is handled here
39 | # itemloader
40 | allDic = json.loads(response.body)
41 | # print(allDic)
42 | print(type(allDic))
43 | for one in allDic['result']['data']:
44 | # print(one['url'])
45 | # print(one['title'])
46 | timeStamp = one['intime']
47 | timeArray = time.localtime(int(timeStamp))
48 | newsDatetemp = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
49 | newsDate = newsDatetemp.split(" ")[0] # date part as a string
50 | # print(newsDate)
51 | # print(one['intime'])
52 |
53 | # Compare against reference dates; note that today's entries are deliberately skipped
54 | lastDay = (date.today() + timedelta(days=-1)).strftime("%Y-%m-%d") # yesterday's date
55 | lastDayEnd = (date.today() + timedelta(days=-2)).strftime("%Y-%m-%d") # the day before yesterday
56 |
57 | # A crawl run the following day will see today's, yesterday's and the day-before-yesterday's entries.
58 | if newsDate == lastDay:
59 | # fill the item
60 | itemloader = ItemLoader(item=News(), response=response)
61 | itemloader.add_value('url', one['url'])
62 | itemloader.add_value('title', one['title'])
63 | itemloader.add_value('timestamp', one['intime'])
64 | itemloader.add_value('newsDate', newsDate)
65 | resultItem = itemloader.load_item()
66 | yield scrapy.Request(url=resultItem['url'][0], callback=self.newsContent, dont_filter=True,
67 | meta={"lastItem": resultItem})
68 |
69 | elif newsDate == lastDayEnd: # reached the day before yesterday
70 | break # stop scanning this page
71 |
72 | else: # today's entries are simply ignored
73 | print("This entry is from today, skipping it --> we only want the complete set of yesterday's news. 😁")
74 | print()
75 | # print("done with this page")
76 |
77 | # Parsing of the article detail page.
78 | @pysnooper.snoop() # enables line-by-line debugging output
79 | def newsContent(self, response):
80 | print()
81 | print()
82 | lastItem = response.meta["lastItem"]
83 | print(lastItem['url'][0])
84 | print(lastItem['title'][0])
85 | print(lastItem['newsDate'][0])
86 | # print(response.body)
87 | contentlist = []
88 | print("Full article text follows")
89 | # print(response.xpath("//div[@class='article']").xpath('string(.)').extract_first())
90 | for allp in response.xpath("//div[@class='article']"): # collect all paragraph text under //div[@class='article']
91 | print(allp.xpath("p"))
92 | for p in allp.xpath("p"):
93 | # print(p.xpath("text()").extract_first())
94 | contentlist.append(p.xpath("string(.)").extract_first().strip()) # string(.) also collects text in nested tags; switched to it to avoid the earlier missing-text issue
95 | print()
96 | print()
97 | print(contentlist)
98 | # time.sleep(60)
99 |
100 | # print(contentlist) # todo sometimes this comes back as None; investigate that case
101 | print(len(contentlist))
102 | tempContent = ""
103 | if len(contentlist) == 0:
104 | tempContent = ""
105 | else:
106 | # The join below can fail if contentlist contains something other than plain strings; just retry in that case.
107 | tempContent = "".join(contentlist) # todo: likely the culprit when joining fails
108 |
109 |
110 | print("Checking article number {}".format(self.count))
111 | self.count = self.count + 1
112 | print(tempContent)
113 | newsloader = ItemLoader(item=NewsContent(), response=response)
114 | newsloader.add_value('Pcontent', tempContent)
115 | newsloader.add_value('title', lastItem['title'][0])
116 | newsloader.add_value('url', lastItem['url'][0])
117 | newsloader.add_value("newsDate", lastItem['newsDate'][0])
118 |
119 | print(lastItem['newsDate'][0])
120 | # time.sleep(15)
121 |
122 | yield newsloader.load_item()
123 | # time.sleep(30)
124 |
125 |
--------------------------------------------------------------------------------