├── Crawler
│   ├── .gitignore
│   ├── .idea
│   │   ├── .name
│   │   ├── Crawler.iml
│   │   ├── dataSources.local.xml
│   │   ├── dataSources.xml
│   │   ├── dataSources
│   │   │   └── 2097b77a-0349-4758-8855-8b770b9e50b1.xml
│   │   ├── deployment.xml
│   │   ├── misc.xml
│   │   ├── modules.xml
│   │   ├── vcs.xml
│   │   └── workspace.xml
│   ├── Crawler
│   │   ├── .idea
│   │   │   ├── ImageSpider.iml
│   │   │   ├── deployment.xml
│   │   │   ├── misc.xml
│   │   │   ├── modules.xml
│   │   │   ├── other.xml
│   │   │   ├── vcs.xml
│   │   │   └── workspace.xml
│   │   ├── __init__.py
│   │   ├── commands
│   │   │   ├── __init__.py
│   │   │   └── crawlall.py
│   │   ├── expand_package
│   │   │   ├── Comment.py
│   │   │   ├── DBcontrol.py
│   │   │   ├── Sent_Dict
│   │   │   │   ├── __init__.py
│   │   │   │   ├── negative.txt
│   │   │   │   ├── positive.txt
│   │   │   │   ├── 否定词.txt
│   │   │   │   └── 程度级别词语.txt
│   │   │   ├── WordCloud.py
│   │   │   ├── __init__.py
│   │   │   ├── makebeautifulSoup.py
│   │   │   ├── negative.txt
│   │   │   ├── picDownloadScript.py
│   │   │   ├── positive.txt
│   │   │   ├── senti_dict.py
│   │   │   ├── senti_dict_class.py
│   │   │   ├── 否定词.txt
│   │   │   └── 程度级别词语.txt
│   │   ├── items.py
│   │   ├── middlewares.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   └── spiders
│   │       ├── __init__.py
│   │       ├── spider_expends.py
│   │       ├── tengxu.py
│   │       ├── wangyi.py
│   │       └── xinlang.py
│   ├── TengxunMain.py
│   ├── TogetherCrawl.py
│   ├── WangyiMain.py
│   ├── XinlangMain.py
│   ├── desktop.ini
│   ├── scrapy.cfg
│   ├── setup.py
│   └── togetherCrawl_scheduling.py
├── README.md
└── xinlanggundong
    ├── .idea
    │   ├── deployment.xml
    │   ├── encodings.xml
    │   ├── misc.xml
    │   ├── modules.xml
    │   ├── vcs.xml
    │   ├── workspace.xml
    │   └── xinlanggundong.iml
    ├── README.md
    ├── ViewData.ipynb
    ├── lastday.csv
    ├── main.py
    ├── output(utf8).csv
    ├── scrapy.cfg
    └── xinlanggundong
        ├── __init__.py
        ├── __pycache__
        │   ├── __init__.cpython-36.pyc
        │   ├── items.cpython-36.pyc
        │   ├── middlewares.cpython-36.pyc
        │   └── settings.cpython-36.pyc
        ├── items.py
        ├── middlewares.py
        ├── pipelines.py
        ├── settings.py
        └── spiders
            ├── __init__.py
            ├── __pycache__
            │   ├── __init__.cpython-36.pyc
            │   └── xinlangspider.cpython-36.pyc
            └── xinlangspider.py
--------------------------------------------------------------------------------
/Crawler/.gitignore:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/realzhengyiming/newsSpier_scrapy/2852f63981f764bfc0f2c733d52f104a3bf3c9e1/Crawler/.gitignore
--------------------------------------------------------------------------------
/Crawler/.idea/.name:
--------------------------------------------------------------------------------
1 | Crawler
--------------------------------------------------------------------------------
/Crawler/.idea/dataSources.xml:
--------------------------------------------------------------------------------
(stripped IDE data-source config; the recoverable settings describe a MySQL source using com.mysql.jdbc.Driver at jdbc:mysql://localhost:3306/flask)
--------------------------------------------------------------------------------
/Crawler/Crawler/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/realzhengyiming/newsSpier_scrapy/2852f63981f764bfc0f2c733d52f104a3bf3c9e1/Crawler/Crawler/__init__.py
--------------------------------------------------------------------------------
/Crawler/Crawler/commands/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/realzhengyiming/newsSpier_scrapy/2852f63981f764bfc0f2c733d52f104a3bf3c9e1/Crawler/Crawler/commands/__init__.py
--------------------------------------------------------------------------------
/Crawler/Crawler/commands/crawlall.py:
--------------------------------------------------------------------------------
1 | from scrapy.commands import ScrapyCommand
2 | 
3 | from Crawler.expand_package.Comment import CommentCrawl
4 | from Crawler.expand_package.DBcontrol import DB
5 | 
6 | 
7 | class Command(ScrapyCommand):
8 |     requires_project = True
9 | 
10 |     def syntax(self):
11 |         return '[options]'
12 | 
13 |     def short_desc(self):
14 |         return 'Runs all of the spiders'
15 | 
16 |     def run(self, args, opts):
17 |         spider_list = self.crawler_process.spiders.list()
18 |         for name in spider_list:
19 |             self.crawler_process.crawl(name, **opts.__dict__)
20 |         self.crawler_process.start()
21 |         print("Finished crawling article text from all three news sites!")
22 |         # todo: first classify rows from the tengxun table into the Django tables
23 |         ## todo: the classification step also needs the sentiment scorer so scores get inserted alongside it
24 |         print("Classifying news and writing sentiment scores...")
25 |         dbtool = DB()
26 |         dbtool.classifyDB()
27 | 
28 |         print("Crawling Tencent news comments...")
29 |         commentC = CommentCrawl()
30 |         commentC.getCommentMain()  # the topic is parsed from each URL, and the comment URL is assembled from it
31 |         print("Today's crawl job is done!")
32 | 
--------------------------------------------------------------------------------
/Crawler/Crawler/expand_package/Comment.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Class that collects comments for crawled news articles
4 | # todo: the comment handling here still seems flaky
5 | import time
6 | 
7 | import emoji
8 | 
9 | # from DBcontrol import DB
10 | # from makebeautifulSoup import makeBS
11 | 
12 | # from NewsSenti.tengxun.DBcontrol import DB
13 | # from NewsSenti.tengxun.makebeautifulSoup import makeBS
14 | from Crawler.expand_package.DBcontrol import DB
15 | from Crawler.expand_package.makebeautifulSoup import makeBS
16 | 
17 | 
18 | class CommentCrawl(object):
19 |     def __init__(self):
20 |         self.dbHelper = DB()
21 | 
22 |     def changTimeToDate(self, dateString):  # convert a unix timestamp to a YYYY-MM-DD string
23 |         timeStamp = dateString
24 |         timeArray = time.localtime(timeStamp)
25 |         print(timeArray)
26 |         otherStyleTime = time.strftime("%Y-%m-%d", timeArray)
27 |         # print(otherStyleTime)
28 |         return otherStyleTime
29 | 
30 | 
31 |     def getNewsIdAndUrl(self):  # pull the id and url of every news row whose comments are still missing
32 |         # dbHelper = DB()
33 |         themeWord = 
['car','technology','home','entertainment','house','finance','sports'] #类别新闻 34 | resultDic = {} 35 | sqlHead = "select News_id,url from newssentimentanalysis_" 36 | sqlTail = "news where Mcontent='未提取'" # 记得更新了状态后要修改成已提取 37 | # 插入 38 | for theme in themeWord: 39 | print(sqlHead+theme+sqlTail) 40 | resultDic[theme] = self.dbHelper.__query__(sqlHead+theme+sqlTail)# 查询 41 | return resultDic #返回格式{'car':[{'id':xx,'url':xx},.....,'home'...] 42 | 43 | def getAwriteCommentJson(self,id,url,theme): #这个是评论专用的请求返回成字典的,theme 是方便找到表然后更新状态的。 44 | sqlHead = "update newssentimentanalysis_" 45 | sqlTail = "news set Mcontent='%s' and where url='%s '" # 更新指定表内的评论状态词,需要只处理腾讯的吗 46 | 47 | sql = sqlHead+theme+sqlTail % ("已提取",url) #这个是更新状态用的sql 48 | 49 | # 异常时一般是没有评论 50 | sqlERROR = sqlHead+theme+sqlTail % ("无评论",url) # 如果发现没有 51 | 52 | 53 | time.sleep(0.5) 54 | cooker = makeBS() 55 | commentRawUrl = "http://coral.qq.com/article/" 56 | cmt_id = cooker.getCmt_id(url) #去掉空格 57 | if cmt_id==None: 58 | return False # 没有找到的话,那就是没评论啊 59 | if cmt_id.find("'")!=-1: 60 | cmt_id = cmt_id.replace("'","") 61 | else : 62 | cmt_id = cmt_id.strip() 63 | 64 | #这个用来拼接用到。 65 | try: 66 | allUrl = commentRawUrl + str(cmt_id) + "/comment/#" 67 | print(allUrl) 68 | responseDic = cooker.makeBSjson(allUrl) 69 | commentList = responseDic['data']['commentid'] # todo 不知道怎么回事调用不到这个评论的。 70 | # print(commentList) 71 | from pprint import pprint 72 | for comment in commentList: 73 | pprint(type(comment['id'])) 74 | print(comment['id']) 75 | comment['content'] = emoji.demojize(comment['content']) #过滤emoji 76 | comment['userinfo']['nick'] = emoji.demojize(comment['userinfo']['nick']) 77 | comment['time']=self.changTimeToDate(comment['time']) #时间戳改成日期字符串 78 | print("新闻id "+ str(id)) 79 | print("新闻的url是 "+ url) 80 | if self.dbHelper.classifyDBComment(url=url,id=id,comment=comment) : #评论直接插入django表内的数据库,并且更新新闻评论状态. 
81 | print("更新成功") 82 | self.dbHelper.__query__(sql) # 这儿设置更新里面新闻的状态。 83 | else: 84 | print("更新失败") 85 | self.dbHelper.__query__(sqlERROR) # 这儿设置更新里面新闻的状态。 86 | print("已经成功更新此条新闻 "+url+" "+theme) 87 | print("") 88 | return True 89 | #-----------------------这儿可以合成sql语句的话就可以执行插入的操作了。----------------------- 90 | # 通过url来合成插入的sql语句,DBcontrol的方法中来做这些东西 91 | except Exception as e: 92 | print("此条可能无评论,正在跳过") 93 | # 这儿需要插入无评论才可以。 todo 94 | # self.dbHelper.__query__(sqlERROR) # 失败的话,更新成失败 95 | print(sqlERROR) #更新成 96 | print(e) 97 | return False 98 | 99 | 100 | def getCommentMain(self): # 这儿应该是提取出所有为提取的新闻,然后还要记得更新状态 101 | resultDic = self.getNewsIdAndUrl() # 返回的是拼装好的含主题的list 102 | # from pprint import pprint 103 | # pprint(resultDic) 104 | 105 | resultList = [] 106 | count = 0 107 | for theme in resultDic: 108 | print("现在是",theme) 109 | for oneNews in resultDic[theme]: 110 | count+=1 #这个累加,然后如果是到了一定的数量那就休眠一下 111 | if count%100==0: #每100条 112 | time.sleep(15) #休息两分钟。 113 | 114 | print(oneNews) #已经提取出来了 115 | print("获得commentjson") 116 | # 分类----------------------------------------更新原来的状态.---------------------------------------- 117 | sql = "" 118 | sql2="" 119 | sqlHead = "update newssentimentanalysis_" 120 | # 'update newssentimentanalysis_homenews set Mcontent="无评论" where News_id=1' 121 | sqlTail = "news set Mcontent = '已提取' where News_id={}" 122 | sqlTailErr = "news set Mcontent = '无评论' where News_id={}" 123 | 124 | # 插入正文得分的sql 125 | 126 | # 这句就是更新tengxun表中的数据,用id 127 | 128 | if oneNews['url'].find('auto') != -1 or oneNews['url'].find('car') != -1 : # 找到这个就是汽车,中间是表名 129 | sql = sqlHead + "car" + sqlTail 130 | sql2 = sqlHead + "car" + sqlTailErr 131 | pass 132 | elif oneNews['url'].find('tech') != -1: # 找到这个就是科技 133 | sql = sqlHead + "technology" + sqlTail 134 | sql2 = sqlHead + "technology" + sqlTailErr 135 | 136 | pass 137 | elif oneNews['url'].find('news') != -1: # 找到这个就是默认新闻 138 | sql = sqlHead + "home" + sqlTail 139 | sql2 = sqlHead + "home" + sqlTailErr 140 | 141 | pass 142 | elif oneNews['url'].find('ent') != -1: # 找到这个就是娱乐 143 | sql = sqlHead + "entertainment" + sqlTail 144 | sql2 = sqlHead + "entertainment" + sqlTailErr 145 | 146 | pass 147 | elif oneNews['url'].find('house') != -1: # 找到这个就是房产 148 | sql = sqlHead + "house" + sqlTail 149 | sql2 = sqlHead + "house" + sqlTailErr 150 | 151 | pass 152 | elif oneNews['url'].find('finance') != -1: # 找到这个就是经济 153 | sql = sqlHead + "finance" + sqlTail 154 | sql2 = sqlHead + "finance" + sqlTailErr 155 | 156 | pass 157 | elif oneNews['url'].find('sports') != -1: # 找到这个就是运动 158 | sql = sqlHead + "sports" + sqlTail 159 | sql2 = sqlHead + "sports" + sqlTailErr 160 | 161 | pass 162 | else: 163 | print("这边这种是网易的情况-归为默认新闻home中去") 164 | 165 | sql = sqlHead + "home" + sqlTail 166 | sql2 = sqlHead + "home" + sqlTailErr 167 | 168 | print(theme) # 分类 169 | if self.getAwriteCommentJson(id=oneNews['News_id'],url=oneNews['url'],theme=theme): #逐条插入,进行,这个不需要返回 170 | print("提取出评论") 171 | print(sql.format(oneNews['News_id'])) 172 | self.dbHelper.__query__(sql.format(oneNews['News_id'])) 173 | 174 | else: 175 | print("cmt_id 提取失败") 176 | print(sql2.format(oneNews['News_id'])) 177 | self.dbHelper.__query__(sql2.format(oneNews['News_id'])) 178 | print("更新无评论") 179 | 180 | print() 181 | 182 | # resultList.append(oneNews) # 添加进入 183 | print("finish comments crawl!") 184 | 185 | if __name__ == '__main__': 186 | commentC = CommentCrawl() 187 | # print(commentC.getNewsIdAndUrl()) 188 | # print(commentC.getCommentJson("http:////sports.qq.com//a//20190315//000008.htm",55)) 
#测试单个 189 | commentC.getCommentMain() #测试主题从url中提取,url又可以合成。 190 | 191 | -------------------------------------------------------------------------------- /Crawler/Crawler/expand_package/DBcontrol.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -* 2 | 3 | # 此处是使用非orm来操作数据库的简单池化操作的代码 4 | # 2018/9/8 修改成使用连接池的方式来进行数据库的链接 5 | # 需要导入如下的依赖库,如果没有,请 安装 pymysql ,DBUtils 6 | # 提取返回数据的全部变成了返回字典类型 7 | # 这个是连接数据库的东西,这次使用数据库连接池把,使用连接池可以避免反复的重新创建新连接 8 | 9 | import traceback 10 | from datetime import date, timedelta 11 | import emoji 12 | import pymysql as pymysql 13 | import time 14 | from DBUtils.PooledDB import PooledDB 15 | 16 | # 这个是从配置文件(同级目录下)config.py中加载链接数据库的数据 17 | # mysqlInfo 中格式如下放着就可以,也可以直接使用,把__init__函数中需要链接部分直接替换即可 18 | # mysqlInfo = { 19 | # "host": '127.0.0.1', 20 | # "user": 'root', 21 | # "passwd": '123456', 22 | # "db": 'test', #改同一个数据库了。 23 | # "port": 3306, 24 | # "charset": 'utf8' #这个是数据库的配置文件 25 | # } 26 | # from .senti_dict_class import Senti_dict_class 27 | from Crawler.expand_package.senti_dict import Senti_Text 28 | from Crawler.settings import mysqlInfo 29 | 30 | 31 | class DB: 32 | 33 | __pool = None #这个也是静态的属性 34 | 35 | def __init__(self): 36 | # 构造函数,创建数据库连接、游标,默认创建一个对象就获得一个连接,用完后就关闭就可以了 37 | self.coon = DB.getmysqlconn() #这个是默认创建出来的东西 38 | self.cur = self.coon.cursor(cursor=pymysql.cursors.DictCursor) 39 | 40 | # 数据库连接池连接 41 | @staticmethod # 这个是静态的方法可以直接调用的 42 | def getmysqlconn(): # 从连接池里面获得一个连接 43 | if DB.__pool is None: 44 | __pool = PooledDB(creator=pymysql, mincached=2, maxcached=20, host=mysqlInfo['host'], 45 | user=mysqlInfo['user'], passwd=mysqlInfo['passwd'], db=mysqlInfo['db'], 46 | port=mysqlInfo['port'], charset=mysqlInfo['charset']) 47 | # print(__pool) 48 | return __pool.connection() 49 | # 释放资源 50 | 51 | def dispose(self): #这儿只能断默认初始化的那个连接 52 | self.coon.close() 53 | self.cur.close() 54 | 55 | # ---------------- 这儿开始写方法----------------------- 56 | def ifExists(self,webTitle): 57 | coon = DB.getmysqlconn() # 每次都默认获得一个新连接来进行相关的操作 58 | cur = coon.cursor(cursor=pymysql.cursors.DictCursor) 59 | sql = "SELECT * FROM tengxun WHERE title='%s'and urlState='True';" 60 | #因为这儿没有加上try,catch,所以出问题 61 | try: 62 | cur.execute(sql%(webTitle)) 63 | except Exception as e: 64 | print(e) 65 | print("函数ifExists出问题了,你检查一下") 66 | print(sql%(webTitle)) 67 | rowNumber = cur.rowcount 68 | if rowNumber>0: 69 | return True 70 | else: 71 | return False 72 | 73 | 74 | # ------- 下面可以日常的直接编写操作数据库的代码--------------- 75 | 76 | 77 | def __query__(self,sql): # 自定义查询,返回字典的类型 78 | coon =DB.getmysqlconn() # 每次都默认获得一个新连接来进行相关的操作 79 | cur = coon.cursor(cursor=pymysql.cursors.DictCursor) # 这儿这个选项是设置返回结果为字典的类型,如果默认的话,那就是列表i 80 | # ----- 标准的查询模块 ---下面就是执行的部分 81 | try: 82 | cur.execute(sql) 83 | URLs = cur.fetchall() # 返回数据的列表,可以设置返回的是字典 84 | # ----- 85 | print(sql) 86 | print(cur.rowcount) 87 | coon.commit() 88 | 89 | 90 | return URLs 91 | except Exception as e: 92 | print(e) 93 | coon.rollback() 94 | finally: 95 | cur.close() 96 | coon.close() 97 | 98 | 99 | 100 | # 更新部分的例子,sql语句不同而已 101 | def updateById(self,id): 102 | coon =DB.getmysqlconn() # 每次都默认获得一个新连接来进行相关的操作 103 | cur = coon.cursor(cursor=pymysql.cursors.DictCursor) 104 | 105 | sql = "update tengxun set hadmix='True' where id = %d;" % int(id) #就只是更新一下相应的url的状态就可以了 106 | try: 107 | cur.execute(sql) 108 | # 提交 109 | coon.commit() 110 | except Exception as e: 111 | # 错误回滚 112 | print("更新出错") 113 | print(e) 114 | coon.rollback() 115 | finally: 116 | coon.commit() #提交这个事务 117 | 
cur.close() 118 | coon.close() 119 | 120 | 121 | # 插入的例子 122 | def insert(self,value): #这个是把网址先存到里面去url,这儿的意思是插入tengxun那个表 123 | coon =DB.getmysqlconn() # 每次都默认获得一个新连接来进行相关的操作 124 | cur = coon.cursor(cursor=pymysql.cursors.DictCursor) 125 | sql = "insert into testtable (value) values(%s)" 126 | try: 127 | cur.execute(sql,value) # 这样来直接把值替换进行就可以,注意类型 128 | # 提交 129 | coon.commit() 130 | except Exception as e: 131 | # 错误回滚 132 | print(sql) 133 | print(e) 134 | coon.rollback() 135 | finally: 136 | coon.commit() #提交这个事务 137 | cur.close() 138 | coon.close() 139 | 140 | 141 | def insert(self,value): #这个是把网址先存到里面去url,这儿的意思是插入tengxun那个表 142 | coon =DB.getmysqlconn() # 每次都默认获得一个新连接来进行相关的操作 143 | cur = coon.cursor(cursor=pymysql.cursors.DictCursor) 144 | sql = "insert into tengxun (url) values(%s)" 145 | try: 146 | cur.execute(sql,value) # 这样来直接把值替换进行就可以,注意类型 147 | # 提交 148 | coon.commit() 149 | except Exception as e: 150 | # 错误回滚 151 | print(sql) 152 | print(e) 153 | coon.rollback() 154 | finally: 155 | coon.commit() #提交这个事务 156 | cur.close() 157 | coon.close() 158 | 159 | 160 | # 更新的例子 todo 加上插入数据库的操作。把一个item传进来把 , 这个是可以统一使用的。 161 | def insertItem(self,item): 162 | ''' 163 | url = scrapy.Field() 164 | urlState = scrapy.Field() 165 | title = scrapy.Field() 166 | Hcontent = scrapy.Field() 167 | Tcontent = scrapy.Field() 168 | Acontent = scrapy.Field() 169 | newdate = scrapy.Field() 170 | fromWhere = scrapy.Field() 171 | :param item: 默认item是[] 列表内的,哪怕是一个元素也也是一样的。 172 | :return: 173 | ''' 174 | coon =DB.getmysqlconn() # 每次都默认获得一个新连接来进行相关的操作 175 | cur = coon.cursor(cursor=pymysql.cursors.DictCursor) 176 | sql = "insert into tengxun (url,urlState,title,Hcontent,Tcontent,Acontent,newdate,fromWhere)" \ 177 | " values('{0}','{1}','{2}','{3}','{4}','{5}','{6}','{7}')".format( 178 | item['url'][0],item['urlState'][0],item['title'][0],item['Hcontent'][0],item['Tcontent'][0], 179 | item['Acontent'][0],item['newdate'][0],item['fromWhere'][0]) 180 | 181 | try: 182 | print(sql) 183 | cur.execute(sql) # 这样来直接把值替换进行就可以,注意类型 184 | # 提交 185 | coon.commit() 186 | print("插入数据库tengxun成功") 187 | except Exception as e: 188 | # 错误回滚 189 | print(sql) 190 | print(e) 191 | coon.rollback() 192 | # time.sleep(30) 193 | finally: 194 | coon.commit() #提交这个事务 195 | cur.close() 196 | coon.close() 197 | 198 | # ----------------------------------评论的数据库分类插入,传入新闻的url和id,commentDic <聚合的dic> 199 | def classifyDBComment(self,url,id,comment): 200 | print("开始分类整理") # 201 | # print(comment['id']) 202 | sql = "" #评论正文插入 m nbvcbv 203 | sqlHead = "insert into newssentimentanalysis_" 204 | sqlTail = "comment (NikeName,Comment,Date,News_id_id) values (%s,%s,%s,%s)" 205 | 206 | # 插入评论得分的sql 207 | sql2 = "" 208 | sql2Tail = "analysis_comment(Pos_Score,Neg_score,Sentiment,Comment_id_id,Date) values (%s,%s,%s,last_insert_id(),%s)" # 这个我也知道 209 | 210 | # 这句就是更新新闻表中的数据,用id newssentimentanalysis_carcomment 211 | sqlNews = "" 212 | sqlNewsHead = "update newssentimentanalysis_" 213 | sqlNewsTail = "news SET Mcontent='已提取' where News_id=%s" #id是数字 214 | 215 | # 插入正文得 216 | # updateSql = "update tengxun SET hadmix='True' where id='%s' " #Mcontent,这个字段用来“未提取”-》“已提取 217 | 218 | if url.find('auto') != -1: # 找到这个就是汽车,中间是表名 219 | sql = sqlHead + "car" + sqlTail 220 | sql2 = sqlHead + "car" + sql2Tail 221 | sqlNews =sqlNewsHead+ "car"+ sqlNewsTail 222 | pass 223 | if url.find('tech') != -1: # 找到这个就是科技 224 | sql = sqlHead + "technology" + sqlTail 225 | sql2 = sqlHead + "technology" + sql2Tail 226 | sqlNews =sqlNewsHead+ "technology"+ sqlNewsTail 227 | 228 | if 
url.find('news') != -1: # 找到这个就是默认新闻 229 | sql = sqlHead + "home" + sqlTail 230 | sql2 = sqlHead + "home" + sql2Tail 231 | sqlNews =sqlNewsHead+ "home"+ sqlNewsTail 232 | 233 | 234 | if url.find('ent') != -1: # 找到这个就是娱乐 235 | sql = sqlHead + "entertainment" + sqlTail 236 | sql2 = sqlHead + "entertainment" + sql2Tail 237 | sqlNews =sqlNewsHead+ "entertainment"+ sqlNewsTail 238 | 239 | if url.find('house') != -1: # 找到这个就是房产 240 | sql = sqlHead + "house" + sqlTail 241 | sql2 = sqlHead + "house" + sql2Tail 242 | sqlNews =sqlNewsHead+ "house"+ sqlNewsTail 243 | 244 | if url.find('finance') != -1: # 找到这个就是经济 245 | sql = sqlHead + "finance" + sqlTail 246 | sql2 = sqlHead + "finance" + sql2Tail 247 | sqlNews =sqlNewsHead+ "finance"+ sqlNewsTail 248 | 249 | if url.find('sports') != -1: # 找到这个就是运动 250 | sql = sqlHead + "sports" + sqlTail 251 | sql2 = sqlHead + "sports" + sql2Tail 252 | sqlNews =sqlNewsHead+ "sports"+ sqlNewsTail 253 | 254 | else: 255 | pass # 未能分类,也放到默认的那儿去吗。 256 | 257 | # --------------------------------获取得分---------------------------------- 258 | # print(type(comment['id'])) 259 | print(comment['content']) 260 | print(emoji.demojize(comment['userinfo']['nick'])) 261 | 262 | print(url,str(id)) # 这儿也是没有做异常处理的。 263 | 264 | # senti_counter = Senti_dict_class() 265 | # pos_score, neg_score, SentiResult = .Senti_Text(text) 266 | pos_score, neg_score, SentiResult = Senti_Text(comment['content']) # 这个是纯文本部分 267 | # pos_score, neg_score, SentiResult = Senti_Text(comment['content']) # 这个是纯文本部分 268 | if SentiResult.find("[")!=-1: 269 | SentiResult = SentiResult.replact("[","") 270 | if SentiResult.find("]")!=-1: 271 | SentiResult = SentiResult.replact("]","") 272 | print(SentiResult) 273 | # 中立的情况好像是返回直接是0 274 | print(pos_score) 275 | # ---------------------------这边开始数据库插入相关操作----------------------------- 276 | coon = DB.getmysqlconn() # 每次都默认获得一个新连接来进行相关的操作 277 | cur = coon.cursor(cursor=pymysql.cursors.DictCursor) 278 | 279 | try: 280 | cur.execute(sql, ( 281 | comment['userinfo']['nick'], comment['content'],comment['time'], id)) # 插入指定的表(分类) 282 | 283 | 284 | cur.execute(sql2, ( 285 | pos_score, neg_score, SentiResult,comment['time'])) # 插入评分 ,加上了日期了 todo获得评分 286 | # print(sqlNews % int(id)) 287 | id = str(id) 288 | 289 | cur.execute(sqlNews, (id)) # 更新新闻的 Mcontent,这个是可以工作的啊 290 | 291 | coon.commit() 292 | return True 293 | # time.sleep() 294 | except Exception as e: 295 | print(pos_score) 296 | print(neg_score) 297 | print(SentiResult) 298 | # print(Tcontent) 299 | # 错误回滚 300 | print("事务回滚,跳过插入") 301 | # print(rowDic['id']) 302 | print(sql, ( 303 | comment['userinfo']['nick'], comment['content'],comment['time'], id)) 304 | 305 | print(id) 306 | print(type(id)) 307 | print(sqlNews % (id)) 308 | 309 | 310 | print(e) 311 | coon.rollback() 312 | traceback.print_exc() 313 | return False # 提取评论失败的都不管. 
314 | finally: 315 | coon.commit() # 提交这个事务 316 | cur.close() 317 | coon.close() 318 | print("这条新闻的评论写入完毕") 319 | 320 | # 把tengxun表中的数据,计算评分,并且分类到django表中去 321 | def classifyDB(self): # 322 | resultDic = self.__query__( # todo 测试部分 323 | "select id,url,title,urlState,Hcontent,Mcontent,Tcontent,Acontent,newdate,fromWhere from tengxun where urlState='True' and hadmix='False'") 324 | print("开始分类整理") 325 | for rowDic in resultDic: 326 | # 插入分类新闻主表的sql 327 | sql = "" 328 | sqlHead = "insert into newssentimentanalysis_" 329 | sqlTail = "news (url,Title,UrlState,Hcontent,Mcontent,Tcontent,Acontent,Date,fromWhere) values (%s,%s,%s,%s,%s,%s,%s,%s,%s)" 330 | 331 | # 插入正文得分的sql 332 | sql2 = "" 333 | sql2Tail = "analysis_news(Pos_Score,Neg_score,Sentiment,News_id_id,Date) values (%s,%s,%s,last_insert_id(),%s)" # 这个是sql的 334 | 335 | # 这句就是更新tengxun表中的数据,用id 336 | updateSql = "update tengxun SET hadmix='True' where id='%s' " # 这个是分类用的数据. 337 | 338 | if rowDic['url'].find('auto') != -1: # 找到这个就是汽车,中间是表名 339 | sql = sqlHead + "car" + sqlTail 340 | sql2 = sqlHead + "car" + sql2Tail 341 | pass 342 | if rowDic['url'].find('tech') != -1: # 找到这个就是科技 343 | sql = sqlHead + "technology" + sqlTail 344 | sql2 = sqlHead + "technology" + sql2Tail 345 | 346 | pass 347 | if rowDic['url'].find('news') != -1: # 找到这个就是默认新闻 348 | sql = sqlHead + "home" + sqlTail 349 | sql2 = sqlHead + "home" + sql2Tail 350 | 351 | pass 352 | if rowDic['url'].find('ent') != -1: # 找到这个就是娱乐 353 | sql = sqlHead + "entertainment" + sqlTail 354 | sql2 = sqlHead + "entertainment" + sql2Tail 355 | 356 | pass 357 | if rowDic['url'].find('house') != -1: # 找到这个就是房产 358 | sql = sqlHead + "house" + sqlTail 359 | sql2 = sqlHead + "house" + sql2Tail 360 | 361 | pass 362 | if rowDic['url'].find('finance') != -1: # 找到这个就是经济 363 | sql = sqlHead + "finance" + sqlTail 364 | sql2 = sqlHead + "finance" + sql2Tail 365 | 366 | pass 367 | if rowDic['url'].find('sports') != -1: # 找到这个就是运动 368 | sql = sqlHead + "sports" + sqlTail 369 | sql2 = sqlHead + "sports" + sql2Tail 370 | 371 | pass 372 | else: 373 | print("这边这种是网易的情况-归为默认新闻home中去") 374 | 375 | sql = sqlHead + "home" + sqlTail 376 | sql2 = sqlHead + "home" + sql2Tail 377 | 378 | pass # 未能分类,也放到默认的那儿去吗。 # 379 | 380 | # --------------------------------获取得分---------------------------------- 381 | print("Tcontent长度") 382 | print(len(rowDic['Tcontent'])) 383 | pos_score, neg_score, SentiResult = "", "", "" 384 | 385 | # senti_counter = Senti_dict_class() 386 | pos_score, neg_score, SentiResult = Senti_Text(rowDic['Tcontent']) 387 | # pos_score, neg_score, SentiResult = senti_counter.Senti_Text(rowDic['Tcontent']) # 这个是纯文本部分 388 | 389 | # todo 进行分数写入和的部分 390 | 391 | 392 | # pos_score, neg_score, SentiResult = Senti_Sentence(rowDic['Tcontent']) #这个是纯文本部分 393 | 394 | print("分类时候写入分数检查") 395 | print() 396 | 397 | # print(rowDic['Tcontent']) 398 | # print() 399 | print(sql % ( 400 | rowDic['url'], rowDic['title'], True, rowDic['Hcontent'], '未提取', rowDic['Tcontent'], rowDic['Acontent'], 401 | rowDic['newdate'], rowDic['fromWhere'] 402 | )) 403 | print(pos_score) 404 | print(neg_score) 405 | print(SentiResult) 406 | 407 | # ---------------------------这边开始数据库插入相关操作----------------------------- 408 | 409 | coon = DB.getmysqlconn() # 每次都默认获得一个新连接来进行相关的操作 410 | cur = coon.cursor(cursor=pymysql.cursors.DictCursor) 411 | # print(rowDic['url']) 412 | # print(rowDic['title']) 413 | # print(rowDic['Hcontent']) 414 | # print('未提取') 415 | # print(rowDic['Tcontent']) 416 | # print(rowDic['Acontent']) 417 | # print(rowDic['newdate']) 
418 | # print(rowDic['fromWhere']) 419 | 420 | # print((sql %( 421 | # rowDic['url'],rowDic['title'],"True",rowDic['Hcontent'],'未提取',rowDic['Tcontent'],rowDic['Acontent'],rowDic['newdate'],rowDic['fromWhere'] 422 | # ) 423 | # )) 424 | 425 | try: # 三个一起操作,很多麻烦事情的。可以,这样操作也是可以的。 426 | cur.execute(sql, 427 | ( 428 | rowDic['url'], rowDic['title'], True, rowDic['Hcontent'], '未提取', rowDic['Tcontent'], 429 | rowDic['Acontent'], rowDic['newdate'], rowDic['fromWhere'] 430 | ) 431 | ) # 插入指定的表(分类) 432 | 433 | print("插入成功才用得上这个的把。") # 无法提取到这个的。在写一次查询把。 434 | # print(cur.lastrowid()) # 上一个插入的id是,还真是有,那就直接返回过来就可以了 435 | # print(type(cur.lastrowid())) # 上一个插入的id是,还真是有,那就直接返回过来就可以了 436 | 437 | cur.execute(sql2, (pos_score, neg_score, SentiResult, rowDic['newdate'])) # 插入评分 todo获得评分 438 | cur.execute(updateSql, (rowDic['id'])) # 更新tengxun hadmix,这个是可以工作的啊 439 | # 提交 440 | coon.commit() 441 | 442 | except Exception as e: 443 | # 错误回滚 444 | print("事务回滚,跳过插入") 445 | # print(rowDic['id']) 446 | # print(sql%(rowDic['url'],rowDic['title'],True,rowDic['Hcontent'],'未使用',rowDic['Tcontent'],rowDic['Acontent'],rowDic['newdate'],rowDic['fromWhere'])) 447 | print(e) 448 | coon.rollback() 449 | traceback.print_exc() 450 | 451 | finally: 452 | # print("插入成功") 453 | coon.commit() # 提交这个事务 454 | cur.close() 455 | coon.close() 456 | print("今天的量分完了") 457 | 458 | 459 | 460 | 461 | if __name__ == "__main__": # 下面都是用来测试用的。 462 | 463 | chak = DB() 464 | # chak.classifyDB() 465 | # chak. 测试用调用 466 | chak.__query__("update newssentimentanalysis_carnews set Mcontent = '无评论' where News_id=4") 467 | 468 | 469 | 470 | print("DB finish!") 471 | 472 | 473 | 474 | 475 | -------------------------------------------------------------------------------- /Crawler/Crawler/expand_package/Sent_Dict/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/realzhengyiming/newsSpier_scrapy/2852f63981f764bfc0f2c733d52f104a3bf3c9e1/Crawler/Crawler/expand_package/Sent_Dict/__init__.py -------------------------------------------------------------------------------- /Crawler/Crawler/expand_package/Sent_Dict/否定词.txt: -------------------------------------------------------------------------------- 1 | 不 2 | 不是 3 | 不能 4 | 不可 5 | 没有 6 | 不要 7 | 别 8 | 没 9 | 无 10 | 莫 11 | 未 12 | 勿 13 | 休 14 | 甭 15 | 非 -------------------------------------------------------------------------------- /Crawler/Crawler/expand_package/Sent_Dict/程度级别词语.txt: -------------------------------------------------------------------------------- 1 | extreme 2 | 百分之百 3 | 倍加 4 | 备至 5 | 不得了 6 | 不堪 7 | 不可开交 8 | 不亦乐乎 9 | 不折不扣 10 | 彻头彻尾 11 | 充分 12 | 到头 13 | 地地道道 14 | 非常 15 | 极 16 | 极度 17 | 极端 18 | 极其 19 | 极为 20 | 截然 21 | 尽 22 | 惊人地 23 | 绝 24 | 绝顶 25 | 绝对 26 | 绝对化 27 | 刻骨 28 | 酷 29 | 满 30 | 满贯 31 | 满心 32 | 莫大 33 | 奇 34 | 入骨 35 | 甚为 36 | 十二分 37 | 十分 38 | 十足 39 | 死 40 | 滔天 41 | 痛 42 | 透 43 | 完全 44 | 完完全全 45 | 万 46 | 万般 47 | 万分 48 | 万万 49 | 无比 50 | 无度 51 | 无可估量 52 | 无以复加 53 | 无以伦比 54 | 要命 55 | 要死 56 | 已极 57 | 已甚 58 | 异常 59 | 逾常 60 | 贼 61 | 之极 62 | 之至 63 | 至极 64 | 卓绝 65 | 最为 66 | 佼佼 67 | 郅 68 | 綦 69 | 齁 70 | 最 71 | very 72 | 不为过 73 | 超 74 | 超额 75 | 超外差 76 | 超微结构 77 | 超物质 78 | 出头 79 | 多 80 | 浮 81 | 过 82 | 过度 83 | 过分 84 | 过火 85 | 过劲 86 | 过了头 87 | 过猛 88 | 过热 89 | 过甚 90 | 过头 91 | 过于 92 | 过逾 93 | 何止 94 | 何啻 95 | 开外 96 | 苦 97 | 老 98 | 偏 99 | 强 100 | 溢 101 | 忒 102 | 不过 103 | 不少 104 | 不胜 105 | 惨 106 | 沉 107 | 沉沉 108 | 出奇 109 | 大为 110 | 多 111 | 多多 112 | 多加 113 | 多么 114 | 分外 115 | 格外 116 | 够瞧的 117 | 够呛 118 | 好 119 | 好不 120 | 何等 121 | 很 122 | 
很是 123 | 坏 124 | 可 125 | 老 126 | 老大 127 | 良 128 | 颇 129 | 颇为 130 | 甚 131 | 实在 132 | 太 133 | 太甚 134 | 特 135 | 特别 136 | 尤 137 | 尤其 138 | 尤为 139 | 尤以 140 | 远 141 | 着实 142 | 曷 143 | 碜 144 | more 145 | 大不了 146 | 多 147 | 更 148 | 比较 149 | 更加 150 | 更进一步 151 | 更为 152 | 还 153 | 还要 154 | 较 155 | 较比 156 | 较为 157 | 进一步 158 | 那般 159 | 那么 160 | 那样 161 | 强 162 | 如斯 163 | 益 164 | 益发 165 | 尤甚 166 | 逾 167 | 愈 168 | 愈 ... 愈 169 | 愈发 170 | 愈加 171 | 愈来愈 172 | 愈益 173 | 远远 174 | 越 ... 越 175 | 越发 176 | 越加 177 | 越来越 178 | 越是 179 | 这般 180 | 这样 181 | 足 182 | 足足 183 | ish 184 | 点点滴滴 185 | 多多少少 186 | 怪 187 | 好生 188 | 还 189 | 或多或少 190 | 略 191 | 略加 192 | 略略 193 | 略微 194 | 略为 195 | 蛮 196 | 稍 197 | 稍稍 198 | 稍微 199 | 稍为 200 | 稍许 201 | 挺 202 | 未免 203 | 相当 204 | 些 205 | 些微 206 | 些小 207 | 一点 208 | 一点儿 209 | 一些 210 | 有点 211 | 有点儿 212 | 有些 213 | 半点 214 | 不大 215 | 不丁点儿 216 | 不甚 217 | 不怎么 218 | 聊 219 | 没怎么 220 | 轻度 221 | 弱 222 | 丝毫 223 | 微 224 | 相对 225 | last 226 | -------------------------------------------------------------------------------- /Crawler/Crawler/expand_package/WordCloud.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -* 2 | 3 | from wordcloud import WordCloud 4 | import PIL.Image as image 5 | import numpy as np 6 | import jieba 7 | import datetime 8 | import os 9 | import time 10 | 11 | from Crawler.settings import IMAGES_STORE, WORDCLOUD_STORE 12 | 13 | Yesterday = (datetime.datetime.now()-datetime.timedelta(days=2)).strftime('%Y-%m-%d')#昨天 14 | 15 | def trans_CN(text): 16 | #中文要进行分词,不像英文自动有空格 17 | wordlist = jieba.cut(text) 18 | result = ' '.join(wordlist) 19 | return result 20 | 21 | 22 | def Gen_WordCloud(text,Newsid): 23 | #输入:text文章内容,Newsid文章的id号 24 | #输出:image_path对应词云图片的路径 25 | text = trans_CN(text)#分词 26 | #mask = np.array(image.open('./static/images/cloud.png'))#如果要把词云形状弄成特定图形要用该语句 27 | wordcloud = WordCloud( 28 | #mask=mask, 29 | font_path = "C:\Windows\Fonts\simhei.ttf", #加载中文字体 30 | background_color='white', #背景色 31 | max_words=2000,#允许最大词汇 32 | #max_font_size=60 #最大号字体 33 | ).generate(text) 34 | 35 | image_produce = wordcloud.to_image() 36 | name = str(Newsid)+".png" #构造温江名 37 | # path = "../../static/images/WordCloud/" #保存文件夹 38 | path = WORDCLOUD_STORE 39 | if not os.path.exists(path): 40 | os.makedirs(path) 41 | save_path =path+name #保存的完整路径 这个地址也是创建到爬虫项目的外面,刚好,目录结构不变的情况下。 42 | print(save_path) 43 | wordcloud.to_file(save_path) #保存词云 44 | img_path=save_path+name #对应的要传给标签的路径 45 | #print("save to :",save_path) 46 | #image_produce.show() 47 | print("生成词云成功了!") 48 | return img_path 49 | 50 | if __name__=="__main__": 51 | Newsid="shitshit" 52 | text='近日,上汽大通官方公布了全新MPV车型G20的最新官图,从此次公布的官图中不难看出,大通G20在外形轮廓上沿用了家族式设计。大灯采用了全LED光源,造型极具科技感。内饰中控区采用了悬浮式设计,营造出了更多的储物空间。据悉,大通G20将在2019上海车展期间正式亮相。从官图细节中可以看出,大通G20的前脸设计相比G10车型焕然一新。不规则形状的大灯和硕大的进气格栅相连接,其大灯内部结构也更加复杂,采用全LED光源。侧面轮廓上,大通G20采用了悬浮式车窗设计。尾灯同样采用全LED光源,两侧尾灯之间采用镀铬条相连,尾部采用字母logo居中的形式,而非图形logo。内饰部分,厂方着重强调了悬浮式中控设计。从官图中可以看出,大通G20采用了旋钮式换挡操作,换挡旋钮四周集成了众多驾驶辅助功能,视觉效果上具备更强的科技感。而悬浮式设计则为底部营造了更大的储物空间,便于放置乘客带上车的手包或其它物品。目前,官方暂未透露新车将会搭载哪款动力总成。根据推测,大通G20有望搭载2.0T汽油发动机和1.9T柴油发动机,预计在2019年上海车展期间正式亮相。' 53 | Gen_WordCloud(text,Newsid) -------------------------------------------------------------------------------- /Crawler/Crawler/expand_package/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/realzhengyiming/newsSpier_scrapy/2852f63981f764bfc0f2c733d52f104a3bf3c9e1/Crawler/Crawler/expand_package/__init__.py 
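WordCloud.py above only exercises Gen_WordCloud on a hard-coded article in its __main__ block. As a hedged illustration of how it could be driven from the crawler's own data, the sketch below feeds article bodies from the tengxun table into Gen_WordCloud; the id/Tcontent/urlState column names are the ones selected in DBcontrol.classifyDB, but the snippet itself is not a file in this repository and assumes the project settings (mysqlInfo, WORDCLOUD_STORE) are importable.

from Crawler.expand_package.DBcontrol import DB
from Crawler.expand_package.WordCloud import Gen_WordCloud

db = DB()
# __query__ uses a DictCursor and fetchall(), so rows come back as dicts (or None if the query failed)
rows = db.__query__("select id, Tcontent from tengxun where urlState='True'")
for row in rows or []:
    # Gen_WordCloud segments the text with jieba and writes <id>.png under WORDCLOUD_STORE
    print(Gen_WordCloud(row['Tcontent'], row['id']))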
-------------------------------------------------------------------------------- /Crawler/Crawler/expand_package/makebeautifulSoup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -* 2 | 3 | #coding=utf-8 4 | import random 5 | import time 6 | import json 7 | import chardet 8 | import requests 9 | import retrying 10 | from bs4 import BeautifulSoup 11 | 12 | class makeBS: 13 | @retrying.retry(stop_max_attempt_number=4) #重试4次,每次等待多久呢 14 | def mobiResponse(self,requestURL): #这个留着吧 15 | print(requestURL) 16 | my_headers = [ # 这边为了得到直接的手机端的页面代码返回,直接使用手机ua 17 | # 'Mozilla/5.0 (Linux; Android 7.1.1; MI 6 Build/NMF26X; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043508 Safari/537.36 MicroMessenger/6.5.13.1100 NetType/WIFI Language/zh_CN', 18 | # 'Mozilla/5.0 (Linux; Android 7.1.1; MI 6 Build/NMF26X) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 Mobile Safari/537.36 Maxthon/3047', 19 | 'Mozilla/5.0 (Linux; Android 8.0.0; Pixel 2 XL Build/OPD1.170816.004) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Mobile Safari/537.36', 20 | # 'Mozilla/5.0 (Linux; U; Android 7.0; zh-cn; STF-AL00 Build/HUAWEISTF-AL00) AppleWebKit/537.36 (KHTML, like Gecko)Version/4.0 Chrome/37.0.0.0 MQQBrowser/7.9 Mobile Safari/537.36', 21 | # 'Mozilla/5.0 (Linux; U; Android 6.0.1; zh-CN; SM-C7000 Build/MMB29M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/40.0.2214.89 UCBrowser/11.6.2.948 Mobile Safari/537.36', 22 | # 'Mozilla/5.0 (Linux; Android 7.0; STF-AL10 Build/HUAWEISTF-AL10; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043508 Safari/537.36 V1_AND_SQ_7.2.0_730_YYB_D QQ/7.2.0.3270 NetType/4G WebP/0.3.0 Pixel/1080' 23 | ] 24 | headers = {"User-Agent": random.choice(my_headers), 'Referer':requestURL} # 默认值 25 | try: 26 | rawhtml = requests.get(requestURL, headers=headers, allow_redirects=True, #跳转怎么是false 27 | timeout=30) # 一般提取文本的话,那就用text,如果是文件就content 28 | # print(rawhtml.headers) 29 | # rawhtml.encoding = "GBK" ##gbk>gb2312 使用这种方式尚且还有乱码的情况,部分乱码,那就是gbk可以修复 30 | # print(chardet.detect(rawhtml.content)['encoding']) 31 | if ("GB2312" == chardet.detect(rawhtml.content)['encoding']): 32 | rawhtml.encoding = "gbk" 33 | else: 34 | rawhtml.encoding = chardet.detect(rawhtml.content)['encoding'] # 这样应该就可以直接默认来编码了 35 | if rawhtml.status_code == 504: 36 | print(504) 37 | return 38 | print(rawhtml.url) 39 | print("状态码" + str(rawhtml.status_code)) 40 | html = rawhtml.text 41 | return html #返回了这个网页的html 文档,然后再解析一次就可以了 42 | except Exception as e: 43 | print(e) 44 | return 45 | 46 | def makesoup(self,url): # 这儿是按页来打开的 47 | if url==None: 48 | return 49 | my_headers = [ 50 | 'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30', 51 | 'Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0', 52 | 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.2; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)', 53 | 'Opera/9.80 (Windows NT 5.1; U; zh-cn) Presto/2.9.168 Version/11.50', 54 | 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1', 55 | 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)'] 56 | 
headers = {"User-Agent": random.choice(my_headers)} #默认值 57 | if(url.find("ifeng.com")!=-1): #是凤凰的网址的话 58 | print("fenghuangNews") 59 | headers = {"User-Agent": random.choice(my_headers), 60 | "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8", 61 | 'Accept-Encoding':'gbk, utf-8', 62 | 'Accept-Language': 'zh-CN,zh;q=0.9', 63 | } 64 | if (url.find(".qq.com")!=-1): 65 | print("qqnews") 66 | headers = { 67 | "User-Agent": random.choice(my_headers), 68 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8", 69 | 'Accept-Encoding': 'gbk, utf-8', 70 | 'Accept-Language': 'zh-CN,zh;q=0.9', 71 | 'referer':url 72 | } 73 | """ 74 | 获取网站的soup对象,#看看还能不能增加代理的东西,进来 75 | 有两个请求头的自定义,但是,为什么要分开来呢 76 | """ 77 | # headers = {"User-Agent": random.choice(my_headers)} 78 | soup = None 79 | address = "http://223.203.0.14:8000" #默就是用了代理,怎么还是失败呢 80 | # address = None #访问页面的这个要使用代理才可以 81 | proxies = {'http': address, "https": address} # , 'https': 'http://localhost:8888',这儿现在就是没用代理的情况下 82 | try: 83 | rawhtml = requests.get(url, headers=headers, allow_redirects=True,timeout=60) #一般提取文本的话,那就用text,如果是文件就content 84 | if url.find("ifeng")!=-1: 85 | # print(chardet.detect(rawhtml.content)['encoding']) 经常性的检测错误 86 | print(chardet.detect(rawhtml.content)['encoding']) 87 | if ("GB2312"== chardet.detect(rawhtml.content)['encoding']): 88 | rawhtml.encoding = "gbk" 89 | else: 90 | rawhtml.encoding = "utf-8" # 这样应该就可以直接默认来编码了 91 | else: 92 | rawhtml.encoding = chardet.detect(rawhtml.content)['encoding'] #这样应该就可以直接默认来编码了 93 | # print(rawhtml.status_code) 94 | # print(rawhtml.headers) 95 | soup = BeautifulSoup(rawhtml.text, 'lxml') 96 | return soup 97 | except Exception as e: #如果超时的话就变成这样子 98 | print(e) 99 | # print(rawhtml.status_code) 100 | return soup #没有的话就是返回空的在这儿的None 101 | 102 | def makeBSjson(self,url): 103 | my_headers = [ 104 | 'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30', 105 | 'Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0', 106 | 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.2; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)', 107 | 'Opera/9.80 (Windows NT 5.1; U; zh-cn) Presto/2.9.168 Version/11.50', 108 | 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1', 109 | 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)'] 110 | headers = {"User-Agent": random.choice(my_headers)} # 默认值 111 | try: 112 | r = requests.get(url, headers=headers, allow_redirects=True,timeout=60) # 一般提取文本的话,那就用text,如果是文件就content 113 | json_response = r.content.decode() # 获取r的文本 就是一个json字符串 114 | # 将json字符串转换成dic字典对象 115 | dict_json = json.loads(json_response) 116 | return dict_json 117 | except Exception as e: 118 | print(e) 119 | 120 | #------------------------------2019-新增关于解析腾讯评论的请求-------------------------------------------------- 121 | def getCmt_id(self,url): 122 | try: 123 | url = url.replace("//",'/') #使用的时候再转换,因为数据库里面的是四条杠的。 124 | response = requests.get(url) 125 | html = response.content 126 | bs = BeautifulSoup(html,'lxml') 127 | # pprint(BS) 128 | for i in bs.find_all("script"): # 这儿那么多可以换成正则表达式来找出这一大长串的数字。 129 | if i.text.find("cmt_id") != -1: 130 | # print(i.text) # 都是数字的话,那就把连续的数字都提取出来好了。 131 
| for object in i.text.split(";"): 132 | if object.find("cmt_id") != -1: 133 | cmt_id = object.split("=")[1] # 这样会不会很危险呢。。。直接运行js代码。 134 | return(cmt_id) 135 | # print(object) 136 | return #如果没有找到的话,那就返回None 137 | except Exception as e: 138 | print(e) 139 | return 140 | 141 | 142 | 143 | 144 | 145 | 146 | if __name__ == "__main__": #这个就是url的东西 147 | url = "https://pl.ifeng.com/a/20181010/60101359_0.shtml" 148 | cooker = makeBS() 149 | html = cooker.makesoup(url) 150 | 151 | 152 | 153 | -------------------------------------------------------------------------------- /Crawler/Crawler/expand_package/picDownloadScript.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -* 2 | 3 | # 下载到的一个路径中去,把图片下载下来,并且把新闻里面的 4 | import hashlib 5 | import os 6 | import time 7 | import traceback 8 | import requests ##导入requests 9 | 10 | # from config import downloadPath 11 | from Crawler.settings import IMAGES_STORE # 这个是自己定义好的配置文件,一般可以放在相同目录下,可以直接访问。 12 | 13 | class Download: 14 | def __init__(self, path): # 先设置好下载的路径 15 | if (path == None): 16 | self.path = IMAGES_STORE # 这边也直接使用默认使用配置文件的地址 17 | print("是 None") # 每次管道也是还是重新生成的一个啊 18 | else: 19 | self.path = path 20 | 21 | def makeMd5(self,url): 22 | obj = hashlib.md5() 23 | obj.update(bytes(url,encoding="utf-8")) 24 | return obj.hexdigest() 25 | 26 | def downloadImg(self, img_url, imgName, referer, now_date): # 这个下载的模块是没有返回值的, 27 | time.sleep(0.5) 28 | ''' 29 | img_url, 图片的下载链接 30 | imgName, 下载的图片的名字 31 | referer, 这个参数是请求的时候防止加了referer参数的反反爬虫用的。 32 | now_date 图片下载到 指定路径下的什么文件夹内,这儿是使用 日期字段 作为文件夹 测试可以随意修改 33 | 设置根据图片的url生成唯一的md5码,scrapy 类似的。 34 | ''' 35 | headers = { 36 | 'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1", 37 | 'Referer': referer} ##浏览器请求头(大部分网站没有这个请求头会报错、请务必加上哦) 38 | try: 39 | # int(shit) # todo 图片可以不下载了 40 | img = requests.get(img_url, headers=headers) 41 | # print(img) 42 | # print(self.path) 43 | if (False == os.path.exists(os.path.join(self.path, now_date))): # 不存在这个目录的话 44 | os.makedirs(self.path + '/' + now_date) 45 | if imgName==None: # 不设置的话,默认就是md5 46 | imgName = self.makeMd5(img_url) #改良过后通过url来生成唯一的md5的 47 | dPath = os.path.join(self.path, now_date, imgName + '.jpg') # imgName传进来不需要带时间 48 | # print(dPath) 49 | print("图片的文件名 " + dPath) 50 | f = open(dPath, 'ab') 51 | f.write(img.content) 52 | f.close() 53 | # print("下载成功") 54 | return os.path.join( now_date, imgName + '.jpg') # 返回相对路径 55 | except Exception as e: 56 | print(img_url) 57 | print(e) 58 | traceback.print_exc() 59 | 60 | 61 | if __name__ == "__main__": 62 | # 局部测试代码 63 | imgUrl = "http://inews.gtimg.com/newsapp_match/0/5403685404/0" 64 | downloadTool = Download(None) # todo 这儿有一个问题就是,这个图片的下载地址网页部分是带地址的,所以,最好的是网页部分不需要要再加上地址的文件夹,统一使用 65 | path = downloadTool.downloadImg(img_url="http://img1.gtimg.com/datalib_img//18-07-03/a/fda81a84eb06919ba40782c45ebbc28d.jpg" , 66 | imgName = None, 67 | referer = None, 68 | now_date = "20190505") # 这个是下面新建力的文件夹 69 | print(path) -------------------------------------------------------------------------------- /Crawler/Crawler/expand_package/senti_dict.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf8 -*- 2 | 3 | 4 | 5 | import jieba 6 | import numpy as np 7 | import re 8 | #打开词典文件,返回列表 9 | def open_dict(Dict = 'name', path=r'./Sent_Dict/'): 10 | path = os.path.join(path, '%s.txt' % Dict) 11 | dictionary = open(path, 'r', encoding='utf-8') 12 | dict = [] 13 | 
for word in dictionary: 14 | word = word.strip('\n') 15 | dict.append(word) 16 | return dict 17 | 18 | def judgeodd(num): 19 | if (num % 2) == 0: 20 | return 'even' 21 | else: 22 | return 'odd' 23 | 24 | 25 | 26 | 27 | def open_file_as_text(filename): 28 | dict = [] 29 | with open(filename, encoding='utf-8') as f: 30 | # print(f.read()) 31 | dict = f.readlines() 32 | new = [] 33 | for word in dict: 34 | new.append(word.replace("\n", "")) 35 | print(new) 36 | return new 37 | 38 | 39 | import os 40 | current_path = os.path.dirname(__file__) 41 | deny_word = open_dict(Dict = '否定词', path= current_path) 42 | posdict = open_dict(Dict = 'positive', path= current_path) 43 | negdict = open_dict(Dict = 'negative', path= current_path) 44 | degree_word = open_dict(Dict = '程度级别词语', path= current_path) 45 | 46 | # deny_word = open_file_as_text(current_path+'否定词.txt') 47 | # posdict = open_file_as_text(current_path+'positive.txt') 48 | # negdict = open_file_as_text(current_path+'negative.txt') 49 | # degree_word = open_file_as_text(current_path+'程度级别词语.txt') 50 | 51 | mostdict = degree_word[degree_word.index('extreme')+1 : degree_word.index('very')]#权重4,即在情感词前乘以4 52 | verydict = degree_word[degree_word.index('very')+1 : degree_word.index('more')]#权重3 53 | moredict = degree_word[degree_word.index('more')+1 : degree_word.index('ish')]#权重2 54 | ishdict = degree_word[degree_word.index('ish')+1 : degree_word.index('last')]#权重0.5 55 | 56 | 57 | import jieba 58 | def sentiment_score_list(dataset): 59 | seg_sentence = dataset.split(' 。') 60 | count1 = [] 61 | count2 = [] 62 | for sen in seg_sentence: #循环遍历每一个评论 63 | segtmp = jieba.lcut(sen, cut_all=False) #把句子进行分词,以列表的形式返回 64 | i = 0 #记录扫描到的词的位置 65 | a = 0 #记录情感词的位置 66 | poscount = 0 #积极词的第一次分值 67 | poscount2 = 0 #积极词反转后的分值 68 | poscount3 = 0 #积极词的最后分值(包括叹号的分值) 69 | negcount = 0 70 | negcount2 = 0 71 | negcount3 = 0 72 | for word in segtmp: 73 | if word in posdict: # 判断词语是否是情感词 74 | poscount += 1 75 | c = 0 76 | for w in segtmp[a:i]: # 扫描情感词前的程度词 77 | if w in mostdict: 78 | poscount *= 4.0 79 | elif w in verydict: 80 | poscount *= 3.0 81 | elif w in moredict: 82 | poscount *= 2.0 83 | elif w in ishdict: 84 | poscount *= 0.5 85 | elif w in deny_word: 86 | c += 1 87 | if judgeodd(c) == 'odd': # 扫描情感词前的否定词数 88 | poscount *= -1.0 89 | poscount2 += poscount 90 | poscount = 0 91 | poscount3 = poscount + poscount2 + poscount3 92 | poscount2 = 0 93 | else: 94 | poscount3 = poscount + poscount2 + poscount3 95 | poscount = 0 96 | a = i + 1 # 情感词的位置变化 97 | 98 | elif word in negdict: # 消极情感的分析,与上面一致 99 | negcount += 1 100 | d = 0 101 | for w in segtmp[a:i]: 102 | if w in mostdict: 103 | negcount *= 4.0 104 | elif w in verydict: 105 | negcount *= 3.0 106 | elif w in moredict: 107 | negcount *= 2.0 108 | elif w in ishdict: 109 | negcount *= 0.5 110 | elif w in degree_word: 111 | d += 1 112 | if judgeodd(d) == 'odd': 113 | negcount *= -1.0 114 | negcount2 += negcount 115 | negcount = 0 116 | negcount3 = negcount + negcount2 + negcount3 117 | negcount2 = 0 118 | else: 119 | negcount3 = negcount + negcount2 + negcount3 120 | negcount = 0 121 | a = i + 1 122 | elif word == '!' 
or word == '!': ##判断句子是否有感叹号 123 | for w2 in segtmp[::-1]: # 扫描感叹号前的情感词,发现后权值+2,然后退出循环 124 | if w2 in posdict or negdict: 125 | poscount3 += 2 126 | negcount3 += 2 127 | break 128 | i += 1 # 扫描词位置前移 129 | 130 | 131 | # 以下是防止出现负数的情况 132 | pos_count = 0 133 | neg_count = 0 134 | if poscount3 < 0 and negcount3 > 0: 135 | neg_count += negcount3 - poscount3 136 | pos_count = 0 137 | elif negcount3 < 0 and poscount3 > 0: 138 | pos_count = poscount3 - negcount3 139 | neg_count = 0 140 | elif poscount3 < 0 and negcount3 < 0: 141 | neg_count = -poscount3 142 | pos_count = -negcount3 143 | else: 144 | pos_count = poscount3 145 | neg_count = negcount3 146 | 147 | count1.append([pos_count, neg_count]) 148 | count2.append(count1) 149 | count1 = [] 150 | 151 | return count2 152 | 153 | def sentiment_score(senti_score_list): 154 | score = [] 155 | for review in senti_score_list: 156 | score_array = np.array(review) 157 | Pos = np.sum(score_array[:, 0]) 158 | Neg = np.sum(score_array[:, 1]) 159 | AvgPos = np.mean(score_array[:, 0]) 160 | AvgPos = float('%.1f'%AvgPos) 161 | AvgNeg = np.mean(score_array[:, 1]) 162 | AvgNeg = float('%.1f'%AvgNeg) 163 | StdPos = np.std(score_array[:, 0]) 164 | StdPos = float('%.1f'%StdPos) 165 | StdNeg = np.std(score_array[:, 1]) 166 | StdNeg = float('%.1f'%StdNeg) 167 | score.append([Pos, Neg, AvgPos, AvgNeg, StdPos, StdNeg]) 168 | return score 169 | 170 | 171 | def Senti_Sentence(word): 172 | if word == '': 173 | return 0,0,'NEU' 174 | else: 175 | result = sentiment_score(sentiment_score_list(str(word))) # 情感分析 176 | pos_score = result[0][0] 177 | neg_score = result[0][1] 178 | if pos_score == neg_score: 179 | SentiResult='NEU' 180 | elif pos_score > neg_score: 181 | SentiResult='POS' 182 | else: 183 | SentiResult='NEG' 184 | #print(pos_score,neg_score,SentiResult) 185 | return float(pos_score),float(neg_score),SentiResult 186 | 187 | def Senti_Text(text): 188 | if text == '': 189 | return 0,0,'NEU' 190 | else: 191 | text = str(text) 192 | seg_sentence = re.split('。|!|?|……|,',text) 193 | print(seg_sentence) 194 | pos_sum=0 195 | neg_sum=0 196 | sen_num=0 197 | for sentence in seg_sentence: 198 | if sentence != '': 199 | pos,neg,res=Senti_Sentence(sentence) 200 | pos_sum+=pos 201 | neg_sum+=neg 202 | sen_num+=1 203 | else: 204 | pass 205 | print('句子数:',sen_num) 206 | try: 207 | pos_score = pos_sum/sen_num 208 | neg_score = neg_sum/sen_num 209 | if pos_score == neg_score: 210 | SentiResult='NEU' 211 | elif pos_score > neg_score: 212 | SentiResult='POS' 213 | else: 214 | SentiResult='NEG' 215 | #print(pos_score,neg_score,SentiResult) 216 | return float(pos_score),float(neg_score),SentiResult 217 | except Exception as e : # 218 | print(e) 219 | return 0,0,'NEU' 220 | 221 | 222 | 223 | if __name__=="__main__": 224 | #data = '你就是个王八蛋,混账玩意!你们的手机真不好用!非常生气,我非常郁闷!!!!' 
225 | #data2= '我好开心啊,非常非常非常高兴!今天我得了一百分,我很兴奋开心,愉快,开心' 226 | text='腾讯汽车 站]编辑从深圳市大兴观澜丰田了解到,卡罗拉双擎最高优惠0.30万元,促销时间为2019年03月01日--2019年03月03日, 欢迎有意向的朋友到店试乘试驾。卡罗拉双擎外观卡罗拉双擎内饰卡罗拉双擎细节版权声明:本文系腾讯汽车独家稿件,版权为腾讯汽车所有。文章内的价格为编辑在车市第一线真实采集到的当日价格,由于汽车价格变化莫测,同时此价格只是个体经销商的行为,所以价格仅供参考使用。' 227 | # print(sentiment_score_list(data)) 228 | # print(sentiment_score(sentiment_score_list(data))) 229 | #print(sentiment_score(sentiment_score_list(data2))) 230 | 231 | # current_path = os.path.dirname(__file__) 232 | 233 | print("当前的路径是") 234 | # print(current_path) 235 | pos_score,neg_score,SentiResult=Senti_Text(text) 236 | print( pos_score,neg_score,SentiResult) 237 | 238 | 239 | 240 | 241 | -------------------------------------------------------------------------------- /Crawler/Crawler/expand_package/senti_dict_class.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf8 -*- 2 | import jieba 3 | import numpy as np 4 | import re 5 | import jieba 6 | #打开词典文件,返回列表 7 | 8 | 9 | class Senti_dict_class: 10 | def __init__(self): 11 | pass 12 | 13 | # self.deny_word = self.open_dict(Dict='否定词', path=r'./Sent_Dict/') 14 | # self.posdict = self.open_dict(Dict='positive', path=r'./Sent_Dict/') 15 | # self.negdict = self.open_dict(Dict='negative', path=r'./Sent_Dict/') 16 | # self.degree_word = self.open_dict(Dict='程度级别词语', path=r'./Sent_Dict/') 17 | 18 | self.deny_word = self.open_file_as_text('./Sent_Dict/否定词.txt') 19 | self.posdict = self.open_file_as_text('./Sent_Dict/positive.txt') 20 | self.negdict = self.open_file_as_text('./Sent_Dict/negative.txt') 21 | self.degree_word = self.open_file_as_text('./Sent_Dict/程度级别词语.txt') 22 | 23 | 24 | self.mostdict = self.degree_word[self.degree_word.index('extreme') + 1: self.degree_word.index('very')] # 权重4,即在情感词前乘以4 25 | self.verydict = self.degree_word[self.degree_word.index('very') + 1: self.degree_word.index('more')] # 权重3 26 | self.moredict = self.degree_word[self.degree_word.index('more') + 1: self.degree_word.index('ish')] # 权重2 27 | self.ishdict = self.degree_word[self.degree_word.index('ish') + 1: self.degree_word.index('last')] # 权重0.5 28 | 29 | 30 | def open_file_as_text(self,filename): 31 | dict = [] 32 | with open(filename, encoding='utf-8') as f: 33 | # print(f.read()) 34 | dict=f.readlines() 35 | new = [] 36 | for word in dict: 37 | new.append(word.replace("\n","")) 38 | print(new) 39 | return new 40 | 41 | def open_dict(self,Dict = 'name', path=r'Sent_Dict/'): 42 | path = path + '%s.txt' % Dict 43 | dictionary = open(path, 'r', encoding='utf-8') 44 | dict = [] 45 | for word in dictionary: 46 | word = word.strip('\n') 47 | dict.append(word) 48 | return dict 49 | 50 | def judgeodd(self,num): 51 | if (num % 2) == 0: 52 | return 'even' 53 | else: 54 | return 'odd' 55 | 56 | 57 | 58 | 59 | 60 | 61 | def sentiment_score_list(self,dataset): 62 | seg_sentence = dataset.split(' 。') 63 | count1 = [] 64 | count2 = [] 65 | for sen in seg_sentence: #循环遍历每一个评论 66 | segtmp = jieba.lcut(sen, cut_all=False) #把句子进行分词,以列表的形式返回 67 | i = 0 #记录扫描到的词的位置 68 | a = 0 #记录情感词的位置 69 | poscount = 0 #积极词的第一次分值 70 | poscount2 = 0 #积极词反转后的分值 71 | poscount3 = 0 #积极词的最后分值(包括叹号的分值) 72 | negcount = 0 73 | negcount2 = 0 74 | negcount3 = 0 75 | for word in segtmp: 76 | if word in self.posdict: # 判断词语是否是情感词 77 | poscount += 1 78 | c = 0 79 | for w in segtmp[a:i]: # 扫描情感词前的程度词 80 | if w in self.mostdict: 81 | poscount *= 4.0 82 | elif w in self.verydict: 83 | poscount *= 3.0 84 | elif w in self.moredict: 85 | poscount *= 2.0 86 | elif w in self.ishdict: 87 | 
poscount *= 0.5 88 | elif w in self.deny_word: 89 | c += 1 90 | if self.judgeodd(c) == 'odd': # 扫描情感词前的否定词数 91 | poscount *= -1.0 92 | poscount2 += poscount 93 | poscount = 0 94 | poscount3 = poscount + poscount2 + poscount3 95 | poscount2 = 0 96 | else: 97 | poscount3 = poscount + poscount2 + poscount3 98 | poscount = 0 99 | a = i + 1 # 情感词的位置变化 100 | 101 | elif word in self.negdict: # 消极情感的分析,与上面一致 102 | negcount += 1 103 | d = 0 104 | for w in segtmp[a:i]: 105 | if w in self.mostdict: 106 | negcount *= 4.0 107 | elif w in self.verydict: 108 | negcount *= 3.0 109 | elif w in self.moredict: 110 | negcount *= 2.0 111 | elif w in self.ishdict: 112 | negcount *= 0.5 113 | elif w in self.degree_word: 114 | d += 1 115 | if self.judgeodd(d) == 'odd': 116 | negcount *= -1.0 117 | negcount2 += negcount 118 | negcount = 0 119 | negcount3 = negcount + negcount2 + negcount3 120 | negcount2 = 0 121 | else: 122 | negcount3 = negcount + negcount2 + negcount3 123 | negcount = 0 124 | a = i + 1 125 | elif word == '!' or word == '!': ##判断句子是否有感叹号 126 | for w2 in segtmp[::-1]: # 扫描感叹号前的情感词,发现后权值+2,然后退出循环 127 | if w2 in self.posdict or self.negdict: 128 | poscount3 += 2 129 | negcount3 += 2 130 | break 131 | i += 1 # 扫描词位置前移 132 | 133 | 134 | # 以下是防止出现负数的情况 135 | pos_count = 0 136 | neg_count = 0 137 | if poscount3 < 0 and negcount3 > 0: 138 | neg_count += negcount3 - poscount3 139 | pos_count = 0 140 | elif negcount3 < 0 and poscount3 > 0: 141 | pos_count = poscount3 - negcount3 142 | neg_count = 0 143 | elif poscount3 < 0 and negcount3 < 0: 144 | neg_count = -poscount3 145 | pos_count = -negcount3 146 | else: 147 | pos_count = poscount3 148 | neg_count = negcount3 149 | 150 | count1.append([pos_count, neg_count]) 151 | count2.append(count1) 152 | count1 = [] 153 | 154 | return count2 155 | 156 | def sentiment_score(self,senti_score_list): 157 | score = [] 158 | for review in senti_score_list: 159 | score_array = np.array(review) 160 | Pos = np.sum(score_array[:, 0]) 161 | Neg = np.sum(score_array[:, 1]) 162 | AvgPos = np.mean(score_array[:, 0]) 163 | AvgPos = float('%.1f'%AvgPos) 164 | AvgNeg = np.mean(score_array[:, 1]) 165 | AvgNeg = float('%.1f'%AvgNeg) 166 | StdPos = np.std(score_array[:, 0]) 167 | StdPos = float('%.1f'%StdPos) 168 | StdNeg = np.std(score_array[:, 1]) 169 | StdNeg = float('%.1f'%StdNeg) 170 | score.append([Pos, Neg, AvgPos, AvgNeg, StdPos, StdNeg]) 171 | return score 172 | 173 | 174 | def Senti_Sentence(self,word): 175 | if word == '': 176 | return 0,0,'NEU' 177 | else: 178 | result = self.sentiment_score(self.sentiment_score_list(str(word))) # 情感分析 179 | pos_score = result[0][0] 180 | neg_score = result[0][1] 181 | if pos_score == neg_score: 182 | SentiResult='NEU' 183 | elif pos_score > neg_score: 184 | SentiResult='POS' 185 | else: 186 | SentiResult='NEG' 187 | #print(pos_score,neg_score,SentiResult) 188 | return float(pos_score),float(neg_score),SentiResult 189 | 190 | def Senti_Text(self,text): 191 | if text == '': 192 | return 0,0,'NEU' 193 | else: 194 | text = str(text) 195 | seg_sentence = re.split('。|!|?|……|,',text) 196 | print(seg_sentence) 197 | pos_sum=0 198 | neg_sum=0 199 | sen_num=0 200 | for sentence in seg_sentence: 201 | if sentence != '': 202 | pos,neg,res=self.Senti_Sentence(sentence) 203 | pos_sum+=pos 204 | neg_sum+=neg 205 | sen_num+=1 206 | else: 207 | pass 208 | print('句子数:',sen_num) 209 | try: 210 | pos_score = pos_sum/sen_num 211 | neg_score = neg_sum/sen_num 212 | if pos_score == neg_score: 213 | SentiResult='NEU' 214 | elif pos_score > neg_score: 215 | 
SentiResult='POS' 216 | else: 217 | SentiResult='NEG' 218 | #print(pos_score,neg_score,SentiResult) 219 | return float(pos_score),float(neg_score),SentiResult 220 | except Exception as e : # 221 | print(e) 222 | return 0,0,'NEU' 223 | 224 | 225 | 226 | if __name__=="__main__": 227 | #data = '你就是个王八蛋,混账玩意!你们的手机真不好用!非常生气,我非常郁闷!!!!' 228 | #data2= '我好开心啊,非常非常非常高兴!今天我得了一百分,我很兴奋开心,愉快,开心' 229 | text='腾讯汽车 站]编辑从深圳市大兴观澜丰田了解到,卡罗拉双擎最高优惠0.30万元,促销时间为2019年03月01日--2019年03月03日, 欢迎有意向的朋友到店试乘试驾。卡罗拉双擎外观卡罗拉双擎内饰卡罗拉双擎细节版权声明:本文系腾讯汽车独家稿件,版权为腾讯汽车所有。文章内的价格为编辑在车市第一线真实采集到的当日价格,由于汽车价格变化莫测,同时此价格只是个体经销商的行为,所以价格仅供参考使用。' 230 | # print(sentiment_score_list(data)) 231 | # print(sentiment_score(sentiment_score_list(data))) 232 | #print(sentiment_score(sentiment_score_list(data2))) 233 | senti_counter = Senti_dict_class() 234 | pos_score,neg_score,SentiResult=senti_counter.Senti_Text(text) 235 | print( pos_score,neg_score,SentiResult) 236 | 237 | # senti_counter.open_file_as_text("Sent_Dict/否定词.txt") 238 | 239 | -------------------------------------------------------------------------------- /Crawler/Crawler/expand_package/否定词.txt: -------------------------------------------------------------------------------- 1 | 不 2 | 不是 3 | 不能 4 | 不可 5 | 没有 6 | 不要 7 | 别 8 | 没 9 | 无 10 | 莫 11 | 未 12 | 勿 13 | 休 14 | 甭 15 | 非 -------------------------------------------------------------------------------- /Crawler/Crawler/expand_package/程度级别词语.txt: -------------------------------------------------------------------------------- 1 | extreme 2 | 百分之百 3 | 倍加 4 | 备至 5 | 不得了 6 | 不堪 7 | 不可开交 8 | 不亦乐乎 9 | 不折不扣 10 | 彻头彻尾 11 | 充分 12 | 到头 13 | 地地道道 14 | 非常 15 | 极 16 | 极度 17 | 极端 18 | 极其 19 | 极为 20 | 截然 21 | 尽 22 | 惊人地 23 | 绝 24 | 绝顶 25 | 绝对 26 | 绝对化 27 | 刻骨 28 | 酷 29 | 满 30 | 满贯 31 | 满心 32 | 莫大 33 | 奇 34 | 入骨 35 | 甚为 36 | 十二分 37 | 十分 38 | 十足 39 | 死 40 | 滔天 41 | 痛 42 | 透 43 | 完全 44 | 完完全全 45 | 万 46 | 万般 47 | 万分 48 | 万万 49 | 无比 50 | 无度 51 | 无可估量 52 | 无以复加 53 | 无以伦比 54 | 要命 55 | 要死 56 | 已极 57 | 已甚 58 | 异常 59 | 逾常 60 | 贼 61 | 之极 62 | 之至 63 | 至极 64 | 卓绝 65 | 最为 66 | 佼佼 67 | 郅 68 | 綦 69 | 齁 70 | 最 71 | very 72 | 不为过 73 | 超 74 | 超额 75 | 超外差 76 | 超微结构 77 | 超物质 78 | 出头 79 | 多 80 | 浮 81 | 过 82 | 过度 83 | 过分 84 | 过火 85 | 过劲 86 | 过了头 87 | 过猛 88 | 过热 89 | 过甚 90 | 过头 91 | 过于 92 | 过逾 93 | 何止 94 | 何啻 95 | 开外 96 | 苦 97 | 老 98 | 偏 99 | 强 100 | 溢 101 | 忒 102 | 不过 103 | 不少 104 | 不胜 105 | 惨 106 | 沉 107 | 沉沉 108 | 出奇 109 | 大为 110 | 多 111 | 多多 112 | 多加 113 | 多么 114 | 分外 115 | 格外 116 | 够瞧的 117 | 够呛 118 | 好 119 | 好不 120 | 何等 121 | 很 122 | 很是 123 | 坏 124 | 可 125 | 老 126 | 老大 127 | 良 128 | 颇 129 | 颇为 130 | 甚 131 | 实在 132 | 太 133 | 太甚 134 | 特 135 | 特别 136 | 尤 137 | 尤其 138 | 尤为 139 | 尤以 140 | 远 141 | 着实 142 | 曷 143 | 碜 144 | more 145 | 大不了 146 | 多 147 | 更 148 | 比较 149 | 更加 150 | 更进一步 151 | 更为 152 | 还 153 | 还要 154 | 较 155 | 较比 156 | 较为 157 | 进一步 158 | 那般 159 | 那么 160 | 那样 161 | 强 162 | 如斯 163 | 益 164 | 益发 165 | 尤甚 166 | 逾 167 | 愈 168 | 愈 ... 愈 169 | 愈发 170 | 愈加 171 | 愈来愈 172 | 愈益 173 | 远远 174 | 越 ... 
越 175 | 越发 176 | 越加 177 | 越来越 178 | 越是 179 | 这般 180 | 这样 181 | 足 182 | 足足 183 | ish 184 | 点点滴滴 185 | 多多少少 186 | 怪 187 | 好生 188 | 还 189 | 或多或少 190 | 略 191 | 略加 192 | 略略 193 | 略微 194 | 略为 195 | 蛮 196 | 稍 197 | 稍稍 198 | 稍微 199 | 稍为 200 | 稍许 201 | 挺 202 | 未免 203 | 相当 204 | 些 205 | 些微 206 | 些小 207 | 一点 208 | 一点儿 209 | 一些 210 | 有点 211 | 有点儿 212 | 有些 213 | 半点 214 | 不大 215 | 不丁点儿 216 | 不甚 217 | 不怎么 218 | 聊 219 | 没怎么 220 | 轻度 221 | 弱 222 | 丝毫 223 | 微 224 | 相对 225 | last 226 | -------------------------------------------------------------------------------- /Crawler/Crawler/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | # class ImagespiderItem(scrapy.Item): 12 | # # define the fields for your item here like: 13 | # # name = scrapy.Field() 14 | # imgurl = scrapy.Field() 15 | # image_path = scrapy.Field() 16 | # pass 17 | # 18 | # 19 | # class TextItem(scrapy.Item): 20 | # textTitle = scrapy.Field() # title 21 | # #content = scrapy.Field() # 这个是文章的正文。 22 | # 23 | # 24 | # class simpleP(scrapy.Item): 25 | # simpleP = scrapy.Field() # 这个是单独的句子 KEYI 26 | 27 | # ------------这儿开始时新闻的。------------------ ,新浪的。 28 | class Image(scrapy.Item): 29 | src = scrapy.Field() 30 | path = scrapy.Field() 31 | title = scrapy.Field() # 或者说文件夹的名字。 32 | imagePath = scrapy.Field() 33 | 34 | 35 | 36 | class NewsContent(scrapy.Item): # 这个是具体的,图片也可以增回家一个字段把。 37 | url = scrapy.Field() 38 | title = scrapy.Field() 39 | Pcontent = scrapy.Field() 40 | timestamp = scrapy.Field() 41 | newsDate = scrapy.Field() 42 | imageUrls = scrapy.Field() # 可以调用原来的生成 43 | imagePath = scrapy.Field() # 保存在来相对位置 44 | 45 | # ----------- 三大新闻的 item 写入tengxun 数据表的这个,暂时主要是这四个字段 46 | class News(scrapy.Item): 47 | ''' 48 | title = 标题 49 | Hcontent = 这个是首句的意思,暂时没怎么用到的样子。 有这几个就可以了,html代码的首段,是可能为只有一个图片的。 50 | Tcontent = 纯文字的全文吗 51 | Acontent = 这个是html 的全文。 52 | ''' 53 | 54 | url = scrapy.Field() 55 | urlState = scrapy.Field() 56 | title = scrapy.Field() 57 | Hcontent = scrapy.Field() 58 | Tcontent = scrapy.Field() 59 | Acontent = scrapy.Field() 60 | newdate = scrapy.Field() 61 | fromWhere = scrapy.Field() 62 | 63 | 64 | 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /Crawler/Crawler/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | import random 8 | 9 | from scrapy import signals 10 | 11 | from scrapy.conf import settings # 这儿需要注意导入 12 | 13 | from requests_html import HTMLSession 14 | 15 | 16 | 17 | 18 | class ImagespiderDownloaderMiddleware(object): 19 | # Not all methods need to be defined. If a method is not defined, 20 | # scrapy acts as if the downloader middleware does not modify the 21 | # passed objects. 22 | @classmethod 23 | def from_crawler(cls, crawler): 24 | # This method is used by Scrapy to create your spiders. 
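# instantiate the middleware and connect its spider_opened handler to Scrapy's spider_opened signal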
25 | s = cls() 26 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 27 | return s 28 | 29 | 30 | def process_request(self,request,spider): # 后期可以改成使用requests-html的版本把,比较新。 31 | print("使用自定义请求") 32 | print(spider.name) 33 | ua = random.choice( settings["USER_AGENT_LIST"] ) 34 | print(ua) 35 | request.headers['User-Agent'] = ua # 提取到的ua随机设置给请求 36 | 37 | # referer = "https://gczfl01.com" # 这个先闭 38 | # if referer: 39 | # request.headers['referer'] = referer 40 | # 设置代理,需要使用的时候使用,并且记得settings中设置,或者维护的代理池中提取(数据库) 41 | # proxy = random.choice( settings["PROXY"] ) 42 | # request.meta['proxy'] = proxy 43 | 44 | 45 | 46 | 47 | 48 | pass 49 | 50 | def process_response(self, request, response, spider): 51 | # Called with the response returned from the downloader. 52 | 53 | # Must either; 54 | # - return a Response object 55 | # - return a Request object 56 | # - or raise IgnoreRequest 57 | return response 58 | 59 | def process_exception(self, request, exception, spider): 60 | # Called when a download handler or a process_request() 61 | # (from other downloader middleware) raises an exception. 62 | 63 | # Must either: 64 | # - return None: continue processing this exception 65 | # - return a Response object: stops process_exception() chain 66 | # - return a Request object: stops process_exception() chain 67 | pass 68 | 69 | def spider_opened(self, spider): 70 | spider.logger.info('Spider opened: %s' % spider.name) 71 | -------------------------------------------------------------------------------- /Crawler/Crawler/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import time 8 | from datetime import date, timedelta 9 | 10 | from bs4 import BeautifulSoup 11 | 12 | from Crawler.expand_package.WordCloud import Gen_WordCloud 13 | from Crawler.items import News # 这样导入才可以认识的啊 14 | from Crawler.items import Image,NewsContent # 这样导入才可以认识的啊 15 | from Crawler.expand_package.picDownloadScript import Download 16 | from Crawler.settings import CRAWL_DELAY_DAY, DOWMLOAD_IMG_IN_ACONTENT, MAKE_WORDCLOUD_STORE 17 | 18 | from Crawler.expand_package.DBcontrol import DB 19 | 20 | 21 | class newsPipeline(object): # 自己写的这个处理图片的管道,统一一个管道处理就可以了吗。 22 | def __init__(self): 23 | # 一个管道的下载的部分是一样的。 24 | self.downloadTool = Download(None) # setting 中设置默认的地址。 25 | # todo 增加mysql数据库组件也是一样的。 26 | self.crawlDelayDay = CRAWL_DELAY_DAY # 默认是爬取昨天的 27 | self.db = DB() 28 | 29 | def makeDateFolder(self): # 30 | crawl_date = (date.today() + timedelta(days=-self.crawlDelayDay)).strftime("%Y%m%d") # 昨天日期 31 | return crawl_date 32 | 33 | # def setFileTitle(self, title): 34 | # fileName = re.sub('[\/:*?"<>|]', '-', title) # 去掉非法字符 35 | # return fileName 36 | 37 | 38 | 39 | 40 | def downloadAndChangeImgPath(self,html_have_img,newsDate) -> str : 41 | ''' 42 | :param html_have_img: 新闻的正文的html 43 | :param newsDate: 新闻的日期(用来做下载图片的文件名) 44 | :return: img 中的src修改成下载到本地的地址 45 | ''' 46 | print("正在下载正文中") 47 | soup = BeautifulSoup( html_have_img , 'lxml') 48 | for img in soup.find_all("img"): 49 | tempSrc = img['src'] 50 | if tempSrc.find("http:") == -1: # 默认可能漏掉了这部分的 51 | tempSrc = "http:" + tempSrc 52 | # time.sleep(1) 53 | fixedSrc = self.downloadTool.downloadImg( 54 | img_url=tempSrc, 55 | imgName=None, 56 | referer=None, now_date=newsDate) # 这个是下面新建力的文件夹,默认都是延迟一天的。 57 
| img['src'] = fixedSrc # 这个地址放回去 58 | # 下载,返回path,然后修改。 59 | print(img['src']) 60 | print("图片下载并且修改src完成。") 61 | return [str(soup.extract()).replace("'", '"')] 62 | 63 | 64 | def fillter_Acontent(self,Acontent): # clean_the_Acontent which hava style or script 65 | soup = BeautifulSoup(Acontent, 'lxml') 66 | [s.extract() for s in soup("style")] 67 | [s.extract() for s in soup("script")] 68 | return str(soup) 69 | 70 | def process_item(self, item, spider): 71 | if isinstance(item, NewsContent): 72 | print("管道进来了!") 73 | if "imageUrls" in item: # 有图片才下载图片 这边的item还可以修改吗 74 | if len(item['imageUrls']) != 0: 75 | print(item['imageUrls']) 76 | downPath = [] 77 | for url in item['imageUrls']: 78 | tempPath = self.downloadTool.downloadImg( 79 | img_url=url, 80 | imgName=None, 81 | referer=None, now_date=self.makeDateFolder()) # 这个是下面新建力的文件夹 82 | downPath.append(tempPath) # 这个也是一个list 下载地址的。 83 | 84 | downPathList="" 85 | # todo 这个是用来放回去的 。 86 | for path in downPath: # 当成独立的
<p>标签
87 |                         downPathList = downPathList+"<p><img src='{}'></p>
".format(path) 88 | print(downPathList) 89 | # item['imagePath'] 90 | else: 91 | print("这个item没有图片") 92 | print(item['url']) 93 | # 返回item 94 | self.db.insert(item['url']) 95 | return item 96 | 97 | elif isinstance(item,Image): 98 | if "src" in item: # 有图片才下载图片 这边的item还可以修改吗 99 | if len(item['src']) != 0: 100 | print(item['src']) 101 | downPath = [] 102 | print(item['title']) 103 | for url in item['src']: 104 | tempPath = self.downloadTool.downloadImg( 105 | img_url=url, 106 | imgName=None, 107 | referer=None, now_date=(item['title'][0])) # 这个是下面新建力的文件夹 108 | downPath.append(tempPath) 109 | item['imagePath'] = downPath 110 | return item 111 | pass 112 | 113 | elif isinstance(item, News): # 这儿是 新闻爬虫的。 114 | print("正在处理item") 115 | # 下载图片还有修改Acontent中的img 116 | 117 | if item['Acontent'][0].find("img") != -1 and DOWMLOAD_IMG_IN_ACONTENT: # 发现纯文本的这里面有图片。才执行这个下载图片 118 | print("新闻中有图片,正在本地化处理......") 119 | print(item['url']) 120 | # 这儿注释掉,暂时不用,节省空间。下载图片不下载 121 | item['Acontent'] = self.downloadAndChangeImgPath(item['Acontent'][0],item['newdate'][0]) # 插入数据库,需要把’变成”,下载失败的就没有本地化 122 | print("正在插入数据库") 123 | if item['Acontent'][0]!="": # 这儿是填充Tcontent 纯文本字段 124 | # 用bs4 125 | item['Acontent']= [self.fillter_Acontent(item['Acontent'][0])] # 先过滤一下Acontent中奇怪的标签。 126 | item['Tcontent'] =[ "".join(BeautifulSoup(item['Acontent'][0], 'lxml').text)] 127 | self.db.insertItem(item) 128 | print("插入成功数据库") 129 | if item['Tcontent'][0]!="" and MAKE_WORDCLOUD_STORE: # 不是空文本的情况下是可以生成词云图的。 130 | # 把url生成唯一的md5作为词云的文件名 131 | # 前台调用只需要用这个方法生成一下md5就行了,也是唯一的值。 前端需要注意这儿! 132 | Gen_WordCloud(Newsid=self.downloadTool.makeMd5(item['url'][0]) , text=item['Tcontent'][0]) 133 | # time.sleep(60) 134 | else: # 没有词云,那就只能用默认的了。 135 | pass 136 | print("为无文本新闻") 137 | print(item['url'][0]) 138 | return item 139 | pass 140 | 141 | 142 | else: 143 | print("判断这个不是管道。") 144 | print(item) 145 | return item 146 | 147 | 148 | 149 | -------------------------------------------------------------------------------- /Crawler/Crawler/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for Crawler project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'CrawlerCrawler' 13 | 14 | SPIDER_MODULES = ['Crawler.spiders'] 15 | NEWSPIDER_MODULE = 'Crawler.spiders' 16 | 17 | #图片存储位置 18 | # IMAGES_STORE = 'D:\Crawler\yuhangyuan' 19 | IMAGES_STORE = "../static/images/" # 相对路径,生成到外面的Crawl项目名字外面,所以crawl放在djongo项目内一层即可 20 | DOWMLOAD_IMG_IN_ACONTENT = False # 这个是自定义的,设定是否进行图片本地化操作。True ,False两个设定 21 | 22 | # 词云的生成和上面图片的相对路径必须这样明显的不同,暂时不解 23 | WORDCLOUD_STORE = "../static/images/WordCloud/" # 相对路径,生成到外面的Crawl项目名字外面 24 | MAKE_WORDCLOUD_STORE = True # 开关生成词云 25 | 26 | #启动图片下载中间件 27 | ITEM_PIPELINES = { 28 | # 'Crawler.pipelines.TextPipeline': 300, 29 | 'Crawler.pipelines.newsPipeline': 300, # 先下载图片,后提取文本的意思 30 | } 31 | # IMAGES_STORE = "file/image" 32 | # IMAGES_URLS_FILED= 'imgurl' # 这个暂时好像没什么用,直接结合自己的图片下载模块进来也是可以的把。 33 | 34 | CRAWL_DELAY_DAY = 1 35 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 36 | #USER_AGENT = 'Crawler (+http://www.yourdomain.com)' 37 | 38 | mysqlInfo = { 39 | "host": '127.0.0.1', 40 | "user": 'root', 41 | "passwd": '123456', 42 | "db": 'newssenti', #改同一个数据库了。 43 | "port": 3306, 44 | "charset": 'utf8' #这个是数据库的配置文件 45 | } 46 | 47 | CRAWLALL_RUN_TIME = "00:01" # 24小时制 48 | 49 | COMMANDS_MODULE = 'Crawler.commands' # 配置爬取所有爬虫命令的。 50 | # Obey robots.txt rules 51 | ROBOTSTXT_OBEY = False 52 | 53 | USER_AGENT_LIST = [ 54 | 'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30', 55 | 'Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0', 56 | 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.2; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)', 57 | 'Opera/9.80 (Windows NT 5.1; U; zh-cn) Presto/2.9.168 Version/11.50', 58 | 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1', 59 | 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)' 60 | ] 61 | 62 | 63 | 64 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 65 | #CONCURRENT_REQUESTS = 32 66 | 67 | # Configure a delay for requests for the same website (default: 0) 68 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 69 | # See also autothrottle settings and docs 70 | DOWNLOAD_DELAY = 1 # 下载延时 71 | # The download delay setting will honor only one of: 72 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 73 | #CONCURRENT_REQUESTS_PER_IP = 16 74 | 75 | # Disable cookies (enabled by default) 76 | #COOKIES_ENABLED = False 77 | 78 | # Disable Telnet Console (enabled by default) 79 | #TELNETCONSOLE_ENABLED = False 80 | 81 | # Override the default request headers: 82 | #DEFAULT_REQUEST_HEADERS = { 83 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 84 | # 'Accept-Language': 'en', 85 | #} 86 | 87 | # Enable or disable spider middlewares 88 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 89 | # SPIDER_MIDDLEWARES = { 90 | # } 91 | 92 | # Enable or disable downloader middlewares 93 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 94 | 
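# Illustrative sketch (an assumption, not part of the project's own code): the mysqlInfo dict defined
# above is presumably read by the DB helper in expand_package/DBcontrol.py, which is not shown here.
# With pymysql, a connection built from it would look roughly like this:
#
#   import pymysql
#   from Crawler.settings import mysqlInfo
#
#   conn = pymysql.connect(host=mysqlInfo['host'], user=mysqlInfo['user'],
#                          password=mysqlInfo['passwd'], database=mysqlInfo['db'],
#                          port=mysqlInfo['port'], charset=mysqlInfo['charset'])
#   with conn.cursor() as cursor:
#       cursor.execute("SELECT VERSION()")   # simple connectivity check
#       print(cursor.fetchone())
#   conn.close()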
DOWNLOADER_MIDDLEWARES = { 95 | 'Crawler.middlewares.ImagespiderDownloaderMiddleware': 543, 96 | 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None, # 默认的是500需要,像这种这样就是可以关掉 97 | } 98 | 99 | # Enable or disable extensions 100 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 101 | #EXTENSIONS = { 102 | # 'scrapy.extensions.telnet.TelnetConsole': None, 103 | #} 104 | 105 | # Configure item pipelines 106 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 107 | #ITEM_PIPELINES = { 108 | # 'Crawler.pipelines.ImagespiderPipeline': 300, 109 | #} 110 | 111 | # Enable and configure the AutoThrottle extension (disabled by default) 112 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 113 | #AUTOTHROTTLE_ENABLED = True 114 | # The initial download delay 115 | #AUTOTHROTTLE_START_DELAY = 5 116 | # The maximum download delay to be set in case of high latencies 117 | #AUTOTHROTTLE_MAX_DELAY = 60 118 | # The average number of requests Scrapy should be sending in parallel to 119 | # each remote server 120 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 121 | # Enable showing throttling stats for every response received: 122 | #AUTOTHROTTLE_DEBUG = False 123 | 124 | # Enable and configure HTTP caching (disabled by default) 125 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 126 | #HTTPCACHE_ENABLED = True 127 | #HTTPCACHE_EXPIRATION_SECS = 0 128 | #HTTPCACHE_DIR = 'httpcache' 129 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 130 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 131 | -------------------------------------------------------------------------------- /Crawler/Crawler/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
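# In this project the package holds the three news spiders: tengxun, wangyi and xinlang.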
5 | -------------------------------------------------------------------------------- /Crawler/Crawler/spiders/spider_expends.py: -------------------------------------------------------------------------------- 1 | import random 2 | from pprint import pprint 3 | 4 | import chardet 5 | import requests 6 | from datetime import date, timedelta 7 | from Crawler.settings import CRAWL_DELAY_DAY 8 | 9 | 10 | class TengxunExpend: 11 | 12 | def returnThemeCode(self, theme): # 这个是有用的,用来组合主题代码url的 13 | ent_Theme = 1537876288634 14 | sport_Theme = 1537877689177 15 | finance_Theme = 1537878365483 16 | tech_Theme = 1537879684280 17 | auto_Theme = 1537887032223 18 | house_Theme = 1537887128904 19 | news_Theme = 1537874915062 20 | if theme == 'news': 21 | return news_Theme 22 | if theme == 'ent': 23 | return ent_Theme 24 | if theme == 'sports': 25 | return sport_Theme 26 | if theme == 'tech': 27 | return tech_Theme 28 | if theme == 'auto': 29 | return auto_Theme 30 | if theme == 'house': 31 | return house_Theme 32 | if theme == 'finance': 33 | return finance_Theme 34 | 35 | def getThemeUrl(self, theme, today, pageNumber): 36 | rawUrl = "http://roll.news.qq.com/interface/cpcroll.php" 37 | rawReferer = '.qq.com/articleList/rolls/' # 'http://news 前面还有这个东西 38 | print(theme) 39 | print(today) 40 | print(pageNumber) 41 | 42 | my_headers = [ 43 | 'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30', 44 | 'Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0', 45 | 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.2; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)', 46 | 'Opera/9.80 (Windows NT 5.1; U; zh-cn) Presto/2.9.168 Version/11.50', 47 | 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1', 48 | 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)'] 49 | headers = {"User-Agent": random.choice(my_headers), 'Referer': 'http://' + theme + rawReferer} # 默认值 50 | rawUrl = rawUrl + "?callback=rollback&mode=1&cata=&_=" + str( 51 | self.returnThemeCode(theme)) + "&site=" + theme + "&page=" + str(pageNumber) + "&date=" + today 52 | print(rawUrl) 53 | try: 54 | rawhtml = requests.get(rawUrl, headers=headers, allow_redirects=False, 55 | timeout=30) # 一般提取文本的话,那就用text,如果是文件就content 56 | rawhtml.encoding = chardet.detect(rawhtml.content)['encoding'] 57 | # print(rawhtml.url) 58 | print("状态码" + str(rawhtml.status_code)) 59 | if rawhtml.status_code == 504: 60 | print(504) 61 | return 62 | print("页面的读取结果为") 63 | # print(rawhtml.text) 64 | if rawhtml.text.find('rollback') == 0: 65 | jsonString = rawhtml.text.split("rollback")[1] # 把js提取出来就可以了 66 | else: 67 | jsonString = rawhtml.text 68 | print(jsonString) 69 | dicData = eval(jsonString) 70 | print(type(jsonString)) 71 | print(jsonString) 72 | # print(dicData['data']['article_info']) 73 | print(len(dicData['data']['article_info'])) 74 | if dicData['data'] == "": 75 | print("超过了最大页数了,跳出了就可以了") 76 | return 77 | urllist = [] 78 | for one in dicData['data']['article_info']: 79 | # print(one['url']) 80 | print(one['url'].replace("\\", "/")) # 还需要检查一下这个和之前的那种野蛮是不是一样的 81 | urllist.append(one['url'].replace("\\", "/")) 82 | return urllist 83 | except Exception as e: 84 | # print(e) 85 | return [] 86 | 87 | def pageUrlMain(self, date=(date.today() + 
timedelta(days=-CRAWL_DELAY_DAY)).strftime("%Y-%m-%d") ): # 写入url进入数据库,并且写入分类 88 | resultUrlDic = {} # 写入数据库使用这个 89 | tempList = [] 90 | themeList = ['news', 'ent', 'tech', 'auto', 'house', 'finance', 'sports'] # 一共有7个主题,其实不止这7个的 91 | for theme in themeList: 92 | print("第一个主题是") 93 | tempDList = [] 94 | for i in range(1, 12): # 一般是10页就很多的了。10页以内 95 | print("第" + str(i) + "页") 96 | responseList = self.getThemeUrl(theme, date, i) 97 | if len(responseList) == 0: 98 | print("最大页数为" + str(i - 1) + "页") 99 | break 100 | else: 101 | tempList = tempList + responseList 102 | tempDList += responseList 103 | resultUrlDic[theme] = tempDList 104 | print(resultUrlDic) 105 | tempList = set(tempList) 106 | count = 0 107 | print("列表的url数量有:" + str(len(tempList))) 108 | for key in resultUrlDic: 109 | count += len(resultUrlDic[key]) 110 | print("url总共有" + str(count)) 111 | 112 | print("这个是PageUrls内的提取到的url") 113 | # pprint(resultUrlDic) 114 | print(len(resultUrlDic)) 115 | 116 | print("这个开始是list类型的结果") 117 | # print(tempList) 118 | 119 | pprint(tempList) 120 | 121 | 122 | # self.dbhelper.saveDicToMysql(resultUrlDic,date,"tengxun") #参数,字典结果集,时间,分类,这儿是不需要写的。 123 | return list(tempList) # 直接这儿去重后 124 | 125 | 126 | class WangyiExpend: # 这个是网易爬虫需要获得新闻页面的拓展的部分,直接构造成start_urls,再来做别的操作。 127 | def getRollUrlList(self,date=(date.today() + timedelta(days=-CRAWL_DELAY_DAY)).strftime("%Y-%m-%d") ): #这个打开会是手机端的东西 #又重写了一遍了这个东西 128 | rollLatest = "http://news.163.com/latest/" #这个就是默认新闻 129 | requestURL ="http://news.163.com/special/0001220O/news_json.js?0.3699326344116929" 130 | 131 | my_headers = [ #这边为了得到直接的手机端的页面代码返回,直接使用手机ua 132 | 'Mozilla/5.0 (Linux; Android 7.1.1; MI 6 Build/NMF26X; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043508 Safari/537.36 MicroMessenger/6.5.13.1100 NetType/WIFI Language/zh_CN', 133 | 'Mozilla/5.0 (Linux; Android 7.1.1; MI 6 Build/NMF26X) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 Mobile Safari/537.36 Maxthon/3047', 134 | # 'Mozilla/5.0 (iPhone 84; CPU iPhone OS 10_3_3 like Mac OS X) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.0 MQQBrowser/7.8.0 Mobile/14G60 Safari/8536.25 MttCustomUA/2 QBWebViewType/1 WKType/1', 135 | 'Mozilla/5.0 (Linux; U; Android 7.0; zh-cn; STF-AL00 Build/HUAWEISTF-AL00) AppleWebKit/537.36 (KHTML, like Gecko)Version/4.0 Chrome/37.0.0.0 MQQBrowser/7.9 Mobile Safari/537.36', 136 | 'Mozilla/5.0 (Linux; U; Android 6.0.1; zh-CN; SM-C7000 Build/MMB29M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/40.0.2214.89 UCBrowser/11.6.2.948 Mobile Safari/537.36', 137 | 'Mozilla/5.0 (Linux; Android 7.0; STF-AL10 Build/HUAWEISTF-AL10; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043508 Safari/537.36 V1_AND_SQ_7.2.0_730_YYB_D QQ/7.2.0.3270 NetType/4G WebP/0.3.0 Pixel/1080'] 138 | 139 | headers = {"User-Agent": random.choice(my_headers), 'Referer': "http://news.163.com/latest/"} # 默认值 140 | 141 | try: 142 | rawhtml = requests.get(requestURL, headers=headers, allow_redirects=False, 143 | timeout=30) # 一般提取文本的话,那就用text,如果是文件就content 144 | rawhtml.encoding = "GBK" ##gbk>gb2312 使用这种方式尚且还有乱码的情况,部分乱码,那就是gbk可以修复 145 | # print(chardet.detect(rawhtml.content)['encoding']) 146 | if rawhtml.status_code == 504: 147 | print(504) 148 | return 149 | # print(rawhtml.url) 150 | print("状态码" + str(rawhtml.status_code)) 151 | # print("页面的读取结果为") 152 | html = rawhtml.text 153 | 154 | result10=[] 155 | if html.find('"news":')!=-1: 156 | rawjsonString = 
html.split('"news":')[1].replace("};","") 157 | jsDic = eval("("+rawjsonString+")") 158 | for i in jsDic: 159 | if len(i)!=0: 160 | for content in i: 161 | if content['p'].split(" ")[0]==date: #这个是今天的 162 | url = content['l'] 163 | if url.find("photoview")==-1: #不是图片的写入这儿 164 | result10.append(content['l']) 165 | else: 166 | pass 167 | 168 | # print("插入了"+str(len(result10))) 169 | print(result10) 170 | # self.saveListToMysql(result10, date) # todo 这儿做了注释,不写入数据库,方便进行测试/ 171 | 172 | return result10 #这个是返回前一天的所有的url链接放在这儿,大概200条以内,又变少了啊 173 | except Exception as e: 174 | print(e) 175 | return #返回为空 176 | if __name__ == '__main__': 177 | # 腾讯的获得新闻列表的模块测试 178 | # tengxun_expend =TengxunExpend() 179 | # tengxun_expend.pageUrlMain() 180 | 181 | # 网易的获得新闻的列表的模块测试 182 | wangyi_expend =WangyiExpend() 183 | print(wangyi_expend.getRollUrlList()) # 默认都是获得昨天的新闻。 -------------------------------------------------------------------------------- /Crawler/Crawler/spiders/tengxu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 这儿改成是腾讯的就可以了。 3 | import traceback 4 | import scrapy 5 | from bs4 import BeautifulSoup 6 | from scrapy.loader import ItemLoader 7 | import time 8 | from Crawler.spiders.spider_expends import TengxunExpend 9 | from ..items import News 10 | 11 | 12 | class TengxunSpider(scrapy.Spider) : # 这边是没什么问题的了 13 | name = 'tengxun' 14 | allowed_domains = ["qq.com"] 15 | start_urls = [ 16 | # 'http://roll.news.qq.com/' 17 | # 'https://news.qq.com/a/20190513/007759.htm', 测试用个案网页。 18 | # 'https://news.qq.com/a/20190512/004148.htm', 19 | # 'https://news.qq.com/a/20190514/000037.htm', 20 | # 'https://news.qq.com/a/20190513/005746.htm' 21 | ] 22 | 23 | count = 1 24 | 25 | def close(spider, reason): 26 | print("腾讯的爬虫爬完了。") 27 | # 这儿重写一下,我只写页面的具体内容的解析就可以了。 28 | def start_requests(self): 29 | tengxun_expend = TengxunExpend() 30 | self.start_urls = tengxun_expend.pageUrlMain() # 测试暂时改了 31 | for url in self.start_urls: 32 | print() 33 | print(url) 34 | yield scrapy.Request(url, dont_filter=False) 35 | # # 这里重写爬虫入口方法,将dont_filter设置为false 36 | # # 是为了让起始url放入srcapy.Request请求url池中,对起始url也做去重处理 37 | # # 一次是分页数据里检索到的第一页 38 | 39 | 40 | def parse(self, response): # 每一页的都在这儿了。 41 | main = response.xpath("//*[@class='Cnt-Main-Article-QQ']")[0] 42 | print(main) # xpath object 43 | title, Hcontent, Tcontent, Acontent = "", "", "", "" # 最后一个参数好像没什么用 44 | try: 45 | title = response.xpath("//head/title/text()").extract_first() 46 | 47 | newdate = response.xpath("//span[@class='a_time']/text()").extract_first().split(" ")[0] 48 | lenP = main.xpath("p") 49 | print(len(lenP)) 50 | if len(lenP) > 2: # 为2的好像是纯视频的,还有一个文字描述的这种。 51 | Hcontent = lenP[0].extract() 52 | 53 | for p in main.xpath("p"): 54 | simpleP = p.extract() 55 | Acontent += simpleP 56 | 57 | # Tcontent = "".join(BeautifulSoup(Acontent, 'lxml').text) 58 | # print(title) 59 | # print() 60 | # print(Acontent) 61 | # print() 62 | # print(Tcontent) 63 | # print() 64 | # print(Hcontent) 65 | # print() 66 | newsloader = ItemLoader(item=News(), response=response) # 但是使用这种方法插入进去的都会是list。 67 | newsloader.add_value('title', title) 68 | newsloader.add_value('Acontent', Acontent) 69 | # newsloader.add_value('Tcontent', Tcontent) # 统一管道进行处理 70 | newsloader.add_value('Hcontent', Hcontent) 71 | newsloader.add_value('url', response.url) 72 | newsloader.add_value('urlState', "True") 73 | newsloader.add_value('fromWhere', "tengxun") 74 | newsloader.add_value("newdate",newdate) 75 | 76 | yield newsloader.load_item() 
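# the yielded item is handled by newsPipeline, which strips style/script tags from Acontent,
# derives the plain-text Tcontent field and writes the record to MySQL (see pipelines.py)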
77 | print(newsloader.load_item()) 78 | # time.sleep(180) 79 | 80 | else: 81 | print("这个为纯视频的新闻,无文本,正在跳过。") 82 | 83 | except Exception as e: 84 | print(e) 85 | traceback.print_exc() # 貌似这个,一个错 86 | 87 | 88 | -------------------------------------------------------------------------------- /Crawler/Crawler/spiders/wangyi.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 这儿改成是腾讯的就可以了。 3 | import traceback 4 | from datetime import timedelta,date 5 | import scrapy 6 | import time 7 | from bs4 import BeautifulSoup 8 | from scrapy.loader import ItemLoader 9 | 10 | from Crawler.items import News 11 | from Crawler.settings import CRAWL_DELAY_DAY 12 | from Crawler.spiders.spider_expends import WangyiExpend 13 | 14 | 15 | class WangyiSpider(scrapy.Spider) : # 这边是没什么问题的了 16 | name = 'wangyi' 17 | allowed_domains = ["163.com"] # 18 | start_urls = [ 19 | # 'https://news.163.com/19/0514/04/EF4400KC0001899N.html', 20 | # 'https://news.163.com/19/0514/06/EF49KV8A00018AOR.html' 21 | 22 | ] 23 | 24 | count = 1 25 | 26 | def close(spider, reason): 27 | print("网易的爬虫爬完了。") 28 | # 这儿重写一下,我只写页面的具体内容的解析就可以了。 29 | 30 | def start_requests(self): 31 | wangyi_expend = WangyiExpend() 32 | self.start_urls = wangyi_expend.getRollUrlList() # 默认都是获得昨天的新闻。 33 | for url in self.start_urls: 34 | # print() 35 | # print(url) 36 | yield scrapy.Request(url, dont_filter=False) 37 | 38 | 39 | 40 | def parse(self, response): # 每一页的都在这儿了。 41 | throwSrcPart = (date.today() + timedelta(days=-CRAWL_DELAY_DAY)).strftime("%Y/%m/%d") # settings里面有 42 | print(throwSrcPart) 43 | 44 | title, Hcontent, Tcontent, Acontent = "", "", "", "" # 最后一个参数好像没什么用 45 | try: 46 | title = response.xpath("//head/title/text()").extract_first() 47 | mainP = response.xpath("//div[@class='post_text']")[0] 48 | # print(mainP.extract()) 49 | for p in mainP.xpath("p"): 50 | pp = p.xpath("img/@src").extract() 51 | # print(p) 52 | if len(pp) !=0 : # 找到有图片 53 | # print("找到图片") 54 | # print(pp[0]) 55 | if pp[0].find(throwSrcPart)!=-1: 56 | print(pp[0]) 57 | print("丢弃这个p") 58 | else: 59 | Acontent += p.extract() 60 | 61 | else: 62 | Acontent += p.extract() 63 | 64 | # time.sleep(60) 65 | # print(Acontent) 66 | lastDayDate = (date.today() + timedelta(days=-CRAWL_DELAY_DAY)).strftime("%Y-%m-%d") # settings里面有 67 | tempAcontent = BeautifulSoup(Acontent, 'lxml') 68 | # Tcontent = "".join(tempAcontent.text) 69 | 70 | lenP = tempAcontent.find_all("p") 71 | print(len(lenP)) 72 | if len(lenP) > 2: # 为2的好像是纯视频的,还有一个文字描述的这种。 73 | Hcontent = str(lenP[0]) 74 | print("Hcontent") 75 | print(Hcontent.replace(r'\n',"")) 76 | 77 | 78 | # print(title) 79 | # print() 80 | # print(Acontent) 81 | # print() 82 | # print(Tcontent) 83 | # print() 84 | # print(Hcontent) 85 | # print() 86 | newsloader = ItemLoader(item=News(), response=response) # 但是使用这种方法插入进去的都会是list。 87 | newsloader.add_value('title', title) 88 | newsloader.add_value('Acontent', Acontent) 89 | # newsloader.add_value('Tcontent', Tcontent) # 这个字段统一给管道进行处理 90 | newsloader.add_value('Hcontent', Hcontent) 91 | newsloader.add_value('url', response.url) 92 | newsloader.add_value('urlState', "True") 93 | newsloader.add_value('fromWhere', "wangyi") 94 | newsloader.add_value("newdate",lastDayDate) 95 | 96 | yield newsloader.load_item() 97 | print(newsloader.load_item()) 98 | # time.sleep(180) 99 | 100 | # else: 101 | # print("这个为纯视频的新闻,无文本,正在跳过。") 102 | 103 | except Exception as e: 104 | print(e) 105 | traceback.print_exc() # 貌似这个,一个错 106 | 107 | 108 | 
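# A hedged sketch (not in the original spider): as an alternative to the project's custom
# `scrapy crawlall` command and the *Main.py entry points, the three news spiders can also be
# driven from a single process with Scrapy's CrawlerProcess:
if __name__ == "__main__":
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())
    for spider_name in ("tengxun", "wangyi", "xinlang"):  # each spider's `name` attribute
        process.crawl(spider_name)
    process.start()  # blocks until all scheduled crawls have finished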
-------------------------------------------------------------------------------- /Crawler/Crawler/spiders/xinlang.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # todo 会卡住,这个问题怎么解决 3 | import json 4 | from datetime import timedelta,date 5 | import pysnooper # debug 用的包 6 | import scrapy 7 | from bs4 import BeautifulSoup 8 | from scrapy.loader import ItemLoader 9 | import time 10 | from Crawler.settings import CRAWL_DELAY_DAY 11 | from ..items import News 12 | 13 | 14 | class XinlangSpider(scrapy.Spider) : 15 | name = 'xinlang' 16 | # 爬取的域名,不会超出这个顶级域名 17 | allowed_domains = ['sina.com'] # 可以设置成不过滤吗。 18 | start_urls = [ 19 | ] 20 | 21 | count = 1 22 | # {}占位符,用于字符串替换,将获取到的/text/page/1格式内容替换成完整url 这个是新浪新闻的。滚动新闻的页面 23 | host_url = 'https://feed.mix.sina.com.cn/api/roll/get?pageid=153&lid=2509&k=&num=50&page={}' 24 | 25 | def close(spider, reason): 26 | print("网易的爬虫爬完了。") #发邮件之类的。 27 | 28 | 29 | 30 | def start_requests(self): 31 | for num in range(1,100): # 这儿是看爬取多少页的。一般56*100=5600 32 | print(self.host_url.format(num)) 33 | self.start_urls.append(self.host_url.format(num)) 34 | for url in self.start_urls: 35 | yield scrapy.Request(url, dont_filter=False) 36 | # # 这里重写爬虫入口方法,将dont_filter设置为false 37 | # # 是为了让起始url放入srcapy.Request请求url池中,对起始url也做去重处理 38 | # # 否则会爬取到两次 https://www.qiushibaike.com/text/,一次是起始url 39 | # # 一次是分页数据里检索到的第一页 40 | def parse(self, response): 41 | # itemloader 42 | ''' 43 | 这儿只取昨天的。 这儿是把json中每一页的url提取出来,有两层的深度。 44 | url = scrapy.Field() 45 | urlState = scrapy.Field() 46 | title = scrapy.Field() 47 | Hcontent = scrapy.Field() 48 | Tcontent = scrapy.Field() 49 | Acontent = scrapy.Field() 50 | newdate = scrapy.Field() 51 | fromWhere = scrapy.Field() 52 | :param response: 53 | :return: 54 | ''' 55 | allDic = json.loads(response.body) 56 | # print(allDic) 57 | print(type(allDic)) 58 | for one in allDic['result']['data']: 59 | itemloader = ItemLoader(item=News(), response=response ) 60 | timeStamp = one['intime'] 61 | timeArray = time.localtime(int(timeStamp)) 62 | newsDatetemp = time.strftime("%Y-%m-%d %H:%M:%S", timeArray) 63 | newsDate = newsDatetemp.split(" ")[0] 64 | print(newsDate) 65 | 66 | url = "" 67 | if "url" in one: 68 | # print("有url的") 69 | url = one["url"] 70 | pass # 有就直接提取这个 71 | elif "urls" in one: 72 | print("没有url") 73 | tempUrl = one["urls"][0] 74 | url = tempUrl.replace("\/","/") 75 | 76 | print() 77 | # 添加进item 78 | lastDayDate = (date.today() + timedelta(days=-CRAWL_DELAY_DAY)).strftime("%Y-%m-%d") # settings里面有 79 | if newsDate ==lastDayDate: # 只取出昨天的新闻。特指只选择昨天的新闻,这样才对把 80 | itemloader.add_value('url',url) # 这儿我发现了,有些是没有这个字段的 81 | itemloader.add_value('title', one['title']) 82 | itemloader.add_value('newdate', newsDate) 83 | resultItem = itemloader.load_item() # item 也是可以传过去的,传过去继续填充。 84 | yield scrapy.Request(url=resultItem['url'][0],callback=self.newsContent,dont_filter=True,meta={"lastItem":resultItem}) 85 | else: 86 | print("不是昨天的新闻,正在选择性跳过") 87 | 88 | 89 | # 这边是解析详情页的部分。 90 | @pysnooper.snoop() #这样就可以debug了 91 | def newsContent(self,response): 92 | title, Hcontent, Tcontent, Acontent = "", "", "", "" # 最后一个参数好像没什么用 93 | lastItem = response.meta.get("lastItem",None) # 这样就可以避免不行的。 94 | 95 | # 这边这个开始是划分句子,用html代码就可以,为了提取首段 96 | contentlist = [] 97 | for allp in response.xpath("//div[@class='article']"): # //div[@class='article'] ,要取这下面的所有的文本对吧 98 | for p in allp.xpath("p"): 99 | print(p.extract()) 100 | contentlist.append(p.extract()) 101 | # 
contentlist.append(p.xpath("string(.)").extract_first().strip()) # 换用这种后呢,会不会就不会再发生那种事情了。 102 | print() 103 | print("全文中句子的数量有那么多{}".format(len(contentlist))) 104 | print(contentlist) 105 | if len(contentlist) > 0: # 是否是没有纯文本的新闻的处理写在管道里面就好了。 106 | print(contentlist[0]) # 取第一个作为首段的东西 107 | Hcontent = contentlist[0] 108 | 109 | # print("新闻的正文内容在这里。") 110 | Acontent = response.xpath("//div[@class='article']").extract_first() # 这个就是str 111 | # Tcontent = "".join(BeautifulSoup(Acontent, 'lxml').text) 112 | if Tcontent=="": 113 | print(Tcontent) 114 | print(Acontent) 115 | print("可能为图片新闻") 116 | print(response.url) 117 | # time.sleep(10) 118 | 119 | newsloader = ItemLoader(item=News(), response=response) # 但是使用这种方法插入进去的都会是list。 120 | newsloader.add_value('title', lastItem['title'][0]) 121 | newsloader.add_value('Acontent', Acontent) 122 | # newsloader.add_value('Tcontent', Tcontent) # 统一有管道进行处理 123 | newsloader.add_value('Hcontent', Hcontent) 124 | newsloader.add_value('url', response.url) 125 | newsloader.add_value('urlState', "True") 126 | newsloader.add_value('fromWhere', "xinlang") 127 | newsloader.add_value("newdate", lastItem['newdate'][0]) 128 | 129 | 130 | yield newsloader.load_item() # 这个扔给管道就可以了。 131 | print(newsloader.load_item()) 132 | # time.sleep(60) 133 | 134 | 135 | 136 | -------------------------------------------------------------------------------- /Crawler/TengxunMain.py: -------------------------------------------------------------------------------- 1 | from scrapy import cmdline 2 | 3 | 4 | if __name__ == '__main__': 5 | cmdline.execute("scrapy crawl tengxun".split()) 6 | -------------------------------------------------------------------------------- /Crawler/TogetherCrawl.py: -------------------------------------------------------------------------------- 1 | from scrapy import cmdline 2 | # 同时启动所有的爬虫进行爬取工作。 3 | 4 | if __name__ == '__main__': 5 | cmdline.execute("scrapy crawlall".split()) 6 | -------------------------------------------------------------------------------- /Crawler/WangyiMain.py: -------------------------------------------------------------------------------- 1 | from scrapy import cmdline 2 | 3 | 4 | if __name__ == '__main__': 5 | cmdline.execute("scrapy crawl wangyi".split()) 6 | -------------------------------------------------------------------------------- /Crawler/XinlangMain.py: -------------------------------------------------------------------------------- 1 | from scrapy import cmdline 2 | 3 | 4 | if __name__ == '__main__': 5 | cmdline.execute("scrapy crawl xinlang".split()) 6 | 7 | # print("哈哈哈") 8 | 9 | # todo 新浪的,不知道为什么会提取出当天的,我只要昨天的,这样比较整齐。 10 | # todo 明天把分类到那六个表还有把评论提取到剩下的那六个表的操作做完,然后再合并起来。 11 | # todo 统一在管道进行过滤处理把,爬虫内是可以不处理的。然后管道内的那儿用bs4去掉style的这种/script这种也是。 -------------------------------------------------------------------------------- /Crawler/desktop.ini: -------------------------------------------------------------------------------- 1 | [.ShellClassInfo] 2 | IconResource=C:\WINDOWS\System32\SHELL32.dll,27 3 | [ViewState] 4 | Mode= 5 | Vid= 6 | FolderType=Generic 7 | -------------------------------------------------------------------------------- /Crawler/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = Crawler.settings 8 | 9 | [deploy] 10 | url = http://localhost:8088/ 11 | project = 
Crawler 12 | username = demo 13 | password = 123456 14 | 15 | -------------------------------------------------------------------------------- /Crawler/setup.py: -------------------------------------------------------------------------------- 1 | # Automatically created by: shub deploy 2 | 3 | from setuptools import setup, find_packages 4 | 5 | setup( 6 | name = 'project', 7 | version = '1.0', 8 | packages = find_packages(), 9 | entry_points = {'scrapy': ['settings = Crawler.settings']}, 10 | ) 11 | -------------------------------------------------------------------------------- /Crawler/togetherCrawl_scheduling.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -* 2 | 3 | import datetime 4 | import multiprocessing 5 | import os 6 | import schedule 7 | import time 8 | from scrapy import cmdline 9 | # 同时启动所有的爬虫进行爬取工作。 10 | from Crawler.settings import CRAWLALL_RUN_TIME 11 | 12 | 13 | def worker_1(interval): 14 | print ("开始所有爬虫工作") 15 | cmdline.execute("scrapy crawlall".split()) 16 | 17 | 18 | 19 | 20 | class AutoRunAtTime: #这儿只是一个线程的 21 | def job(self,name): #这个是主线程把 22 | print("正在爬取今天的新闻内容") 23 | print('这里是进程: %sd 父进程ID:%s' % (os.getpid(), os.getppid())) 24 | p1 = multiprocessing.Process(target=worker_1, args=(6,)) 25 | # p3 = multiprocessing.Process(target=worker_3, args=(4,)) 26 | 27 | p1.daemon = True 28 | # p2.daemon = True 29 | 30 | p1.start() 31 | # p2.start() 32 | # p3.start() 33 | print("The number of CPU is:" + str(multiprocessing.cpu_count())) 34 | for p in multiprocessing.active_children(): 35 | print("child p.name:" + p.name + "\tp.id" + str(p.pid)) 36 | 37 | p1.join() 38 | # p2.join() 39 | 40 | 41 | def startAutoRun(self,timeSet): #24小时制的时间输入,传入一个时间的字符串 42 | name = "scrapy_news" 43 | schedule.every().day.at(timeSet).do(self.job, name) # 应该也是24小时制的,记得 “输入24小时制的时间字符串 44 | while True: 45 | schedule.run_pending() 46 | # print("等待下一次...") 47 | time.sleep(1) 48 | 49 | 50 | if __name__=="__main__": 51 | autoRun = AutoRunAtTime() 52 | print(time.strftime('%Y.%m.%d', time.localtime(time.time()))) 53 | print("现在的时间是") 54 | print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')) 55 | autoRun.startAutoRun(CRAWLALL_RUN_TIME) #测试直接这儿写运行时间比较方便 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # newsSpider scrapy 2 | 故名思意,这个就是一个新闻的爬虫,目前这个项目是用来练习scrapy框架的爬虫的地方。目前这个项目是一边学习一边的进行更新的,现在是用来爬取新浪新闻的, 如果也对你有帮助可以小小star鼓励一下哦😝 3 | 4 | ## xinlanggundong (练习时的部分项目 🙃 下面的才是完整版本) 5 | 这儿是新浪新闻滚动新闻的测试项目部分 6 | 7 | ## Crawler (比较完整 😀 ) 8 | 比较完整的,爬取 腾讯 、网易 、新浪 新闻的scrap爬虫项目。 9 | ### 功能: 10 | + 服务器上运行可以设定定时爬取 11 | + 管道部分写的是自定义写入自己定义的mysql的样式,具体使用可以在settings中启用或者停用或者自定义 12 | + 里面是spiders/文件夹下三个平台的爬虫可以一同进行爬取工作, 13 | + 对腾讯、网易、新浪新闻中的正文和图片进行本地化爬取。 14 | + 并且管道中有对每篇新闻生成对应的词云图片 15 | 16 | ### 使用方法: 17 | + 1.安装scrapy 环境,建议conda配置 18 | + 2.git clone https://github.com/realzhengyiming/newsSpier_scrapy.git 19 | + 3.```cd Crawler``` 20 | + 4.```python Together_Crawl.py``` 一次性跑三个爬虫,包括腾讯、网易、新浪 21 | + 5.```python togetherCrawl_scheduling.py``` 一次性跑三个爬虫,包括腾讯、网易、新浪(定时,时间设置先在settings.py中设置) 22 | + 设置settings.py 中``` CRAWLALL_RUN_TIME="XX:XX" 24小时制 ``` 23 | + 如果是linux上定时跑,可以 ```nohup python togetherCrawl_scheduling.py``` 24 | + 6. ```python tengxunMain.py``` 只爬取腾讯爬虫的部分 25 | + 7. ```python wangyiMain.py``` 只爬取网易新闻爬虫的部分 26 | + 8.```python xinlangMain.py``` 只爬取新浪爬虫的部分 27 | + 9. 
重写了命令,可以直接scrapy crawlall 进行三个爬虫的同时爬取(同理默认scrapy crawl tengxun 这样也是可以的) 28 | 29 | ### 更多设置 30 | + 6.新闻中的图片需要下载请在settings.py 中 设置,如``` IMAGES_STORE = "../static/images/" ```(此处使用相对路径) 31 | + ```DOWMLOAD_IMG_IN_ACONTENT = False ``` 开启或者关闭把新闻中的图片本地化操作。 32 | + 7.开关词云的生成,settings.py 中设置 33 | + ``` MAKE_WORDCLOUD_STORE = True ``` 默认开启词云 34 | + ``` WORDCLOUD_STORE = "../static/images/WordCloud/" ``` 设置词云的生成地址,默认是相对路径,项目外同级目录 35 | 36 | ### 注意🎃 37 | 因为我这个项目是另一个完整项目的一部分, 另一个完整项目是django+scrapy 的新闻的情感分析平台, 这是scrapy用来做数据爬取入库操作的。 38 | 39 | 所以这儿的管道做的操作比较多,除了本地化图片和新闻正文,还有生成词云,甚至还有调用简单词频的方法进行情感分析的操作后才写入数据库的管道做的操作比较多, 40 | 除了本地化图片和新闻正文,还有生成词云,所以使用的时候可以根据这个来改,请自定义去掉不需要的功能。 41 | 42 | 43 | 44 | # todo 45 | 练习中待做的事情。 46 | + 设置UA (👌) 47 | + 代理,类似ua (👌) 48 | + 设置请求延迟 (👌) 49 | + settings 中 DOWNLOAD_DELAY (👌) 50 | + 重写请求,使用 selenuim + chrome 无头模式组合来使用做动态爬取 (👌) 51 | + 提取下一页后继续爬取 (👌) 52 | + 可以爬取新浪新闻滚动页面了(默认设置成爬取前一天的,目前只能纯文本) (👌) 53 | + 可以爬取新浪新闻滚动页面了(默认设置成爬取前一天的,结合图片和纯文本) (👌) 54 | + 使用chrome+ selenuim 的时候下载图片前的设置referer 55 | + 如果上面那条不太好用,也可以考虑 使用 requests-html 这个比较新的可以解析动态的库来进行合并。 56 | + 自定义下载媒体的图片 (👌) 57 | + 链接把数据写入mongdb 或者 别的数据库mysql (👌) 58 | 59 | 60 | 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /xinlanggundong/.idea/deployment.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /xinlanggundong/.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /xinlanggundong/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /xinlanggundong/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /xinlanggundong/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /xinlanggundong/.idea/workspace.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 14 | 15 | 16 | 17 | 18 | 19 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 
| 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 200 | 201 | 202 | 203 | de 204 | robot 205 | 50 206 | 看他的朋友圈就懂了 207 | count 208 | 提取url中 209 | 210 | 211 | 212 | 214 | 215 | 227 | 228 | 229 | 230 | 231 | true 232 | DEFINITION_ORDER 233 | 234 | 235 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 |