├── CnkiSpider ├── __init__.py ├── settings │ ├── __init__.py │ ├── settings.py │ └── settings_distribute.py ├── spiders │ ├── __init__.py │ ├── patent.py │ └── paperAchSpider.py ├── pipelines.py ├── items.py ├── proxy.py ├── middlewares.py ├── file_util.py ├── customDownloadMiddlewares.py ├── statusManager.py └── commonUtils.py ├── dataSrc ├── codeTest.txt ├── codeSrc.txt ├── code.txt └── codeBak.txt ├── cnki.ico ├── cnki.png ├── runPatent.py ├── config.cfg ├── runPaperAch.py ├── main.py ├── 重要命令.txt ├── .gitignore ├── init.sh ├── sql ├── errorCode.sql ├── errorLink.sql └── status.sql ├── scrapy.cfg ├── analyse.spec ├── CnkiSpiderExec.spec ├── analyse.py ├── README.md └── CnkiSpiderExec.py /CnkiSpider/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataSrc/codeTest.txt: -------------------------------------------------------------------------------- 1 | C042_9 -------------------------------------------------------------------------------- /CnkiSpider/settings/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cnki.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aFlyBird0/CnkiSpider/HEAD/cnki.ico -------------------------------------------------------------------------------- /cnki.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aFlyBird0/CnkiSpider/HEAD/cnki.png -------------------------------------------------------------------------------- /runPatent.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | if __name__ == '__main__': 4 | os.system("scrapy crawl patent") -------------------------------------------------------------------------------- /config.cfg: -------------------------------------------------------------------------------- 1 | [spider] 2 | type=patent 3 | start=2020-07-01 4 | end=2020-07-31 5 | [database] 6 | no=12 -------------------------------------------------------------------------------- /runPaperAch.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | if __name__ == '__main__': 4 | os.system("scrapy crawl paperAch") -------------------------------------------------------------------------------- /CnkiSpider/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
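# The two spiders shipped with this project live in this package:
#   patent.py          -> the `patent` spider   (run with `scrapy crawl patent`,   see runPatent.py)
#   paperAchSpider.py  -> the `paperAch` spider (run with `scrapy crawl paperAch`, see runPaperAch.py)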
5 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from scrapy import cmdline 2 | 3 | if __name__ == '__main__': 4 | # cmdline.execute("scrapy crawl patent".split()) 5 | # cmdline.execute("scrapy crawl patent -s JOBDIR=jobs/patent-1".split()) 6 | cmdline.execute("scrapy crawl paperAch".split()) -------------------------------------------------------------------------------- /重要命令.txt: -------------------------------------------------------------------------------- 1 | nohup python3 -u runPatent.py > runPatent.log 2>&1 & 2 | nohup python3 -u runPaperAch.py > runPaperAch.log 2>&1 & 3 | 4 | spec数据文件修改 5 | datas=[('dataSrc','dataSrc'), ('./scrapy.cfg', '.'), ('./config.cfg', '.'), ('CnkiSpider/spiders', 'CnkiSpider/spiders'), ('log','log')], 6 | 打包完成记得在dis/CnkiSpiderExec/目录下建立 log 文件夹 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | error/ 2 | result/ 3 | target/ 4 | author/ 5 | html/ 6 | author_output/ 7 | 8 | *.csv 9 | 10 | # Log files 11 | *.log 12 | 13 | # Editor directories and files 14 | .idea 15 | 16 | # scrapyd 打包文件 17 | *.egg 18 | # scrapyd和pyinstaller打包文件 19 | build/ 20 | 21 | # pyinstaller打包文件 22 | dist/ 23 | 24 | **/__pycache__ 25 | -------------------------------------------------------------------------------- /init.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # 生成必要的target,result等文件夹 3 | if [ ! -d "target" ];then 4 | mkdir "target" 5 | fi 6 | 7 | if [ ! -d "result" ];then 8 | mkdir "result" 9 | fi 10 | 11 | if [ ! -d "html" ];then 12 | mkdir "html" 13 | fi 14 | 15 | if [ ! -d "log" ];then 16 | mkdir "log" 17 | fi 18 | 19 | if [ ! -d "error" ];then 20 | mkdir "error" 21 | fi 22 | 23 | if [ ! -d "author" ];then 24 | mkdir "author" 25 | fi 26 | 27 | if [ ! 
-d "author_output" ];then 28 | mkdir "author_output" 29 | fi -------------------------------------------------------------------------------- /sql/errorCode.sql: -------------------------------------------------------------------------------- 1 | SET NAMES utf8mb4; 2 | SET FOREIGN_KEY_CHECKS = 0; 3 | 4 | -- ---------------------------- 5 | -- Table structure for status 6 | -- ---------------------------- 7 | -- DROP TABLE IF EXISTS `errorCode`; 8 | CREATE TABLE If Not Exists `errorCode` ( 9 | `id` INT UNSIGNED AUTO_INCREMENT, 10 | `type` varchar(255) COMMENT '文献类型,用于区分专利patent和(期刊、博硕、成果)的链接获取', 11 | `code` varchar(255) COMMENT '学科分类', 12 | `date` varchar(255) NOT NULL COMMENT '日期', 13 | PRIMARY KEY(`id`), 14 | unique index(`type`, `code`, `date`) 15 | ) ENGINE = InnoDB -------------------------------------------------------------------------------- /sql/errorLink.sql: -------------------------------------------------------------------------------- 1 | SET NAMES utf8mb4; 2 | SET FOREIGN_KEY_CHECKS = 0; 3 | 4 | -- ---------------------------- 5 | -- Table structure for status 6 | -- ---------------------------- 7 | -- DROP TABLE IF EXISTS `errorLink`; 8 | CREATE TABLE If Not Exists `errorLink` ( 9 | `id` INT UNSIGNED AUTO_INCREMENT, 10 | `type` varchar(255) COMMENT '文献类型,用于区分专利patent和(期刊、博硕、成果)的链接获取', 11 | `code` varchar(255) COMMENT '学科分类', 12 | `Link` varchar(255) NOT NULL COMMENT '链接', 13 | `date` varchar(255) NOT NULL COMMENT '日期', 14 | PRIMARY KEY(`id`) 15 | ) ENGINE = InnoDB -------------------------------------------------------------------------------- /sql/status.sql: -------------------------------------------------------------------------------- 1 | SET NAMES utf8mb4; 2 | SET FOREIGN_KEY_CHECKS = 0; 3 | 4 | -- ---------------------------- 5 | -- Table structure for status 6 | -- ---------------------------- 7 | -- DROP TABLE IF EXISTS `status`; 8 | CREATE TABLE If Not Exists `status` ( 9 | -- `id` INT UNSIGNED AUTO_INCREMENT, 10 | `type` varchar(255) COMMENT '爬虫类型,用于区分专利patent和(期刊、博硕、成果)paperAndAch的链接获取', 11 | `curCode` varchar(255) COMMENT '目前正在爬(链接获取)的学科分类', 12 | `curDate` varchar(255) NOT NULL COMMENT '目前正在爬(链接获取)的日期', 13 | `endDate` varchar(255) NOT NULL COMMENT '终止日期(包含)', 14 | `status` varchar(255) COMMENT '爬虫状态', 15 | PRIMARY KEY(`type`) 16 | ) ENGINE = InnoDB 17 | -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | # update this "default" item when you want to change the env 8 | default = CnkiSpider.settings.settings_distribute 9 | 10 | product = CnkiSpider.settings.settings 11 | develop = CnkiSpider.settings.dev_settings 12 | distribute = CnkiSpider.settings.settings_distribute 13 | distribute142out = CnkiSpider.settings.settings_distribute142out 14 | 15 | [deploy: cnki] 16 | url = http://localhost:6800/ 17 | ;url = http://10.1.13.143:6800/ 18 | project = CnkiSpider 19 | -------------------------------------------------------------------------------- /analyse.spec: -------------------------------------------------------------------------------- 1 | # -*- mode: python ; coding: utf-8 -*- 2 | 3 | 4 | block_cipher = None 5 | 6 | 7 | a = Analysis(['analyse.py'], 8 | pathex=['D:\\Desktop\\labProject\\CnkiSpider'], 9 | binaries=[], 10 | 
datas=[('dataSrc','dataSrc'), ('./scrapy.cfg', '.'), ('./config.cfg', '.'), ('CnkiSpider/spiders', 'CnkiSpider/spiders'), ('log','log')], 11 | hiddenimports=[], 12 | hookspath=[], 13 | runtime_hooks=[], 14 | excludes=[], 15 | win_no_prefer_redirects=False, 16 | win_private_assemblies=False, 17 | cipher=block_cipher, 18 | noarchive=False) 19 | pyz = PYZ(a.pure, a.zipped_data, 20 | cipher=block_cipher) 21 | exe = EXE(pyz, 22 | a.scripts, 23 | a.binaries, 24 | a.zipfiles, 25 | a.datas, 26 | [], 27 | name='analyse', 28 | debug=False, 29 | bootloader_ignore_signals=False, 30 | strip=False, 31 | upx=True, 32 | upx_exclude=[], 33 | runtime_tmpdir=None, 34 | console=False , icon='cnki.ico') 35 | -------------------------------------------------------------------------------- /CnkiSpiderExec.spec: -------------------------------------------------------------------------------- 1 | # -*- mode: python ; coding: utf-8 -*- 2 | 3 | 4 | block_cipher = None 5 | 6 | 7 | a = Analysis(['CnkiSpiderExec.py'], 8 | pathex=['D:\\Desktop\\labProject\\CnkiSpider'], 9 | binaries=[], 10 | datas=[('dataSrc','dataSrc'), ('./scrapy.cfg', '.'), ('./config.cfg', '.'), ('CnkiSpider/spiders', 'CnkiSpider/spiders'), ('log','log')], 11 | hiddenimports=[], 12 | hookspath=[], 13 | runtime_hooks=[], 14 | excludes=[], 15 | win_no_prefer_redirects=False, 16 | win_private_assemblies=False, 17 | cipher=block_cipher, 18 | noarchive=False) 19 | pyz = PYZ(a.pure, a.zipped_data, 20 | cipher=block_cipher) 21 | exe = EXE(pyz, 22 | a.scripts, 23 | [], 24 | exclude_binaries=True, 25 | name='CnkiSpiderExec', 26 | debug=False, 27 | bootloader_ignore_signals=False, 28 | strip=False, 29 | upx=True, 30 | console=False , icon='cnki.ico') 31 | coll = COLLECT(exe, 32 | a.binaries, 33 | a.zipfiles, 34 | a.datas, 35 | strip=False, 36 | upx=True, 37 | upx_exclude=[], 38 | name='CnkiSpiderExec') 39 | -------------------------------------------------------------------------------- /analyse.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | class ItemAnalyse: 4 | ''' 5 | 此文件可单独用 pyinstaller 打包成 exe 6 | 放在分发模式的根目录下,运行,计算爬取的文献数量,用来计算抓取率/数据丢失率 7 | 根据之前的测试,分发模式下,每个任务运行两次,数据丢失率在 0.03% 还是 0.003% 左右(太久远了忘了) 8 | ''' 9 | 10 | resultPath = "./result" 11 | 12 | @classmethod 13 | def getAllItemNums(cls): 14 | # result文件夹层级如下:./result/年份/类型 15 | sum = 0 16 | for yearDir in os.listdir(cls.resultPath): 17 | for typeDir in os.listdir(cls.resultPath + '/' + yearDir): 18 | sum += cls.getPathItemNums(cls.resultPath + '/' + yearDir + '/' + typeDir) 19 | return sum 20 | 21 | @classmethod 22 | def getPathItemNums(cls, path:str): 23 | sum = 0 24 | for file in os.listdir(path): 25 | sum += cls.getOneFileItemNums(path.rstrip('/') + '/' + file) 26 | return sum 27 | 28 | 29 | @classmethod 30 | def getOneFileItemNums(cls, pathFilename: str): 31 | with open(pathFilename, mode="r", encoding='utf-8') as f: 32 | lines = f.readlines() 33 | return len(lines) - 1 34 | 35 | @classmethod 36 | def getCodeSrcNewList(cls, filename): 37 | with open(filename, mode="r", encoding='utf-8') as f: 38 | src = f.readline() 39 | return src.split(',') 40 | 41 | @classmethod 42 | def writeNewCodeSrc(cls, filename, codeSrcList): 43 | with open(filename, mode="a", encoding='utf-8') as f: 44 | for code in codeSrcList: 45 | f.write(code + "\n") 46 | 47 | if __name__ == '__main__': 48 | # num = ItemAnalyse.getPathItemNums("./result/2020/boshuo") 49 | # print(num) 50 | # codeSrcList = ItemAnalyse.getCodeSrcNewList("./dataSrc/codeSrc.txt") 51 | # 
print(len(codeSrcList)) 52 | # ItemAnalyse.writeNewCodeSrc("./dataSrc/codeNewest.txt", codeSrcList) 53 | 54 | num = ItemAnalyse.getAllItemNums() 55 | print(num) -------------------------------------------------------------------------------- /CnkiSpider/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | 7 | # useful for handling different item types with a single interface 8 | from itemadapter import ItemAdapter 9 | 10 | from CnkiSpider.file_util import FileUtil 11 | from CnkiSpider.commonUtils import SpiderTypeEnum 12 | from CnkiSpider.items import * 13 | import csv 14 | 15 | 16 | class CnkispiderPipeline: 17 | def process_item(self, item, spider): 18 | if spider.name == 'patent': 19 | if isinstance(item, PatentContentItem): 20 | year = item['year'] 21 | resultPath = FileUtil.mkResultYearTypeDir(year, item['type']) 22 | # 每个细分学科分类存成一个文件 23 | # resultFilename = resultPath + item['naviCode'] + '.csv' 24 | # 每个大学科分类存成一个文件 25 | resultFilename = resultPath + item['naviCode'][0] + '.csv' 26 | FileUtil.write_header(resultFilename, item.keys()) 27 | item = self.removeLineFeed(item) 28 | FileUtil.saveItem(resultFilename=resultFilename, item=item) 29 | elif isinstance(item, ErrorUrlItem): 30 | # self.markLinkError(item['url'], SpiderTypeEnum.PATENT.value) 31 | self.easyMarkErrorItem(item) 32 | 33 | elif spider.name == 'paperAch': 34 | if isinstance(item, PatentContentItem) or isinstance(item, JournalContentItem)\ 35 | or isinstance(item, BoshuoContentItem) or isinstance(item, AchContentItem): 36 | year = item['year'] 37 | # 不同类型的根据type字段直接新建对应的文件夹 38 | resultPath = FileUtil.mkResultYearTypeDir(year, item['type']) 39 | # 每个细分学科分类存成一个文件 40 | # resultFilename = resultPath + item['naviCode'] + '.csv' 41 | # 每个大学科分类存成一个文件 42 | resultFilename = resultPath + item['naviCode'][0] + '.csv' 43 | FileUtil.write_header(resultFilename, item.keys()) 44 | item = self.removeLineFeed(item) 45 | FileUtil.saveItem(resultFilename=resultFilename, item=item) 46 | elif isinstance(item, ErrorUrlItem): 47 | # self.markLinkError(item['url'], SpiderTypeEnum.PATENT.value) 48 | self.easyMarkErrorItem(item) 49 | return item 50 | 51 | def removeLineFeed(self, item): 52 | ''' 53 | 消除item的换行 54 | :param item: 55 | :return: 56 | ''' 57 | for key in item: 58 | if item[key]: 59 | item[key] = item[key].replace('\n', '').replace('\r', ' ') 60 | return item 61 | 62 | def markLinkError(self, url, type): 63 | with open(FileUtil.errorLinkDir + type + 'Error.txt', 'a', encoding='utf-8') as file: 64 | file.write(url + '\n') 65 | 66 | def easyMarkErrorItem(self, item: ErrorUrlItem): 67 | ''' 68 | 简单的记录错误,先都存到同一个文件中 69 | :param item: 70 | :return: 71 | ''' 72 | resultPath = FileUtil.errorDir() 73 | resultFilename = resultPath + 'allErrors.csv' 74 | FileUtil.write_header(resultFilename, item.keys()) 75 | item = self.removeLineFeed(item) 76 | with open(resultFilename, 'a', encoding='utf-8', newline='') as f: 77 | csvWriter = csv.DictWriter(f, item.keys()) 78 | csvWriter.writerow(item) 79 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 知网爬虫(专利、论文、成果) 2 | 3 | ## 〇、归档说明 4 | 5 | 已经开了个新坑,[知网专利爬虫](https://github.com/aFlyBird0/CnkiPatentSpider) 实现更简单,代码更易读。功能可能稍逊色些,但更不容易出错,更可控。 6 | 7 | 
挺久之前写的代码了,再回过来看,感觉写得太烂了。所以修复屎山的最好的办法是——重新写一个。 8 | 9 | 原先使用了 scrapy,考虑地太理想了。想着各种一键化走完全流程、各种自动化的控制。但犯的最大的错误,一个是代码因为需求迭代太乱了,一个是错误记录应该只读不删,即应该记录下所有的错误的日期和学科分类代码,就算下次重新爬取了错误的内容,也不能删掉,而应该简单打个标签。 10 | 11 | 新的流程更偏向实践、易维护,以及结合我最近在学的云原生,把最后一步做成真正的分布式的爬虫。 12 | 13 | ## 一、整体介绍 14 | 15 | [Scrapy知网爬虫(一)整体理论篇 | Bird's Blog](https://blog.aflybird.cn/2021/07/Scrapy%E7%9F%A5%E7%BD%91%E7%88%AC%E8%99%AB%EF%BC%88%E4%B8%80%EF%BC%89%E6%95%B4%E4%BD%93%E7%90%86%E8%AE%BA%E7%AF%87/) 16 | 17 | ## 二、注意 18 | 1. 这是21年4月写的爬虫,目前应该处于基本可用的状态。因为业务需求一直在变动,所以写得很乱。有些地方可能需要手动改一下。(居然有人认真看和star,我有空尽量规范、测试一下代码。一年后再看以前自己写的代码,想喷死自己) 19 | 2. 专利在2021年5月左右亲测可爬,爬百万千万数据没问题。论文、项目部分可能部分字段的解析没写完,建议先跑跑看,有问题的话修改 `CnkiSpider/spiders/paperAchSpider.py` 中的 `html` 解析部分 20 | 3. 注意 `/dataSrc` 下的学科分类代码文件,不全,我只选取了理工科部分 21 | 4. 专利:考虑到法律状态会更新,所以建议在用到的时候实时请求知网,故此字段暂未爬取。有空会附上这段爬虫代码。 22 | ## 三、运行 23 | 前置工作:建立数据库,默认数据库名为 `ZhiWangSpider`,此配置项在 `/CkniSpider/settings` 文件夹下的配置文件中的 `MYSQL_DATABASE` 中。 24 | 25 | ### 3.1 服务器运行模式 26 | #### 3.1.1 模式介绍 27 | 此模式是部署在服务器上,设定好任务,然后单机跑 28 | #### 3.1.2 如何设定爬取任务 29 | 先运行一次项目,程序会自动生成 `status`、`errorCode`、`errorLink` 表(也可在 `/sql` 目录下找到这三个建表文件手动创建) 30 | 程序会报错,提示 " `status` 表中缺少 `type` 为 `patent` (也可能是 `paper`)的数据条,请手动插入" 31 | 这时候,在数据库的 `status` 表中插入一条记录,设置以下几项 32 | * `type` :必填。设置为 `patent` 就是爬专利, `paperAch` 是论文和项目 33 | * `curDate`:必填。爬虫任务的起始日期。(爬虫会在运行时将此字段作为当前在爬日期,并不断更新此字段) 34 | * `endDate`:选填。爬虫任务的结束日期。可不填,默认昨天。 35 | * `curCode`:不建议填。当前爬的学科分类信息,会默认从 A001_1 开始。 36 | * `status`:不填。记录爬虫最后一次获取任务的时间,用来简单显示爬虫运行情况(是正在跑、还是跑完了、还是崩了) 37 | > 注,因为多线程原因,当 `status` 显示为 `finish` 的时候,其实只是把设定的任务读取完毕,可能还没爬完,建议等一会。 38 | #### 3.1.3 配置文件说明 39 | 此模式的配置都在 `/CnkiSpider/settings` 文件夹下,包括 redis 数据库号和 mysql 表名 40 | *修改 scrapy.cfg 中的 default 字段即可切换配置* 41 | dev_settings.py、settings.py 都是服务器运行模式,可以改改数据库字段和其他设置 42 | 43 | ### 3.2 分发模式 44 | #### 3.2.1 模式介绍 45 | 此模式是打包成 exe , 可分布式独立运行。 46 | #### 3.2.2 如何设定爬取任务 47 | 每个子爬虫的任务是独立的,所以不在 `settings` 文件夹下,在 `/config.cfg` 配置文件中,详见下文 48 | #### 3.2.3 配置文件说明 49 | 此模式,每个子爬虫拥有不同的 redis 数据库号 与 mysql 表名(每个表前缀一样,利用不同的数据库号区分) 50 | 例如,子爬虫1连了 redis 数据库号 `1`,对应的 mysql 数据表名是 `errorCode1`、`errorLink1`、`status1`。(分发模式这三个表会自动建) 51 | 所以将这个数字配置独立到了 `/config.cfg` 中 52 | 53 | `/config.cfg` 如下:其中, `no` 同时充当了爬虫序号、redis 与 mysql 数据库(表)号,`start` 是子爬虫任务开始日期,`end` 是结束 54 | ```config 55 | [spider] 56 | type=patent 57 | start=2020-07-01 58 | end=2020-07-31 59 | [database] 60 | no=12 61 | 62 | ``` 63 | 64 | 其他配置文件说明: 65 | `/CnkiSpider/settings/settins_distribute.py` 是适用于分发模式的配置文件 66 | *修改 scrapy.cfg 中的 default 字段即可切换配置* 67 | 68 | #### 3.2.4 如何运行: 69 | 1. 清除旧的打包文件:删除根目录下的 dist 文件夹 70 | 2. 生成spec文件,具体如何利用pycharm生成此文件请自行查阅资料,关键词 "pyinstaller, pycharm, external tools",我目前使用的打包命令(如果不增加文件仅修改内容可跳过此步,直接使用我已经生成的CnkiSpiderExec.py) 71 | ```shell script 72 | -w -i cnki.ico $FileNameWithoutExtension$.py 73 | ``` 74 | ![spec生成: pycharm pyinstaller external tools 配置示意](https://tcualhp-notes.oss-cn-hangzhou.aliyuncs.com/img/1624277583.jpg) 75 | 3. 修改 spec 文件。找到 `datas=[]` 那行,改成 76 | ```shell script 77 | datas=[('dataSrc','dataSrc'), ('./scrapy.cfg', '.'), ('./config.cfg', '.'), ('CnkiSpider/spiders', 'CnkiSpider/spiders'), ('log','log')], 78 | ``` 79 | 4. 利用 spec 文件生成 exe(打包), 80 | ![利用spec生成exe: pycharm pyinstaller external tools 配置示意](https://tcualhp-notes.oss-cn-hangzhou.aliyuncs.com/img/image-20210621201507728.png) 81 | 5. 拷贝在根目录下生成的 dist 文件夹中的 CnkiSpiderExec 文件夹 82 | 6. 在拷贝后的 CnkiSpiderExec 文件夹根目录下建立 log 文件夹 83 | 7. 修改拷贝的文件夹根目录下的 `config.cfg`,设置爬取类型、开始结束日期、数据库编号(既是redis的数据库号也是mysql的表号) 84 | 8. 
运行 CnkiSpiderExec.exe 85 | -------------------------------------------------------------------------------- /CnkiSpider/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/items.html 5 | 6 | import scrapy 7 | 8 | 9 | class CnkispiderItem(scrapy.Item): 10 | # define the fields for your item here like: 11 | # name = scrapy.Field() 12 | pass 13 | 14 | class BoshuoLinkItem(scrapy.Item): 15 | ''' 16 | 博硕 17 | ''' 18 | url = scrapy.Field() 19 | db = scrapy.Field() # 所属数据库 20 | code = scrapy.Field() # 学科分类 21 | 22 | class JournalLinkItem(scrapy.Item): 23 | ''' 24 | 期刊 25 | ''' 26 | url = scrapy.Field() 27 | db = scrapy.Field() # 所属数据库 28 | code = scrapy.Field() # 学科分类 29 | 30 | class AchLinkItem(scrapy.Item): 31 | ''' 32 | 科技成果 33 | ''' 34 | url = scrapy.Field() 35 | db = scrapy.Field() # 所属数据库 36 | code = scrapy.Field() # 学科分类 37 | 38 | class PatentContentItem(scrapy.Item): 39 | type = scrapy.Field() #类型,区分是专利还是期刊、博硕、论文 40 | title = scrapy.Field() # 标题 41 | url = scrapy.Field() # 专利的url 42 | naviCode = scrapy.Field() # 学科代码 43 | year = scrapy.Field() # 年份, 应该是公开日的年份,仅作爬虫分类用,不一定准确 44 | applicationType = scrapy.Field() # 专利类型 45 | applicationDate = scrapy.Field() # 申请日 46 | applyPublicationNo = scrapy.Field() # 申请公布号 47 | authPublicationNo = scrapy.Field() # 授权公布号 48 | multiPublicationNo = scrapy.Field() # 多次公布 49 | publicationDate = scrapy.Field() # 公开公告日 50 | authPublicationDate = scrapy.Field() #授权公告日 51 | applicant = scrapy.Field() # 申请人 52 | applicantAddress = scrapy.Field() # 地址 53 | inventors = scrapy.Field() # 发明人原始字符串 54 | applicationNO = scrapy.Field() # 申请(专利)号 55 | areaCode = scrapy.Field() # 国省代码 56 | classificationNO = scrapy.Field() # 分类号 57 | mainClassificationNo = scrapy.Field() # 主分类号 58 | agency = scrapy.Field() # 代理机构 59 | agent = scrapy.Field() # 代理人 60 | page = scrapy.Field() # 页数 61 | abstract = scrapy.Field() # 摘要 62 | sovereignty = scrapy.Field() # 主权项 63 | legalStatus = scrapy.Field() # 法律状态 64 | 65 | class JournalContentItem(scrapy.Item): 66 | naviCode = scrapy.Field() #学科分类代码 如A001这种 67 | type = scrapy.Field() 68 | year = scrapy.Field() 69 | url = scrapy.Field() 70 | uid = scrapy.Field() 71 | title = scrapy.Field() 72 | authors = scrapy.Field() #纯作者名列表 73 | authorsWithCode = scrapy.Field() #带作者code的作者列表 74 | organs = scrapy.Field() 75 | authorOrganJson = scrapy.Field() #作者和单位的对应关系json字符串 76 | summary = scrapy.Field() 77 | keywords = scrapy.Field() 78 | DOI = scrapy.Field() 79 | special = scrapy.Field() #专辑 80 | subject = scrapy.Field() #专题 81 | cate_code = scrapy.Field() #分类号 82 | db = scrapy.Field() #来源数据库 83 | 84 | magazine = scrapy.Field() # 期刊 85 | mentor = scrapy.Field() # 博硕导师 86 | 87 | 88 | class BoshuoContentItem(scrapy.Item): 89 | naviCode = scrapy.Field() # 学科分类代码 如A001这种 90 | type = scrapy.Field() 91 | year = scrapy.Field() 92 | url = scrapy.Field() 93 | uid = scrapy.Field() 94 | title = scrapy.Field() 95 | authors = scrapy.Field() #纯作者名列表 96 | authorsWithCode = scrapy.Field() #带作者code的作者列表 97 | organs = scrapy.Field() 98 | authorOrganJson = scrapy.Field() # 作者和单位的对应关系json字符串 99 | summary = scrapy.Field() 100 | keywords = scrapy.Field() 101 | DOI = scrapy.Field() 102 | special = scrapy.Field() 103 | subject = scrapy.Field() 104 | cate_code = scrapy.Field() 105 | db = scrapy.Field() #来源数据库 106 | 107 | magazine = scrapy.Field() # 期刊 108 | mentor = scrapy.Field() # 博硕导师 109 | 110 | 111 | class 
AchContentItem(scrapy.Item): 112 | naviCode = scrapy.Field() # 学科分类代码 如A001这种 113 | type = scrapy.Field() 114 | year = scrapy.Field() 115 | url = scrapy.Field() 116 | uid = scrapy.Field() 117 | title = scrapy.Field() 118 | authors = scrapy.Field() 119 | organ = scrapy.Field() # 第一完成单位 120 | keywords = scrapy.Field() 121 | book_code = scrapy.Field() # 中图分类号 122 | subject_code = scrapy.Field() # 学科分类号 123 | summary = scrapy.Field() 124 | category = scrapy.Field() # 成果类别 125 | in_time = scrapy.Field() # 成果入库时间 126 | pass_time = scrapy.Field() # 研究起止时间 127 | level = scrapy.Field() # 成果水平 128 | evaluate = scrapy.Field() # 评价形式 129 | 130 | class AuthorItem(scrapy.Item): 131 | code = scrapy.Field() 132 | name = scrapy.Field() 133 | school = scrapy.Field() 134 | category = scrapy.Field() 135 | upload_amount = scrapy.Field() 136 | download_amount = scrapy.Field() 137 | 138 | class ErrorUrlItem(scrapy.Item): 139 | url = scrapy.Field() 140 | errType = scrapy.Field() 141 | reqType = scrapy.Field() 142 | -------------------------------------------------------------------------------- /dataSrc/codeSrc.txt: -------------------------------------------------------------------------------- 1 | A001_1,A001_2,A001_3,A001_4,A002_1,A002_2,A002_3,A002_4,A002_5,A002_6,A002_7,A002_8,A002_9,A002_A,A002_B,A002_C,A002_D,A003_1,A003_2,A004_1,A004_2,A004_3,A004_4,A004_5,A004_6,A004_7,A004_8,A004_9,A005_1,A005_2,A005_3,A005_4,A005_5,A005_6,A005_7,A005_8,A005_9,A005_A,A005_B,A005_C,A005_D,A005_E,A005_F,A005_G,A005_H,A005_I,A005_J,A006_1,A006_2,A006_3,A006_4,A006_5,A006_6,A006_7,A006_8,A006_9,A006_A,A006_B,A006_C,A006_D,A006_E,A006_F,A006_G,A007_1,A007_2,A007_3,A007_4,A007_5,A007_6,A007_7,A007_8,A007_9,A007_A,A008_1,A008_2,A009_1,A009_2,A009_3,A009_4,A009_5,A009_6,A009_7,A009_8,A009_9,A009_A,A009_B,A009_C,A010_1,A010_2,A010_3,A010_4,A010_5,A010_6,A010_7,A010_8,A010_9,A010_A,A010_B,A010_C,A010_D,A010_E,A011_1,A011_2,A011_3,A011_4,A011_5,A011_6,A011_7,A011_8,A011_9,A011_A,A011_B,A011_C,A011_D,A011_E,A011_F,A011_G,A011_H,A011_I,A011_K,A012_1,A012_2,A012_3,A012_4,A012_5,A012_6,A012_7,A012_8,A012_9,A012_A,A012_B,A012_C,A012_D,A012_E,A012_F,A013_1,A013_2,B014_1,B014_2,B014_3,B014_4,B014_5,B014_6,B014_7,B014_8,B015_1,B015_2,B015_3,B015_4,B015_5,B015_6,B015_7,B015_8,B016_1,B016_2,B016_3,B016_4,B016_5,B016_6,B016_7,B016_8,B017_1,B017_2,B017_3,B017_4,B017_5,B017_6,B017_7,B018_1,B018_2,B018_3,B018_4,B018_5,B018_6,B018_7,B019_1,B019_2,B019_3,B019_4,B019_5,B019_6,B020_1,B020_2,B020_3,B020_4,B020_5,B020_6,B020_7,B020_8,B020_9,B020_A,B020_B,B020_C,B021_1,B021_2,B021_3,B021_4,B021_5,B021_6,B021_7,B021_8,B022_1,B022_2,B022_3,B022_4,B022_5,B022_6,B022_7,B022_8,B022_9,B022_A,B022_B,B022_C,B023_1,B023_2,B023_3,B023_4,B023_5,B023_6,B023_7,B023_8,B023_9,B023_A,B023_B,B024_1,B024_2,B024_3,B024_4,B024_5,B024_6,B024_7,B024_8,B024_9,B024_A,B024_B,B024_C,B024_D,B024_E,B025_1,B025_2,B025_3,B025_4,B025_5,B025_6,B025_7,B026_1,B026_2,B026_3,B026_4,B026_5,B026_6,B027_1,B027_2,B027_3,B027_4,B027_5,B027_6,C028_1,C028_2,C028_3,C028_4,C028_5,C028_6,C028_7,C028_8,C028_9,C029_1,C029_2,C029_3,C029_4,C029_5,C029_6,C029_7,C029_8,C029_9,C029_A,C029_B,C029_C,C029_D,C030_1,C030_2,C030_3,C030_4,C030_5,C030_6,C030_7,C030_8,C030_9,C030_A,C030_B,C030_C,C030_D,C030_E,C030_F,C030_G,C030_H,C030_I,C031_1,C031_2,C031_3,C031_4,C032_1,C032_2,C032_3,C032_4,C032_5,C033_1,C033_2,C033_3,C033_4,C033_5,C033_6,C033_7,C033_8,C034_1,C034_2,C034_3,C035_1,C035_2,C035_3,C035_4,C035_5,C035_6,C035_7,C035_8,C035_9,C035_A,C035_B,C035_C,C035_D,C035_E,C036_1,C036_2,C036_3,C036_4,C036_51,C036_52
,C036_53,C036_54,C036_55,C036_56,C036_57,C036_58,C036_59,C036_6,C036_7,C036_8,C036_9,C036_A,C036_B,C036_C,C036_D,C036_E,C036_F,C036_G,C037_1,C037_2,C037_3,C037_4,C037_5,C037_6,C037_7,C037_8,C037_9,C037_A,C038_1,C038_2,C038_3,C039_1,C039_2,C039_3,C039_4,C039_5,C039_6,C039_7,C040_1,C040_2,C040_3,C040_4,C040_5,C040_6,C040_7,C040_8,C040_9,C041_1,C041_2,C041_3,C041_4,C041_5,C041_6,C041_7,C041_8,C042_1,C042_2,C042_3,C042_4,C042_5,C042_6,C042_7,C042_8,C042_9,C042_A,C042_B,C042_C,C042_D,D043_1,D043_2,D043_3,D043_4,D043_5,D043_6,D043_7,D043_8,D043_9,D044_1,D044_2,D044_3,D044_4,D044_5,D044_6,D044_7,D044_8,D044_9,D045_1,D045_2,D045_3,D045_4,D045_5,D045_6,D045_7,D045_8,D046_1,D046_2,D046_3,D046_4,D046_5,D046_6,D046_7,D046_8,D046_9,D046_A,D046_B,D046_C,D047_1,D047_2,D047_3,D047_4,D047_5,D047_6,D047_7,D047_8,D048_1,D048_2,D048_3,D048_4,D048_5,D048_6,D048_7,D049_1,D049_2,D049_3,D049_4,D049_5,D049_6,D049_7,D049_8,D050_1,D050_2,D050_3,D050_4,D051_1,D051_2,D051_3,D052_1,D052_2,D052_3,D052_4,D052_5,D052_6,D052_7,D052_8,E053_1,E053_2,E053_3,E053_4,E053_5,E053_6,E053_7,E053_8,E053_9,E053_A,E054_1,E054_2,E054_3,E054_4,E054_5,E054_6,E055_0,E055_1,E055_2,E055_3,E055_4,E055_5,E055_6,E055_7,E055_8,E056_1,E056_2,E056_3,E056_4,E056_5,E056_6,E056_7,E056_8,E056_9,E056_A,E056_B,E056_C,E056_D,E056_E,E056_F,E056_G,E057_1,E057_2,E057_3,E057_4,E057_5,E057_6,E057_7,E057_8,E057_9,E057_A,E057_B,E058_1,E058_2,E059_1,E059_2,E059_3,E059_4,E059_5,E059_6,E059_7,E059_8,E059_9,E059_A,E059_B,E059_C,E060_1,E060_2,E060_3,E060_4,E060_5,E060_6,E060_7,E060_8,E060_9,E061_1,E061_2,E061_3,E061_4,E061_5,E061_6,E061_7,E061_8,E061_9,E061_A,E061_B,E062_1,E062_2,E062_3,E062_4,E062_6,E062_7,E063_1,E063_2,E063_3,E063_4,E063_5,E063_6,E064_1,E064_2,E064_3,E064_4,E064_5,E064_6,E064_7,E065_1,E065_2,E066_1,E066_2,E066_3,E066_4,E066_5,E066_6,E066_7,E066_8,E067_1,E067_2,E067_3,E067_4,E067_5,E067_6,E067_7,E067_8,E067_9,E068_1,E068_2,E068_3,E068_4,E068_5,E068_6,E069_1,E069_2,E069_3,E069_4,E069_5,E069_6,E069_7,E069_8,E069_9,E070_1,E070_2,E070_3,E070_4,E070_5,E070_6,E070_7,E070_8,E070_9,E070_A,E071_1,E071_2,E071_3,E071_4,E071_5,E071_6,E071_7,E072_1,E072_2,E072_3,E072_4,E072_5,E072_6,E072_7,E072_8,E072_9,E072_A,E072_B,E072_C,E072_D,E073_1,E073_2,E073_3,E073_4,E073_5,E073_6,E073_8,E074_1,E074_2,E074_3,E074_4,E074_5,E074_6,E075_1,E075_2,E076_1,E076_2,E076_3,E076_4,E076_5,E076_6,E076_7,E077_1,E077_2,E077_3,E078_1,E078_2,E078_3,E078_4,E079_1,E079_2,E079_3,E079_4,E079_5,E079_6,E079_7,E079_8,E080_1,E080_2,E080_3,E080_4,E080_5,E080_6,E080_7,E080_8,E080_9,E080_A,I135_1,I135_2,I135_3,I135_4,I135_5,I135_6,I135_7,I135_8,I136_1,I136_2,I136_3,I136_4,I136_5,I136_6,I136_7,I136_8,I136_9,I136_A,I136_B,I136_C,I136_D,I136_E,I136_F,I136_G,I137_1,I137_2,I137_3,I137_4,I137_5,I138_1,I138_2,I138_3,I138_4,I138_5,I138_6,I138_7,I138_8,I138_9,I138_A,I138_B,I138_C,I139_1,I139_2,I139_3,I139_4,I139_5,I139_6,I139_7,I139_8,I140_1,I140_2,I140_3,I140_4,I140_5,I140_6,I141_1,I141_2,I141_3,I142_1,I142_2,I142_3,I142_4,I142_5,I142_6,I142_7,I142_8,I143_1,I143_2,I144_1,I144_2 -------------------------------------------------------------------------------- /CnkiSpider/proxy.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import requests 3 | import json 4 | from scrapy.utils.project import get_project_settings 5 | import logging 6 | import sys 7 | import time 8 | import base64 9 | import random 10 | 11 | class ApeProxyManager: 12 | settings = get_project_settings() 13 | open = settings.get("PROXY_OPEN") 14 | id = settings.get("PROXY_ID") 15 | 
secret = settings.get("PROXY_SECRET") 16 | limit = settings.get("PROXY_LIMIT") 17 | format = settings.get("PROXY_FORMAT") 18 | auth_mode = settings.get("PROXY_AUTH_MODE") 19 | proxyUser = id 20 | proxyPass = secret 21 | print('proxyUser:', proxyUser) 22 | proxyAuth = "Basic " + base64.urlsafe_b64encode(bytes((proxyUser + ":" + proxyPass), "ascii")).decode("utf8") 23 | params = { 24 | "id": id, 25 | "secret": secret, 26 | "limit": limit, 27 | "format": format, 28 | "auth_mode": auth_mode 29 | } 30 | 31 | # 目前剩余的ip 32 | proxyLeft = 0 33 | # ip列表 34 | proxies = [] 35 | # ip复用的最大次数 36 | reuseMAX = settings.get("PROXY_REUSE") 37 | # ip已经复用的次数 38 | reuseCur = 0 39 | 40 | @classmethod 41 | def getProxiesDicts(cls): 42 | ''' 43 | 一次性获取多个代理 44 | :return: 45 | ''' 46 | try: 47 | response = requests.get( 48 | url="http://tunnel-api.apeyun.com/q", 49 | params=ApeProxyManager.params, 50 | headers={ 51 | "Content-Type": "text/plain; charset=utf-8", 52 | } 53 | ) 54 | # print('Response HTTP Status Code: {status_code}'.format( 55 | # status_code=response.status_code)) 56 | if response.status_code == 200: 57 | # print(response.text) 58 | res = json.loads(response.text) 59 | if res['code'] == 200: 60 | # return (res['data'][0]["ip"], res['data'][0]['port']) 61 | # print(res) 62 | data = res['data'] 63 | # print(data) 64 | cls.proxies = [] 65 | for proxy in data: 66 | # 再封装一个直接是http:ip:port形式的 67 | ipPort = "http://" + proxy['ip'] + ":" + str(proxy['port']) 68 | cls.proxies.append({'ip': proxy['ip'], 'port':str(proxy['port']), 'string': ipPort}) 69 | logging.debug('获取到的所有代理:%s', str(cls.proxies)) 70 | # print(cls.proxies) 71 | return True 72 | elif res['code'] == 11010030: 73 | # "当前IP已绑定到其它订单,请先解绑" 74 | logging.debug("重新请求ip中,原因如下:当前IP已绑定到其它订单,请先解绑") 75 | time.sleep(1) 76 | cls.getProxiesDicts() 77 | else: 78 | logging.debug('代理请求错误,错误代码:%d,错误信息:%s' % (res['code'], res['msg'] )) 79 | return False 80 | except requests.exceptions.RequestException: 81 | logging.error('代理获取时,HTTP Request failed') 82 | return False 83 | 84 | @classmethod 85 | def getProxy(cls): 86 | ''' 87 | 获取一个代理 88 | :return: 89 | ''' 90 | if cls.proxyLeft > 0 and len(cls.proxies) > 0: 91 | # proxy = cls.proxies[ApeProxyManager.limit-cls.proxyLeft] 92 | proxy = random.choice(cls.proxies) 93 | logging.debug("代理复用,获取到的代理:%s:%s" % (proxy['ip'], proxy['port'])) 94 | cls.proxyLeft -= 1 95 | # logging.debug("当前剩余代理数量:%s" % cls.proxyLeft) 96 | return proxy 97 | else: 98 | for i in range(12): 99 | if cls.getProxiesDicts(): 100 | # 总可用数量等于代理数量 * 最大复用次数 101 | cls.proxyLeft = ApeProxyManager.limit * ApeProxyManager.reuseMAX 102 | # proxy = cls.proxies[ApeProxyManager.limit-cls.proxyLeft] 103 | proxy = random.choice(cls.proxies) 104 | cls.proxyLeft -= 1 105 | logging.debug("请求新批次代理,获取到的代理:%s:%s" % (proxy['ip'], proxy['port'])) 106 | # logging.debug("当前剩余代理数量:%s" % cls.proxyLeft) 107 | return proxy 108 | time.sleep(1) 109 | logging.error("连续十二次获取ip失败,程序退出") 110 | sys.exit() 111 | 112 | @classmethod 113 | def proxyDict2String(cls, proxy): 114 | ''' 115 | 将字典形式的代理转化为http://ip:port形式 116 | :param proxy: 117 | :return: 118 | ''' 119 | if not proxy: 120 | return proxy 121 | return "http://" + proxy['ip'] + ":" + str(proxy['port']) 122 | 123 | @classmethod 124 | def removeBadProxy(cls, proxyString: str): 125 | ''' 126 | 根据proxyString移除无效代理(请求错误的) 127 | :param proxyString: 128 | :return: 129 | ''' 130 | # 先减去可用代理总次数,防止剩下的代理被用太多次 131 | # 考虑到高并发,可能一个问题代理同时被使用多次导致失败多次,这里减去复用次数的四分之一 132 | cls.proxyLeft -= cls.reuseMAX / 4 133 | # logging.info("去重前的代理是 %s" % 
str([item['string'] for item in cls.proxies])) 134 | cls.proxies = [item for item in cls.proxies if not item["string"] == proxyString] 135 | logging.debug("代理 %s 已经被去除" % proxyString) 136 | # logging.info("现在还剩的代理是 %s" % str([item['string'] for item in cls.proxies])) 137 | 138 | if __name__ == '__main__': 139 | for i in range(1000): 140 | proxy = ApeProxyManager.getProxy() 141 | # print(proxy) 142 | -------------------------------------------------------------------------------- /CnkiSpiderExec.py: -------------------------------------------------------------------------------- 1 | from scrapy.crawler import CrawlerProcess 2 | from scrapy.utils.project import get_project_settings 3 | 4 | #打包需要的import 5 | import urllib.robotparser 6 | import scrapy.spiderloader 7 | import scrapy.statscollectors 8 | import scrapy.logformatter 9 | import scrapy.dupefilters 10 | import scrapy.squeues 11 | import scrapy.extensions.spiderstate 12 | import scrapy.extensions.corestats 13 | import scrapy.extensions.telnet 14 | import scrapy.extensions.logstats 15 | import scrapy.extensions.memusage 16 | import scrapy.extensions.memdebug 17 | import scrapy.extensions.feedexport 18 | import scrapy.extensions.closespider 19 | import scrapy.extensions.debug 20 | import scrapy.extensions.httpcache 21 | import scrapy.extensions.statsmailer 22 | import scrapy.extensions.throttle 23 | import scrapy.core.scheduler 24 | import scrapy.core.engine 25 | import scrapy.core.scraper 26 | import scrapy.core.spidermw 27 | import scrapy.core.downloader 28 | import scrapy.downloadermiddlewares.stats 29 | import scrapy.downloadermiddlewares.httpcache 30 | import scrapy.downloadermiddlewares.cookies 31 | import scrapy.downloadermiddlewares.useragent 32 | import scrapy.downloadermiddlewares.httpproxy 33 | import scrapy.downloadermiddlewares.ajaxcrawl 34 | # import scrapy.downloadermiddlewares.chunked 35 | import scrapy.downloadermiddlewares.decompression 36 | import scrapy.downloadermiddlewares.defaultheaders 37 | import scrapy.downloadermiddlewares.downloadtimeout 38 | import scrapy.downloadermiddlewares.httpauth 39 | import scrapy.downloadermiddlewares.httpcompression 40 | import scrapy.downloadermiddlewares.redirect 41 | import scrapy.downloadermiddlewares.retry 42 | import scrapy.downloadermiddlewares.robotstxt 43 | import scrapy.spidermiddlewares.depth 44 | import scrapy.spidermiddlewares.httperror 45 | import scrapy.spidermiddlewares.offsite 46 | import scrapy.spidermiddlewares.referer 47 | import scrapy.spidermiddlewares.urllength 48 | import scrapy.pipelines 49 | import scrapy.core.downloader.handlers.http 50 | import scrapy.core.downloader.contextfactory 51 | 52 | import scrapy_redis.pipelines 53 | import scrapy_redis.dupefilter 54 | import scrapy_redis.spiders 55 | import scrapy_redis.scheduler 56 | import scrapy_redis.queue 57 | 58 | 59 | from enum import Enum 60 | import time 61 | import requests 62 | from CnkiSpider.proxy import ApeProxyManager 63 | import logging 64 | import scrapy 65 | from CnkiSpider.file_util import FileUtil 66 | from scrapy.utils.project import get_project_settings 67 | import sys 68 | from twisted.internet import defer 69 | from twisted.internet.error import TimeoutError, DNSLookupError, \ 70 | ConnectionRefusedError, ConnectionDone, ConnectError, \ 71 | ConnectionLost, TCPTimedOutError 72 | from scrapy.http import HtmlResponse 73 | from twisted.web.client import ResponseFailed 74 | from scrapy.core.downloader.handlers.http11 import TunnelError 75 | from CnkiSpider.file_util import FileUtil 76 | from 
CnkiSpider.commonUtils import SpiderTypeEnum, ErrorUtil 77 | import twisted 78 | import scrapy 79 | import time 80 | import math 81 | import re 82 | import requests 83 | from CnkiSpider.items import PatentContentItem 84 | from CnkiSpider.items import ErrorUrlItem 85 | from CnkiSpider.commonUtils import StringUtil 86 | from CnkiSpider.statusManager import StatusManager 87 | from CnkiSpider.commonUtils import SpiderTypeEnum, CookieUtil, ErrorUtil 88 | from CnkiSpider.file_util import FileUtil 89 | from CnkiSpider.proxy import ApeProxyManager 90 | from scrapy.http.cookies import CookieJar 91 | from scrapy_redis.spiders import RedisSpider 92 | import base64 93 | import os 94 | 95 | 96 | from CnkiSpider.customDownloadMiddlewares import * 97 | from CnkiSpider.file_util import * 98 | from CnkiSpider.pipelines import * 99 | from CnkiSpider.proxy import * 100 | from CnkiSpider.statusManager import * 101 | from CnkiSpider.settings import settings, settings_distribute 102 | from CnkiSpider.spiders import * 103 | from CnkiSpider.spiders import patent, paperAchSpider, __init__ 104 | from CnkiSpider.proxy import * 105 | from CnkiSpider.pipelines import * 106 | 107 | from configparser import ConfigParser 108 | 109 | # 弹窗 110 | from tkinter import messagebox 111 | from tkinter import * 112 | import psutil 113 | 114 | def alreadyRun(): 115 | if os.path.exists("pid.txt"): 116 | with open("pid.txt", "r") as f: 117 | spid = f.readline() 118 | pid = int(spid.strip()) 119 | pids = psutil.pids() 120 | # 文件中保存的进程号真正在运行 121 | # 采用覆盖模式模式, 122 | if pid in pids: 123 | return True 124 | with open("pid.txt", "w") as f2: 125 | f2.write(str(os.getpid())) 126 | return False 127 | else: 128 | with open("pid.txt", "w") as f: 129 | f.write(str(os.getpid())) 130 | return False 131 | 132 | def getProcessIdfromName(processname): 133 | pl = psutil.pids() 134 | for pid in pl: 135 | if psutil.Process(pid).name() == processname: 136 | return pid 137 | return -1 138 | 139 | def ifProcessNameExist(processname): 140 | if getProcessIdfromName(processname) >=0: 141 | return True 142 | else: 143 | return False 144 | 145 | cp = ConfigParser() 146 | # 与exe同目录 147 | cp.read('./config.cfg') 148 | spiderType = cp.get('spider', 'type') 149 | 150 | root = Tk() 151 | root.withdraw() # ****实现主窗口隐藏 152 | 153 | 154 | if alreadyRun(): 155 | messagebox.showinfo(title="提示", message="已有程序在后台启动,请不要重复运行") 156 | exit(1) 157 | else: 158 | process = CrawlerProcess(get_project_settings()) 159 | messagebox.showinfo(title="提示",message="程序已在后台启动,请不要使用加速球或将此程序加入至白名单中,此弹窗可关闭") 160 | if not FileUtil.IfFinishTask(): 161 | if spiderType == 'patent': 162 | process.crawl('patent') 163 | process.start() 164 | elif spiderType == 'paperAch': 165 | process.crawl('paperAch') 166 | process.start() -------------------------------------------------------------------------------- /CnkiSpider/middlewares.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your spider middleware 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 5 | 6 | from scrapy import signals 7 | 8 | # useful for handling different item types with a single interface 9 | from itemadapter import is_item, ItemAdapter 10 | from CnkiSpider.file_util import FileUtil 11 | from CnkiSpider.commonUtils import SpiderTypeEnum 12 | import logging 13 | from CnkiSpider.proxy import ApeProxyManager 14 | 15 | 16 | class CnkispiderSpiderMiddleware: 17 | # Not all methods need to be defined. 
If a method is not defined, 18 | # scrapy acts as if the spider middleware does not modify the 19 | # passed objects. 20 | 21 | @classmethod 22 | def from_crawler(cls, crawler): 23 | # This method is used by Scrapy to create your spiders. 24 | s = cls() 25 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 26 | return s 27 | 28 | def process_spider_input(self, response, spider): 29 | # Called for each response that goes through the spider 30 | # middleware and into the spider. 31 | 32 | # Should return None or raise an exception. 33 | return None 34 | 35 | def process_spider_output(self, response, result, spider): 36 | # Called with the results returned from the Spider, after 37 | # it has processed the response. 38 | 39 | # Must return an iterable of Request, or item objects. 40 | for i in result: 41 | yield i 42 | 43 | def process_spider_exception(self, response, exception, spider): 44 | # Called when a spider or process_spider_input() method 45 | # (from other spider middleware) raises an exception. 46 | 47 | # Should return either None or an iterable of Request or item objects. 48 | pass 49 | 50 | def process_start_requests(self, start_requests, spider): 51 | # Called with the start requests of the spider, and works 52 | # similarly to the process_spider_output() method, except 53 | # that it doesn’t have a response associated. 54 | 55 | # Must return only requests (not items). 56 | for r in start_requests: 57 | yield r 58 | 59 | def spider_opened(self, spider): 60 | spider.logger.info('Spider opened: %s' % spider.name) 61 | 62 | 63 | class CnkispiderDownloaderMiddleware: 64 | # Not all methods need to be defined. If a method is not defined, 65 | # scrapy acts as if the downloader middleware does not modify the 66 | # passed objects. 67 | 68 | @classmethod 69 | def from_crawler(cls, crawler): 70 | # This method is used by Scrapy to create your spiders. 71 | s = cls() 72 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 73 | return s 74 | 75 | def process_request(self, request, spider): 76 | # Called for each request that goes through the downloader 77 | # middleware. 78 | 79 | # Must either: 80 | # - return None: continue processing this request 81 | # - or return a Response object 82 | # - or return a Request object 83 | # - or raise IgnoreRequest: process_exception() methods of 84 | # installed downloader middleware will be called 85 | # request.meta["proxy"] = ProxyManager.getProxyString() 86 | # request.headers["Proxy-Authorization"] = ProxyManager.proxyAuth 87 | return None 88 | 89 | def process_response(self, request, response, spider): 90 | # Called with the response returned from the downloader. 91 | 92 | # Must either; 93 | # - return a Response object 94 | # - return a Request object 95 | # - or raise IgnoreRequest 96 | return response 97 | 98 | # def process_exception(self, request, exception, spider): 99 | # # Called when a download handler or a process_request() 100 | # # (from other downloader middleware) raises an exception. 
101 | # 102 | # # Must either: 103 | # # - return None: continue processing this exception 104 | # # - return a Response object: stops process_exception() chain 105 | # # - return a Request object: stops process_exception() chain 106 | # pass 107 | 108 | # 109 | def process_exception(self, request, exception, spider): 110 | ''' 111 | 全局请求异常处理, 这里暂时没用 112 | :param request: 113 | :param exception: 114 | :param spider: 115 | :return: 116 | ''' 117 | # key = request.cb_kwargs 118 | # print(request) 119 | # print('全局异常拦截!!!\n') 120 | # print('异常', exception) 121 | # print(type(exception)) 122 | # if spider.name == SpiderTypeEnum.PATENT.value: 123 | # # print(key) 124 | # if key['requestType'] == 'PatentGetFirstPage': 125 | # self.markDayError(type=SpiderTypeEnum.PATENT.value, code=key['code'], date=key['date']) 126 | # elif key['requestType'] == 'PatentGetLinks': 127 | # self.markPageError(type=SpiderTypeEnum.PATENT.value, code=key['code'], date=key['date'], pagenum=key['pagenum']) 128 | # elif key['requestType'] == "patentGetContent": 129 | # self.markLinkError(type=SpiderTypeEnum.PATENT.value, url=key['url']) 130 | # else: 131 | # print('这传的什么jb玩意?') 132 | # # self.markFirstError(key['code'], key['date'], pagenum) 133 | # elif 'error' in spider.name: 134 | # if 'pagenum' in key: 135 | # pagenum = key['pagenum'] 136 | # else: 137 | # pagenum = 0 138 | # self.markSecondError(key['code'], key['date'], pagenum) 139 | return request 140 | # 141 | # def markLinkError(self, url, type): 142 | # with open(FileUtil.errorLinkDir + type + 'Error.txt', 'a', encoding='utf-8') as file: 143 | # file.write(url + '\n') 144 | # 145 | # def markSecondError(self, code, date, pagenum): 146 | # if pagenum == 0: 147 | # with open('error/erday.txt', 'a', encoding='utf-8') as f: 148 | # f.write(code + '&' + date + '\n') 149 | # else: 150 | # with open('error/erpage.txt', 'a', encoding='utf-8') as f: 151 | # f.write(code + '&' + date + '&' + str(pagenum) + '\n') 152 | # 153 | # def markFirstError(self, code, date, pagenum): 154 | # if pagenum == 0: 155 | # with open('error/errorday_' + date + '.txt', 'a', encoding='utf-8') as f: 156 | # f.write(code + '&' + date + '\n') 157 | # else: 158 | # with open('error/errorpage_' + date + 'txt', 'a', encoding='utf-8') as f: 159 | # f.write(code + '&' + date + '&' + str(pagenum) + '\n') 160 | # 161 | # def easyErrorRecoder(self, url): 162 | # with open('error/EasyErrorRecorder.txt', 'a', encoding='utf-8') as file: 163 | # file.write(url + '\n') 164 | # 165 | # def markDayError(self, type, code, date): 166 | # with open(FileUtil.errorDayDir + type + '.txt', 'a', encoding='utf-8') as f: 167 | # f.write(code + '&' + date + '\n') 168 | # 169 | # def markPageError(self, type, code, date, pagenum): 170 | # with open(FileUtil.errorPageDir + type + '.txt', 'a', encoding='utf-8') as f: 171 | # f.write(code + '&' + date + '&' + str(pagenum) + '\n') 172 | # 173 | 174 | 175 | 176 | def spider_opened(self, spider): 177 | spider.logger.info('Spider opened: %s' % spider.name) -------------------------------------------------------------------------------- /CnkiSpider/file_util.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | import random 4 | import string 5 | import sys 6 | import logging 7 | 8 | class PackUtil: 9 | ''' 10 | 打包工具,做一个文件路径转换 11 | ''' 12 | @classmethod 13 | def resource_path(cls, relative_path: str): 14 | if getattr(sys, 'frozen', False): 15 | base_path = sys._MEIPASS 16 | else: 17 | base_path = 
os.path.abspath(".") 18 | return os.path.join(base_path, relative_path) 19 | 20 | class FileUtil: 21 | ''' 22 | 文件工具类 23 | ''' 24 | 25 | 26 | # 如下方式,运行时,不打包则会生成在项目目录,打包会生成在exe内部(运行结束消失) 27 | # targetDir = PackUtil.resource_path("./target/") 28 | # resultDir = PackUtil.resource_path("./result/") 29 | # htmlDir = PackUtil.resource_path("./html/") 30 | # errorDir = PackUtil.resource_path("./error/") 31 | # logDir = PackUtil.resource_path("./log/") 32 | # 33 | # errorLinkDir = PackUtil.resource_path("./error/link/") 34 | # errorOverflowDir = PackUtil.resource_path("./error/overflow/") 35 | # errorDayDir = PackUtil.resource_path("./error/day/") 36 | # errorPageDir = PackUtil.resource_path("./error/page/") 37 | 38 | # 如下方式,运行时,不打包则会生成在项目目录,打包会生成在exe同级目录 39 | targetDir = "./target/" 40 | resultDir = "./result/" 41 | htmlDir = "./html/" 42 | errorDir = "./error/" 43 | logDir = "./log/" 44 | 45 | errorLinkDir = "./error/link/" 46 | errorOverflowDir = "./error/overflow/" 47 | errorDayDir = "./error/day/" 48 | errorPageDir = "./error/page/" 49 | 50 | # @classmethod 51 | # def write_header(cls, filename, header): 52 | # ''' 53 | # (已废弃)不存在文件则创建文件,不存在header则写入hearder,创建header 54 | # :return: 55 | # ''' 56 | # # newline的作用是防止每次插入都有空行 57 | # with open(filename, "a+", newline='', encoding='utf-8') as csvfile: 58 | # writer = csv.DictWriter(csvfile, header) 59 | # # 以读的方式打开csv 用csv.reader方式判断是否存在标题。 60 | # with open(filename, "r", newline="", encoding='utf-8') as f: 61 | # reader = csv.reader(f) 62 | # if not [row for row in reader]: 63 | # writer.writeheader() 64 | 65 | @classmethod 66 | def write_header(cls, filename, header): 67 | ''' 68 | 不存在文件则创建文件并写入header 69 | 没有上一个函数严谨,但可以避免文件读取写入冲突 70 | :return: 71 | ''' 72 | # newline的作用是防止每次插入都有空行 73 | if not os.path.exists(filename): 74 | with open(filename, "a+", newline='', encoding='utf-8') as csvfile: 75 | writer = csv.DictWriter(csvfile, header) 76 | writer.writeheader() 77 | 78 | @classmethod 79 | def saveItem(cls, resultFilename, item): 80 | with open(resultFilename, 'a', encoding='utf-8', newline='') as f: 81 | csvWriter = csv.DictWriter(f, item.keys()) 82 | csvWriter.writerow(item) 83 | 84 | @classmethod 85 | def remove_reduntant_header_one_file(cls, dir_path: str, filename: str): 86 | ''' 87 | 删除某个文件多余的header 88 | :param filename: 89 | :return: 90 | ''' 91 | num = 0 92 | actual_filename = dir_path + '/' + filename 93 | with open(actual_filename, 'r', encoding='utf-8') as fin: 94 | # 处理好的文件换个文件夹,放在 '原文件夹名_handled' 里面 95 | output_filename = dir_path + '_handled/' + filename 96 | with open(output_filename, 'a+', encoding='utf-8', newline='') as fout: 97 | header = fin.readline() #读取第一行,第一行的标题要保留 98 | fout.write(header) 99 | for line in fin.readlines(): 100 | # 忽略以后的标题行 101 | if line[0:7] != 'authors' and line[0:3] != 'DOI': 102 | fout.write(line) 103 | num += 1 104 | return num 105 | 106 | @classmethod 107 | def remove_reduntant_header_one_dir(cls, dir_path: str): 108 | ''' 109 | 删除某个文件夹下所有文件的多余header 110 | :return: 111 | ''' 112 | for filename in os.listdir(dir_path): 113 | cls.remove_reduntant_header_one_file(dir_path, filename) 114 | 115 | @classmethod 116 | def mkResultYearTypeDir(cls, year: str, type: str): 117 | ''' 118 | 创建每一年每个文章种类的文件夹 119 | :param year: 120 | :param type:专利、博硕、期刊, 成果,值为paten, boshuo, journal,achievement 121 | :return: 122 | ''' 123 | resultDir = 'result/' 124 | yearTypeDir = resultDir + year + '/' + type + '/' 125 | if not os.path.exists(yearTypeDir): 126 | os.makedirs(yearTypeDir) 127 | return yearTypeDir 128 | 129 | 
@classmethod 130 | def mkDirsIfNotExist(cls, dirsName): 131 | if not os.path.exists(dirsName): 132 | os.makedirs(dirsName) 133 | 134 | @classmethod 135 | def initOutputDir(cls): 136 | ''' 137 | 初始化输出文件夹 138 | :return: 139 | ''' 140 | cls.mkDirsIfNotExist(cls.targetDir) 141 | cls.mkDirsIfNotExist(cls.resultDir) 142 | cls.mkDirsIfNotExist(cls.htmlDir) 143 | cls.mkDirsIfNotExist(cls.errorDir) 144 | cls.mkDirsIfNotExist(cls.logDir) 145 | cls.mkErrorLinkDirs() 146 | cls.mkErrorOverflowDirs() 147 | cls.mkErrorDayDirs() 148 | cls.mkErrorPageDirs() 149 | 150 | 151 | @classmethod 152 | def mkErrorLinkDirs(cls): 153 | ''' 154 | 出错链接文件夹,按item类型分类,期刊,博硕,成果,专利 155 | :param type: 156 | :return: 157 | ''' 158 | cls.mkDirsIfNotExist(cls.errorLinkDir) 159 | 160 | @classmethod 161 | def mkErrorOverflowDirs(cls): 162 | cls.mkDirsIfNotExist(cls.errorOverflowDir) 163 | 164 | @classmethod 165 | def mkErrorDayDirs(cls): 166 | cls.mkDirsIfNotExist(cls.errorDayDir) 167 | 168 | @classmethod 169 | def mkErrorPageDirs(cls): 170 | cls.mkDirsIfNotExist(cls.errorPageDir) 171 | 172 | @classmethod 173 | def saveHtml(cls, year, response, type:str, url, title): 174 | ''' 175 | 存放html文件 176 | :param response: 177 | :param type: 178 | :param url: 179 | :param title: 180 | :return: 181 | ''' 182 | # 根据年份和类型存储html源文件,每个网页单独存储一个文件 183 | filepath = './html/' + year + '/' + type + '/' 184 | if not os.path.exists(filepath): 185 | os.makedirs(filepath) 186 | # 按标题命名, 加个随机字符防止文件被覆盖,因为存在同名专利 187 | ranStr = ''.join(random.sample(string.ascii_letters + string.digits, 3)) 188 | htmlFileName = filepath + cls.handleFilename(title) + '_' + ranStr + '.html' 189 | with open(htmlFileName, 'w', encoding='utf-8') as f: 190 | f.write(response.text) 191 | 192 | @classmethod 193 | def handleFilename(cls, filename): 194 | ''' 195 | windows的文件名中不能含有一些特殊字符,所以要处理一下文件名 196 | :param filename: 197 | :return: 198 | ''' 199 | sets = ['/', '\\', ':', '*', '?', '"', '<', '>', '|'] 200 | for char in filename: 201 | if char in sets: 202 | filename = filename.replace(char, '') 203 | return filename.replace('\n', '').replace('\r', ' ') 204 | 205 | @classmethod 206 | def markFinishOnce(cls): 207 | ''' 208 | 标记成功运行,连续标记两次证明程序运行完成 209 | :return: 210 | ''' 211 | filenameOne = "1.txt" 212 | filenameTwo = "2.txt" 213 | if os.path.exists(filenameTwo): 214 | logging.info("已经运行两次了") 215 | elif os.path.exists(filenameOne): 216 | logging.info("第二次运行完成") 217 | with open(filenameTwo, "w", encoding="utf-8") as f: 218 | f.write("第二次运行完成,所有任务完成,请将整个文件夹返还,谢谢!\n") 219 | else: 220 | logging.info("第一次运行完成") 221 | with open(filenameOne, "w", encoding="utf-8") as f: 222 | f.write("第一次运行完成,请再次运行exe文件,并不要删除此文件,等待2.txt生成\n") 223 | @classmethod 224 | def IfFinishTask(cls): 225 | ''' 226 | 本次分发的任务是否完成 227 | :return: 228 | ''' 229 | filenameOne = "1.txt" 230 | filenameTwo = "2.txt" 231 | return os.path.exists(filenameTwo) 232 | -------------------------------------------------------------------------------- /CnkiSpider/settings/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for CnkiSpider project 2 | # 3 | # For simplicity, this file contains only settings considered important or 4 | # commonly used. 
You can find more settings consulting the documentation: 5 | # 6 | # https://docs.scrapy.org/en/latest/topics/settings.html 7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 9 | 10 | import os 11 | import random 12 | # from CnkiSpider.commonUtils import SpiderTypeEnum 13 | 14 | BOT_NAME = 'CnkiSpider' 15 | 16 | SPIDER_MODULES = ['CnkiSpider.spiders'] 17 | NEWSPIDER_MODULE = 'CnkiSpider.spiders' 18 | 19 | # PROJECT_ROOT = os.path.abspath(os.path.dirname(__file__)) 20 | 21 | 22 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 23 | #USER_AGENT = 'CnkiSpider (+http://www.yourdomain.com)' 24 | 25 | # Obey robots.txt rules 26 | ROBOTSTXT_OBEY = False 27 | 28 | # 日志级别 29 | # LOG_LEVEL = 'WARNING' 30 | LOG_LEVEL = 'INFO' 31 | # LOG_FILE = '/home/lhp/common.log' 32 | 33 | RETRY_ENABLED = True 34 | RETRY_TIMES = 3 35 | DOWNLOAD_TIMEOUT = 5 36 | 37 | # 控制学科分类读取文件 38 | # True为./dataSrc/codeTest.txt 39 | # 删去此配置或 False 为 ./dataSrc/code.txt 40 | # 正常运行设置为 False,测试时设置为 True ,只爬一部分学科分类对应的文献 41 | CODE_FILE_TEST_MODE = False 42 | 43 | # 分发模式,开启此模式后,每次启动会重置当前爬虫状态(即重新链接获取,但链接解析不会重复) 44 | DISTRUBUTE_MODE = False 45 | 46 | # SPIDER_TYPE = SpiderTypeEnum.PATENT.value 47 | # SPIDER_TYPE = SpiderTypeEnum.PAPER_AND_ACH.value 48 | # 爬取的文献的开始日期和结束日期,此配置已废弃,开始日期与结束日期请在 status 表中设置 49 | # START_DATE = '2020-07-01' 50 | # END_DATE = '2020-07-31' 51 | 52 | ####################### proxy begin ########################## 53 | # 是否开启,不开启就用本机 54 | PROXY_OPEN = False 55 | # PROXY_ID = "2021041600266801515" 56 | PROXY_ID = "2121042901240321356" 57 | PROXY_SECRET = "Ibmt35hmHCMlE2fH" 58 | # 一次请求的代理的数量,目前买的套餐最高支持5 59 | PROXY_LIMIT = 5 60 | PROXY_FORMAT = "json" 61 | PROXY_AUTH_MODE = "basic" 62 | # 每个代理重复利用的次数 63 | PROXY_REUSE = 15 64 | 65 | ###################### proxy end ########################## 66 | 67 | 68 | ####################### redis begin ############################## 69 | 70 | # 指定Redis的主机名和端口 71 | # 敏感信息配置在环境变量中 72 | REDIS_HOST = os.environ.get("REDIS_HOST") 73 | REDIS_PORT = 16379 74 | 75 | # 设置密码 76 | REDIS_PARAMS = { 77 | 'password': os.environ.get("REDIS_PWD"), #密码 78 | 'db': 9 # 切换数据库 79 | } 80 | 81 | # 调度器启用Redis存储Requests队列 82 | SCHEDULER = "scrapy_redis.scheduler.Scheduler" 83 | 84 | # 确保所有的爬虫实例使用Redis进行重复过滤 85 | DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter" 86 | 87 | # 将Requests队列持久化到Redis,可支持暂停或重启爬虫 88 | SCHEDULER_PERSIST = True 89 | 90 | # [浅谈深度优先和广度优先(scrapy-redis) - 风不再来 - 博客园](https://www.cnblogs.com/yunxintryyoubest/p/9955867.html) 91 | # Requests的调度策略,默认优先级队列 92 | # SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue' # 优先级队列 93 | SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.LifoQueue' # 深度优先 94 | # SCHEDULER_QUEUE_CLASS='scrapy_redis.queue.FifoQueue' # 广度优先 95 | 96 | # 将爬取到的items保存到Redis 以便进行后续处理 97 | # ITEM_PIPELINES = { 98 | # 'scrapy_redis.pipelines.RedisPipeline': 300 99 | # } 100 | 101 | ITEM_PIPELINES = { 102 | # 'scrapy_redis.pipelines.RedisPipeline': 299, 103 | 'CnkiSpider.pipelines.CnkispiderPipeline': 300, 104 | } 105 | 106 | ######################## redis end ############################## 107 | 108 | ######################### mysql begin ########################## 109 | 110 | # 敏感信息配置在环境变量中 111 | MYSQL_HOST = os.environ.get("MYSQL_HOST") 112 | MYSQL_PORT = "3306" 113 | MYSQL_USER = "root" 114 | MYSQL_PASSWD = os.environ.get("MYSQL_PWD") 115 | MYSQL_DATABASE = "ZhiWangSpider" 116 | 117 | #表名,将本机和服务器跑的表分开 118 | MYSQL_TABLE = "status" 119 | 120 | 
#状态表名,记录了当前爬取的日期和代码 121 | STATUS_TABLE = "status" 122 | # 链接获取错误表(用来重新获取某日期与某学科分类对应下的所有链接) 123 | ERROR_CODE_TABLE = "errorCode" 124 | # 链接请求错误表(用来重新请求错误链接) 125 | ERROR_LINK_TABLE = "errorLink" 126 | 127 | ######################### mysql end ############################# 128 | 129 | 130 | # 重试状态码,其中400,407,478是猿人云返回的错误码 131 | RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 400, 407, 478] 132 | 133 | ############################ 下载速度控制 begin ############################ 134 | 135 | # 下载延时 136 | # DOWNLOAD_DELAY = 3 137 | # 随机将上面的下载延时乘0.5-1.5 138 | # RANDOMIZE_DOWNLOAD_DELAY=True 139 | 140 | # 自动限速,必开,插件会自动调整合适的下载延迟 141 | AUTOTHROTTLE_ENABLED = True 142 | AUTOTHROTTLE_TARGET_CONCURRENCY = 8 143 | AUTOTHROTTLE_START_DELAY = 3 144 | 145 | # 并发控制,默认16 146 | CONCURRENT_REQUESTS_PER_DOMAIN = 4 147 | #CONCURRENT_REQUESTS_PER_IP = 16 148 | 149 | 150 | ####################### 下载速度控制 end ######################## 151 | 152 | # 代理中间件的开启与否 153 | proxyMiddleWarePrio = 110 if PROXY_OPEN else None 154 | 155 | DOWNLOADER_MIDDLEWARES = { 156 | 157 | 'CnkiSpider.customDownloadMiddlewares.CnkiSpiderHeaderMiddleware': 100, 158 | 159 | 'CnkiSpider.customDownloadMiddlewares.CnkispiderSpiderProxyMiddleware': proxyMiddleWarePrio, #代理中间件 160 | # 全局错误中间件,注意这里是改的response,而下载中间件的response的优先级是反的 161 | # 全局错误中间件和重试中间件(RetryAndGetFailedUrl)都是改写了process_response 162 | # 而全局错误肯定是在重试之后再处理的,所以全局错误中间件的优先级比重试高(response优先级相反) 163 | 'CnkiSpider.customDownloadMiddlewares.ProcessAllExceptionMiddleware': 120, 164 | 'CnkiSpider.customDownloadMiddlewares.RetryAndGetFailedUrl': 130, 165 | # 'CnkiSpider.middlewares.CnkispiderDownloaderMiddleware': 543, 166 | 'scrapy.downloadermiddlewares.retry.RetryMiddleware':None, #禁用自带重试中间件 167 | } 168 | 169 | # 自定义cookie 170 | COOKIES_ENABLED = True 171 | 172 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 173 | #CONCURRENT_REQUESTS = 32 174 | 175 | # Configure a delay for requests for the same website (default: 0) 176 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay 177 | # See also autothrottle settings and docs 178 | #DOWNLOAD_DELAY = 3 179 | # The download delay setting will honor only one of: 180 | 181 | 182 | # Disable cookies (enabled by default) 183 | # COOKIES_ENABLED = True 184 | 185 | # Disable Telnet Console (enabled by default) 186 | #TELNETCONSOLE_ENABLED = False 187 | 188 | # Override the default request headers: 189 | #DEFAULT_REQUEST_HEADERS = { 190 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 191 | # 'Accept-Language': 'en', 192 | #} 193 | 194 | # Enable or disable spider middlewares 195 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html 196 | #SPIDER_MIDDLEWARES = { 197 | # 'CnkiSpider.middlewares.CnkispiderSpiderMiddleware': 543, 198 | #} 199 | 200 | # Enable or disable downloader middlewares 201 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 202 | 203 | 204 | # Enable or disable extensions 205 | # See https://docs.scrapy.org/en/latest/topics/extensions.html 206 | #EXTENSIONS = { 207 | # 'scrapy.extensions.telnet.TelnetConsole': None, 208 | #} 209 | 210 | # Configure item pipelines 211 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html 212 | #ITEM_PIPELINES = { 213 | # 'CnkiSpider.pipelines.CnkispiderPipeline': 300, 214 | #} 215 | 216 | # Enable and configure the AutoThrottle extension (disabled by default) 217 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html 218 | #AUTOTHROTTLE_ENABLED = True 219 | 
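# Sketch, illustrative only: in Scrapy, assigning a middleware the priority None
# disables it, which is what both the proxyMiddleWarePrio toggle and the
# 'scrapy.downloadermiddlewares.retry.RetryMiddleware': None entry above rely on.
# The conditional expression is equivalent to building the dict in two steps:
#
#   DOWNLOADER_MIDDLEWARES = {
#       'CnkiSpider.customDownloadMiddlewares.CnkiSpiderHeaderMiddleware': 100,
#       'CnkiSpider.customDownloadMiddlewares.ProcessAllExceptionMiddleware': 120,
#       'CnkiSpider.customDownloadMiddlewares.RetryAndGetFailedUrl': 130,
#       'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
#   }
#   if PROXY_OPEN:
#       DOWNLOADER_MIDDLEWARES['CnkiSpider.customDownloadMiddlewares.CnkispiderSpiderProxyMiddleware'] = 110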
# The initial download delay 220 | #AUTOTHROTTLE_START_DELAY = 5 221 | # The maximum download delay to be set in case of high latencies 222 | #AUTOTHROTTLE_MAX_DELAY = 60 223 | # The average number of requests Scrapy should be sending in parallel to 224 | # each remote server 225 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 226 | # Enable showing throttling stats for every response received: 227 | #AUTOTHROTTLE_DEBUG = False 228 | 229 | # Enable and configure HTTP caching (disabled by default) 230 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 231 | #HTTPCACHE_ENABLED = True 232 | #HTTPCACHE_EXPIRATION_SECS = 0 233 | #HTTPCACHE_DIR = 'httpcache' 234 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 235 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 236 | 237 | DEFAULT_REQUEST_HEADERS = { 238 | "Accept": "*/*", 239 | "Accept-Encoding": "gzip, deflate, br", 240 | "Accept-Language": "zh-CN,zh;q=0.9", 241 | "Connection": "keep-alive", 242 | "Content-Type": "application/x-www-form-urlencoded", 243 | "Host": "kns.cnki.net", 244 | "Origin": "https://kns.cnki.net", 245 | # "Referer": "https://kns.cnki.net/kns/brief/result.aspx?dbprefix=SCDB&crossDbcodes=CJFQ,CDFD,CMFD,CPFD,IPFD,CCND,CCJD", 246 | "Sec-Fetch-Dest": "empty", 247 | "Sec-Fetch-Mode": "cors", 248 | "Sec-Fetch-Site": "same-origin", 249 | # "User-Agent": random.choice(USER_AGENTS) 250 | } 251 | -------------------------------------------------------------------------------- /CnkiSpider/settings/settings_distribute.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for CnkiSpider project 2 | # 3 | # For simplicity, this file contains only settings considered important or 4 | # commonly used. 
You can find more settings consulting the documentation: 5 | # 6 | # https://docs.scrapy.org/en/latest/topics/settings.html 7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 9 | 10 | import os 11 | import random 12 | from configparser import ConfigParser 13 | 14 | BOT_NAME = 'CnkiSpider' 15 | 16 | SPIDER_MODULES = ['CnkiSpider.spiders'] 17 | NEWSPIDER_MODULE = 'CnkiSpider.spiders' 18 | 19 | # PROJECT_ROOT = os.path.abspath(os.path.dirname(__file__)) 20 | 21 | 22 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 23 | #USER_AGENT = 'CnkiSpider (+http://www.yourdomain.com)' 24 | 25 | # Obey robots.txt rules 26 | ROBOTSTXT_OBEY = False 27 | 28 | # 日志级别 29 | # LOG_LEVEL = 'WARNING' 30 | LOG_LEVEL = 'INFO' 31 | # LOG_FILE = '/home/lhp/common.log' 32 | 33 | RETRY_ENABLED = True 34 | RETRY_TIMES = 3 35 | DOWNLOAD_TIMEOUT = 5 36 | 37 | # 控制学科分类读取文件 38 | # True为./dataSrc/codeTest.txt 39 | # 删去此配置或 False 为 ./dataSrc/code.txt 40 | CODE_FILE_TEST_MODE = False 41 | 42 | # 分发模式,开启此模式后,每次启动会重置当前爬虫状态(即重新链接获取,但链接解析不会重复) 43 | DISTRUBUTE_MODE = True 44 | 45 | 46 | cp = ConfigParser() 47 | # 与exe同目录 48 | print(os.getcwd()) 49 | cp.read('./config.cfg') 50 | # spiderType = cp.get('spider', 'type') 51 | DATABASE_VERSION = cp.get('database', 'no') 52 | 53 | # SPIDER_TYPE = SpiderTypeEnum.PATENT.value 54 | # SPIDER_TYPE = SpiderTypeEnum.PAPER_AND_ACH.value 55 | # START_DATE = '2020-07-01' 56 | # END_DATE = '2020-07-31' 57 | 58 | ####################### proxy begin ########################## 59 | # 是否开启,不开启就用本机 60 | PROXY_OPEN = False 61 | # PROXY_ID = "2021041600266801515" 62 | PROXY_ID = "2121042901240321356" 63 | PROXY_SECRET = "Ibmt35hmHCMlE2fH" 64 | # 一次请求的代理的数量,目前买的套餐最高支持5 65 | PROXY_LIMIT = 5 66 | PROXY_FORMAT = "json" 67 | PROXY_AUTH_MODE = "basic" 68 | # 每个代理重复利用的次数 69 | PROXY_REUSE = 10 70 | 71 | ###################### proxy end ########################## 72 | 73 | 74 | ####################### redis begin ############################## 75 | 76 | # 指定Redis的主机名和端口 77 | # 敏感信息配置在环境变量中 78 | REDIS_HOST = os.environ.get("REDIS_HOST") 79 | REDIS_PORT = 16379 80 | 81 | # 设置密码 82 | REDIS_PARAMS = { 83 | 'password': os.environ.get("REDIS_PWD"), #远程连接的密码 84 | 'db': int(DATABASE_VERSION) # 切换数据库 85 | } 86 | 87 | # 调度器启用Redis存储Requests队列 88 | SCHEDULER = "scrapy_redis.scheduler.Scheduler" 89 | 90 | # 确保所有的爬虫实例使用Redis进行重复过滤 91 | DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter" 92 | 93 | # 将Requests队列持久化到Redis,可支持暂停或重启爬虫 94 | SCHEDULER_PERSIST = True 95 | 96 | # [浅谈深度优先和广度优先(scrapy-redis) - 风不再来 - 博客园](https://www.cnblogs.com/yunxintryyoubest/p/9955867.html) 97 | # Requests的调度策略,默认优先级队列 98 | # SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue' # 优先级队列 99 | SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.LifoQueue' # 深度优先 100 | # SCHEDULER_QUEUE_CLASS='scrapy_redis.queue.FifoQueue' # 广度优先 101 | 102 | # 将爬取到的items保存到Redis 以便进行后续处理 103 | # ITEM_PIPELINES = { 104 | # 'scrapy_redis.pipelines.RedisPipeline': 300 105 | # } 106 | 107 | ITEM_PIPELINES = { 108 | # 'scrapy_redis.pipelines.RedisPipeline': 299, 109 | 'CnkiSpider.pipelines.CnkispiderPipeline': 300, 110 | } 111 | 112 | ######################## redis end ############################## 113 | 114 | ######################### mysql begin ########################## 115 | # 敏感信息配置在环境变量中 116 | MYSQL_HOST = os.environ.get("MYSQL_HOST") 117 | MYSQL_PORT = "3306" 118 | MYSQL_USER = "root" 119 | MYSQL_PASSWD = os.environ.get("MYSQL_PWD") 120 
| MYSQL_DATABASE = "ZhiWangSpider" 121 | 122 | #表名,将本机和服务器跑的表分开 123 | MYSQL_TABLE = "status" + str(DATABASE_VERSION) 124 | 125 | #状态表名,记录了当前爬取的日期和代码 126 | STATUS_TABLE = "status" + str(DATABASE_VERSION) 127 | # 链接获取错误表(用来重新获取某日期与某学科分类对应下的所有链接) 128 | ERROR_CODE_TABLE = "errorCode" + str(DATABASE_VERSION) 129 | # 链接请求错误表(用来重新请求错误链接) 130 | ERROR_LINK_TABLE = "errorLink" + str(DATABASE_VERSION) 131 | 132 | ######################### mysql end ############################# 133 | 134 | 135 | # 重试状态码,其中400,407,478是猿人云返回的错误码 136 | RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 400, 407, 478] 137 | 138 | ############################ 下载速度控制 begin ############################ 139 | 140 | # 下载延时 141 | # DOWNLOAD_DELAY = 3 142 | # 随机将上面的下载延时乘0.5-1.5 143 | # RANDOMIZE_DOWNLOAD_DELAY=True 144 | 145 | # 自动限速,必开,插件会自动调整合适的下载延迟 146 | AUTOTHROTTLE_ENABLED = True 147 | AUTOTHROTTLE_TARGET_CONCURRENCY = 8 148 | AUTOTHROTTLE_START_DELAY = 3 149 | 150 | # 并发控制,默认16 151 | CONCURRENT_REQUESTS_PER_DOMAIN = 4 152 | #CONCURRENT_REQUESTS_PER_IP = 16 153 | 154 | 155 | ####################### 下载速度控制 end ######################## 156 | 157 | # 代理中间件的开启与否 158 | proxyMiddleWarePrio = 110 if PROXY_OPEN else None 159 | 160 | DOWNLOADER_MIDDLEWARES = { 161 | 162 | 'CnkiSpider.customDownloadMiddlewares.CnkiSpiderHeaderMiddleware': 100, 163 | 164 | 'CnkiSpider.customDownloadMiddlewares.CnkispiderSpiderProxyMiddleware': proxyMiddleWarePrio, #代理中间件 165 | # 全局错误中间件,注意这里是改的response,而下载中间件的response的优先级是反的 166 | # 全局错误中间件和重试中间件(RetryAndGetFailedUrl)都是改写了process_response 167 | # 而全局错误肯定是在重试之后再处理的,所以全局错误中间件的优先级比重试高(response优先级相反) 168 | 'CnkiSpider.customDownloadMiddlewares.ProcessAllExceptionMiddleware': 120, 169 | 'CnkiSpider.customDownloadMiddlewares.RetryAndGetFailedUrl': 130, 170 | # 'CnkiSpider.middlewares.CnkispiderDownloaderMiddleware': 543, 171 | 'scrapy.downloadermiddlewares.retry.RetryMiddleware':None, #禁用自带重试中间件 172 | } 173 | 174 | # 自定义cookie 175 | COOKIES_ENABLED = True 176 | 177 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 178 | #CONCURRENT_REQUESTS = 32 179 | 180 | # Configure a delay for requests for the same website (default: 0) 181 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay 182 | # See also autothrottle settings and docs 183 | #DOWNLOAD_DELAY = 3 184 | # The download delay setting will honor only one of: 185 | 186 | 187 | # Disable cookies (enabled by default) 188 | # COOKIES_ENABLED = True 189 | 190 | # Disable Telnet Console (enabled by default) 191 | #TELNETCONSOLE_ENABLED = False 192 | 193 | # Override the default request headers: 194 | #DEFAULT_REQUEST_HEADERS = { 195 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 196 | # 'Accept-Language': 'en', 197 | #} 198 | 199 | # Enable or disable spider middlewares 200 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html 201 | #SPIDER_MIDDLEWARES = { 202 | # 'CnkiSpider.middlewares.CnkispiderSpiderMiddleware': 543, 203 | #} 204 | 205 | # Enable or disable downloader middlewares 206 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 207 | 208 | 209 | # Enable or disable extensions 210 | # See https://docs.scrapy.org/en/latest/topics/extensions.html 211 | #EXTENSIONS = { 212 | # 'scrapy.extensions.telnet.TelnetConsole': None, 213 | #} 214 | 215 | # Configure item pipelines 216 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html 217 | #ITEM_PIPELINES = { 218 | # 'CnkiSpider.pipelines.CnkispiderPipeline': 300, 219 
| #} 220 | 221 | # Enable and configure the AutoThrottle extension (disabled by default) 222 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html 223 | #AUTOTHROTTLE_ENABLED = True 224 | # The initial download delay 225 | #AUTOTHROTTLE_START_DELAY = 5 226 | # The maximum download delay to be set in case of high latencies 227 | #AUTOTHROTTLE_MAX_DELAY = 60 228 | # The average number of requests Scrapy should be sending in parallel to 229 | # each remote server 230 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 231 | # Enable showing throttling stats for every response received: 232 | #AUTOTHROTTLE_DEBUG = False 233 | 234 | # Enable and configure HTTP caching (disabled by default) 235 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 236 | #HTTPCACHE_ENABLED = True 237 | #HTTPCACHE_EXPIRATION_SECS = 0 238 | #HTTPCACHE_DIR = 'httpcache' 239 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 240 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 241 | 242 | DEFAULT_REQUEST_HEADERS = { 243 | # Referer和User-Agent在customDownloadMiddlewares.CnkiSpiderHeaderMiddleware设置 244 | "Accept": "*/*", 245 | "Accept-Encoding": "gzip, deflate, br", 246 | "Accept-Language": "zh-CN,zh;q=0.9", 247 | "Connection": "keep-alive", 248 | "Content-Type": "application/x-www-form-urlencoded", 249 | "Host": "kns.cnki.net", 250 | "Origin": "https://kns.cnki.net", 251 | # "Referer": "https://kns.cnki.net/kns/brief/result.aspx?dbprefix=SCDB&crossDbcodes=CJFQ,CDFD,CMFD,CPFD,IPFD,CCND,CCJD", 252 | "Sec-Fetch-Dest": "empty", 253 | "Sec-Fetch-Mode": "cors", 254 | "Sec-Fetch-Site": "same-origin", 255 | # "User-Agent": random.choice(USER_AGENTS) 256 | } 257 | -------------------------------------------------------------------------------- /dataSrc/code.txt: -------------------------------------------------------------------------------- 1 | A001_1 2 | A001_2 3 | A001_3 4 | A001_4 5 | A002_1 6 | A002_2 7 | A002_3 8 | A002_4 9 | A002_5 10 | A002_6 11 | A002_7 12 | A002_8 13 | A002_9 14 | A002_A 15 | A002_B 16 | A002_C 17 | A002_D 18 | A003_1 19 | A003_2 20 | A004_1 21 | A004_2 22 | A004_3 23 | A004_4 24 | A004_5 25 | A004_6 26 | A004_7 27 | A004_8 28 | A004_9 29 | A005_1 30 | A005_2 31 | A005_3 32 | A005_4 33 | A005_5 34 | A005_6 35 | A005_7 36 | A005_8 37 | A005_9 38 | A005_A 39 | A005_B 40 | A005_C 41 | A005_D 42 | A005_E 43 | A005_F 44 | A005_G 45 | A005_H 46 | A005_I 47 | A005_J 48 | A006_1 49 | A006_2 50 | A006_3 51 | A006_4 52 | A006_5 53 | A006_6 54 | A006_7 55 | A006_8 56 | A006_9 57 | A006_A 58 | A006_B 59 | A006_C 60 | A006_D 61 | A006_E 62 | A006_F 63 | A006_G 64 | A007_1 65 | A007_2 66 | A007_3 67 | A007_4 68 | A007_5 69 | A007_6 70 | A007_7 71 | A007_8 72 | A007_9 73 | A007_A 74 | A008_1 75 | A008_2 76 | A009_1 77 | A009_2 78 | A009_3 79 | A009_4 80 | A009_5 81 | A009_6 82 | A009_7 83 | A009_8 84 | A009_9 85 | A009_A 86 | A009_B 87 | A009_C 88 | A010_1 89 | A010_2 90 | A010_3 91 | A010_4 92 | A010_5 93 | A010_6 94 | A010_7 95 | A010_8 96 | A010_9 97 | A010_A 98 | A010_B 99 | A010_C 100 | A010_D 101 | A010_E 102 | A011_1 103 | A011_2 104 | A011_3 105 | A011_4 106 | A011_5 107 | A011_6 108 | A011_7 109 | A011_8 110 | A011_9 111 | A011_A 112 | A011_B 113 | A011_C 114 | A011_D 115 | A011_E 116 | A011_F 117 | A011_G 118 | A011_H 119 | A011_I 120 | A011_K 121 | A012_1 122 | A012_2 123 | A012_3 124 | A012_4 125 | A012_5 126 | A012_6 127 | A012_7 128 | A012_8 129 | A012_9 130 | A012_A 131 | A012_B 132 | A012_C 133 | A012_D 134 | A012_E 135 | A012_F 136 | 
A013_1 137 | A013_2 138 | B014_1 139 | B014_2 140 | B014_3 141 | B014_4 142 | B014_5 143 | B014_6 144 | B014_7 145 | B014_8 146 | B015_1 147 | B015_2 148 | B015_3 149 | B015_4 150 | B015_5 151 | B015_6 152 | B015_7 153 | B015_8 154 | B016_1 155 | B016_2 156 | B016_3 157 | B016_4 158 | B016_5 159 | B016_6 160 | B016_7 161 | B016_8 162 | B017_1 163 | B017_2 164 | B017_3 165 | B017_4 166 | B017_5 167 | B017_6 168 | B017_7 169 | B018_1 170 | B018_2 171 | B018_3 172 | B018_4 173 | B018_5 174 | B018_6 175 | B018_7 176 | B019_1 177 | B019_2 178 | B019_3 179 | B019_4 180 | B019_5 181 | B019_6 182 | B020_1 183 | B020_2 184 | B020_3 185 | B020_4 186 | B020_5 187 | B020_6 188 | B020_7 189 | B020_8 190 | B020_9 191 | B020_A 192 | B020_B 193 | B020_C 194 | B021_1 195 | B021_2 196 | B021_3 197 | B021_4 198 | B021_5 199 | B021_6 200 | B021_7 201 | B021_8 202 | B022_1 203 | B022_2 204 | B022_3 205 | B022_4 206 | B022_5 207 | B022_6 208 | B022_7 209 | B022_8 210 | B022_9 211 | B022_A 212 | B022_B 213 | B022_C 214 | B023_1 215 | B023_2 216 | B023_3 217 | B023_4 218 | B023_5 219 | B023_6 220 | B023_7 221 | B023_8 222 | B023_9 223 | B023_A 224 | B023_B 225 | B024_1 226 | B024_2 227 | B024_3 228 | B024_4 229 | B024_5 230 | B024_6 231 | B024_7 232 | B024_8 233 | B024_9 234 | B024_A 235 | B024_B 236 | B024_C 237 | B024_D 238 | B024_E 239 | B025_1 240 | B025_2 241 | B025_3 242 | B025_4 243 | B025_5 244 | B025_6 245 | B025_7 246 | B026_1 247 | B026_2 248 | B026_3 249 | B026_4 250 | B026_5 251 | B026_6 252 | B027_1 253 | B027_2 254 | B027_3 255 | B027_4 256 | B027_5 257 | B027_6 258 | C028_1 259 | C028_2 260 | C028_3 261 | C028_4 262 | C028_5 263 | C028_6 264 | C028_7 265 | C028_8 266 | C028_9 267 | C029_1 268 | C029_2 269 | C029_3 270 | C029_4 271 | C029_5 272 | C029_6 273 | C029_7 274 | C029_8 275 | C029_9 276 | C029_A 277 | C029_B 278 | C029_C 279 | C029_D 280 | C030_1 281 | C030_2 282 | C030_3 283 | C030_4 284 | C030_5 285 | C030_6 286 | C030_7 287 | C030_8 288 | C030_9 289 | C030_A 290 | C030_B 291 | C030_C 292 | C030_D 293 | C030_E 294 | C030_F 295 | C030_G 296 | C030_H 297 | C030_I 298 | C031_1 299 | C031_2 300 | C031_3 301 | C031_4 302 | C032_1 303 | C032_2 304 | C032_3 305 | C032_4 306 | C032_5 307 | C033_1 308 | C033_2 309 | C033_3 310 | C033_4 311 | C033_5 312 | C033_6 313 | C033_7 314 | C033_8 315 | C034_1 316 | C034_2 317 | C034_3 318 | C035_1 319 | C035_2 320 | C035_3 321 | C035_4 322 | C035_5 323 | C035_6 324 | C035_7 325 | C035_8 326 | C035_9 327 | C035_A 328 | C035_B 329 | C035_C 330 | C035_D 331 | C035_E 332 | C036_1 333 | C036_2 334 | C036_3 335 | C036_4 336 | C036_51 337 | C036_52 338 | C036_53 339 | C036_54 340 | C036_55 341 | C036_56 342 | C036_57 343 | C036_58 344 | C036_59 345 | C036_6 346 | C036_7 347 | C036_8 348 | C036_9 349 | C036_A 350 | C036_B 351 | C036_C 352 | C036_D 353 | C036_E 354 | C036_F 355 | C036_G 356 | C037_1 357 | C037_2 358 | C037_3 359 | C037_4 360 | C037_5 361 | C037_6 362 | C037_7 363 | C037_8 364 | C037_9 365 | C037_A 366 | C038_1 367 | C038_2 368 | C038_3 369 | C039_1 370 | C039_2 371 | C039_3 372 | C039_4 373 | C039_5 374 | C039_6 375 | C039_7 376 | C040_1 377 | C040_2 378 | C040_3 379 | C040_4 380 | C040_5 381 | C040_6 382 | C040_7 383 | C040_8 384 | C040_9 385 | C041_1 386 | C041_2 387 | C041_3 388 | C041_4 389 | C041_5 390 | C041_6 391 | C041_7 392 | C041_8 393 | C042_1 394 | C042_2 395 | C042_3 396 | C042_4 397 | C042_5 398 | C042_6 399 | C042_7 400 | C042_8 401 | C042_9 402 | C042_A 403 | C042_B 404 | C042_C 405 | C042_D 406 | D043_1 407 | D043_2 408 | D043_3 
409 | D043_4 410 | D043_5 411 | D043_6 412 | D043_7 413 | D043_8 414 | D043_9 415 | D044_1 416 | D044_2 417 | D044_3 418 | D044_4 419 | D044_5 420 | D044_6 421 | D044_7 422 | D044_8 423 | D044_9 424 | D045_1 425 | D045_2 426 | D045_3 427 | D045_4 428 | D045_5 429 | D045_6 430 | D045_7 431 | D045_8 432 | D046_1 433 | D046_2 434 | D046_3 435 | D046_4 436 | D046_5 437 | D046_6 438 | D046_7 439 | D046_8 440 | D046_9 441 | D046_A 442 | D046_B 443 | D046_C 444 | D047_1 445 | D047_2 446 | D047_3 447 | D047_4 448 | D047_5 449 | D047_6 450 | D047_7 451 | D047_8 452 | D048_1 453 | D048_2 454 | D048_3 455 | D048_4 456 | D048_5 457 | D048_6 458 | D048_7 459 | D049_1 460 | D049_2 461 | D049_3 462 | D049_4 463 | D049_5 464 | D049_6 465 | D049_7 466 | D049_8 467 | D050_1 468 | D050_2 469 | D050_3 470 | D050_4 471 | D051_1 472 | D051_2 473 | D051_3 474 | D052_1 475 | D052_2 476 | D052_3 477 | D052_4 478 | D052_5 479 | D052_6 480 | D052_7 481 | D052_8 482 | E053_1 483 | E053_2 484 | E053_3 485 | E053_4 486 | E053_5 487 | E053_6 488 | E053_7 489 | E053_8 490 | E053_9 491 | E053_A 492 | E054_1 493 | E054_2 494 | E054_3 495 | E054_4 496 | E054_5 497 | E054_6 498 | E055_0 499 | E055_1 500 | E055_2 501 | E055_3 502 | E055_4 503 | E055_5 504 | E055_6 505 | E055_7 506 | E055_8 507 | E056_1 508 | E056_2 509 | E056_3 510 | E056_4 511 | E056_5 512 | E056_6 513 | E056_7 514 | E056_8 515 | E056_9 516 | E056_A 517 | E056_B 518 | E056_C 519 | E056_D 520 | E056_E 521 | E056_F 522 | E056_G 523 | E057_1 524 | E057_2 525 | E057_3 526 | E057_4 527 | E057_5 528 | E057_6 529 | E057_7 530 | E057_8 531 | E057_9 532 | E057_A 533 | E057_B 534 | E058_1 535 | E058_2 536 | E059_1 537 | E059_2 538 | E059_3 539 | E059_4 540 | E059_5 541 | E059_6 542 | E059_7 543 | E059_8 544 | E059_9 545 | E059_A 546 | E059_B 547 | E059_C 548 | E060_1 549 | E060_2 550 | E060_3 551 | E060_4 552 | E060_5 553 | E060_6 554 | E060_7 555 | E060_8 556 | E060_9 557 | E061_1 558 | E061_2 559 | E061_3 560 | E061_4 561 | E061_5 562 | E061_6 563 | E061_7 564 | E061_8 565 | E061_9 566 | E061_A 567 | E061_B 568 | E062_1 569 | E062_2 570 | E062_3 571 | E062_4 572 | E062_6 573 | E062_7 574 | E063_1 575 | E063_2 576 | E063_3 577 | E063_4 578 | E063_5 579 | E063_6 580 | E064_1 581 | E064_2 582 | E064_3 583 | E064_4 584 | E064_5 585 | E064_6 586 | E064_7 587 | E065_1 588 | E065_2 589 | E066_1 590 | E066_2 591 | E066_3 592 | E066_4 593 | E066_5 594 | E066_6 595 | E066_7 596 | E066_8 597 | E067_1 598 | E067_2 599 | E067_3 600 | E067_4 601 | E067_5 602 | E067_6 603 | E067_7 604 | E067_8 605 | E067_9 606 | E068_1 607 | E068_2 608 | E068_3 609 | E068_4 610 | E068_5 611 | E068_6 612 | E069_1 613 | E069_2 614 | E069_3 615 | E069_4 616 | E069_5 617 | E069_6 618 | E069_7 619 | E069_8 620 | E069_9 621 | E070_1 622 | E070_2 623 | E070_3 624 | E070_4 625 | E070_5 626 | E070_6 627 | E070_7 628 | E070_8 629 | E070_9 630 | E070_A 631 | E071_1 632 | E071_2 633 | E071_3 634 | E071_4 635 | E071_5 636 | E071_6 637 | E071_7 638 | E072_1 639 | E072_2 640 | E072_3 641 | E072_4 642 | E072_5 643 | E072_6 644 | E072_7 645 | E072_8 646 | E072_9 647 | E072_A 648 | E072_B 649 | E072_C 650 | E072_D 651 | E073_1 652 | E073_2 653 | E073_3 654 | E073_4 655 | E073_5 656 | E073_6 657 | E073_8 658 | E074_1 659 | E074_2 660 | E074_3 661 | E074_4 662 | E074_5 663 | E074_6 664 | E075_1 665 | E075_2 666 | E076_1 667 | E076_2 668 | E076_3 669 | E076_4 670 | E076_5 671 | E076_6 672 | E076_7 673 | E077_1 674 | E077_2 675 | E077_3 676 | E078_1 677 | E078_2 678 | E078_3 679 | E078_4 680 | E079_1 681 | E079_2 682 | 
E079_3 683 | E079_4 684 | E079_5 685 | E079_6 686 | E079_7 687 | E079_8 688 | E080_1 689 | E080_2 690 | E080_3 691 | E080_4 692 | E080_5 693 | E080_6 694 | E080_7 695 | E080_8 696 | E080_9 697 | E080_A 698 | I135_1 699 | I135_2 700 | I135_3 701 | I135_4 702 | I135_5 703 | I135_6 704 | I135_7 705 | I135_8 706 | I136_1 707 | I136_2 708 | I136_3 709 | I136_4 710 | I136_5 711 | I136_6 712 | I136_7 713 | I136_8 714 | I136_9 715 | I136_A 716 | I136_B 717 | I136_C 718 | I136_D 719 | I136_E 720 | I136_F 721 | I136_G 722 | I137_1 723 | I137_2 724 | I137_3 725 | I137_4 726 | I137_5 727 | I138_1 728 | I138_2 729 | I138_3 730 | I138_4 731 | I138_5 732 | I138_6 733 | I138_7 734 | I138_8 735 | I138_9 736 | I138_A 737 | I138_B 738 | I138_C 739 | I139_1 740 | I139_2 741 | I139_3 742 | I139_4 743 | I139_5 744 | I139_6 745 | I139_7 746 | I139_8 747 | I140_1 748 | I140_2 749 | I140_3 750 | I140_4 751 | I140_5 752 | I140_6 753 | I141_1 754 | I141_2 755 | I141_3 756 | I142_1 757 | I142_2 758 | I142_3 759 | I142_4 760 | I142_5 761 | I142_6 762 | I142_7 763 | I142_8 764 | I143_1 765 | I143_2 766 | I144_1 767 | I144_2 -------------------------------------------------------------------------------- /dataSrc/codeBak.txt: -------------------------------------------------------------------------------- 1 | A001_1 2 | A001_2 3 | A001_3 4 | A001_4 5 | A002_1 6 | A002_2 7 | A002_3 8 | A002_4 9 | A002_5 10 | A002_6 11 | A002_7 12 | A002_8 13 | A002_9 14 | A002_A 15 | A002_B 16 | A002_C 17 | A002_D 18 | A003_1 19 | A003_2 20 | A004_1 21 | A004_2 22 | A004_3 23 | A004_4 24 | A004_5 25 | A004_6 26 | A004_7 27 | A004_8 28 | A004_9 29 | A005_1 30 | A005_2 31 | A005_3 32 | A005_4 33 | A005_5 34 | A005_6 35 | A005_7 36 | A005_8 37 | A005_9 38 | A005_A 39 | A005_B 40 | A005_C 41 | A005_D 42 | A005_E 43 | A005_F 44 | A005_G 45 | A005_H 46 | A005_I 47 | A005_J 48 | A006_1 49 | A006_2 50 | A006_3 51 | A006_4 52 | A006_5 53 | A006_6 54 | A006_7 55 | A006_8 56 | A006_9 57 | A006_A 58 | A006_B 59 | A006_C 60 | A006_D 61 | A006_E 62 | A006_F 63 | A006_G 64 | A007_1 65 | A007_2 66 | A007_3 67 | A007_4 68 | A007_5 69 | A007_6 70 | A007_7 71 | A007_8 72 | A007_9 73 | A007_A 74 | A008_1 75 | A008_2 76 | A009_1 77 | A009_2 78 | A009_3 79 | A009_4 80 | A009_5 81 | A009_6 82 | A009_7 83 | A009_8 84 | A009_9 85 | A009_A 86 | A009_B 87 | A009_C 88 | A010_1 89 | A010_2 90 | A010_3 91 | A010_4 92 | A010_5 93 | A010_6 94 | A010_7 95 | A010_8 96 | A010_9 97 | A010_A 98 | A010_B 99 | A010_C 100 | A010_D 101 | A010_E 102 | A011_1 103 | A011_2 104 | A011_3 105 | A011_4 106 | A011_5 107 | A011_6 108 | A011_7 109 | A011_8 110 | A011_9 111 | A011_A 112 | A011_B 113 | A011_C 114 | A011_D 115 | A011_E 116 | A011_F 117 | A011_G 118 | A011_H 119 | A011_I 120 | A011_K 121 | A012_1 122 | A012_2 123 | A012_3 124 | A012_4 125 | A012_5 126 | A012_6 127 | A012_7 128 | A012_8 129 | A012_9 130 | A012_A 131 | A012_B 132 | A012_C 133 | A012_D 134 | A012_E 135 | A012_F 136 | A013_1 137 | A013_2 138 | B014_1 139 | B014_2 140 | B014_3 141 | B014_4 142 | B014_5 143 | B014_6 144 | B014_7 145 | B014_8 146 | B015_1 147 | B015_2 148 | B015_3 149 | B015_4 150 | B015_5 151 | B015_6 152 | B015_7 153 | B015_8 154 | B016_1 155 | B016_2 156 | B016_3 157 | B016_4 158 | B016_5 159 | B016_6 160 | B016_7 161 | B016_8 162 | B017_1 163 | B017_2 164 | B017_3 165 | B017_4 166 | B017_5 167 | B017_6 168 | B017_7 169 | B018_1 170 | B018_2 171 | B018_3 172 | B018_4 173 | B018_5 174 | B018_6 175 | B018_7 176 | B019_1 177 | B019_2 178 | B019_3 179 | B019_4 180 | B019_5 181 | B019_6 182 | B020_1 
183 | B020_2 184 | B020_3 185 | B020_4 186 | B020_5 187 | B020_6 188 | B020_7 189 | B020_8 190 | B020_9 191 | B020_A 192 | B020_B 193 | B020_C 194 | B021_1 195 | B021_2 196 | B021_3 197 | B021_4 198 | B021_5 199 | B021_6 200 | B021_7 201 | B021_8 202 | B022_1 203 | B022_2 204 | B022_3 205 | B022_4 206 | B022_5 207 | B022_6 208 | B022_7 209 | B022_8 210 | B022_9 211 | B022_A 212 | B022_B 213 | B022_C 214 | B023_1 215 | B023_2 216 | B023_3 217 | B023_4 218 | B023_5 219 | B023_6 220 | B023_7 221 | B023_8 222 | B023_9 223 | B023_A 224 | B023_B 225 | B024_1 226 | B024_2 227 | B024_3 228 | B024_4 229 | B024_5 230 | B024_6 231 | B024_7 232 | B024_8 233 | B024_9 234 | B024_A 235 | B024_B 236 | B024_C 237 | B024_D 238 | B024_E 239 | B025_1 240 | B025_2 241 | B025_3 242 | B025_4 243 | B025_5 244 | B025_6 245 | B025_7 246 | B026_1 247 | B026_2 248 | B026_3 249 | B026_4 250 | B026_5 251 | B026_6 252 | B027_1 253 | B027_2 254 | B027_3 255 | B027_4 256 | B027_5 257 | B027_6 258 | C028_1 259 | C028_2 260 | C028_3 261 | C028_4 262 | C028_5 263 | C028_6 264 | C028_7 265 | C028_8 266 | C028_9 267 | C029_1 268 | C029_2 269 | C029_3 270 | C029_4 271 | C029_5 272 | C029_6 273 | C029_7 274 | C029_8 275 | C029_9 276 | C029_A 277 | C029_B 278 | C029_C 279 | C029_D 280 | C030_1 281 | C030_2 282 | C030_3 283 | C030_4 284 | C030_5 285 | C030_6 286 | C030_7 287 | C030_8 288 | C030_9 289 | C030_A 290 | C030_B 291 | C030_C 292 | C030_D 293 | C030_E 294 | C030_F 295 | C030_G 296 | C030_H 297 | C030_I 298 | C031_1 299 | C031_2 300 | C031_3 301 | C031_4 302 | C032_1 303 | C032_2 304 | C032_3 305 | C032_4 306 | C032_5 307 | C033_1 308 | C033_2 309 | C033_3 310 | C033_4 311 | C033_5 312 | C033_6 313 | C033_7 314 | C033_8 315 | C034_1 316 | C034_2 317 | C034_3 318 | C035_1 319 | C035_2 320 | C035_3 321 | C035_4 322 | C035_5 323 | C035_6 324 | C035_7 325 | C035_8 326 | C035_9 327 | C035_A 328 | C035_B 329 | C035_C 330 | C035_D 331 | C035_E 332 | C036_1 333 | C036_2 334 | C036_3 335 | C036_4 336 | C036_5 337 | C036_6 338 | C036_7 339 | C036_8 340 | C036_9 341 | C036_A 342 | C036_B 343 | C036_C 344 | C036_D 345 | C036_E 346 | C036_F 347 | C036_G 348 | C037_1 349 | C037_2 350 | C037_3 351 | C037_4 352 | C037_5 353 | C037_6 354 | C037_7 355 | C037_8 356 | C037_9 357 | C037_A 358 | C038_1 359 | C038_2 360 | C038_3 361 | C039_1 362 | C039_2 363 | C039_3 364 | C039_4 365 | C039_5 366 | C039_6 367 | C039_7 368 | C040_1 369 | C040_2 370 | C040_3 371 | C040_4 372 | C040_5 373 | C040_6 374 | C040_7 375 | C040_8 376 | C040_9 377 | C041_1 378 | C041_2 379 | C041_3 380 | C041_4 381 | C041_5 382 | C041_6 383 | C041_7 384 | C041_8 385 | C042_1 386 | C042_2 387 | C042_3 388 | C042_4 389 | C042_5 390 | C042_6 391 | C042_7 392 | C042_8 393 | C042_A 394 | C042_B 395 | C042_C 396 | C042_D 397 | D043_1 398 | D043_2 399 | D043_3 400 | D043_4 401 | D043_5 402 | D043_6 403 | D043_7 404 | D043_8 405 | D043_9 406 | D044_1 407 | D044_2 408 | D044_3 409 | D044_4 410 | D044_5 411 | D044_6 412 | D044_7 413 | D044_8 414 | D044_9 415 | D045_1 416 | D045_2 417 | D045_3 418 | D045_4 419 | D045_5 420 | D045_6 421 | D045_7 422 | D045_8 423 | D046_1 424 | D046_2 425 | D046_3 426 | D046_4 427 | D046_5 428 | D046_6 429 | D046_7 430 | D046_8 431 | D046_9 432 | D046_A 433 | D046_B 434 | D046_C 435 | D047_1 436 | D047_2 437 | D047_3 438 | D047_4 439 | D047_5 440 | D047_6 441 | D047_7 442 | D047_8 443 | D048_1 444 | D048_2 445 | D048_3 446 | D048_4 447 | D048_5 448 | D048_6 449 | D048_7 450 | D049_1 451 | D049_2 452 | D049_3 453 | D049_4 454 | D049_5 455 | D049_6 456 | 
D049_7 457 | D049_8 458 | D050_1 459 | D050_2 460 | D050_3 461 | D050_4 462 | D051_1 463 | D051_2 464 | D051_3 465 | D052_1 466 | D052_2 467 | D052_3 468 | D052_4 469 | D052_5 470 | D052_6 471 | D052_7 472 | D052_8 473 | E053_1 474 | E053_2 475 | E053_3 476 | E053_4 477 | E053_5 478 | E053_6 479 | E053_7 480 | E053_8 481 | E053_9 482 | E053_A 483 | E054_1 484 | E054_2 485 | E054_3 486 | E054_4 487 | E054_5 488 | E054_6 489 | E055_0 490 | E055_1 491 | E055_2 492 | E055_3 493 | E055_4 494 | E055_5 495 | E055_6 496 | E055_7 497 | E055_8 498 | E056_1 499 | E056_2 500 | E056_3 501 | E056_4 502 | E056_5 503 | E056_6 504 | E056_7 505 | E056_8 506 | E056_9 507 | E056_A 508 | E056_B 509 | E056_C 510 | E056_D 511 | E056_E 512 | E056_F 513 | E056_G 514 | E057_1 515 | E057_2 516 | E057_3 517 | E057_4 518 | E057_5 519 | E057_6 520 | E057_7 521 | E057_8 522 | E057_9 523 | E057_A 524 | E057_B 525 | E058_1 526 | E058_2 527 | E059_1 528 | E059_2 529 | E059_3 530 | E059_4 531 | E059_5 532 | E059_6 533 | E059_7 534 | E059_8 535 | E059_9 536 | E059_A 537 | E059_B 538 | E059_C 539 | E060_1 540 | E060_2 541 | E060_3 542 | E060_4 543 | E060_5 544 | E060_6 545 | E060_7 546 | E060_8 547 | E060_9 548 | E061_1 549 | E061_2 550 | E061_3 551 | E061_4 552 | E061_5 553 | E061_6 554 | E061_7 555 | E061_8 556 | E061_9 557 | E061_A 558 | E061_B 559 | E062_1 560 | E062_2 561 | E062_3 562 | E062_4 563 | E062_6 564 | E062_7 565 | E063_1 566 | E063_2 567 | E063_3 568 | E063_4 569 | E063_5 570 | E063_6 571 | E064_1 572 | E064_2 573 | E064_3 574 | E064_4 575 | E064_5 576 | E064_6 577 | E064_7 578 | E065_1 579 | E065_2 580 | E066_1 581 | E066_2 582 | E066_3 583 | E066_4 584 | E066_5 585 | E066_6 586 | E066_7 587 | E066_8 588 | E067_1 589 | E067_2 590 | E067_3 591 | E067_4 592 | E067_5 593 | E067_6 594 | E067_7 595 | E067_8 596 | E067_9 597 | E068_1 598 | E068_2 599 | E068_3 600 | E068_4 601 | E068_5 602 | E068_6 603 | E069_1 604 | E069_2 605 | E069_3 606 | E069_4 607 | E069_5 608 | E069_6 609 | E069_7 610 | E069_8 611 | E069_9 612 | E070_1 613 | E070_2 614 | E070_3 615 | E070_4 616 | E070_5 617 | E070_6 618 | E070_7 619 | E070_8 620 | E070_9 621 | E070_A 622 | E071_1 623 | E071_2 624 | E071_3 625 | E071_4 626 | E071_5 627 | E071_6 628 | E071_7 629 | E072_1 630 | E072_2 631 | E072_3 632 | E072_4 633 | E072_5 634 | E072_6 635 | E072_7 636 | E072_8 637 | E072_9 638 | E072_A 639 | E072_B 640 | E072_C 641 | E072_D 642 | E073_1 643 | E073_2 644 | E073_3 645 | E073_4 646 | E073_5 647 | E073_6 648 | E073_8 649 | E074_1 650 | E074_2 651 | E074_3 652 | E074_4 653 | E074_5 654 | E074_6 655 | E075_1 656 | E075_2 657 | E076_1 658 | E076_2 659 | E076_3 660 | E076_4 661 | E076_5 662 | E076_6 663 | E076_7 664 | E077_1 665 | E077_2 666 | E077_3 667 | E078_1 668 | E078_2 669 | E078_3 670 | E078_4 671 | E079_1 672 | E079_2 673 | E079_3 674 | E079_4 675 | E079_5 676 | E079_6 677 | E079_7 678 | E079_8 679 | E080_1 680 | E080_2 681 | E080_3 682 | E080_4 683 | E080_5 684 | E080_6 685 | E080_7 686 | E080_8 687 | E080_9 688 | E080_A 689 | I135_1 690 | I135_2 691 | I135_3 692 | I135_4 693 | I135_5 694 | I135_6 695 | I135_7 696 | I135_8 697 | I136_1 698 | I136_2 699 | I136_3 700 | I136_4 701 | I136_5 702 | I136_6 703 | I136_7 704 | I136_8 705 | I136_9 706 | I136_A 707 | I136_B 708 | I136_C 709 | I136_D 710 | I136_E 711 | I136_F 712 | I136_G 713 | I137_1 714 | I137_2 715 | I137_3 716 | I137_4 717 | I137_5 718 | I138_1 719 | I138_2 720 | I138_3 721 | I138_4 722 | I138_5 723 | I138_6 724 | I138_7 725 | I138_8 726 | I138_9 727 | I138_A 728 | I138_B 729 | 
I138_C 730 | I139_1 731 | I139_2 732 | I139_3 733 | I139_4 734 | I139_5 735 | I139_6 736 | I139_7 737 | I139_8 738 | I140_1 739 | I140_2 740 | I140_3 741 | I140_4 742 | I140_5 743 | I140_6 744 | I141_1 745 | I141_2 746 | I141_3 747 | I142_1 748 | I142_2 749 | I142_3 750 | I142_4 751 | I142_5 752 | I142_6 753 | I142_7 754 | I142_8 755 | I143_1 756 | I143_2 757 | I144_1 758 | I144_2 759 | C036_51 760 | C036_52 761 | C036_53 762 | C036_54 763 | C036_55 764 | C036_56 765 | C036_57 766 | C036_58 767 | C036_59 -------------------------------------------------------------------------------- /CnkiSpider/customDownloadMiddlewares.py: -------------------------------------------------------------------------------- 1 | from twisted.internet import defer 2 | from twisted.internet.error import TimeoutError, DNSLookupError, \ 3 | ConnectionRefusedError, ConnectionDone, ConnectError, \ 4 | ConnectionLost, TCPTimedOutError 5 | from scrapy.http import HtmlResponse 6 | from twisted.web.client import ResponseFailed 7 | from scrapy.core.downloader.handlers.http11 import TunnelError 8 | from CnkiSpider.file_util import FileUtil 9 | from CnkiSpider.commonUtils import SpiderTypeEnum, ErrorUtil 10 | import twisted 11 | from scrapy.utils.project import get_project_settings 12 | 13 | 14 | 15 | class ProcessAllExceptionMiddleware(object): 16 | ALL_EXCEPTIONS = (defer.TimeoutError, TimeoutError, DNSLookupError, 17 | ConnectionRefusedError, ConnectionDone, ConnectError, 18 | ConnectionLost, TCPTimedOutError, ResponseFailed, 19 | IOError, TunnelError, twisted.internet.error.ConnectionLost) 20 | 21 | # def process_response(self, request, response, spider): 22 | # # 捕获状态码为40x/50x的response 23 | # if str(response.status).startswith('4') or str(response.status).startswith('5'): 24 | # # 随意封装,直接返回response,spider代码中根据url==''来处理response 25 | # response = HtmlResponse(url='') 26 | # return response 27 | # # 其他状态码不处理 28 | # return response 29 | 30 | 31 | 32 | def process_exception(self, request, exception, spider): 33 | # 捕获RetryAndGetFailedUrl的重试中间件捕获后剩下的异常 34 | if not isinstance(exception, self.ALL_EXCEPTIONS): 35 | # 在日志中打印异常类型 36 | logging.error('全局异常捕获: %s' % (exception)) 37 | # 随意封装一个response,返回给spider 38 | response = HtmlResponse(url='exception') 39 | return response 40 | 41 | def process_response(self, request, response, spider): 42 | if str(response.status).startswith('4') or str(response.status).startswith('5'): 43 | # # 随意封装,直接返回response,spider代码中根据url==''来处理response 44 | key = request.cb_kwargs 45 | # print(request) 46 | # print("截取到了错误请求") 47 | if spider.name == SpiderTypeEnum.PATENT.value: 48 | # print(key) 49 | if key['requestType'] in ['PatentGetFirstPage', 'PatentGetLinks']: 50 | # print('请求状态:%s' % response.status) 51 | # print('请求链接:%s' % response.meta['url']) 52 | # print('响应内容:%s' % response.text) 53 | ErrorUtil.markCodeError(type=SpiderTypeEnum.PATENT, code=key['code'], date=key['date']) 54 | # elif key['requestType'] == 'PatentGetLinks': 55 | # ErrorUtil.markPageError(type=SpiderTypeEnum.PATENT, code=key['code'], date=key['date'], 56 | # pagenum=key['pagenum']) 57 | elif key['requestType'] == "patentGetContent": 58 | ErrorUtil.markLinkError(type=SpiderTypeEnum.PATENT, url=key['url'], code=key['code'], date=key['date']) 59 | elif spider.name == SpiderTypeEnum.PAPER_AND_ACH.value: 60 | if key['requestType'] in ["PaperAchGetFirstPage", 'PaperAchGetLinks']: 61 | ErrorUtil.markCodeError(type=SpiderTypeEnum.PAPER_AND_ACH, code=key['code'], date=key['date']) 62 | # elif key['requestType'] == 'PaperAchGetLinks': 
63 | # ErrorUtil.markPageError(type=SpiderTypeEnum.PAPER_AND_ACH, code=key['code'], date=key['date'], 64 | # pagenum=key['pagenum']) 65 | elif key['requestType'] == "JournalGetContent": 66 | ErrorUtil.markLinkError(type=SpiderTypeEnum.JOURNAL, url=key['url'], code=key['code'], date=key['date']) 67 | elif key['requestType'] == "BoshuoGetContent": 68 | ErrorUtil.markLinkError(type=SpiderTypeEnum.BOSHUO, url=key['url'], code=key['code'], date=key['date']) 69 | elif key['requestType'] == "AchGetContent": 70 | ErrorUtil.markLinkError(type=SpiderTypeEnum.ACHIEVEMENT, url=key['url'], code=key['code'], date=key['date']) 71 | # 返回一个出错的response,前端判断url为空就说明之前的请求有问题 72 | return HtmlResponse(url='') 73 | return response 74 | 75 | 76 | def spider_opened(self, spider): 77 | spider.logger.info('Spider opened: %s' % spider.name) 78 | 79 | 80 | 81 | 82 | from CnkiSpider.proxy import ApeProxyManager 83 | 84 | 85 | class CnkispiderSpiderProxyMiddleware: 86 | ''' 87 | 给所有的request加上代理 88 | ''' 89 | def process_request(self, request, spider): 90 | request.meta["proxy"] = ApeProxyManager.getProxy()['string'] 91 | request.headers["Proxy-Authorization"] = ApeProxyManager.proxyAuth 92 | return None 93 | 94 | import random 95 | 96 | class CnkiSpiderHeaderMiddleware: 97 | 98 | USER_AGENTS = [ 99 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36", 100 | "Mozilla/5.0 (Android; Mobile; rv:14.0) Gecko/14.0 Firefox/14.0", 101 | "Mozilla/5.0 (Android; Tablet; rv:14.0) Gecko/14.0 Firefox/14.0", 102 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:21.0) Gecko/20100101 Firefox/21.0", 103 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:21.0) Gecko/20130331 Firefox/21.0", 104 | "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0", 105 | "Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19", 106 | "Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19", 107 | "Mozilla/5.0 (Linux; Android 4.1.2; Nexus 7 Build/JZ054K) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19", 108 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36", 109 | "Mozilla/5.0 (compatible; WOW64; MSIE 10.0; Windows NT 6.2)", 110 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)", 111 | "Opera/9.80 (Windows NT 6.1; WOW64; U; en) Presto/2.10.229 Version/11.62"] 112 | 113 | def process_request(self, request, spider): 114 | request.headers.setdefault('User-Agent', random.choice(self.USER_AGENTS)) 115 | # 专利和论文设置不同的Refer 116 | if spider.name == SpiderTypeEnum.PAPER_AND_ACH: 117 | request.headers.setdefault("Referer", "https://kns.cnki.net/kns/brief/result.aspx?dbprefix=SCDB&crossDbcodes=CJFQ,CDFD,CMFD,CPFD,IPFD,CCND,CCJD") 118 | elif spider.name == SpiderTypeEnum.PATENT: 119 | request.headers.setdefault("Referer", "https://kns.cnki.net/kns/brief/result.aspx?dbprefix=SCOD") 120 | 121 | 122 | 123 | 124 | import time 125 | from scrapy.downloadermiddlewares.retry import RetryMiddleware 126 | from scrapy.utils.response import response_status_message 127 | import logging 128 | from CnkiSpider.commonUtils import ApeProxyManager 129 | 130 | 131 | class RetryAndGetFailedUrl(RetryMiddleware): 132 | start_date = time.strftime('%Y-%m-%d', time.localtime()) 133 | def process_response(self, request, response, spider): 
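        # Sketch, illustrative only (not project code): a spider can exempt a single
        # request from this retry logic by flagging it in meta when the request is
        # built, for example:
        #
        #   yield scrapy.Request(url, callback=self.parse, meta={'dont_retry': True})
        #
        # Requests without that flag continue into the status-code check below.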
134 | # 在之前构造的request中可以加入meta信息dont_retry来决定是否重试 135 | if request.meta.get('dont_retry', False): 136 | return response 137 | 138 | # 检查状态码是否在列表中,在的话就调用_retry方法进行重试 139 | if response.status in self.retry_http_codes: 140 | reason = response_status_message(response.status) 141 | # 在此处进行自己的操作,如删除不可用代理,打日志等 142 | settings = get_project_settings() 143 | if settings.get("PROXY_OPEN"): 144 | oldProxyString = request.meta["proxy"] 145 | ApeProxyManager.removeBadProxy(oldProxyString) 146 | proxyString = ApeProxyManager.getProxy()['string'] 147 | request.meta["proxy"] = proxyString 148 | logging.warning('切换代理(%s)重试中(%s)' % (proxyString, request.url)) 149 | else: 150 | logging.warning('未启用代理,重试中(%s)' % (request.url)) 151 | self.save_url(request, spider) 152 | return self._retry(request, reason, spider) or response 153 | return response 154 | 155 | def process_exception(self, request, exception, spider): 156 | if ( 157 | isinstance(exception, self.EXCEPTIONS_TO_RETRY) 158 | and not request.meta.get('dont_retry', False) 159 | ): 160 | # logging.warning('错误异常捕捉:%s,开始重试' % exception) 161 | logging.warning("捕获到异常 %s" % exception) 162 | settings = get_project_settings() 163 | retries = request.meta.get('retry_times', 0) + 1 164 | if settings.get("PROXY_OPEN") and isinstance(exception, TunnelError): 165 | oldProxyString = request.meta["proxy"] 166 | ApeProxyManager.removeBadProxy(oldProxyString) 167 | proxyString = ApeProxyManager.getProxy()['string'] 168 | request.meta["proxy"] = proxyString 169 | logging.warning('代理异常,切换代理(%s)第 %d 次重试中(%s)' % (proxyString, retries, request.url)) 170 | else: 171 | logging.warning('未启用代理或网络异常与代理无关,第 %d 次重试中(%s)' % (retries, request.url)) 172 | self.save_url(request, spider) 173 | return self._retry(request, exception, spider) 174 | 175 | def save_url(self, request, spider): 176 | ''' 177 | 当前重试次数已经超过了最大重试次数,修改代理 178 | :param request: 179 | :return: 180 | ''' 181 | retries = request.meta.get('retry_times', 0) + 1 182 | if retries > self.max_retry_times: 183 | key = request.cb_kwargs 184 | logging.warning("连续请求%s次,放弃请求" % str(retries)) 185 | if spider.name == SpiderTypeEnum.PATENT.value: 186 | if key['requestType'] == 'PatentGetFirstPage': 187 | ErrorUtil.markCodeError(type=SpiderTypeEnum.PATENT, code=key['code'], date=key['date']) 188 | # elif key['requestType'] == 'PatentGetLinks': 189 | # ErrorUtil.markPageError(type=SpiderTypeEnum.PATENT, code=key['code'], date=key['date'], 190 | # pagenum=key['pagenum']) 191 | elif key['requestType'] == "patentGetContent": 192 | ErrorUtil.markLinkError(type=SpiderTypeEnum.PATENT, url=key['url'], date=key['date'], code=key['code']) 193 | elif spider.name == SpiderTypeEnum.PAPER_AND_ACH.value: 194 | if key['requestType'] == "PaperAchGetFirstPage": 195 | ErrorUtil.markCodeError(type=SpiderTypeEnum.PAPER_AND_ACH, code=key['code'], date=key['date']) 196 | # elif key['requestType'] == 'PaperAchGetLinks': 197 | # ErrorUtil.markPageError(type=SpiderTypeEnum.PAPER_AND_ACH.value, code=key['code'], date=key['date'], 198 | # pagenum=key['pagenum']) 199 | elif key['requestType'] == "JournalGetContent": 200 | ErrorUtil.markLinkError(type=SpiderTypeEnum.JOURNAL, url=key['url'], date=key['date'], code=key['code']) 201 | elif key['requestType'] == "BoshuoGetContent": 202 | ErrorUtil.markLinkError(type=SpiderTypeEnum.BOSHUO, url=key['url'], date=key['date'], code=key['code']) 203 | elif key['requestType'] == "AchGetContent": 204 | ErrorUtil.markLinkError(type=SpiderTypeEnum.ACHIEVEMENT, url=key['url'], date=key['date'], code=key['code']) 205 | 206 | 207 | 
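# Usage sketch, illustrative only (the real requests are built in the spiders, which
# are not shown here). Both ProcessAllExceptionMiddleware and RetryAndGetFailedUrl
# classify failures through request.cb_kwargs, so every request is expected to carry
# 'requestType' plus the 'code' and 'date' being crawled, and content-page requests
# also carry their own 'url', so that ErrorUtil.markCodeError / markLinkError can
# record exactly what failed. A content-page request would be assembled roughly like:
#
#   import scrapy
#
#   def build_content_request(url, code, date, callback):
#       return scrapy.Request(
#           url=url,
#           callback=callback,
#           cb_kwargs={
#               'requestType': 'patentGetContent',  # one of the type keys checked above
#               'url': url,
#               'code': code,
#               'date': date,
#           },
#       )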
-------------------------------------------------------------------------------- /CnkiSpider/statusManager.py: -------------------------------------------------------------------------------- 1 | import pymysql 2 | from CnkiSpider.commonUtils import SpiderTypeEnum 3 | import datetime 4 | import logging 5 | from scrapy.utils.project import get_project_settings 6 | import time 7 | from CnkiSpider.file_util import PackUtil 8 | from configparser import ConfigParser 9 | 10 | ''' 11 | 爬虫状态管理工具,主要记录爬到的日期和代码 12 | ''' 13 | 14 | 15 | class StatusManager(): 16 | 17 | def __init__(self, type: SpiderTypeEnum): 18 | 19 | srcCodeFileNormal = PackUtil.resource_path('dataSrc/code.txt') 20 | srcCodeFileTest = PackUtil.resource_path('dataSrc/codeTest.txt') 21 | 22 | settings = get_project_settings() 23 | codeFileTestMode = settings.get("CODE_FILE_TEST_MODE", default=False) 24 | if codeFileTestMode: 25 | self.srcCodeFile = srcCodeFileTest 26 | print("启用了学科分类测试模式,将使用测试学科分类源: ./dataSrc/codeTest.txt") 27 | logging.info("启用了学科分类测试模式,将使用测试学科分类源: ./dataSrc/codeTest.txt") 28 | else: 29 | self.srcCodeFile = srcCodeFileNormal 30 | print("未启用学科分类测试模式,将使用完整的学科分类源: ./dataSrc/code.txt") 31 | logging.info("未启用学科分类测试模式,将使用完整的学科分类源: ./dataSrc/code.txt") 32 | self.host = settings.get("MYSQL_HOST") 33 | self.port = int(settings.get("MYSQL_PORT")) 34 | self.user = settings.get("MYSQL_USER") 35 | self.passwd = settings.get("MYSQL_PASSWD") 36 | self.database = settings.get("MYSQL_DATABASE") 37 | self.table = settings.get("STATUS_TABLE") 38 | self.errorCodeTable = settings.get("ERROR_CODE_TABLE") 39 | self.errorLinkTable = settings.get("ERROR_LINK_TABLE") 40 | # self.spiderType = settings.get("SPIDER_TYPE") 41 | # self.startDateDefault = settings.get("START_DATE") 42 | # self.endDateDefault = settings.get("END_DATE") 43 | # logging.debug(host, port, user, passwd, database, table) 44 | 45 | cp = ConfigParser() 46 | # 与exe同目录 47 | cp.read('./config.cfg') 48 | self.startDateDefault = cp.get('spider', 'start') 49 | self.endDateDefault = cp.get('spider', 'end') 50 | 51 | self.codes = self.getCodeAll() 52 | self.codeFirst = self.codes[0] 53 | self.codeLen = len(self.codes) 54 | self.type = type.value 55 | 56 | self.conn = pymysql.connect(host=self.host, port=self.port, user=self.user, 57 | passwd=self.passwd, database=self.database) 58 | self.cursor = self.conn.cursor() 59 | today = datetime.date.today() 60 | oneday = datetime.timedelta(days=1) 61 | self.today = today.strftime('%Y-%m-%d') 62 | self.yesterday = (today - oneday).strftime('%Y-%m-%d') 63 | # 默认截止日期是昨天 64 | self.endDate = None 65 | 66 | self.distributeMode = settings.get("DISTRUBUTE_MODE", default=False) 67 | self.createStatusTableFromTablename(self.table) 68 | self.createErrorCodeTableFromTablename(self.errorCodeTable) 69 | self.createErrorLinkTableFromTablename(self.errorLinkTable) 70 | if self.distributeMode: 71 | self.setDefaultDateAndCode() 72 | 73 | def reCon(self): 74 | """ MySQLdb.OperationalError异常""" 75 | # self.con.close() 76 | while True: 77 | try: 78 | self.conn.ping() 79 | return 80 | except pymysql.err.OperationalError: 81 | logging.warning("mysql连接失败开始重连") 82 | self.conn.ping(True) 83 | time.sleep(3) 84 | 85 | def createStatusTableFromTablename(self, tablename): 86 | self.reCon() 87 | cursor = self.conn.cursor() 88 | sql = ''' 89 | 90 | 91 | CREATE TABLE If Not Exists `%s` ( 92 | -- `id` INT UNSIGNED AUTO_INCREMENT, 93 | `type` varchar(255) COMMENT '爬虫类型,用于区分专利patent和(期刊、博硕、成果)paperAndAch的链接获取', 94 | `curCode` varchar(255) COMMENT '目前正在爬(链接获取)的学科分类', 95 | 
`curDate` varchar(255) NOT NULL COMMENT '目前正在爬(链接获取)的日期', 96 | `endDate` varchar(255) NOT NULL COMMENT '终止日期(包含)', 97 | `status` varchar(255) COMMENT '爬虫状态', 98 | PRIMARY KEY(`type`) 99 | ) ENGINE = InnoDB''' % (tablename) 100 | cursor.execute(sql) 101 | self.conn.commit() 102 | 103 | def createErrorCodeTableFromTablename(self, tablename): 104 | self.reCon() 105 | cursor = self.conn.cursor() 106 | sql = ''' 107 | 108 | 109 | CREATE TABLE If Not Exists `%s` ( 110 | `id` INT UNSIGNED AUTO_INCREMENT, 111 | `type` varchar(255) COMMENT '文献类型,用于区分专利patent和(期刊、博硕、成果)的链接获取', 112 | `code` varchar(255) COMMENT '学科分类', 113 | `date` varchar(255) NOT NULL COMMENT '日期', 114 | PRIMARY KEY(`id`), 115 | unique index(`type`, `code`, `date`) 116 | ) ENGINE = InnoDB''' % (tablename) 117 | cursor.execute(sql) 118 | self.conn.commit() 119 | 120 | def createErrorLinkTableFromTablename(self, tablename): 121 | self.reCon() 122 | cursor = self.conn.cursor() 123 | sql = ''' 124 | 125 | 126 | CREATE TABLE If Not Exists `%s` ( 127 | `id` INT UNSIGNED AUTO_INCREMENT, 128 | `type` varchar(255) COMMENT '文献类型,用于区分专利patent和(期刊、博硕、成果)的链接获取', 129 | `code` varchar(255) COMMENT '学科分类', 130 | `Link` varchar(255) NOT NULL COMMENT '链接', 131 | `date` varchar(255) NOT NULL COMMENT '日期', 132 | PRIMARY KEY(`id`) 133 | ) ENGINE = InnoDB''' % (tablename) 134 | cursor.execute(sql) 135 | self.conn.commit() 136 | 137 | def setDefaultDateAndCode(self): 138 | self.reCon() 139 | cursor = self.conn.cursor() 140 | selectSql = "select `curCode`, `curDate`, `endDate` from `%s` where `type` = '%s'" % (self.table, self.type) 141 | cursor.execute(selectSql) 142 | result = cursor.fetchone() 143 | if result is None: 144 | sql = "INSERT INTO `%s`(`type`, `curDate`, `endDate`, `curCode`)VALUES('%s', '%s', '%s', '%s')" \ 145 | % (self.table, self.type, self.startDateDefault, self.endDateDefault, self.getCodeFirst()) 146 | else: 147 | sql = "UPDATE `%s` SET curDate = '%s', endDate = '%s', curCode = '%s' WHERE `type` = '%s'" \ 148 | % (self.table, self.startDateDefault, self.endDateDefault, self.getCodeFirst(), self.type) 149 | cursor.execute(sql) 150 | self.conn.commit() 151 | 152 | def getCodeFirst(self): 153 | return self.codeFirst 154 | 155 | def getLastDateAndCode(self): 156 | ''' 157 | 从数据库中读取上次爬的日期和学科分类,这次重新爬 158 | :return: 159 | ''' 160 | self.reCon() 161 | cursor = self.conn.cursor() 162 | cursor.execute("select `curCode`, `curDate`, `endDate` from `%s` where `type` = '%s'" % (self.table, self.type)) 163 | result = cursor.fetchone() 164 | # 数据库没数据就返回空,报错给调用者,提示用户向mysql中添加数据 165 | # 判断type为专利的数据条是否存在 166 | if result is None: 167 | print('mysql的status表中缺失type为%s的数据条,请手动插入' % self.type) 168 | logging.error('mysql的status表中缺失type为%s的数据条,请手动插入' % self.type) 169 | # self.conn.close() 170 | return None 171 | # 判断开始日期是否存在 172 | if result[1] == "" or result[1] is None: 173 | print('mysql的status表中type为%s的数据条缺少关键的开始日期信息,请手动更新' % self.type) 174 | logging.error('mysql的status表中type为%s的数据条缺少关键的日期信息,请手动更新' % self.type) 175 | self.conn.close() 176 | return None 177 | # 判断结束日期是否存在 178 | if result[2] == "" or result[2] is None: 179 | print('未设置初始结束日期信息,已自动设置为昨天') 180 | logging.info('未设置初始结束日期信息,已自动设置为昨天') 181 | self.setEndDate(endDate=self.yesterday) 182 | self.endDate = self.yesterday 183 | else: 184 | self.endDate = result[2] 185 | # logging.info("获取的终止日期为 %s" % self.endDate) 186 | # 判断学科代码是否存在,不存在就默认从code表第一个开始 187 | if result[0] == "" or result[0] is None: 188 | code = self.codes[0] 189 | cursor.execute("UPDATE `%s` SET curCode = '%s' WHERE type = '%s'" % 
(self.table, code, self.type)) 190 | self.conn.commit() 191 | # conn.close() 192 | print('未设置初始code信息,已自动设置为', code) 193 | logging.info('未设置初始code信息,已自动设置为%s' % code) 194 | return result[1], code 195 | return result[1], result[0] 196 | 197 | def getNextDateAndCode(self): 198 | ''' 199 | 返回下一个待爬的日期和学科分类 200 | 在爬虫开始的时候先调用getLastDateAndCode 201 | 返回的不是None再调用这个,getNext返回None代表已经爬完了 202 | :return: 203 | ''' 204 | lastDate, lastCode = self.getLastDateAndCode() 205 | 206 | oneday = datetime.timedelta(days=1) 207 | 208 | if lastDate > self.endDate: 209 | logging.info("已经爬完了任务中最后一天的最后一个学科分类,但页面请求和页面解析以及内容存取还在进行中!") 210 | self.setStatusFinished() 211 | self.closeConn() 212 | return None 213 | 214 | index = 0 215 | # 获取上一个code在当前的日期的记录 216 | for i in range(len(self.codes)): 217 | if lastCode == self.codes[i]: 218 | index = i 219 | break 220 | # 某日期的code还没运行完 221 | if index < len(self.codes) - 1: 222 | self.markCurrentDateAndCode(lastDate, self.codes[index + 1]) 223 | logging.info("获取的下一个日期、学科分类为:%s,%s" % (lastDate, self.codes[index + 1])) 224 | # print("获取的下一个日期、学科分类为:%s,%s" % (lastDate, self.codes[index+1])) 225 | self.setStatusRunning() 226 | return lastDate, self.codes[index + 1] 227 | # 所有学科分类的爬完了,爬下一天的 228 | else: 229 | # 上一次的日期已经是昨天了,代表爬完(今天的肯定不能爬,因为还没过完) 230 | if lastDate >= self.endDate: 231 | logging.info("已经爬完了任务中最后一天的最后一个学科分类,但页面请求和页面解析以及内容存取还在进行中!") 232 | self.setStatusFinished() 233 | self.closeConn() 234 | return None 235 | # 进入到下一个日期,学科分类置为第一个 236 | else: 237 | year = int(lastDate[0:4]) 238 | month = int(lastDate[5:7]) 239 | day = int(lastDate[8:10]) 240 | nextDay = (datetime.date(year, month, day) + oneday).strftime('%Y-%m-%d') 241 | self.markCurrentDateAndCode(nextDay, self.codes[0]) 242 | # logging.debug("获取的下一个日期、学科分类为:%s,%s" % (nextDay, self.codes[0])) 243 | # print("获取的下一个日期、学科分类为:%s,%s" % (lastDate, self.codes[index+1])) 244 | self.setStatusRunning() 245 | return nextDay, self.codes[0] 246 | 247 | def markCurrentDateAndCode(self, date: str, code: str): 248 | ''' 249 | 记录当前正在爬的日期和code至数据库中 250 | :return: 251 | ''' 252 | # 更新正在爬取的日期和学科分类的sql 253 | self.reCon() 254 | cursor = self.conn.cursor() 255 | updateSql = "UPDATE `%s` SET curDate = '%s', curCode = '%s' WHERE type = '%s'" % ( 256 | self.table, date, code, self.type) 257 | cursor.execute(updateSql) 258 | self.conn.commit() 259 | 260 | def stepIntoNextDate(self, lastDate: str): 261 | ''' 262 | 直接进入到当天的日期 263 | :param lastDate: 264 | :return: 265 | ''' 266 | # year = int(lastDate[0:4]) 267 | # month = int(lastDate[5:7]) 268 | # day = int(lastDate[8:10]) 269 | # oneday = datetime.timedelta(days=1) 270 | # nextDay = (datetime.date(year, month, day) + oneday).strftime('%Y-%m-%d') 271 | # 设置上次的日期为当天的最后一个代码,下次获取代码自动会获取到当天的 272 | self.markCurrentDateAndCode(date=lastDate, code=self.codes[self.codeLen - 1]) 273 | logging.warning("%s 无任何专利/论文/成果,已跳过当日" % lastDate) 274 | 275 | def setEndDate(self, endDate): 276 | ''' 277 | 设置默认截止日期为昨天 278 | :return: 279 | ''' 280 | self.reCon() 281 | cursor = self.conn.cursor() 282 | cursor.execute("UPDATE `%s` SET endDate = '%s' WHERE type = '%s'" % (self.table, endDate, self.type)) 283 | self.conn.commit() 284 | 285 | def setStatusRunning(self): 286 | ''' 287 | 更新数据库中最后一次程序运行时间 288 | :return: 289 | ''' 290 | # 更新正在爬取的日期和学科分类的sql 291 | timeStr = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') 292 | status = 'last running:' + timeStr 293 | updateSql = "UPDATE `%s` SET status = '%s' WHERE type = '%s'" % ( 294 | self.table, status, self.type) 295 | self.reCon() 296 | cursor = self.conn.cursor() 297 
| cursor.execute(updateSql) 298 | self.conn.commit() 299 | # conn.close() 300 | 301 | def setStatusFinished(self): 302 | ''' 303 | 所有日期和code都已跑完,向数据库中置标记 304 | :return: 305 | ''' 306 | # 更新正在爬取的日期和学科分类的sql 307 | timeStr = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') 308 | status = 'finished:' + timeStr 309 | updateSql = "UPDATE `%s` SET status = '%s' WHERE type = '%s'" % ( 310 | self.table, status, self.type) 311 | self.reCon() 312 | cursor = self.conn.cursor() 313 | cursor.execute(updateSql) 314 | self.conn.commit() 315 | # conn.close() 316 | 317 | def getCodeAll(self): 318 | ''' 319 | 从文件中获取所有code信息 320 | :return: 321 | ''' 322 | with open(self.srcCodeFile, 'r') as f: 323 | all = f.read() 324 | return all.split() 325 | 326 | def closeConn(self): 327 | ''' 328 | 关闭数据库连接 329 | :return: 330 | ''' 331 | self.conn.close() 332 | 333 | 334 | if __name__ == '__main__': 335 | sm = StatusManager(SpiderTypeEnum.PATENT) 336 | print(sm.getLastDateAndCode()) 337 | print(sm.getNextDateAndCode()) 338 | -------------------------------------------------------------------------------- /CnkiSpider/commonUtils.py: -------------------------------------------------------------------------------- 1 | import os 2 | from enum import Enum 3 | import time 4 | import requests 5 | from CnkiSpider.proxy import ApeProxyManager 6 | import logging 7 | import scrapy 8 | from CnkiSpider.file_util import FileUtil 9 | from scrapy.utils.project import get_project_settings 10 | import sys 11 | 12 | class StringUtil: 13 | 14 | @classmethod 15 | def stringHanlde(cls, s): 16 | ''' 17 | 将非空字符串去首尾空格,空类型设为"" 18 | :param s: 19 | :return: 20 | ''' 21 | if s and s is not None: 22 | s = s.strip() 23 | else: 24 | s = "" 25 | return s 26 | 27 | class SpiderTypeEnum(Enum): 28 | ''' 29 | 爬虫类型枚举类,用于状态管理 30 | ''' 31 | PATENT = "patent" 32 | JOURNAL = "journal" 33 | BOSHUO = "boshuo" 34 | ACHIEVEMENT = "achievement" 35 | PAPER_AND_ACH = "paperAch" 36 | 37 | class CookieUtil(): 38 | @classmethod 39 | def getPatentCookies(cls, date, code, proxyDict=None): 40 | ''' 41 | (已弃用,见getPatentCookiesProxy)根据日期,分类代码获取cookies,翻页时必须要有cookie 42 | :param date: 43 | :param code: 44 | :param proxy: 45 | :return: 46 | ''' 47 | search_url = 'http://kns.cnki.net/kns/request/SearchHandler.ashx' 48 | times = time.strftime('%a %b %d %Y %H:%M:%S') + ' GMT+0800 (中国标准时间)' 49 | headers = { 50 | "Accept": "*/*", 51 | "Accept-Encoding": "gzip, deflate, br", 52 | "Accept-Language": "zh-CN,zh;q=0.9", 53 | "Connection": "keep-alive", 54 | "Content-Type": "application/x-www-form-urlencoded", 55 | "Host": "kns.cnki.net", 56 | "Origin": "https://kns.cnki.net", 57 | "Referer": "https://kns.cnki.net/kns/brief/result.aspx?dbprefix=SCPD", 58 | "Sec-Fetch-Mode": "cors", 59 | "Sec-Fetch-Site": "same-origin", 60 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36" 61 | } 62 | params = { 63 | "action": "", 64 | "NaviCode": code, 65 | "ua": "1.21", 66 | "isinEn": "0", 67 | "PageName": "ASP.brief_result_aspx", 68 | "DbPrefix": "SCPD", 69 | "DbCatalog": "中国专利数据库", 70 | "ConfigFile": "SCPD.xml", 71 | "db_opt": "SCOD", 72 | "db_value": "中国专利数据库", 73 | "date_gkr_from": date, 74 | "date_gkr_to": date, 75 | "his": '0', 76 | '__': times 77 | } 78 | # logging.debug('requests获取cookis, 代理为%s' % str(proxyDict)) 79 | if proxyDict: 80 | session_response = requests.get(search_url, params=params, headers=headers,proxies=cls.configReqestsProxyMeta(proxyDict)) 81 | else: 82 | session_response = 
requests.get(search_url, headers=headers, params=params) 83 | cookies = requests.utils.dict_from_cookiejar(session_response.cookies) 84 | return cookies 85 | 86 | @classmethod 87 | def getPatentCookiesProxy(cls, date, code): 88 | ''' 89 | 根据日期,分类代码获取cookies,翻页时必须要有cookie,带代理 90 | :param date: 91 | :param code: 92 | :param proxy: 93 | :return: 94 | ''' 95 | search_url = 'http://kns.cnki.net/kns/request/SearchHandler.ashx' 96 | times = time.strftime('%a %b %d %Y %H:%M:%S') + ' GMT+0800 (中国标准时间)' 97 | headers = { 98 | "Accept": "*/*", 99 | "Accept-Encoding": "gzip, deflate, br", 100 | "Accept-Language": "zh-CN,zh;q=0.9", 101 | "Connection": "keep-alive", 102 | "Content-Type": "application/x-www-form-urlencoded", 103 | "Host": "kns.cnki.net", 104 | "Origin": "https://kns.cnki.net", 105 | "Referer": "https://kns.cnki.net/kns/brief/result.aspx?dbprefix=SCPD", 106 | "Sec-Fetch-Mode": "cors", 107 | "Sec-Fetch-Site": "same-origin", 108 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36" 109 | } 110 | params = { 111 | "action": "", 112 | "NaviCode": code, 113 | "ua": "1.21", 114 | "isinEn": "0", 115 | "PageName": "ASP.brief_result_aspx", 116 | "DbPrefix": "SCPD", 117 | "DbCatalog": "中国专利数据库", 118 | "ConfigFile": "SCPD.xml", 119 | "db_opt": "SCOD", 120 | "db_value": "中国专利数据库", 121 | "date_gkr_from": date, 122 | "date_gkr_to": date, 123 | "his": '0', 124 | '__': times 125 | } 126 | 127 | settings = get_project_settings() 128 | 129 | # logging.debug('requests获取cookis, 代理为%s' % str(proxyDict)) 130 | session_response = None 131 | for i in range(30): 132 | try: 133 | if settings.get("PROXY_OPEN"): 134 | proxyDict = ApeProxyManager.getProxy() 135 | session_response = requests.get(search_url, params=params, headers=headers, proxies=cls.configReqestsProxyMeta(proxyDict)) 136 | else: 137 | session_response = requests.get(search_url, params=params, headers=headers) 138 | if session_response.status_code == 200: 139 | break 140 | else: 141 | logging.warning("cookie获取失败,第%d次重新获取中" % (i+1)) 142 | time.sleep(1) 143 | except requests.exceptions.RequestException as e: 144 | logging.error('cookie获取发生异常 %s' % str(e)) 145 | if not session_response: 146 | logging.error("cookie获取失败,程序退出") 147 | sys.exit() 148 | cookies = requests.utils.dict_from_cookiejar(session_response.cookies) 149 | return cookies 150 | 151 | @classmethod 152 | def configReqestsProxyMeta(cls, proxyDict): 153 | ''' 154 | 封装requests请求中的猿人云所必须的proxy参数 155 | :return: 156 | ''' 157 | proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % { 158 | "host": proxyDict['ip'], 159 | "port": proxyDict['port'], 160 | "user": ApeProxyManager.id, 161 | "pass": ApeProxyManager.secret, 162 | } 163 | 164 | proxies = { 165 | "http": proxyMeta, 166 | "https": proxyMeta, 167 | } 168 | 169 | @classmethod 170 | def getPatentCookiesScrapy(cls, date, code): 171 | search_url = 'http://kns.cnki.net/kns/request/SearchHandler.ashx' 172 | times = time.strftime('%a %b %d %Y %H:%M:%S') + ' GMT+0800 (中国标准时间)' 173 | params = { 174 | "action": "", 175 | "NaviCode": code, 176 | "ua": "1.21", 177 | "isinEn": "0", 178 | "PageName": "ASP.brief_result_aspx", 179 | "DbPrefix": "SCPD", 180 | "DbCatalog": "中国专利数据库", 181 | "ConfigFile": "SCPD.xml", 182 | "db_opt": "SCOD", 183 | "db_value": "中国专利数据库", 184 | "date_gkr_from": date, 185 | "date_gkr_to": date, 186 | "his": '0', 187 | '__': times 188 | } 189 | response = scrapy.Request( 190 | url=search_url, 191 | dont_filter=True, 192 | meta=params 193 | ) 194 | 
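        # Note: building a scrapy.Request here only describes a request; it is never
        # scheduled or downloaded, so the Request object's headers carry no
        # Set-Cookie values and nothing useful is returned. Within this project the
        # method appears to be exercised only from the __main__ block at the bottom
        # of this file; the working cookie helpers are the requests-based
        # getPatentCookiesProxy above and getPaperAchCookiesProxy below.
        # Related note on configReqestsProxyMeta above: it assembles the proxies
        # dict but has no return statement, so callers pass proxies=None to
        # requests.get and the ape proxy is not actually applied even when
        # PROXY_OPEN is enabled; a corrected sketch appears at the end of this
        # listing.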
headers = response.headers 195 | cookies = response.headers.getlist('Set-Cookie') 196 | print(cookies) 197 | return cookies 198 | 199 | @classmethod 200 | def getPaperAchCookiesProxy(cls, date, code): 201 | ''' 202 | 根据日期,分类代码获取论文和成果cookies,带代理 203 | :param cls: 204 | :param date: 205 | :param code: 206 | :return: 207 | ''' 208 | search_url = 'https://kns.cnki.net/kns/request/SearchHandler.ashx/' 209 | now_time = time.strftime('%a %b %d %Y %H:%M:%S') + ' GMT+0800 (中国标准时间)' 210 | headers = { 211 | "Accept": "*/*", 212 | "Accept-Encoding": "gzip, deflate, br", 213 | "Accept-Language": "zh-CN,zh;q=0.9", 214 | "Connection": "keep-alive", 215 | "Content-Type": "application/x-www-form-urlencoded", 216 | "Host": "kns.cnki.net", 217 | "Origin": "https://kns.cnki.net", 218 | "Referer": "https://kns.cnki.net/kns/brief/result.aspx?dbprefix=SCDB&crossDbcodes=CJFQ,CDFD,CMFD,CPFD,IPFD,CCND,CCJD", 219 | "Sec-Fetch-Mode": "cors", 220 | "Sec-Fetch-Site": "same-origin", 221 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36" 222 | } 223 | params = { 224 | "action": "", 225 | "NaviCode": code, 226 | "ua": "1.21", 227 | "isinEn": "1", 228 | "PageName": "ASP.brief_result_aspx", 229 | "DbPrefix": "SCDB", 230 | "DbCatalog": "中国学术文献网络出版总库", 231 | "ConfigFile": "SCDB.xml", 232 | "db_opt": "CJFQ,CJRF,CJFN,CDFD,CMFD,CPFD,IPFD,CCND,BDZK,CISD,SNAD,CCJD", 233 | "publishdate_from": date, 234 | "publishdate_to": date, 235 | "CKB_extension": "ZYW", 236 | "his": "0", 237 | '__': now_time 238 | } 239 | settings = get_project_settings() 240 | 241 | # logging.debug('requests获取cookis, 代理为%s' % str(proxyDict)) 242 | session_response = None 243 | for i in range(30): 244 | try: 245 | if settings.get("PROXY_OPEN"): 246 | proxyDict = ApeProxyManager.getProxy() 247 | session_response = requests.get(search_url, params=params,headers=headers, 248 | proxies=cls.configReqestsProxyMeta(proxyDict)) 249 | else: 250 | session_response = requests.get(search_url, params=params, headers=headers) 251 | if session_response.status_code == 200: 252 | break 253 | else: 254 | logging.warning("cookie获取失败,第%d次重新获取中" % (i + 1)) 255 | time.sleep(1) 256 | except requests.exceptions.RequestException as e: 257 | logging.error('cookie获取发生异常 %s' % str(e)) 258 | if not session_response: 259 | logging.error("cookie获取失败,程序退出") 260 | sys.exit() 261 | cookies = requests.utils.dict_from_cookiejar(session_response.cookies) 262 | return cookies 263 | 264 | import pymysql 265 | 266 | 267 | class ErrorUtil(): 268 | ''' 269 | 错误判断工具类 270 | ''' 271 | settings = get_project_settings() 272 | host = settings.get("MYSQL_HOST") 273 | port = int(settings.get("MYSQL_PORT")) 274 | user = settings.get("MYSQL_USER") 275 | passwd = settings.get("MYSQL_PASSWD") 276 | database = settings.get("MYSQL_DATABASE") 277 | # table = settings.get("STATUS_TABLE") 278 | errorCodeTable = settings.get("ERROR_CODE_TABLE") 279 | errorLinkTable = settings.get("ERROR_LINK_TABLE") 280 | 281 | conn = pymysql.connect(host=host, port=port, user=user, passwd=passwd, database=database) 282 | 283 | @classmethod 284 | def reCon(cls): 285 | """ MySQLdb.OperationalError异常""" 286 | # self.con.close() 287 | while True: 288 | try: 289 | cls.conn.ping() 290 | return 291 | except pymysql.err.OperationalError: 292 | logging.warning("mysql连接失败开始重连") 293 | cls.conn.ping(True) 294 | time.sleep(3) 295 | 296 | 297 | @classmethod 298 | def isBadResponse(cls, response): 299 | ''' 300 | 中间件如果接受到错误请求会构造一个url为空的response,这里判断是不是请求出错 301 | :param 
response: 302 | :return: 303 | ''' 304 | if (not response.url) or (response.url == 'exception'): # 接收到url==''或'exception'时 305 | return True 306 | else: 307 | return False 308 | 309 | # @classmethod 310 | # def markLinkError(cls, url, type, code): 311 | # with open(FileUtil.errorLinkDir + type + 'Error.txt', 'a', encoding='utf-8') as file: 312 | # file.write(url + '\n') 313 | 314 | @classmethod 315 | def markLinkError(cls, url, type: SpiderTypeEnum, code, date): 316 | ''' 317 | 记录出错的链接 318 | :param url: 319 | :param type: 320 | :param code: 321 | :return: 322 | ''' 323 | cls.reCon() 324 | cursor = cls.conn.cursor() 325 | sql = "INSERT INTO `%s` (`type`, `code`, `link`, `date`)VALUES('%s', '%s', '%s', '%s')" % (cls.errorLinkTable, type.value, code, url, date) 326 | cursor.execute(sql) 327 | cls.conn.commit() 328 | 329 | # @classmethod 330 | # def markDayError(cls, type, code, date): 331 | # with open(FileUtil.errorDayDir + type + '.txt', 'a', encoding='utf-8') as f: 332 | # f.write(code + '&' + date + '\n') 333 | 334 | @classmethod 335 | def markCodeError(cls, type, code, date): 336 | ''' 337 | 记录某天某学科链接获取失败情况 338 | :param type: 339 | :param code: 340 | :param date: 341 | :return: 342 | ''' 343 | cls.reCon() 344 | cursor = cls.conn.cursor() 345 | sql = "INSERT IGNORE INTO `%s` (`type`, `code`, `date`)VALUES('%s', '%s', '%s')" % ( 346 | cls.errorCodeTable, type.value, code, date) 347 | cursor.execute(sql) 348 | cls.conn.commit() 349 | 350 | @classmethod 351 | def getOneErrorCode(cls, type:SpiderTypeEnum = None): 352 | if type: 353 | sql = "select `id`, `type`, `code`, `date` from `%s` where `type` = '%s' limit 1" % (cls.errorCodeTable, type.value) 354 | else: 355 | sql = "select `id`, `type`, `code`, `date` from `%s` limit 1" % (cls.errorCodeTable) 356 | cls.reCon() 357 | cursor = cls.conn.cursor() 358 | cursor.execute(sql) 359 | result = cursor.fetchone() 360 | if not result: 361 | return None 362 | else: 363 | return result 364 | 365 | @classmethod 366 | def getOneErrorLink(cls, type: SpiderTypeEnum = None): 367 | if type: 368 | sql = "select `id`, `type`, `code`, `link`, `date` from `%s` where `type` = '%s' limit 1" % ( 369 | cls.errorLinkTable, type.value) 370 | else: 371 | sql = "select `id`, `type`, `code`, `link`, `date` from `%s` limit 1" % (cls.errorLinkTable) 372 | cls.reCon() 373 | cursor = cls.conn.cursor() 374 | cursor.execute(sql) 375 | result = cursor.fetchone() 376 | if not result: 377 | return None 378 | else: 379 | return result 380 | 381 | @classmethod 382 | def deleteErrorCode(cls, id:int): 383 | sql = "delete from `%s` where `id` = %d" % (cls.errorCodeTable, id) 384 | cls.reCon() 385 | cursor = cls.conn.cursor() 386 | cursor.execute(sql) 387 | cls.conn.commit() 388 | 389 | @classmethod 390 | def deleteErrorLink(cls, id: int): 391 | sql = "delete from `%s` where `id` = %d" % (cls.errorLinkTable, id) 392 | cls.reCon() 393 | cursor = cls.conn.cursor() 394 | cursor.execute(sql) 395 | cls.conn.commit() 396 | 397 | @classmethod 398 | def closeConn(cls): 399 | ''' 400 | 关闭数据库连接 401 | :return: 402 | ''' 403 | cls.conn.close() 404 | 405 | 406 | if __name__ == '__main__': 407 | CookieUtil.getPatentCookiesScrapy(date='2020-01-01', code='A') 408 | -------------------------------------------------------------------------------- /CnkiSpider/spiders/patent.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | import time 3 | import math 4 | import re 5 | import requests 6 | from CnkiSpider.items import PatentContentItem 7 | from 
CnkiSpider.items import ErrorUrlItem 8 | from CnkiSpider.commonUtils import StringUtil 9 | from CnkiSpider.statusManager import StatusManager 10 | from CnkiSpider.commonUtils import SpiderTypeEnum, CookieUtil, ErrorUtil 11 | from CnkiSpider.file_util import FileUtil 12 | from CnkiSpider.proxy import ApeProxyManager 13 | from scrapy.http.cookies import CookieJar 14 | from scrapy_redis.spiders import RedisSpider 15 | from CnkiSpider.items import * 16 | 17 | import logging 18 | import datetime 19 | 20 | 21 | class PatentSpider(RedisSpider): 22 | name = 'patent' 23 | allowed_domains = ['www.cnki.net', 'kns.cnki.net'] 24 | # start_urls = ['//https://www.cnki.net//'] 25 | custom_settings = { 26 | # 设置管道下载 27 | # 设置log日志 28 | 'LOG_LEVEL': 'INFO', 29 | # 'LOG_LEVEL': 'DEBUG', 30 | 'LOG_FILE': FileUtil.logDir + 'patent.log' 31 | } 32 | 33 | def __init__(self, settings, *args, **kwargs): 34 | super(PatentSpider, self).__init__(*args, **kwargs) 35 | FileUtil.initOutputDir() 36 | # self.base_url = 'http://dbpub.cnki.net/grid2008/dbpub/detail.aspx?dbcode=scpd&' 37 | self.base_url = 'https://kns.cnki.net/kns/brief/brief.aspx?curpage=%d&RecordsPerPage=50&QueryID=10&ID=&turnpage=1&tpagemode=L&dbPrefix=SCPD&Fields=&DisplayMode=listmode&PageName=ASP.brief_result_aspx&isinEn=0&' 38 | self.patent_content_pre_url = 'https://kns.cnki.net/kcms/detail/detail.aspx?dbcode=SCPD&dbname=SCPD%s&filename=%s' 39 | self.sm = StatusManager(SpiderTypeEnum.PATENT) 40 | 41 | # 获取setting中的年份值 42 | @classmethod 43 | def from_crawler(cls, crawler, *args, **kwargs): 44 | spider = cls(crawler.settings, *args, **kwargs) 45 | spider._set_crawler(crawler) 46 | return spider 47 | 48 | # 重写startrequests 49 | def start_requests(self): 50 | # util = PatentUtil() 51 | # util.generateUrlsDir() 52 | # dates = util.getAllDayPerYear() 53 | 54 | lastDateAndCode = self.sm.getLastDateAndCode() 55 | if lastDateAndCode is None: 56 | return 57 | # 上次爬取可能进行到了一半,所以要重爬一下 58 | nextDateAndCode = lastDateAndCode 59 | while nextDateAndCode is not None: 60 | date = nextDateAndCode[0] 61 | code = nextDateAndCode[1] 62 | logging.info("开始爬取专利链接,日期:%s,学科分类:%s" % (date, code)) 63 | 64 | # proxyDict = ApeProxyManager.getProxyDict() 65 | # proxyString = ApeProxyManager.proxyDict2String(proxyDict) 66 | 67 | url_first = 'https://kns.cnki.net/kns/brief/brief.aspx?curpage=%d&RecordsPerPage=50&QueryID=10&ID=&turnpage=1&tpagemode=L&dbPrefix=SCPD&Fields=&DisplayMode=listmode&PageName=ASP.brief_result_aspx&isinEn=0&' % 1 68 | 69 | if code == self.sm.getCodeFirst(): 70 | # 获取全年信息 71 | cookiesAllCodeForOneDate = CookieUtil.getPatentCookiesProxy(date, "*") 72 | yield scrapy.Request( 73 | url=url_first, 74 | cookies=cookiesAllCodeForOneDate, 75 | callback=self.ifSkipDate, 76 | cb_kwargs={ 77 | 'cookies': cookiesAllCodeForOneDate, 78 | "code": code, 79 | "date": date, 80 | "requestType": 'PatentIfSkipDate' 81 | }, 82 | meta={ 83 | 'url': url_first, 84 | # 'proxy': proxyString, 85 | "requestType": 'PatentIfSkipDate' 86 | }, 87 | dont_filter=True 88 | ) 89 | 90 | cookies = CookieUtil.getPatentCookiesProxy(date, code) 91 | 92 | # print("发起请求获取第一页信息", date, code) 93 | yield scrapy.Request( 94 | url=url_first, 95 | cookies=cookies, 96 | callback=self.parse_first_page, 97 | cb_kwargs={ 98 | 'cookies': cookies, 99 | "code": code, 100 | "date": date, 101 | "requestType": 'PatentGetFirstPage' 102 | }, 103 | meta={ 104 | 'url': url_first, 105 | # 'proxy': proxyString, 106 | "requestType": 'PatentGetFirstPage' 107 | }, 108 | dont_filter=True 109 | ) 110 | nextDateAndCode = 
self.sm.getNextDateAndCode() 111 | logging.info('所有专利链接已经获取结束!') 112 | 113 | # 获取失败的日期和学科代码,重新获取链接并请求、解析内容 114 | # self.handleErrorCodeDate() 115 | # 获取请求失败的链接,重新请求并解析内容 116 | # self.hanldeErrorLink() 117 | 118 | 119 | #################### 重新获取失败的链接,直到所有链接都获取成功 开始 ################### 120 | logging.info('开始重新获取出错链接并重爬链接') 121 | errCodeDate = ErrorUtil.getOneErrorCode(type=SpiderTypeEnum.PATENT) 122 | while errCodeDate: 123 | id = errCodeDate[0] 124 | type = errCodeDate[1] 125 | code = errCodeDate[2] 126 | date = errCodeDate[3] 127 | # 从数据库中删除这条已经获取的日期代码对,不用担心出错,如果出错会被错误处理模块捕获 128 | # 但其实这里还有个小bug,就是可能有的请求还在请求中,但是数据库这时候空了,导致最后几个出错请求没被重新爬 129 | # 这样的问题就只涉及几个专利,只要再运行一次程序就行 130 | ErrorUtil.deleteErrorCode(id=id) 131 | url_first = 'https://kns.cnki.net/kns/brief/brief.aspx?curpage=%d&RecordsPerPage=50&QueryID=10&ID=&turnpage=1&tpagemode=L&dbPrefix=SCPD&Fields=&DisplayMode=listmode&PageName=ASP.brief_result_aspx&isinEn=0&' % 1 132 | 133 | cookies = CookieUtil.getPatentCookiesProxy(date, code) 134 | 135 | # print("发起请求获取第一页信息", date, code) 136 | yield scrapy.Request( 137 | url=url_first, 138 | cookies=cookies, 139 | callback=self.parse_first_page, 140 | cb_kwargs={ 141 | 'cookies': cookies, 142 | "code": code, 143 | "date": date, 144 | "requestType": 'PatentGetFirstPage' 145 | }, 146 | meta={ 147 | 'url': url_first, 148 | # 'proxy': proxyString, 149 | "requestType": 'PatentGetFirstPage' 150 | }, 151 | dont_filter=True 152 | ) 153 | errCodeDate = ErrorUtil.getOneErrorCode(type=SpiderTypeEnum.PATENT) 154 | logging.info('重新获取出错链接并重爬链接结束') 155 | ###################################### 重新获取失败的链接,直到所有链接都获取成功 结束################ 156 | 157 | 158 | 159 | ######################## 重新请求所有失败链接 开始 ############################ 160 | logging.info("开始请求失败链接") 161 | errorLink = ErrorUtil.getOneErrorLink(type=SpiderTypeEnum.PATENT) 162 | while errorLink: 163 | id = errorLink[0] 164 | type = errorLink[1] 165 | code = errorLink[2] 166 | link = errorLink[3] 167 | date = errorLink[4] 168 | 169 | ErrorUtil.deleteErrorLink(id) 170 | 171 | url = link 172 | yield scrapy.Request( 173 | url=url, 174 | # cookies=cookies, 175 | callback=self.parse_content, 176 | dont_filter=True, # 这里不去重,因为之前的链接应该请求过,如果去重再次请求会直接过滤 177 | cb_kwargs={ 178 | 'url': url, 179 | 'code': code, 180 | 'date': date, 181 | "requestType": "patentGetContent" 182 | }, 183 | meta={ 184 | 'url': url, 185 | # 'proxy': proxyString, 186 | "requestType": "patentGetContent" 187 | } 188 | ) 189 | errorLink = ErrorUtil.getOneErrorLink(type=SpiderTypeEnum.PATENT) 190 | logging.info("所有失败链接已重新请求完毕") 191 | ######################## 重新请求所有失败链接 结束 ############################ 192 | 193 | logging.info("当前(论文)爬取任务、错误重爬均已完成") 194 | 195 | FileUtil.markFinishOnce() 196 | 197 | def handleErrorCodeDate(self): 198 | ''' 199 | 重新获取失败的链接,直到所有链接都获取成功 开始 200 | :return: 201 | ''' 202 | logging.info('开始重新获取出错链接并重爬链接') 203 | errCodeDate = ErrorUtil.getOneErrorCode(type=SpiderTypeEnum.PATENT) 204 | while errCodeDate: 205 | id = errCodeDate[0] 206 | type = errCodeDate[1] 207 | code = errCodeDate[2] 208 | date = errCodeDate[3] 209 | # 从数据库中删除这条已经获取的日期代码对,不用担心出错,如果出错会被错误处理模块捕获 210 | # 但其实这里还有个小bug,就是可能有的请求还在请求中,但是数据库这时候空了,导致最后几个出错请求没被重新爬 211 | # 这样的问题就只涉及几个专利,只要再运行一次程序就行 212 | ErrorUtil.deleteErrorCode(id=id) 213 | url_first = 'https://kns.cnki.net/kns/brief/brief.aspx?curpage=%d&RecordsPerPage=50&QueryID=10&ID=&turnpage=1&tpagemode=L&dbPrefix=SCPD&Fields=&DisplayMode=listmode&PageName=ASP.brief_result_aspx&isinEn=0&' % 1 214 | 215 | cookies = CookieUtil.getPatentCookiesProxy(date, code) 216 | 
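            # The cookie fetched above is tied to this (date, code) query session on
            # kns.cnki.net; parse_first_page reuses it for paging and swaps in a
            # fresh one every 13 pages. Note that handleErrorCodeDate and
            # hanldeErrorLink duplicate the inline retry blocks in start_requests
            # and are not currently invoked (their calls there are commented out).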
217 | # print("发起请求获取第一页信息", date, code) 218 | yield scrapy.Request( 219 | url=url_first, 220 | cookies=cookies, 221 | callback=self.parse_first_page, 222 | cb_kwargs={ 223 | 'cookies': cookies, 224 | "code": code, 225 | "date": date, 226 | "requestType": 'PatentGetFirstPage' 227 | }, 228 | meta={ 229 | 'url': url_first, 230 | # 'proxy': proxyString, 231 | "requestType": 'PatentGetFirstPage' 232 | }, 233 | dont_filter=True 234 | ) 235 | errCodeDate = ErrorUtil.getOneErrorCode(type=SpiderTypeEnum.PATENT) 236 | logging.info('开始重新获取出错链接并重爬链接') 237 | ###################################### 重新获取失败的链接,直到所以链接都获取成功 结束################ 238 | 239 | def hanldeErrorLink(self): 240 | ''' 241 | 重新请求所有失败链接 242 | :return: 243 | ''' 244 | logging.info("开始请求失败链接") 245 | print("我运行了") 246 | errorLink = ErrorUtil.getOneErrorLink(type=SpiderTypeEnum.PATENT) 247 | while errorLink: 248 | id = errorLink[0] 249 | type = errorLink[1] 250 | code = errorLink[2] 251 | link = errorLink[3] 252 | date = errorLink[4] 253 | 254 | ErrorUtil.deleteErrorLink(id) 255 | 256 | url = link 257 | yield scrapy.Request( 258 | url=url, 259 | # cookies=cookies, 260 | callback=self.parse_content, 261 | dont_filter=True, # 这里不去重,因为之前的链接应该请求过,如果去重再次请求会直接过滤 262 | cb_kwargs={ 263 | 'url': url, 264 | 'code': code, 265 | 'date': date, 266 | "requestType": "patentGetContent" 267 | }, 268 | meta={ 269 | 'url': url, 270 | # 'proxy': proxyString, 271 | "requestType": "patentGetContent" 272 | } 273 | ) 274 | errorLink = ErrorUtil.getOneErrorLink(type=SpiderTypeEnum.PATENT) 275 | logging.info("所有失败链接已重新请求完毕") 276 | 277 | 278 | 279 | 280 | def ifSkipDate(self, response,code, date,cookies, requestType): 281 | ''' 282 | 判断某天是否有专利,如果没有的话,就直接跳过该天 283 | :param response: 284 | :param code: 285 | :param date: 286 | :param cookies: 287 | :param requestType: 288 | :return: 289 | ''' 290 | # print('进入parse_first_page成功') 291 | if ErrorUtil.isBadResponse(response=response): 292 | return 293 | # print('进入parse_first_page成功2') 294 | # 使用上次请求的cookie,否则无法翻页成功 295 | cookies_now = cookies 296 | # 获取上次请求的使用的proxy,这次请求用的cookie和proxy都和以前一致 297 | # proxyString = response.meta['proxy'] 298 | pagerTitleCell = response.xpath('//div[@class="pagerTitleCell"]/text()').extract_first() 299 | if pagerTitleCell == None: 300 | # print(response.text) 301 | # 这里的url一定不是空的,如果是空的话前面已经return了不用担心 302 | logging.error("专利页面解析出现错误 %s %s %s %s" % (code, date, response.meta['url'], response.text)) 303 | ErrorUtil.markCodeError(code=code, date=date, type=SpiderTypeEnum.PATENT) 304 | return 305 | page = pagerTitleCell.strip() 306 | num = int(re.findall(r'\d+', page.replace(',', ''))[0]) # 文献数 307 | pagenum = math.ceil(num / 50) # 算出页数 308 | logging.info("%s 共有:%d篇文献" % (date, num)) 309 | if num < 1: 310 | self.sm.stepIntoNextDate(date) 311 | return 312 | 313 | # 第一页内容解析,获取页数信息 314 | def parse_first_page(self, response,code, date,cookies, requestType): 315 | # print('进入parse_first_page成功') 316 | if ErrorUtil.isBadResponse(response=response): 317 | return 318 | # print('进入parse_first_page成功2') 319 | # 使用上次请求的cookie,否则无法翻页成功 320 | cookies_now = cookies 321 | # 获取上次请求的使用的proxy,这次请求用的cookie和proxy都和以前一致 322 | # proxyString = response.meta['proxy'] 323 | pagerTitleCell = response.xpath('//div[@class="pagerTitleCell"]/text()').extract_first() 324 | if pagerTitleCell == None: 325 | # print(response.text) 326 | # 这里的url一定不是空的,如果是空的话前面已经return了不用担心 327 | logging.error("专利页面解析出现错误 %s %s %s %s" % (code, date, response.meta['url'], response.text)) 328 | ErrorUtil.markCodeError(code=code, date=date, 
type=SpiderTypeEnum.PATENT) 329 | return 330 | page = pagerTitleCell.strip() 331 | num = int(re.findall(r'\d+', page.replace(',', ''))[0]) # 文献数 332 | pagenum = math.ceil(num / 50) # 算出页数 333 | logging.info("%s %s 共有:%d篇文献, %d页" % (code, date, num, pagenum)) 334 | if num < 1: 335 | return 336 | if pagenum > 120: 337 | with open(FileUtil.errorOverflowDir + 'papentOverflow.txt', 'a') as f: 338 | f.write(date + ':' + code + '\n') 339 | return 340 | # 测试一下cookie一样换proxy行不行的通 341 | # 后续:测试完成,证明行得通 342 | # proxyDict = ApeProxyManager.getProxyDict() 343 | # proxyString = ApeProxyManager.proxyDict2String(proxyDict) 344 | for i in range(1, pagenum + 1): 345 | # 超过15页换cookie 346 | if i % 13 == 0: 347 | # proxyDict = ApeProxyManager.getProxyDict() 348 | # proxyString = ApeProxyManager.proxyDict2String(proxyDict) 349 | cookies_now = CookieUtil.getPatentCookiesProxy(date, code) 350 | url = self.base_url % i 351 | # logging.debug('换了proxy未换cookie看是否能请求成功') 352 | # logging.debug("发起请求获取第%d页信息 %s %s", (i, date, code)) 353 | yield scrapy.Request( 354 | url=url, 355 | cookies=cookies_now, 356 | callback=self.parse_page_links, 357 | cb_kwargs={ 358 | "pagenum": i, 359 | "code": code, 360 | "date": date, 361 | "requestType": "PatentGetLinks" 362 | }, 363 | meta={ 364 | 'url':url, 365 | # 'proxy': proxyString, 366 | "requestType": "PatentGetLinks" 367 | }, 368 | dont_filter=True 369 | ) 370 | 371 | def parse_page_links(self,response,pagenum,code,date,requestType): 372 | if ErrorUtil.isBadResponse(response=response): 373 | return 374 | link = response.xpath('//a[@class="fz14"]/@href').extract() # 返回链接地址href列表 375 | if len(link) == 0: 376 | return 377 | logging.debug("日期:%s,学科分类:%s,第%d页有%d个专利" % (date, code, pagenum+1, len(link))) 378 | for j in range(len(link)): 379 | # item = PatentCodeItem() 380 | patentCode = re.search(r'filename=(.*)$', link[j]).group(1) 381 | # item['patentCode'] = patentCode 382 | # yield item 383 | url = self.patent_content_pre_url % (date[0:4], patentCode) 384 | # print(date) 385 | # proxyDict = ApeProxyManager.getProxyDict() 386 | # proxyString = ApeProxyManager.proxyDict2String(proxyDict) 387 | # logging.debug("准备发起解析专利请求,%s" % url) 388 | yield scrapy.Request( 389 | url=url, 390 | # cookies=cookies, 391 | callback=self.parse_content, 392 | dont_filter=False, # 这里参与去重,专利文件不重复 393 | cb_kwargs={ 394 | 'url': url, 395 | 'code': code, 396 | 'date': date, 397 | "requestType": "patentGetContent" 398 | }, 399 | meta={ 400 | 'url':url, 401 | # 'proxy': proxyString, 402 | "requestType": "patentGetContent" 403 | } 404 | ) 405 | 406 | # 获取专利详情页内容 407 | def parse_content(self, response, url, code, date, requestType): 408 | if ErrorUtil.isBadResponse(response=response): 409 | return 410 | logging.debug("解析专利:%s" % url) 411 | item = self.getDefaultPatentItem() 412 | item['type'] = SpiderTypeEnum.PATENT.value 413 | item['naviCode'] = code # 学科分类 414 | item['year'] = date[0:4] # 年份,主要用来爬虫归类用 415 | title = response.xpath("//h1/text()").extract_first().strip() 416 | item['title'] = title 417 | # 有的是在row下,有的是在row的row1和row2下,这么写效率最高 418 | rows = response.xpath("//div[@class='row'] | //div[@class='row-1'] | //div[@class='row-2']") 419 | for row in rows: 420 | key = row.xpath("./span[@class='rowtit']/text() | ./span[@class='rowtit2']/text()").extract_first() 421 | if key is not None: 422 | key = key.strip() 423 | # 有的文本含有链接,提取方法不一样 424 | # 得到列表形式的值 425 | valueList = row.xpath("./p[@class='funds']/text() | ./p[@class='funds']/a/text()").getall() 426 | value = "".join(valueList) 427 | if key == "专利类型:": 428 | 
item['applicationType'] = value # 专利类型 429 | elif key == "申请日:": 430 | item['applicationDate'] = value # 申请日 431 | elif key == "多次公布:": 432 | # 多次公布后面发现是动态加载,其实这样获取不到,但是,不同阶段的专利的申请(专利)号是一样的,多次公布不爬问题也不大 433 | item['multiPublicationNo'] = value # 多次公布 434 | elif key == "申请人:": 435 | item['applicant'] = value # 申请人 436 | elif key == "地址:": 437 | item['applicantAddress'] = value # 地址 438 | elif key == "发明人:": 439 | item['inventors'] = value # 发明人原始字符串 440 | i1, i2, i3, i4 = self.getFirstFourAuthor(value) 441 | elif key == "申请(专利)号:": 442 | item['applicationNO'] = value # 申请(专利)号 443 | elif key == "申请公布号:": 444 | item['applyPublicationNo'] = value # 申请公布号 445 | elif key == "授权公布号:": 446 | item['authPublicationNo'] =value # 授权公布号 447 | elif key == "公开公告日:": 448 | item['publicationDate'] = value # 公开公告日 449 | elif key == "授权公告日:": 450 | item['authPublicationDate'] = value # 授权公告日 451 | elif key == "国省代码:": 452 | item['areaCode'] = value # 国省代码 453 | elif key == "分类号:": 454 | item['classificationNO'] = value # 分类号 455 | elif key == "主分类号:": 456 | item['mainClassificationNo'] = value # 主分类号 457 | elif key == "代理机构:": 458 | item['agency'] = value # 代理机构 459 | elif key == "代理人:": 460 | item['agent'] = value # 代理人 461 | elif key == "页数:": 462 | item['page'] = value # 页数 463 | 464 | abstract = response.xpath("//div[@class='abstract-text']/text()").extract_first() # 摘要 465 | sovereignty = response.xpath("//div[@class='claim-text']/text()").extract_first() # 主权项 466 | item['abstract'] = StringUtil.stringHanlde(abstract) # 摘要 467 | item['sovereignty'] = StringUtil.stringHanlde(sovereignty) # 主权项 468 | item['url'] = url # 专利在知网的链接 469 | 470 | # 法律状态 base url 471 | # legalStatusBaseUrl = "https://kns.cnki.net/kcms/detail/frame/ReaderComments.aspx?flag=gbserach&dbcode=SCPD&dbname=SCPD&filename=%s&vl=%s" 472 | # 法律状态url中的vl字段 473 | # vl = response.xpath("//input[@id='listv']/@value").extract_first() 474 | # item['legalStatus'] = legalStatusBaseUrl % (item['applicationNO'], vl) # 法律状态链接 475 | item['legalStatus'] = "" #法律状态会更新,所以建议以后用到的时候实时请求知网。相关的爬虫代码有空会给出 476 | # 保存html文件 477 | FileUtil.saveHtml(year=date[0:4], response=response, type=SpiderTypeEnum.PATENT.value, url=url, title=title) 478 | yield item 479 | 480 | def getDefaultPatentItem(self): 481 | ''' 482 | 为专利item加默认的键,防止后面保存出现不存在的键 483 | :param item: 484 | :return: 485 | ''' 486 | item = PatentContentItem() 487 | item['naviCode'] = "" 488 | item['title'] = "" 489 | item['year'] = "" 490 | item['applicationType'] = "" 491 | item['applicationDate'] = "" 492 | item['multiPublicationNo'] = "" 493 | item['applicant'] = "" 494 | item['applicantAddress'] = "" 495 | item['inventors'] = "" 496 | item['applicationNO'] = "" 497 | item['applyPublicationNo'] = "" 498 | item['publicationDate'] = "" 499 | item['authPublicationDate'] = "" 500 | item['authPublicationNo'] = "" 501 | item['areaCode'] = "" 502 | item['classificationNO'] = "" 503 | item['mainClassificationNo'] = "" 504 | item['agency'] = "" 505 | item['agent'] = "" 506 | item['page'] = "" 507 | item['abstract'] = "" 508 | item['sovereignty'] = "" 509 | item['url'] = "" 510 | item['legalStatus'] = "" 511 | return item 512 | 513 | def getFirstFourAuthor(self, inventors): 514 | # 把“叶莉华; 冯洋洋; 王著元; 程志祥; 崔一平”形式的作者划分开 515 | # 查了下中国专利作者的数量是不受限制的,这里对发明人做了预处理,提取分割了前四位 516 | inventorsList = inventors.split(';') 517 | inventorsLen = len(inventorsList) 518 | if inventorsLen < 1: 519 | return '', '', '', '' 520 | first_inventor = inventorsList[0] 521 | if inventorsLen > 1: 522 | second_inventor = inventorsList[1].strip() 
523 | else: 524 | second_inventor = '' 525 | if inventorsLen > 2: 526 | third_inventor = inventorsList[2].strip() 527 | else: 528 | third_inventor = '' 529 | if inventorsLen > 3: 530 | fourth_inventor = inventorsList[3].strip() 531 | else: 532 | fourth_inventor = '' 533 | 534 | return first_inventor, second_inventor, third_inventor, fourth_inventor 535 | 536 | 537 | 538 | 539 | # 根据日期,分类代码获取cookies 540 | # def getCookies(self, date, code, proxy = None): 541 | # search_url = 'http://kns.cnki.net/kns/request/SearchHandler.ashx' 542 | # times = time.strftime('%a %b %d %Y %H:%M:%S') + ' GMT+0800 (中国标准时间)' 543 | # params = { 544 | # "action": "", 545 | # "NaviCode": code, 546 | # "ua": "1.21", 547 | # "isinEn": "0", 548 | # "PageName": "ASP.brief_result_aspx", 549 | # "DbPrefix": "SCPD", 550 | # "DbCatalog": "中国专利数据库", 551 | # "ConfigFile": "SCPD.xml", 552 | # "db_opt": "SCOD", 553 | # "db_value": "中国专利数据库", 554 | # "date_gkr_from": date, 555 | # "date_gkr_to": date, 556 | # "his": '0', 557 | # '__': times 558 | # } 559 | # if proxy: 560 | # session_response = requests.get(search_url, params=params, proxy=proxy) 561 | # else: 562 | # session_response = requests.get(search_url, params=params) 563 | # cookies = requests.utils.dict_from_cookiejar(session_response.cookies) 564 | # return cookies 565 | 566 | 567 | 568 | def generateErrorItem(self, response): 569 | ''' 570 | (已弃用)判断是否出现错误,如果有错误,yield错误item,并返回出错标志 571 | :param response: 572 | :return: 573 | ''' 574 | item = ErrorUrlItem() 575 | item['url'] = response.meta['url'] 576 | item['reqType'] = response.meta['requestType'] 577 | errorFlag = False 578 | if not response.url: # 接收到url==''时 579 | print('这里是异常item url') 580 | logging.info('500') 581 | item['errType'] = '500' 582 | errorFlag = True 583 | # todo 报错错误item 584 | # yield item 585 | elif 'exception' in response.url: 586 | print('这里是异常item exception') 587 | item = ErrorUrlItem() 588 | item['errType'] = 'Exception' 589 | errorFlag = True 590 | # todo 保存错误item 591 | # yield item 592 | # print('errorFlag', errorFlag) 593 | return errorFlag 594 | 595 | 596 | -------------------------------------------------------------------------------- /CnkiSpider/spiders/paperAchSpider.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import math 3 | import re 4 | import time 5 | import uuid 6 | from scrapy_redis.spiders import RedisSpider 7 | 8 | import requests 9 | import scrapy 10 | 11 | from CnkiSpider.items import * 12 | from CnkiSpider.statusManager import StatusManager 13 | from CnkiSpider.commonUtils import StringUtil, SpiderTypeEnum, CookieUtil, ErrorUtil 14 | from CnkiSpider.file_util import FileUtil 15 | import os 16 | import logging 17 | 18 | class PaperAchSpider(RedisSpider): 19 | ''' 20 | 博硕、期刊、科技成果都在这个spider里面 21 | ''' 22 | name = 'paperAch' 23 | allowed_domains = ['www.cnki.net', 'kns.cnki.net'] 24 | custom_settings = { 25 | # 设置管道下载 26 | # 设置log日志 27 | 'LOG_LEVEL': 'INFO', 28 | 'LOG_FILE': FileUtil.logDir + 'paperAch.log' 29 | } 30 | 31 | def __init__(self, settings, *args, **kwargs): 32 | super(PaperAchSpider, self).__init__(*args, **kwargs) 33 | self.base_url = 'https://kns.cnki.net/kns/brief/brief.aspx?RecordsPerPage=50&QueryID=33&ID=&turnpage=1&tpagemode=L&dbPrefix=SCDB&Fields=&DisplayMode=listmode&PageName=ASP.brief_result_aspx&isinEn=1&curpage=' 34 | 35 | @classmethod 36 | def from_crawler(cls, crawler, *args, **kwargs): 37 | spider = cls(crawler.settings, *args, **kwargs) 38 | spider._set_crawler(crawler) 39 | return spider 
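    # Overview of start_requests below: it walks every (date, naviCode) pair handed
    # out by StatusManager (getLastDateAndCode once, then getNextDateAndCode until
    # None), yields the first result page for each pair, then replays the failed
    # (code, date) pairs and failed detail links that ErrorUtil recorded in MySQL,
    # and finally calls FileUtil.markFinishOnce().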
40 | 41 | # 重写startrequests 42 | def start_requests(self): 43 | 44 | FileUtil.initOutputDir() 45 | sm = StatusManager(SpiderTypeEnum.PAPER_AND_ACH) 46 | lastDateAndCode = sm.getLastDateAndCode() 47 | if lastDateAndCode is None: 48 | return 49 | # 上次爬取可能进行到了一半,所以要重爬一下 50 | nextDateAndCode = lastDateAndCode 51 | while nextDateAndCode is not None: 52 | date = nextDateAndCode[0] 53 | code = nextDateAndCode[1] 54 | logging.info("开始爬取论文和成果链接,日期:%s,学科分类:%s" % (date, code)) 55 | # 根据日期、code获取cookies 56 | cookies = CookieUtil.getPaperAchCookiesProxy(date,code) 57 | url_first = self.base_url + '1' 58 | yield scrapy.Request( 59 | url=url_first, 60 | cookies=cookies, 61 | callback=self.parse_first_page, 62 | cb_kwargs={ 63 | 'cookies': cookies, 64 | "code": code, 65 | "date": date, 66 | "requestType": "PaperAchGetFirstPage" 67 | }, 68 | meta={ 69 | 'url': url_first, 70 | "requestType": "PaperAchGetFirstPage" 71 | }, 72 | dont_filter=True 73 | ) 74 | nextDateAndCode = sm.getNextDateAndCode() 75 | logging.info('所有论文成果链接已经获取结束!') 76 | 77 | #################### 重新获取失败的链接,直到所有链接都获取成功 开始 ################### 78 | logging.info('开始重新获取出错链接并重爬链接') 79 | errCodeDate = ErrorUtil.getOneErrorCode(type=SpiderTypeEnum.PAPER_AND_ACH) 80 | while errCodeDate: 81 | id = errCodeDate[0] 82 | type = errCodeDate[1] 83 | code = errCodeDate[2] 84 | date = errCodeDate[3] 85 | # 从数据库中删除这条已经获取的日期代码对,不用担心出错,如果出错会被错误处理模块捕获 86 | # 但其实这里还有个小bug,就是可能有的请求还在请求中,但是数据库这时候空了,导致最后几个出错请求没被重新爬 87 | # 这样的问题就只涉及几个专利,只要再运行一次程序就行 88 | ErrorUtil.deleteErrorCode(id=id) 89 | cookies = CookieUtil.getPaperAchCookiesProxy(date, code) 90 | url_first = self.base_url + '1' 91 | yield scrapy.Request( 92 | url=url_first, 93 | cookies=cookies, 94 | callback=self.parse_first_page, 95 | cb_kwargs={ 96 | 'cookies': cookies, 97 | "code": code, 98 | "date": date, 99 | "requestType": "PaperAchGetFirstPage" 100 | }, 101 | meta={ 102 | 'url': url_first, 103 | "requestType": "PaperAchGetFirstPage" 104 | }, 105 | dont_filter=True 106 | ) 107 | 108 | errCodeDate = ErrorUtil.getOneErrorCode(type=SpiderTypeEnum.PAPER_AND_ACH) 109 | logging.info('开始重新获取出错链接并重爬链接') 110 | ###################################### 重新获取失败的链接,直到所有链接都获取成功 结束################ 111 | 112 | ######################## 重新请求所有失败链接 开始 ############################ 113 | logging.info("开始请求期刊失败链接") 114 | errorLink = ErrorUtil.getOneErrorLink(type=SpiderTypeEnum.JOURNAL) 115 | while errorLink: 116 | id = errorLink[0] 117 | type = errorLink[1] 118 | code = errorLink[2] 119 | link = errorLink[3] 120 | date = errorLink[4] 121 | 122 | ErrorUtil.deleteErrorLink(id) 123 | 124 | url = link 125 | yield scrapy.Request( 126 | url=url, 127 | callback=self.parse_journal_content, 128 | dont_filter=True, # 这里不去重,因为之前的链接应该请求过,如果去重再次请求会直接过滤 129 | cb_kwargs={ 130 | 'url': url, 131 | 'code': code, 132 | 'date': date, 133 | "requestType": "JournalGetContent" 134 | }, 135 | meta={ 136 | 'url': url, 137 | "requestType": "JournalGetContent" 138 | } 139 | ) 140 | errorLink = ErrorUtil.getOneErrorLink(type=SpiderTypeEnum.JOURNAL) 141 | logging.info("所有期刊失败链接已重新请求完毕") 142 | 143 | logging.info("开始请求期刊失败链接") 144 | errorLink = ErrorUtil.getOneErrorLink(type=SpiderTypeEnum.JOURNAL) 145 | while errorLink: 146 | id = errorLink[0] 147 | type = errorLink[1] 148 | code = errorLink[2] 149 | link = errorLink[3] 150 | date = errorLink[4] 151 | 152 | ErrorUtil.deleteErrorLink(id) 153 | 154 | url = link 155 | yield scrapy.Request( 156 | url=url, 157 | callback=self.parse_journal_content, 158 | dont_filter=True, # 这里不去重,因为之前的链接应该请求过,如果去重再次请求会直接过滤 159 | 
cb_kwargs={ 160 | 'url': url, 161 | 'code': code, 162 | 'date': date, 163 | "requestType": "JournalGetContent" 164 | }, 165 | meta={ 166 | 'url': url, 167 | "requestType": "JournalGetContent" 168 | } 169 | ) 170 | errorLink = ErrorUtil.getOneErrorLink(type=SpiderTypeEnum.JOURNAL) 171 | logging.info("所有期刊失败链接已重新请求完毕") 172 | 173 | logging.info("开始请求博硕失败链接") 174 | errorLink = ErrorUtil.getOneErrorLink(type=SpiderTypeEnum.BOSHUO) 175 | while errorLink: 176 | id = errorLink[0] 177 | type = errorLink[1] 178 | code = errorLink[2] 179 | link = errorLink[3] 180 | date = errorLink[4] 181 | 182 | ErrorUtil.deleteErrorLink(id) 183 | 184 | url = link 185 | yield scrapy.Request( 186 | url=url, 187 | callback=self.parse_boshuo_content, 188 | dont_filter=True, # 这里不去重,因为之前的链接应该请求过,如果去重再次请求会直接过滤 189 | cb_kwargs={ 190 | 'url': url, 191 | 'code': code, 192 | 'date': date, 193 | "requestType": "BoshuoGetContent" 194 | }, 195 | meta={ 196 | 'url': url, 197 | "requestType": "BoshuoGetContent" 198 | } 199 | ) 200 | errorLink = ErrorUtil.getOneErrorLink(type=SpiderTypeEnum.BOSHUO) 201 | logging.info("所有博硕失败链接已重新请求完毕") 202 | 203 | logging.info("开始请求成果失败链接") 204 | errorLink = ErrorUtil.getOneErrorLink(type=SpiderTypeEnum.ACHIEVEMENT) 205 | while errorLink: 206 | id = errorLink[0] 207 | type = errorLink[1] 208 | code = errorLink[2] 209 | link = errorLink[3] 210 | date = errorLink[4] 211 | 212 | ErrorUtil.deleteErrorLink(id) 213 | 214 | url = link 215 | yield scrapy.Request( 216 | url=url, 217 | callback=self.parse_ach_content, 218 | dont_filter=True, # 这里不去重,因为之前的链接应该请求过,如果去重再次请求会直接过滤 219 | cb_kwargs={ 220 | 'url': url, 221 | 'code': code, 222 | 'date': date, 223 | "requestType": "AchGetContent" 224 | }, 225 | meta={ 226 | 'url': url, 227 | "requestType": "AchGetContent" 228 | } 229 | ) 230 | errorLink = ErrorUtil.getOneErrorLink(type=SpiderTypeEnum.ACHIEVEMENT) 231 | logging.info("所有成果失败链接已重新请求完毕") 232 | ######################## 重新请求所有失败链接 结束 ############################ 233 | 234 | logging.info("当前(期刊、博硕、成果)爬取任务、错误重爬均已完成") 235 | 236 | FileUtil.markFinishOnce() 237 | 238 | # 第一页内容解析,获取页数信息 239 | def parse_first_page(self,response,cookies,code,date, requestType): 240 | if ErrorUtil.isBadResponse(response=response): 241 | return 242 | cookies_now = cookies 243 | pagerTitleCell = response.xpath('//div[@class="pagerTitleCell"]/text()').extract_first() 244 | if pagerTitleCell == None: 245 | logging.error('第一页解析出错,以下是获取到的response:%s' % response.text) 246 | ErrorUtil.markCodeError(code=code, date=date, type=SpiderTypeEnum.PAPER_AND_ACH) 247 | return 248 | page = pagerTitleCell.strip() 249 | num = int(re.findall(r'\d+', page.replace(',', ''))[0]) # 文献数 250 | pagenum = math.ceil(num / 50) #算出页数 251 | logging.info("%s %s 共有:%d篇文献, %d页" % (code,date,num,pagenum)) 252 | if num < 1: 253 | return 254 | if pagenum > 120: 255 | with open(FileUtil.errorOverflowDir + 'papentOverflow.txt', 'a') as f: 256 | f.write(date + ':' + code + '\n') 257 | return 258 | for i in range(1,pagenum+1): 259 | if i % 13 == 0: 260 | cookies_now = CookieUtil.getPaperAchCookiesProxy(date,code) # 超过15页换cookie 261 | url = self.base_url + str(i) 262 | yield scrapy.Request( 263 | url=url, 264 | cookies=cookies_now, 265 | callback=self.parse_page_links, 266 | cb_kwargs={ 267 | "pagenum": i, 268 | "code": code, 269 | "date": date, 270 | "requestType": "PaperAchGetLinks" 271 | }, 272 | meta={ 273 | 'url': url, 274 | "requestType": "PaperAchGetLinks" 275 | }, 276 | dont_filter=True 277 | ) 278 | 279 | # 解析列表内容获取链接 280 | def 
parse_page_links(self,response,pagenum,code,date,requestType): 281 | if ErrorUtil.isBadResponse(response=response): 282 | return 283 | rows = response.xpath('//table[@class="GridTableContent"]/tr') 284 | if len(rows) < 1: 285 | # 某一页没有获取到列表内容 286 | logging.error('页面无链接,以下是获取到的response:%s' % response.text) 287 | ErrorUtil.markCodeError(code=code,date=date, type=SpiderTypeEnum.PAPER_AND_ACH) 288 | return 289 | else: 290 | rows.pop(0) # 去掉标题行 291 | num = len(rows) # 该页链接数 292 | logging.debug("爬取%s %s 第%d页: %d个链接" % (code,date,pagenum,num)) 293 | for row in rows: 294 | link = row.xpath('./td/a[@class="fz14"]/@href').extract_first() 295 | link_params = link.split('&') 296 | urlParam = link_params[3] + "&" + link_params[4] + "&" + link_params[5] 297 | url = 'https://kns.cnki.net/KCMS/detail/detail.aspx?' + urlParam 298 | db = row.xpath('./td')[5].xpath('./text()').extract_first().strip() 299 | # 按不同的db类型解析文献,且只取期刊、博硕、成果三类,其余舍弃 300 | if db == '期刊': 301 | item = JournalLinkItem() 302 | item['code'] = code 303 | item['url'] = url 304 | item['db'] = db 305 | yield scrapy.Request( 306 | url=url, 307 | callback=self.parse_journal_content, 308 | # dont_filter=True, 309 | cb_kwargs={ 310 | 'url': url, 311 | 'code': code, 312 | 'date': date, 313 | "requestType": "JournalGetContent" 314 | }, 315 | meta={ 316 | 'url': url, 317 | "requestType": "JournalGetContent" 318 | } 319 | ) 320 | elif db == '博士' or db == '硕士': 321 | item = BoshuoLinkItem() 322 | item['code'] = code 323 | item['url'] = url 324 | item['db'] = db 325 | yield scrapy.Request( 326 | url=url, 327 | callback=self.parse_boshuo_content, 328 | # dont_filter=True, 329 | cb_kwargs={ 330 | 'url': url, 331 | 'code': code, 332 | 'date': date, 333 | "requestType": "BoshuoGetContent" 334 | }, 335 | meta={ 336 | 'url': url, 337 | "requestType": "BoshuoGetContent" 338 | } 339 | ) 340 | elif db == '科技成果': 341 | item = AchLinkItem() 342 | item['code'] = code 343 | item['url'] = url 344 | item['db'] = db 345 | yield scrapy.Request( 346 | url=url, 347 | callback=self.parse_ach_content, 348 | # dont_filter=True, 349 | cb_kwargs={ 350 | 'url': url, 351 | 'code': code, 352 | 'date': date, 353 | "requestType": "AchGetContent" 354 | }, 355 | meta={ 356 | 'url': url, 357 | "requestType": "AchGetContent" 358 | } 359 | ) 360 | 361 | 362 | # 获取期刊详情页内容 363 | def parse_journal_content(self, response, url, code, date, requestType): 364 | # 跳过知网错误链接 365 | # if url == 'https://kns.cnki.net/KCMS/detail/Error.aspx': 366 | # return 367 | if ErrorUtil.isBadResponse(response=response): 368 | return 369 | logging.debug("解析期刊:%s" % url) 370 | item = self.getDefaultJournalItem() 371 | item['naviCode'] = code 372 | item['type'] = SpiderTypeEnum.JOURNAL.value 373 | item['year'] = date[0:4] 374 | item['url'] = url 375 | # 根据link链接生成唯一uid,散列是SHA1,去除- 376 | uid = str(uuid.uuid5(uuid.NAMESPACE_DNS, url)) 377 | suid = ''.join(uid.split('-')) 378 | item['uid'] = suid 379 | item['title'] = response.xpath('//h1/text()').extract_first() 380 | magazine = response.xpath('//div[@class="top-tip"]/span/a') 381 | magazinefunc = magazine.xpath('./@onclick').extract_first() 382 | m = magazinefunc.strip().split("'") 383 | item['magazine'] = magazine.xpath('./text()').extract_first() + "-pcode=" + m[1] + "&pykm=" + m[3] 384 | summary = response.xpath('string(//span[@id="ChDivSummary"])').extract_first() 385 | item['summary'] = summary.replace('\n', '').replace('\r', ' ') 386 | keywordsfuncs = response.xpath('//p[@class="keywords"]/a/@onclick').extract() 387 | if len(keywordsfuncs) > 0: 388 | keywords 
= "" 389 | for k in keywordsfuncs: 390 | k = k.strip().split("'") 391 | keywords = keywords + ";" + k[3] + "-" + k[7] 392 | item['keywords'] = keywords[1:] 393 | item['authorsWithCode'] = self.getAuthorsWithLinkStr(response) 394 | item['authors'] = "&&".join(self.getAuthorsList(response)) 395 | # 单位字符串用';'分隔 396 | organs = ";".join(self.getOrganList(response)) 397 | item['organs'] = organs 398 | item['authorOrganJson'] = self.getAuthorOrganJson(response) 399 | top_space = response.xpath('//li[@class="top-space"]') 400 | for space in top_space: # 存在不同文献格式不同,只能判断标题名称 401 | title = space.xpath('./span/text()').extract_first() 402 | content = space.xpath('./p/text()').extract_first() 403 | if title == 'DOI:': 404 | item['DOI'] = content 405 | if title == '分类号:': 406 | item['cate_code'] = content 407 | if title == '来源数据库:': 408 | item['db'] = content 409 | if title == '专辑:': 410 | item['special'] = content 411 | if title == '专题:': 412 | item['subject'] = content 413 | # 保存html文件 414 | FileUtil.saveHtml(response=response, type=SpiderTypeEnum.JOURNAL.value, url=url, title=item['title'], year=date[0:4]) 415 | yield item 416 | 417 | # 获取博硕详情页内容 418 | def parse_boshuo_content(self, response, url, code, date, requestType): 419 | # if url == 'https://kns.cnki.net/KCMS/detail/Error.aspx': 420 | # return 421 | if ErrorUtil.isBadResponse(response=response): 422 | return 423 | logging.debug("解析博硕:%s" % url) 424 | item = self.getDefaultBoshuoItem() 425 | # 跳过知网错误链接 426 | item['naviCode'] = code 427 | item['type'] = SpiderTypeEnum.BOSHUO.value 428 | item['year'] = date[0:4] 429 | # 无用字段,为了让期刊和博硕的数据格式保持一致 430 | item['authorsWithCode'] = "" 431 | item['url'] = url 432 | # 根据link链接生成唯一uid,散列是SHA1,去除- 433 | uid = str(uuid.uuid5(uuid.NAMESPACE_DNS, url)) 434 | suid = ''.join(uid.split('-')) 435 | item['uid'] = suid 436 | item['title'] = response.xpath('//div[@class="wx-tit"]/h1/text()').extract_first() 437 | summary = response.xpath('//span[@id="ChDivSummary"]/text()').extract_first() 438 | if summary: 439 | item['summary'] = summary.replace('\n', '').replace('\r', ' ') 440 | keywordsfuncs = response.xpath('//div[@class="brief"]/div/p[@class="keywords"]/a/@onclick').extract() 441 | keywords = "" 442 | for k in keywordsfuncs: 443 | k = k.strip().split("'") 444 | keywords = keywords + ";" + k[3] + "-" + k[7] 445 | item['keywords'] = keywords[1:] 446 | brief = response.xpath('//div[@class="wx-tit"]/h3/span') 447 | splitTag = "&&" # 作者分隔符 448 | if (len(brief) >= 2): 449 | authors = brief[0] 450 | if authors.xpath('./a'): 451 | authorfuncs = authors.xpath('./a/@onclick').extract() 452 | authors = "" 453 | for a in authorfuncs: 454 | a = a.strip().split("'") 455 | author = a[3] + '-' + a[5] 456 | authors = authors + splitTag + author 457 | # 去除第一个多余拆分隔符的影响 458 | item['authors'] = authors[len(splitTag):] 459 | else: 460 | item['authors'] = authors.xpath('./text()').extract_first() + "-null" 461 | school = brief[1] 462 | if school.xpath('./a'): 463 | item['organs'] = school.xpath('./a/text()').extract_first().strip() 464 | else: 465 | item['organs'] = school.xpath('./text()').extract_first() 466 | 467 | authorOrganJson = {} 468 | author = item['authors'].split("-")[0] 469 | authorOrganJson[author] = [item['organs']] 470 | item['authorOrganJson'] = str(authorOrganJson) 471 | top_space = response.xpath('//li[@class="top-space"]') 472 | for space in top_space: # 存在不同文献格式不同,只能判断标题名称 473 | title = space.xpath('./span/text()').extract_first() 474 | content = space.xpath('./p/text()').extract_first() 475 | if title == 'DOI:': 
476 | item['DOI'] = content 477 | if title == '来源数据库:': 478 | item['db'] = content 479 | if title == '专辑:': 480 | item['special'] = content 481 | if title == '专题:': 482 | item['subject'] = content 483 | if title == '分类号:': 484 | item['cate_code'] = content 485 | rows = response.xpath('//div[@class="row"]') 486 | for row in rows: 487 | title = row.xpath('./span/text()').extract_first() 488 | if title == '导师:': 489 | if row.xpath('./p/a'): 490 | mentorfuncs = row.xpath('./p/a/@onclick').extract_first() 491 | m = mentorfuncs.strip().split("'") 492 | item['mentor'] = m[3] + '-' + m[5] 493 | else: 494 | item['mentor'] = row.xpath('./p/text()').extract_first() 495 | # 保存html文件 496 | FileUtil.saveHtml(response=response, type=SpiderTypeEnum.BOSHUO.value, url=url, title=item['title'], year=date[0:4]) 497 | yield item 498 | 499 | # 获取成果详情页内容 500 | def parse_ach_content(self, response, url, code, date, requestType): 501 | # 跳过知网错误链接 502 | # if url == 'https://kns.cnki.net/KCMS/detail/Error.aspx': 503 | # return 504 | if ErrorUtil.isBadResponse(response=response): 505 | return 506 | logging.debug("解析成果:%s" % url) 507 | item = self.getDefaultAchItem() 508 | item['naviCode'] = code 509 | item['type'] = SpiderTypeEnum.ACHIEVEMENT.value 510 | item['year'] = date[0:4] 511 | item['url'] = url 512 | # 根据link链接生成唯一uid,散列是SHA1,去除- 513 | uid = str(uuid.uuid5(uuid.NAMESPACE_DNS, url)) 514 | suid = ''.join(uid.split('-')) 515 | item['uid'] = suid 516 | item['title'] = response.xpath('//h1/text()').extract_first() 517 | rows = response.xpath('//div[@class="row"]') 518 | for row in rows: 519 | title = row.xpath('./span/text()').extract_first() 520 | content = row.xpath('./p/text()').extract_first() 521 | if title == '成果完成人:': 522 | item['authors'] = content.replace(";", "&&") 523 | if title == '第一完成单位:': 524 | item['organ'] = content 525 | if title == '关键词:': 526 | item['keywords'] = content 527 | if title == '中图分类号:': 528 | item['book_code'] = content 529 | if title == '学科分类号:': 530 | item['subject_code'] = content 531 | if title == '成果简介:': 532 | item['summary'] = content.replace('\n', '').replace('\r', ' ') 533 | if title == '成果类别:': 534 | item['category'] = content 535 | if title == '成果入库时间:': 536 | item['in_time'] = content 537 | if title == '成果水平:': 538 | item['level'] = content 539 | if title == '研究起止时间:': 540 | item['pass_time'] = content 541 | if title == '评价形式:': 542 | item['evaluate'] = content 543 | # 保存html文件 544 | FileUtil.saveHtml(response=response, type=SpiderTypeEnum.ACHIEVEMENT.value, url=url, title=item['title'], year=date[0:4]) 545 | yield item 546 | 547 | def getDefaultJournalItem(self): 548 | ''' 549 | 为期刊item加默认的键,防止后面保存出现不存在的键 550 | :param item: 551 | :return: 552 | ''' 553 | item = JournalContentItem() 554 | item['naviCode'] = "" # 学科分类代码 如A001这种 555 | item['type'] = "" 556 | item['year'] = "" 557 | item['url'] = "" 558 | item['uid'] = "" 559 | item['title'] = "" 560 | item['authors'] = "" # 纯作者名列表 561 | item['authorsWithCode'] = "" # 带作者code的作者列表 562 | item['organs'] = "" 563 | item['authorOrganJson'] = "" # 作者和单位的对应关系json字符串 564 | item['summary'] = "" 565 | item['keywords'] = "" 566 | item['DOI'] = "" 567 | item['special'] = "" # 专辑 568 | item['subject'] = "" # 专题 569 | item['cate_code'] = "" # 分类号 570 | item['db'] = "" # 来源数据库 571 | 572 | item['magazine'] = "" # 期刊 573 | item['mentor'] = "" # 博硕导师 574 | return item 575 | 576 | def getDefaultBoshuoItem(self): 577 | item = BoshuoContentItem() 578 | item['naviCode'] = "" # 学科分类代码 如A001这种 579 | item['type'] = "" 580 | item['year'] = "" 581 | 
item['url'] = "" 582 | item['uid'] = "" 583 | item['title'] = "" 584 | item['authors'] = "" # 纯作者名列表 585 | item['authorsWithCode'] = "" # 带作者code的作者列表 586 | item['organs'] = "" 587 | item['authorOrganJson'] = "" # 作者和单位的对应关系json字符串 588 | item['summary'] = "" 589 | item['keywords'] = "" 590 | item['DOI'] = "" 591 | item['special'] = "" # 专辑 592 | item['subject'] = "" # 专题 593 | item['cate_code'] = "" # 分类号 594 | item['db'] = "" # 来源数据库 595 | 596 | item['magazine'] = "" # 期刊 597 | item['mentor'] = "" # 博硕导师 598 | return item 599 | 600 | def getDefaultAchItem(self): 601 | item = AchContentItem() 602 | item['naviCode'] = "" # 学科分类代码 如A001这种 603 | item['type'] = "" 604 | item['year'] = "" 605 | item['url'] = "" 606 | item['uid'] = "" 607 | item['title'] = "" 608 | item['authors'] = "" 609 | item['organ'] = "" # 第一完成单位 610 | item['keywords'] = "" 611 | item['book_code'] = "" # 中图分类号 612 | item['subject_code'] = "" # 学科分类号 613 | item['summary'] = "" 614 | item['category'] = "" # 成果类别 615 | item['in_time'] = "" # 成果入库时间 616 | item['pass_time'] = "" # 研究起止时间 617 | item['level'] = "" # 成果水平 618 | item['evaluate'] = "" # 评价形式 619 | return item 620 | 621 | 622 | # 获得专家(作者)和其链接的字符串 623 | def getAuthorsWithLinkStr(self, response): 624 | authorSelector = response.xpath('//h3[@class="author"]/span') 625 | 626 | authors = "" #作者字符串 627 | spiltTag = "&&" # 作者间的分隔符 628 | for selector in authorSelector: 629 | if selector.xpath('./a'): 630 | authorfunc = selector.xpath('./a/@onclick').extract_first() 631 | a = authorfunc.strip().split("'") 632 | author = a[3] + "-" + a[5] 633 | else: 634 | author = selector.xpath('./text()').extract_first() + "-null" 635 | # 连接不同的作者,用&&是因为有的作者名中含有&,会导致分离作者名与代码时候产生错误 636 | authors = authors + spiltTag + author 637 | 638 | return authors[len(spiltTag):] 639 | 640 | #获得专家(作者)列表字符串(无链接) 641 | def getAuthorsList(self, response): 642 | authorSelector = response.xpath('//h3[@class="author"]/span') 643 | 644 | authors = [] #作者列表 645 | for selector in authorSelector: 646 | if selector.xpath('./a'): 647 | authorfunc = selector.xpath('./a/@onclick').extract_first() 648 | a = authorfunc.strip().split("'") 649 | author = a[3] 650 | else: 651 | author = selector.xpath('./text()').extract_first() 652 | authors.append(author) 653 | return authors 654 | 655 | # 得到形如 '杭电;浙大;清华大学' 形式的单位字符串 656 | # 这个是原先的方式,单位顺序会乱掉,已舍弃 657 | # 已舍弃 658 | def getOrganStrOld(self, response): 659 | organstr = "" 660 | organSelector = response.xpath('//div[@class="wx-tit"]/h3')[1] 661 | if organSelector.xpath('./a[@class="author"]'): 662 | organ_a = organSelector.xpath('./a[@class="author"]/text()').extract() 663 | for o in organ_a: 664 | organstr = organstr + o.strip() 665 | # organ_noa = organSelector.xpath('./span/text()').extract_first() 666 | organ_noa = organSelector.xpath('./text()').extract_first() 667 | if organ_noa: 668 | organstr = organstr + organ_noa 669 | nonum = (re.sub(r'(\d+)', ' ', organstr)).strip() 670 | organlist = nonum.strip('.').split('.') 671 | organs = "" 672 | spiltTag = ';' # 单位间的分隔符 673 | for organ in organlist: 674 | organs = organs + spiltTag + organ.strip() 675 | # 规避掉开头多余的分隔符 676 | return organs[len(spiltTag):] 677 | 678 | # 得到单位的列表[杭电,浙大,清华大学],正序版 679 | def getOrganList(self, response): 680 | organSelector = response.xpath('//div[@class="wx-tit"]/h3')[1] 681 | # 这里简化了原来的代码,针对有链接和无链接的单位,用一个或逻辑, 682 | # 有链接的提取a标签下的,没链接的直接就是text(),这样的话顺序是对的 683 | # 注意单位有的是嵌套在span下的,有的没有span 684 | xpathStr = './span/a[@class="author"]/text() | ./span/text() | ./a[@class="author"]/text() | ./text()' 685 | 
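        # The XPath union above covers both linked and plain-text affiliation nodes,
        # whether or not they are wrapped in a span, so the organs come back in
        # document order (unlike the discarded getOrganStrOld above).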
organs = organSelector.xpath(xpathStr).extract() 686 | # 存在一个text()中两个单位的情况,需要处理一下 687 | organStr = "".join(organs) 688 | # for organ in organs: 689 | # # 去除'数字.' 690 | # organList.append(re.sub(r'(\d+)\.', ' ', organ).strip()) 691 | organsWithoutNum = (re.sub(r'(\d+)', ' ', organStr)).strip() 692 | organList = organsWithoutNum.strip('.').split('.') 693 | return organList 694 | 695 | # 生成作者和单位相对应的json 696 | def getAuthorOrganJson(self, response): 697 | # authorSelector = response.xpath('//h3[@class="author"]/span') 698 | # organSelector = response.xpath('//div[@class="wx-tit"]/h3')[1] 699 | authorOrganDict = {} # 专家和单位的对应字典,后面转成json 700 | # 作者列表 701 | authorList = self.getAuthorsList(response) 702 | # 如果作者代码块中没有出现sup,即无上标,说明所有人在同一个单位 703 | if not response.xpath('//h3[@class="author"]/span//sup').extract_first(): 704 | # 得到形如 '杭电;浙大;清华大学' 形式的单位字符串 705 | # 但由于没有上标,单位应该只有一个,所以这里其实单位是单独的 706 | organStr = self.getOrganList(response) 707 | # 为每一个作者添加单位信息 708 | for author in authorList: 709 | authorOrganDict[author] = organStr 710 | else: 711 | # 得到所有的下标,按顺序和作者对应就能得到每个作者与某个单位的对应关系 712 | # 结果示例:['1', '1', '1', '1,2'] 713 | # 下面用span,不能用a,有的人是没有链接的 714 | subs = response.xpath('//h3[@class="author"]/span//sup/text()').extract() 715 | # 得到形如 '杭电;浙大;清华大学' 形式的单位字符串,再转换成列表 716 | organList = self.getOrganList(response) 717 | # 遍历专家列表,并取出同index的sub列表,利用sub中存储的下标在单位列表中找到对应 718 | for i in range(len(authorList)): 719 | # 每个作者可能属于多个单位 720 | organListOneAuthor = [] 721 | # logging.debug(subs, i) 722 | # logging.debug(organList) 723 | # logging.debug(authorList) 724 | for index in subs[i].split(','): 725 | # 这里注意index要减1,因为知网上标是从1开始,列表是从0开始 726 | try: 727 | organListOneAuthor.append(organList[int(index)-1]) 728 | except IndexError as e: 729 | # 这段异常调试代码已经找到了数组越界成因,主要是之前单位字段提取有点问题 730 | # logging.error('异常来啦!') 731 | # logging.error('标题:%s' % response.xpath('//h1/text()').extract_first()) 732 | # logging.error('上标列表', subs) 733 | # logging.error('目前上标', index) 734 | # logging.error('单位列表', organList) 735 | # logging.error('作者列表', authorList) 736 | # 重新抛出 737 | raise e 738 | authorOrganDict[authorList[i]] = organListOneAuthor 739 | return str(authorOrganDict) 740 | --------------------------------------------------------------------------------
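A minimal standalone sketch of the superscript-to-affiliation matching performed by PaperAchSpider.getAuthorOrganJson above. The author names, superscripts and affiliations are made-up sample values, not data taken from CNKI; the loop itself mirrors the original logic.

author_list = ['张三', '李四', '王五']            # as returned by getAuthorsList()
sups = ['1', '1', '1,2']                          # one sup string per author, in order
organ_list = ['杭州电子科技大学', '浙江大学']      # as returned by getOrganList()

author_organ = {}
for i, author in enumerate(author_list):
    organs_for_author = []
    for index in sups[i].split(','):
        # CNKI superscripts start at 1, Python list indices at 0
        organs_for_author.append(organ_list[int(index) - 1])
    author_organ[author] = organs_for_author

print(author_organ)
# {'张三': ['杭州电子科技大学'], '李四': ['杭州电子科技大学'], '王五': ['杭州电子科技大学', '浙江大学']}

For reference, a corrected standalone sketch of CookieUtil.configReqestsProxyMeta from CnkiSpider/commonUtils.py, which builds the proxies dict but never returns it. The helper name and the explicit user/secret parameters here are illustrative; the dict structure is taken from the original code.

def build_requests_proxies(proxy_dict, user, secret):
    # Same mapping as configReqestsProxyMeta, plus the missing return statement.
    proxy_meta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
        "host": proxy_dict['ip'],
        "port": proxy_dict['port'],
        "user": user,
        "pass": secret,
    }
    return {"http": proxy_meta, "https": proxy_meta}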