├── CNKI
│   ├── CNKI
│   │   ├── __init__.py
│   │   ├── cnki_patent_info.csv
│   │   ├── __pycache__
│   │   │   ├── items.cpython-36.pyc
│   │   │   ├── __init__.cpython-36.pyc
│   │   │   ├── pipelines.cpython-36.pyc
│   │   │   └── settings.cpython-36.pyc
│   │   ├── spiders
│   │   │   ├── __pycache__
│   │   │   │   ├── cnki.cpython-36.pyc
│   │   │   │   └── __init__.cpython-36.pyc
│   │   │   ├── __init__.py
│   │   │   ├── cnkispider.py
│   │   │   └── cnki.py
│   │   ├── main.py
│   │   ├── items.py
│   │   ├── pipelines.py
│   │   ├── middlewares.py
│   │   ├── settings.py
│   │   └── dem.html
│   ├── .idea
│   │   ├── encodings.xml
│   │   ├── misc.xml
│   │   ├── modules.xml
│   │   ├── CNKI.iml
│   │   └── workspace.xml
│   └── scrapy.cfg
├── ipPool
│   ├── ip_free_pool.csv
│   ├── ip_free_pool2.csv
│   ├── available.csv
│   ├── check_ip.py
│   ├── check_ip_use.py
│   └── ipSpider.py
├── .gitattributes
└── README.md
/CNKI/CNKI/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/ipPool/ip_free_pool.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wen-fei/CNKISpider/HEAD/ipPool/ip_free_pool.csv
--------------------------------------------------------------------------------
/ipPool/ip_free_pool2.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wen-fei/CNKISpider/HEAD/ipPool/ip_free_pool2.csv
--------------------------------------------------------------------------------
/CNKI/CNKI/cnki_patent_info.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wen-fei/CNKISpider/HEAD/CNKI/CNKI/cnki_patent_info.csv
--------------------------------------------------------------------------------
/CNKI/CNKI/__pycache__/items.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wen-fei/CNKISpider/HEAD/CNKI/CNKI/__pycache__/items.cpython-36.pyc
--------------------------------------------------------------------------------
/CNKI/CNKI/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wen-fei/CNKISpider/HEAD/CNKI/CNKI/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/CNKI/CNKI/__pycache__/pipelines.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wen-fei/CNKISpider/HEAD/CNKI/CNKI/__pycache__/pipelines.cpython-36.pyc
--------------------------------------------------------------------------------
/CNKI/CNKI/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wen-fei/CNKISpider/HEAD/CNKI/CNKI/__pycache__/settings.cpython-36.pyc
--------------------------------------------------------------------------------
/CNKI/CNKI/spiders/__pycache__/cnki.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wen-fei/CNKISpider/HEAD/CNKI/CNKI/spiders/__pycache__/cnki.cpython-36.pyc
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.js linguist-language=python
2 | *.css linguist-language=python
3 | *.html linguist-language=python
4 | *.ipynb linguist-language=python
5 |
--------------------------------------------------------------------------------
/CNKI/CNKI/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wen-fei/CNKISpider/HEAD/CNKI/CNKI/spiders/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/CNKI/CNKI/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/CNKI/CNKI/main.py:
--------------------------------------------------------------------------------
1 | # Created by Landuy at 2017/10/9
2 | from scrapy.cmdline import execute
3 | import os
4 | import sys
5 |
6 | sys.path.append(os.path.dirname(os.path.abspath(__file__)))
7 | execute(["scrapy", "crawl", "cnkisp"])
--------------------------------------------------------------------------------
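
main.py drives the crawl through scrapy.cmdline. An equivalent sketch using Scrapy's CrawlerProcess API, which keeps control inside Python instead of handing off to the CLI (the spider name "cnkisp" comes from CNKI/spiders/cnki.py):

```python
# Sketch: run the project's spider programmatically.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl("cnkisp")  # spider name defined in CNKI/spiders/cnki.py
process.start()          # blocks until the crawl finishes
```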
/CNKI/.idea/encodings.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/CNKI/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/CNKI/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/CNKI/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = CNKI.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = CNKI
12 |
--------------------------------------------------------------------------------
/ipPool/available.csv:
--------------------------------------------------------------------------------
1 | 61.135.217.7:80
2 | 113.128.26.251:24640
3 | 183.184.60.140:8118
4 | 120.78.15.63:80
5 | 119.29.190.104:8118
6 | 180.167.46.22:53281
7 | 218.18.232.29:8080
8 | 202.141.161.30:8118
9 | 116.7.243.58:53281
10 | 42.52.212.248:80
11 | 14.154.31.85:8118
12 | 180.106.111.63:8118
13 | 220.166.241.124:8118
14 | 110.73.10.244:8123
15 | 121.12.42.188:61234
16 | 125.125.235.22:8118
17 | 118.250.50.190:80
18 | 218.27.173.175:8118
19 | 101.249.223.232:80
20 | 42.243.78.204:8080
21 | 113.134.160.77:80
22 |
--------------------------------------------------------------------------------
/CNKI/.idea/CNKI.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # CNKISpider
2 | A CNKI patent crawler, for learning and exchange only; not for commercial use.
3 | 
4 | #### A new crawling entry point
5 | 
6 | Today a classmate suddenly told me he had already crawled more than 1,000,000 patents (we need the 2014 ones, 1,900,000+ in total). When I pressed for details, it turned out that the URLs of CNKI patent detail pages follow a fixed pattern.
7 | 
8 | For example:
9 | 
10 | http://dbpub.cnki.net/grid2008/dbpub/Detail.aspx?DBName=SCPD2014&FileName=CN203968251U&QueryID=28&CurRec=2
11 | 
12 | For any given patent's URL, we only need to vary FileName=CN203968251U. The value after the = sign is the patent publication number, also known as the patent document number, composed as "country code + classification number + serial number + kind-of-document code"; for example, CN1340998A denotes Chinese invention patent No. 340998 (per Baidu Baike).
13 | 
14 | If we want to crawl every patent from 2014, we can use search to find a publication number from January 1, 2014 (one of the earliest of the year) and one from December 31, 2014 (one of the latest), then walk through the numbers in between; that covers the vast majority of the patents we need.
15 | 
16 | Here, the CN prefix is fixed, and the trailing letter is the kind-of-document code; China uses only A, S, and U.
17 | 
18 | Therefore, we avoid crawling the URL list pages (which are heavily protected against crawlers) and the complicated CAPTCHA problem; we can simply construct the detail-page URLs in a loop and crawl them directly.
19 | 
20 | ### Tools used in this project
21 | 
22 | The framework is Scrapy 1.3, on Python 3.6.
--------------------------------------------------------------------------------
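
To make the README's enumeration concrete, here is a minimal sketch of the detail-page URL construction it describes (the numeric bounds and the U kind code are illustrative, not the project's real range):

```python
# Sketch: enumerate CNKI patent detail-page URLs by publication number.
BASE = ("http://dbpub.cnki.net/grid2008/dbpub/Detail.aspx"
        "?DBName=SCPD2014&FileName=CN{num}{kind}&QueryID=28&CurRec=2")

def detail_urls(start, stop, kind="U"):
    """Yield detail-page URLs for publication numbers in [start, stop)."""
    for num in range(start, stop):
        yield BASE.format(num=num, kind=kind)

for url in detail_urls(203968251, 203968254):
    print(url)
```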
/CNKI/CNKI/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 | from scrapy import Item, Field
8 |
9 |
10 | class CnkiItem(Item):
11 | application_no = Field()
12 | publication_no = Field()
13 | application_day = Field()
14 | publication_day = Field()
15 | publication_user = Field()
16 | publication_address = Field()
17 | patent_inventor = Field()
18 | patent_agent = Field()
19 | patent_agent_user = Field()
20 | patent_summary = Field()
21 | patent_main_item = Field()
22 | main_cls_no = Field()
23 | patent_cls_np = Field()
24 | patent_title = Field()
25 |
--------------------------------------------------------------------------------
/CNKI/CNKI/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | import csv
8 |
9 | class CnkiPipeline(object):
10 |     # Output column order; names must match the Field names in items.py.
11 |     colname = ['application_no', 'application_day', 'publication_no',
12 |                'publication_day', 'publication_user', 'publication_address',
13 |                'patent_inventor', 'patent_agent', 'patent_agent_user',
14 |                'patent_summary', 'patent_main_item']
15 | 
16 |     def open_spider(self, spider):
17 |         # Create the csv when the spider starts; newline='' avoids blank rows.
18 |         self.file = open('cnki_patent_info.csv', 'w', newline='', encoding='utf-8')
19 |         # Dict-based csv writer; ignore any item keys not listed in colname.
20 |         self.writer = csv.DictWriter(self.file, self.colname, extrasaction='ignore')
21 |         # Write the field names as the header row.
22 |         self.writer.writeheader()
23 | 
24 |     def close_spider(self, spider):
25 |         self.file.close()
26 | 
27 |     def process_item(self, item, spider):
28 |         self.writer.writerow(dict(item))
29 |         return item
--------------------------------------------------------------------------------
/ipPool/check_ip.py:
--------------------------------------------------------------------------------
1 | # Created by Landuy at 2017/9/29
2 | import urllib.request as urllib2
3 | import threading
4 |
5 | inFile = open('ip_free_pool.csv', 'r')
6 | outFile = open('available.csv', 'w')
7 | url = 'http://www.huangshanben.com/'
8 | lock = threading.Lock()
9 |
10 |
11 | def test():
12 |     # Each worker consumes one line of the csv written by ipSpider.py:
13 |     # ip,port,address,speed,contact_speed,protocol_type
14 |     lock.acquire()
15 |     line = inFile.readline().strip().split(',')
16 |     lock.release()
17 |     if len(line) < 6 or line[0] == 'ip':  # skip blank lines and the header row
18 |         return
19 |     proxy, port, protocol = line[0], line[1], line[5].strip()
20 |     cookie = "PHPSESSID=5f7mbqghvk1kt5n9illa0nr175; kmsign=56023b6880039; KMUID=ezsEg1YCOzxg97EwAwUXAg=="
21 |     try:
22 |         # Route requests of this protocol through the candidate proxy.
23 |         proxy_support = urllib2.ProxyHandler({protocol.lower(): '%s://%s:%s' % (protocol.lower(), proxy, port)})
24 |         opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
25 |         urllib2.install_opener(opener)
26 |         request = urllib2.Request(url)
27 |         request.add_header("cookie", cookie)
28 |         content = urllib2.urlopen(request, timeout=4).read()
29 |         # A full page body means the proxy worked; a short one is usually a CAPTCHA page.
30 |         if len(content) >= 1000:
31 |             lock.acquire()
32 |             print('add proxy', '%s:%s' % (proxy, port))
33 |             outFile.write('"%s:%s",\n' % (proxy, port))
34 |             lock.release()
35 |         else:
36 |             print('Hit a CAPTCHA page or the IP has been blocked')
37 |     except Exception as e:
38 |         print(e)
39 | 
40 | 
41 | all_thread = []
42 | for i in range(500):
43 |     t = threading.Thread(target=test)
44 |     all_thread.append(t)
45 |     t.start()
46 | 
47 | for t in all_thread:
48 |     t.join()
49 | 
50 | inFile.close()
51 | outFile.close()
--------------------------------------------------------------------------------
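
The script above spawns 500 threads up front and serializes file reads with a lock. A sketch of the same fan-out with a bounded pool, where each worker gets its csv line as an argument instead of sharing the file handle (check_line is a hypothetical per-line version of test(); here it only parses, and the real probe logic would go where the comment indicates):

```python
from concurrent.futures import ThreadPoolExecutor

def check_line(line):
    """Hypothetical per-line check: return 'ip:port' if usable, else None."""
    parts = line.split(',')
    if len(parts) < 6 or parts[0] == 'ip':  # skip the header and short rows
        return None
    # ... probe the proxy here, as test() does ...
    return '%s:%s' % (parts[0], parts[1])

with open('ip_free_pool.csv') as f:
    lines = [ln.strip() for ln in f if ln.strip()]

with ThreadPoolExecutor(max_workers=50) as pool:
    usable = [p for p in pool.map(check_line, lines) if p]
print('%d candidate proxies' % len(usable))
```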
/CNKI/CNKI/spiders/cnkispider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 |
4 | from CNKI.items import CnkiItem
5 | 
6 | 
7 | class CnkispiderSpider(scrapy.Spider):
8 | 
9 |     name = "cnkispider"
10 |     allowed_domains = ["dbpub.cnki.net"]
11 |     # Publication numbers are sequential, so the detail pages can be enumerated directly.
12 |     start_urls = ['http://dbpub.cnki.net/Grid2008/Dbpub/Detail.aspx?DBName=SCPD2010&FileName=CN' + str(j) + 'U&QueryID=4&CurRec=1' for j in range(203369100, 204050060)]
13 | 
14 |     def parse(self, response):
15 | 
16 |         title = response.css('td[width="832"]::text').extract_first()
17 |         cnki = response.css('#box')
18 |         # Flat list of the detail table's cells; the indices below follow the
19 |         # cell order of the page, mapped positionally onto the items.py fields.
20 |         fields = cnki.css('td[bgcolor="#FFFFFF"]::text').extract()
21 |         item = CnkiItem()
22 |         item['patent_title'] = title
23 |         item['application_no'] = fields[0]
24 |         item['application_day'] = fields[1]
25 |         item['publication_no'] = fields[2]
26 |         item['publication_day'] = fields[3]
27 |         item['publication_user'] = fields[4]
28 |         item['publication_address'] = fields[5]
29 |         item['patent_inventor'] = fields[7]
30 |         item['patent_agent'] = fields[11]
31 |         item['patent_agent_user'] = fields[12]
32 |         item['main_cls_no'] = fields[14]
33 |         item['patent_cls_np'] = fields[15]
34 |         item['patent_summary'] = fields[17]
35 |         item['patent_main_item'] = fields[18]
36 |         yield item
37 |         # for i in range(302697180, 303060980):
38 |         #     url = 'http://dbpub.cnki.net/Grid2008/Dbpub/Detail.aspx?DBName=SCPD2010&FileName=CN' + str(i) + 'S&QueryID=4&CurRec=1'
39 |         #     yield scrapy.Request(url=url, callback=self.parse)
--------------------------------------------------------------------------------
/ipPool/check_ip_use.py:
--------------------------------------------------------------------------------
1 | # Created by Landuy at 2017/9/29
2 | import urllib.request as urllib2
3 | import threading
4 |
5 | inFile = open('ip_free_pool.csv', 'r')
6 | outFile = open('available.csv', 'w')
7 | url = 'http://www.huangshanben.com/'
8 | lock = threading.Lock()
9 |
10 |
11 | def test():
12 |     lock.acquire()
13 |     line = inFile.readline().strip().split(',')
14 |     print(line)
15 |     lock.release()
16 |     if len(line) < 6 or line[0] == 'ip':  # skip blank lines and the header row
17 |         return
18 |     protocol, proxy, port = line[5].strip(), line[0], line[1]
19 |     print(protocol, proxy)
20 |     cookie = "PHPSESSID=5f7mbqghvk1kt5n9illa0nr175; kmsign=56023b6880039; KMUID=ezsEg1YCOzxg97EwAwUXAg=="
21 |     try:
22 |         # The proxy URL needs the port too, e.g. http://1.2.3.4:8080
23 |         proxy_support = urllib2.ProxyHandler({protocol.lower(): '%s://%s:%s' % (protocol.lower(), proxy, port)})
24 |         opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
25 |         urllib2.install_opener(opener)
26 |         request = urllib2.Request(url)
27 |         request.add_header("cookie", cookie)
28 |         content = urllib2.urlopen(request, timeout=4).read()
29 |         if len(content) >= 1000:
30 |             lock.acquire()
31 |             print('add proxy %s:%s' % (proxy, port))
32 |             outFile.write('"%s:%s",\n' % (proxy, port))
33 |             lock.release()
34 |         else:
35 |             print('Hit a CAPTCHA page or the IP has been blocked')
36 |     except Exception as e:
37 |         print(e.args)
38 | 
39 | 
40 | all_thread = []
41 | for i in range(3070):
42 |     t = threading.Thread(target=test)
43 |     all_thread.append(t)
44 |     t.start()
45 | 
46 | for t in all_thread:
47 |     t.join()
48 | 
49 | inFile.close()
50 | outFile.close()
--------------------------------------------------------------------------------
/ipPool/ipSpider.py:
--------------------------------------------------------------------------------
1 | # Created by Landuy at 2017/9/29
2 | import urllib.request as urllib2
3 | from bs4 import BeautifulSoup
4 | import csv
5 | 
6 | 
7 | def check_ip(ip_port):
8 |     """Check whether a proxy ip is usable, so that only working ips are kept."""
9 |     # TODO: when crawling with Scrapy, verify each proxy again before use,
10 |     # or add a fallback that discards proxies that stop working.
11 |     return True
12 | 
13 | 
14 | # In py3, open csv files in 'w' mode (not 'wb') and pass newline='',
15 | # otherwise every row is followed by a blank line.
16 | with open("ip_free_pool.csv", "w", newline="") as csvfile:
17 |     writer = csv.writer(csvfile)
18 |     # Write the column names first.
19 |     writer.writerow(["ip", "port", "address", "speed", "contact_speed", "protocol_type"])
20 |     for page in range(1, 500):
21 |         # Walk the listing pages; this range gives plenty of spare candidates,
22 |         # and the checks below keep only the high-quality proxies.
23 |         url = 'http://www.xicidaili.com/nn/%s' % page
24 |         user_agent = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"
25 |         request = urllib2.Request(url)
26 |         request.add_header("User-Agent", user_agent)
27 |         # Fetch the page content.
28 |         content = urllib2.urlopen(request)
29 |         # Parse it.
30 |         soup = BeautifulSoup(content, 'lxml')
31 |         # All rows of this page's ip table.
32 |         trs = soup.find('table', {"id": "ip_list"}).findAll('tr')
33 |         # trs[0] is the table header, so skip it.
34 |         for tr in trs[1:]:
35 |             tds = tr.findAll('td')
36 |             # ip address
37 |             ip = tds[1].text.strip()
38 |             # port
39 |             port = tds[2].text.strip()
40 |             # protocol type
41 |             protocol_type = tds[5].text.strip()
42 |             # server location, which may be missing
43 |             address = tds[3].find('a')
44 |             if address is None:
45 |                 address = "UNKNOWN"
46 |             else:
47 |                 address = address.text
48 |             # Speed: attrs exposes the tag's attributes; [:-1] drops the trailing
49 |             # non-numeric unit from the title string.
50 |             speed = tds[6].find('div').attrs['title'][:-1]
51 |             # Keep only reasonably fast proxies.
52 |             if float(speed) > 2:
53 |                 continue
54 |             # Connection speed.
55 |             contact_speed = tds[7].find('div').attrs['title'][:-1]
56 |             if float(contact_speed) > 2:
57 |                 continue
58 |             # Check whether the proxy is usable before saving it.
59 |             ip_port = ip + ":" + port
60 |             if check_ip(ip_port):
61 |                 # (writerows would write several rows at once)
62 |                 writer.writerow([ip, port, address, speed, contact_speed, protocol_type])
63 |                 print(ip, port, address, speed, contact_speed, protocol_type)
64 |             else:
65 |                 print("proxy not usable")
--------------------------------------------------------------------------------
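
The check_ip stub above always returns True, and the TODO asks for a real liveness probe before a proxy is saved. A minimal sketch of what that probe could look like, reusing the urllib approach of check_ip.py (the probe URL and the 4-second timeout are assumptions borrowed from that script, not the project's actual implementation):

```python
import urllib.request

def check_ip(ip_port, protocol="http", timeout=4):
    """Sketch: return True if a request routed through the proxy succeeds."""
    handler = urllib.request.ProxyHandler({protocol: "%s://%s" % (protocol, ip_port)})
    opener = urllib.request.build_opener(handler)
    try:
        # Any stable page works as a probe; check_ip.py uses its own target URL.
        opener.open("http://www.baidu.com", timeout=timeout).read(1024)
        return True
    except Exception:
        return False
```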
/CNKI/CNKI/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 | from fake_useragent import UserAgent
10 |
11 | class CnkiSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 |     def process_spider_input(self, response, spider):
24 |         # Called for each response that goes through the spider
25 |         # middleware and into the spider.
26 | 
27 |         # Should return None or raise an exception.
28 |         return None
29 | 
30 |     def process_spider_output(self, response, result, spider):
31 |         # Called with the results returned from the Spider, after
32 |         # it has processed the response.
33 | 
34 |         # Must return an iterable of Request, dict or Item objects.
35 |         for i in result:
36 |             yield i
37 | 
38 |     def process_spider_exception(self, response, exception, spider):
39 |         # Called when a spider or process_spider_input() method
40 |         # (from other spider middleware) raises an exception.
41 | 
42 |         # Should return either None or an iterable of Response, dict
43 |         # or Item objects.
44 |         pass
45 | 
46 |     def process_start_requests(self, start_requests, spider):
47 |         # Called with the start requests of the spider, and works
48 |         # similarly to the process_spider_output() method, except
49 |         # that it doesn't have a response associated.
50 | 
51 |         # Must return only requests (not items).
52 |         for r in start_requests:
53 |             yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
58 |
59 | class RandomUserAgentMiddleware(object):
60 |
61 | def __init__(self, crawler):
62 | super(RandomUserAgentMiddleware, self).__init__()
63 | self.ua = UserAgent()
64 | self.ua_type = crawler.settings.get("RANDOM_UA_TYPE", "random")
65 |
66 |
67 | def spider_opened(self, spider):
68 | spider.logger.info('Spider opened: %s' % spider.name)
69 |
70 |
71 | @classmethod
72 | def from_crawler(cls, crawler):
73 | return cls(crawler)
74 |
75 |
76 |     def process_request(self, request, spider):
77 |         # def get_ua():
78 |         #     return getattr(self.ua, self.ua_type)
79 | 
80 |         # random_agent = get_ua()
81 |         # request.headers.setdefault("User-Agent", get_ua())
82 |         # Set a proxy here if needed:
83 |         # request.meta['proxy'] = ""
84 |         request.headers.setdefault("Cookie", "Ecp_ClientId=2170914074501149831; RsPerPage=20; cnkiUserKey=42463fbc-f813-8023-21c7-d4cd29c7bff8; ASP.NET_SessionId=u3vqombtchoej45crc4daaue; SID_kns=123119; SID_kinfo=125104; SID_klogin=125144; SID_krsnew=125131; SID_kredis=125144; Ecp_IpLoginFail=171009112.81.2.110")
--------------------------------------------------------------------------------
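
process_request above leaves request.meta['proxy'] as a commented-out placeholder. A sketch of how the ipPool output could be wired in as a downloader middleware (the class name and settings entry are illustrative and not part of the repo; the line parsing matches ipPool/available.csv and the quoted "ip:port", format written by check_ip_use.py):

```python
import random

class RandomProxyMiddleware(object):
    """Sketch: route each request through a random proxy from ipPool/available.csv."""

    def __init__(self):
        with open("ipPool/available.csv") as f:
            # Accept both bare ip:port lines and quoted "ip:port", lines.
            self.proxies = [line.strip().strip(',').strip('"')
                            for line in f if line.strip()]

    def process_request(self, request, spider):
        if self.proxies:
            request.meta['proxy'] = "http://" + random.choice(self.proxies)

# Enable it in settings.py, for example:
# DOWNLOADER_MIDDLEWARES = {"CNKI.middlewares.RandomProxyMiddleware": 544}
```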
/CNKI/CNKI/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for CNKI project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'CNKI'
13 |
14 | SPIDER_MODULES = ['CNKI.spiders']
15 | NEWSPIDER_MODULE = 'CNKI.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'CNKI (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = False
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | DOWNLOAD_DELAY = 1
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'CNKI.middlewares.CnkiSpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
55 | # DOWNLOADER_MIDDLEWARES = {
56 | # "CNKI.middlewares.RandomUserAgentMiddleware" : 543,
57 | # 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
58 | # }
59 |
60 | # Enable or disable extensions
61 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
62 | #EXTENSIONS = {
63 | # 'scrapy.extensions.telnet.TelnetConsole': None,
64 | #}
65 |
66 | # Configure item pipelines
67 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
68 | ITEM_PIPELINES = {
69 | 'CNKI.pipelines.CnkiPipeline': 300,
70 | }
71 |
72 | # Enable and configure the AutoThrottle extension (disabled by default)
73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
74 | AUTOTHROTTLE_ENABLED = True
75 | # The initial download delay
76 | AUTOTHROTTLE_START_DELAY = 3
77 | # The maximum download delay to be set in case of high latencies
78 | AUTOTHROTTLE_MAX_DELAY = 10
79 | # The average number of requests Scrapy should be sending in parallel to
80 | # each remote server
81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
82 | # Enable showing throttling stats for every response received:
83 | #AUTOTHROTTLE_DEBUG = False
84 |
85 | # Enable and configure HTTP caching (disabled by default)
86 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
87 | #HTTPCACHE_ENABLED = True
88 | #HTTPCACHE_EXPIRATION_SECS = 0
89 | #HTTPCACHE_DIR = 'httpcache'
90 | #HTTPCACHE_IGNORE_HTTP_CODES = []
91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
92 | DEFAULT_REQUEST_HEADERS = {
93 | "Host" : "kns.cnki.net",
94 | "User-Agent" : "Mozilla/5.0 (Windows NT 10.0; …) Gecko/20100101 Firefox/56.0",
95 | "Accept" : "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
96 | "Referer" : "http://kns.cnki.net/kns/brief/…ageName=ASP.brief_result_aspx",
97 | "Cookie" : "Ecp_ClientId=1 171009095901465355; Ecp_IpLoginFail=171009112.81.2.110; RsPerPage=50; cnkiUserKey=dd6eca65-a22c-330b-d486-22684afbe7b2; ASP.NET_SessionId=5atsoskm5rxqkirzhct0vjdb; SID_kns=123122; SID_kinfo=125102; SID_klogin=125141; SID_kredis=125142; SID_krsnew=125132",
98 | "Connection" : "keep-alive",
99 | "Upgrade-Insecure-Requests" : "1"
100 | }
--------------------------------------------------------------------------------
/CNKI/CNKI/spiders/cnki.py:
--------------------------------------------------------------------------------
1 | # Created by Landuy at 2017/10/9
2 | import scrapy
3 | from scrapy import Request
4 | from urllib import parse
5 | from CNKI.items import CnkiItem
6 | import re
7 |
8 |
9 | class cnkiSpider(scrapy.Spider):
10 | name = 'cnkisp'
11 | # allowed_domains = ["www.cnki.net"]
12 | start_urls = ["http://kns.cnki.net/kns/brief/result.aspx?dbPrefix=SCPD"]
13 |
14 | cookies = {
15 | "ASP.NET_SessionId": "5atsoskm5rxqkirzhct0vjdb",
16 | "cnkiUserKey": "dd6eca65-a22c-330b-d486-22684afbe7b2",
17 | "Ecp_ClientId": "1171009095901465355",
18 | "Ecp_IpLoginFail": "171009112.81.2.110",
19 | "SID_kns": "123122",
20 | "SID_kinfo": "125102",
21 | "SID_klogin": "125141",
22 | "SID_kredis": "125142",
23 | "RsPerPage": "20"
24 | }
25 | meta = {'dont_redirect': True, 'handle_httpstatus_list': [302]}
26 |
27 | headers = {
28 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0",
29 | "Cookie" : "Ecp_ClientId=1171009095901465355; Ecp_IpLoginFail=171009112.81.2.110; RsPerPage=50; cnkiUserKey=dd6eca65-a22c-330b-d486-22684afbe7b2; ASP.NET_SessionId=5atsoskm5rxqkirzhct0vjdb; SID_kns=123122; SID_kinfo=125102; SID_klogin=125141; SID_kredis=125142; SID_krsnew=125132",
30 |
31 | }
32 |
33 | def start_requests(self):
34 | start_url = "http://kns.cnki.net/kns/brief/brief.aspx?" \
35 | "curpage=1&RecordsPerPage=50" \
36 | "&QueryID=5" \
37 | "&ID=&turnpage=1" \
38 | "&tpagemode=L" \
39 | "&dbPrefix=SCPD" \
40 | "&Fields=" \
41 | "&DisplayMode=listmode" \
42 | "&PageName=ASP.brief_result_aspx#J_ORDER&"
43 |
44 | yield Request(url=start_url, headers=self.headers, cookies=self.cookies)
45 |
46 |     def parse(self, response):
47 |         """
48 |         Extract the list of patent detail-page links from a result page.
49 |         :param response:
50 |         :return:
51 |         """
52 | 
53 |         urls_node = response.css("table.GridTableContent tr")
54 |         for node in urls_node[1:]:
55 |             patent_detail_url = "http://dbpub.cnki.net/grid2008/dbpub/detail.aspx?dbcode=SCPD&dbname=SCPD2017&filename="
56 |             patent_url = node.css("a.fz14::attr(href)").extract_first("")
57 |             # e.g. /kns/detail/detail.aspx?QueryID=5&CurRec=8&dbcode=scpd&dbname=SCPD2014&filename=CN103786360A
58 |             match_re = re.match(r".*filename=(\w+)", patent_url)
59 |             if match_re:
60 |                 patent_detail_url = patent_detail_url + match_re.group(1)
61 |             else:
62 |                 print("bad url:", patent_url)
63 |                 continue
64 |             print("patent detail url:", patent_detail_url)
65 |             yield Request(url=patent_detail_url, callback=self.parse_detail,
66 |                           headers=self.headers, cookies=self.cookies, meta=self.meta)
67 | 
68 |         # Extract the next-page link and hand it back to scrapy to download.
69 |         next_url = response.css("div.TitleLeftCell a::attr(href)").extract()[-1]
70 |         print("next url is:", parse.urljoin(response.url, next_url))
71 |         yield Request(url=parse.urljoin(response.url, next_url),
72 |                       callback=self.parse, headers=self.headers, cookies=self.cookies, meta=self.meta)
73 |
74 |
75 |     def parse_detail(self, response):
76 |         """
77 |         Extract the fields of one patent detail page.
78 |         :param response:
79 |         :return:
80 |         """
81 |         print("parsing detail page")
82 | node_list = response.css("table#box tr")
83 | node_1 = node_list[0].css("td::text").extract()
84 | application_no = node_1[1].replace(u'\xa0', u'')
85 | application_day = node_1[3].replace(u'\xa0', u'')
86 | node_2 = node_list[1].css("td::text").extract()
87 | publication_no = node_2[1].replace(u'\xa0', u'')
88 | publication_day = node_2[3].replace(u'\xa0', u'')
89 | node_3 = node_list[2].css("td::text").extract()
90 | publication_user = node_3[1].replace(u'\xa0', u'')
91 | publication_address = node_3[3].replace(u'\xa0', u'')
92 | node_4 = node_list[4].css("td::text").extract()
93 | patent_inventor = node_4[1].replace(u'\xa0', u'')
94 | node_5 = node_list[7].css("td::text").extract()
95 | patent_agent = node_5[1].replace(u'\xa0', u'')
96 | patent_agent_user = node_5[3].replace(u'\xa0', u'')
97 | node_6 = node_list[10].css("td::text").extract()
98 | patent_summary = node_6[1].replace(u'\xa0', u'')
99 | node_7 = node_list[11].css("td::text").extract()
100 | patent_main_item = node_7[1].replace(u'\xa0', u'')
101 | # main_cls_no =
102 | # patent_cls_np =
103 | # patent_title =
104 | item = CnkiItem()
105 | item['application_no'] = application_no
106 | item['application_day'] = application_day
107 | item['publication_no'] = publication_no
108 | item['publication_day'] = publication_day
109 | item['publication_user'] = publication_user
110 | item['publication_address'] = publication_address
111 | item['patent_inventor'] = patent_inventor
112 | item['patent_agent'] = patent_agent
113 | item['patent_agent_user'] = patent_agent_user
114 | item['patent_summary'] = patent_summary
115 | item['patent_main_item'] = patent_main_item
116 | yield item
--------------------------------------------------------------------------------
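
parse_detail repeats the same `.replace(u'\xa0', u'')` cleanup on every cell, since CNKI pads empty table cells with non-breaking spaces. A small refactoring sketch (not part of the original file) that factors this out:

```python
def cell(cells, index):
    """Return the table cell at `index` with CNKI's non-breaking-space
    (\xa0) padding stripped."""
    return cells[index].replace('\xa0', '')

# e.g. in parse_detail:
# node_1 = node_list[0].css('td::text').extract()
# application_no = cell(node_1, 1)
# application_day = cell(node_1, 3)
```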
/CNKI/.idea/workspace.xml:
--------------------------------------------------------------------------------
(IDE workspace file; nearly all of its XML was stripped in this dump. The surviving fragments
record a find-in-files entry for "GridTableContent", a task timestamp of 1507510269862, and
line breakpoints at lines 80, 99, and 103 of file://$PROJECT_DIR$/CNKI/spiders/cnki.py.)
--------------------------------------------------------------------------------
/CNKI/CNKI/dem.html:
--------------------------------------------------------------------------------
(Saved CNKI results page; its markup was stripped in this dump. The only surviving fragment is
the page title "概览页" ("overview page").)
--------------------------------------------------------------------------------