├── LICENSE ├── README.md └── 服务器端核心源码 ├── __init__.py ├── __pycache__ ├── __init__.cpython-37.pyc ├── middlewares.cpython-37.pyc ├── pipelines.cpython-37.pyc └── settings.cpython-37.pyc ├── items.py ├── main.py ├── middlewares.py ├── pipelines.py ├── settings.py ├── spiders ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-37.pyc │ └── cninfospider1.cpython-37.pyc └── cninfospider1.py └── timerStartDaily.py /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# CninfoDistributedSpider
A distributed crawler for listed-company announcements on cninfo.com.cn (巨潮资讯网), built on a distributed Scrapy + Kafka architecture. It crawls all announcements published by a specified list of listed companies within a specified date range and saves the announcement PDFs. A search-engine feature is planned for a later release.
--------------------------------------------------------------------------------
/服务器端核心源码/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flicck/CninfoDistributedSpider/46a1dd14f62d3c2cb8b8b1f1a02790deaded2d73/服务器端核心源码/__init__.py
--------------------------------------------------------------------------------
/服务器端核心源码/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flicck/CninfoDistributedSpider/46a1dd14f62d3c2cb8b8b1f1a02790deaded2d73/服务器端核心源码/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/服务器端核心源码/__pycache__/middlewares.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flicck/CninfoDistributedSpider/46a1dd14f62d3c2cb8b8b1f1a02790deaded2d73/服务器端核心源码/__pycache__/middlewares.cpython-37.pyc
--------------------------------------------------------------------------------
/服务器端核心源码/__pycache__/pipelines.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flicck/CninfoDistributedSpider/46a1dd14f62d3c2cb8b8b1f1a02790deaded2d73/服务器端核心源码/__pycache__/pipelines.cpython-37.pyc
--------------------------------------------------------------------------------
/服务器端核心源码/__pycache__/settings.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flicck/CninfoDistributedSpider/46a1dd14f62d3c2cb8b8b1f1a02790deaded2d73/服务器端核心源码/__pycache__/settings.cpython-37.pyc
--------------------------------------------------------------------------------
/服务器端核心源码/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class MyspiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
--------------------------------------------------------------------------------
/服务器端核心源码/main.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# Helper script for running the spider from an IDE / debugger
from scrapy.cmdline import execute

import sys
import os
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

print(os.path.dirname(os.path.abspath(__file__)))

# forwards the command-line arguments to Scrapy, e.g. `python main.py crawl cninfospider1`
execute()
--------------------------------------------------------------------------------
/服务器端核心源码/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import urllib

from scrapy.http import Response
from scrapy import signals
from scrapy.http import HtmlResponse
from selenium import webdriver
import re
import sys
import urllib.parse
# Not needed when the spider is started with `scrapy crawl` from the project root
# (sys.path entries must be directories, not .py files); kept for reference:
# sys.path.append("D:\pythonWorkBase\mySpider\myspider\myspider\spiders\cninfospider1.py")

from myspider.spiders.cninfospider1 import Cninfospider1Spider


class MyspiderSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class MyspiderDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Middleware purpose: drop page requests that fall outside the configured time range.
        t = re.search(r'pageNum=(.*)', request.url)
        if t is not None:
            page = t.group(0).split("=")[1]
            company = re.search(r'searchkey=(.*)&', request.url).group(0).split("&")[0].split("=")[1]
            company = urllib.parse.unquote(company)
            if int(page) > Cninfospider1Spider.time_limit_list[Cninfospider1Spider.company_list.index(company)]:
                # This page lies beyond the time limit recorded for the company, so skip the
                # download: returning an empty Response short-circuits the request, and
                # parse1() ignores responses with an empty body.
                return Response(url=request.url)
            else:
                return None
        else:
            return None

        # PhantomJS rendering (disabled)
        # if 'PhantomJS' in request.meta:
        #     driver = webdriver.PhantomJS()
        #     driver.get(request.url)
        #     content = driver.page_source.encode('utf-8')
        #     driver.quit()
        #     return HtmlResponse(request.url, encoding='utf-8', body=content, request=request)

        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
--------------------------------------------------------------------------------
/服务器端核心源码/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import redis
import json, hashlib
# Scrapy

# scrapy.conf was removed in Scrapy 1.x; read the project settings explicitly instead
from scrapy.utils.project import get_project_settings

# PyKafka
from pykafka import KafkaClient

settings = get_project_settings()


# every pipeline's process_item() must return the item so later pipelines can see it
class MyspiderPipeline(object):
    # section 1: writing items into Redis (kept for reference, disabled)
    # counters = {}
    # re = redis.Redis(host='127.0.0.1', port=6379, db=2)
    # def process_item(self, item, spider):
    #
    #     tipster = item["com"]
    #     if MyspiderPipeline.counters.__contains__(tipster):
    #         i = MyspiderPipeline.counters.get(tipster)
    #         i = i + 1
    #         t = json.dumps(item)
    #         MyspiderPipeline.re.set(item["com"]+"_"+item["time"]+"_"+str(i), t)
    #         tmpdict = {tipster: i}
    #         MyspiderPipeline.counters.update(tmpdict)
    #     else:
    #         t = json.dumps(item)
    #         MyspiderPipeline.re.set(item["com"]+"_"+item["time"]+"_"+str(1), t)
    #         MyspiderPipeline.counters[tipster] = 1
    #     return item


    # section 2: pushing items into Kafka
    def __init__(self):
        # Work out what form the configured broker address takes:
        # 1. a single-element list -> use that element
        # 2. a multi-element list  -> join with commas
        # 3. otherwise it is already a comma-separated string
        kafka_ip_port = settings['KAFKA_IP_PORT']
        if len(kafka_ip_port) == 1:
            kafka_ip_port = kafka_ip_port[0]
        else:
            if isinstance(kafka_ip_port, list):
                kafka_ip_port = ",".join(kafka_ip_port)
            else:
                kafka_ip_port = kafka_ip_port

        # initialise the Kafka client
        self._client = KafkaClient(hosts=kafka_ip_port)

        # initialise the producer; the topic name has to be passed as bytes
        self._producer = \
            self._client.topics[
                settings['KAFKA_TOPIC_NAME'].encode(encoding="UTF-8")
            ].get_producer()

    # create the topic automatically if it does not exist (kept for reference, disabled)
    # def create_topic(self, brokers, topic='topic', num_partitions=3, configs=None, timeout_ms=3000):
    #
    #     client = KafkaClient(hosts=brokers)
    #
    #     if topic not in client.cluster.topics(exclude_internal_topics=True):  # topic does not exist
    #
    #         request = admin.CreateTopicsRequest_v0(
    #             create_topic_requests=[(
    #                 topic,
    #                 num_partitions,
    #                 -1,    # replication unset.
    #                 [],    # Partition assignment.
    #                 [(key, value) for key, value in configs.items()],  # Configs
    #             )],
    #             timeout=timeout_ms
    #         )
    #
    #         future = client.send(2, request)  # node 2 is the controller; creation requests sent to other nodes fail
    #         client.poll(timeout_ms=timeout_ms, future=future, sleep=False)
    #
    #         result = future.value
    #         # error_code = result.topic_error_codes[0][1]
    #         print("CREATE TOPIC RESPONSE: ", result)  # 0 success, 41 NOT_CONTROLLER, 36 ALREADY_EXISTS
    #         client.close()
    #     else:  # topic already exists
    #         print("Topic already exists!")
    #         return

    def process_item(self, item, spider):
        if spider.name == "cninfospider1":
            t = json.dumps(item)
            self._producer.produce(t.encode(encoding="UTF-8"))

            # MD5 fingerprint of the serialised item; intended to be stored in MySQL
            # so duplicate announcements can be filtered out (not implemented yet)
            md5_data = self.md5(t)
            print(md5_data)

            print(item['title'])
            print(item['time'])
            print(item['page'])

        return item

    # MD5 fingerprint of a serialised item
    def md5(self, t):
        obj = hashlib.md5()
        obj.update(bytes(t, encoding="utf-8"))
        return obj.hexdigest()

    def close_spider(self, spider):
        if spider.name == "cninfospider1":
            self._producer.stop()
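The pipeline above only publishes announcement metadata to Kafka; fetching and saving the PDFs is left to a consumer on the client side, which is not included in this server-side source dump. A rough, untested sketch of what such a consumer could look like follows (the broker address and topic name are taken from settings.py, the item fields from cninfospider1.py; the consumer group, file naming and output directory are assumptions):

# consumer_sketch.py -- illustrative only, not part of this repository
import json
import os
import re

import requests
from pykafka import KafkaClient

BROKERS = "192.168.88.196:9092"   # same broker as KAFKA_IP_PORT in settings.py
TOPIC = b"wanghan15-post"         # same topic as KAFKA_TOPIC_NAME in settings.py
SAVE_DIR = "pdfs"                 # assumed output directory

client = KafkaClient(hosts=BROKERS)
consumer = client.topics[TOPIC].get_simple_consumer(
    consumer_group=b"cninfo-pdf-downloader",  # assumed group name
    auto_commit_enable=True,
)

os.makedirs(SAVE_DIR, exist_ok=True)

for message in consumer:
    if message is None:
        continue
    item = json.loads(message.value.decode("utf-8"))
    # keep the file name filesystem-safe
    safe_title = re.sub(r'[\\/:*?"<>|]', "_", item["title"])
    filename = item["com"] + "_" + item["time"] + "_" + safe_title + ".pdf"
    resp = requests.get(item["url"], timeout=30)
    if resp.status_code == 200:
        with open(os.path.join(SAVE_DIR, filename), "wb") as fh:
            fh.write(resp.content)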
--------------------------------------------------------------------------------
/服务器端核心源码/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for myspider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

# from selenium import webdriver  # only needed if the PhantomJS middleware is re-enabled

BOT_NAME = 'myspider'

SPIDER_MODULES = ['myspider.spiders']
NEWSPIDER_MODULE = 'myspider.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'WARN'

# Kafka configuration
KAFKA_IP_PORT = ["192.168.88.196:9092"]
KAFKA_TOPIC_NAME = "wanghan15-post"
# CONCURRENT_REQUESTS = 1

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# PhantomJS driver (disabled)
# driver = webdriver.PhantomJS(executable_path='D:/environment_ENGLISH_VERSION/python/Scripts/phantomjs')

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'myspider.middlewares.MyspiderSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'myspider.middlewares.MyspiderDownloaderMiddleware': 100,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# The number is the pipeline's priority: lower numbers sit closer to the engine, and when
# several pipelines are registered it decides the order in which items flow through them.
ITEM_PIPELINES = {
    'myspider.pipelines.MyspiderPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
--------------------------------------------------------------------------------
/服务器端核心源码/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/服务器端核心源码/spiders/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flicck/CninfoDistributedSpider/46a1dd14f62d3c2cb8b8b1f1a02790deaded2d73/服务器端核心源码/spiders/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/服务器端核心源码/spiders/__pycache__/cninfospider1.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flicck/CninfoDistributedSpider/46a1dd14f62d3c2cb8b8b1f1a02790deaded2d73/服务器端核心源码/spiders/__pycache__/cninfospider1.cpython-37.pyc
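The spider that follows pages through cninfo's fulltextSearch endpoint and reads a handful of fields from its JSON replies. For orientation, the shape it relies on looks roughly like the sketch below; the field names are taken from the parsing code in parse0()/parse1(), and the values are purely illustrative:

# Illustrative response shape only, inferred from what parse0()/parse1() read; not an official schema.
example_response = {
    "totalAnnouncement": 137,   # parse0() turns this into a page count (10 results per page)
    "announcements": [
        {
            # may be None; a leading "company name:" prefix is split off on ':' or '：'
            "announcementTitle": "某某股份：2019年第三季度报告",
            # the second path segment is the announcement date; the PDF is fetched from
            # "http://static.cninfo.com.cn/" + adjunctUrl
            "adjunctUrl": "finalpage/2019-10-15/1207000000.PDF",
        }
    ],
}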
--------------------------------------------------------------------------------
/服务器端核心源码/spiders/cninfospider1.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import json
import scrapy
import re
import logging
import threading
import time
import random
import datetime
import copy


class Company_List:
    # put the absolute path of the company list file here
    f = open("C:/Users/stawind/Desktop/list/WaitSearchCompanyList.txt", "r", encoding='UTF-8')

    def get_company_list(self):
        companyList = []
        while True:
            line = Company_List.f.readline().strip()
            if len(line) == 0:
                break
            companyList.append(line)
        return companyList


current_cursor = -1
mutex = threading.Lock()
mutex1 = threading.Lock()
mutex2 = threading.Lock()
mutex3 = threading.Lock()
# page_thread_sequence = 0


# Scrapy is an asynchronous framework: yield does not block, so requests and responses
# that need to be modified in flight are handled in the middlewares instead.
class Cninfospider1Spider(scrapy.Spider):
    name = 'cninfospider1'
    allowed_domains = ['cninfo.com.cn']
    reader = Company_List()
    # start_company = reader.get_next_line()
    start_urls = ['http://www.cninfo.com.cn/']
    company_list = reader.get_company_list()

    # page limit per query; same priority as timeLimitBound
    pageLimit = 100000
    # time window for the query; same priority as pageLimit, half-open interval [start, end)
    timeLimitBound = ["2019-09-01", "2019-10-21"]
    time_limit_list = [pageLimit] * company_list.__len__()
    pre = datetime.date(int(timeLimitBound[0].split("-")[0]), int(timeLimitBound[0].split("-")[1]), int(timeLimitBound[0].split("-")[2]))
    post = datetime.date(int(timeLimitBound[1].split("-")[0]), int(timeLimitBound[1].split("-")[1]), int(timeLimitBound[1].split("-")[2]))
    page_thread_sequence = 0
    page_thread_sequence1 = 0

    def parse(self, response):
        global current_cursor
        while current_cursor + 1 <= Cninfospider1Spider.company_list.__len__():
            # guard the shared cursor with a lock so the increment stays atomic
            mutex.acquire()
            current_cursor = current_cursor + 1
            # be polite to the site: pause briefly between companies
            time.sleep(random.randint(0, 9) / 5.0)
            if current_cursor + 1 > Cninfospider1Spider.company_list.__len__():
                mutex.release()
                break
            company = Cninfospider1Spider.company_list[current_cursor]
            Cninfospider1Spider.current_company = company
            # pass the current company through meta -- Scrapy runs callbacks asynchronously,
            # so a plain global variable would easily be overwritten before the callback fires
            request = scrapy.Request(
                url="http://www.cninfo.com.cn/new/fulltextSearch/full?searchkey=" + company + "&sdate=&edate=&isfulltext=false&sortName=nothing&sortType=desc&pageNum=1",
                callback=self.parse0, meta={"company": company}, dont_filter=True)
            yield request
            mutex.release()

    # pagination: one request per result page
    def parse0(self, response):
        list1 = []
        company = response.meta["company"]
        print(company)
        dic = json.loads(response.body.decode(response.encoding))
        num = int(dic.get("totalAnnouncement"))
        page = int(num / 10) + 1 if num % 10 != 0 else int(num / 10)

        # zero pages means the company was not found, so log a warning
        if page == 0:
            logger = logging.getLogger(__name__)
            logger.warning(company + " was not found on cninfo")

        # if the total number of pages exceeds the page limit, only request up to the limit
        if page >= Cninfospider1Spider.pageLimit:
            for i in range(Cninfospider1Spider.pageLimit, 0, -1):
                request = scrapy.Request(
                    url="http://www.cninfo.com.cn/new/fulltextSearch/full?searchkey=" + company + "&sdate=&edate=&isfulltext=false&sortName=nothing&sortType=desc&pageNum=" + str(i),
                    callback=self.parse1, meta={"company": company, "page": copy.deepcopy(i)}, dont_filter=True)
                list1.append(request)
        else:
            # otherwise request every page
            for k in range(page, 0, -1):
                request = scrapy.Request(
                    url="http://www.cninfo.com.cn/new/fulltextSearch/full?searchkey=" + company + "&sdate=&edate=&isfulltext=false&sortName=nothing&sortType=desc&pageNum=" + str(k),
                    callback=self.parse1, meta={"company": company, "page": copy.deepcopy(k)}, dont_filter=True)
                list1.append(request)
        return list1

    # turn each page of results into items
    def parse1(self, response):

        if response.body != b'':
            # the company and page number travel between callbacks via meta
            company = response.meta["company"]
            page = response.meta["page"]

            dic = json.loads(response.body.decode(response.encoding))

            for j in range(0, dic.get("announcements").__len__()):
                item = {}
                # some announcements genuinely have no title
                title = "未命名"  # "untitled"
                if dic.get("announcements")[j]["announcementTitle"] is not None:
                    titles = re.split('[::]', dic.get("announcements")[j]["announcementTitle"], 1)
                    if titles.__len__() == 2:
                        title = titles[1]
                    else:
                        title = titles[0]
                adjuntUrl = dic.get("announcements")[j]["adjunctUrl"]
                adjuntUrl_time = adjuntUrl.split("/")[1]
                # guard against announcements whose URL carries no usable date
                try:
                    current_time = datetime.date(int(adjuntUrl_time.split("-")[0]), int(adjuntUrl_time.split("-")[1]), int(adjuntUrl_time.split("-")[2]))
                except Exception as e:
                    print(e)
                    continue
                else:
                    if current_time >= Cninfospider1Spider.post:
                        # newer than the requested window: skip it and keep paging backwards
                        pass
                    else:
                        if current_time < Cninfospider1Spider.pre:
                            index = Cninfospider1Spider.company_list.index(company)
                            # lock so that only the first callback to pass this point records the
                            # page limit for the company; otherwise the count would be wrong
                            mutex1.acquire()
                            if Cninfospider1Spider.page_thread_sequence == 0:
                                Cninfospider1Spider.page_thread_sequence = 1
                            if Cninfospider1Spider.time_limit_list[index] == Cninfospider1Spider.pageLimit:
                                Cninfospider1Spider.time_limit_list[index] = page + 1
                            else:
                                pass
                            mutex1.release()
                        else:
                            item["com"] = company
                            item["time"] = adjuntUrl_time
                            # strip the <em> tags cninfo uses to highlight the matched search keyword
                            item["title"] = title.replace('<em>', '').replace('</em>', '')
                            item["url"] = "http://static.cninfo.com.cn/" + adjuntUrl
                            item["page"] = page
                            yield item
--------------------------------------------------------------------------------
/服务器端核心源码/timerStartDaily.py:
--------------------------------------------------------------------------------
from scrapy import cmdline
import datetime
import time
import shutil
import os
# Scheduled daily start-up for the crawl job

# directory that lets the crawl be resumed; it stores the data a resumed crawl needs (JOBDIR)
recoderDir = r"C:/Users/stawind/Desktop/spider/cninfospider1"
# marker file that signals whether the spider is currently running
checkFile = "C:/Users/stawind/Desktop/spider/isRunning.txt"

startTime = datetime.datetime.now()
print(f"startTime={startTime}")

i = 0
monitor = 0

while True:
    isRunning = os.path.isfile(checkFile)
    if not isRunning:
        # housekeeping before the spider starts: clear out the resume directory (JOBDIR)
        isExist = os.path.isdir(recoderDir)
        print(f"cninfospider not running, ready to start. isExist:{isExist}")
        if isExist:
            # delete the resume directory and everything under it
            shutil.rmtree(recoderDir)
            print(f"At time:{datetime.datetime.now()}, resume dir {recoderDir} deleted")
        else:
            print(f"At time:{datetime.datetime.now()}, Dir:{recoderDir} does not exist.")
        time.sleep(20)
        crawlerTime = datetime.datetime.now()
        waitTime = crawlerTime - startTime
        print(f"At time:{crawlerTime}, start crawler: mySpider !!!, waitTime:{waitTime}")
        cmdline.execute('scrapy crawl cninfospider1 -s JOBDIR=C:/Users/stawind/Desktop/spider/cninfospider1/storeMyRequest'.split())
        break  # leave the script once the crawl has finished
    else:
        print(f"At time:{datetime.datetime.now()}, mySpider is running, sleep to wait.")
        i += 1

    time.sleep(10)
    monitor += 10
    if monitor >= 1440:
        break

--------------------------------------------------------------------------------