├── LICENSE ├── README.md └── 服务器端核心源码 ├── __init__.py ├── __pycache__ ├── __init__.cpython-37.pyc ├── middlewares.cpython-37.pyc ├── pipelines.cpython-37.pyc └── settings.cpython-37.pyc ├── items.py ├── main.py ├── middlewares.py ├── pipelines.py ├── settings.py ├── spiders ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-37.pyc │ └── cninfospider1.cpython-37.pyc └── cninfospider1.py └── timerStartDaily.py /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# CninfoDistributedSpider
A distributed crawler for listed-company announcements on cninfo.com.cn (巨潮资讯网), built on a distributed Scrapy + Kafka architecture. It crawls all announcements published by a specified list of listed companies within a specified date range and saves the announcement PDFs. A search-engine feature is planned for a later release.
--------------------------------------------------------------------------------
/服务器端核心源码/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flicck/CninfoDistributedSpider/46a1dd14f62d3c2cb8b8b1f1a02790deaded2d73/服务器端核心源码/__init__.py
--------------------------------------------------------------------------------
/服务器端核心源码/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flicck/CninfoDistributedSpider/46a1dd14f62d3c2cb8b8b1f1a02790deaded2d73/服务器端核心源码/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/服务器端核心源码/__pycache__/middlewares.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flicck/CninfoDistributedSpider/46a1dd14f62d3c2cb8b8b1f1a02790deaded2d73/服务器端核心源码/__pycache__/middlewares.cpython-37.pyc
--------------------------------------------------------------------------------
/服务器端核心源码/__pycache__/pipelines.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flicck/CninfoDistributedSpider/46a1dd14f62d3c2cb8b8b1f1a02790deaded2d73/服务器端核心源码/__pycache__/pipelines.cpython-37.pyc
--------------------------------------------------------------------------------
/服务器端核心源码/__pycache__/settings.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flicck/CninfoDistributedSpider/46a1dd14f62d3c2cb8b8b1f1a02790deaded2d73/服务器端核心源码/__pycache__/settings.cpython-37.pyc
--------------------------------------------------------------------------------
/服务器端核心源码/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class MyspiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
--------------------------------------------------------------------------------
/服务器端核心源码/main.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# Helper script for running the spider from an IDE / debugger
from scrapy.cmdline import execute

import sys
import os
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

print(os.path.dirname(os.path.abspath(__file__)))

# forwards the command-line arguments to Scrapy, e.g. `python main.py crawl cninfospider1`
execute()
--------------------------------------------------------------------------------
/服务器端核心源码/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import urllib

from scrapy.http import Response
from scrapy import signals
from scrapy.http import HtmlResponse
from selenium import webdriver
import re
import sys
import urllib.parse
# Not needed when the spider is started with `scrapy crawl` from the project root
# (sys.path entries must be directories, not .py files); kept for reference:
# sys.path.append("D:\pythonWorkBase\mySpider\myspider\myspider\spiders\cninfospider1.py")

from myspider.spiders.cninfospider1 import Cninfospider1Spider


class MyspiderSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class MyspiderDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Middleware purpose: drop page requests that fall outside the configured time range.
        t = re.search(r'pageNum=(.*)', request.url)
        if t is not None:
            page = t.group(0).split("=")[1]
            company = re.search(r'searchkey=(.*)&', request.url).group(0).split("&")[0].split("=")[1]
            company = urllib.parse.unquote(company)
            if int(page) > Cninfospider1Spider.time_limit_list[Cninfospider1Spider.company_list.index(company)]:
                # This page lies beyond the time limit recorded for the company, so skip the
                # download: returning an empty Response short-circuits the request, and
                # parse1() ignores responses with an empty body.
                return Response(url=request.url)
            else:
                return None
        else:
            return None

        # PhantomJS rendering (disabled)
        # if 'PhantomJS' in request.meta:
        #     driver = webdriver.PhantomJS()
        #     driver.get(request.url)
        #     content = driver.page_source.encode('utf-8')
        #     driver.quit()
        #     return HtmlResponse(request.url, encoding='utf-8', body=content, request=request)

        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
--------------------------------------------------------------------------------
/服务器端核心源码/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import redis
import json, hashlib
# Scrapy

# scrapy.conf was removed in Scrapy 1.x; read the project settings explicitly instead
from scrapy.utils.project import get_project_settings

# PyKafka
from pykafka import KafkaClient

settings = get_project_settings()


# every pipeline's process_item() must return the item so later pipelines can see it
class MyspiderPipeline(object):
    # section 1: writing items into Redis (kept for reference, disabled)
    # counters = {}
    # re = redis.Redis(host='127.0.0.1', port=6379, db=2)
    # def process_item(self, item, spider):
    #
    #     tipster = item["com"]
    #     if MyspiderPipeline.counters.__contains__(tipster):
    #         i = MyspiderPipeline.counters.get(tipster)
    #         i = i + 1
    #         t = json.dumps(item)
    #         MyspiderPipeline.re.set(item["com"]+"_"+item["time"]+"_"+str(i), t)
    #         tmpdict = {tipster: i}
    #         MyspiderPipeline.counters.update(tmpdict)
    #     else:
    #         t = json.dumps(item)
    #         MyspiderPipeline.re.set(item["com"]+"_"+item["time"]+"_"+str(1), t)
    #         MyspiderPipeline.counters[tipster] = 1
    #     return item


    # section 2: pushing items into Kafka
    def __init__(self):
        # Work out what form the configured broker address takes:
        # 1. a single-element list -> use that element
        # 2. a multi-element list  -> join with commas
        # 3. otherwise it is already a comma-separated string
        kafka_ip_port = settings['KAFKA_IP_PORT']
        if len(kafka_ip_port) == 1:
            kafka_ip_port = kafka_ip_port[0]
        else:
            if isinstance(kafka_ip_port, list):
                kafka_ip_port = ",".join(kafka_ip_port)
            else:
                kafka_ip_port = kafka_ip_port

        # initialise the Kafka client
        self._client = KafkaClient(hosts=kafka_ip_port)

        # initialise the producer; the topic name has to be passed as bytes
        self._producer = \
            self._client.topics[
                settings['KAFKA_TOPIC_NAME'].encode(encoding="UTF-8")
            ].get_producer()

    # create the topic automatically if it does not exist (kept for reference, disabled)
    # def create_topic(self, brokers, topic='topic', num_partitions=3, configs=None, timeout_ms=3000):
    #
    #     client = KafkaClient(hosts=brokers)
    #
    #     if topic not in client.cluster.topics(exclude_internal_topics=True):  # topic does not exist
    #
    #         request = admin.CreateTopicsRequest_v0(
    #             create_topic_requests=[(
    #                 topic,
    #                 num_partitions,
    #                 -1,    # replication unset.
    #                 [],    # Partition assignment.
    #                 [(key, value) for key, value in configs.items()],  # Configs
    #             )],
    #             timeout=timeout_ms
    #         )
    #
    #         future = client.send(2, request)  # node 2 is the controller; creation requests sent to other nodes fail
    #         client.poll(timeout_ms=timeout_ms, future=future, sleep=False)
    #
    #         result = future.value
    #         # error_code = result.topic_error_codes[0][1]
    #         print("CREATE TOPIC RESPONSE: ", result)  # 0 success, 41 NOT_CONTROLLER, 36 ALREADY_EXISTS
    #         client.close()
    #     else:  # topic already exists
    #         print("Topic already exists!")
    #         return

    def process_item(self, item, spider):
        if spider.name == "cninfospider1":
            t = json.dumps(item)
            self._producer.produce(t.encode(encoding="UTF-8"))

            # MD5 fingerprint of the serialised item; intended to be stored in MySQL
            # so duplicate announcements can be filtered out (not implemented yet)
            md5_data = self.md5(t)
            print(md5_data)

            print(item['title'])
            print(item['time'])
            print(item['page'])

        return item

    # MD5 fingerprint of a serialised item
    def md5(self, t):
        obj = hashlib.md5()
        obj.update(bytes(t, encoding="utf-8"))
        return obj.hexdigest()

    def close_spider(self, spider):
        if spider.name == "cninfospider1":
            self._producer.stop()
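The pipeline above only publishes announcement metadata to Kafka; fetching and saving the PDFs is left to a consumer on the client side, which is not included in this server-side source dump. A rough, untested sketch of what such a consumer could look like follows (the broker address and topic name are taken from settings.py, the item fields from cninfospider1.py; the consumer group, file naming and output directory are assumptions):

# consumer_sketch.py -- illustrative only, not part of this repository
import json
import os
import re

import requests
from pykafka import KafkaClient

BROKERS = "192.168.88.196:9092"   # same broker as KAFKA_IP_PORT in settings.py
TOPIC = b"wanghan15-post"         # same topic as KAFKA_TOPIC_NAME in settings.py
SAVE_DIR = "pdfs"                 # assumed output directory

client = KafkaClient(hosts=BROKERS)
consumer = client.topics[TOPIC].get_simple_consumer(
    consumer_group=b"cninfo-pdf-downloader",  # assumed group name
    auto_commit_enable=True,
)

os.makedirs(SAVE_DIR, exist_ok=True)

for message in consumer:
    if message is None:
        continue
    item = json.loads(message.value.decode("utf-8"))
    # keep the file name filesystem-safe
    safe_title = re.sub(r'[\\/:*?"<>|]', "_", item["title"])
    filename = item["com"] + "_" + item["time"] + "_" + safe_title + ".pdf"
    resp = requests.get(item["url"], timeout=30)
    if resp.status_code == 200:
        with open(os.path.join(SAVE_DIR, filename), "wb") as fh:
            fh.write(resp.content)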
--------------------------------------------------------------------------------
/服务器端核心源码/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for myspider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

# from selenium import webdriver  # only needed if the PhantomJS middleware is re-enabled

BOT_NAME = 'myspider'

SPIDER_MODULES = ['myspider.spiders']
NEWSPIDER_MODULE = 'myspider.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'WARN'

# Kafka configuration
KAFKA_IP_PORT = ["192.168.88.196:9092"]
KAFKA_TOPIC_NAME = "wanghan15-post"
# CONCURRENT_REQUESTS = 1

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# PhantomJS driver (disabled)
# driver = webdriver.PhantomJS(executable_path='D:/environment_ENGLISH_VERSION/python/Scripts/phantomjs')

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'myspider.middlewares.MyspiderSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'myspider.middlewares.MyspiderDownloaderMiddleware': 100,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# The number is the pipeline's priority: lower numbers sit closer to the engine, and when
# several pipelines are registered it decides the order in which items flow through them.
ITEM_PIPELINES = {
    'myspider.pipelines.MyspiderPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
--------------------------------------------------------------------------------
/服务器端核心源码/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/服务器端核心源码/spiders/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flicck/CninfoDistributedSpider/46a1dd14f62d3c2cb8b8b1f1a02790deaded2d73/服务器端核心源码/spiders/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/服务器端核心源码/spiders/__pycache__/cninfospider1.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flicck/CninfoDistributedSpider/46a1dd14f62d3c2cb8b8b1f1a02790deaded2d73/服务器端核心源码/spiders/__pycache__/cninfospider1.cpython-37.pyc
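The spider that follows pages through cninfo's fulltextSearch endpoint and reads a handful of fields from its JSON replies. For orientation, the shape it relies on looks roughly like the sketch below; the field names are taken from the parsing code in parse0()/parse1(), and the values are purely illustrative:

# Illustrative response shape only, inferred from what parse0()/parse1() read; not an official schema.
example_response = {
    "totalAnnouncement": 137,   # parse0() turns this into a page count (10 results per page)
    "announcements": [
        {
            # may be None; a leading "company name:" prefix is split off on ':' or '：'
            "announcementTitle": "某某股份：2019年第三季度报告",
            # the second path segment is the announcement date; the PDF is fetched from
            # "http://static.cninfo.com.cn/" + adjunctUrl
            "adjunctUrl": "finalpage/2019-10-15/1207000000.PDF",
        }
    ],
}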
--------------------------------------------------------------------------------
/服务器端核心源码/spiders/cninfospider1.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import json
import scrapy
import re
import logging
import threading
import time
import random
import datetime
import copy


class Company_List:
    # put the absolute path of the company list file here
    f = open("C:/Users/stawind/Desktop/list/WaitSearchCompanyList.txt", "r", encoding='UTF-8')

    def get_company_list(self):
        companyList = []
        while True:
            line = Company_List.f.readline().strip()
            if len(line) == 0:
                break
            companyList.append(line)
        return companyList


current_cursor = -1
mutex = threading.Lock()
mutex1 = threading.Lock()
mutex2 = threading.Lock()
mutex3 = threading.Lock()
# page_thread_sequence = 0


# Scrapy is an asynchronous framework: yield does not block, so requests and responses
# that need to be modified in flight are handled in the middlewares instead.
class Cninfospider1Spider(scrapy.Spider):
    name = 'cninfospider1'
    allowed_domains = ['cninfo.com.cn']
    reader = Company_List()
    # start_company = reader.get_next_line()
    start_urls = ['http://www.cninfo.com.cn/']
    company_list = reader.get_company_list()

    # page limit per query; same priority as timeLimitBound
    pageLimit = 100000
    # time window for the query; same priority as pageLimit, half-open interval [start, end)
    timeLimitBound = ["2019-09-01", "2019-10-21"]
    time_limit_list = [pageLimit] * company_list.__len__()
    pre = datetime.date(int(timeLimitBound[0].split("-")[0]), int(timeLimitBound[0].split("-")[1]), int(timeLimitBound[0].split("-")[2]))
    post = datetime.date(int(timeLimitBound[1].split("-")[0]), int(timeLimitBound[1].split("-")[1]), int(timeLimitBound[1].split("-")[2]))
    page_thread_sequence = 0
    page_thread_sequence1 = 0

    def parse(self, response):
        global current_cursor
        while current_cursor + 1 <= Cninfospider1Spider.company_list.__len__():
            # guard the shared cursor with a lock so the increment stays atomic
            mutex.acquire()
            current_cursor = current_cursor + 1
            # be polite to the site: pause briefly between companies
            time.sleep(random.randint(0, 9) / 5.0)
            if current_cursor + 1 > Cninfospider1Spider.company_list.__len__():
                mutex.release()
                break
            company = Cninfospider1Spider.company_list[current_cursor]
            Cninfospider1Spider.current_company = company
            # pass the current company through meta -- Scrapy runs callbacks asynchronously,
            # so a plain global variable would easily be overwritten before the callback fires
            request = scrapy.Request(
                url="http://www.cninfo.com.cn/new/fulltextSearch/full?searchkey=" + company + "&sdate=&edate=&isfulltext=false&sortName=nothing&sortType=desc&pageNum=1",
                callback=self.parse0, meta={"company": company}, dont_filter=True)
            yield request
            mutex.release()

    # pagination: one request per result page
    def parse0(self, response):
        list1 = []
        company = response.meta["company"]
        print(company)
        dic = json.loads(response.body.decode(response.encoding))
        num = int(dic.get("totalAnnouncement"))
        page = int(num / 10) + 1 if num % 10 != 0 else int(num / 10)

        # zero pages means the company was not found, so log a warning
        if page == 0:
            logger = logging.getLogger(__name__)
            logger.warning(company + " was not found on cninfo")

        # if the total number of pages exceeds the page limit, only request up to the limit
        if page >= Cninfospider1Spider.pageLimit:
            for i in range(Cninfospider1Spider.pageLimit, 0, -1):
                request = scrapy.Request(
                    url="http://www.cninfo.com.cn/new/fulltextSearch/full?searchkey=" + company + "&sdate=&edate=&isfulltext=false&sortName=nothing&sortType=desc&pageNum=" + str(i),
                    callback=self.parse1, meta={"company": company, "page": copy.deepcopy(i)}, dont_filter=True)
                list1.append(request)
        else:
            # otherwise request every page
            for k in range(page, 0, -1):
                request = scrapy.Request(
                    url="http://www.cninfo.com.cn/new/fulltextSearch/full?searchkey=" + company + "&sdate=&edate=&isfulltext=false&sortName=nothing&sortType=desc&pageNum=" + str(k),
                    callback=self.parse1, meta={"company": company, "page": copy.deepcopy(k)}, dont_filter=True)
                list1.append(request)
        return list1

    # turn each page of results into items
    def parse1(self, response):

        if response.body != b'':
            # the company and page number travel between callbacks via meta
            company = response.meta["company"]
            page = response.meta["page"]

            dic = json.loads(response.body.decode(response.encoding))

            for j in range(0, dic.get("announcements").__len__()):
                item = {}
                # some announcements genuinely have no title
                title = "未命名"  # "untitled"
                if dic.get("announcements")[j]["announcementTitle"] is not None:
                    titles = re.split('[::]', dic.get("announcements")[j]["announcementTitle"], 1)
                    if titles.__len__() == 2:
                        title = titles[1]
                    else:
                        title = titles[0]
                adjuntUrl = dic.get("announcements")[j]["adjunctUrl"]
                adjuntUrl_time = adjuntUrl.split("/")[1]
                # guard against announcements whose URL carries no usable date
                try:
                    current_time = datetime.date(int(adjuntUrl_time.split("-")[0]), int(adjuntUrl_time.split("-")[1]), int(adjuntUrl_time.split("-")[2]))
                except Exception as e:
                    print(e)
                    continue
                else:
                    if current_time >= Cninfospider1Spider.post:
                        # newer than the requested window: skip it and keep paging backwards
                        pass
                    else:
                        if current_time < Cninfospider1Spider.pre:
                            index = Cninfospider1Spider.company_list.index(company)
                            # lock so that only the first callback to pass this point records the
                            # page limit for the company; otherwise the count would be wrong
                            mutex1.acquire()
                            if Cninfospider1Spider.page_thread_sequence == 0:
                                Cninfospider1Spider.page_thread_sequence = 1
                            if Cninfospider1Spider.time_limit_list[index] == Cninfospider1Spider.pageLimit:
                                Cninfospider1Spider.time_limit_list[index] = page + 1
                            else:
                                pass
                            mutex1.release()
                        else:
                            item["com"] = company
                            item["time"] = adjuntUrl_time
                            # strip the <em> tags cninfo uses to highlight the matched search keyword
                            item["title"] = title.replace('<em>', '').replace('</em>', '')
                            item["url"] = "http://static.cninfo.com.cn/" + adjuntUrl
                            item["page"] = page
                            yield item
--------------------------------------------------------------------------------
/服务器端核心源码/timerStartDaily.py:
--------------------------------------------------------------------------------
from scrapy import cmdline
import datetime
import time
import shutil
import os
# Scheduled daily start-up for the crawl job

# directory that lets the crawl be resumed; it stores the data a resumed crawl needs (JOBDIR)
recoderDir = r"C:/Users/stawind/Desktop/spider/cninfospider1"
# marker file that signals whether the spider is currently running
checkFile = "C:/Users/stawind/Desktop/spider/isRunning.txt"

startTime = datetime.datetime.now()
print(f"startTime={startTime}")

i = 0
monitor = 0

while True:
    isRunning = os.path.isfile(checkFile)
    if not isRunning:
        # housekeeping before the spider starts: clear out the resume directory (JOBDIR)
        isExist = os.path.isdir(recoderDir)
        print(f"cninfospider not running, ready to start. isExist:{isExist}")
        if isExist:
            # delete the resume directory and everything under it
            shutil.rmtree(recoderDir)
            print(f"At time:{datetime.datetime.now()}, resume dir {recoderDir} deleted")
        else:
            print(f"At time:{datetime.datetime.now()}, Dir:{recoderDir} does not exist.")
        time.sleep(20)
        crawlerTime = datetime.datetime.now()
        waitTime = crawlerTime - startTime
        print(f"At time:{crawlerTime}, start crawler: mySpider !!!, waitTime:{waitTime}")
        cmdline.execute('scrapy crawl cninfospider1 -s JOBDIR=C:/Users/stawind/Desktop/spider/cninfospider1/storeMyRequest'.split())
        break  # leave the script once the crawl has finished
    else:
        print(f"At time:{datetime.datetime.now()}, mySpider is running, sleep to wait.")
        i += 1

    time.sleep(10)
    monitor += 10
    if monitor >= 1440:
        break

--------------------------------------------------------------------------------