├── .github └── FUNDING.yml ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── demo ├── demo.py └── zhihu.py ├── requirements.txt ├── setup.py └── simpyder ├── __init__.py ├── __version__.py ├── config.py ├── scheduler.py ├── spiders ├── __init__.py ├── asyn_spider.py └── spiders.py └── utils.py /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: jannchie # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | custom: ['https://azz.net/jannchie'] # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 13 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | *.pyc 131 | /dist 132 | /build 133 | /demo/simpyder 134 | /demo/bilispider.py 135 | 136 | .vscode/settings.json 137 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | os: linux 2 | language: python 3 | python: 4 | - "3.7" 5 | script: python setup.py sdist build 6 | deploy: 7 | edge: true 8 | provider: pypi 9 | username: "__token__" 10 | password: ${PYPI_TOKEN} 11 | on: 12 | branch: master 13 | tags: true 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Jannchie 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Simpyder - Simple Python Spider
2 | 
3 | Simpyder - a lightweight **coroutine-based** Python crawler
4 | 
5 | ## Features
6 | 
7 | - Lightweight: easy to install, few dependencies, simple to use.
8 | - Coroutine-based: single-threaded, with concurrency achieved through coroutines.
9 | - Customizable: simple configuration that adapts to a wide range of crawling tasks.
10 | 
11 | ## Quick start
12 | 
13 | ### Installation
14 | 
15 | ```bash
16 | # install with pip3
17 | pip3 install simpyder --user
18 | ```
19 | 
20 | ```bash
21 | # upgrade the package
22 | pip3 install simpyder --upgrade
23 | ```
24 | 
25 | ### Writing a spider
26 | 
27 | You only need to define three functions, one for each of the three modules:
28 | 
29 | #### URL generation
30 | 
31 | Define an [asynchronous generator](https://docs.python.org/zh-cn/3/c-api/gen.html) that produces the URLs to crawl.
32 | 
33 | ``` python
34 | async def gen_url():
35 |     for each_id in range(100):
36 |         yield "https://www.biliob.com/api/video/{}".format(each_id)
37 | ```
38 | 
39 | #### Response parsing
40 | 
41 | Define a function that parses each response. Its first argument is the Response object, i.e. the result of requesting a URL produced by the generator above.
42 | 
43 | The function should return an object as the result of the parse.
44 | 
45 | Note that, unlike an ordinary function, this is a coroutine: declare it with the `async` keyword to mark it as asynchronous.
46 | 
47 | ``` python
48 | async def parse(response):
49 |     return response.xpath('//meta[@name="title"]/@content')[0]
50 | ```
51 | 
52 | #### Data export
53 | 
54 | The objects produced by the parse function are all exported through this function. The example below simply prints each result to the console.
55 | 
56 | Saving involves I/O and can therefore be slow, so this function also needs to be asynchronous; prefix it with the `async` keyword as well.
57 | 
58 | ``` python
59 | async def save(item):
60 |     print(item)
61 | ```
62 | 
63 | ### Assemble the modules into a spider
64 | 
65 | First import the spider class:
66 | 
67 | ``` python
68 | from simpyder.spiders import AsynSpider
69 | ```
70 | 
71 | Then assemble the spider like this:
72 | 
73 | ``` python
74 | spider = AsynSpider()
75 | spider.gen_url = gen_url
76 | spider.parse = parse
77 | spider.save = save
78 | ```
79 | 
80 | ### Start the crawl
81 | 
82 | ``` python
83 | spider.run()
84 | ```
85 | 
86 | ### The constructor also accepts some configuration
87 | 
88 | ``` python
89 | 
90 | spider = AsynSpider(name="TEST")
91 | ```
92 | 
93 | ## Example program
94 | 
95 | ``` python
96 | from simpyder.spiders import AsynSpider
97 | 
98 | # create an asynchronous spider
99 | s = AsynSpider()
100 | 
101 | # define the URL generator; this spider fetches the Baidu homepage 800 times
102 | async def g():
103 |     count = 0
104 |     while count < 800:
105 |         count += 1
106 |         yield "https://www.baidu.com"
107 | 
108 | # bind the URL generator
109 | s.gen_url = g
110 | 
111 | # define the asynchronous parse function; it does nothing and returns a fixed string
112 | async def p(res):
113 |     return "parsed item"
114 | 
115 | # bind the parser
116 | s.parse = p
117 | 
118 | # define the asynchronous save function; it does nothing but returns 2, meaning two objects were produced
119 | async def save_item(item):
120 |     return 2
121 | 
122 | # bind the saver (do not reuse the name `s`, which already refers to the spider)
123 | s.save = save_item
124 | 
125 | # run
126 | s.run()
127 | 
128 | ```
129 | 
130 | ## Throughput
131 | 
132 | Running the code above in a single process, with a concurrency of 64 and nothing but counting in the pipeline, gives the following download rate:
133 | 
134 | ``` log
135 | [2020-09-02 23:42:48,097][CRITICAL] @ Simpyder: user_agent: Simpyder ver.0.1.9
136 | [2020-09-02 23:42:48,169][CRITICAL] @ Simpyder: concurrency: 64
137 | [2020-09-02 23:42:48,244][CRITICAL] @ Simpyder: interval: 0
138 | [2020-09-02 23:42:48,313][INFO] @ Simpyder: 已经爬取0个链接(0/min),共产生0个对象(0/min)
139 | [2020-09-02 23:42:48,319][INFO] @ Simpyder: Start Crawler: 0
140 | [2020-09-02 23:42:53,325][INFO] @ Simpyder: 已经爬取361个链接(4332/min),共产生658个对象(7896/min)
141 | [2020-09-02 23:42:58,304][INFO] @ Simpyder: 已经爬取792个链接(5280/min),共产生1540个对象(10266/min)
142 | [2020-09-02 23:43:03,304][INFO] @ Simpyder: 已经爬取1024个链接(4388/min),共产生2048个对象(8777/min)
143 | [2020-09-02 23:43:05,007][CRITICAL] @ Simpyder: Simpyder任务执行完毕
144 | [2020-09-02 23:43:05,008][CRITICAL] @ Simpyder: 累计消耗时间:0:00:16.695013
145 | [2020-09-02 23:43:05,008][CRITICAL] @ Simpyder: 累计爬取链接:1024
146 | [2020-09-02 23:43:05,009][CRITICAL] @ Simpyder: 累计生成对象:2048
147 | ```
148 | 
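## Example: exporting to CSV

The data-export step described above can do real I/O instead of printing. Below is a minimal sketch that appends each parsed item to a CSV file; the file name `results.csv` and the use of the standard `csv` module are illustrative assumptions, not part of Simpyder itself.

``` python
import csv

# Hypothetical sink: append every parsed item to results.csv.
csv_file = open("results.csv", "a", newline="", encoding="utf-8")
writer = csv.writer(csv_file)


async def save(item):
    writer.writerow([item])
    csv_file.flush()
    # Returning an int tells the spider how many objects this call produced;
    # any other return value is counted as one object.
    return 1


spider.save = save
```
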
149 | ---
150 | 
151 | - This project is maintained by [@Jannchie](https://github.com/Jannchie)
152 | - You can reach the author by email at [jannchie@gmail.com](mailto:jannchie@gmail.com)
--------------------------------------------------------------------------------
/demo/demo.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | '''
3 | This is a demo. It crawls the page titles of Bilibili videos with AV numbers below 100.
4 | '''
5 | 
6 | import requests
7 | from simpyder import Spider
8 | from simpyder import SimpyderConfig
9 | 
10 | 
11 | def gen_url():
12 |     for each_id in range(100):
13 |         yield f"https://www.bilibili.com/video/av{each_id}"
14 | 
15 | 
16 | def parse(response):
17 |     return response.xpath('//meta[@name="title"]/@content')[0]
18 | 
19 | 
20 | def save(item):
21 |     print(item)
22 | 
23 | 
24 | if __name__ == "__main__":
25 |     s1 = Spider("BILIBILI TITLE SPIDER", gen_url, parse, save)
26 |     sc = SimpyderConfig()
27 |     sc.COOKIE = "example:value;"
28 |     sc.USER_AGENT = "my user agent"
29 |     s1.assemble(gen_url=gen_url, parse=parse, save=save, config=sc)
30 |     s1.run()
31 | 
--------------------------------------------------------------------------------
/demo/zhihu.py:
--------------------------------------------------------------------------------
1 | import csv
2 | from csv import DictReader
3 | from time import sleep
4 | from simpyder import Spider
5 | from simpyder import FAKE_UA
6 | from simpyder import SimpyderConfig
7 | import simpyder
8 | import re
9 | import datetime
10 | 
11 | from pymongo import MongoClient
12 | import os
13 | env_dist = os.environ
14 | # store results in MongoDB
15 | client = MongoClient(env_dist.get("BILIOB_MONGO_URL"))
16 | db = client.zhihu
17 | 
18 | # Crawling this site requires a valid cookie; fill in your own.
19 | cookie = ''
20 | 
21 | 
22 | def get_url():
23 |     while True:
24 |         yield 'https://www.zhihu.com/hot'
25 |         sleep(60)
26 | 
27 | 
28 | def parse(response):
29 |     hot_list = response.xpath('//div[@class="HotItem-content"]')
30 |     data = []
31 |     for hot_item in hot_list:
32 |         try:
33 |             title = hot_item.xpath('.//h2[@class="HotItem-title"]/text()')[0]
34 |             print(title)
35 |             point = re.findall(r"\d+\.?\d*", hot_item.xpath(
36 |                 './/div[contains(@class,"HotItem-metrics")]/text()')[0])[0]
37 | 
38 |             print(point)
39 |             data.append({
40 |                 'date': datetime.datetime.now().strftime("%Y-%m-%d %H:%M"),
41 |                 'title': title,
42 |                 'value': int(point)
43 |             })
44 |         except Exception as e:
45 |             print('[ERROR] {}'.format(e))
46 |     return data
47 | 
48 | 
49 | f = csv.DictWriter(open('./zhihu.csv', 'w', encoding='utf-8-sig'),
50 |                    fieldnames=['date', 'title', 'value'])
51 | 
52 | 
53 | def save(items):
54 |     for item in items:
55 |         f.writerow(item)
56 | 
57 | 
58 | s = Spider()
59 | s.assemble(get_url, parse, save)
60 | 
61 | sc = SimpyderConfig()
62 | sc.PARSE_THREAD_NUMER = 1
63 | sc.COOKIE = cookie
64 | sc.USER_AGENT = FAKE_UA
65 | s.set_config(sc)
66 | s.run()
67 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | aiohttp>=3.6.2
2 | requests>=2.20.0
3 | lxml>=4.4.2
4 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from setuptools import setup, find_packages
3 | from os.path import dirname, join
4 | from simpyder.__version__ import __VERSION__
5 | print(find_packages())
6 | with open('./README.md', 'r', encoding="utf-8") as fh:
7 |     long_description = fh.read()
8 | setup(
9 |     name='simpyder',
10 |     version=__VERSION__,
11 |     description=(
12 |         'A lightweight coroutine-based Python web 
crawler' 13 | ), 14 | include_package_data=True, 15 | long_description=long_description, 16 | long_description_content_type="text/markdown", 17 | author='Jannchie', 18 | author_email='jannchie@gmail.com', 19 | maintainer='Jannchie', 20 | maintainer_email='jannchie@gmail.com', 21 | license='MIT License', 22 | packages=find_packages(exclude=('demo', 'demo.*')), 23 | platforms=["all"], 24 | url='https://github.com/Jannchie/simpyder', 25 | classifiers=[ 26 | "Programming Language :: Python :: 3", 27 | "License :: OSI Approved :: MIT License", 28 | "Operating System :: OS Independent", 29 | ], 30 | python_requires='>=3.6', 31 | install_requires=[ 32 | "aiohttp>=3.6.2", 33 | "requests>=2.20.0", 34 | "lxml>=4.4.2" 35 | ] 36 | ) 37 | -------------------------------------------------------------------------------- /simpyder/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import FAKE_UA 2 | from .spiders import Spider 3 | from .config import SimpyderConfig 4 | -------------------------------------------------------------------------------- /simpyder/__version__.py: -------------------------------------------------------------------------------- 1 | __VERSION__ = "0.1.12" 2 | -------------------------------------------------------------------------------- /simpyder/config.py: -------------------------------------------------------------------------------- 1 | from simpyder.utils import DEFAULT_UA 2 | 3 | 4 | class SimpyderConfig(): 5 | COOKIE = "" 6 | DOWNLOAD_INTERVAL = 0 7 | HEADERS = None 8 | LOG_LEVEL = "INFO" 9 | PARSE_THREAD_NUMER = 8 10 | USER_AGENT = DEFAULT_UA 11 | pass 12 | -------------------------------------------------------------------------------- /simpyder/scheduler.py: -------------------------------------------------------------------------------- 1 | from .config import SimpyderConfig 2 | 3 | 4 | class Scheduler(): 5 | def __init__(self, spiders=[]): 6 | super().__init__() 7 | self.spiders = spiders 8 | 9 | def run_spiders(self): 10 | for each_spider in self.spiders: 11 | each_spider.run() 12 | -------------------------------------------------------------------------------- /simpyder/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | from .spiders import Spider 2 | from .asyn_spider import AsynSpider 3 | -------------------------------------------------------------------------------- /simpyder/spiders/asyn_spider.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import asyncio 3 | from time import sleep 4 | import threading 5 | from asyncio.queues import Queue 6 | import logging 7 | import requests 8 | from requests.adapters import HTTPAdapter 9 | from lxml.etree import HTML 10 | import datetime 11 | import aiohttp 12 | import socket 13 | from simpyder.config import SimpyderConfig 14 | 15 | from asyncio import TimeoutError 16 | 17 | from simpyder.utils import _get_logger 18 | from simpyder.__version__ import __VERSION__ 19 | 20 | 21 | class AsynSpider(): 22 | async def gen_proxy(self): 23 | while True: 24 | yield "" 25 | 26 | async def __update_proxy(self): 27 | if len(self.succeed_proxies) != 0: 28 | self.proxy = next(iter(self.succeed_proxies)) 29 | else: 30 | try: 31 | self.proxy = await self.proxy_gener.__anext__() 32 | except Exception as e: 33 | self.logger.warning("没有可用代理!") 34 | self.proxy = '' 35 | 36 | async def get(self, url, proxy='', retry=5): 37 | response = None 38 | # 重试次数 39 | for i in 
range(retry): 40 | try: 41 | response = await self.session.get( 42 | url, headers=self.headers, proxy='' if proxy == None else proxy, timeout=5) 43 | if 'content-type' in response.headers and 'html' in response.content_type: 44 | response.xpath = HTML(await response.text()).xpath 45 | if response.content_type == 'application/json': 46 | response.json_data = await response.json() 47 | if response.status != 200 or self.except_content_type != None and response.content_type != self.except_content_type: 48 | if proxy != None: 49 | await self.__update_proxy() 50 | proxy = self.proxy 51 | continue 52 | break 53 | except (Exception, BaseException, TimeoutError) as e: 54 | if proxy != None: 55 | await self.__update_proxy() 56 | proxy = self.proxy 57 | continue 58 | break 59 | if response != None and response.status == 200: 60 | self.succeed_proxies.add(proxy) 61 | else: 62 | self.succeed_proxies.discard(self.proxy) 63 | if proxy != None: 64 | await self.__update_proxy() 65 | return response 66 | 67 | async def gen_url(self): 68 | self.except_queue.put('未实现方法: gen_url(),无法开启爬虫任务。') 69 | yield None 70 | 71 | async def parse(self, response): 72 | self.logger.critical('未实现方法: parse(response),将直接返回Response对象') 73 | return response 74 | 75 | async def save(self, item): 76 | self.logger.critical('未实现方法: save(item),将直接打印爬取内容。') 77 | print(item) 78 | return item 79 | 80 | def __init__(self, name="Simpyder", user_agent=f"Simpyder ver.{__VERSION__}", interval=0, concurrency=8, log_level='INFO'): 81 | self.count = 0 82 | self.finished = False 83 | self.log_interval = 5 84 | self.name = name 85 | self.succeed_proxies = set() 86 | self.retry = 5 87 | self.user_agent = user_agent 88 | self.concurrency = concurrency 89 | self.interval = interval 90 | self.log_level = log_level 91 | self.proxy = '' 92 | self._url_count = 0 93 | self._item_count = 0 94 | self._statistic = [] 95 | self.except_content_type = None 96 | self.headers = { 97 | 'user-agent': self.user_agent 98 | } 99 | # self.session = requests.session() 100 | # self.session.mount('http://', HTTPAdapter(max_retries=3)) 101 | # self.session.mount('https://', HTTPAdapter(max_retries=3)) 102 | self.session = aiohttp.ClientSession() 103 | 104 | def run(self): 105 | self.logger = _get_logger(f"{self.name}", self.log_level) 106 | print("""\033[0;32m 107 | _____ _ Author: Jannchie __ 108 | / ___/(_)___ ___ ____ __ ______/ /__ _____ 109 | \__ \/ / __ `__ \/ __ \/ / / / __ / _ \/ ___/ 110 | ___/ / / / / / / / /_/ / /_/ / /_/ / __/ / 111 | /____/_/_/ /_/ /_/ .___/\__, /\__,_/\___/_/ 112 | /_/ /____/ version: {}\033[0m """ .format(__VERSION__)) 113 | self.logger.critical(f"user_agent: {self.user_agent}") 114 | self.logger.critical(f"concurrency: {self.concurrency}") 115 | self.logger.critical(f"interval: {self.interval}") 116 | self.proxy_gener = self.gen_proxy() 117 | self.loop = asyncio.get_event_loop() 118 | self.loop.run_until_complete(self._run()) 119 | self.loop.close() 120 | 121 | async def _print_log(self): 122 | self._statistic.append({ 123 | 'url_count': self._url_count, 124 | 'item_count': self._item_count, 125 | 'time': datetime.datetime.now() 126 | }) 127 | if (len(self._statistic) > 10): 128 | self._statistic = self._statistic[1:10] 129 | delta_url_count = self._statistic[-1]['url_count'] - \ 130 | self._statistic[0]['url_count'] 131 | delta_item_count = self._statistic[-1]['item_count'] - \ 132 | self._statistic[0]['item_count'] 133 | delta_seconds = (self._statistic[-1]['time'] - 134 | self._statistic[0]['time']).seconds 135 | url_rate = 0 if 
delta_seconds == 0 else delta_url_count / \ 136 | (delta_seconds / 60) 137 | item_rate = 0 if delta_seconds == 0 else delta_item_count / \ 138 | (delta_seconds / 60) 139 | 140 | loading = (f"[限速基线:{int(url_rate / (60 / self.interval) * 100)}%]" 141 | if self.interval != 0 else "") 142 | 143 | self.logger.info( 144 | f"已经爬取{self._url_count}个链接({int(url_rate)}/min),共产生{self._item_count}个对象({int(item_rate)}/min) {loading}" 145 | ) 146 | 147 | async def _auto_print_log(self): 148 | self._last_url_count = 0 149 | self._last_item_count = 0 150 | while self.finished == False: 151 | await self._print_log() 152 | await asyncio.sleep(self.log_interval) 153 | 154 | async def crawl_one_url(self, url, proxy): 155 | try: 156 | self.logger.debug(f"> Crawl a Url: {url}") 157 | if type(url) == str and url[:4] == 'http': 158 | self.logger.debug(f"下载数据:{url}") 159 | res = await self.get(url) 160 | if res is None: 161 | self.logger.warning(f"下载数据失败 {url} {proxy}") 162 | else: 163 | self.logger.debug("非URL直接返回") 164 | res = url 165 | self._url_count += 1 166 | item = await self.parse(res) 167 | count = await self.save(item) 168 | self._item_count += count if type(count) == int else 1 169 | self.logger.debug(f"√ Crawl a Url: {url}") 170 | except Exception as e: 171 | self.logger.exception(e) 172 | 173 | async def __crawl(self, crawl_sem, lock): 174 | async with crawl_sem: 175 | try: 176 | if not self.url_task_queue.empty(): 177 | await lock.acquire() 178 | self.count += 1 179 | try: 180 | lock.release() 181 | url = await self.url_task_queue.get() 182 | await self.crawl_one_url(url, self.proxy) 183 | url = self.url_task_queue.task_done() 184 | finally: 185 | await lock.acquire() 186 | self.count -= 1 187 | lock.release() 188 | else: 189 | await asyncio.sleep(10) 190 | except Exception as e: 191 | self.logger.exception(e) 192 | 193 | async def _run_crawler(self, i): 194 | try: 195 | crawl_sem = asyncio.Semaphore(self.concurrency) 196 | lock = asyncio.Lock() 197 | self.logger.info(f"Start Crawler: {i}") 198 | while self.finished == False or not self.url_task_queue.empty(): 199 | await asyncio.sleep(0) 200 | async with crawl_sem: 201 | asyncio.ensure_future(self.__crawl(crawl_sem, lock)) 202 | except Exception as e: 203 | self.logger.exception(e) 204 | 205 | async def _add_url_to_queue(self): 206 | url_gener = self.gen_url() 207 | async for url in url_gener: 208 | self.logger.debug(f"Crawl Url: {url}") 209 | await self.url_task_queue.put(url) 210 | await asyncio.sleep(self.interval) 211 | 212 | async def _run(self): 213 | self.logger.debug("Spider Task Start") 214 | 215 | self.proxy = await self.proxy_gener.__anext__() 216 | 217 | self.url_task_queue = Queue(30) 218 | 219 | start_time = datetime.datetime.now() 220 | tasks = [] 221 | 222 | print_log = asyncio.ensure_future(self._auto_print_log()) 223 | 224 | self.logger.debug("Create Crawl Tasks") 225 | 226 | crawl_task = asyncio.ensure_future(self._run_crawler(0)) 227 | 228 | await self._add_url_to_queue() 229 | await asyncio.sleep(5) 230 | while not self.url_task_queue.empty() or self.count != 0: 231 | await asyncio.sleep(5) 232 | self.finished = True 233 | await crawl_task 234 | self.logger.critical("Simpyder任务执行完毕") 235 | end_time = datetime.datetime.now() 236 | delta_time = end_time - start_time 237 | self.logger.critical('累计消耗时间:% s' % str(delta_time)) 238 | self.logger.critical('累计爬取链接:% s' % str(self._url_count)) 239 | self.logger.critical('累计生成对象:% s' % str(self._item_count)) 240 | 241 | await print_log 242 | await self.session.close() 243 | 244 | 245 | if 
__name__ == "__main__": 246 | s = AsynSpider() 247 | s.concurrency = 64 248 | s.interval = 0 249 | 250 | async def g(): 251 | for _ in range(1024): 252 | # await asyncio.sleep(0.1) 253 | yield "https://www.baidu.com" 254 | s.gen_url = g 255 | 256 | async def parse(res): 257 | await asyncio.sleep(0.1) 258 | return "parsed item" 259 | s.parse = parse 260 | 261 | async def save(item): 262 | await asyncio.sleep(0.1) 263 | return 2 264 | s.save = save 265 | s.run() 266 | -------------------------------------------------------------------------------- /simpyder/spiders/spiders.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from time import sleep 3 | import threading 4 | import queue 5 | import logging 6 | import requests 7 | from requests.adapters import HTTPAdapter 8 | from lxml.etree import HTML 9 | import datetime 10 | 11 | import socket 12 | from simpyder.config import SimpyderConfig 13 | 14 | from simpyder.utils import _get_logger 15 | from simpyder.__version__ import __VERSION__ 16 | 17 | 18 | class Spider(): 19 | 20 | def gen_url(self): 21 | self.except_queue.put('未实现方法: gen_url(),无法开启爬虫任务。') 22 | yield None 23 | 24 | def get_response(self, url): 25 | return self.get(url) 26 | 27 | def parse(self, response): 28 | self.logger.critical('未实现方法: parse(response),将直接返回Response对象') 29 | return response 30 | 31 | def save(self, item): 32 | self.logger.critical('未实现方法: save(item),将直接打印爬取内容。') 33 | print(item) 34 | return item 35 | 36 | def __run_save(self): 37 | logger = _get_logger(f"{self.name} - 子线程 - SAVE", self.config.LOG_LEVEL) 38 | count = 0 39 | while True: 40 | if not self.item_queue.empty(): 41 | count = 0 42 | self._saving = True 43 | try: 44 | item = self.item_queue.get() 45 | if item is None or item == False: 46 | continue 47 | logger.debug(item) 48 | item = self.save(item) 49 | self.meta['item_count'] += 1 50 | if self._finish == True: 51 | return 52 | except Exception as e: 53 | self.logger.exception(e) 54 | else: 55 | self._saving = False 56 | sleep(1) 57 | 58 | def assemble(self, gen_url=None, parse=None, save=None, config: SimpyderConfig = SimpyderConfig()): 59 | if gen_url != None: 60 | self.gen_url = gen_url 61 | if parse != None: 62 | self.parse = parse 63 | if save != None: 64 | self.save = save 65 | self.set_config(config) 66 | 67 | def set_config(self, config: SimpyderConfig): 68 | self.config = config 69 | 70 | def __apply_config(self): 71 | if self.config.HEADERS is None: 72 | self.headers = {'cookie': self.config.COOKIE, 73 | 'User-Agent': self.config.USER_AGENT} 74 | else: 75 | self.headers = self.config.HEADERS 76 | self.PARSE_THREAD_NUMER = self.config.PARSE_THREAD_NUMER 77 | if (len(self.config.USER_AGENT) < 30): 78 | self.logger.critical(f"使用User-Agent:{self.config.USER_AGENT}") 79 | else: 80 | self.logger.critical(f"使用User-Agent:{self.config.USER_AGENT[:30]}...") 81 | self.logger.critical(f"使用COOKIE:{self.config.COOKIE}") 82 | self.logger.critical(f"线程数:{self.config.PARSE_THREAD_NUMER}") 83 | 84 | def get(self, url): 85 | response = self.session.get(url, headers=self.headers) 86 | if 'html' in response.headers['content-type']: 87 | response.xpath = HTML(response.text).xpath 88 | return response 89 | 90 | def __init__(self, name="Simpyder", gen_url=None, parse=None, save=None, config=SimpyderConfig()): 91 | # 配置Session,复用TCP连接 92 | self.session = requests.session() 93 | self.session.mount('http://', HTTPAdapter(max_retries=3)) 94 | self.session.mount('https://', HTTPAdapter(max_retries=3)) 95 | 96 | # 载入配置 
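    # (the SimpyderConfig object carries COOKIE, HEADERS, USER_AGENT, PARSE_THREAD_NUMER, DOWNLOAD_INTERVAL and LOG_LEVEL; see simpyder/config.py)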
97 | self.config = config 98 | 99 | # 载入主线程日志记录 100 | self.logger = _get_logger(f"{name} - 主线程", self.config.LOG_LEVEL) 101 | 102 | # 构造函数组装 103 | self.assemble(gen_url, parse, save) 104 | 105 | self.QUEUE_LEN = self.config.PARSE_THREAD_NUMER * 2 106 | self.url_queue = queue.Queue(self.QUEUE_LEN) 107 | self.item_queue = queue.Queue(self.QUEUE_LEN) 108 | self.except_queue = queue.Queue(1) 109 | self.queueLock = threading.Lock() 110 | self.threads = [] 111 | self.name = name 112 | self._saving = True 113 | 114 | def __get_info(self): 115 | log = _get_logger(f"{self.name} - 子线程 - INFO", self.config.LOG_LEVEL) 116 | history = [] 117 | interval = 5 118 | while True: 119 | c_time = datetime.datetime.now() 120 | history.append( 121 | (c_time, self.meta['link_count'], self.meta['item_count'])) 122 | if len(history) > 60: 123 | history = history[-60:] 124 | if (c_time - self.meta['start_time']).total_seconds() % interval < 1 and len(history) > 1: 125 | delta_link = (history[-interval + 1][1] - history[0][1]) * 60 / \ 126 | ((history[-interval + 1][0] - history[0][0]).total_seconds() + 1) 127 | delta_item = (history[-interval + 1][2] - history[0][2]) * 60 / \ 128 | ((history[-interval + 1][0] - history[0][0]).total_seconds() + 1) 129 | if (self.config.DOWNLOAD_INTERVAL == 0): 130 | load = 100 131 | else: 132 | load = int((history[-1][1] - history[0][1]) * 60 / 133 | (history[-1][0] - history[0][0]).total_seconds() / 134 | (60 / (self.config.DOWNLOAD_INTERVAL / self.config.PARSE_THREAD_NUMER)) * 100) 135 | result = { 136 | 'computer_name': socket.gethostname(), 137 | 'spider_name': self.start_time, 138 | 'start_time': self.start_time, 139 | 'update_time': datetime.datetime.now(), 140 | 'load': load, 141 | 'delta_link': delta_link, 142 | 'delta_item': delta_item 143 | }, 144 | log.info( 145 | f"正在爬取第 {self.meta['link_count']} 个链接({int(delta_link)}/min, 负载{load}%),共产生 {self.meta['item_count']} 个对象({int(delta_item)}/min)" 146 | ) 147 | sleep(1) 148 | 149 | def run(self): 150 | self.start_time = datetime.datetime.now() 151 | self._finish = False 152 | print(""" 153 | _____ _ Author: Jannchie __ 154 | / ___/(_)___ ___ ____ __ ______/ /__ _____ 155 | \__ \/ / __ `__ \/ __ \/ / / / __ / _ \/ ___/ 156 | ___/ / / / / / / / /_/ / /_/ / /_/ / __/ / 157 | /____/_/_/ /_/ /_/ .___/\__, /\__,_/\___/_/ 158 | /_/ /____/ version: {} 159 | 160 | 161 | """ .format(__VERSION__)) 162 | self.__apply_config() 163 | 164 | self.logger.critical(f"Simpyder ver.{__VERSION__}") 165 | self.logger.critical("启动爬虫任务") 166 | meta = { 167 | 'link_count': 0, 168 | 'item_count': 0, 169 | 'thread_number': self.config.PARSE_THREAD_NUMER, 170 | 'download_interval': self.config.DOWNLOAD_INTERVAL, 171 | 'start_time': self.start_time, 172 | } 173 | self.meta = meta 174 | info_thread = threading.Thread(target=self.__get_info, name="状态打印线程") 175 | info_thread.setDaemon(True) 176 | info_thread.start() 177 | save_thread = threading.Thread(target=self.__run_save, name="保存项目线程") 178 | save_thread.setDaemon(True) 179 | save_thread.start() 180 | for i in range(self.PARSE_THREAD_NUMER): 181 | self.threads.append( 182 | self.ParseThread( 183 | f'{self.name} - 子线程 - No.{i}', 184 | self.url_queue, 185 | self.queueLock, 186 | self.get_response, 187 | self.parse, 188 | self.save, 189 | self.except_queue, 190 | self.item_queue, 191 | meta, 192 | self.config, 193 | )) 194 | for each_thread in self.threads: 195 | each_thread.setDaemon(True) 196 | each_thread.start() 197 | url_gener = self.gen_url() 198 | for each_url in url_gener: 199 | 
self.queueLock.acquire() 200 | while (self.url_queue.full()): 201 | if self.queueLock.locked(): 202 | self.logger.debug(f"队列满: {each_url}") 203 | self.queueLock.release() 204 | sleep(0.1) 205 | self.logger.debug(f"加入待爬: {each_url}") 206 | if self.queueLock.locked(): 207 | self.queueLock.release() 208 | 209 | self.queueLock.acquire() 210 | self.url_queue.put(each_url) 211 | self.queueLock.release() 212 | 213 | self.logger.info("全部请求完毕,等待解析进程") 214 | while self.url_queue.empty() == False or self.item_queue.empty() == False or self._saving == True: 215 | if self.except_queue.empty() == False: 216 | except_info = self.except_queue.get() 217 | self.logger = _get_logger(self.name, self.config.LOG_LEVEL) 218 | self.logger.error(except_info) 219 | # for each_thread in self.threads: 220 | # each_thread.join() 221 | break 222 | self.logger.critical("全部解析完毕,等待保存进程") 223 | self._finish = True 224 | save_thread.join() 225 | self.logger.critical(f'合计爬取项目数:{meta["item_count"]}') 226 | self.logger.critical(f'合计爬取链接数:{meta["link_count"]}') 227 | 228 | class ParseThread(threading.Thread): 229 | def __init__(self, name, url_queue, queueLock, get_response, parse, save, except_queue, item_queue, meta, config): 230 | threading.Thread.__init__(self, target=self.run) 231 | self.name = name 232 | self.url_queue = url_queue 233 | self.queueLock = queueLock 234 | self.get_response = get_response 235 | self.parse = parse 236 | self.save = save 237 | self.item_queue = item_queue 238 | self.except_queue = except_queue 239 | self.logger = _get_logger(self.name, config.LOG_LEVEL) 240 | self.meta = meta 241 | 242 | def run(self): 243 | while True: 244 | try: 245 | sleep(self.meta['download_interval']) 246 | self.queueLock.acquire() 247 | if not self.url_queue.empty(): 248 | url = self.url_queue.get() 249 | self.meta['link_count'] += 1 250 | else: 251 | url = None 252 | self.queueLock.release() 253 | 254 | if url is None: 255 | sleep(1) 256 | continue 257 | self.logger.debug(f"开始爬取 {url}") 258 | response = self.get_response(url) 259 | try: 260 | item = self.parse(response) 261 | except Exception as e: 262 | # 如果解析失败 263 | self.logger.exception(e) 264 | continue 265 | self.item_queue.put(item) 266 | datetime.timedelta(1) 267 | except NotImplementedError as e: 268 | self.logger.exception(e) 269 | return 270 | except Exception as e: 271 | self.logger.exception(e) 272 | -------------------------------------------------------------------------------- /simpyder/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from os.path import dirname, join 3 | from .__version__ import __VERSION__ 4 | DEFAULT_UA = f'Simpyder {__VERSION__}' 5 | FAKE_UA = 'Mozilla/5.0 (Windows NT 10.0 Win64 x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36' 6 | 7 | 8 | BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = range(8) 9 | 10 | # The background is set with 40 plus the number of the color, and the foreground with 30 11 | 12 | # These are the sequences need to get colored ouput 13 | RESET_SEQ = "\033[0m" 14 | COLOR_SEQ = "\033[1;%dm" 15 | BOLD_SEQ = "\033[1m" 16 | 17 | 18 | def formatter_message(message, use_color=True): 19 | if use_color: 20 | message = message.replace( 21 | "$RESET", RESET_SEQ).replace("$BOLD", BOLD_SEQ) 22 | else: 23 | message = message.replace("$RESET", "").replace("$BOLD", "") 24 | return message 25 | 26 | 27 | COLORS = { 28 | 'WARNING': YELLOW, 29 | 'INFO': CYAN, 30 | 'DEBUG': BLUE, 31 | 'CRITICAL': YELLOW, 32 | 'ERROR': RED 33 | } 34 | 
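# For example, COLOR_SEQ % (30 + COLORS['WARNING']) expands to "\033[1;33m",
# i.e. a bold yellow foreground; ColoredFormatter below wraps each level name
# in such a sequence plus RESET_SEQ, which is what colours the log output.
# (30-37 are the standard ANSI SGR foreground colour codes.)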
35 | 36 | class ColoredFormatter(logging.Formatter): 37 | def __init__(self, msg, use_color=True): 38 | logging.Formatter.__init__(self, msg) 39 | self.use_color = use_color 40 | 41 | def format(self, record): 42 | levelname = record.levelname 43 | if self.use_color and levelname in COLORS: 44 | levelname_color = COLOR_SEQ % ( 45 | 30 + COLORS[levelname]) + levelname + RESET_SEQ 46 | record.levelname = levelname_color 47 | return logging.Formatter.format(self, record) 48 | 49 | 50 | # Custom logger class with multiple destinations 51 | class ColoredLogger(logging.Logger): 52 | FORMAT = "[%(asctime)s][%(levelname)s] @ %(name)s: %(message)s" 53 | COLOR_FORMAT = formatter_message(FORMAT, True) 54 | 55 | def __init__(self, name): 56 | logging.Logger.__init__(self, name, logging.INFO) 57 | color_formatter = ColoredFormatter(self.COLOR_FORMAT) 58 | console = logging.StreamHandler() 59 | console.setFormatter(color_formatter) 60 | self.addHandler(console) 61 | return 62 | 63 | 64 | def _get_logger(name, level='INFO'): 65 | logging.setLoggerClass(ColoredLogger) 66 | logger = logging.getLogger(name) 67 | logger.setLevel(level) 68 | return logger 69 | --------------------------------------------------------------------------------
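One feature of `AsynSpider` (simpyder/spiders/asyn_spider.py above) that the README does not cover is the `gen_proxy` hook: it is an asynchronous generator, `__update_proxy()` pulls the next value from it when a request fails, and `get()` forwards its `proxy` argument to aiohttp's `session.get` on the retry. Below is a minimal sketch of rotating through a fixed pool; the proxy addresses and the spider name are placeholders for illustration, not values taken from the project.

``` python
from simpyder.spiders import AsynSpider


async def gen_proxy():
    # Hypothetical proxy pool; the spider awaits __anext__() on this generator
    # whenever it decides the current proxy is no longer usable.
    pool = ["http://127.0.0.1:8001", "http://127.0.0.1:8002"]
    while True:
        for proxy in pool:
            yield proxy


spider = AsynSpider(name="proxy-demo", concurrency=16)
spider.gen_proxy = gen_proxy
# gen_url, parse and save would be bound here exactly as shown in the README,
# then spider.run() starts the crawl.
```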