├── frontera
├── VERSION
├── contrib
│   ├── __init__.py
│   ├── requests
│   │   ├── __init__.py
│   │   ├── manager.py
│   │   └── converters.py
│   ├── scrapy
│   │   ├── __init__.py
│   │   ├── middlewares
│   │   │   ├── __init__.py
│   │   │   ├── seeds
│   │   │   │   ├── __init__.py
│   │   │   │   ├── file.py
│   │   │   │   └── s3.py
│   │   │   └── schedulers.py
│   │   ├── schedulers
│   │   │   ├── __init__.py
│   │   │   └── recording.py
│   │   ├── overusedbuffer.py
│   │   ├── manager.py
│   │   ├── settings_adapter.py
│   │   └── converters.py
│   ├── middlewares
│   │   ├── __init__.py
│   │   ├── domain.py
│   │   └── fingerprint.py
│   ├── messagebus
│   │   ├── __init__.py
│   │   ├── kafka
│   │   │   └── __init__.py
│   │   └── zeromq
│   │   │   ├── socket_config.py
│   │   │   └── broker.py
│   ├── backends
│   │   ├── remote
│   │   │   ├── __init__.py
│   │   │   ├── codecs
│   │   │   │   ├── __init__.py
│   │   │   │   └── msgpack.py
│   │   │   └── messagebus.py
│   │   ├── partitioners.py
│   │   ├── sqlalchemy
│   │   │   ├── models.py
│   │   │   └── revisiting.py
│   │   └── __init__.py
│   └── canonicalsolvers
│   │   ├── __init__.py
│   │   ├── common.py
│   │   └── basic.py
├── logger
│   ├── __init__.py
│   ├── handlers
│   │   └── __init__.py
│   ├── formatters
│   │   ├── json.py
│   │   ├── __init__.py
│   │   └── color.py
│   └── filters
│   │   └── __init__.py
├── utils
│   ├── __init__.py
│   ├── graphs
│   │   ├── diagrams
│   │   │   ├── graph_01.png
│   │   │   ├── graph_02.png
│   │   │   ├── graph_03.png
│   │   │   ├── graph_04.png
│   │   │   ├── graph_05.png
│   │   │   ├── graph_06.png
│   │   │   ├── graph_07.png
│   │   │   └── graph_08.png
│   │   ├── __init__.py
│   │   ├── generate_diagrams.py
│   │   ├── models.py
│   │   ├── manager.py
│   │   └── data.py
│   ├── encoders.py
│   ├── converters.py
│   ├── fingerprint.py
│   ├── url.py
│   ├── async.py
│   ├── heap.py
│   ├── misc.py
│   ├── managers.py
│   └── tester.py
├── worker
│   ├── __init__.py
│   ├── strategies
│   │   ├── bfs.py
│   │   └── __init__.py
│   └── server.py
├── exceptions.py
├── __init__.py
├── settings
│   ├── default_settings.py
│   └── __init__.py
└── core
│   ├── __init__.py
│   ├── codec.py
│   └── models.py
├── requirements.txt
├── docs
├── source
│   ├── _ext
│   │   ├── fronteradocs.pyc
│   │   └── fronteradocs.py
│   ├── topics
│   │   ├── _images
│   │   │   ├── site_01.png
│   │   │   ├── site_02.png
│   │   │   ├── site_03.png
│   │   │   ├── site_04.png
│   │   │   ├── frontier_01.png
│   │   │   ├── frontier_02.png
│   │   │   ├── frontera-design.png
│   │   │   └── high-level-arc.png
│   │   ├── contributing.rst
│   │   ├── dns-service.rst
│   │   ├── loggers.rst
│   │   ├── glossary.rst
│   │   ├── faq.rst
│   │   ├── installation.rst
│   │   ├── frontier-canonicalsolvers.rst
│   │   ├── own_crawling_strategy.rst
│   │   ├── what-is-cf.rst
│   │   ├── fine-tuning.rst
│   │   ├── examples.rst
│   │   ├── quick-start-distributed.rst
│   │   ├── quick-start-single.rst
│   │   ├── overview.rst
│   │   ├── requests-integration.rst
│   │   ├── frontier-objects.rst
│   │   ├── message_bus.rst
│   │   ├── frontier-tester.rst
│   │   ├── scrapy-recorder.rst
│   │   ├── run-modes.rst
│   │   ├── architecture.rst
│   │   ├── tests.rst
│   │   ├── frontier-middlewares.rst
│   │   ├── cluster-setup.rst
│   │   ├── frontier-api.rst
│   │   └── scrapy-integration.rst
│   └── index.rst
└── README
└── readme.rst
/frontera/VERSION:
--------------------------------------------------------------------------------
1 | 0.2.0
--------------------------------------------------------------------------------
/frontera/contrib/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/frontera/logger/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/frontera/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /frontera/contrib/requests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /frontera/contrib/scrapy/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /frontera/contrib/middlewares/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /frontera/contrib/scrapy/middlewares/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /frontera/contrib/scrapy/schedulers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /frontera/worker/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | six>=1.8.0 2 | w3lib>=1.15.0 3 | -------------------------------------------------------------------------------- /frontera/contrib/messagebus/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- -------------------------------------------------------------------------------- /frontera/contrib/backends/remote/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- -------------------------------------------------------------------------------- /frontera/contrib/messagebus/kafka/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- -------------------------------------------------------------------------------- /frontera/contrib/backends/remote/codecs/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- -------------------------------------------------------------------------------- /docs/source/_ext/fronteradocs.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xsren/frontera-docs-zh_CN/HEAD/docs/source/_ext/fronteradocs.pyc -------------------------------------------------------------------------------- /frontera/exceptions.py: -------------------------------------------------------------------------------- 1 | class NotConfigured(Exception): 2 | """Indicates a missing configuration situation""" 3 | pass 4 | -------------------------------------------------------------------------------- /docs/source/topics/_images/site_01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xsren/frontera-docs-zh_CN/HEAD/docs/source/topics/_images/site_01.png -------------------------------------------------------------------------------- 
/docs/source/topics/_images/site_02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xsren/frontera-docs-zh_CN/HEAD/docs/source/topics/_images/site_02.png -------------------------------------------------------------------------------- /docs/source/topics/_images/site_03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xsren/frontera-docs-zh_CN/HEAD/docs/source/topics/_images/site_03.png -------------------------------------------------------------------------------- /docs/source/topics/_images/site_04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xsren/frontera-docs-zh_CN/HEAD/docs/source/topics/_images/site_04.png -------------------------------------------------------------------------------- /docs/source/topics/_images/frontier_01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xsren/frontera-docs-zh_CN/HEAD/docs/source/topics/_images/frontier_01.png -------------------------------------------------------------------------------- /docs/source/topics/_images/frontier_02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xsren/frontera-docs-zh_CN/HEAD/docs/source/topics/_images/frontier_02.png -------------------------------------------------------------------------------- /frontera/utils/graphs/diagrams/graph_01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xsren/frontera-docs-zh_CN/HEAD/frontera/utils/graphs/diagrams/graph_01.png -------------------------------------------------------------------------------- /frontera/utils/graphs/diagrams/graph_02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xsren/frontera-docs-zh_CN/HEAD/frontera/utils/graphs/diagrams/graph_02.png -------------------------------------------------------------------------------- /frontera/utils/graphs/diagrams/graph_03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xsren/frontera-docs-zh_CN/HEAD/frontera/utils/graphs/diagrams/graph_03.png -------------------------------------------------------------------------------- /frontera/utils/graphs/diagrams/graph_04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xsren/frontera-docs-zh_CN/HEAD/frontera/utils/graphs/diagrams/graph_04.png -------------------------------------------------------------------------------- /frontera/utils/graphs/diagrams/graph_05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xsren/frontera-docs-zh_CN/HEAD/frontera/utils/graphs/diagrams/graph_05.png -------------------------------------------------------------------------------- /frontera/utils/graphs/diagrams/graph_06.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xsren/frontera-docs-zh_CN/HEAD/frontera/utils/graphs/diagrams/graph_06.png -------------------------------------------------------------------------------- /frontera/utils/graphs/diagrams/graph_07.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/xsren/frontera-docs-zh_CN/HEAD/frontera/utils/graphs/diagrams/graph_07.png -------------------------------------------------------------------------------- /frontera/utils/graphs/diagrams/graph_08.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xsren/frontera-docs-zh_CN/HEAD/frontera/utils/graphs/diagrams/graph_08.png -------------------------------------------------------------------------------- /docs/source/topics/_images/frontera-design.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xsren/frontera-docs-zh_CN/HEAD/docs/source/topics/_images/frontera-design.png -------------------------------------------------------------------------------- /docs/source/topics/_images/high-level-arc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xsren/frontera-docs-zh_CN/HEAD/docs/source/topics/_images/high-level-arc.png -------------------------------------------------------------------------------- /frontera/contrib/canonicalsolvers/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from . import basic 4 | from .common import CorporateWebsiteFriendly 5 | Basic = basic.BasicCanonicalSolver 6 | -------------------------------------------------------------------------------- /frontera/utils/graphs/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from . 
import data 3 | from .manager import CrawlGraphManager as Manager 4 | from .models import CrawlPage as Page 5 | from .models import CrawlPageRelation as Relation -------------------------------------------------------------------------------- /frontera/logger/handlers/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | import sys 3 | import logging 4 | 5 | from frontera.logger import formatters 6 | 7 | CONSOLE = logging.StreamHandler(stream=sys.stdout) 8 | CONSOLE.setFormatter(formatters.CONSOLE) 9 | -------------------------------------------------------------------------------- /frontera/logger/formatters/json.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | from pythonjsonlogger.jsonlogger import JsonFormatter 4 | from frontera.utils.encoders import DateTimeEncoder 5 | 6 | 7 | class JSONFormatter(JsonFormatter): 8 | def __init__(self): 9 | json_encoder = DateTimeEncoder 10 | super(JSONFormatter, self).__init__(json_encoder=json_encoder) 11 | -------------------------------------------------------------------------------- /frontera/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from .core.manager import FrontierManager 3 | from .core.models import Request, Response 4 | from .core.components import Backend, DistributedBackend, Middleware 5 | from .settings import Settings 6 | from .utils.tester import FrontierTester 7 | 8 | from ._version import get_versions 9 | 10 | __version__ = get_versions()['version'] 11 | del get_versions 12 | -------------------------------------------------------------------------------- /docs/source/topics/contributing.rst: -------------------------------------------------------------------------------- 1 | ======================= 2 | 贡献指引 3 | ======================= 4 | 5 | * 所有的问题和讨论请使用 `Frontera google group`_ 。 6 | * 提交补丁请使用 `Github repo`_ 的 pull request 。 7 | * Github issues 中请提交 Frontera 将来能受益的问题。请将自己运行 Frontera 的问题提交到 google group 。 8 | 9 | 我们总是乐意接受有文档和测试的解决方案。 10 | 11 | .. _`Frontera google group`: https://groups.google.com/a/scrapinghub.com/forum/#!forum/frontera 12 | .. 
_`Github repo`: https://github.com/scrapinghub/frontera -------------------------------------------------------------------------------- /docs/source/topics/dns-service.rst: -------------------------------------------------------------------------------- 1 | =========== 2 | DNS 服务 3 | =========== 4 | 5 | 除了提到的 :ref:`basic_requirements` 你可能还需要一个专用的 DNS 服务。特别是在你的爬虫会产生大量 DNS 请求的情况下。在广度优先抓取或者其他短时间内访问大量网站的情况下,使用专用的 DNS 服务都是正确的。 6 | 7 | 由于负载巨大,DNS 服务最终可能会被您的网络提供商阻止。 8 | 9 | DNS策略有两种选择: 10 | 11 | * 递归DNS解析, 12 | * 利用上游服务器(大规模的DNS缓存像OpenDNS或Verizon)。 13 | 14 | 第二个仍然容易阻塞。 15 | 16 | NLnet 实验室发布的 DNS 服务软件很好用 https://www.unbound.net/ 。它允许选择上述策略之一,并维护本地DNS缓存。 17 | 18 | 请查看 Scrapy 选项 ``REACTOR_THREADPOOL_MAXSIZE`` 和 ``DNS_TIMEOUT`` 。 -------------------------------------------------------------------------------- /frontera/contrib/requests/manager.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from frontera.utils.managers import FrontierManagerWrapper 3 | from .converters import RequestConverter, ResponseConverter 4 | 5 | 6 | class RequestsFrontierManager(FrontierManagerWrapper): 7 | 8 | def __init__(self, settings): 9 | super(RequestsFrontierManager, self).__init__(settings) 10 | self.request_converter = RequestConverter() 11 | self.response_converter = ResponseConverter(self.request_converter) 12 | -------------------------------------------------------------------------------- /frontera/utils/encoders.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | import json 3 | import datetime 4 | 5 | 6 | class DateTimeEncoder(json.JSONEncoder): 7 | def default(self, obj): 8 | if isinstance(obj, datetime.datetime): 9 | return obj.isoformat() 10 | elif isinstance(obj, datetime.date): 11 | return obj.isoformat() 12 | elif isinstance(obj, datetime.timedelta): 13 | return (datetime.datetime.min + obj).time().isoformat() 14 | else: 15 | return super(DateTimeEncoder, self).default(obj) 16 | 17 | -------------------------------------------------------------------------------- /frontera/contrib/scrapy/overusedbuffer.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from scrapy.utils.httpobj import urlparse_cached 3 | from scrapy.resolver import dnscache 4 | 5 | from frontera.core import OverusedBuffer 6 | 7 | 8 | class OverusedBufferScrapy(OverusedBuffer): 9 | """ 10 | Scrapy optimized version of OverusedBuffer. Url parsing and dns resolving are made using Scrapy primitives. 11 | """ 12 | 13 | def _get_key(self, request, type): 14 | key = urlparse_cached(request).hostname or '' 15 | if type == 'ip': 16 | key = dnscache.get(key, key) 17 | return key 18 | -------------------------------------------------------------------------------- /frontera/contrib/scrapy/manager.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from frontera.utils.managers import FrontierManagerWrapper 3 | from .converters import RequestConverter, ResponseConverter 4 | 5 | 6 | class ScrapyFrontierManager(FrontierManagerWrapper): 7 | 8 | spider = None 9 | 10 | def set_spider(self, spider): 11 | assert self.spider is None, 'Spider is already set. Only one spider is supported per process.' 
12 | self.spider = spider 13 | self.request_converter = RequestConverter(self.spider) 14 | self.response_converter = ResponseConverter(self.spider, self.request_converter) 15 | -------------------------------------------------------------------------------- /frontera/logger/formatters/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | import logging 3 | 4 | LOG_FORMAT = "[%(name)s] %(message)s" 5 | 6 | try: 7 | from .color import ColorFormatter 8 | 9 | LOG_COLOR_FORMAT = "%(log_color)s"+LOG_FORMAT 10 | COLORS = { 11 | "DEBUG": "white", 12 | "INFO": "green", 13 | "WARNING": "yellow", 14 | "ERROR": "red", 15 | "CRITICAL": "bold_purple", 16 | } 17 | 18 | CONSOLE = ColorFormatter( 19 | format=LOG_COLOR_FORMAT, 20 | log_colors=COLORS.copy(), 21 | log_color_field="levelname") 22 | except ImportError: 23 | CONSOLE = logging.Formatter(fmt=LOG_FORMAT) 24 | -------------------------------------------------------------------------------- /frontera/utils/converters.py: -------------------------------------------------------------------------------- 1 | class BaseRequestConverter(object): 2 | """Converts between frontera and XXX request objects""" 3 | def to_frontier(self, request): 4 | """request: XXX > Frontier""" 5 | raise NotImplementedError 6 | 7 | def from_frontier(self, request): 8 | """request: Frontier > XXX""" 9 | raise NotImplementedError 10 | 11 | 12 | class BaseResponseConverter(object): 13 | """Converts between frontera and XXX response objects""" 14 | def to_frontier(self, response): 15 | """response: XXX > Frontier""" 16 | raise NotImplementedError 17 | 18 | def from_frontier(self, response): 19 | """response: Frontier > XXX""" 20 | raise NotImplementedError 21 | -------------------------------------------------------------------------------- /frontera/contrib/scrapy/middlewares/seeds/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | 4 | class SeedLoader(object): 5 | def __init__(self, crawler): 6 | self.crawler = crawler 7 | self.configure(crawler.settings) 8 | 9 | def configure(self, settings): 10 | raise NotImplementedError 11 | 12 | @classmethod 13 | def from_crawler(cls, crawler): 14 | return cls(crawler) 15 | 16 | def process_start_requests(self, start_requests, spider): 17 | urls = [url for url in self.load_seeds() if not url.startswith('#')] 18 | return [spider.make_requests_from_url(url) for url in urls] 19 | 20 | def load_seeds(self): 21 | raise NotImplementedError 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /docs/source/topics/loggers.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | Logging 3 | ======= 4 | 5 | Frontera 使用 Python 原生日志系统。这允许用户通过配置文件(参考 :setting:`LOGGING_CONFIG` )或者在运行时配置 logger。 6 | 7 | Logger 配置语法在这里 8 | https://docs.python.org/2/library/logging.config.html 9 | 10 | 使用的Loggers 11 | ============ 12 | 13 | * kafka 14 | * hbase.backend 15 | * hbase.states 16 | * hbase.queue 17 | * sqlalchemy.revisiting.queue 18 | * sqlalchemy.metadata 19 | * sqlalchemy.states 20 | * sqlalchemy.queue 21 | * offset-fetcher 22 | * messagebus-backend 23 | * cf-server 24 | * db-worker 25 | * strategy-worker 26 | * messagebus.kafka 27 | * memory.queue 28 | * memory.dequequeue 29 | * memory.states 30 | * manager.components 31 | * manager 32 | * frontera.contrib.scrapy.schedulers.FronteraScheduler 
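
下面是一个在运行时用标准库 ``logging.config.dictConfig`` 调整上述部分 logger 的最小示意(假设性示例:logger 名称取自上面的列表,handler 与日志级别请按实际部署情况调整)::

    import logging.config

    logging.config.dictConfig({
        'version': 1,
        'disable_existing_loggers': False,
        'handlers': {
            'console': {'class': 'logging.StreamHandler'},
        },
        'loggers': {
            # 只改动这里列出的 logger,其余沿用默认配置
            'manager': {'handlers': ['console'], 'level': 'INFO'},
            'db-worker': {'handlers': ['console'], 'level': 'INFO'},
            'strategy-worker': {'handlers': ['console'], 'level': 'DEBUG'},
        },
    })
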
33 | 34 | -------------------------------------------------------------------------------- /docs/source/topics/glossary.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | 术语表 3 | ======== 4 | 5 | 6 | .. glossary:: 7 | spider log 8 | 来自爬虫的编码消息流。每个消息都是从文档中提取,通常是链接,分数,分类结果。 9 | 10 | scoring log 11 | 从 strategy worker 到 db worker,包含更新事件和调度标志(如果链接需要调度下载)的分数。 12 | 13 | spider feed 14 | 从 :term:`db worker` 到爬虫,包含新的一批需要抓取的文档。 15 | 16 | strategy worker 17 | 一种特殊的 worker,运行抓取策略代码:为链接评分,决定链接是否需要被调度(查询 :term:`state cache` )和合适停止抓取。这种 worker 是分片的。 18 | 19 | db worker 20 | 负责和存储数据库通信,主要存储元数据和内容,同时拉取新的任务去下载。 21 | 22 | state cache 23 | 内存数据结构,包含文档是否被抓取的状态信息。 24 | 定期与持久存储同步。 25 | 26 | message bus 27 | 传输层抽象机制。提供传输层抽象和实现的接口。 28 | 29 | spider 30 | 从 Web 检索和提取内容,使用 :term:`spider feed` 作为输入队列,将结果存储到 :term:`spider log` 。在这个文档中,提取器被用作同义词。 -------------------------------------------------------------------------------- /docs/source/topics/faq.rst: -------------------------------------------------------------------------------- 1 | ====== 2 | F.A.Q. 3 | ====== 4 | 5 | .. _efficient-parallel-downloading: 6 | 7 | 如何并行有效下载? 8 | ---------------------------------------- 9 | 10 | 通常,URL排序的设计意味着从同一个域获取许多URL。 如果抓取过程需要有礼貌,则必须保留一些延迟和请求率。 另一方面,下载器同时可以同时下载许多网址(例如100)。 因此,来自同一域的URL的并发导致下载器连接池资源的低效浪费。 11 | 12 | 这是一个简短的例子。 想像一下,我们有来自许多不同域的10K网址的队列。 我们的任务是尽快取得它。 在下载期间,我们希望对每个主机有礼貌和并限制RPS。 同时,我们有一个优先级,它倾向于对来自同一个域的URL进行分组。 当抓取工具正在请求批量的URL以获取时,将从同一主机获取数百个URL。 由于RPS限制和延迟,下载器将无法快速获取它们。 因此,从队列中挑选统一域的URL可以让我们浪费时间,因为下载器的连接池浪费了大部分时间。 13 | 14 | 解决方案是在下载程序中为 Frontera 后端提供主机名/ ip(通常但不是必需的)使用。 我们在方法 :attr:`get_next_requests ` 中有一个关键字参数,用于传递这些统计信息到 Fronter a后端。 任何类型的信息都可以通过。 这个参数通常设置在 Frontera 之外,然后通过 :class:`FrontierManagerWrapper ` 子类传递给CF到后端。 -------------------------------------------------------------------------------- /frontera/contrib/scrapy/middlewares/schedulers.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class BaseSchedulerMiddleware(object): 4 | 5 | def __init__(self, crawler): 6 | self.crawler = crawler 7 | 8 | @classmethod 9 | def from_crawler(cls, crawler): 10 | return cls(crawler) 11 | 12 | @property 13 | def scheduler(self): 14 | return self.crawler.engine.slot.scheduler 15 | 16 | 17 | class SchedulerSpiderMiddleware(BaseSchedulerMiddleware): 18 | def process_spider_output(self, response, result, spider): 19 | return self.scheduler.process_spider_output(response, result, spider) 20 | 21 | 22 | class SchedulerDownloaderMiddleware(BaseSchedulerMiddleware): 23 | def process_exception(self, request, exception, spider): 24 | return self.scheduler.process_exception(request, exception, spider) 25 | -------------------------------------------------------------------------------- /docs/source/topics/installation.rst: -------------------------------------------------------------------------------- 1 | ================== 2 | 安装指南 3 | ================== 4 | 5 | 下面的步骤假定您已经安装了以下必需的软件: 6 | 7 | * `Python`_ 2.7+ 或 3.4+ 8 | 9 | * `pip`_ 和 `setuptools`_ Python 包。 10 | 11 | 你可以使用 pip 安装 Frontera. 12 | 13 | 使用 pip 安装:: 14 | 15 | pip install frontera[option1,option2,...optionN] 16 | 17 | 选项 18 | ======= 19 | Each option installs dependencies needed for particular functionality. 
20 | 每个选项安装所需的特定功能的依赖。 21 | 22 | * *sql* - 关系型数据库, 23 | * *graphs* - Graph Manager, 24 | * *logging* - 彩色日志, 25 | * *tldextract* - 可以使用 :setting:`TLDEXTRACT_DOMAIN_INFO` 26 | * *hbase* - HBase 分布式后端, 27 | * *zeromq* - ZeroMQ 消息总线, 28 | * *kafka* - Kafka 消息总线, 29 | * *distributed* - workers 依赖. 30 | 31 | .. _Python: http://www.python.org 32 | .. _pip: http://www.pip-installer.org/en/latest/installing.html 33 | .. _setuptools: https://pypi.python.org/pypi/setuptools 34 | -------------------------------------------------------------------------------- /docs/source/topics/frontier-canonicalsolvers.rst: -------------------------------------------------------------------------------- 1 | .. _canonical-url-solver: 2 | 3 | ==================== 4 | Canonical URL Solver 5 | 规范 URL 解算器 6 | ==================== 7 | 8 | 规范 URL 解算器 是一种特殊的 :ref:`middleware ` 对象,用来识别网页的规范 URL,并根据这个修改 request 或者 response 的元数据。通常规范 URL 解算器是在调用后端方法之前最后一个执行的 middleware。 9 | 10 | 此组件的主要目的是防止元数据记录重复和混淆与其相关联的抓取器行为。原因是: 11 | 12 | - 不同的重定向链将指向相同的文档。 13 | 14 | - 同一份文件可以通过多个不同的URL访问。 15 | 16 | 17 | 精心设计的系统具有自己的稳定算法,为每个文档选择正确的 URL。另见 `Canonical link element`_ 。 18 | 19 | 规范 URL 解算器在Frontera Manager初始化期间使用 :setting:`CANONICAL_SOLVER` 设置中的类来实例化。 20 | 21 | 22 | 内置规范 URL 解算器参考 23 | ======================================== 24 | 25 | 基本的 26 | ----- 27 | 用作默认值。 28 | 29 | .. autoclass:: frontera.contrib.canonicalsolvers.basic.BasicCanonicalSolver 30 | 31 | 32 | .. _Canonical link element: https://en.wikipedia.org/wiki/Canonical_link_element#Purpose -------------------------------------------------------------------------------- /frontera/contrib/scrapy/settings_adapter.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from frontera.settings import BaseSettings, DefaultSettings 3 | 4 | 5 | class ScrapySettingsAdapter(BaseSettings): 6 | """ 7 | Wrapps the frontera settings, falling back to scrapy and default settings 8 | """ 9 | def __init__(self, crawler_settings): 10 | frontera_settings = crawler_settings.get('FRONTERA_SETTINGS', None) 11 | super(ScrapySettingsAdapter, self).__init__(module=frontera_settings) 12 | self._crawler_settings = crawler_settings or {} 13 | self._default_settings = DefaultSettings() 14 | 15 | def get(self, key, default_value=None): 16 | val = super(ScrapySettingsAdapter, self).get(key) 17 | if val is not None: 18 | return val 19 | 20 | val = self._crawler_settings.get(key) 21 | if val is not None: 22 | return val 23 | 24 | return self._default_settings.get(key, default_value) 25 | -------------------------------------------------------------------------------- /docs/source/topics/own_crawling_strategy.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | 抓取策略 3 | ================= 4 | 5 | 使用 ``cluster`` 例子和 ``frontera.worker.strategies.bfs`` 模型进行参考。 6 | Use ``cluster`` example and ``frontera.worker.strategies.bfs`` module for reference. 一般来说,你需要写一个 7 | 抓取策略子类,参照: 8 | 9 | .. autoclass:: frontera.worker.strategies.BaseCrawlingStrategy 10 | 11 | **Methods** 12 | 13 | .. automethod:: frontera.worker.strategies.BaseCrawlingStrategy.from_worker 14 | .. automethod:: frontera.worker.strategies.BaseCrawlingStrategy.add_seeds 15 | .. automethod:: frontera.worker.strategies.BaseCrawlingStrategy.page_crawled 16 | .. automethod:: frontera.worker.strategies.BaseCrawlingStrategy.page_error 17 | .. automethod:: frontera.worker.strategies.BaseCrawlingStrategy.finished 18 | .. 
automethod:: frontera.worker.strategies.BaseCrawlingStrategy.close 19 | 20 | 21 | 该类可以放在任何模块中,并在启动时使用命令行选项或 :setting:`CRAWLING_STRATEGY` 设置传递给 :term:`strategy worker`。 22 | 23 | 这个策略类在 strategy worker 中实例化,可以使用自己的存储或任何其他类型的资源。所有来着 :term:`spider log` 的都会传给这些方法。返回的分数不一定与方法参数相同。``finished()`` 方法会被周期性的调用来检测抓取目标是否达到了。 24 | -------------------------------------------------------------------------------- /frontera/contrib/scrapy/middlewares/seeds/file.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | import codecs 3 | 4 | from scrapy.exceptions import NotConfigured 5 | 6 | from frontera.contrib.scrapy.middlewares.seeds import SeedLoader 7 | 8 | 9 | class FileSeedLoader(SeedLoader): 10 | def configure(self, settings): 11 | self.seeds_source = settings.get('SEEDS_SOURCE') 12 | if not self.seeds_source: 13 | raise NotConfigured 14 | 15 | def load_seeds(self): 16 | # TODO check if it's an existing file or a folder 17 | return self.load_seeds_from_file(self.seeds_source) 18 | 19 | def load_seeds_from_file(self, file_path): 20 | with codecs.open(file_path, 'rU') as f: 21 | return self.load_seeds_from_data((f)) 22 | 23 | def load_seeds_from_data(self, data): 24 | seeds = [] 25 | for seed in data: 26 | clean_seed = self.clean_seed(seed) 27 | if clean_seed: 28 | seeds.append(clean_seed) 29 | return seeds 30 | 31 | def clean_seed(self, url): 32 | return url.strip('\t\n\r') 33 | -------------------------------------------------------------------------------- /frontera/contrib/canonicalsolvers/common.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from .basic import BasicCanonicalSolver 4 | from frontera.utils.url import parse_url 5 | 6 | 7 | class CorporateWebsiteFriendly(BasicCanonicalSolver): 8 | 9 | def _set_canonical(self, obj): 10 | if b'redirect_urls' in obj.meta: 11 | # if home page is requested then leave the target page as canonical 12 | urls = obj.meta[b'redirect_urls'] 13 | scheme, netloc, path, params, query, fragment = parse_url(urls[0]) 14 | if not path or path in ['/', 'index.html', 'index.htm', 'default.htm']: 15 | return 16 | 17 | # check if redirect is within the same hostname 18 | target = parse_url(obj.url) 19 | src_hostname, _, _ = netloc.partition(':') 20 | trg_hostname, _, _ = target.netloc.partition(':') 21 | if src_hostname == trg_hostname: 22 | return 23 | 24 | # otherwise default behavior 25 | super(CorporateWebsiteFriendly, self)._set_canonical(obj) 26 | -------------------------------------------------------------------------------- /readme.rst: -------------------------------------------------------------------------------- 1 | Frontera 文档中文翻译 2 | =================== 3 | 4 | **注意:Frontera对Windows的兼容性不好,Windows开发者慎用** 5 | 6 | 在线阅读地址 7 | ------------- 8 | 9 | http://frontera-docs-zh-cn.rtfd.io/ 10 | 11 | 12 | 版本库 13 | ------- 14 | 15 | https://github.com/xsren/frontera-docs-zh_CN 16 | 17 | 18 | 19 | 翻译流程 20 | ---------- 21 | 22 | #. 因为在官方文档中还有很多自动生成的 API 文档,这些 API 文档作为查阅资料并不需要翻译 23 | #. 将官方文档中( https://github.com/scrapinghub/frontera/tree/master/docs )的需要翻译的章节文档原始 rst 文件整理到本项目中 24 | #. 翻译人员即在本项目中的 rst 文件开始文档翻译工作 25 | #. 本项目的文件版本使用 git 进行管理,版本库托管在 github 上 26 | #. 
协作方式按照通常的 fork、pull-request、merge 方式进行 27 | 28 | 29 | 自动发布流程 30 | ------------ 31 | 32 | 因为本项目基于 Sphinx ( http://www.sphinx-doc.org/ ) 构建,并且已经关联了 ReadtheDocs ( https://readthedocs.org/ ) 在线服务,所以在每次代码库有变动之后,文档就会在 ReadtheDocs 自动构建,输出友好阅读版本( 地址: http://pandas-docs-zh-cn.rtfd.io/ ) 33 | 34 | 35 | 文档版本 36 | --------- 37 | 38 | 依照 Frontera v0.6 的文档 ( https://github.com/scrapinghub/frontera/ ) 39 | 40 | 41 | 协作交流 42 | --------- 43 | 44 | 45 | 46 | TODO 47 | ------ 48 | 49 | #. 整理官方文档 rst 文件到本项目代码库中 50 | #. 去掉官方文档中的自动生成的文档,以及具体 API 调用说明文档,这些资料直接查阅英文文档更合适 51 | #. 去掉官方文中的类库模块相关引用,以及非标准标记语句 52 | #. 召集人员进行翻译工作 53 | -------------------------------------------------------------------------------- /frontera/utils/graphs/generate_diagrams.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import print_function 3 | import os 4 | import sys 5 | 6 | from .manager import CrawlGraphManager 7 | from .data import GRAPHS 8 | 9 | SCRIPT_FOLDER = os.path.abspath(os.path.split(sys.argv[0])[0]) 10 | CHARTS_FOLDER = os.path.join(SCRIPT_FOLDER, 'diagrams') 11 | 12 | 13 | def generate_filename(graph_name): 14 | name = graph_name 15 | name = name.replace(' ', '_') 16 | name = name.lower() 17 | name = '%s.png' % name 18 | return name 19 | 20 | 21 | def generate_graph_diagram(filename, title, graph): 22 | print("generating png diagram for test '%s'..." % title) 23 | manager = CrawlGraphManager() 24 | manager.add_site_list(graph) 25 | manager.render(filename, label=title, use_urls=graph.use_urls) 26 | 27 | 28 | def generate_diagrams(): 29 | for graph in GRAPHS: 30 | generate_graph_diagram(filename=os.path.join(CHARTS_FOLDER, generate_filename(graph.name)), 31 | title=graph.name, 32 | graph=graph) 33 | 34 | 35 | if __name__ == '__main__': 36 | generate_diagrams() 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /docs/source/topics/what-is-cf.rst: -------------------------------------------------------------------------------- 1 | .. _crawl-frontier: 2 | 3 | ========================= 4 | 什么是 Crawl Frontier? 5 | ========================= 6 | 7 | Frontera 一个实现 crawl frontier 的框架。crawler frontier 是爬虫系统的一部分,它决定了爬虫抓取网页时候的逻辑和策略(哪些页面应该被抓取,优先级和排序,页面被重新抓取的频率等)。 8 | 9 | 通常的 crawl frontier 方案是: 10 | 11 | .. image:: _images/frontier_01.png 12 | :width: 300px 13 | :height: 178px 14 | 15 | 16 | frontier 以 URL 列表初始化,我们称之为种子。 一旦边界初始化,爬虫程序会询问下一步应该访问哪些页面。 当爬虫开始访问页面并获取结果时,它将通知 frontier 每个页面响应以及页面中包含的超链接。 这些链接被 frontier 当做新的请求加入,安装抓取策略进行抓取。 17 | 18 | 这个过程(请求新的任务/通知结果)会一直重复直到达到爬虫的结束条件。一些爬虫可能不会停止,我们称之为永久爬虫。 19 | 20 | Frontier 抓取策略几乎可以基于任何的逻辑。常见情况是基于得分/优先级,它们通过一个或多个页面的属性(新鲜度,更新时间,某些条款的内容相关性等)计算得来。也可以基于很简单的逻辑,比如 `FIFO`_/`LIFO`_ 或 `DFS`_/`BFS`_ 。 21 | 22 | 根据 frontier 的逻辑,可能需要持久存储系统来存储,更新或查询页面信息。 其他系统可能是100%不稳定的,并且不会在不同爬虫之间共享任何信息。 23 | 24 | 更多 crawl frontier 理论请参照 Christopher D. Manning, Prabhakar Raghavan & Hinrich Schütze 写的 `URL frontier`_ 文章。 25 | 26 | .. _FIFO: http://en.wikipedia.org/wiki/FIFO 27 | .. _LIFO: http://en.wikipedia.org/wiki/LIFO_(computing) 28 | .. _DFS: http://en.wikipedia.org/wiki/Depth-first_search 29 | .. _BFS: http://en.wikipedia.org/wiki/Breadth-first_search 30 | .. 
_URL frontier: http://nlp.stanford.edu/IR-book/html/htmledition/the-url-frontier-1.html -------------------------------------------------------------------------------- /docs/source/topics/fine-tuning.rst: -------------------------------------------------------------------------------- 1 | =============================== 2 | Frontera 集群优化 3 | =============================== 4 | 5 | 6 | 为什么爬行速度如此之低? 7 | ============================= 8 | 寻找瓶颈。 9 | 10 | * 所有请求都针对少量的几个网站, 11 | * DNS 解析(参考 :doc:`DNS Service ` ), 12 | * :term:`strategy worker` 性能问题 13 | * :term:`db worker` 生成任务不足 14 | * HBase 相应时间过长 15 | * 集群内的网络过载。 16 | 17 | 优化 HBase 18 | ============ 19 | * 在 HBase 中增加块缓存。 20 | * 在每个 HBase 区服务器上部署 Thrift 服务器,并将负载从 SW 传播到 Thrift。 21 | * 启用 Snappy 压缩 (参照 :setting:`HBASE_USE_SNAPPY`). 22 | 23 | 优化 Kafka 24 | ============ 25 | * 将日志大小降至最低,并优化系统以避免在 Kafka 存储大量数据。 一旦写入数据,它应尽可能快地消耗。 26 | * 使用SSD或甚至RAM存储 Kafka logs, 27 | * 启用 Snappy 压缩。 28 | 29 | 30 | 各种组件之间的流量控制 31 | ======================================= 32 | 33 | :setting:`MAX_NEXT_REQUESTS` 用于控制批量任务大小。 在爬虫配置中,它控制每个 :attr:`get_next_requests ` 调用返回多少任务。同时在 DB worker 中配置它,它会设置每个分区生成的任务数。 设置这些参数时要牢记: 34 | 35 | * DB worker 和爬虫值必须保持一致,以避免消息总线过载和消息丢失。 换句话说,DB worker 产生的任务要比爬虫消耗的要少一些,因为即使DB worker还没来得及产生新的任务,蜘蛛应该仍然可以获取新的页面。 36 | * 爬虫消费率取决于许多因素:互联网连接延迟,蜘蛛解析/抓取工作量,延迟和自动限制设置,代理使用等。 37 | * 保持爬虫任务队列总是满的,以防止蜘蛛空闲。 38 | * 一般建议是设置 DB worker值比爬虫大2-4倍。 39 | * 批量生成任务数量不应太大,这样才不会在后端产生太多的负载,并允许系统对队列更改做出快速反应。 40 | * 注意有关丢失的消息的警告。 41 | -------------------------------------------------------------------------------- /frontera/worker/strategies/bfs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from six.moves.urllib.parse import urlparse 4 | from frontera.core.components import States 5 | from frontera.worker.strategies import BaseCrawlingStrategy 6 | 7 | 8 | class CrawlingStrategy(BaseCrawlingStrategy): 9 | 10 | def add_seeds(self, seeds): 11 | for seed in seeds: 12 | if seed.meta[b'state'] is States.NOT_CRAWLED: 13 | seed.meta[b'state'] = States.QUEUED 14 | self.schedule(seed) 15 | 16 | def page_crawled(self, response): 17 | response.meta[b'state'] = States.CRAWLED 18 | 19 | def links_extracted(self, request, links): 20 | for link in links: 21 | if link.meta[b'state'] is States.NOT_CRAWLED: 22 | link.meta[b'state'] = States.QUEUED 23 | self.schedule(link, self.get_score(link.url)) 24 | 25 | def page_error(self, request, error): 26 | request.meta[b'state'] = States.ERROR 27 | self.schedule(request, score=0.0, dont_queue=True) 28 | 29 | def get_score(self, url): 30 | url_parts = urlparse(url) 31 | path_parts = url_parts.path.split('/') 32 | return 1.0 / (max(len(path_parts), 1.0) + len(url_parts.path) * 0.1) 33 | -------------------------------------------------------------------------------- /docs/source/topics/examples.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | 例子 3 | ======== 4 | 5 | 这个项目包含了 ``examples`` 文件夹,其中有使用 Frontera 的代码 :: 6 | 7 | examples/ 8 | requests/ 9 | general-spider/ 10 | scrapy_recording/ 11 | scripts/ 12 | 13 | 14 | - **requests**: Example script with `Requests`_ library. 15 | - **general-spider**: Scrapy 整合示例项目。 16 | - **scrapy_recording**: Scrapy 记录示例项目。 17 | - **scripts**: 一些简单的脚本。 18 | 19 | .. note:: 20 | 21 | **这个例子可能需要安装额外的库才能工作**. 
22 | 23 | 你可以使用 pip 来安装它们:: 24 | 25 | 26 | pip install -r requirements/examples.txt 27 | 28 | 29 | requests 30 | ======== 31 | 32 | 一个使用 `Requests`_ 库,抓取一个网站所有链接的脚本。 33 | 34 | 运行:: 35 | 36 | python links_follower.py 37 | 38 | 39 | general-spider 40 | ============== 41 | 42 | 一个简单的 Scrapy 爬虫,执行所有的种子任务。包含单进程,分布式爬虫和后端运行模式的配置文件。 43 | 44 | 查看 :doc:`quick-start-distributed` 如何运行。 45 | 46 | cluster 47 | ======= 48 | 49 | 是一个大型可扩展爬虫程序,用于抓取每个域有限制的大量网站。它在HBase中保留每个域的状态,并在安排新的下载请求时使用它。设计用于使用 HBase 在分布式后端运行模式下运行。 50 | 51 | scrapy_recording 52 | ================ 53 | 54 | 一个带有爬虫的简单脚本,可以跟踪站点的所有链接,记录抓取结果。 55 | 56 | 运行:: 57 | 58 | scrapy crawl recorder 59 | 60 | 61 | scripts 62 | ======= 63 | 64 | 一些关于如何使用不同 frontier 组件的示例脚本。 65 | 66 | 67 | .. _Requests: http://docs.python-requests.org/en/latest/ -------------------------------------------------------------------------------- /frontera/contrib/scrapy/middlewares/seeds/s3.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from six.moves.urllib.parse import urlparse 3 | from boto import connect_s3 4 | from scrapy.exceptions import NotConfigured 5 | 6 | from frontera.contrib.scrapy.middlewares.seeds.file import FileSeedLoader 7 | 8 | 9 | class S3SeedLoader(FileSeedLoader): 10 | def configure(self, settings): 11 | source = settings.get('SEEDS_SOURCE') 12 | u = urlparse(source) 13 | if not u.hostname or not u.scheme == 's3': 14 | raise NotConfigured 15 | self.bucket_name = u.hostname 16 | self.bucket_keys_prefix = u.path.lstrip('/') 17 | self.s3_aws_access_key = settings.get('SEEDS_AWS_ACCESS_KEY') 18 | self.s3_aws_secret_key = settings.get('SEEDS_AWS_SECRET_ACCESS_KEY') 19 | 20 | def load_seeds(self): 21 | conn = connect_s3(self.s3_aws_access_key, 22 | self.s3_aws_secret_key) 23 | bucket = conn.get_bucket(self.bucket_name) 24 | seeds = [] 25 | for key in bucket.list(self.bucket_keys_prefix): 26 | if key.name.endswith(".txt"): 27 | data = key.get_contents_as_string(encoding='utf-8').split() 28 | file_seeds = self.load_seeds_from_data(data) 29 | seeds.extend(file_seeds) 30 | return seeds 31 | -------------------------------------------------------------------------------- /frontera/contrib/backends/partitioners.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from struct import unpack 4 | from binascii import unhexlify 5 | 6 | from frontera.core.components import Partitioner 7 | from frontera.utils.misc import get_crc32 8 | 9 | 10 | class Crc32NamePartitioner(Partitioner): 11 | def partition(self, key, partitions=None): 12 | if key is None: 13 | return self.partitions[0] 14 | value = get_crc32(key) 15 | return self.partition_by_hash(value, partitions if partitions else self.partitions) 16 | 17 | def partition_by_hash(self, value, partitions): 18 | size = len(partitions) 19 | idx = value % size 20 | return partitions[idx] 21 | 22 | def __call__(self, key, all_partitions, available): 23 | return self.partition(key, all_partitions) 24 | 25 | 26 | class FingerprintPartitioner(Partitioner): 27 | def partition(self, key, partitions=None): 28 | if not partitions: 29 | partitions = self.partitions 30 | digest = unhexlify(key[0:2] + key[5:7] + key[10:12] + key[15:17]) 31 | value = unpack("i16s", host_checksum, doc_fprint)) 38 | return fprint 39 | -------------------------------------------------------------------------------- /docs/source/topics/quick-start-distributed.rst: 
-------------------------------------------------------------------------------- 1 | ============================ 2 | 分布式模式快速入门 3 | ============================ 4 | 5 | 这篇文档教您在本地快速搭建单机、多进程的 Frontera 系统。我们将使用 SQLite 和 ZeroMQ 构建可能最简单的 Frontera 系统。如果要搭建生产环境下的 Frontera 系统,请参考 :doc:`cluster-setup`。 6 | 7 | .. _basic_requirements: 8 | 9 | 前提 10 | ============= 11 | 12 | Here is what services needs to be installed and configured before running Frontera: 13 | 以下是运行 Frontera 之前需要安装和配置的: 14 | 15 | - Python 2.7+ 或 3.4+ 16 | - Scrapy 17 | 18 | 安装 Frontera 19 | --------------------- 20 | Ubuntu 系统, 在命令行中输入: :: 21 | 22 | $ pip install frontera[distributed,zeromq,sql] 23 | 24 | 25 | 得到一个爬虫例子代码 26 | ========================= 27 | 28 | 首先从 Github 上下载 Frontera: 29 | :: 30 | 31 | $ git clone https://github.com/scrapinghub/frontera.git 32 | 33 | 在 ``examples/general-spider`` 中有一个普通的爬虫例子。 34 | 35 | 这是一个很普通的爬虫,它仅仅从下载的内容中抽取链接。它同样包含了一些配置文件,请参考 :doc:`settings reference ` 获取更多的信息。 36 | 37 | .. _running_zeromq_broker: 38 | 39 | 启动集群 40 | ============= 41 | 42 | 首先,让我们启动 ZeroMQ 代理。 :: 43 | 44 | $ python -m frontera.contrib.messagebus.zeromq.broker 45 | 46 | 你应该看到代理打印出 spider 与 DB worker 之间传递信息的统计信息。 47 | 48 | 后续所有的命令都可以在``general-spider``的根目录下执行。 49 | 50 | 第二步,启动 DB worker。:: 51 | 52 | $ python -m frontera.worker.db --config frontier.workersettings 53 | 54 | 你应该注意点 DB worker 正在输出信息。此时没有向 ZeroMQ 发送信息是正常的,因为现在系统中缺乏种子URL。 55 | 56 | 在目录下有一批西班牙的URL,让我们把它们当做种子来启动爬虫。启动爬虫: :: 57 | 58 | $ scrapy crawl general -L INFO -s FRONTERA_SETTINGS=frontier.spider_settings -s SEEDS_SOURCE=seeds_es_smp.txt -s SPIDER_PARTITION_ID=0 59 | $ scrapy crawl general -L INFO -s FRONTERA_SETTINGS=frontier.spider_settings -s SPIDER_PARTITION_ID=1 60 | 61 | 最后应该有两个爬虫进程在运行。每个爬虫应该读取自己的Frontera配置,并且第一个应该使用 ``SEEDS_SOURCE`` 选项来读取种子和启动 Frontera 集群。 62 | 63 | 一段时间以后,种子会被准备好,以供爬虫抓取。此时爬虫已经被启动了。现在你可以周期性的检查 DB worker 的输出或者 ``metadata`` 表信息来确认爬虫确实在运行。 64 | 65 | -------------------------------------------------------------------------------- /frontera/utils/url.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from six.moves.urllib import parse 3 | from w3lib.util import to_native_str 4 | 5 | 6 | def parse_url(url, encoding=None): 7 | """Return urlparsed url from the given argument (which could be an already 8 | parsed url) 9 | """ 10 | return url if isinstance(url, parse.ParseResult) else \ 11 | parse.urlparse(to_native_str(url)) 12 | 13 | 14 | def parse_domain_from_url(url): 15 | """ 16 | Extract domain info from a passed url. 
17 | Examples: 18 | ------------------------------------------------------------------------------------------------------- 19 | URL NETLOC NAME SCHEME SLD TLD SUBDOMAIN 20 | ------------------------------------------------------------------------------------------------------- 21 | http://www.google.com www.google.com google.com http google com www 22 | http://docs.google.com docs.google.com google.com http google com docs 23 | https://google.es/mail google.es google.es https google es 24 | ------------------------------------------------------------------------------------------------------- 25 | """ 26 | import tldextract 27 | extracted = tldextract.extract(url) 28 | scheme, _, _, _, _, _ = parse_url(url) 29 | 30 | sld = extracted.domain 31 | tld = extracted.suffix 32 | subdomain = extracted.subdomain 33 | name = '.'.join([sld, tld]) if tld else sld 34 | netloc = '.'.join([subdomain, name]) if subdomain else name 35 | 36 | return netloc, name, scheme, sld, tld, subdomain 37 | 38 | 39 | def parse_domain_from_url_fast(url): 40 | """ 41 | Extract domain info from a passed url, without analyzing subdomains and tld 42 | """ 43 | result = parse_url(url) 44 | return result.netloc, result.hostname, result.scheme, "", "", "" 45 | -------------------------------------------------------------------------------- /frontera/contrib/canonicalsolvers/basic.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from frontera.core.components import CanonicalSolver 4 | 5 | 6 | class BasicCanonicalSolver(CanonicalSolver): 7 | """ 8 | Implements a simple CanonicalSolver taking always first URL from redirect chain, if there were redirects. 9 | It allows easily to avoid leaking of requests in Frontera (e.g. when request issued by 10 | :attr:`get_next_requests() ` never matched in 11 | :attr:`page_crawled() `) at the price of duplicating 12 | records in Frontera for pages having more than one URL or complex redirects chains. 
13 | """ 14 | def frontier_start(self): 15 | pass 16 | 17 | def frontier_stop(self): 18 | pass 19 | 20 | def add_seeds(self, seeds): 21 | for seed in seeds: 22 | self._set_canonical(seed) 23 | 24 | def page_crawled(self, response): 25 | self._set_canonical(response) 26 | 27 | def links_extracted(self, request, links): 28 | for link in links: 29 | self._set_canonical(link) 30 | 31 | def request_error(self, page, error): 32 | self._set_canonical(page) 33 | 34 | def _set_canonical(self, obj): 35 | if b'redirect_urls' in obj.meta: 36 | redirect_urls = obj.meta[b'redirect_urls'] 37 | redirect_fingerprints = obj.meta[b'redirect_fingerprints'] 38 | redirect_urls.append(obj.url) 39 | redirect_fingerprints.append(obj.meta[b'fingerprint']) 40 | obj._url = redirect_urls[0] 41 | obj.meta[b'fingerprint'] = redirect_fingerprints[0] 42 | 43 | if b'redirect_domains' in obj.meta: 44 | redirect_domains = obj.meta[b'redirect_domains'] 45 | redirect_domains.append(obj.meta[b'domain']) 46 | obj.meta[b'domain'] = redirect_domains[0] 47 | -------------------------------------------------------------------------------- /frontera/contrib/requests/converters.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from requests.models import Request as RequestsRequest 3 | from requests.models import Response as RequestsResponse 4 | 5 | from frontera.core.models import Request as FrontierRequest 6 | from frontera.core.models import Response as FrontierResponse 7 | from frontera.utils.converters import BaseRequestConverter, BaseResponseConverter 8 | 9 | 10 | class RequestConverter(BaseRequestConverter): 11 | """Converts between frontera and Requests request objects""" 12 | def to_frontier(self, request): 13 | """request: Requests > Frontier""" 14 | return FrontierRequest(url=request.url, 15 | method=request.method, 16 | headers=request.headers, 17 | cookies=request.cookies if hasattr(request, 'cookies') else {}) 18 | 19 | def from_frontier(self, request): 20 | """request: Frontier > Scrapy""" 21 | return RequestsRequest(url=request.url, 22 | method=request.method, 23 | headers=request.headers, 24 | cookies=request.cookies) 25 | 26 | 27 | class ResponseConverter(BaseResponseConverter): 28 | """Converts between frontera and Scrapy response objects""" 29 | def __init__(self, request_converter): 30 | self._request_converter = request_converter 31 | 32 | def to_frontier(self, response): 33 | """response: Scrapy > Frontier""" 34 | return FrontierResponse(url=response.url, 35 | status_code=response.status_code, 36 | headers=response.headers, 37 | body=response.text, 38 | request=self._request_converter.to_frontier(response.request)) 39 | 40 | def from_frontier(self, response): 41 | """response: Frontier > Scrapy""" 42 | raise NotImplementedError 43 | -------------------------------------------------------------------------------- /frontera/logger/formatters/color.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import logging 4 | import sys 5 | 6 | from colorlog.escape_codes import escape_codes 7 | from colorlog import ColoredFormatter 8 | 9 | 10 | class ColorFormatter(ColoredFormatter): 11 | 12 | def __init__(self, format, log_colors, log_color_field, datefmt=None, reset=True, style='%'): 13 | super(ColorFormatter, self).__init__(fmt=format, datefmt=datefmt, log_colors=log_colors, 14 | reset=reset, style=style) 15 | self.log_color_field = log_color_field 16 | 17 | 
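    # Note: format() below fills in a default value for the configurable log_color_field,
    # picks the matching ANSI color from log_colors and injects it as record.log_color,
    # then lets the base logging.Formatter build the message and appends a reset escape
    # code if the formatted message does not already end with one.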
def format(self, record): 18 | if not hasattr(record, self.log_color_field): 19 | setattr(record, self.log_color_field, '?') 20 | 21 | record.__dict__.update(escape_codes) 22 | 23 | color_field = self._get_color_field(record) 24 | if color_field and color_field in self.log_colors: 25 | color = self.log_colors[color_field] 26 | record.log_color = escape_codes[color] 27 | else: 28 | record.log_color = "" 29 | 30 | # Format the message 31 | if sys.version_info > (2, 7): 32 | message = super(ColoredFormatter, self).format(record) 33 | else: 34 | message = logging.Formatter.format(self, record) 35 | 36 | # Add a reset code to the end of the message 37 | # (if it wasn't explicitly added in format str) 38 | if self.reset and not message.endswith(escape_codes['reset']): 39 | message += escape_codes['reset'] 40 | 41 | return message 42 | 43 | def _get_color_field(self, record): 44 | if not self.log_color_field: 45 | return None 46 | elif hasattr(record, self.log_color_field): 47 | return getattr(record, self.log_color_field) 48 | elif isinstance(record.msg, dict) and self.log_color_field in record.msg: 49 | return record.msg[self.log_color_field] 50 | else: 51 | return None 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /frontera/contrib/messagebus/zeromq/socket_config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Contains the SocketConfig class 4 | """ 5 | from __future__ import absolute_import 6 | from socket import getaddrinfo, gaierror 7 | 8 | 9 | class SocketConfig(object): 10 | """ 11 | Converts address to IPv4 or IPv6 or * and returns the necessary socket 12 | addresses. 13 | NOTE: When using * it defaults to IPv4 14 | """ 15 | def __init__(self, address, base_port): 16 | if address == '*': 17 | self.ip_addr = '*' 18 | self.base_port = base_port 19 | self.is_ipv6 = False 20 | else: 21 | try: 22 | addr_tuple = getaddrinfo(address, base_port)[0][4] 23 | except gaierror: 24 | raise gaierror("Hostname '%s' could not be resolved" % address) 25 | self.ip_addr = addr_tuple[0] 26 | self.base_port = addr_tuple[1] 27 | self.is_ipv6 = True if len(addr_tuple) == 4 else False 28 | 29 | def spiders_in(self): 30 | """ 31 | TCP socket for incoming spider messages 32 | """ 33 | return 'tcp://%s:%d' % (self.ip_addr, self.base_port) 34 | 35 | def spiders_out(self): 36 | """ 37 | TCP socket for outgoing spider messages 38 | """ 39 | return 'tcp://%s:%d' % (self.ip_addr, self.base_port + 1) 40 | 41 | def sw_in(self): 42 | """ 43 | TCP socket for incoming SW messages 44 | """ 45 | return 'tcp://%s:%d' % (self.ip_addr, self.base_port + 2) 46 | 47 | def sw_out(self): 48 | """ 49 | TCP socket for outgoing SW messages 50 | """ 51 | return 'tcp://%s:%d' % (self.ip_addr, self.base_port + 3) 52 | 53 | def db_in(self): 54 | """ 55 | TCP socket for incoming messages 56 | """ 57 | return 'tcp://%s:%d' % (self.ip_addr, self.base_port + 4) 58 | 59 | def db_out(self): 60 | """ 61 | TCP socket for outgoing DW messages 62 | """ 63 | return 'tcp://%s:%d' % (self.ip_addr, self.base_port + 5) 64 | -------------------------------------------------------------------------------- /docs/source/topics/quick-start-single.rst: -------------------------------------------------------------------------------- 1 | ========================== 2 | 单进程模式快速入门 3 | ========================== 4 | 5 | 1. 
创建您的爬虫 6 | ===================== 7 | 8 | 按照通常的方式创建 Scrapy 项目。输入您要存储代码的目录,然后运行:: 9 | 10 | scrapy startproject tutorial 11 | 12 | 这会创建一个 tutorial 目录,包含以下内容:: 13 | 14 | 15 | tutorial/ 16 | scrapy.cfg 17 | tutorial/ 18 | __init__.py 19 | items.py 20 | pipelines.py 21 | settings.py 22 | spiders/ 23 | __init__.py 24 | ... 25 | 26 | 这些是最基本的: 27 | 28 | - **scrapy.cfg**: 项目的配置文件 29 | - **tutorial/**: 项目的 python 模块,后续您将从这里引用您的代码。 30 | - **tutorial/items.py**: 项目的 items 文件。 31 | - **tutorial/pipelines.py**: 项目的 pipelines 文件。 32 | - **tutorial/settings.py**: 项目的 settings 文件。 33 | - **tutorial/spiders/**: 放爬虫的目录。 34 | 35 | 2. 安装 Frontera 36 | =================== 37 | 38 | 请看 :doc:`installation`. 39 | 40 | 3. 集成您的爬虫和 Frontera 41 | ========================================== 42 | 43 | 这篇文章 :doc:`集成 Scrapy ` 详细介绍了这一步。 44 | 45 | 46 | 4. 选择您的后端 47 | ====================== 48 | 49 | 为 Frontera 设置内置的后端,比如内存中的BFS后端(广度优先):: 50 | 51 | BACKEND = 'frontera.contrib.backends.memory.BFS' 52 | 53 | 5. 运行爬虫 54 | ================= 55 | 56 | 按照通常的方式从命令行启动 Scrapy 爬虫:: 57 | 58 | scrapy crawl myspider 59 | 60 | 就是这样! 您成功将您的爬虫与 Frontera 集成了。 61 | 62 | 还有什么? 63 | ========== 64 | 65 | 您已经看到了一个使用 Frontera 集成 Scrapy 的例子,但是这个仅仅是最基本的功能。Frontera 还提供了许多让 frontier 管理更加简单、有效率的强大功能,比如: 66 | 67 | * 内置 :ref:`database storage ` 支持存储抓取数据。 68 | 69 | * 通过 :doc:`API ` 可以方便的 :doc:`与 Scrapy 集成 ` 或者与其他爬虫集成。 70 | 71 | * 通过使用 ZeroMq 或 Kafka 和分布式的后端,实现 :ref:`两种分布式抓取模式 ` 。 72 | 73 | * 通过 :doc:`自定义您的后端 ` 创建不同抓取策略或者逻辑。 74 | 75 | * 使用 :doc:`middlewares ` 插入您自己的 request/response 修改策略。 76 | 77 | * 使用 :doc:`Graph Manager ` 创建假的网站地图,并可以不用爬虫而可以重现抓取过程。 78 | 79 | * :doc:`记录您的 Scrapy 抓取结果 ` ,后续可以用它测试 frontier。 80 | 81 | * 您可以用 hook 的方式使用日志工具,捕捉错误和调试您的 frontiers。 82 | 83 | 84 | 85 | 86 | -------------------------------------------------------------------------------- /docs/source/topics/overview.rst: -------------------------------------------------------------------------------- 1 | ==================== 2 | Frontera 概览 3 | ==================== 4 | 5 | Frontera 是 crawl frontier 的实现,用于在从网络下载之前累积URL /链接的网络爬虫组件。 Frontera的主要特征: 6 | 7 | * 面向在线处理, 8 | * 分布式爬虫和后端架构, 9 | * 可定制抓取策略, 10 | * Scrapy易于集成, 11 | * 集成 `SQLAlchemy`_ 支持关系型数据库(Mysql, PostgreSQL, sqlite 等等), 集成 `HBase`_ 非常好得支持键值对数据库, 12 | * 使用 `ZeroMQ`_ and `Kafka`_ 为分布式爬虫实现消息总线, 13 | * 使用 :doc:`Graph Manager ` 创建伪站点地图和模拟抓取,进行精确抓取逻辑调优。 14 | 15 | * 透明的传输层概念(:term:`message bus`)和通信协议, 16 | * 纯 Python 实现 。 17 | * 支持 Python 3 。 18 | 19 | 20 | 使用案例 21 | --------- 22 | 23 | 下面是一些 crawl frontier 适用的案例: 24 | 25 | * 与爬虫的 URL 排序/排队隔离(例如,需要远端服务器管理排序/排队的分布式爬虫集群), 26 | * 需要存储 URL 的元信息(在一些地方验证它的内容), 27 | * 需要高级的 URL 排序逻辑,但在爬虫或者抓取器中很难维护。 28 | 29 | 一次抓取,少量网站 30 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 31 | 32 | 这种情况下使用单进程可能是最好的选择。 Frontier 提供以下现成的优先级模型: 33 | 34 | * FIFO, 35 | * LIFO, 36 | * 广度优先 (BFS), 37 | * 深度优先 (DFS), 38 | * 基于提供的得分,从 0.0 映射到 1.0。 39 | 40 | 如果网站很大,抓取所有网页太浪费, Frontera 可以控制爬虫抓取最重要的网页。 41 | 42 | 43 | 分布式抓取, 少量网站 44 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 45 | 46 | 如果考虑提高抓取速度可以使用分布式爬虫模式。在这种模式下,Frontera 为爬虫进程分发任务,并且只有一个后端实例。请求任务通过你选择的 :term:`message bus` 进行分发,通过自定义分区调整任务分发策略。默认情况下请求任务是随机分发给爬虫的,抓取速度可以在爬虫中设置。 47 | 48 | 也考虑一下代理服务,比如 `Crawlera`_。 49 | 50 | 51 | 重新抓取 52 | ^^^^^^^^^^ 53 | 54 | 有一组网站,并且需要以及时(或其他)方式重新抓取它们。Frontera 提供了简单的重新抓取后端,根据设置的时间间隔定期抓取已经抓取的网页。这个后端使用关系系数据库持久化数据,并可以应用在单进程模式或者分布式爬虫模式中。 55 | 56 | 看门狗案例 - 当需要通知文档变化时,也可以使用这样的后端和少量的自定义。 57 | 58 | 广度抓取 59 | ^^^^^^^^^^^^^^ 60 | 61 | 这种使用案例要求完全的分布式:爬虫和后端都是分布式。除了运行 spiders,还应该运行 :term:`strategy worker` (s) 和 :term:`db worker` (s),这取决于选择的分区策略。 62 | 63 | 
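
下面是一个最小的启动顺序示意(命令摘自上文 :doc:`quick-start-distributed` 中的 general-spider 示例,配置模块名和 spider 名称均来自该示例项目;完全分布式模式下还需要按 :doc:`cluster-setup` 额外启动 :term:`strategy worker`)::

    # 1. 启动 ZeroMQ 代理(消息总线)
    $ python -m frontera.contrib.messagebus.zeromq.broker

    # 2. 启动 DB worker
    $ python -m frontera.worker.db --config frontier.workersettings

    # 3. 启动爬虫进程,每个分区一个
    $ scrapy crawl general -L INFO -s FRONTERA_SETTINGS=frontier.spider_settings -s SPIDER_PARTITION_ID=0
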
Frontera可用于与大规模网络抓取相关的一系列广泛任务: 64 | 65 | * 广泛的网页抓取,任意数量的网站和页面(我们在45M文档卷和100K网站上做过测试), 66 | * 以主机为中心的抓取:当您有超过100个网站时, 67 | * 聚焦抓取: 68 | 69 | * 主题:您搜索关于某个预定义主题的页面, 70 | * PageRank,HITS或其他链接图算法指导。 71 | 72 | 下面是一些真实世界的问题: 73 | 74 | * 抓取网络中的内容检索构建搜索引擎。 75 | * 网络图的各种研究工作:收集链接,统计,图结构,跟踪域名计数等。 76 | * 更普遍的集中抓取任务:比如,您搜索的是大中心的网页,并且频繁更改时间。 77 | 78 | .. _`Frontera`: http://github.com/scrapinghub/frontera 79 | .. _`Crawlera`: http://crawlera.com/ 80 | .. _`Kafka`: http://kafka.apache.org/ 81 | .. _`ZeroMQ`: http://zeromq.org/ 82 | .. _`HBase`: http://hbase.apache.org/ 83 | .. _`Scrapy`: http://scrapy.org/ 84 | .. _`SQLAlchemy`: http://www.sqlalchemy.org/ 85 | -------------------------------------------------------------------------------- /docs/source/topics/requests-integration.rst: -------------------------------------------------------------------------------- 1 | ================================ 2 | Frontier + Requests 3 | ================================ 4 | 5 | 为了结合 frontier 和 `Requests`_ ,提供了 ``RequestsFrontierManager`` 类。 6 | 7 | 这个类是一个简单的 :class:`FrontierManager ` 封装,它使用 `Requests`_ 对象 (``Request``/``Response``),将他们和 frontier 相互转换。 8 | 9 | 10 | 和 :class:`FrontierManager ` 一样使用,使用你的 settings 初始化它。 ``get_next_requests`` 将返回 `Requests`_ ``Request`` 对象。 11 | 12 | 一个例子:: 13 | 14 | import re 15 | 16 | import requests 17 | 18 | from urlparse import urljoin 19 | 20 | from frontera.contrib.requests.manager import RequestsFrontierManager 21 | from frontera import Settings 22 | 23 | SETTINGS = Settings() 24 | SETTINGS.BACKEND = 'frontera.contrib.backends.memory.FIFO' 25 | SETTINGS.LOGGING_MANAGER_ENABLED = True 26 | SETTINGS.LOGGING_BACKEND_ENABLED = True 27 | SETTINGS.MAX_REQUESTS = 100 28 | SETTINGS.MAX_NEXT_REQUESTS = 10 29 | 30 | SEEDS = [ 31 | 'http://www.imdb.com', 32 | ] 33 | 34 | LINK_RE = re.compile(r'href="(.*?)"') 35 | 36 | 37 | def extract_page_links(response): 38 | return [urljoin(response.url, link) for link in LINK_RE.findall(response.text)] 39 | 40 | if __name__ == '__main__': 41 | 42 | frontier = RequestsFrontierManager(SETTINGS) 43 | frontier.add_seeds([requests.Request(url=url) for url in SEEDS]) 44 | while True: 45 | next_requests = frontier.get_next_requests() 46 | if not next_requests: 47 | break 48 | for request in next_requests: 49 | try: 50 | response = requests.get(request.url) 51 | links = [requests.Request(url=url) for url in extract_page_links(response)] 52 | frontier.page_crawled(response=response) 53 | frontier.links_extracted(request=request, links=links) 54 | except requests.RequestException, e: 55 | error_code = type(e).__name__ 56 | frontier.request_error(request, error_code) 57 | 58 | 59 | .. _Requests: http://docs.python-requests.org/en/latest/ 60 | -------------------------------------------------------------------------------- /docs/source/topics/frontier-objects.rst: -------------------------------------------------------------------------------- 1 | ================ 2 | Frontier 对象 3 | ================ 4 | 5 | Frontier 使用两种对象类型: :class:`Request ` 6 | and :class:`Response `. 他们各自代表 HTTP 请求和 HTTP 返回. 7 | 8 | 这两个类会被大多数的 Frontera API 方法调用,根据方法不同可能作为参数也可能作为返回值。 9 | 10 | Frontera 同样也会使用这两种对象在内部组件之间传递数据(比如 middlewares 和 backend)。 11 | 12 | Request 对象 13 | =============== 14 | 15 | .. autoclass:: frontera.core.models.Request 16 | :members: 17 | 18 | 19 | Response 对象 20 | ================ 21 | 22 | .. 
autoclass:: frontera.core.models.Response 23 | :members: 24 | 25 | ``domain`` 和 ``fingerprint`` 字段被 :ref:`内置 middlewares ` 添加。 26 | 27 | 对象唯一识别标志 28 | ========================== 29 | 30 | 因为 Frontera 对象会在爬虫和服务器之间传递,所以需要一些机制来唯一标示一个对象。这个识别机制会基于 Frontera 逻辑不同而有所变化(大多数情况是根据后端的逻辑)。 31 | 32 | 默认 Frontera 会激活 :ref:`fingerprint middleware ` ,根据 :attr:`Request.url ` 33 | 和 :attr:`Response.url ` 分别生成一个唯一标示,并分别赋值给 :attr:`Request.meta ` and 34 | :attr:`Response.meta `。你可以使用这个中间件或者自己定义。 35 | 36 | 一个为 :class:`Request ` 生成指纹的例子:: 37 | 38 | >>> request.url 39 | 'http://thehackernews.com' 40 | 41 | >>> request.meta['fingerprint'] 42 | '198d99a8b2284701d6c147174cd69a37a7dea90f' 43 | 44 | 45 | .. _frontier-objects-additional-data: 46 | 47 | 48 | 为对象添加其他值 49 | ================================= 50 | 51 | 大多数情况下 Frontera 存储了系统运行所需要的参数。 52 | 53 | 同样的,其他信息也可以存入 :attr:`Request.meta ` 和 :attr:`Response.meta ` 54 | 55 | 例如,激活 :ref:`domain middleware ` 会为每个 :attr:`Request.meta ` 和 56 | :attr:`Response.meta ` 添加 ``domain`` 字段:: 57 | 58 | >>> request.url 59 | 'http://www.scrapinghub.com' 60 | 61 | >>> request.meta['domain'] 62 | { 63 | "name": "scrapinghub.com", 64 | "netloc": "www.scrapinghub.com", 65 | "scheme": "http", 66 | "sld": "scrapinghub", 67 | "subdomain": "www", 68 | "tld": "com" 69 | } 70 | -------------------------------------------------------------------------------- /frontera/utils/async.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from twisted.internet import reactor, error 4 | from twisted.internet.defer import Deferred 5 | from six.moves import range 6 | 7 | 8 | class CallLaterOnce(object): 9 | """Schedule a function to be called in the next reactor loop, but only if 10 | it hasn't been already scheduled since the last time it run. 
11 | """ 12 | def __init__(self, func, reactor=reactor, *a, **kw): 13 | self._func = func 14 | self._reactor = reactor 15 | self._a = a 16 | self._kw = kw 17 | self._call = None 18 | self._errfunc = None 19 | self._erra = None 20 | self._errkw = None 21 | 22 | def setErrback(self, func, *a, **kw): 23 | self._errfunc = func 24 | self._erra = a 25 | self._errkw = kw 26 | 27 | def schedule(self, delay=0.0): 28 | if self._call is None: 29 | d = Deferred() 30 | d.addCallback(self) 31 | if self._errfunc: 32 | d.addErrback(self.error) 33 | self._call = self._reactor.callLater(delay, d.callback, None) 34 | 35 | def cancel(self): 36 | if self._call: 37 | self._call.cancel() 38 | 39 | def __call__(self, *args, **kwargs): 40 | self._call = None 41 | return self._func(*self._a, **self._kw) 42 | 43 | def error(self, f): 44 | self._call = None 45 | if self._errfunc: 46 | return self._errfunc(f, *self._erra, **self._errkw) 47 | return f 48 | 49 | 50 | def listen_tcp(portrange, host, factory, reactor=reactor): 51 | """Like reactor.listenTCP but tries different ports in a range.""" 52 | if isinstance(portrange, int): 53 | return reactor.listenTCP(portrange, factory, interface=host) 54 | assert len(portrange) <= 2, "invalid portrange: %s" % portrange 55 | if not portrange: 56 | return reactor.listenTCP(0, factory, interface=host) 57 | if len(portrange) == 1: 58 | return reactor.listenTCP(portrange[0], factory, interface=host) 59 | for x in range(portrange[0], portrange[1] + 1): 60 | try: 61 | return reactor.listenTCP(x, factory, interface=host) 62 | except error.CannotListenError: 63 | if x == portrange[1]: 64 | raise 65 | -------------------------------------------------------------------------------- /frontera/utils/heap.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import print_function 3 | import heapq 4 | import math 5 | from io import StringIO 6 | 7 | 8 | def show_tree(tree, total_width=80, fill=' '): 9 | """Pretty-print a tree.""" 10 | print('-' * total_width) 11 | output = StringIO() 12 | last_row = -1 13 | for i, n in enumerate(tree): 14 | if i: 15 | row = int(math.floor(math.log(i+1, 2))) 16 | else: 17 | row = 0 18 | if row != last_row: 19 | output.write('\n') 20 | columns = 2**row 21 | col_width = int(math.floor((total_width * 1.0) / columns)) 22 | output.write(str(n).center(col_width, fill)) 23 | last_row = row 24 | print(output.getvalue()) 25 | print('-' * total_width) 26 | print() 27 | return 28 | 29 | 30 | class HeapObjectWrapper(object): 31 | def __init__(self, obj, compare_function): 32 | self.obj = obj 33 | self.compare_function = compare_function 34 | 35 | def __cmp__(self, other): 36 | return self.compare_function(self.obj, other.obj) 37 | 38 | def __lt__(self, other): 39 | if self.compare_function(self.obj, other.obj) == -1: 40 | return True 41 | else: 42 | return False 43 | 44 | def __eq__(self, other): 45 | if self.compare_function(self.obj, other.obj) == 0: 46 | return True 47 | else: 48 | return False 49 | 50 | def __repr__(self): 51 | return repr(self.obj) 52 | 53 | def __str__(self): 54 | return str(self.obj) 55 | 56 | 57 | class Heap(object): 58 | def __init__(self, compare_function): 59 | self.heap = [] 60 | self._compare_function = compare_function 61 | 62 | def push(self, obj): 63 | heapq.heappush(self.heap, HeapObjectWrapper(obj, self._compare_function)) 64 | 65 | def pop(self, n): 66 | pages = [] 67 | page = self._extract_object() 68 | while page: 69 | pages.append(page) 70 
| if n and len(pages) >= n: 71 | break 72 | else: 73 | page = self._extract_object() 74 | return pages 75 | 76 | def _extract_object(self): 77 | try: 78 | wrapper = heapq.heappop(self.heap) 79 | return wrapper.obj 80 | except IndexError: 81 | return None 82 | -------------------------------------------------------------------------------- /frontera/utils/misc.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from importlib import import_module 3 | from zlib import crc32 4 | from six.moves import range 5 | from w3lib.util import to_bytes 6 | import six 7 | 8 | 9 | def load_object(path): 10 | """Load an object given its absolute object path, and return it. 11 | 12 | object can be a class, function, variable o instance. 13 | path ie: 'myproject.frontier.models.Page' 14 | """ 15 | 16 | try: 17 | dot = path.rindex('.') 18 | except ValueError: 19 | raise ValueError("Error loading object '%s': not a full path" % path) 20 | 21 | module, name = path[:dot], path[dot+1:] 22 | try: 23 | mod = import_module(module) 24 | except ImportError as e: 25 | raise ImportError("Error loading object '%s': %s" % (path, e)) 26 | 27 | try: 28 | obj = getattr(mod, name) 29 | except AttributeError: 30 | raise NameError("Module '%s' doesn't define any object named '%s'" % (module, name)) 31 | 32 | return obj 33 | 34 | 35 | def get_crc32(name): 36 | """ signed crc32 of bytes or unicode. 37 | In python 3, return the same number as in python 2, converting to 38 | [-2**31, 2**31-1] range. This is done to maintain backwards compatibility 39 | with python 2, since checksums are stored in the database, so this allows 40 | to keep the same database schema. 41 | """ 42 | return to_signed32(crc32(to_bytes(name, 'utf-8', 'ignore'))) 43 | 44 | 45 | def to_signed32(x): 46 | """ If x is an usigned 32-bit int, convert it to a signed 32-bit. 
47 | """ 48 | return x - 0x100000000 if x > 0x7fffffff else x 49 | 50 | 51 | def chunks(l, n): 52 | for i in range(0, len(l), n): 53 | yield l[i:i+n] 54 | 55 | 56 | def dict_to_bytes(obj): 57 | if isinstance(obj, dict): 58 | return {dict_to_bytes(k): dict_to_bytes(v) for k, v in six.iteritems(obj)} 59 | if isinstance(obj, six.text_type): 60 | return obj.encode('utf8') 61 | if isinstance(obj, list): 62 | return map(dict_to_bytes, obj) 63 | else: 64 | return obj 65 | 66 | 67 | def dict_to_unicode(obj): 68 | if isinstance(obj, dict): 69 | return {dict_to_unicode(k): dict_to_unicode(v) for k, v in six.iteritems(obj)} 70 | if isinstance(obj, six.binary_type): 71 | return obj.decode('utf8') 72 | if isinstance(obj, list): 73 | return map(dict_to_unicode, obj) 74 | else: 75 | return obj -------------------------------------------------------------------------------- /frontera/utils/managers.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from frontera.core.manager import FrontierManager 3 | from .converters import BaseRequestConverter, BaseResponseConverter 4 | 5 | 6 | class FrontierManagerWrapper(object): 7 | def __init__(self, settings, manager=None): 8 | manager = manager or FrontierManager 9 | self.manager = manager.from_settings(settings) 10 | self.request_converter = None 11 | self.response_converter = None 12 | 13 | def start(self): 14 | if not hasattr(self, 'request_converter'): 15 | raise NotImplementedError("Request converter should be instantiated in subclass") 16 | if not hasattr(self, 'response_converter'): 17 | raise NotImplementedError("Response converter should be instantiated in subclass") 18 | assert isinstance(self.request_converter, BaseRequestConverter), 'request_converter ' \ 19 | 'must be instance of BaseRequestConverter' 20 | assert isinstance(self.response_converter, BaseResponseConverter), 'response_converter ' \ 21 | 'must be instance of BaseResponseConverter' 22 | self.manager.start() 23 | 24 | def stop(self): 25 | self.manager.stop() 26 | 27 | def add_seeds(self, seeds): 28 | frontier_seeds = [self.request_converter.to_frontier(seed) for seed in seeds] 29 | self.manager.add_seeds(seeds=frontier_seeds) 30 | 31 | def get_next_requests(self, max_next_requests=0, **kwargs): 32 | frontier_requests = self.manager.get_next_requests(max_next_requests=max_next_requests, **kwargs) 33 | return [self.request_converter.from_frontier(frontier_request) for frontier_request in frontier_requests] 34 | 35 | def page_crawled(self, response): 36 | self.manager.page_crawled(self.response_converter.to_frontier(response)) 37 | 38 | def links_extracted(self, request, links): 39 | frontier_links = [self.request_converter.to_frontier(link) for link in links] 40 | self.manager.links_extracted(request=self.request_converter.to_frontier(request), 41 | links=frontier_links) 42 | 43 | def request_error(self, request, error): 44 | self.manager.request_error(request=self.request_converter.to_frontier(request), 45 | error=error) 46 | 47 | def finished(self): 48 | return self.manager.finished 49 | -------------------------------------------------------------------------------- /frontera/logger/filters/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | import logging 3 | import six 4 | from w3lib.util import to_native_str 5 | 6 | 7 | class PlainValuesFilter(logging.Filter): 8 | def __init__(self, separator=None, excluded_fields=None, 
msg_max_length=0): 9 | super(PlainValuesFilter, self).__init__() 10 | self.separator = to_native_str(separator or " ") 11 | self.excluded_fields = excluded_fields or [] 12 | self.msg_max_length = msg_max_length 13 | 14 | def filter(self, record): 15 | if isinstance(record.msg, dict): 16 | for field_name in self.excluded_fields: 17 | setattr(record, field_name, record.msg.get(field_name, '')) 18 | record.msg = self.separator.join([to_native_str(value) 19 | for key, value in six.iteritems(record.msg) 20 | if key not in self.excluded_fields]) 21 | if self.msg_max_length and len(record.msg) > self.msg_max_length: 22 | record.msg = record.msg[0:self.msg_max_length-3] + "..." 23 | 24 | return True 25 | 26 | 27 | class FilterFields(logging.Filter): 28 | def __init__(self, field_name): 29 | super(FilterFields, self).__init__() 30 | self.field_name = field_name 31 | 32 | def _get_field(self, record): 33 | if not self.field_name: 34 | return None 35 | elif hasattr(record, self.field_name): 36 | return getattr(record, self.field_name) 37 | elif isinstance(record.msg, dict) and self.field_name in record.msg: 38 | return record.msg[self.field_name] 39 | else: 40 | return None 41 | 42 | 43 | class IncludeFields(FilterFields): 44 | def __init__(self, field_name, included_values): 45 | super(IncludeFields, self).__init__(field_name) 46 | self.included_values = included_values 47 | 48 | def filter(self, record): 49 | field = self._get_field(record) 50 | if field: 51 | return field in self.included_values 52 | return True 53 | 54 | 55 | class ExcludeFields(FilterFields): 56 | def __init__(self, field_name, excluded_fields): 57 | super(ExcludeFields, self).__init__(field_name) 58 | self.excluded_fields = excluded_fields 59 | 60 | def filter(self, record): 61 | field = self._get_field(record) 62 | if field: 63 | return field not in self.excluded_fields 64 | return True 65 | 66 | 67 | PLAINVALUES = PlainValuesFilter 68 | INCLUDEFIELDS = IncludeFields 69 | EXCLUDEFIELDS = ExcludeFields 70 | -------------------------------------------------------------------------------- /docs/source/topics/message_bus.rst: -------------------------------------------------------------------------------- 1 | =========== 2 | 消息总线 3 | =========== 4 | 5 | 消息总线是传输层抽象机制。Frontera 提供了接口和几个实现。同一时间只能使用一种类型的消息总线,并通过 :setting:`MESSAGE_BUS` 设置。 6 | 7 | 爬虫进程可以使用 8 | 9 | .. autoclass:: frontera.contrib.backends.remote.messagebus.MessageBusBackend 10 | 11 | 和消息总线进行通信。 12 | 13 | 14 | 内置消息总线参考 15 | ============================== 16 | 17 | ZeroMQ 18 | ------ 19 | 这是默认选项,使用轻量级的 `ZeroMQ`_ 库实现 20 | 21 | .. autoclass:: frontera.contrib.messagebus.zeromq.MessageBus 22 | 23 | 可以使用 :ref:`zeromq-settings` 配置。 24 | 25 | ZeroMQ 需要按照 ZeroMQ 库,并且启动broker进程,请参考 :ref:`running_zeromq_broker` 。 26 | 27 | 总的来说,使用 ZeroMQ 消息总线是为了用最少的部署实现 PoC (Patch Output Converter 成批输出转换程序)。因为它很容易 28 | 在组件的数据流未正确调整或启动过程中发生消息丢失,所以请参照下面的顺序启动组件: 29 | 30 | #. :term:`db worker` 31 | #. :term:`strategy worker` 32 | #. :term:`spiders` 33 | 34 | 不幸的是,停止执行未完成抓取的爬虫时,无法避免消息丢失。如果你的爬虫程序对少量的信息丢失敏感的话,我建议你使用 Kafka。 35 | 36 | .. pull-quote:: 37 | 警告!ZeroMQ消息总线不支持多个 SW worker 和 DB worker, 每种 woker 只能有一个实例。 38 | 39 | Kafka 40 | ----- 41 | 使用这个类 42 | 43 | .. autoclass:: frontera.contrib.messagebus.kafkabus.MessageBus 44 | 45 | 使用 :ref:`kafka-settings` 配置。 46 | 47 | 需要运行 `Kafka`_ 服务,这个服务更适合大规模采集。 48 | 49 | .. _Kafka: http://kafka.apache.org/ 50 | .. _ZeroMQ: http://zeromq.org/ 51 | 52 | 53 | .. 
_message_bus_protocol: 54 | 55 | 协议 56 | ======== 57 | 58 | 根据数据流,Frontera 使用几种消息类型来编码它的消息。每种消息是用 `msgpack`_ 或 JSON 序列化的 python 对象。可以使用 :setting:`MESSAGE_BUS_CODEC` 选择编解码器模块,并且需要导出编码器和解码器类。 59 | 60 | 以下是子类实现自己的编解码器所需的类: 61 | 62 | .. autoclass:: frontera.core.codec.BaseEncoder 63 | 64 | .. automethod:: frontera.core.codec.BaseEncoder.encode_add_seeds 65 | .. automethod:: frontera.core.codec.BaseEncoder.encode_page_crawled 66 | .. automethod:: frontera.core.codec.BaseEncoder.encode_request_error 67 | .. automethod:: frontera.core.codec.BaseEncoder.encode_request 68 | .. automethod:: frontera.core.codec.BaseEncoder.encode_update_score 69 | .. automethod:: frontera.core.codec.BaseEncoder.encode_new_job_id 70 | .. automethod:: frontera.core.codec.BaseEncoder.encode_offset 71 | 72 | .. autoclass:: frontera.core.codec.BaseDecoder 73 | 74 | .. automethod:: frontera.core.codec.BaseDecoder.decode 75 | .. automethod:: frontera.core.codec.BaseDecoder.decode_request 76 | 77 | 78 | 可用的编解码器 79 | ================ 80 | 81 | MsgPack 82 | ------- 83 | .. automodule:: frontera.contrib.backends.remote.codecs.msgpack 84 | 85 | Module: frontera.contrib.backends.remote.codecs.msgpack 86 | 87 | JSON 88 | ---- 89 | .. automodule:: frontera.contrib.backends.remote.codecs.json 90 | 91 | Module: frontera.contrib.backends.remote.codecs.json 92 | 93 | 94 | .. _msgpack: http://msgpack.org/index.html -------------------------------------------------------------------------------- /docs/source/topics/frontier-tester.rst: -------------------------------------------------------------------------------- 1 | ================== 2 | 测试一个 Frontier 3 | ================== 4 | 5 | Frontier Tester是一个帮助类,用于方便 frontier 测试。 6 | 7 | 基本上它基于 Frontier 运行一个模拟的爬取过程,爬虫信息是使用Graph Manager实例伪造的。 8 | 9 | 创建一个 Frontier Tester 10 | ========================== 11 | 12 | FrontierTester 需要一个 :doc:`Graph Manager ` 和一个 13 | :class:`FrontierManager ` 实例:: 14 | 15 | >>> from frontera import FrontierManager, FrontierTester 16 | >>> from frontera.utils import graphs 17 | >>> graph = graphs.Manager('sqlite:///graph.db') # Crawl fake data loading 18 | >>> frontier = FrontierManager.from_settings() # Create frontier from default settings 19 | >>> tester = FrontierTester(frontier, graph) 20 | 21 | 运行一个 Test 22 | ============== 23 | 24 | tester 已经被实例化,现在只需调用 `run` 函数运行:: 25 | 26 | >>> tester.run() 27 | 28 | 当 run 方法被调用 tester 将: 29 | 30 | 1. 从图中添加所有的种子。 31 | 2. 向 frontier 询问新的任务。 32 | 3. 
模拟页面响应,并通知 frontier 关于页面抓取及其链接。 33 | 34 | 重复步骤1和2,直到抓取或 frontier 结束。 35 | 36 | 测试完成后,抓取页面 ``sequence`` 可作为 frontier :class:`Request ` 对象列表使用。 37 | 38 | 测试参数 39 | =============== 40 | 41 | 在某些测试用例中,您可能需要将所有页面添加为种子,这可以通过参数 ``add_all_pages`` 来完成:: 42 | 43 | >>> tester.run(add_all_pages=True) 44 | 45 | 每个 :attr:`get_next_requests ` 调用的最大返回页数可以使用 frontier settings进行设置,也可创建 FrontierTester 时使用 ``max_next_pages`` 参数进行修改:: 46 | 47 | >>> tester = FrontierTester(frontier, graph, max_next_pages=10) 48 | 49 | 50 | 使用的例子 51 | ================= 52 | 53 | 一个使用 graph 测试数据和 :ref:`basic backends ` 的例子 :: 54 | 55 | from frontera import FrontierManager, Settings, FrontierTester, graphs 56 | 57 | 58 | def test_backend(backend): 59 | # Graph 60 | graph = graphs.Manager() 61 | graph.add_site_list(graphs.data.SITE_LIST_02) 62 | 63 | # Frontier 64 | settings = Settings() 65 | settings.BACKEND = backend 66 | settings.TEST_MODE = True 67 | frontier = FrontierManager.from_settings(settings) 68 | 69 | # Tester 70 | tester = FrontierTester(frontier, graph) 71 | tester.run(add_all_pages=True) 72 | 73 | # Show crawling sequence 74 | print '-'*40 75 | print frontier.backend.name 76 | print '-'*40 77 | for page in tester.sequence: 78 | print page.url 79 | 80 | if __name__ == '__main__': 81 | test_backend('frontera.contrib.backends.memory.heapq.FIFO') 82 | test_backend('frontera.contrib.backends.memory.heapq.LIFO') 83 | test_backend('frontera.contrib.backends.memory.heapq.BFS') 84 | test_backend('frontera.contrib.backends.memory.heapq.DFS') 85 | -------------------------------------------------------------------------------- /docs/source/topics/scrapy-recorder.rst: -------------------------------------------------------------------------------- 1 | ======================== 2 | 记录 Scrapy 抓取过程 3 | ======================== 4 | 5 | Scrapy Recorder 是一套 `Scrapy middlewares`_ ,可以让您记录scrapy抓取过程并将其存储到 :doc:`Graph Manager ` 中。 6 | 7 | 这可以用于执行 frontier 测试,而不必再次爬取整个站点,甚至不用使用 Scrapy。 8 | 9 | 10 | 激活 recorder 11 | ======================= 12 | 13 | recorder 使用了两个中间件: ``CrawlRecorderSpiderMiddleware`` 和 ``CrawlRecorderDownloaderMiddleware`` 。 14 | 15 | 要在 Scrapy 项目中激活 recorder,只需要将他们加入 `SPIDER_MIDDLEWARES`_ 和 `DOWNLOADER_MIDDLEWARES`_ 配置中:: 16 | 17 | SPIDER_MIDDLEWARES.update({ 18 | 'frontera.contrib.scrapy.middlewares.recording.CrawlRecorderSpiderMiddleware': 1000, 19 | }) 20 | 21 | DOWNLOADER_MIDDLEWARES.update({ 22 | 'frontera.contrib.scrapy.middlewares.recording.CrawlRecorderDownloaderMiddleware': 1000, 23 | }) 24 | 25 | 26 | 选择你的存储引擎 27 | ============================ 28 | 29 | 因为 recorder 内部使用 :doc:`Graph Manager ` 存储抓取的网页,所以你可以选择存储引擎,参照 :ref:`different storage engines ` 。 30 | 31 | 我们使用 :setting:`RECORDER_STORAGE_ENGINE ` 配置存储引擎:: 32 | 33 | RECORDER_STORAGE_ENGINE = 'sqlite:///my_record.db' 34 | 35 | 您还可以选择重置数据库表或仅重置数据:: 36 | 37 | RECORDER_STORAGE_DROP_ALL_TABLES = True 38 | RECORDER_STORAGE_CLEAR_CONTENT = True 39 | 40 | 运行爬虫 41 | ================= 42 | 43 | 和之前一样从命令行运行爬虫:: 44 | 45 | scrapy crawl myspider 46 | 47 | 一旦完成,抓取过程会被记录。 48 | 49 | 如果你需要取消记录,可以设置 :setting:`RECORDER_ENABLED ` :: 50 | 51 | scrapy crawl myspider -s RECORDER_ENABLED=False 52 | 53 | Recorder 设置 54 | ================= 55 | 56 | 以下是所有可用Scrapy Recorder设置的列表,按字母顺序排列,以及默认值及其应用范围。 57 | 58 | .. setting:: RECORDER_ENABLED 59 | 60 | RECORDER_ENABLED 61 | ---------------- 62 | 63 | 默认: ``True`` 64 | 65 | 激活或停用中间件。 66 | 67 | .. 
setting:: RECORDER_STORAGE_CLEAR_CONTENT 68 | 69 | RECORDER_STORAGE_CLEAR_CONTENT 70 | ------------------------------ 71 | 72 | 默认: ``True`` 73 | 74 | 删除 :ref:`storage database ` 中的数据。 75 | 76 | .. setting:: RECORDER_STORAGE_DROP_ALL_TABLES 77 | 78 | RECORDER_STORAGE_DROP_ALL_TABLES 79 | -------------------------------- 80 | 81 | 默认: ``True`` 82 | 83 | 删除 :ref:`storage database ` 中的表。 84 | 85 | .. setting:: RECORDER_STORAGE_ENGINE 86 | 87 | RECORDER_STORAGE_ENGINE 88 | ----------------------- 89 | 90 | 默认: ``None`` 91 | 92 | 设置 :ref:`Graph Manager storage engine ` 来存储记录。 93 | 94 | .. _Scrapy middlewares: http://doc.scrapy.org/en/latest/topics/downloader-middleware.html 95 | .. _DOWNLOADER_MIDDLEWARES: http://doc.scrapy.org/en/latest/topics/settings.html#std:setting-DOWNLOADER_MIDDLEWARES 96 | .. _SPIDER_MIDDLEWARES: http://doc.scrapy.org/en/latest/topics/settings.html#std:setting-SPIDER_MIDDLEWARES 97 | -------------------------------------------------------------------------------- /frontera/settings/default_settings.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from datetime import timedelta 3 | 4 | 5 | AUTO_START = True 6 | BACKEND = 'frontera.contrib.backends.memory.FIFO' 7 | BC_MIN_REQUESTS = 64 8 | BC_MIN_HOSTS = 24 9 | BC_MAX_REQUESTS_PER_HOST = 128 10 | CANONICAL_SOLVER = 'frontera.contrib.canonicalsolvers.Basic' 11 | DELAY_ON_EMPTY = 5.0 12 | DOMAIN_FINGERPRINT_FUNCTION = 'frontera.utils.fingerprint.sha1' 13 | 14 | HBASE_THRIFT_HOST = 'localhost' 15 | HBASE_THRIFT_PORT = 9090 16 | HBASE_NAMESPACE = 'crawler' 17 | HBASE_DROP_ALL_TABLES = False 18 | HBASE_METADATA_TABLE = 'metadata' 19 | HBASE_USE_SNAPPY = False 20 | HBASE_USE_FRAMED_COMPACT = False 21 | HBASE_BATCH_SIZE = 9216 22 | HBASE_STATE_CACHE_SIZE_LIMIT = 3000000 23 | HBASE_QUEUE_TABLE = 'queue' 24 | KAFKA_GET_TIMEOUT = 5.0 25 | MAX_NEXT_REQUESTS = 64 26 | MAX_REQUESTS = 0 27 | MESSAGE_BUS = 'frontera.contrib.messagebus.zeromq.MessageBus' 28 | MESSAGE_BUS_CODEC = 'frontera.contrib.backends.remote.codecs.msgpack' 29 | MIDDLEWARES = [ 30 | 'frontera.contrib.middlewares.fingerprint.UrlFingerprintMiddleware', 31 | ] 32 | NEW_BATCH_DELAY = 30.0 33 | OVERUSED_SLOT_FACTOR = 5.0 34 | QUEUE_HOSTNAME_PARTITIONING = False 35 | REQUEST_MODEL = 'frontera.core.models.Request' 36 | RESPONSE_MODEL = 'frontera.core.models.Response' 37 | 38 | SCORING_PARTITION_ID = 0 39 | SCORING_LOG_CONSUMER_BATCH_SIZE = 512 40 | SPIDER_LOG_CONSUMER_BATCH_SIZE = 512 41 | SPIDER_LOG_PARTITIONS = 1 42 | SPIDER_FEED_PARTITIONS = 1 43 | SPIDER_PARTITION_ID = 0 44 | SQLALCHEMYBACKEND_CACHE_SIZE = 10000 45 | SQLALCHEMYBACKEND_CLEAR_CONTENT = True 46 | SQLALCHEMYBACKEND_DROP_ALL_TABLES = True 47 | SQLALCHEMYBACKEND_ENGINE = 'sqlite:///:memory:' 48 | SQLALCHEMYBACKEND_ENGINE_ECHO = False 49 | SQLALCHEMYBACKEND_MODELS = { 50 | 'MetadataModel': 'frontera.contrib.backends.sqlalchemy.models.MetadataModel', 51 | 'StateModel': 'frontera.contrib.backends.sqlalchemy.models.StateModel', 52 | 'QueueModel': 'frontera.contrib.backends.sqlalchemy.models.QueueModel' 53 | } 54 | SQLALCHEMYBACKEND_REVISIT_INTERVAL = timedelta(days=1) 55 | STATE_CACHE_SIZE = 1000000 56 | STATE_CACHE_SIZE_LIMIT = 0 57 | STORE_CONTENT = False 58 | TEST_MODE = False 59 | TLDEXTRACT_DOMAIN_INFO = False 60 | URL_FINGERPRINT_FUNCTION = 'frontera.utils.fingerprint.sha1' 61 | 62 | ZMQ_ADDRESS = '127.0.0.1' 63 | ZMQ_BASE_PORT = 5550 64 | 65 | LOGGING_CONFIG = 'logging.conf' 66 | 67 | 
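# NOTE: the defaults above select the ZeroMQ message bus (MESSAGE_BUS) together
# with the msgpack codec (MESSAGE_BUS_CODEC); the Kafka-related settings below
# are only meant for the Kafka message bus implementation.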
#-------------------------------------------------------- 68 | # Kafka 69 | #-------------------------------------------------------- 70 | 71 | SPIDER_FEED_TOPIC = "frontier-todo" 72 | SPIDER_LOG_TOPIC = "frontier-done" 73 | SCORING_LOG_TOPIC = "frontier-score" 74 | 75 | SPIDER_LOG_DBW_GROUP = "dbw-spider-log" 76 | SPIDER_LOG_SW_GROUP = "sw-spider-log" 77 | SCORING_LOG_DBW_GROUP = "dbw-scoring-log" 78 | SPIDER_FEED_GROUP = "fetchers-spider-feed" 79 | 80 | KAFKA_CODEC = None -------------------------------------------------------------------------------- /docs/source/topics/run-modes.rst: -------------------------------------------------------------------------------- 1 | ========= 2 | 运行模式 3 | ========= 4 | 5 | 下图展示了运行模式的架构图: 6 | 7 | .. image:: _images/high-level-arc.png 8 | 9 | 10 | ==================== ========================================================================= ====================================================== ===================== 11 | 模式 父类 所需组件 可用的后端 12 | ==================== ========================================================================= ====================================================== ===================== 13 | 单进程 :class:`Backend ` 单进程运行爬虫 内存, SQLAlchemy 14 | 分布式爬虫 :class:`Backend ` 多个爬虫和单个 :term:`db worker` 内存, SQLAlchemy 15 | 分布式后端 :class:`DistributedBackend ` 多个爬虫, 多个 :term:`strategy worker` (s) 和多个 db worker(s). SQLAlchemy, HBase 16 | ==================== ========================================================================= ====================================================== ===================== 17 | 18 | 19 | 单进程 20 | ============== 21 | 22 | Frontera 与 fetcher 在相同的过程中实例化(例如在 Scrapy 中)。要实现这个,需要设置 :setting:`BACKEND` 为 :class:`Backend ` 的子类。这种模式适合那种少量文档并且时间要求不紧的应用。 23 | 24 | 分布式爬虫 25 | =================== 26 | 27 | 爬虫是分布式的,但后端不是。后端运行在 :term:`db worker` 中,并通过 :term:`message bus` 与爬虫通信。 28 | 29 | 1. 将爬虫进程中的 :setting:`BACKEND` 设置为 :class:`MessageBusBackend ` 30 | 2. 在 DB worker 中 :setting:`BACKEND` 应该指向 :class:`Backend ` 的子类。 31 | 3. 每个爬虫进程应该有它自己的 :setting:`SPIDER_PARTITION_ID`,值为从0到 :setting:`SPIDER_FEED_PARTITIONS`。 32 | 4. 爬虫和 DB worker 都应该将 :setting:`MESSAGE_BUS` 设置为你选择的消息总线类或者其他你自定义的实现。 33 | 34 | 此模式适用于需要快速获取文档,同时文档的数量相对较小的应用。 35 | 36 | 37 | 分布式爬虫和后端 38 | =============================== 39 | 40 | 爬虫和后端都是分布式的。后端分成了两部分: :term:`strategy worker` 和 :term:`db worker`。strategy worker 实例被分配给他们自己的 :term:`spider log` 部分。 41 | 42 | 1. 将爬虫进程中的 :setting:`BACKEND` 设置为 :class:`MessageBusBackend ` 43 | 2. DB workers 和 SW workers 的 :setting:`BACKEND` 应该指向 :class:`DistributedBackend ` 的子类。同时还需要配置您选择的后端。 44 | 3. 每个爬虫进程应该有它自己的 :setting:`SPIDER_PARTITION_ID`,值为从0到 :setting:`SPIDER_FEED_PARTITIONS`。最后一个必须可以被所有 DB worker 实例访问。 45 | 4. 每个 SW worker 应该有自己的 :setting:`SCORING_PARTITION_ID`,值为从0到 :setting:`SPIDER_LOG_PARTITIONS`。最后一个必须可以被所有 SW worker 实例访问。 46 | 5. 爬虫和所有的 worker 都应该将 :setting:`MESSAGE_BUS` 设置为你选择的消息总线类或者其他你自定义的实现。 47 | 48 | 在这种模式下,只有 Kafka 消息总线、SqlAlchemy 和 Habse 后端是默认支持的。 49 | 50 | 此模式适用于广度优先抓取和网页数量巨大的情况。 51 | -------------------------------------------------------------------------------- /frontera/settings/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | import six 3 | from importlib import import_module 4 | 5 | from . import default_settings 6 | 7 | 8 | class BaseSettings(object): 9 | """ 10 | An object that holds frontier settings values. 11 | 12 | This also defines the base interface for all classes that are to be used 13 | as settings in frontera. 
14 | """ 15 | def __init__(self, module=None, attributes=None): 16 | """ 17 | :param object/string module: A :class:`Settings ` object or a path string. 18 | :param dict attributes: A dict object containing the settings values. 19 | 20 | """ 21 | self.attributes = {} 22 | if module: 23 | self.add_module(module) 24 | if attributes: 25 | self.set_from_dict(attributes) 26 | 27 | @classmethod 28 | def from_params(cls, **kwargs): 29 | return cls(attributes=kwargs) 30 | 31 | @classmethod 32 | def object_from(cls, settings): 33 | """ 34 | Generates a new settings object based on a previous obj or settings 35 | file. 36 | 37 | `settings` can either be a string path pointing to settings file or a \ 38 | :class:`BaseSettings ` object instance. 39 | """ 40 | if isinstance(settings, BaseSettings): 41 | return settings 42 | else: 43 | return cls(settings) 44 | 45 | def __getattr__(self, name): 46 | val = self.get(name) 47 | if val is not None: 48 | return val 49 | else: 50 | return self.__dict__[name] 51 | 52 | def __setattr__(self, name, value): 53 | if name.isupper(): 54 | self.attributes[name] = value 55 | else: 56 | self.__dict__[name] = value 57 | 58 | def add_module(self, module): 59 | if isinstance(module, six.string_types): 60 | module = import_module(module) 61 | for key in dir(module): 62 | if key.isupper(): 63 | self.set(key, getattr(module, key)) 64 | 65 | def get(self, key, default_value=None): 66 | if not key.isupper(): 67 | return None 68 | return self.attributes.get(key, default_value) 69 | 70 | def set(self, key, value): 71 | if key.isupper(): 72 | self.attributes[key] = value 73 | 74 | def set_from_dict(self, attributes): 75 | for name, value in attributes.items(): 76 | self.set(name, value) 77 | 78 | 79 | class DefaultSettings(BaseSettings): 80 | def __init__(self): 81 | super(DefaultSettings, self).__init__(default_settings) 82 | 83 | 84 | class Settings(BaseSettings): 85 | def __init__(self, module=None, attributes=None): 86 | super(Settings, self).__init__(default_settings, attributes) 87 | 88 | if module: 89 | self.add_module(module) 90 | -------------------------------------------------------------------------------- /frontera/contrib/backends/sqlalchemy/models.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from sqlalchemy import Column, String, Integer, PickleType, SmallInteger, Float, DateTime, BigInteger 4 | from sqlalchemy.ext.declarative import declarative_base 5 | 6 | DeclarativeBase = declarative_base() 7 | 8 | 9 | class MetadataModel(DeclarativeBase): 10 | __tablename__ = 'metadata' 11 | __table_args__ = ( 12 | { 13 | 'mysql_charset': 'utf8', 14 | 'mysql_engine': 'InnoDB', 15 | 'mysql_row_format': 'DYNAMIC', 16 | }, 17 | ) 18 | 19 | fingerprint = Column(String(40), primary_key=True, nullable=False) 20 | url = Column(String(1024), nullable=False) 21 | depth = Column(Integer, nullable=False) 22 | created_at = Column(DateTime, nullable=False) 23 | fetched_at = Column(DateTime, nullable=True) 24 | status_code = Column(String(20)) 25 | score = Column(Float) 26 | error = Column(String(128)) 27 | meta = Column(PickleType()) 28 | headers = Column(PickleType()) 29 | cookies = Column(PickleType()) 30 | method = Column(String(6)) 31 | 32 | @classmethod 33 | def query(cls, session): 34 | return session.query(cls) 35 | 36 | def __repr__(self): 37 | return '' % (self.url, self.fingerprint) 38 | 39 | 40 | class StateModel(DeclarativeBase): 41 | __tablename__ = 'states' 42 | 
__table_args__ = ( 43 | { 44 | 'mysql_charset': 'utf8', 45 | 'mysql_engine': 'InnoDB', 46 | 'mysql_row_format': 'DYNAMIC', 47 | }, 48 | ) 49 | 50 | fingerprint = Column(String(40), primary_key=True, nullable=False) 51 | state = Column(SmallInteger()) 52 | 53 | @classmethod 54 | def query(cls, session): 55 | return session.query(cls) 56 | 57 | def __repr__(self): 58 | return '' % (self.fingerprint, self.state) 59 | 60 | 61 | class QueueModelMixin(object): 62 | __table_args__ = ( 63 | { 64 | 'mysql_charset': 'utf8', 65 | 'mysql_engine': 'InnoDB', 66 | 'mysql_row_format': 'DYNAMIC', 67 | }, 68 | ) 69 | 70 | id = Column(Integer, primary_key=True) 71 | partition_id = Column(Integer, index=True) 72 | score = Column(Float, index=True) 73 | url = Column(String(1024), nullable=False) 74 | fingerprint = Column(String(40), nullable=False) 75 | host_crc32 = Column(Integer, nullable=False) 76 | meta = Column(PickleType()) 77 | headers = Column(PickleType()) 78 | cookies = Column(PickleType()) 79 | method = Column(String(6)) 80 | created_at = Column(BigInteger, index=True) 81 | depth = Column(SmallInteger) 82 | 83 | 84 | class QueueModel(QueueModelMixin, DeclarativeBase): 85 | __tablename__ = 'queue' 86 | 87 | @classmethod 88 | def query(cls, session): 89 | return session.query(cls) 90 | 91 | def __repr__(self): 92 | return '' % (self.url, self.id) 93 | -------------------------------------------------------------------------------- /frontera/core/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from six.moves.urllib.parse import urlparse 3 | from socket import getaddrinfo 4 | from collections import defaultdict, deque 5 | import six 6 | 7 | 8 | def get_slot_key(request, type): # TODO: Probably use caching here 9 | """ 10 | Get string representing a downloader slot key, which will be used in downloader as id for domain/ip load 11 | statistics and in backend for distinguishing free and overloaded resources. This method used in all Frontera 12 | backends. 13 | 14 | :param object request: is the instance of :class:`Request `. 15 | :param str type: either 'domain'(default) or 'ip'. 16 | :return: string 17 | """ 18 | key = urlparse(request.url).hostname or '' 19 | if type == 'ip': 20 | for result in getaddrinfo(key, 80): 21 | key = result[4][0] 22 | break 23 | return key 24 | 25 | 26 | class OverusedBuffer(object): 27 | """ 28 | A buffering object for implementing the buffer of Frontera requests for overused domains/ips. It can be used 29 | when customizing backend to address efficient downloader pool usage. 
30 | """ 31 | def __init__(self, _get_func, log_func=None): 32 | """ 33 | :param _get_func: reference to get_next_requests() method of binded class 34 | :param log_func: optional logging function, for logging of internal state 35 | """ 36 | self._pending = defaultdict(deque) 37 | self._get = _get_func 38 | self._log = log_func 39 | 40 | def _get_key(self, request, type): 41 | return get_slot_key(request, type) 42 | 43 | def _get_pending_count(self): 44 | return sum(six.moves.map(len, six.itervalues(self._pending))) 45 | 46 | def _get_pending(self, max_n_requests, overused_set): 47 | pending = self._pending 48 | i, keys = 0, set(pending) - overused_set 49 | 50 | while i < max_n_requests and keys: 51 | for key in keys.copy(): 52 | try: 53 | yield pending[key].popleft() 54 | i += 1 55 | except IndexError: 56 | keys.discard(key) 57 | del pending[key] 58 | 59 | def get_next_requests(self, max_n_requests, **kwargs): 60 | if self._log: 61 | self._log("Overused keys: %s" % str(kwargs['overused_keys'])) 62 | self._log("Pending: %d" % self._get_pending_count()) 63 | 64 | overused_set = set(kwargs['overused_keys']) 65 | requests = list(self._get_pending(max_n_requests, overused_set)) 66 | 67 | if len(requests) == max_n_requests: 68 | return requests 69 | 70 | for request in self._get(max_n_requests-len(requests), **kwargs): 71 | key = self._get_key(request, kwargs['key_type']) 72 | if key in overused_set: 73 | self._pending[key].append(request) 74 | else: 75 | requests.append(request) 76 | return requests 77 | -------------------------------------------------------------------------------- /frontera/contrib/backends/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from collections import OrderedDict 4 | 5 | from frontera import Backend 6 | from frontera.core.components import States 7 | 8 | 9 | class CommonBackend(Backend): 10 | """ 11 | A simpliest possible backend, performing one-time crawl: if page was crawled once, it will not be crawled again. 
12 | """ 13 | component_name = 'Common Backend' 14 | 15 | @classmethod 16 | def from_manager(cls, manager): 17 | return cls(manager) 18 | 19 | def frontier_start(self): 20 | self.metadata.frontier_start() 21 | self.queue.frontier_start() 22 | self.states.frontier_start() 23 | self.queue_size = self.queue.count() 24 | 25 | def frontier_stop(self): 26 | self.metadata.frontier_stop() 27 | self.queue.frontier_stop() 28 | self.states.frontier_stop() 29 | 30 | def add_seeds(self, seeds): 31 | for seed in seeds: 32 | seed.meta[b'depth'] = 0 33 | self.metadata.add_seeds(seeds) 34 | self.states.fetch([seed.meta[b'fingerprint'] for seed in seeds]) 35 | self.states.set_states(seeds) 36 | self._schedule(seeds) 37 | self.states.update_cache(seeds) 38 | 39 | def _schedule(self, requests): 40 | batch = [] 41 | queue_incr = 0 42 | for request in requests: 43 | schedule = True if request.meta[b'state'] in [States.NOT_CRAWLED, States.ERROR, None] else False 44 | batch.append((request.meta[b'fingerprint'], self._get_score(request), request, schedule)) 45 | if schedule: 46 | queue_incr += 1 47 | request.meta[b'state'] = States.QUEUED 48 | self.queue.schedule(batch) 49 | self.metadata.update_score(batch) 50 | self.queue_size += queue_incr 51 | 52 | def _get_score(self, obj): 53 | return obj.meta.get(b'score', 1.0) 54 | 55 | def get_next_requests(self, max_next_requests, **kwargs): 56 | partitions = kwargs.pop('partitions', [0]) # TODO: Collect from all known partitions 57 | batch = [] 58 | for partition_id in partitions: 59 | batch.extend(self.queue.get_next_requests(max_next_requests, partition_id, **kwargs)) 60 | self.queue_size -= len(batch) 61 | return batch 62 | 63 | def page_crawled(self, response): 64 | response.meta[b'state'] = States.CRAWLED 65 | self.states.update_cache(response) 66 | self.metadata.page_crawled(response) 67 | 68 | def links_extracted(self, request, links): 69 | to_fetch = OrderedDict() 70 | for link in links: 71 | to_fetch[link.meta[b'fingerprint']] = link 72 | link.meta[b'depth'] = request.meta.get(b'depth', 0)+1 73 | self.states.fetch(to_fetch.keys()) 74 | self.states.set_states(links) 75 | unique_links = to_fetch.values() 76 | self.metadata.links_extracted(request, unique_links) 77 | self._schedule(unique_links) 78 | self.states.update_cache(unique_links) 79 | 80 | def request_error(self, request, error): 81 | request.meta[b'state'] = States.ERROR 82 | self.metadata.request_error(request, error) 83 | self.states.update_cache(request) 84 | 85 | def finished(self): 86 | return self.queue_size == 0 87 | -------------------------------------------------------------------------------- /frontera/core/codec.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from abc import ABCMeta, abstractmethod 4 | import six 5 | 6 | 7 | @six.add_metaclass(ABCMeta) 8 | class BaseDecoder(object): 9 | 10 | @abstractmethod 11 | def decode(self, buffer): 12 | """ 13 | Decodes the message. 14 | 15 | :param bytes buffer: encoded message 16 | :return: tuple of message type and related objects 17 | """ 18 | pass 19 | 20 | @abstractmethod 21 | def decode_request(self, buffer): 22 | """ 23 | Decodes Request objects. 
24 | 25 | :param bytes buffer: serialized string 26 | :return: object Request 27 | """ 28 | pass 29 | 30 | 31 | @six.add_metaclass(ABCMeta) 32 | class BaseEncoder(object): 33 | 34 | @abstractmethod 35 | def encode_add_seeds(self, seeds): 36 | """ 37 | Encodes add_seeds message 38 | 39 | :param list seeds: A list of frontier Request objects 40 | :return: bytes encoded message 41 | """ 42 | pass 43 | 44 | @abstractmethod 45 | def encode_page_crawled(self, response): 46 | """ 47 | Encodes a page_crawled message 48 | 49 | :param object response: A frontier Response object 50 | 51 | :return: bytes encoded message 52 | """ 53 | pass 54 | 55 | @abstractmethod 56 | def encode_links_extracted(self, request, links): 57 | """ 58 | Encodes a links_extracted message 59 | 60 | :param object request: A frontier Request object 61 | :param list links: A list of Request objects 62 | 63 | :return: bytes encoded message 64 | """ 65 | pass 66 | 67 | @abstractmethod 68 | def encode_request_error(self, request, error): 69 | """ 70 | Encodes a request_error message 71 | 72 | :param object request: A frontier Request object 73 | :param string error: Error description 74 | 75 | :return: bytes encoded message 76 | """ 77 | pass 78 | 79 | @abstractmethod 80 | def encode_request(self, request): 81 | """ 82 | Encodes requests for spider feed stream. 83 | 84 | :param object request: Frontera Request object 85 | :return: bytes encoded message 86 | """ 87 | pass 88 | 89 | @abstractmethod 90 | def encode_update_score(self, request, score, schedule): 91 | """ 92 | Encodes update_score messages for scoring log stream. 93 | 94 | :param object request: Frontera Request object 95 | :param float score: score 96 | :param bool schedule: True if document needs to be scheduled for download 97 | :return: bytes encoded message 98 | """ 99 | pass 100 | 101 | @abstractmethod 102 | def encode_new_job_id(self, job_id): 103 | """ 104 | Encodes changing of job_id parameter. 105 | 106 | :param int job_id: 107 | :return: bytes encoded message 108 | """ 109 | pass 110 | 111 | @abstractmethod 112 | def encode_offset(self, partition_id, offset): 113 | """ 114 | Encodes current spider offset in spider feed. 115 | 116 | :param int partition_id: 117 | :param int offset: 118 | :return: bytes encoded message 119 | """ 120 | pass 121 | -------------------------------------------------------------------------------- /docs/source/topics/architecture.rst: -------------------------------------------------------------------------------- 1 | ===================== 2 | 架构概述 3 | ===================== 4 | 5 | 本文档介绍了Frontera Manager管道,分布式组件以及它们的交互方式。 6 | 7 | 单进程 8 | ============== 9 | 10 | 下图显示了Frontera管道的架构,其组件(由数字引用)和系统内发生的数据流的轮廓。 有关组件的简要说明,请参见以下有关它们的更多详细信息的链接。 数据流也在下面描述。 11 | 12 | .. 
image:: _images/frontier_02.png 13 | :width: 793px 14 | :height: 280px 15 | 16 | 组件 17 | ---------- 18 | 19 | Fetcher 20 | ^^^^^^^ 21 | 22 | Fetcher(2)负责从网站(1)中获取网页,并将其提供给管理接下来要抓取哪些页面的 frontier。 23 | 24 | Fetcher 可以使用 `Scrapy`_ 或任何其他爬虫框架/系统来实现,因为框架提供了通用的 frontier 功能。 25 | 26 | 在分布式运行模式下,Fetcher由Frontera Manager侧的消息总线生产者和Fetcher侧的消费者替代。 27 | 28 | Frontera API / Manager 29 | ^^^^^^^^^^^^^^^^^^^^^^ 30 | 31 | Frontera API(3)的主要入口点是 :class:`FrontierManager ` 对象。Frontier 用户(在我们的案例中是Fetcher(2))将通过它与 Frontier 进行通信。 32 | 33 | 更多请参考 :doc:`frontier-api` 。 34 | 35 | Middlewares 36 | ^^^^^^^^^^^ 37 | 38 | Frontier middlewares (4) 是位于Manager(3)和Backend(5)之间的特定钩子。 这些中间件在传入和传出 Frontier 和 Backend 时处理 :class:`Request ` 和 :class:`Response ` 对象。 它们通过插入自定义代码提供了一种方便的扩展功能的机制。 规范URL解算器是一种特殊的中间件,负责替代非规范文档URL。 39 | 40 | 更改请参考 :doc:`frontier-middlewares` 和 :doc:`frontier-canonicalsolvers` 。 41 | 42 | Backend 43 | ^^^^^^^ 44 | 45 | frontier Backend(5)是爬行逻辑/策略所在的地方。 它负责接收所有抓取信息并选择接下来要抓取的页面。 Backend 旨在在更高级别上运行,而:class:`Queue `, :class:`Metadata ` 和 46 | :class:`States ` 对象负责低级存储通信代码。 47 | 48 | 根据实现的逻辑,可能需要一个持久性存储(6)来管理 :class:`Request ` 和 :class:`Response ` 对象信息。 49 | 50 | 更多请参考 :doc:`frontier-backends` 。 51 | 52 | .. _frontier-data-flow: 53 | 54 | 数据流 55 | --------- 56 | 57 | Frontera 的数据流由 Frontier Manager 控制,所有数据都通过 manager-middlewares-backend 流程,如下所示: 58 | 59 | 1. frontier初始化为种子请求列表(种子URL)作为爬虫的入口点。 60 | 2. fetcher请求一批任务去抓取。 61 | 3. 每个url都被提取,并且 frontier 被通知回传抓取结果以及页面包含的提取数据。 如果在爬行中出现问题,frontier 也会被通知。 62 | 63 | 一旦所有 url 被抓取,重复步骤2-3,直到达到 frontier 结束条件。每个循环(步骤2-3)重复被称为 :ref:`frontier 迭代 ` 。 64 | 65 | 分布式 66 | =========== 67 | 68 | 在分布式模式下运行时,所有 Frontera 进程都使用相同的 Frontera Manager。 69 | 70 | 整体系统形成一个封闭的圆圈,所有的组件都在无限循环中作为守护进程工作。 有一个 :term:`message bus` 负责在组件,持久存储和 fetcher(当和提取结合时,fetcher又叫做spider)之间传输消息。 有一个传输和存储层抽象,所以可以插上它自己的实现。 分布式后端运行模式具有三种类型的实例: 71 | 72 | - **Spiders** 或者 fetchers,使用Scrapy(分片)实现。 73 | 负责解决DNS查询,从互联网获取内容并从内容中进行链接(或其他数据)提取。 74 | 75 | - **Strategy workers** (分片)。 76 | 运行爬网策略代码:为链接链接,决定链接是否需要被抓取,以及何时停止抓取。 77 | 78 | - **DB workers** (分片)。 79 | 存储所有元数据,包括分数和内容,并生成新的批量任务以供爬虫下载。 80 | 81 | *分片*意味着组件仅消耗分配的分区的消息,例如处理数据流的某些共享。*复制*是组件消耗数据流,而不管分区。 82 | 83 | 这样的设计允许在线操作。可以更改抓取策略,而无需停止抓取。 :doc:`爬虫策略 ` 也可以作为单独的模块实现; 包含用于检查爬网停止条件,URL排序和评分模型的逻辑。 84 | 85 | Frontera 的设计是对Web友好的,每个主机由不超过一个的爬虫进程下载。这是通过数据流分区实现的。 86 | 87 | .. image:: _images/frontera-design.png 88 | 89 | 数据流 90 | --------- 91 | 92 | 我们从爬虫开始吧。爬虫内的用户定义的种子URL通过 :term:`spider log` 流发送给 strategy workers 和 DB 93 | workers。strategy workers 使用状态缓存决定抓取哪些页面,为每个页面分配一个分数,并将结果发送到 :term:`scoring log` 流。 94 | 95 | DB Worker存储各种元数据,包括内容和分数。另外 DB Worker 检查爬虫消费者的偏移量,并在需要时生成新的任务,并将其发送到 :term:`spider feed` 流。爬虫消耗这些任务,下载每个页面并从中提取链接。然后将链接发送到 ‘Spider Log’ 流,并将其存储和记分。 这样,流量将无限期地重复。 96 | 97 | 98 | .. _`Kafka`: http://kafka.apache.org/ 99 | .. _`ZeroMQ`: http://zeromq.org/ 100 | .. _`HBase`: http://hbase.apache.org/ 101 | .. _`Scrapy`: http://scrapy.org/ 102 | .. 
_`Frontera`: http://github.com/scrapinghub/frontera 103 | 104 | -------------------------------------------------------------------------------- /frontera/utils/graphs/models.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from sqlalchemy import Column, String, Integer, Boolean, ForeignKey 3 | from sqlalchemy.orm import relation 4 | from sqlalchemy import UniqueConstraint 5 | from sqlalchemy.ext.declarative import declarative_base 6 | from sqlalchemy import types 7 | import six 8 | 9 | Base = declarative_base() 10 | 11 | 12 | class Choice(types.TypeDecorator): 13 | impl = types.CHAR 14 | 15 | def __init__(self, choices, default, **kwargs): 16 | self.choices = dict(choices) 17 | values = [k for k, v in six.iteritems(self.choices)] 18 | if default not in values: 19 | raise ValueError("default value '%s' not found in choices %s" % (default, values)) 20 | self.default = default 21 | super(Choice, self).__init__(**kwargs) 22 | 23 | def process_bind_param(self, value, dialect): 24 | return value or self.default 25 | 26 | def process_result_value(self, value, dialect): 27 | return self.choices[value] 28 | 29 | 30 | class BaseModel(object): 31 | __abstract__ = True 32 | 33 | @classmethod 34 | def get_pk_name(cls): 35 | return cls.__mapper__.primary_key[0].name 36 | 37 | @classmethod 38 | def get_pk_field(cls): 39 | return getattr(cls, cls.get_pk_name()) 40 | 41 | @classmethod 42 | def query(cls, session): 43 | return session.query(cls) 44 | 45 | @classmethod 46 | def query_pk(cls, session): 47 | return session.query(cls.get_pk_field()) 48 | 49 | @classmethod 50 | def get_or_create(cls, session, **kwargs): 51 | instance = session.query(cls).filter_by(**kwargs).first() 52 | if instance: 53 | return instance, False 54 | else: 55 | instance = cls(**kwargs) 56 | session.add(instance) 57 | return instance, True 58 | 59 | def get_pk(self): 60 | return getattr(self, self.get_pk_name()) 61 | 62 | def exists(self, session): 63 | q = self.query(session).filter_by(**{self.get_pk_name(): self.get_pk()}) 64 | return session.query(q.exists()).scalar() 65 | 66 | 67 | class Model(Base, BaseModel): 68 | pass 69 | 70 | 71 | class CrawlPageRelation(Model): 72 | __tablename__ = 'crawl_page_relations' 73 | parent_id = Column(Integer, ForeignKey('crawl_pages.id'), primary_key=True, index=True) 74 | child_id = Column(Integer, ForeignKey('crawl_pages.id'), primary_key=True, index=True) 75 | 76 | 77 | class CrawlPage(Model): 78 | __tablename__ = 'crawl_pages' 79 | __table_args__ = ( 80 | UniqueConstraint('url'), 81 | ) 82 | 83 | id = Column(Integer, primary_key=True, nullable=False, index=True, unique=True) 84 | url = Column(String(1000)) 85 | status = Column(String(50)) 86 | n_redirects = Column(Integer, default=0) 87 | is_seed = Column(Boolean, default=False) 88 | referers = relation( 89 | 'CrawlPage', 90 | secondary='crawl_page_relations', 91 | primaryjoin=CrawlPageRelation.child_id == id, 92 | secondaryjoin=CrawlPageRelation.parent_id == id, 93 | backref="links") 94 | 95 | def __repr__(self): 96 | return '<%s:%s%s>' % (self.id, self.url, '*' if self.is_seed else '') 97 | 98 | def _get_status_code(self): 99 | try: 100 | return int(self.status) 101 | except TypeError: 102 | return None 103 | 104 | @property 105 | def has_errors(self): 106 | return self._get_status_code() is None 107 | 108 | @property 109 | def is_redirection(self): 110 | status_code = self._get_status_code() 111 | if status_code: 112 | return 300 <= status_code < 400 113 
| else: 114 | return False 115 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. _topics-index: 2 | 3 | ================================ 4 | Frontera |version| 文档 5 | ================================ 6 | 7 | `Frontera`_ 是一个爬虫工具箱,它可以让你构建任何规模和任意目的的爬虫。 8 | 9 | `Frontera`_ 提供 :ref:`crawl frontier ` 框架,这个框架可以帮助解决*何时抓取下一个URL*、*下个抓取的URL是什么*和检查*抓取结果*等问题。 10 | 11 | Frontera 还为所有的爬虫组件提供了复制、分片、隔离的特性,这可以方便的扩展爬虫规模和将爬虫做成分布式。 12 | 13 | Fronteta 包含完全支持 `Scrapy`_ 的组件,可以使用Scrapy的所有功能创建爬虫。尽管它最初是为Scrapy设计的,但是它也可以完美契合其他任何的框架或系统,因为它可以作为一个框架提供通用的工具箱。 14 | 15 | 介绍 16 | ============ 17 | 18 | 这一章的目的是介绍 Frontera 的概念,通过阅读本章,你可以知道 Frontera 的设计理念和确定它能不能满足你的需求。 19 | 20 | 21 | .. toctree:: 22 | :hidden: 23 | 24 | topics/overview 25 | topics/run-modes 26 | topics/quick-start-single 27 | topics/quick-start-distributed 28 | topics/cluster-setup 29 | 30 | :doc:`topics/overview` 31 | 明白什么是 Frontera ?它能为你做什么? 32 | 33 | :doc:`topics/run-modes` 34 | Frontera的高层体系结构和运行模式。 35 | 36 | :doc:`topics/quick-start-single` 37 | 使用 Scrapy 作为容器来运行 Frontera。 38 | 39 | :doc:`topics/quick-start-distributed` 40 | 引入 SQLite 和 ZeroMQ。 41 | 42 | :doc:`topics/cluster-setup` 43 | Setting up clustered version of Frontera on multiple machines with HBase and Kafka. 44 | 使用 HBase 和 Kafka 在多台机器上部署 Frontera 集群。 45 | 46 | 47 | 使用 Frontera 48 | ============== 49 | 50 | .. toctree:: 51 | :hidden: 52 | 53 | topics/installation 54 | topics/frontier-objects 55 | topics/frontier-middlewares 56 | topics/frontier-canonicalsolvers 57 | topics/frontier-backends 58 | topics/message_bus 59 | topics/own_crawling_strategy 60 | topics/scrapy-integration 61 | topics/frontera-settings 62 | 63 | :doc:`topics/installation` 64 | 安装方法和依赖的选项。 65 | 66 | :doc:`topics/frontier-objects` 67 | 理解用来代表网络请求和网络响应的类。 68 | 69 | :doc:`topics/frontier-middlewares` 70 | 过滤或者更改链接和网页的信息。 71 | 72 | :doc:`topics/frontier-canonicalsolvers` 73 | 确认和使用网页的规范url。 74 | 75 | :doc:`topics/frontier-backends` 76 | 自定义抓取规则和存储方式。 77 | 78 | :doc:`topics/message_bus` 79 | 内置消息总线参考。 80 | 81 | :doc:`topics/own_crawling_strategy` 82 | 为分布式后端实现自己的抓取策略。 83 | 84 | :doc:`topics/scrapy-integration` 85 | 学习如何使用 Frontera + Scrapy 。 86 | 87 | :doc:`topics/frontera-settings` 88 | 设置参考。 89 | 90 | 91 | 高级用法 92 | ============== 93 | 94 | .. toctree:: 95 | :hidden: 96 | 97 | topics/what-is-cf 98 | topics/graph-manager 99 | topics/scrapy-recorder 100 | topics/fine-tuning 101 | topics/dns-service 102 | 103 | :doc:`topics/what-is-cf` 104 | 学习 Crawl Frontier 理论。 105 | 106 | :doc:`topics/graph-manager` 107 | 定义假的抓取规则来测试你的 frontier 。 108 | 109 | :doc:`topics/scrapy-recorder` 110 | 创建 Scrapy 抓取记录,并在之后重现他们。 111 | 112 | :doc:`topics/fine-tuning` 113 | 机器部署和微调信息。 114 | 115 | :doc:`topics/dns-service` 116 | DNS 服务搭建简介。 117 | 118 | 开发者文档 119 | ======================= 120 | 121 | .. 
toctree:: 122 | :hidden: 123 | 124 | topics/architecture 125 | topics/frontier-api 126 | topics/requests-integration 127 | topics/examples 128 | topics/tests 129 | topics/loggers 130 | topics/frontier-tester 131 | topics/faq 132 | topics/contributing 133 | topics/glossary 134 | 135 | 136 | 137 | 138 | :doc:`topics/architecture` 139 | 了解 Frontera 如何工作和它的不同组件。 140 | 141 | :doc:`topics/frontier-api` 142 | 学习如何使用 frontier 。 143 | 144 | :doc:`topics/requests-integration` 145 | 学习如何使用 Frontera + Requests 。 146 | 147 | :doc:`topics/examples` 148 | 一些使用 Frontera 的示例工程和示例脚本。 149 | 150 | :doc:`topics/tests` 151 | 如果运行和写 Frontera 的测试用例。 152 | 153 | :doc:`topics/loggers` 154 | 使用 python 原生日志系统创建的一些 loggers 。 155 | 156 | :doc:`topics/frontier-tester` 157 | 使用一个简单的方法测试你的 frontier。 158 | 159 | :doc:`topics/faq` 160 | 常见问题。 161 | 162 | :doc:`topics/contributing` 163 | 如何贡献。 164 | 165 | 166 | :doc:`topics/glossary` 167 | 术语表。 168 | 169 | 170 | .. _Crawling System: http://en.wikipedia.org/wiki/Web_crawler 171 | .. _Scrapy: http://scrapy.org/ 172 | .. _`Frontera`: http://github.com/scrapinghub/frontera -------------------------------------------------------------------------------- /frontera/contrib/middlewares/domain.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | import re 3 | 4 | from frontera.core.components import Middleware 5 | from frontera.utils.url import parse_domain_from_url_fast, parse_domain_from_url 6 | from w3lib.util import to_bytes 7 | 8 | # TODO: Why not to put the whole url_parse result here in meta? 9 | 10 | 11 | class DomainMiddleware(Middleware): 12 | """ 13 | This :class:`Middleware ` will add a ``domain`` info field for every 14 | :attr:`Request.meta ` and 15 | :attr:`Response.meta ` if is activated. 16 | 17 | 18 | ``domain`` object will contain the following fields, with both keys and values as bytes: 19 | 20 | - **netloc**: URL netloc according to `RFC 1808`_ syntax specifications 21 | - **name**: Domain name 22 | - **scheme**: URL scheme 23 | - **tld**: Top level domain 24 | - **sld**: Second level domain 25 | - **subdomain**: URL subdomain(s) 26 | 27 | An example for a :class:`Request ` object:: 28 | 29 | >>> request.url 30 | 'http://www.scrapinghub.com:8080/this/is/an/url' 31 | 32 | >>> request.meta['domain'] 33 | { 34 | "name": "scrapinghub.com", 35 | "netloc": "www.scrapinghub.com", 36 | "scheme": "http", 37 | "sld": "scrapinghub", 38 | "subdomain": "www", 39 | "tld": "com" 40 | } 41 | 42 | If :setting:`TEST_MODE` is active, It will accept testing URLs, parsing letter domains:: 43 | 44 | >>> request.url 45 | 'A1' 46 | 47 | >>> request.meta['domain'] 48 | { 49 | "name": "A", 50 | "netloc": "A", 51 | "scheme": "-", 52 | "sld": "-", 53 | "subdomain": "-", 54 | "tld": "-" 55 | } 56 | 57 | .. 
_`RFC 1808`: http://tools.ietf.org/html/rfc1808.html 58 | 59 | """ 60 | component_name = 'Domain Middleware' 61 | 62 | def __init__(self, manager): 63 | self.manager = manager 64 | use_tldextract = self.manager.settings.get('TLDEXTRACT_DOMAIN_INFO', False) 65 | self.parse_domain_func = parse_domain_from_url if use_tldextract else parse_domain_from_url_fast 66 | 67 | @classmethod 68 | def from_manager(cls, manager): 69 | return cls(manager) 70 | 71 | def frontier_start(self): 72 | pass 73 | 74 | def frontier_stop(self): 75 | pass 76 | 77 | def add_seeds(self, seeds): 78 | for seed in seeds: 79 | self._add_domain(seed) 80 | return seeds 81 | 82 | def page_crawled(self, response): 83 | return self._add_domain(response) 84 | 85 | def links_extracted(self, request, links): 86 | for link in links: 87 | self._add_domain(link) 88 | return self._add_domain(request) 89 | 90 | def request_error(self, request, error): 91 | return self._add_domain(request) 92 | 93 | def _add_domain(self, obj): 94 | obj.meta[b'domain'] = self.parse_domain_info(obj.url, self.manager.test_mode) 95 | if b'redirect_urls' in obj.meta: 96 | obj.meta[b'redirect_domains'] = [self.parse_domain_info(url, self.manager.test_mode) 97 | for url in obj.meta[b'redirect_urls']] 98 | return obj 99 | 100 | def parse_domain_info(self, url, test_mode=False): 101 | if test_mode: 102 | match = re.match('([A-Z])\w+', url) 103 | netloc = name = to_bytes(match.groups()[0]) if match else b'?' 104 | scheme = sld = tld = subdomain = b'-' 105 | else: 106 | netloc, name, scheme, sld, tld, subdomain = self.parse_domain_func(url) 107 | return { 108 | b'netloc': to_bytes(netloc), 109 | b'name': to_bytes(name), 110 | b'scheme': to_bytes(scheme), 111 | b'sld': to_bytes(sld), 112 | b'tld': to_bytes(tld), 113 | b'subdomain': to_bytes(subdomain), 114 | } 115 | -------------------------------------------------------------------------------- /docs/source/topics/tests.rst: -------------------------------------------------------------------------------- 1 | ===== 2 | Tests 3 | ===== 4 | 5 | Frontera 测试使用 `pytest`_ 工具实现。 6 | 7 | 您可以使用pip安装 `pytest`_ 和测试中使用的附加必需库:: 8 | 9 | pip install -r requirements/tests.txt 10 | 11 | 12 | 运行 tests 13 | ============= 14 | 15 | 要运行所有测试,请转到源代码的根目录并运行:: 16 | 17 | py.test 18 | 19 | 20 | 写 tests 21 | ============= 22 | 23 | 所有功能(包括新功能和错误修复)都必须包含一个测试用例,以检查它是否按预期工作,所以如果你希望他们能够早点上线,请尽快写好相关的测试。 24 | 25 | 26 | 后端 testing 27 | =============== 28 | 29 | 有一个继承 `pytest`_ 的类用来测试 :class:`Backend `: 30 | :class:`BackendTest ` 31 | 32 | .. autoclass:: tests.backends.BackendTest 33 | 34 | .. automethod:: tests.backends.BackendTest.get_settings 35 | .. automethod:: tests.backends.BackendTest.get_frontier 36 | .. automethod:: tests.backends.BackendTest.setup_backend 37 | .. automethod:: tests.backends.BackendTest.teardown_backend 38 | 39 | 40 | 比方说,你想测试你的后端 ``MyBackend`` 并为每个测试方法调用创建一个新的 frontier 实例,你可以定义一个这样的测试类:: 41 | 42 | 43 | class TestMyBackend(backends.BackendTest): 44 | 45 | backend_class = 'frontera.contrib.backend.abackend.MyBackend' 46 | 47 | def test_one(self): 48 | frontier = self.get_frontier() 49 | ... 50 | 51 | def test_two(self): 52 | frontier = self.get_frontier() 53 | ... 54 | 55 | ... 
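作为上面示例的补充,这里给出一个覆盖 ``get_settings`` 的假设性示例(非官方文档内容,仅作示意):如果被测后端依赖某些配置项,可以在测试类中基于基类返回的 ``Settings`` 对象再覆盖它们。以下写法假定该对象允许按属性覆盖配置项,示例中用到的配置项和取值也只是演示用::


    class TestMyBackendSettings(backends.BackendTest):

        backend_class = 'frontera.contrib.backend.abackend.MyBackend'

        def get_settings(self):
            # 在基类构造的默认 Settings 基础上覆盖测试需要的配置项
            settings = super(TestMyBackendSettings, self).get_settings()
            settings.MAX_REQUESTS = 100      # 限制一次测试处理的请求数(示例值)
            settings.DELAY_ON_EMPTY = 0.0    # 队列为空时不等待,加快测试(示例值)
            return settings

        def test_with_custom_settings(self):
            frontier = self.get_frontier()
            ...
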
56 | 57 | 58 | 如果它使用一个数据库文件,你需要在每次测试之前和之后进行清理:: 59 | 60 | 61 | class TestMyBackend(backends.BackendTest): 62 | 63 | backend_class = 'frontera.contrib.backend.abackend.MyBackend' 64 | 65 | def setup_backend(self, method): 66 | self._delete_test_db() 67 | 68 | def teardown_backend(self, method): 69 | self._delete_test_db() 70 | 71 | def _delete_test_db(self): 72 | try: 73 | os.remove('mytestdb.db') 74 | except OSError: 75 | pass 76 | 77 | def test_one(self): 78 | frontier = self.get_frontier() 79 | ... 80 | 81 | def test_two(self): 82 | frontier = self.get_frontier() 83 | ... 84 | 85 | ... 86 | 87 | 88 | 测试后端执行顺序 89 | ========================= 90 | 91 | 为了测试 :class:`Backend ` 抓取顺序你可以使用 :class:`BackendSequenceTest ` 类。 92 | 93 | .. autoclass:: tests.backends.BackendSequenceTest 94 | 95 | .. automethod:: tests.backends.BackendSequenceTest.get_sequence 96 | .. automethod:: tests.backends.BackendSequenceTest.assert_sequence 97 | 98 | 99 | :class:`BackendSequenceTest ` 将依据传过来的网站图进行一遍完整的抓取,并返回后端访问网页的顺序。 100 | 101 | 比如你想测试一个按照字母顺序抓取网页的后端。你可以这样写测试:: 102 | 103 | 104 | class TestAlphabeticSortBackend(backends.BackendSequenceTest): 105 | 106 | backend_class = 'frontera.contrib.backend.abackend.AlphabeticSortBackend' 107 | 108 | SITE_LIST = [ 109 | [ 110 | ('C', []), 111 | ('B', []), 112 | ('A', []), 113 | ], 114 | ] 115 | 116 | def test_one(self): 117 | # Check sequence is the expected one 118 | self.assert_sequence(site_list=self.SITE_LIST, 119 | expected_sequence=['A', 'B', 'C'], 120 | max_next_requests=0) 121 | 122 | def test_two(self): 123 | # Get sequence and work with it 124 | sequence = self.get_sequence(site_list=SITE_LIST, 125 | max_next_requests=0) 126 | assert len(sequence) > 2 127 | 128 | ... 129 | 130 | 131 | 测试基本算法 132 | ======================== 133 | 134 | 如果你的后端使用 :ref:`基本算法逻辑 ` 中的一个,你可以继承对应的测试类,之后顺序会被自动测试:: 135 | 136 | from tests import backends 137 | 138 | 139 | class TestMyBackendFIFO(backends.FIFOBackendTest): 140 | backend_class = 'frontera.contrib.backends.abackend.MyBackendFIFO' 141 | 142 | 143 | class TestMyBackendLIFO(backends.LIFOBackendTest): 144 | backend_class = 'frontera.contrib.backends.abackend.MyBackendLIFO' 145 | 146 | 147 | class TestMyBackendDFS(backends.DFSBackendTest): 148 | backend_class = 'frontera.contrib.backends.abackend.MyBackendDFS' 149 | 150 | 151 | class TestMyBackendBFS(backends.BFSBackendTest): 152 | backend_class = 'frontera.contrib.backends.abackend.MyBackendBFS' 153 | 154 | 155 | class TestMyBackendRANDOM(backends.RANDOMBackendTest): 156 | backend_class = 'frontera.contrib.backends.abackend.MyBackendRANDOM' 157 | 158 | 159 | 160 | .. 
_pytest: http://pytest.org/latest/ 161 | 162 | -------------------------------------------------------------------------------- /frontera/contrib/middlewares/fingerprint.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from frontera.core.components import Middleware 3 | from frontera.exceptions import NotConfigured 4 | from w3lib.url import canonicalize_url 5 | from frontera.utils.misc import load_object 6 | 7 | 8 | class BaseFingerprintMiddleware(Middleware): 9 | component_name = 'Base Fingerprint Middleware' 10 | fingerprint_function_name = '' 11 | 12 | def __init__(self, manager): 13 | fingerprint_function_name = manager.settings.get(self.fingerprint_function_name, None) 14 | if not fingerprint_function_name: 15 | raise NotConfigured 16 | self.fingerprint_function = load_object(fingerprint_function_name) 17 | 18 | @classmethod 19 | def from_manager(cls, manager): 20 | return cls(manager) 21 | 22 | def frontier_start(self): 23 | pass 24 | 25 | def frontier_stop(self): 26 | pass 27 | 28 | def add_seeds(self, seeds): 29 | for seed in seeds: 30 | self._add_fingerprint(seed) 31 | return seeds 32 | 33 | def page_crawled(self, response): 34 | return self._add_fingerprint(response) 35 | 36 | def links_extracted(self, request, links): 37 | for link in links: 38 | self._add_fingerprint(link) 39 | return self._add_fingerprint(request) 40 | 41 | def request_error(self, request, error): 42 | return self._add_fingerprint(request) 43 | 44 | def _add_fingerprint(self, obj): 45 | raise NotImplementedError 46 | 47 | 48 | class UrlFingerprintMiddleware(BaseFingerprintMiddleware): 49 | """ 50 | This :class:`Middleware ` will add a ``fingerprint`` field for every 51 | :attr:`Request.meta ` and 52 | :attr:`Response.meta ` if is activated. 53 | 54 | Fingerprint will be calculated from object ``URL``, using the function defined in 55 | :setting:`URL_FINGERPRINT_FUNCTION` setting. 56 | You can write your own fingerprint calculation function and use by changing this setting. 57 | The fingerprint must be bytes. 58 | 59 | An example for a :class:`Request ` object:: 60 | 61 | >>> request.url 62 | 'http//www.scrapinghub.com:8080' 63 | 64 | >>> request.meta['fingerprint'] 65 | '60d846bc2969e9706829d5f1690f11dafb70ed18' 66 | 67 | """ 68 | 69 | component_name = 'URL Fingerprint Middleware' 70 | fingerprint_function_name = 'URL_FINGERPRINT_FUNCTION' 71 | 72 | def _get_fingerprint(self, url): 73 | return self.fingerprint_function(canonicalize_url(url)) 74 | 75 | def _add_fingerprint(self, obj): 76 | obj.meta[b'fingerprint'] = self._get_fingerprint(obj.url) 77 | if b'redirect_urls' in obj.meta: 78 | obj.meta[b'redirect_fingerprints'] = [self._get_fingerprint(url) for url in obj.meta[b'redirect_urls']] 79 | return obj 80 | 81 | 82 | class DomainFingerprintMiddleware(BaseFingerprintMiddleware): 83 | """ 84 | This :class:`Middleware ` will add a ``fingerprint`` field for every 85 | :attr:`Request.meta ` and 86 | :attr:`Response.meta ` ``domain`` fields if is activated. 87 | 88 | Fingerprint will be calculated from object ``URL``, using the function defined in 89 | :setting:`DOMAIN_FINGERPRINT_FUNCTION` setting. 90 | You can write your own fingerprint calculation function and use by changing this setting. 
91 | The fingerprint must be bytes 92 | 93 | An example for a :class:`Request ` object:: 94 | 95 | >>> request.url 96 | 'http//www.scrapinghub.com:8080' 97 | 98 | >>> request.meta['domain'] 99 | { 100 | "fingerprint": "5bab61eb53176449e25c2c82f172b82cb13ffb9d", 101 | "name": "scrapinghub.com", 102 | "netloc": "www.scrapinghub.com", 103 | "scheme": "http", 104 | "sld": "scrapinghub", 105 | "subdomain": "www", 106 | "tld": "com" 107 | } 108 | 109 | """ 110 | 111 | component_name = 'Domain Fingerprint Middleware' 112 | fingerprint_function_name = 'DOMAIN_FINGERPRINT_FUNCTION' 113 | 114 | def _add_fingerprint(self, obj): 115 | if b'domain' in obj.meta and b'name' in obj.meta[b'domain']: 116 | obj.meta[b'domain'][b'fingerprint'] = self.fingerprint_function(obj.meta[b'domain'][b'name']) 117 | if b'redirect_domains' in obj.meta: 118 | for domain in obj.meta[b'redirect_domains']: 119 | domain[b'fingerprint'] = self.fingerprint_function(domain[b'name']) 120 | return obj 121 | -------------------------------------------------------------------------------- /frontera/contrib/backends/remote/messagebus.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from frontera import Backend 4 | from frontera.core import OverusedBuffer 5 | from frontera.utils.misc import load_object 6 | import logging 7 | import six 8 | 9 | 10 | class MessageBusBackend(Backend): 11 | def __init__(self, manager): 12 | settings = manager.settings 13 | messagebus = load_object(settings.get('MESSAGE_BUS')) 14 | self.mb = messagebus(settings) 15 | codec_path = settings.get('MESSAGE_BUS_CODEC') 16 | encoder_cls = load_object(codec_path+".Encoder") 17 | decoder_cls = load_object(codec_path+".Decoder") 18 | store_content = settings.get('STORE_CONTENT') 19 | self._encoder = encoder_cls(manager.request_model, send_body=store_content) 20 | self._decoder = decoder_cls(manager.request_model, manager.response_model) 21 | self.spider_log_producer = self.mb.spider_log().producer() 22 | spider_feed = self.mb.spider_feed() 23 | self.partition_id = int(settings.get('SPIDER_PARTITION_ID')) 24 | if self.partition_id < 0 or self.partition_id >= settings.get('SPIDER_FEED_PARTITIONS'): 25 | raise ValueError("Spider partition id cannot be less than 0 or more than SPIDER_FEED_PARTITIONS.") 26 | self.consumer = spider_feed.consumer(partition_id=self.partition_id) 27 | self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT')) 28 | self._logger = logging.getLogger("messagebus-backend") 29 | self._buffer = OverusedBuffer(self._get_next_requests, 30 | self._logger.debug) 31 | self._logger.info("Consuming from partition id %d", self.partition_id) 32 | 33 | @classmethod 34 | def from_manager(cls, manager): 35 | return cls(manager) 36 | 37 | def frontier_start(self): 38 | pass 39 | 40 | def frontier_stop(self): 41 | self.spider_log_producer.flush() 42 | 43 | def add_seeds(self, seeds): 44 | per_host = aggregate_per_host(seeds) 45 | for host_fprint, host_links in six.iteritems(per_host): 46 | self.spider_log_producer.send(host_fprint, 47 | self._encoder.encode_add_seeds(host_links)) 48 | 49 | def page_crawled(self, response): 50 | host_fprint = get_host_fprint(response) 51 | self.spider_log_producer.send(host_fprint, self._encoder.encode_page_crawled(response)) 52 | 53 | def links_extracted(self, request, links): 54 | per_host = aggregate_per_host(links) 55 | for host_fprint, host_links in six.iteritems(per_host): 56 | 
self.spider_log_producer.send(host_fprint, 57 | self._encoder.encode_links_extracted(request, host_links)) 58 | 59 | def request_error(self, page, error): 60 | host_fprint = get_host_fprint(page) 61 | self.spider_log_producer.send(host_fprint, self._encoder.encode_request_error(page, error)) 62 | 63 | def _get_next_requests(self, max_n_requests, **kwargs): 64 | requests = [] 65 | for encoded in self.consumer.get_messages(count=max_n_requests, timeout=self._get_timeout): 66 | try: 67 | request = self._decoder.decode_request(encoded) 68 | except Exception as exc: 69 | self._logger.warning("Could not decode message: {0}, error {1}".format(encoded, str(exc))) 70 | else: 71 | requests.append(request) 72 | self.spider_log_producer.send(b'0123456789abcdef0123456789abcdef012345678', 73 | self._encoder.encode_offset(self.partition_id, 74 | self.consumer.get_offset(self.partition_id))) 75 | return requests 76 | 77 | def get_next_requests(self, max_n_requests, **kwargs): 78 | return self._buffer.get_next_requests(max_n_requests, **kwargs) 79 | 80 | def finished(self): 81 | return False 82 | 83 | @property 84 | def metadata(self): 85 | return None 86 | 87 | @property 88 | def queue(self): 89 | return None 90 | 91 | @property 92 | def states(self): 93 | return None 94 | 95 | 96 | def aggregate_per_host(requests): 97 | per_host = dict() 98 | for link in requests: 99 | if b'fingerprint' not in link.meta[b'domain']: 100 | continue 101 | host_fprint = link.meta[b'domain'][b'fingerprint'] 102 | if host_fprint not in per_host: 103 | per_host[host_fprint] = [] 104 | per_host[host_fprint].append(link) 105 | return per_host 106 | 107 | 108 | def get_host_fprint(request): 109 | if b'fingerprint' not in request.meta[b'domain']: 110 | return None 111 | return request.meta[b'domain'][b'fingerprint'] -------------------------------------------------------------------------------- /docs/source/topics/frontier-middlewares.rst: -------------------------------------------------------------------------------- 1 | .. _frontier-middlewares: 2 | 3 | =========== 4 | Middlewares(中间件) 5 | =========== 6 | 7 | Frontier :class:`Middleware ` 位于 8 | :class:`FrontierManager ` 和 9 | :class:`Backend ` objects 之间, 根据 :ref:`frontier data flow ` 的流程,处理 :class:`Request ` 和 :class:`Response `。 10 | 11 | Middlewares 是一个轻量级、低层次的系统,可以用来过滤和更改 Frontier 的 requests 和 responses。 12 | 13 | .. _frontier-activating-middleware: 14 | 15 | 激活一个 middleware 16 | ======================= 17 | 18 | 要激活 :class:`Middleware ` component, 需要添加它到 19 | :setting:`MIDDLEWARES` setting(这是一个列表,包含类的路径或者一个 :class:`Middleware ` 对象)。 20 | 21 | 这是一个例子:: 22 | 23 | MIDDLEWARES = [ 24 | 'frontera.contrib.middlewares.domain.DomainMiddleware', 25 | ] 26 | 27 | Middlewares按照它们在列表中定义的相同顺序进行调用,根据你自己的需要安排顺序。 该顺序很重要,因为每个中间件执行不同的操作,并且您的中间件可能依赖于一些先前(或后续的)执行的中间件。 28 | 29 | 最后,记住一些 middlewares 需要通过特殊的 setting。详细请参考 :ref:`each middleware documentation ` 。 30 | 31 | .. _frontier-writing-middleware: 32 | 33 | 写你自己的 middleware 34 | =========================== 35 | 36 | 37 | 写自己的 Frontera middleware 是很简单的。每个 :class:`Middleware ` 是一个继承 :class:`Component ` 的 Python 类。 38 | 39 | 40 | :class:`FrontierManager ` 会通过下面的方法和所有激活的 middlewares 通信。 41 | 42 | 43 | .. class:: frontera.core.components.Middleware 44 | 45 | **Methods** 46 | 47 | .. automethod:: frontera.core.components.Middleware.frontier_start 48 | .. automethod:: frontera.core.components.Middleware.frontier_stop 49 | .. 
automethod:: frontera.core.components.Middleware.add_seeds 50 | 51 | :return: :class:`Request ` object list or ``None`` 52 | 53 | 应该返回 ``None`` 或者 :class:`Request ` 的列表。 54 | 55 | 如果返回 ``None`` , :class:`FrontierManager ` 将不会处理任何中间件,并且种子也不会到达 :class:`Backend ` 。 56 | 57 | 如果返回 :class:`Request ` 列表,该列表将会传给下个中间件。这个过程会在每个激活的中间件重复,直到它到达 :class:`Backend `。 58 | 59 | 如果要过滤任何种子,请不要将其包含在返回的对象列表中。 60 | 61 | .. automethod:: frontera.core.components.Middleware.page_crawled 62 | 63 | :return: :class:`Response ` or ``None`` 64 | 65 | 应该返回 ``None`` 或者一个 :class:`Response ` 对象。 66 | 67 | 如果返回 ``None`` ,:class:`FrontierManager ` 将不会处理任何中间件,并且 :class:`Backend ` 不会被通知。 68 | 69 | 如果返回 :class:`Response `,它将会被传给下个中间件。这个过程会在每个激活的中间件重复,直到它到达 :class:`Backend `。 70 | 71 | 如果要过滤页面,只需返回 None。 72 | 73 | .. automethod:: frontera.core.components.Middleware.request_error 74 | 75 | 76 | :return: :class:`Request ` or ``None`` 77 | 78 | 应该返回 ``None`` 或者一个 :class:`Request ` 对象。 79 | 80 | 如果返回 ``None``,:class:`FrontierManager ` 将不会和其他任何中间件通信,并且 :class:`Backend ` 不会被通知。 81 | 82 | 如果返回一个 :class:`Response ` 对象,它将会被传给下个中间件。这个过程会在每个激活的中间件重复,直到它到达 :class:`Backend `。 83 | 84 | 如果要过滤页面错误,只需返回 None。 85 | 86 | **Class Methods** 87 | 88 | .. automethod:: frontera.core.components.Middleware.from_manager 89 | 90 | 91 | 92 | .. _frontier-built-in-middleware: 93 | 94 | 内置 middleware 参考 95 | ============================= 96 | 97 | 这篇文章描述了 Frontera 所有的 :class:`Middleware ` 组件。如何使用和写自己的 middleware,请参考 :ref:`middleware usage guide. `。 98 | 99 | 有关默认启用的组件列表(及其顺序),请参阅 MIDDLEWARES 设置。 100 | 101 | 102 | .. _frontier-domain-middleware: 103 | 104 | DomainMiddleware 105 | ---------------- 106 | 107 | .. autoclass:: frontera.contrib.middlewares.domain.DomainMiddleware() 108 | 109 | 110 | .. _frontier-url-fingerprint-middleware: 111 | 112 | UrlFingerprintMiddleware 113 | ------------------------ 114 | 115 | .. autoclass:: frontera.contrib.middlewares.fingerprint.UrlFingerprintMiddleware() 116 | .. autofunction:: frontera.utils.fingerprint.hostname_local_fingerprint 117 | 118 | 119 | 120 | .. _frontier-domain-fingerprint-middleware: 121 | 122 | DomainFingerprintMiddleware 123 | --------------------------- 124 | 125 | .. autoclass:: frontera.contrib.middlewares.fingerprint.DomainFingerprintMiddleware() 126 | -------------------------------------------------------------------------------- /frontera/contrib/backends/remote/codecs/msgpack.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ A MsgPack codec for Frontera. Implemented using native msgpack-python library. 
3 | """ 4 | from __future__ import absolute_import 5 | import logging 6 | from msgpack import packb, unpackb 7 | 8 | from frontera.core.codec import BaseDecoder, BaseEncoder 9 | import six 10 | from w3lib.util import to_native_str 11 | 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | def _prepare_request_message(request): 17 | def serialize(obj): 18 | """Recursively walk object's hierarchy.""" 19 | if isinstance(obj, (bool, six.integer_types, float, six.binary_type, six.text_type)) or obj is None: 20 | return obj 21 | elif isinstance(obj, dict): 22 | obj = obj.copy() 23 | for key in obj: 24 | obj[key] = serialize(obj[key]) 25 | return obj 26 | elif isinstance(obj, list): 27 | return [serialize(item) for item in obj] 28 | elif isinstance(obj, tuple): 29 | return tuple(serialize([item for item in obj])) 30 | elif hasattr(obj, '__dict__'): 31 | return serialize(obj.__dict__) 32 | else: 33 | logger.warning('unable to serialize object: {}'.format(obj)) 34 | return None 35 | return [request.url, request.method, request.headers, request.cookies, serialize(request.meta)] 36 | 37 | 38 | def _prepare_response_message(response, send_body): 39 | return [response.url, response.status_code, response.meta, response.body if send_body else None] 40 | 41 | 42 | class Encoder(BaseEncoder): 43 | def __init__(self, request_model, *a, **kw): 44 | self.send_body = True if 'send_body' in kw and kw['send_body'] else False 45 | 46 | def encode_add_seeds(self, seeds): 47 | return packb([b'as', [_prepare_request_message(seed) for seed in seeds]], use_bin_type=True) 48 | 49 | def encode_page_crawled(self, response): 50 | return packb([b'pc', _prepare_response_message(response, self.send_body)], use_bin_type=True) 51 | 52 | def encode_links_extracted(self, request, links): 53 | return packb([b'le', _prepare_request_message(request), [_prepare_request_message(link) for link in links]], 54 | use_bin_type=True) 55 | 56 | def encode_request_error(self, request, error): 57 | return packb([b're', _prepare_request_message(request), str(error)], use_bin_type=True) 58 | 59 | def encode_request(self, request): 60 | return packb(_prepare_request_message(request), use_bin_type=True) 61 | 62 | def encode_update_score(self, request, score, schedule): 63 | return packb([b'us', _prepare_request_message(request), score, schedule], use_bin_type=True) 64 | 65 | def encode_new_job_id(self, job_id): 66 | return packb([b'njid', int(job_id)], use_bin_type=True) 67 | 68 | def encode_offset(self, partition_id, offset): 69 | return packb([b'of', int(partition_id), int(offset)], use_bin_type=True) 70 | 71 | 72 | class Decoder(BaseDecoder): 73 | def __init__(self, request_model, response_model, *a, **kw): 74 | self._request_model = request_model 75 | self._response_model = response_model 76 | 77 | def _response_from_object(self, obj): 78 | url = to_native_str(obj[0]) 79 | return self._response_model(url=url, 80 | status_code=obj[1], 81 | body=obj[3], 82 | request=self._request_model(url=url, 83 | meta=obj[2])) 84 | 85 | def _request_from_object(self, obj): 86 | return self._request_model(url=to_native_str(obj[0]), 87 | method=obj[1], 88 | headers=obj[2], 89 | cookies=obj[3], 90 | meta=obj[4]) 91 | 92 | def decode(self, buffer): 93 | obj = unpackb(buffer, encoding='utf-8') 94 | if obj[0] == b'pc': 95 | return ('page_crawled', 96 | self._response_from_object(obj[1])) 97 | if obj[0] == b'le': 98 | return ('links_extracted', 99 | self._request_from_object(obj[1]), 100 | [self._request_from_object(x) for x in obj[2]]) 101 | if obj[0] == 
b'us': 102 | return ('update_score', self._request_from_object(obj[1]), obj[2], obj[3]) 103 | if obj[0] == b're': 104 | return ('request_error', self._request_from_object(obj[1]), to_native_str(obj[2])) 105 | if obj[0] == b'as': 106 | return ('add_seeds', [self._request_from_object(x) for x in obj[1]]) 107 | if obj[0] == b'njid': 108 | return ('new_job_id', int(obj[1])) 109 | if obj[0] == b'of': 110 | return ('offset', int(obj[1]), int(obj[2])) 111 | return TypeError('Unknown message type') 112 | 113 | def decode_request(self, buffer): 114 | return self._request_from_object(unpackb(buffer, encoding='utf-8')) 115 | -------------------------------------------------------------------------------- /frontera/worker/strategies/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from frontera.core.models import Request 4 | from frontera.contrib.middlewares.fingerprint import UrlFingerprintMiddleware 5 | 6 | from abc import ABCMeta, abstractmethod 7 | import six 8 | 9 | 10 | @six.add_metaclass(ABCMeta) 11 | class BaseCrawlingStrategy(object): 12 | """ 13 | Interface definition for a crawling strategy. 14 | 15 | Before calling these methods strategy worker is adding 'state' key to meta field in every 16 | :class:`Request ` with state of the URL. Pleases refer for the states to HBaseBackend 17 | implementation. 18 | 19 | After exiting from all of these methods states from meta field are passed back and stored in the backend. 20 | """ 21 | 22 | def __init__(self, manager, mb_stream, states_context): 23 | self._mb_stream = mb_stream 24 | self._states_context = states_context 25 | self.url_mw = UrlFingerprintMiddleware(manager) 26 | 27 | @classmethod 28 | def from_worker(cls, manager, mb_stream, states_context): 29 | """ 30 | Called on instantiation in strategy worker. 31 | 32 | :param manager: :class: `Backend ` instance 33 | :param mb_stream: :class: `UpdateScoreStream ` instance 34 | :return: new instance 35 | """ 36 | return cls(manager, mb_stream, states_context) 37 | 38 | @abstractmethod 39 | def add_seeds(self, seeds): 40 | """ 41 | Called when add_seeds event is received from spider log. 42 | 43 | :param list seeds: A list of :class:`Request ` objects. 44 | """ 45 | 46 | @abstractmethod 47 | def page_crawled(self, response): 48 | """ 49 | Called every time document was successfully crawled, and receiving page_crawled event from spider log. 50 | 51 | :param object response: The :class:`Response ` object for the crawled page. 52 | """ 53 | 54 | @abstractmethod 55 | def links_extracted(self, request, links): 56 | """ 57 | Called every time document was successfully crawled, and receiving page_crawled event from spider log. 58 | 59 | :param object request: The :class:`Request ` object for the crawled page. 60 | :param list links: A list of :class:`Request ` objects generated from \ 61 | the links extracted for the crawled page. 62 | """ 63 | 64 | @abstractmethod 65 | def page_error(self, request, error): 66 | """ 67 | Called every time there was error during page downloading. 68 | 69 | :param object request: The fetched with error :class:`Request ` object. 70 | :param str error: A string identifier for the error. 71 | """ 72 | 73 | def finished(self): 74 | """ 75 | Called by Strategy worker, after finishing processing each cycle of spider log. If this method returns true, 76 | then Strategy worker reports that crawling goal is achieved, stops and exits. 
77 | 78 | :return: bool 79 | """ 80 | return False 81 | 82 | def close(self): 83 | """ 84 | Called when strategy worker is about to close crawling strategy. 85 | """ 86 | self._mb_stream.flush() 87 | self._states_context.release() 88 | 89 | def schedule(self, request, score=1.0, dont_queue=False): 90 | """ 91 | Schedule document for crawling with specified score. 92 | 93 | :param request: A :class:`Request ` object. 94 | :param score: float from 0.0 to 1.0 95 | :param dont_queue: bool, True - if no need to schedule, only update the score 96 | """ 97 | self._mb_stream.send(request, score, dont_queue) 98 | 99 | def create_request(self, url, method=b'GET', headers=None, cookies=None, meta=None, body=b''): 100 | """ 101 | Creates request with specified fields, with state fetched from backend. This method only creates request, but 102 | isn't getting it's state from storage. Use self.refresh_states on a batch of requests to get their states 103 | from storage. 104 | 105 | :param url: str 106 | :param method: str 107 | :param headers: dict 108 | :param cookies: dict 109 | :param meta: dict 110 | :param body: str 111 | :return: :class:`Request ` 112 | """ 113 | r = Request(url, method=method, headers=headers, cookies=cookies, meta=meta, body=body) 114 | self.url_mw._add_fingerprint(r) 115 | return r 116 | 117 | def refresh_states(self, requests): 118 | """ 119 | Retrieves states for all requests from storage. 120 | 121 | :param requests: list(:class:`Request `) 122 | """ 123 | self._states_context.refresh_and_keep(requests) 124 | -------------------------------------------------------------------------------- /frontera/contrib/scrapy/converters.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from scrapy.http.request import Request as ScrapyRequest 3 | from scrapy.http.response import Response as ScrapyResponse 4 | 5 | from frontera.core.models import Request as FrontierRequest 6 | from frontera.core.models import Response as FrontierResponse 7 | from frontera.utils.converters import BaseRequestConverter, BaseResponseConverter 8 | from w3lib.util import to_bytes, to_native_str 9 | 10 | 11 | class RequestConverter(BaseRequestConverter): 12 | """Converts between frontera and Scrapy request objects""" 13 | def __init__(self, spider): 14 | self.spider = spider 15 | 16 | def to_frontier(self, scrapy_request): 17 | """request: Scrapy > Frontier""" 18 | if isinstance(scrapy_request.cookies, dict): 19 | cookies = scrapy_request.cookies 20 | else: 21 | cookies = dict(sum([list(d.items()) for d in scrapy_request.cookies], [])) 22 | cb = scrapy_request.callback 23 | if callable(cb): 24 | cb = _find_method(self.spider, cb) 25 | eb = scrapy_request.errback 26 | if callable(eb): 27 | eb = _find_method(self.spider, eb) 28 | 29 | scrapy_meta = scrapy_request.meta 30 | meta = {} 31 | if b'frontier_request' in scrapy_meta: 32 | request = scrapy_meta[b'frontier_request'] 33 | if isinstance(request, FrontierRequest): 34 | meta = request.meta 35 | del scrapy_meta[b'frontier_request'] 36 | 37 | meta.update({ 38 | b'scrapy_callback': cb, 39 | b'scrapy_errback': eb, 40 | b'scrapy_meta': scrapy_meta, 41 | b'origin_is_frontier': True, 42 | }) 43 | if 'redirect_urls' in scrapy_meta: 44 | meta[b'redirect_urls'] = scrapy_meta['redirect_urls'] 45 | return FrontierRequest(url=scrapy_request.url, 46 | method=scrapy_request.method, 47 | headers=scrapy_request.headers, 48 | cookies=cookies, 49 | meta=meta, 50 | body=scrapy_request.body) 51 | 52 | 
def from_frontier(self, frontier_request): 53 | """request: Frontier > Scrapy""" 54 | cb = frontier_request.meta.get(b'scrapy_callback', None) 55 | if cb and self.spider: 56 | cb = _get_method(self.spider, cb) 57 | eb = frontier_request.meta.get(b'scrapy_errback', None) 58 | if eb and self.spider: 59 | eb = _get_method(self.spider, eb) 60 | body = frontier_request.body 61 | meta = frontier_request.meta.get(b'scrapy_meta', {}) 62 | meta[b'frontier_request'] = frontier_request 63 | return ScrapyRequest(url=frontier_request.url, 64 | callback=cb, 65 | errback=eb, 66 | body=body, 67 | method=to_native_str(frontier_request.method), 68 | headers=frontier_request.headers, 69 | cookies=frontier_request.cookies, 70 | meta=meta, 71 | dont_filter=True) 72 | 73 | 74 | class ResponseConverter(BaseResponseConverter): 75 | """Converts between frontera and Scrapy response objects""" 76 | def __init__(self, spider, request_converter): 77 | self.spider = spider 78 | self._request_converter = request_converter 79 | 80 | def to_frontier(self, scrapy_response): 81 | """response: Scrapy > Frontier""" 82 | frontier_request = scrapy_response.meta[b'frontier_request'] 83 | frontier_request.meta[b'scrapy_meta'] = scrapy_response.meta 84 | if 'redirect_urls' in scrapy_response.meta: 85 | frontier_request.meta[b'redirect_urls'] = scrapy_response.meta['redirect_urls'] 86 | del scrapy_response.meta[b'frontier_request'] 87 | return FrontierResponse(url=scrapy_response.url, 88 | status_code=scrapy_response.status, 89 | headers=scrapy_response.headers, 90 | body=scrapy_response.body, 91 | request=frontier_request) 92 | 93 | def from_frontier(self, response): 94 | """response: Frontier > Scrapy""" 95 | return ScrapyResponse(url=response.url, 96 | status=response.status_code, 97 | headers=response.headers, 98 | body=response.body, 99 | request=self._request_converter.from_frontier(response.request)) 100 | 101 | 102 | def _find_method(obj, func): 103 | if obj and hasattr(func, '__self__') and func.__self__ is obj: 104 | return to_bytes(func.__func__.__name__) 105 | else: 106 | raise ValueError("Function %s is not a method of: %s" % (func, obj)) 107 | 108 | 109 | def _get_method(obj, name): 110 | name = to_native_str(name) 111 | try: 112 | return getattr(obj, name) 113 | except AttributeError: 114 | raise ValueError("Method %r not found in: %s" % (name, obj)) 115 | -------------------------------------------------------------------------------- /docs/source/topics/cluster-setup.rst: -------------------------------------------------------------------------------- 1 | =================== 2 | 集群安装指南 3 | =================== 4 | 5 | 这个指南的目标是教你如何初始化爬虫集群,在实践过程中一些步骤可能需要微调。这篇指南假设你使用 Kafka 作为消息总线(官方推荐),当然用 ZeroMQ 也是可以的,但是可靠性会差一些。 6 | 7 | 需要决定的事情 8 | ================ 9 | 10 | * 你抓取的速度, 11 | * 爬虫进程的数量(假设单个爬虫的最大速度是1200个网页/分钟), 12 | * DB worker 和 Strategy worker 的数量。 13 | 14 | 启动之前需要安装的 15 | ================================ 16 | * Kafka, 17 | * HBase (推荐 1.0.x 或更高的版本), 18 | * :doc:`DNS Service ` (推荐但并不是必须的).
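在继续之前,可以先确认上面列出的服务能够从将要运行 Frontera 组件的机器上访问。下面是一个简单的连通性自检示意(非官方脚本;主机名和端口只是常见的默认值,对应后文配置中的 ``KAFKA_LOCATION`` 与 ``HBASE_THRIFT_HOST``/``HBASE_THRIFT_PORT``,请按实际部署修改)::

    import socket

    def check_service(name, host, port, timeout=3.0):
        # 尝试建立 TCP 连接,确认服务端口可达
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        s.settimeout(timeout)
        try:
            s.connect((host, port))
            print('%s reachable at %s:%d' % (name, host, port))
        except socket.error as exc:
            print('%s unreachable at %s:%d (%s)' % (name, host, port, exc))
        finally:
            s.close()

    check_service('Kafka broker', 'localhost', 9092)
    check_service('HBase Thrift server', 'localhost', 9090)
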
19 | 20 | 启动之前需要实现的 21 | ==================================== 22 | * :doc:`Crawling strategy ` 23 | * 爬虫代码 24 | 25 | 配置 Kafka 26 | ================= 27 | 28 | 为 Kafka 消息总线创建所有需要的 topic 29 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 30 | 31 | * :term:`spider log` (`frontier-done` (see :setting:`SPIDER_LOG_TOPIC`)), 设置 topic 分区数与 Strategy worker 实例数相同, 32 | * :term:`spider feed` (`frontier-todo` (see :setting:`SPIDER_FEED_TOPIC`)), 设置 topic 分区数与爬虫数相同, 33 | * :term:`scoring log` (`frontier-score` (see :setting:`SCORING_LOG_TOPIC`)) 34 | 35 | 36 | 配置 HBase 37 | ================= 38 | * 创建一个 namespace ``crawler`` (请参照 :setting:`HBASE_NAMESPACE`), 39 | * 确保原生支持 Snappy 压缩。 40 | 41 | 42 | 配置 Frontera 43 | ==================== 44 | 每个 Frontera 组件需要自己的配置模块,但是一些配置项是共享的,所以我们推荐创建一个公有的配置模块,并在自有的配置中引入这个公有模块。 45 | 46 | 1. 创建一个公有模块并添加如下信息: :: 47 | 48 | from __future__ import absolute_import 49 | from frontera.settings.default_settings import MIDDLEWARES 50 | MAX_NEXT_REQUESTS = 512 51 | SPIDER_FEED_PARTITIONS = 2 # number of spider processes 52 | SPIDER_LOG_PARTITIONS = 2 # worker instances 53 | MIDDLEWARES.extend([ 54 | 'frontera.contrib.middlewares.domain.DomainMiddleware', 55 | 'frontera.contrib.middlewares.fingerprint.DomainFingerprintMiddleware' 56 | ]) 57 | 58 | QUEUE_HOSTNAME_PARTITIONING = True 59 | KAFKA_LOCATION = 'localhost:9092' # your Kafka broker host:port 60 | SCORING_TOPIC = 'frontier-scoring' 61 | URL_FINGERPRINT_FUNCTION='frontera.utils.fingerprint.hostname_local_fingerprint' 62 | 63 | 2. 创建 workers 的公有模块: :: 64 | 65 | from __future__ import absolute_import 66 | from .common import * 67 | 68 | BACKEND = 'frontera.contrib.backends.hbase.HBaseBackend' 69 | 70 | MAX_NEXT_REQUESTS = 2048 71 | NEW_BATCH_DELAY = 3.0 72 | 73 | HBASE_THRIFT_HOST = 'localhost' # HBase Thrift server host and port 74 | HBASE_THRIFT_PORT = 9090 75 | 76 | 3. 创建 DB worker 配置模块: :: 77 | 78 | from __future__ import absolute_import 79 | from .worker import * 80 | 81 | LOGGING_CONFIG='logging-db.conf' # if needed 82 | 83 | 4. 创建 Strategy worker 配置模块: :: 84 | 85 | from __future__ import absolute_import 86 | from .worker import * 87 | 88 | CRAWLING_STRATEGY = '' # path to the crawling strategy class 89 | LOGGING_CONFIG='logging-sw.conf' # if needed 90 | 91 | logging 配置可参考 https://docs.python.org/2/library/logging.config.html 请看 92 | :doc:`list of loggers `. 93 | 94 | 5. 设置爬虫配置模块: :: 95 | 96 | from __future__ import absolute_import 97 | from .common import * 98 | 99 | BACKEND = 'frontera.contrib.backends.remote.messagebus.MessageBusBackend' 100 | KAFKA_GET_TIMEOUT = 0.5 101 | 102 | 103 | 6. 配置 Scrapy settings 模块. 这个模块在 Scrapy 项目文件夹中,并被 scrapy.cfg 引用 。 添加如下:: 104 | 105 | FRONTERA_SETTINGS = '' # module path to your Frontera spider config module 106 | 107 | SCHEDULER = 'frontera.contrib.scrapy.schedulers.frontier.FronteraScheduler' 108 | 109 | SPIDER_MIDDLEWARES = { 110 | 'frontera.contrib.scrapy.middlewares.schedulers.SchedulerSpiderMiddleware': 999, 111 | 'frontera.contrib.scrapy.middlewares.seeds.file.FileSeedLoader': 1, 112 | } 113 | DOWNLOADER_MIDDLEWARES = { 114 | 'frontera.contrib.scrapy.middlewares.schedulers.SchedulerDownloaderMiddleware': 999, 115 | } 116 | 117 | 118 | 启动集群 119 | ==================== 120 | 121 | 首先,启动 DB worker: :: 122 | 123 | # start DB worker only for batch generation 124 | $ python -m frontera.worker.db --config [db worker config module] --no-incoming 125 | ... 
126 | # Then start next one dedicated to spider log processing 127 | $ python -m frontera.worker.db --no-batches --config [db worker config module] 128 | 129 | 130 | 之后,启动strategy workers,每个 spider log topic 的分区需要对应一个 strategy workers 的实例: :: 131 | 132 | $ python -m frontera.worker.strategy --config [strategy worker config] --partition-id 0 133 | $ python -m frontera.worker.strategy --config [strategy worker config] --partition-id 1 134 | ... 135 | $ python -m frontera.worker.strategy --config [strategy worker config] --partition-id N 136 | 137 | 你应该注意到所有的进程会向 log 中写信息。如果没有数据传递相关的 log 信息也是正常的,因为现在系统中还没有种子 URLS。 138 | 139 | 140 | 让我们在文件中每行放一个 URL 作为种子,来启动爬虫。每个爬虫进程对应一个 spider feed topic 的分区: :: 141 | 142 | $ scrapy crawl [spider] -L INFO -s SEEDS_SOURCE = 'seeds.txt' -s SPIDER_PARTITION_ID=0 143 | ... 144 | $ scrapy crawl [spider] -L INFO -s SPIDER_PARTITION_ID=1 145 | $ scrapy crawl [spider] -L INFO -s SPIDER_PARTITION_ID=2 146 | ... 147 | $ scrapy crawl [spider] -L INFO -s SPIDER_PARTITION_ID=N 148 | 149 | 最后你应该启动 N 个爬虫进程。通常一个爬虫实例从 ``SEEDS_SOURCE`` 中读取种子发送给 Frontera 集群就足够了。只有爬虫的任务队列为空时才会读取种子。也可以从配置文件中读取 :setting:`SPIDER_PARTITION_ID` 。 150 | 151 | 一段时间以后,种子会被准备好,以供爬虫抓取。爬虫真正启动了。 152 | -------------------------------------------------------------------------------- /frontera/utils/tester.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | from collections import OrderedDict, deque 4 | from six.moves.urllib.parse import urlparse 5 | import six 6 | from six.moves import range 7 | 8 | 9 | class FrontierTester(object): 10 | 11 | def __init__(self, frontier, graph_manager, downloader_simulator, max_next_requests=0): 12 | self.frontier = frontier 13 | self.graph_manager = graph_manager 14 | self.max_next_requests = max_next_requests 15 | self.sequence = [] 16 | self.downloader_simulator = downloader_simulator 17 | 18 | def run(self, add_all_pages=False): 19 | if not self.frontier.auto_start: 20 | self.frontier.start() 21 | if not add_all_pages: 22 | self._add_seeds() 23 | else: 24 | self._add_all() 25 | while True: 26 | result = self._run_iteration() 27 | self.sequence.append(result) 28 | requests, iteration, dl_info = result 29 | if not requests and self.downloader_simulator.idle(): 30 | break 31 | self.frontier.stop() 32 | 33 | def _add_seeds(self): 34 | self.frontier.add_seeds([self._make_request(seed.url) for seed in self.graph_manager.seeds]) 35 | 36 | def _add_all(self): 37 | for page in self.graph_manager.pages: 38 | if page.is_seed: 39 | self.frontier.add_seeds([self._make_request(page.url)]) 40 | if not page.has_errors: 41 | for link in page.links: 42 | self.frontier.add_seeds([self._make_request(link.url)]) 43 | 44 | def _make_request(self, url): 45 | r = self.frontier.request_model(url=url, 46 | headers={ 47 | b'X-Important-Header': b'Frontera' 48 | }, 49 | method=b'POST', 50 | cookies={b'currency': b'USD'}) 51 | r.meta[b'this_param'] = b'should be passed over' 52 | return r 53 | 54 | def _make_response(self, url, status_code, request): 55 | return self.frontier.response_model(url=url, status_code=status_code, request=request) 56 | 57 | def _run_iteration(self): 58 | kwargs = self.downloader_simulator.downloader_info() 59 | if self.max_next_requests: 60 | kwargs['max_next_requests'] = self.max_next_requests 61 | 62 | requests = self.frontier.get_next_requests(**kwargs) 63 | 64 | self.downloader_simulator.update(requests) 65 | 66 | for page_to_crawl in self.downloader_simulator.download(): 67 | 
crawled_page = self.graph_manager.get_page(url=page_to_crawl.url) 68 | if not crawled_page.has_errors: 69 | response = self._make_response(url=page_to_crawl.url, 70 | status_code=crawled_page.status, 71 | request=page_to_crawl) 72 | self.frontier.page_crawled(response=response) 73 | self.frontier.links_extracted(request=response.request, 74 | links=[self._make_request(link.url) for link in crawled_page.links]) 75 | else: 76 | self.frontier.request_error(request=page_to_crawl, 77 | error=crawled_page.status) 78 | assert page_to_crawl.meta[b'this_param'] == b'should be passed over' 79 | assert page_to_crawl.headers[b'X-Important-Header'] == b'Frontera' 80 | assert page_to_crawl.method == b'POST' 81 | assert page_to_crawl.cookies[b'currency'] == b'USD' 82 | return (requests, self.frontier.iteration, kwargs) 83 | 84 | 85 | class BaseDownloaderSimulator(object): 86 | def __init__(self): 87 | self.requests = None 88 | 89 | def update(self, requests): 90 | self.requests = requests 91 | 92 | def download(self): 93 | return self.requests 94 | 95 | def downloader_info(self): 96 | return { 97 | 'key_type': 'domain', 98 | 'overused_keys': [] 99 | } 100 | 101 | def idle(self): 102 | return True 103 | 104 | 105 | class DownloaderSimulator(BaseDownloaderSimulator): 106 | def __init__(self, rate): 107 | self._requests_per_slot = rate 108 | self.slots = OrderedDict() 109 | super(DownloaderSimulator, self).__init__() 110 | 111 | def update(self, requests): 112 | for request in requests: 113 | hostname = urlparse(request.url).hostname or '' 114 | self.slots.setdefault(hostname, deque()).append(request) 115 | 116 | def download(self): 117 | output = [] 118 | _trash_can = [] 119 | for key, requests in six.iteritems(self.slots): 120 | for i in range(min(len(requests), self._requests_per_slot)): 121 | output.append(requests.popleft()) 122 | if not requests: 123 | _trash_can.append(key) 124 | 125 | for key in _trash_can: 126 | del self.slots[key] 127 | return output 128 | 129 | def downloader_info(self): 130 | info = { 131 | 'key_type': 'domain', 132 | 'overused_keys': [] 133 | } 134 | for key, requests in six.iteritems(self.slots): 135 | if len(requests) > self._requests_per_slot: 136 | info['overused_keys'].append(key) 137 | return info 138 | 139 | def idle(self): 140 | return len(self.slots) == 0 141 | -------------------------------------------------------------------------------- /frontera/worker/server.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from logging import getLogger 4 | from json import JSONDecoder, JSONEncoder 5 | from sys import exc_info 6 | from traceback import format_exception 7 | 8 | from twisted.web import server, resource 9 | 10 | from frontera.utils.async import listen_tcp 11 | 12 | logger = getLogger("cf-server") 13 | 14 | 15 | def jsonrpc_error(id, code, message, data=None): 16 | """Create JSON-RPC error response""" 17 | return { 18 | 'jsonrpc': '2.0', 19 | 'error': { 20 | 'code': code, 21 | 'message': message, 22 | 'data': data, 23 | }, 24 | 'id': id, 25 | } 26 | 27 | 28 | def jsonrpc_result(id, result): 29 | """Create JSON-RPC result response""" 30 | return { 31 | 'jsonrpc': '2.0', 32 | 'result': result, 33 | 'id': id, 34 | } 35 | 36 | 37 | class JsonRpcError(Exception): 38 | 39 | def __init__(self, code, message): 40 | self.code = code 41 | self.message = message 42 | 43 | def __call__(self, id): 44 | return jsonrpc_error(id, self.code, self.message) 45 | 46 | 47 | class 
JsonResource(resource.Resource): 48 | 49 | json_encoder = JSONEncoder() 50 | json_decoder = JSONDecoder() 51 | 52 | def render(self, txrequest): 53 | r = resource.Resource.render(self, txrequest) 54 | return self.render_object(r, txrequest) 55 | 56 | def render_object(self, obj, txrequest): 57 | r = self.json_encoder.encode(obj) + "\n" 58 | txrequest.setHeader('Content-Type', 'application/json') 59 | txrequest.setHeader('Access-Control-Allow-Origin', '*') 60 | txrequest.setHeader('Access-Control-Allow-Methods', 'GET, POST, PATCH, PUT, DELETE') 61 | txrequest.setHeader('Access-Control-Allow-Headers', 'X-Requested-With') 62 | txrequest.setHeader('Content-Length', len(r)) 63 | return r 64 | 65 | def parse_jsonrpc(self, txrequest): 66 | if hasattr(txrequest.content, 'read'): 67 | data = txrequest.content.read() 68 | else: 69 | data = txrequest.content.getvalue() 70 | return self.json_decoder.decode(data) 71 | 72 | 73 | class StatusResource(JsonResource): 74 | 75 | ws_name = 'status' 76 | 77 | def __init__(self, worker): 78 | self.worker = worker 79 | JsonResource.__init__(self) 80 | 81 | def render_GET(self, txrequest): 82 | return { 83 | 'is_finishing': self.worker.slot.is_finishing, 84 | 'disable_new_batches': self.worker.slot.no_batches, 85 | 'stats': self.worker.stats 86 | } 87 | 88 | 89 | class JsonRpcResource(JsonResource): 90 | 91 | ws_name = 'jsonrpc' 92 | 93 | def __init__(self): 94 | JsonResource.__init__(self) 95 | 96 | def render_POST(self, txrequest): 97 | jrequest = self.parse_jsonrpc(txrequest) 98 | method = jrequest['method'] 99 | try: 100 | try: 101 | return self.process_request(method, jrequest) 102 | except Exception as err: 103 | if isinstance(err, JsonRpcError): 104 | raise err 105 | trace_lines = format_exception(*exc_info()) 106 | raise JsonRpcError(500, "Error processing request: %s" % (str("").join(trace_lines))) 107 | except JsonRpcError as err: 108 | return err(jrequest['id']) 109 | 110 | 111 | class WorkerJsonRpcResource(JsonRpcResource): 112 | 113 | def __init__(self, worker): 114 | self.worker = worker 115 | JsonRpcResource.__init__(self) 116 | 117 | def process_request(self, method, jrequest): 118 | if method == 'disable_new_batches': 119 | self.worker.disable_new_batches() 120 | return jsonrpc_result(jrequest['id'], "success") 121 | 122 | if method == 'enable_new_batches': 123 | self.worker.enable_new_batches() 124 | return jsonrpc_result(jrequest['id'], "success") 125 | raise JsonRpcError(400, "Unknown method") 126 | 127 | 128 | class RootResource(JsonResource): 129 | 130 | def render_GET(self, txrequest): 131 | return {'resources': list(self.children.keys())} 132 | 133 | def getChild(self, name, txrequest): 134 | if name == '': 135 | return self 136 | return JsonResource.getChild(self, name, txrequest) 137 | 138 | 139 | class JsonRpcService(server.Site): 140 | def __init__(self, root, settings): 141 | logfile = settings.get('JSONRPC_LOGFILE') 142 | self.portrange = settings.get('JSONRPC_PORT', [6023, 6073]) 143 | self.host = settings.get('JSONRPC_HOST', '127.0.0.1') 144 | 145 | server.Site.__init__(self, root, logPath=logfile) 146 | self.noisy = False 147 | 148 | def start_listening(self): 149 | self.port = listen_tcp(self.portrange, self.host, self) 150 | h = self.port.getHost() 151 | logger.info('Web service listening on %(host)s:%(port)d'.format(host=h.host, port=h.port)) 152 | 153 | def stop_listening(self): 154 | self.port.stopListening() 155 | 156 | 157 | class WorkerJsonRpcService(JsonRpcService): 158 | def __init__(self, worker, settings): 159 | root = 
RootResource() 160 | root.putChild('status', StatusResource(worker)) 161 | root.putChild('jsonrpc', WorkerJsonRpcResource(worker)) 162 | JsonRpcService.__init__(self, root, settings) 163 | self.worker = worker 164 | 165 | def start_listening(self): 166 | JsonRpcService.start_listening(self) 167 | address = self.port.getHost() 168 | self.worker.set_process_info("%s:%d" % (address.host, address.port)) 169 | -------------------------------------------------------------------------------- /frontera/utils/graphs/manager.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from sqlalchemy import create_engine 3 | from sqlalchemy.orm import sessionmaker 4 | 5 | from .models import Base, CrawlPage 6 | from .data import CrawlSiteData, CrawlSiteListData 7 | 8 | DEFAULT_ENGINE = 'sqlite:///:memory:' 9 | 10 | 11 | class CrawlGraphManager(object): 12 | def __init__(self, engine=DEFAULT_ENGINE, autocommit=False, autoflush=False, 13 | echo=False, drop_all_tables=False, clear_content=False): 14 | self.engine = create_engine(engine, echo=echo) 15 | if drop_all_tables: 16 | Base.metadata.drop_all(self.engine) 17 | Base.metadata.create_all(self.engine) 18 | self.Session = sessionmaker() 19 | self.Session.configure(bind=self.engine, autocommit=autocommit, autoflush=autoflush) 20 | self.session = self.Session() 21 | if clear_content: 22 | for name, table in Base.metadata.tables.items(): 23 | self.session.execute(table.delete()) 24 | 25 | @property 26 | def pages(self): 27 | return [page for page in CrawlPage.query(self.session).all()] 28 | 29 | @property 30 | def seeds(self): 31 | return self.session.query(CrawlPage).filter_by(is_seed=True).all() 32 | 33 | def add_page(self, url, status=200, n_redirects=0, is_seed=False, commit=True): 34 | page, created = CrawlPage.get_or_create(self.session, url=url) 35 | if created: 36 | page.is_seed = is_seed 37 | page.status = status 38 | page.n_redirects = n_redirects 39 | if commit: 40 | self.session.commit() 41 | return page 42 | 43 | def add_link(self, page, url, commit=True, status=200): 44 | link_page, created = CrawlPage.get_or_create(self.session, url=url) 45 | if created: 46 | link_page.status = status 47 | if link_page not in page.links: 48 | page.links.append(link_page) 49 | if commit: 50 | self.session.commit() 51 | return link_page 52 | 53 | def get_page(self, url): 54 | return self.session.query(CrawlPage).filter_by(url=url).first() 55 | 56 | def add_site(self, site, default_status=200, default_n_redirects=0): 57 | pages = site.pages if isinstance(site, CrawlSiteData) else site 58 | for i, (info, links) in enumerate(pages): 59 | if isinstance(info, tuple): 60 | if len(info) == 2: 61 | status, page_url, n_redirects = (info[0], info[1], default_n_redirects) 62 | else: 63 | status, page_url, n_redirects = info 64 | else: 65 | status, page_url, n_redirects = (default_status, info, default_n_redirects) 66 | page = self.add_page(url=page_url, status=status, n_redirects=n_redirects, is_seed=(i == 0)) 67 | for link_url in links: 68 | self.add_link(page=page, url=link_url, status=default_status) 69 | 70 | def add_site_list(self, graph, default_status=200, default_n_redirects=0): 71 | sites = graph.sites if isinstance(graph, CrawlSiteListData) else graph 72 | for site in sites: 73 | self.add_site(site=site, default_status=default_status, default_n_redirects=default_n_redirects) 74 | 75 | def save(self): 76 | self.session.commit() 77 | 78 | def render(self, filename, label='', labelloc='t', 
labeljust='c', 79 | rankdir="TB", ranksep=0.7, 80 | fontname='Arial', fontsize=24, 81 | use_urls=False, 82 | node_fixedsize='true', nodesep=0.1, node_width=0.85, node_height=0.85, node_fontsize=15, 83 | include_ids=False): 84 | import pydot 85 | 86 | # Graph 87 | graph_args = { 88 | "rankdir": rankdir, 89 | "ranksep": ranksep, 90 | "nodesep": nodesep, 91 | "fontname": fontname, 92 | "fontsize": fontsize, 93 | } 94 | if label: 95 | graph_args.update({ 96 | "labelloc": labelloc, 97 | "labeljust": labeljust, 98 | "label": label 99 | }) 100 | graph = pydot.Dot(**graph_args) 101 | 102 | # Node 103 | node_args = { 104 | "fontsize": node_fontsize, 105 | } 106 | if use_urls: 107 | node_seed_shape = 'rectangle' 108 | node_shape = 'oval' 109 | else: 110 | node_seed_shape = 'square' 111 | node_shape = 'circle' 112 | node_args.update({ 113 | "fixedsize": node_fixedsize, 114 | "width": node_width, 115 | "height": node_height, 116 | }) 117 | 118 | graph.set_node_defaults(**node_args) 119 | for page in self.pages: 120 | graph.add_node(pydot.Node(name=self._clean_page_name(page, include_id=include_ids), 121 | fontname=fontname, 122 | fontsize=node_fontsize, 123 | shape=node_seed_shape if page.is_seed else node_shape)) 124 | for link in page.links: 125 | graph.add_edge(pydot.Edge(self._clean_page_name(page, include_id=include_ids), 126 | self._clean_page_name(link, include_id=include_ids))) 127 | graph.write_png(filename) 128 | 129 | def _clean_page_name(self, page, include_id): 130 | cleaned_name = page.url 131 | cleaned_name = cleaned_name.replace('http://', '') 132 | cleaned_name = cleaned_name.replace('https://', '') 133 | if include_id: 134 | cleaned_name = "%d. %s" % (page.id, cleaned_name) 135 | return cleaned_name -------------------------------------------------------------------------------- /frontera/contrib/backends/sqlalchemy/revisiting.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | import logging 4 | from datetime import datetime, timedelta 5 | from time import time, sleep 6 | from calendar import timegm 7 | 8 | from sqlalchemy import Column, BigInteger 9 | 10 | from frontera import Request 11 | from frontera.contrib.backends.partitioners import Crc32NamePartitioner 12 | from frontera.contrib.backends.sqlalchemy import SQLAlchemyBackend 13 | from frontera.contrib.backends.sqlalchemy.models import QueueModelMixin, DeclarativeBase 14 | from frontera.core.components import Queue as BaseQueue, States 15 | from frontera.utils.misc import get_crc32 16 | from frontera.utils.url import parse_domain_from_url_fast 17 | from six.moves import range 18 | 19 | 20 | def utcnow_timestamp(): 21 | d = datetime.utcnow() 22 | return timegm(d.timetuple()) 23 | 24 | 25 | class RevisitingQueueModel(QueueModelMixin, DeclarativeBase): 26 | __tablename__ = 'revisiting_queue' 27 | 28 | crawl_at = Column(BigInteger, nullable=False) 29 | 30 | 31 | def retry_and_rollback(func): 32 | def func_wrapper(self, *args, **kwargs): 33 | tries = 5 34 | while True: 35 | try: 36 | return func(self, *args, **kwargs) 37 | except Exception as exc: 38 | self.logger.exception(exc) 39 | self.session.rollback() 40 | sleep(5) 41 | tries -= 1 42 | if tries > 0: 43 | self.logger.info("Tries left %i" % tries) 44 | continue 45 | else: 46 | raise exc 47 | return func_wrapper 48 | 49 | 50 | class RevisitingQueue(BaseQueue): 51 | def __init__(self, session_cls, queue_cls, partitions): 52 | self.session = session_cls() 53 | 
self.queue_model = queue_cls 54 | self.logger = logging.getLogger("sqlalchemy.revisiting.queue") 55 | self.partitions = [i for i in range(0, partitions)] 56 | self.partitioner = Crc32NamePartitioner(self.partitions) 57 | 58 | def frontier_stop(self): 59 | self.session.close() 60 | 61 | def get_next_requests(self, max_n_requests, partition_id, **kwargs): 62 | results = [] 63 | try: 64 | for item in self.session.query(self.queue_model).\ 65 | filter(RevisitingQueueModel.crawl_at <= utcnow_timestamp(), 66 | RevisitingQueueModel.partition_id == partition_id).\ 67 | limit(max_n_requests): 68 | method = 'GET' if not item.method else item.method 69 | results.append(Request(item.url, method=method, meta=item.meta, headers=item.headers, 70 | cookies=item.cookies)) 71 | self.session.delete(item) 72 | self.session.commit() 73 | except Exception as exc: 74 | self.logger.exception(exc) 75 | self.session.rollback() 76 | return results 77 | 78 | @retry_and_rollback 79 | def schedule(self, batch): 80 | to_save = [] 81 | for fprint, score, request, schedule in batch: 82 | if schedule: 83 | _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url) 84 | if not hostname: 85 | self.logger.error("Can't get hostname for URL %s, fingerprint %s" % (request.url, fprint)) 86 | partition_id = self.partitions[0] 87 | host_crc32 = 0 88 | else: 89 | partition_id = self.partitioner.partition(hostname, self.partitions) 90 | host_crc32 = get_crc32(hostname) 91 | schedule_at = request.meta[b'crawl_at'] if b'crawl_at' in request.meta else utcnow_timestamp() 92 | q = self.queue_model(fingerprint=fprint, score=score, url=request.url, meta=request.meta, 93 | headers=request.headers, cookies=request.cookies, method=request.method, 94 | partition_id=partition_id, host_crc32=host_crc32, created_at=time()*1E+6, 95 | crawl_at=schedule_at) 96 | to_save.append(q) 97 | request.meta[b'state'] = States.QUEUED 98 | self.session.bulk_save_objects(to_save) 99 | self.session.commit() 100 | 101 | @retry_and_rollback 102 | def count(self): 103 | return self.session.query(self.queue_model).count() 104 | 105 | 106 | class Backend(SQLAlchemyBackend): 107 | 108 | def _create_queue(self, settings): 109 | self.interval = settings.get("SQLALCHEMYBACKEND_REVISIT_INTERVAL") 110 | assert isinstance(self.interval, timedelta) 111 | self.interval = self.interval.total_seconds() 112 | return RevisitingQueue(self.session_cls, RevisitingQueueModel, settings.get('SPIDER_FEED_PARTITIONS')) 113 | 114 | def _schedule(self, requests): 115 | batch = [] 116 | for request in requests: 117 | if request.meta[b'state'] in [States.NOT_CRAWLED]: 118 | request.meta[b'crawl_at'] = utcnow_timestamp() 119 | elif request.meta[b'state'] in [States.CRAWLED, States.ERROR]: 120 | request.meta[b'crawl_at'] = utcnow_timestamp() + self.interval 121 | else: 122 | continue # QUEUED 123 | batch.append((request.meta[b'fingerprint'], self._get_score(request), request, True)) 124 | self.queue.schedule(batch) 125 | self.metadata.update_score(batch) 126 | self.queue_size += len(batch) 127 | 128 | def page_crawled(self, response): 129 | super(Backend, self).page_crawled(response) 130 | self.states.set_states(response.request) 131 | self._schedule([response.request]) 132 | self.states.update_cache(response.request) 133 | -------------------------------------------------------------------------------- /frontera/contrib/scrapy/schedulers/recording.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | import 
pprint 3 | 4 | from scrapy.core.scheduler import Scheduler 5 | from scrapy.http import Request 6 | from scrapy import log 7 | 8 | from frontera import graphs 9 | 10 | # Default Values 11 | DEFAULT_RECORDER_ENABLED = True 12 | DEFAULT_RECORDER_STORAGE_DROP_ALL_TABLES = True 13 | DEFAULT_RECORDER_STORAGE_CLEAR_CONTENT = True 14 | 15 | STATS_PREFIX = 'recorder' 16 | 17 | 18 | class StatsManager(object): 19 | """ 20 | 'recorder/pages_count': xx, 21 | 'recorder/seeds_count': xx, 22 | 'recorder/links_count': xx, 23 | """ 24 | def __init__(self, stats, prefix=STATS_PREFIX): 25 | self.stats = stats 26 | self.prefix = prefix 27 | 28 | def add_page(self, is_seed=False): 29 | self._inc_value('pages_count') 30 | if is_seed: 31 | self._inc_value('seeds_count') 32 | 33 | def remove_pages(self, count): 34 | self._inc_value('pages_count', -count) 35 | 36 | def add_link(self): 37 | self._inc_value('links_count') 38 | 39 | def remove_links(self, count): 40 | self._inc_value('links_count', -count) 41 | 42 | def _get_stats_name(self, variable): 43 | return '%s/%s' % (self.prefix, variable) 44 | 45 | def _inc_value(self, variable, count=1): 46 | self.stats.inc_value(self._get_stats_name(variable), count) 47 | 48 | def _set_value(self, variable, value): 49 | self.stats.set_value(self._get_stats_name(variable), value) 50 | 51 | 52 | class RecorderScheduler(Scheduler): 53 | 54 | def open(self, spider): 55 | super(RecorderScheduler, self).open(spider) 56 | 57 | self.stats_manager = StatsManager(spider.crawler.stats) 58 | 59 | settings = spider.crawler.settings 60 | self.recorder_enabled = settings.get('RECORDER_ENABLED', DEFAULT_RECORDER_ENABLED) 61 | 62 | if not self.recorder_enabled: 63 | log.msg('Recorder disabled!', log.WARNING) 64 | return 65 | 66 | log.msg('Starting recorder', log.INFO) 67 | 68 | recorder_storage = settings.get('RECORDER_STORAGE_ENGINE', None) 69 | if not recorder_storage: 70 | self.recorder_enabled = False 71 | log.msg('Missing Recorder storage! 
Recorder disabled...', log.WARNING) 72 | return 73 | 74 | self.graph = graphs.Manager( 75 | engine=recorder_storage, 76 | drop_all_tables=settings.getbool('RECORDER_STORAGE_DROP_ALL_TABLES', 77 | DEFAULT_RECORDER_STORAGE_DROP_ALL_TABLES), 78 | clear_content=settings.getbool('RECORDER_STORAGE_CLEAR_CONTENT', 79 | DEFAULT_RECORDER_STORAGE_CLEAR_CONTENT)) 80 | 81 | def close(self, reason): 82 | super(RecorderScheduler, self).close(reason) 83 | if self.recorder_enabled: 84 | log.msg('Finishing recorder (%s)' % reason, log.INFO) 85 | pages = self.graph.session.query(graphs.Page).filter_by(status=None).all() 86 | for page in pages: 87 | n_deleted_links = self.graph.session.query(graphs.Relation).filter_by(child_id=page.id).delete() 88 | if n_deleted_links: 89 | self.stats_manager.remove_links(n_deleted_links) 90 | n_deleted_pages = self.graph.session.query(graphs.Page).filter_by(status=None).delete() 91 | if n_deleted_pages: 92 | self.stats_manager.remove_pages(n_deleted_pages) 93 | self.graph.save() 94 | 95 | def enqueue_request(self, request): 96 | if not request.dont_filter and self.df.request_seen(request): 97 | self.df.log(request, self.spider) 98 | return 99 | dqok = self._dqpush(request) 100 | if dqok: 101 | self.stats.inc_value('scheduler/enqueued/disk', spider=self.spider) 102 | else: 103 | self._mqpush(request) 104 | self.stats.inc_value('scheduler/enqueued/memory', spider=self.spider) 105 | self.stats.inc_value('scheduler/enqueued', spider=self.spider) 106 | if self.recorder_enabled: 107 | is_seed = b'rule' not in request.meta and \ 108 | b'origin_is_recorder' not in request.meta 109 | page = self.graph.add_page(url=request.url, is_seed=is_seed) 110 | self.stats_manager.add_page(is_seed) 111 | request.meta[b'is_seed'] = is_seed 112 | request.meta[b'page'] = page 113 | 114 | def next_request(self): 115 | request = super(RecorderScheduler, self).next_request() 116 | if self.recorder_enabled and request: 117 | request.meta[b'origin_is_recorder'] = True 118 | return request 119 | 120 | def process_spider_output(self, response, result, spider): 121 | if not self.recorder_enabled: 122 | for r in result: 123 | yield r 124 | return 125 | 126 | page = response.meta[b'page'] 127 | page.status = response.status 128 | self.graph.save() 129 | requests = [r for r in result if isinstance(r, Request)] 130 | for request in requests: 131 | link = self.graph.add_link(page=page, url=request.url) 132 | request.meta[b'page'] = link 133 | request.meta[b'referer'] = page 134 | self.stats_manager.add_link() 135 | yield request 136 | 137 | def process_exception(self, request, exception, spider): 138 | if self.recorder_enabled: 139 | error_code = self._get_exception_code(exception) 140 | page = request.meta[b'page'] 141 | page.status = error_code 142 | self.graph.save() 143 | 144 | def _get_exception_code(self, exception): 145 | try: 146 | return exception.__class__.__name__ 147 | except: 148 | return '?' 149 | -------------------------------------------------------------------------------- /docs/source/topics/frontier-api.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Frontera API 3 | ============ 4 | 5 | 本节介绍了 Frontera 核心API,适用于中间件和后端的开发人员。 6 | 7 | Frontera API / Manager 8 | ====================== 9 | 10 | Frontera API的主要入口点是 :class:`FrontierManager ` 对象,通过from_manager类方法传递给中间件和后端。该对象提供对所有Frontera核心组件的访问,并且是中间件和后端访问它们并将其功能挂接到Frontera中的唯一方法。 11 | 12 | :class:`FrontierManager ` 负责加载安装的中间件和后端,以及用于管理整个 frontier 的数据流。 13 | 14 | .. 
_frontier-loading-from-settings: 15 | 16 | 从settings加载 17 | ===================== 18 | 19 | 尽管 :class:`FrontierManager ` 可以通过参数初始化,但最常用的初始化方法还是使用 :doc:`Frontera Settings ` 。 20 | 21 | 这个可以通过 :attr:`from_settings ` 类方法实现,使用字符串路径:: 22 | 23 | >>> from frontera import FrontierManager 24 | >>> frontier = FrontierManager.from_settings('my_project.frontier.settings') 25 | 26 | 或者一个 :class:`BaseSettings ` 对象:: 27 | 28 | >>> from frontera import FrontierManager, Settings 29 | >>> settings = Settings() 30 | >>> settings.MAX_PAGES = 0 31 | >>> frontier = FrontierManager.from_settings(settings) 32 | 33 | 也可以无参数初始化,这种情况下 frontier 会使用 :ref:`默认配置 ` :: 34 | 35 | >>> from frontera import FrontierManager, Settings 36 | >>> frontier = FrontierManager.from_settings() 37 | 38 | 39 | Frontier Manager 40 | ================ 41 | 42 | 43 | .. autoclass:: frontera.core.manager.FrontierManager 44 | 45 | **Attributes** 46 | 47 | .. autoattribute:: frontera.core.manager.FrontierManager.request_model 48 | .. autoattribute:: frontera.core.manager.FrontierManager.response_model 49 | .. autoattribute:: frontera.core.manager.FrontierManager.backend 50 | .. autoattribute:: frontera.core.manager.FrontierManager.middlewares 51 | .. autoattribute:: frontera.core.manager.FrontierManager.test_mode 52 | .. autoattribute:: frontera.core.manager.FrontierManager.max_requests 53 | .. autoattribute:: frontera.core.manager.FrontierManager.max_next_requests 54 | .. autoattribute:: frontera.core.manager.FrontierManager.auto_start 55 | .. autoattribute:: frontera.core.manager.FrontierManager.settings 56 | .. autoattribute:: frontera.core.manager.FrontierManager.iteration 57 | .. autoattribute:: frontera.core.manager.FrontierManager.n_requests 58 | .. autoattribute:: frontera.core.manager.FrontierManager.finished 59 | 60 | **API Methods** 61 | 62 | .. automethod:: frontera.core.manager.FrontierManager.start 63 | .. automethod:: frontera.core.manager.FrontierManager.stop 64 | .. automethod:: frontera.core.manager.FrontierManager.add_seeds 65 | .. automethod:: frontera.core.manager.FrontierManager.get_next_requests 66 | .. automethod:: frontera.core.manager.FrontierManager.page_crawled 67 | .. automethod:: frontera.core.manager.FrontierManager.request_error 68 | 69 | **Class Methods** 70 | 71 | .. automethod:: frontera.core.manager.FrontierManager.from_settings 72 | 73 | 74 | .. _frontier-start-stop: 75 | 76 | 启动/停止 frontier 77 | ============================== 78 | 79 | 有时,frontier 组件需要执行初始化和最终化操作。frontier 通过 :attr:`start() ` 和 :attr:`stop() ` 方法去通知不同组件启动或者停止。 80 | 81 | 默认 :attr:`auto_start ` 值是激活的,这意味着在创建 :class:`FrontierManager ` 对象后,组件将被通知。如果您需要对初始化不同组件时进行更精细的控制,请停用 :attr:`auto_start ` 并手动调用frontier API :attr:`start() ` 和 :attr:`stop() ` 方法。 82 | 83 | .. note:: 84 | 当 :attr:`auto_start ` 处于激活状态时,Frontier :attr:`stop() ` 方法不会自动调用(因为frontier 不知道抓取状态)。如果您需要通知 frontier 组件,您应该手动调用该方法。 85 | 86 | 87 | .. _frontier-iterations: 88 | 89 | Frontier 迭代 90 | =================== 91 | 92 | 一旦 frontier 运行,通常的过程就是 :ref:`data flow ` 部分所描述的过程。 93 | 94 | 爬虫调用 :attr:`get_next_requests() ` 方法请求接下来要抓取的页面。每次 frontier 返回一个非空列表(可用数据),就是我们所说的前沿迭代。当前 frontier 迭代可以通过 :attr:`iteration ` 属性访问。 95 | 96 | 97 | .. 
_frontier-finish: 98 | 99 | 结束 frontier 100 | ====================== 101 | 102 | 抓取过程可以被爬虫程序或者 Frontera 停止。当返回最大页数时,Frontera 将结束。此限制由 :attr:`max_requests ` 属性控制( :setting:`MAX_REQUESTS` 设置)。 103 | 104 | 如果 :attr:`max_requests ` 设置为0,那么 frontier 会无限抓取下去。 105 | 106 | 一旦 frontier 完成,:attr:`get_next_requests ` 方法将不再返回任何页面,并且 :attr:`finished ` 属性将为True。 107 | 108 | .. _frontier-test-mode: 109 | 110 | 组件对象 111 | ================= 112 | 113 | .. autoclass:: frontera.core.components.Component 114 | 115 | **Attributes** 116 | 117 | .. autoattribute:: frontera.core.components.Component.name 118 | 119 | **Abstract methods** 120 | 121 | .. automethod:: frontera.core.components.Component.frontier_start 122 | .. automethod:: frontera.core.components.Component.frontier_stop 123 | .. automethod:: frontera.core.components.Component.add_seeds 124 | .. automethod:: frontera.core.components.Component.page_crawled 125 | .. automethod:: frontera.core.components.Component.request_error 126 | 127 | **Class Methods** 128 | 129 | .. automethod:: frontera.core.components.Component.from_manager 130 | 131 | 132 | 测试模式 133 | ========= 134 | 135 | 在某些情况下,在测试中,frontier 组件需要采用与通常不同的方式(例如,在测试模式下解析域URL时,:ref:`domain middleware ` 会接受诸如 ``'A1'`` 或者 ``'B1'`` 之类的非有效URL)。 136 | 137 | 组件可以通过 :attr:`test_mode ` 属性知道 frontier 是否处于测试模式。 138 | 139 | 140 | .. _frontier-another-ways: 141 | 142 | 使用 frontier 的其他方法 143 | ================================== 144 | 145 | 与 frontier 通信也可以通过HTTP API或队列系统等其他机制完成。这些功能暂时不可用,但希望包含在将来的版本中。 -------------------------------------------------------------------------------- /frontera/core/models.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | import copy 3 | from w3lib.util import to_bytes, to_native_str 4 | from w3lib.url import safe_url_string 5 | 6 | 7 | class FrontierObject(object): 8 | def copy(self): 9 | return copy.copy(self) 10 | 11 | 12 | class Request(FrontierObject): 13 | """ 14 | A :class:`Request ` object represents an HTTP request, which is generated for 15 | seeds, extracted page links and next pages to crawl. Each one should be associated to a 16 | :class:`Response ` object when crawled. 17 | 18 | """ 19 | def __init__(self, url, method=b'GET', headers=None, cookies=None, meta=None, body=''): 20 | """ 21 | :param string url: URL to send. 22 | :param string method: HTTP method to use. 23 | :param dict headers: dictionary of headers to send. 24 | :param dict cookies: dictionary of cookies to attach to this request. 25 | :param dict meta: dictionary that contains arbitrary metadata for this request, the keys must be bytes and \ 26 | the values must be either bytes or serializable objects such as lists, tuples, dictionaries with byte type items. 27 | """ 28 | self._url = to_native_str(url) 29 | self._method = to_bytes((method or b'GET').upper()) 30 | self._headers = headers or {} 31 | self._cookies = cookies or {} 32 | self._meta = meta or {b'scrapy_meta': {}} 33 | self._body = body 34 | 35 | @property 36 | def url(self): 37 | """ 38 | A string containing the URL of this request. 39 | """ 40 | return self._url 41 | 42 | @property 43 | def method(self): 44 | """ 45 | A string representing the HTTP method in the request. This is guaranteed to be uppercase. 46 | Example: ``GET``, ``POST``, ``PUT``, etc 47 | """ 48 | return self._method 49 | 50 | @property 51 | def headers(self): 52 | """ 53 | A dictionary which contains the request headers. 
54 | """ 55 | return self._headers 56 | 57 | @property 58 | def cookies(self): 59 | """ 60 | Dictionary of cookies to attach to this request. 61 | """ 62 | return self._cookies 63 | 64 | @property 65 | def meta(self): 66 | """ 67 | A dict that contains arbitrary metadata for this request. This dict is empty for new Requests, and is usually 68 | populated by different Frontera components (middlewares, etc). So the data contained in this dict depends 69 | on the components you have enabled. The keys are bytes and the values are either bytes or serializable objects \ 70 | such as lists, tuples, dictionaries with byte type items. 71 | """ 72 | return self._meta 73 | 74 | @property 75 | def body(self): 76 | """ 77 | A string representing the request body. 78 | """ 79 | return self._body 80 | 81 | def __str__(self): 82 | return "<%s at 0x%0x %s meta=%s body=%s... cookies=%s, headers=%s>" % (type(self).__name__, id(self), self.url, 83 | str(self.meta), str(self.body[:20]), 84 | str(self.cookies), str(self.headers)) 85 | 86 | __repr__ = __str__ 87 | 88 | 89 | class Response(FrontierObject): 90 | """ 91 | A :class:`Response ` object represents an HTTP response, which is usually 92 | downloaded (by the crawler) and sent back to the frontier for processing. 93 | 94 | """ 95 | 96 | def __init__(self, url, status_code=200, headers=None, body='', request=None): 97 | """ 98 | :param string url: URL of this response. 99 | :param int status_code: the HTTP status of the response. Defaults to 200. 100 | :param dict headers: dictionary of headers to send. 101 | :param str body: the response body. 102 | :param Request request: The Request object that generated this response. 103 | """ 104 | 105 | self._url = to_native_str(url) 106 | self._status_code = int(status_code) 107 | self._headers = headers or {} 108 | self._body = body 109 | self._request = request 110 | 111 | @property 112 | def url(self): 113 | """ 114 | A string containing the URL of the response. 115 | """ 116 | return self._url 117 | 118 | @property 119 | def status_code(self): 120 | """ 121 | An integer representing the HTTP status of the response. Example: ``200``, ``404``, ``500``. 122 | """ 123 | return self._status_code 124 | 125 | @property 126 | def headers(self): 127 | """ 128 | A dictionary object which contains the response headers. 129 | """ 130 | return self._headers 131 | 132 | @property 133 | def body(self): 134 | """ 135 | A str containing the body of this Response. 136 | """ 137 | return self._body 138 | 139 | @property 140 | def request(self): 141 | """ 142 | The :class:`Request ` object that generated this response. 143 | """ 144 | return self._request 145 | 146 | @property 147 | def meta(self): 148 | """ 149 | A shortcut to the :attr:`Request.meta ` attribute of the 150 | :attr:`Response.request ` object (ie. self.request.meta). 151 | """ 152 | try: 153 | return self.request.meta 154 | except AttributeError: 155 | raise AttributeError("Response.meta not available, this response " 156 | "is not tied to any request") 157 | 158 | def __str__(self): 159 | return "<%s at 0x%0x %s %s meta=%s body=%s... 
headers=%s>" % (type(self).__name__, 160 | id(self), self.status_code, 161 | self.url, str(self.meta), 162 | str(self.body[:20]), str(self.headers)) 163 | 164 | __repr__ = __str__ 165 | -------------------------------------------------------------------------------- /docs/source/topics/scrapy-integration.rst: -------------------------------------------------------------------------------- 1 | ============================== 2 | 使用 Frontier 和 Scrapy 3 | ============================== 4 | 5 | 6 | Scrapy 中使用 Frontera 非常简单,它包含一组 `Scrapy middlewares`_ 和 Scrapy 调度程序,封装了 Frontera 的功能 ,可以使用 `Scrapy settings`_ 轻松配置。 7 | 8 | 激活 frontier 9 | ======================= 10 | 11 | Frontera 使用两种不同的中间件:``SchedulerSpiderMiddleware`` and ``SchedulerDownloaderMiddleware`` 和自己的调度程序 ``FronteraScheduler``。 12 | 13 | 要在你的 Scrapy 项目中激活 Frontera,只要把它们加入到 `SPIDER_MIDDLEWARES`_, 14 | `DOWNLOADER_MIDDLEWARES`_ 和 `SCHEDULER`_ settings:: 15 | 16 | SPIDER_MIDDLEWARES.update({ 17 | 'frontera.contrib.scrapy.middlewares.schedulers.SchedulerSpiderMiddleware': 1000, 18 | }) 19 | 20 | DOWNLOADER_MIDDLEWARES.update({ 21 | 'frontera.contrib.scrapy.middlewares.schedulers.SchedulerDownloaderMiddleware': 1000, 22 | }) 23 | 24 | SCHEDULER = 'frontera.contrib.scrapy.schedulers.frontier.FronteraScheduler' 25 | 26 | 创建一个 Frontera ``settings.py`` 并把它加入到你的 Scrapy 设置中:: 27 | 28 | FRONTERA_SETTINGS = 'tutorial.frontera.settings' 29 | 30 | 另一种选择是将这些设置放在 Scrapy 设置模块中。 31 | 32 | 33 | 34 | 组织文件 35 | ================ 36 | 37 | 当使用 Scrapy 和 frontier 时,我们有以下目录结构:: 38 | 39 | my_scrapy_project/ 40 | my_scrapy_project/ 41 | frontera/ 42 | __init__.py 43 | settings.py 44 | middlewares.py 45 | backends.py 46 | spiders/ 47 | ... 48 | __init__.py 49 | settings.py 50 | scrapy.cfg 51 | 52 | 这些都是基本的: 53 | 54 | - ``my_scrapy_project/frontera/settings.py``: Frontera settings 文件。 55 | - ``my_scrapy_project/frontera/middlewares.py``: Frontera 的中间件。 56 | - ``my_scrapy_project/frontera/backends.py``: Frontera 使用的后端。 57 | - ``my_scrapy_project/spiders``: Scrapy spiders 文件夹 58 | - ``my_scrapy_project/settings.py``: Scrapy settings 文件 59 | - ``scrapy.cfg``: Scrapy 配置文件 60 | 61 | 运行爬虫 62 | ================= 63 | 64 | 只要按照常规从命令行运行你的 Scrapy 爬虫:: 65 | 66 | scrapy crawl myspider 67 | 68 | 69 | Frontier Scrapy settings 70 | ======================== 71 | 你可以使用两种方式配置 frontier : 72 | 73 | .. setting:: FRONTERA_SETTINGS 74 | 75 | - 使用 ``FRONTERA_SETTINGS``,可以在 Scrapy 配置文件中指明 Frontera 配置文件的路径,默认是 ``None`` 76 | 77 | - 将 frontier 设置定义到 Scrapy 设置文件中。 78 | 79 | 通过 Scrapy 配置 frontier 80 | ---------------------------------------------- 81 | 82 | :ref:`Frontier settings ` 也可以通过 Scrapy 配置。在这种情况下,配置优先级顺序如下: 83 | 84 | 1. :setting:`FRONTERA_SETTINGS` 指向的文件中的配置(更高的优先级) 85 | 2. Scrapy 配置文件中的配置 86 | 3. 默认 frontier 配置 87 | 88 | 89 | .. _Scrapy middlewares: http://doc.scrapy.org/en/latest/topics/downloader-middleware.html 90 | .. _Scrapy settings: http://doc.scrapy.org/en/latest/topics/settings.html 91 | .. _DOWNLOADER_MIDDLEWARES: http://doc.scrapy.org/en/latest/topics/settings.html#std:setting-DOWNLOADER_MIDDLEWARES 92 | .. _SPIDER_MIDDLEWARES: http://doc.scrapy.org/en/latest/topics/settings.html#std:setting-SPIDER_MIDDLEWARES 93 | .. 
_SCHEDULER: http://doc.scrapy.org/en/latest/topics/settings.html#std:setting-SCHEDULER 94 | 95 | 96 | 写 Scrapy 爬虫 97 | ===================== 98 | 99 | 爬虫逻辑 100 | ------------ 101 | 创建基本的 Scrapy 爬虫在 `Quick start single process`_ 中做了描述。 102 | 103 | 这也是一个防止爬虫因为队列中请求不足而关闭的好方法:: 104 | 105 | @classmethod 106 | def from_crawler(cls, crawler, *args, **kwargs): 107 | spider = cls(*args, **kwargs) 108 | spider._set_crawler(crawler) 109 | spider.crawler.signals.connect(spider.spider_idle, signal=signals.spider_idle) 110 | return spider 111 | 112 | def spider_idle(self): 113 | self.log("Spider idle signal caught.") 114 | raise DontCloseSpider 115 | 116 | 117 | 配置准则 118 | ------------------------ 119 | 120 | 您可以进行多种调整,以实现高效的广泛抓取。 121 | 122 | 添加一个种子加载器,用于启动爬虫进程:: 123 | 124 | SPIDER_MIDDLEWARES.update({ 125 | 'frontera.contrib.scrapy.middlewares.seeds.file.FileSeedLoader': 1, 126 | }) 127 | 128 | 适合广泛抓取的各种设置:: 129 | 130 | HTTPCACHE_ENABLED = False # 关闭磁盘缓存,它在大量抓取中具有较低的命中率 131 | REDIRECT_ENABLED = True 132 | COOKIES_ENABLED = False 133 | DOWNLOAD_TIMEOUT = 120 134 | RETRY_ENABLED = False # 重试可以由 Frontera 本身处理,具体取决于爬网策略 135 | DOWNLOAD_MAXSIZE = 10 * 1024 * 1024 # 最大文档大小,如果未设置,会导致OOM 136 | LOGSTATS_INTERVAL = 10 # 每10秒钟向控制台打印统计 137 | 138 | 自动限流和并发设置,以方便有礼貌和负责任的抓取:: 139 | 140 | # auto throttling 141 | AUTOTHROTTLE_ENABLED = True 142 | AUTOTHROTTLE_DEBUG = False 143 | AUTOTHROTTLE_MAX_DELAY = 3.0 144 | AUTOTHROTTLE_START_DELAY = 0.25 # 任何足够小的值,它将在平均运行期间通过瓶颈响应延迟进行调整。 145 | RANDOMIZE_DOWNLOAD_DELAY = False 146 | 147 | # concurrency 148 | CONCURRENT_REQUESTS = 256 # 取决于许多因素,应通过实验确定 149 | CONCURRENT_REQUESTS_PER_DOMAIN = 10 150 | DOWNLOAD_DELAY = 0.0 151 | 152 | 具体参照 `Scrapy broad crawling`_. 153 | 154 | 155 | .. _`Quick start single process`: http://frontera.readthedocs.org/en/latest/topics/quick-start-single.html 156 | .. _`Scrapy broad crawling`: http://doc.scrapy.org/en/master/topics/broad-crawls.html 157 | 158 | 159 | Scrapy 种子加载器 160 | =================== 161 | 162 | Frontera 有一些内置的 Scrapy 中间件用于种子装载。 163 | 164 | 种子装载使用 ``process_start_requests`` 方法从源中生成请求,这些请求后续会被加入 :class:`FrontierManager ` 。 165 | 166 | 激活一个种子加载器 167 | ------------------------ 168 | 169 | 只需将种子加载器中间件加入 ``SPIDER_MIDDLEWARES`` 中:: 170 | 171 | SPIDER_MIDDLEWARES.update({ 172 | 'frontera.contrib.scrapy.middlewares.seeds.FileSeedLoader': 650 173 | }) 174 | 175 | 176 | .. _seed_loader_file: 177 | 178 | FileSeedLoader 179 | -------------- 180 | 181 | 从文件中导入种子。该文件必须是每行一个 URL 的格式:: 182 | 183 | http://www.asite.com 184 | http://www.anothersite.com 185 | ... 186 | 187 | 你可以使用 ``#`` 注释掉某一行:: 188 | 189 | ... 190 | #http://www.acommentedsite.com 191 | ... 192 | 193 | **Settings**: 194 | 195 | - ``SEEDS_SOURCE``: 种子文件路径 196 | 197 | 198 | .. _seed_loader_s3: 199 | 200 | S3SeedLoader 201 | ------------ 202 | 203 | 从存储在 Amazon S3 中的文件导入种子 204 | Load seeds from a file stored in an Amazon S3 bucket 205 | 206 | 文件格式应该和 :ref:`FileSeedLoader ` 中的一样。 207 | 208 | Settings: 209 | 210 | - ``SEEDS_SOURCE``: S3 bucket 文件路径。 例如: ``s3://some-project/seed-urls/`` 211 | 212 | - ``SEEDS_AWS_ACCESS_KEY``: S3 credentials Access Key 213 | 214 | - ``SEEDS_AWS_SECRET_ACCESS_KEY``: S3 credentials Secret Access Key 215 | 216 | 217 | .. 
_`Scrapy Middleware doc`: http://doc.scrapy.org/en/latest/topics/spider-middleware.html 218 | -------------------------------------------------------------------------------- /frontera/utils/graphs/data.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from six.moves import range 3 | 4 | 5 | def create_test_site(prefix, max_depth, n_links_per_page, self_link=False, site=None, depth=0): 6 | if not site: 7 | site = [] 8 | prefix += str(1) 9 | depth += 1 10 | if depth < max_depth: 11 | page = prefix 12 | links = [page + str(l) for l in range(1, n_links_per_page+1)] 13 | site.append((page, links)) 14 | for link in links: 15 | create_test_site(prefix=link, 16 | max_depth=max_depth, 17 | n_links_per_page=n_links_per_page, 18 | self_link=self_link, 19 | site=site, 20 | depth=depth) 21 | if self_link: 22 | links.append(page) 23 | return site 24 | 25 | 26 | class CrawlSiteData(object): 27 | def __init__(self, pages, name='', description=''): 28 | self.name = name 29 | self.description = description 30 | self.pages = pages 31 | 32 | def __repr__(self): 33 | return '' % (self.name, len(self.pages)) 34 | 35 | @property 36 | def nodes(self): 37 | n = set() 38 | for page, links in self.pages: 39 | n.add(page) 40 | for link in links: 41 | n.add(link) 42 | return n 43 | 44 | def __len__(self): 45 | return len(self.nodes) 46 | 47 | 48 | class CrawlSiteListData(object): 49 | def __init__(self, sites, name='', description='', use_urls=False): 50 | self.name = name 51 | self.description = description 52 | self.sites = sites 53 | self.use_urls = use_urls 54 | 55 | def __repr__(self): 56 | return '' % (self.name, len(self.sites)) 57 | 58 | def __len__(self): 59 | return sum([len(site) for site in self.sites]) 60 | 61 | 62 | #----------------------------------------------------- 63 | # Sites 64 | #----------------------------------------------------- 65 | SITE_A = CrawlSiteData( 66 | name='A', 67 | description='', 68 | pages=create_test_site('A', 4, 2)) 69 | 70 | SITE_B = CrawlSiteData( 71 | name='B', 72 | description='', 73 | pages=create_test_site('B', 4, 2)) 74 | 75 | SITE_C = CrawlSiteData( 76 | name='C', 77 | description='', 78 | pages=create_test_site('C', 5, 2, self_link=True)) 79 | 80 | 81 | #----------------------------------------------------- 82 | # Graphs 83 | #----------------------------------------------------- 84 | SITE_LIST_01 = CrawlSiteListData( 85 | name='GRAPH 01', 86 | description='', 87 | sites=[ 88 | SITE_A, 89 | ]) 90 | 91 | SITE_LIST_02 = CrawlSiteListData( 92 | name='GRAPH 02', 93 | description='', 94 | sites=[ 95 | SITE_A, 96 | SITE_B, 97 | ]) 98 | 99 | SITE_LIST_03 = CrawlSiteListData( 100 | name='GRAPH 03', 101 | description='', 102 | sites=[ 103 | SITE_C, 104 | ]) 105 | 106 | SITE_LIST_04 = CrawlSiteListData( 107 | name='GRAPH 04', 108 | description='', 109 | sites=[ 110 | [ 111 | ('A', ['B']), 112 | ('B', ['A']), 113 | ], 114 | ]) 115 | 116 | SITE_LIST_05 = CrawlSiteListData( 117 | name='GRAPH 05', 118 | description='', 119 | sites=[ 120 | [ 121 | ('A', ['B', 'C']), 122 | ('B', ['A', 'C']), 123 | ('C', ['A', 'B']), 124 | ], 125 | ]) 126 | 127 | SITE_LIST_06 = CrawlSiteListData( 128 | name='GRAPH 06', 129 | description='', 130 | sites=[ 131 | [ 132 | ('A', ['B', 'C']), 133 | ('B', []), 134 | ('C', ['B']), 135 | ] 136 | ]) 137 | 138 | SITE_LIST_07 = CrawlSiteListData( 139 | name='GRAPH 07', 140 | description='', 141 | sites=[ 142 | [ 143 | ('A', ['A']), 144 | ] 145 | ]) 146 | 147 | SITE_LIST_08 = 
CrawlSiteListData( 148 | name='GRAPH 08', 149 | description='', 150 | use_urls=True, 151 | sites=[ 152 | [ 153 | ('https://www.a.com', [ 154 | 'http://www.a.com/2', 155 | 'http://www.a.net', 156 | ]), 157 | ], 158 | [ 159 | ('https://www.a.net', []), 160 | ], 161 | [ 162 | ('http://b.com', [ 163 | 'http://b.com/2', 164 | 'http://www.a.net', 165 | 'http://test.cloud.c.com', 166 | 'http://b.com', 167 | ]), 168 | ('http://b.com/entries?page=2', [ 169 | 'http://b.com/entries?page=2', 170 | 'http://b.com', 171 | ]), 172 | ], 173 | [ 174 | ('http://test.cloud.c.com', [ 175 | 'http://cloud.c.com', 176 | 'http://test.cloud.c.com/2', 177 | ]), 178 | ('http://test.cloud.c.com/2', [ 179 | 'http://b.com/entries?page=2', 180 | 'http://test.cloud.c.com', 181 | ]), 182 | ], 183 | ]) 184 | 185 | SITE_LIST_09 = CrawlSiteListData( 186 | name='GRAPH 09', 187 | description='', 188 | use_urls=True, 189 | sites=[ 190 | [ 191 | ('https://www.a.com', [ 192 | 'http://www.a.com/2', 193 | 'http://www.a.com/2/1', 194 | 'http://www.a.com/3', 195 | 'http://www.a.com/2/1/3', 196 | 'http://www.a.com/2/4/1', 197 | 'http://www.a.com/2/4/2', 198 | 'http://www.a.net', 199 | ]), 200 | ], 201 | [ 202 | ('http://b.com', [ 203 | 'http://b.com/2', 204 | 'http://www.a.net', 205 | 'http://test.cloud.c.com', 206 | 'http://b.com', 207 | ]), 208 | ('http://b.com/entries?page=2', [ 209 | 'http://b.com/entries?page=2', 210 | 'http://b.com', 211 | ]), 212 | ], 213 | [ 214 | ('http://test.cloud.c.com', [ 215 | 'http://cloud.c.com', 216 | 'http://test.cloud.c.com/2', 217 | ]), 218 | ('http://test.cloud.c.com/2', [ 219 | 'http://b.com/entries?page=2', 220 | 'http://test.cloud.c.com', 221 | ]), 222 | ], 223 | ]) 224 | 225 | GRAPHS = [obj for obj in locals().values() if isinstance(obj, CrawlSiteListData)] 226 | #GRAPHS = [SITE_LIST_08] 227 | -------------------------------------------------------------------------------- /frontera/contrib/messagebus/zeromq/broker.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | from time import time 5 | from datetime import timedelta 6 | import logging 7 | from argparse import ArgumentParser 8 | from struct import unpack 9 | 10 | import zmq 11 | from zmq.eventloop.ioloop import IOLoop 12 | from zmq.eventloop.zmqstream import ZMQStream 13 | 14 | from frontera.settings import Settings 15 | from .socket_config import SocketConfig 16 | 17 | 18 | class Server(object): 19 | 20 | ctx = None 21 | loop = None 22 | stats = None 23 | spiders_in = None 24 | spiders_out = None 25 | sw_in = None 26 | sw_out = None 27 | db_in = None 28 | db_out = None 29 | 30 | def __init__(self, address, base_port): 31 | self.ctx = zmq.Context() 32 | self.loop = IOLoop.instance() 33 | self.stats = { 34 | 'started': time(), 35 | 'spiders_out_recvd': 0, 36 | 'spiders_in_recvd': 0, 37 | 'db_in_recvd': 0, 38 | 'db_out_recvd': 0, 39 | 'sw_in_recvd': 0, 40 | 'sw_out_recvd': 0 41 | } 42 | 43 | socket_config = SocketConfig(address, base_port) 44 | 45 | if socket_config.is_ipv6: 46 | self.ctx.setsockopt(zmq.IPV6, True) 47 | 48 | spiders_in_s = self.ctx.socket(zmq.XPUB) 49 | spiders_out_s = self.ctx.socket(zmq.XSUB) 50 | sw_in_s = self.ctx.socket(zmq.XPUB) 51 | sw_out_s = self.ctx.socket(zmq.XSUB) 52 | db_in_s = self.ctx.socket(zmq.XPUB) 53 | db_out_s = self.ctx.socket(zmq.XSUB) 54 | 55 | spiders_in_s.bind(socket_config.spiders_in()) 56 | spiders_out_s.bind(socket_config.spiders_out()) 57 | sw_in_s.bind(socket_config.sw_in()) 58 | 
sw_out_s.bind(socket_config.sw_out()) 59 | db_in_s.bind(socket_config.db_in()) 60 | db_out_s.bind(socket_config.db_out()) 61 | 62 | self.spiders_in = ZMQStream(spiders_in_s) 63 | self.spiders_out = ZMQStream(spiders_out_s) 64 | self.sw_in = ZMQStream(sw_in_s) 65 | self.sw_out = ZMQStream(sw_out_s) 66 | self.db_in = ZMQStream(db_in_s) 67 | self.db_out = ZMQStream(db_out_s) 68 | 69 | self.spiders_out.on_recv(self.handle_spiders_out_recv) 70 | self.sw_out.on_recv(self.handle_sw_out_recv) 71 | self.db_out.on_recv(self.handle_db_out_recv) 72 | 73 | self.sw_in.on_recv(self.handle_sw_in_recv) 74 | self.db_in.on_recv(self.handle_db_in_recv) 75 | self.spiders_in.on_recv(self.handle_spiders_in_recv) 76 | logging.basicConfig(format="%(asctime)s %(message)s", 77 | datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO) 78 | self.logger = logging.getLogger("distributed_frontera.messagebus" 79 | ".zeromq.broker.Server") 80 | self.logger.info("Using socket: {}:{}".format(socket_config.ip_addr, 81 | socket_config.base_port)) 82 | 83 | def start(self): 84 | self.logger.info("Distributed Frontera ZeroMQ broker is started.") 85 | self.log_stats() 86 | try: 87 | self.loop.start() 88 | except KeyboardInterrupt: 89 | pass 90 | 91 | def log_stats(self): 92 | self.logger.info(self.stats) 93 | self.loop.add_timeout(timedelta(seconds=10), self.log_stats) 94 | 95 | def handle_spiders_out_recv(self, msg): 96 | self.sw_in.send_multipart(msg) 97 | self.db_in.send_multipart(msg) 98 | self.stats['spiders_out_recvd'] += 1 99 | 100 | def handle_sw_out_recv(self, msg): 101 | self.db_in.send_multipart(msg) 102 | self.stats['sw_out_recvd'] += 1 103 | 104 | def handle_db_out_recv(self, msg): 105 | self.spiders_in.send_multipart(msg) 106 | self.stats['db_out_recvd'] += 1 107 | 108 | def handle_db_in_recv(self, msg): 109 | self.stats['db_in_recvd'] += 1 110 | if b'\x01' in msg[0] or b'\x00' in msg[0]: 111 | action, identity, partition_id = self.decode_subscription(msg[0]) 112 | if identity == b'sl': 113 | self.spiders_out.send_multipart(msg) 114 | return 115 | if identity == b'us': 116 | self.sw_out.send_multipart(msg) 117 | return 118 | raise AttributeError('Unknown identity in channel subscription.') 119 | 120 | def handle_sw_in_recv(self, msg): 121 | if b'\x01' in msg[0] or b'\x00' in msg[0]: 122 | self.spiders_out.send_multipart(msg) 123 | self.stats['sw_in_recvd'] += 1 124 | 125 | def handle_spiders_in_recv(self, msg): 126 | if b'\x01' in msg[0] or b'\x00' in msg[0]: 127 | self.db_out.send_multipart(msg) 128 | self.stats['spiders_in_recvd'] += 1 129 | 130 | def decode_subscription(self, msg): 131 | """ 132 | 133 | :param msg: 134 | :return: tuple of action, identity, partition_id 135 | where 136 | action is 1 - subscription, 0 - unsubscription, 137 | identity - 2 characters, 138 | partition_id - 8 bit unsigned integer (None if absent) 139 | """ 140 | if len(msg) == 4: 141 | return unpack(">B2sB", msg) 142 | elif len(msg) == 3: 143 | action, identity = unpack(">B2s", msg) 144 | return action, identity, None 145 | raise ValueError("Can't decode subscription correctly.") 146 | 147 | 148 | def main(): 149 | """ 150 | Parse arguments, set configuration values, then start the broker 151 | """ 152 | parser = ArgumentParser(description="Crawl frontier worker.") 153 | parser.add_argument( 154 | '--config', type=str, 155 | help='Settings module name, should be accessible by import.') 156 | parser.add_argument( 157 | '--address', type=str, 158 | help='Hostname, IP address or Wildcard * to bind. Default is 127.0.0.1' 159 | '. 
When binding to wildcard it defaults to IPv4.') 160 | parser.add_argument( 161 | '--log-level', '-L', type=str, default='INFO', 162 | help='Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL. Default is' 163 | ' INFO.') 164 | parser.add_argument( 165 | '--port', type=int, 166 | help='Base port number, server will bind to 6 ports starting from base' 167 | '. Default is 5550') 168 | args = parser.parse_args() 169 | 170 | settings = Settings(module=args.config) 171 | address = args.address if args.address else settings.get("ZMQ_ADDRESS") 172 | port = args.port if args.port else settings.get("ZMQ_BASE_PORT") 173 | server = Server(address, port) 174 | server.logger.setLevel(args.log_level) 175 | server.start() 176 | 177 | 178 | if __name__ == '__main__': 179 | main() 180 | --------------------------------------------------------------------------------
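A minimal usage sketch for the ZeroMQ broker above. This is an assumption-based illustration, not part of the source tree: the settings module name ``my_zmq_settings`` and the address/port values are hypothetical, and it assumes the broker is run as a module so its relative import of ``socket_config`` resolves. ``main()`` falls back to the ``ZMQ_ADDRESS`` and ``ZMQ_BASE_PORT`` settings when ``--address`` / ``--port`` are not passed on the command line::

    # my_zmq_settings.py -- hypothetical settings module passed via --config
    ZMQ_ADDRESS = '127.0.0.1'   # host/IP the broker sockets bind to
    ZMQ_BASE_PORT = 5550        # first of the six consecutive ports the broker binds

    # Launching the broker (assumption: run with -m so frontera is on the import path):
    #   python -m frontera.contrib.messagebus.zeromq.broker --config my_zmq_settings --log-level INFO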