├── .bumpversion.cfg ├── .editorconfig ├── .gitignore ├── .travis.yml ├── AUTHORS ├── CHANGES ├── LICENSE ├── MANIFEST.in ├── README.rst ├── brownant ├── __init__.py ├── app.py ├── dinergate.py ├── exceptions.py ├── pipeline │ ├── __init__.py │ ├── base.py │ ├── html.py │ └── network.py ├── request.py ├── site.py └── utils.py ├── docs ├── Makefile ├── _static │ └── .gitkeep ├── api.rst ├── changes.rst ├── conf.py ├── index.rst ├── requirements.txt └── userguide │ ├── introduction.rst │ └── quickstart.rst ├── setup.cfg ├── setup.py ├── tests ├── __init__.py ├── test_app.py ├── test_deprecation.py ├── test_dinergate.py ├── test_pipeline │ ├── __init__.py │ ├── test_base.py │ ├── test_html.py │ └── test_network.py ├── test_site.py └── test_utils.py └── tox.ini /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | files = setup.py docs/conf.py brownant/__init__.py 3 | commit = True 4 | tag = False 5 | current_version = 0.1.7 6 | 7 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # http://editorconfig.org 2 | 3 | root = true 4 | 5 | [*.py] 6 | indent_style = space 7 | indent_size = 4 8 | charset = utf-8 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | dist 10 | build 11 | eggs 12 | parts 13 | bin 14 | var 15 | sdist 16 | develop-eggs 17 | .installed.cfg 18 | lib 19 | lib64 20 | __pycache__ 21 | 22 | # Installer logs 23 | pip-log.txt 24 | 25 | # Unit test / coverage reports 26 | .coverage 27 | .tox 28 | .cache 29 | 30 | # Translations 31 | *.mo 32 | 33 | # Editor 34 | *.sw[po] 35 | 36 | # Sphinx 37 | docs/_build 38 | 
-------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.7" 4 | - "3.5" 5 | - "3.6" 6 | - "pypy" 7 | install: 8 | - "pip install ." 9 | - "pip install pytest>=2.4.2 -U" 10 | - "pip install pytest-cov pytest-pep8 mock coveralls" 11 | script: "py.test" 12 | after_success: 13 | coveralls 14 | branches: 15 | only: 16 | - master 17 | - develop 18 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | Authors & Contributors 2 | ====================== 3 | 4 | - Jiangge Zhang https://github.com/tonyseek 5 | - VeryCB https://github.com/VeryCB 6 | - dongweiming https://github.com/dongweiming 7 | - Caratpine https://github.com/Caratpine 8 | -------------------------------------------------------------------------------- /CHANGES: -------------------------------------------------------------------------------- 1 | Release 0.1.7 (Mar 3, 2017) 2 | =========================== 3 | 4 | - Add support to lxml >= 3.7.3. (by Caratpine) 5 | - Compatible with pip > 7.1.2 && support Python 3.5/3.6. (by dongweiming) 6 | 7 | Release 0.1.6 (Jul 9, 2015) 8 | =========================== 9 | 10 | - Add JSONResponseProperty to support parsing JSON response. (by dongweiming) 11 | 12 | 13 | Release 0.1.5 (Apr 8, 2014) 14 | =========================== 15 | 16 | Some API will be changed without backward compatibility in next major release. 17 | 18 | - Add support to redirect while executing request handler. 19 | - Add support to run on Python 3.4 without any modified. 20 | - Refactor the ``http_client`` into a pipeline property. 21 | - Remove the magic arguments of ``http_client`` 22 | - Expose the classes in the top-level package. We can import all from 23 | ``brownant`` and ``brownant.pipeline`` now. 
24 | - Rename the ``BrownAnt`` into ``Brownant``. The ``BrownAnt`` is still usable 25 | but not recommended. 26 | - Fix the unicode error for URLs which included non-ascii query. 27 | - Fix the lxml compatible problem with PyPy. 28 | 29 | 30 | Release 0.1.4 (Oct 24, 2013) 31 | ============================ 32 | 33 | - Fix the RequestRedirect raised problem. 34 | - Add the new pick mode "keep" of XPathTextProperty. (by VeryCB) 35 | - Add the encoding parameter of the ElementTreeProperty. That could let the 36 | property provide bytes instead of unicode string. (by VeryCB) 37 | 38 | 39 | Release 0.1.3 (Oct 19, 2013) 40 | ============================ 41 | 42 | - Fix the broken CI (travis-ci). 43 | 44 | 45 | Release 0.1.2 (Oct 19, 2013) 46 | ============================ 47 | 48 | - Fix some unicode compatible problems for URL string. 49 | - Prevent the invalid URL string input. 50 | - Change the theme of document into built-in one named "nature". 51 | 52 | 53 | Release 0.1.1 (Sep 30, 2013) 54 | ============================ 55 | 56 | - Refine the documents and give an example in the Quick Start section. 57 | 58 | 59 | Release 0.1.0 (Sep 29, 2013) 60 | ============================ 61 | 62 | - First public release. 63 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014, Douban Inc. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 5 | 6 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 7 | 8 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 9 | 10 | 3. 
Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 11 | 12 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 13 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include AUTHORS CHANGES LICENSE README.rst tox.ini 2 | recursive-include tests * 3 | recursive-include docs * 4 | recursive-exclude docs *.pyc 5 | recursive-exclude docs *.pyo 6 | recursive-exclude tests *.pyc 7 | recursive-exclude tests *.pyo 8 | prune docs/_build 9 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | |Build Status| |Coverage Status| |PyPI Version| |PyPI Downloads| |Wheel Status| 2 | 3 | Brownant 4 | ======== 5 | 6 | Brownant is a lightweight web data extracting framework. 7 | 8 | 9 | Who uses it? 10 | ------------ 11 | 12 | At the moment, `dongxi.douban.com `_ 13 | (a.k.a. Douban Dongxi) uses Brownant in production environment. 
14 | 15 | 16 | Installation 17 | ------------ 18 | 19 | :: 20 | 21 | $ pip install brownant 22 | 23 | 24 | Links 25 | ----- 26 | 27 | - `Document `_ 28 | - `Issue Track `_ 29 | 30 | 31 | Issues 32 | ------ 33 | 34 | If you want to report bugs or request features, please create issues on 35 | `GitHub Issues `_. 36 | 37 | 38 | Contributes 39 | ----------- 40 | 41 | You can send a pull request on 42 | `GitHub `_. 43 | 44 | 45 | .. |Build Status| image:: https://travis-ci.org/douban/brownant.svg?branch=master,develop 46 | :target: https://travis-ci.org/douban/brownant 47 | :alt: Build Status 48 | .. |Coverage Status| image:: https://img.shields.io/coveralls/douban/brownant/develop.svg 49 | :target: https://coveralls.io/r/douban/brownant 50 | :alt: Coverage Status 51 | .. |Wheel Status| image:: https://img.shields.io/pypi/wheel/brownant.svg 52 | :target: https://pypi.python.org/pypi/brownant 53 | :alt: Wheel Status 54 | .. |PyPI Version| image:: https://img.shields.io/pypi/v/brownant.svg 55 | :target: https://pypi.python.org/pypi/brownant 56 | :alt: PyPI Version 57 | .. 
class Brownant(object):
    """The application object which routes URL strings to crawler endpoints.

    Every URL rule is kept in a host-matching :class:`werkzeug.routing.Map`.
    A URL string is validated, matched against that map and finally handed
    to the endpoint, which is imported lazily by its qualified name.
    """

    def __init__(self):
        self.url_map = Map(strict_slashes=False, host_matching=True,
                           redirect_defaults=False)

    def add_url_rule(self, host, rule_string, endpoint, **options):
        """Register a new URL rule on this app instance.

        The rule syntax is the one used by Flask and other Werkzeug apps.

        :param host: the hostname to be matched. e.g. "www.python.org"
        :param rule_string: the path pattern to be matched. e.g. "/news/"
        :param endpoint: the dispatching key of the rule. It is imported
                         with :func:`werkzeug.utils.import_string` while
                         dispatching, so it should be the qualified name of
                         the handler.
        :param options: extra keyword arguments forwarded to
                        :class:`werkzeug.routing.Rule`.
        """
        self.url_map.add(
            Rule(rule_string, host=host, endpoint=endpoint, **options))

    def parse_url(self, url_string):
        """Split the URL string and bind it to the URL map of this app.

        :param url_string: the origin URL string.
        :returns: the tuple `(url, url_adapter, query_args)`. The `url` is
                  the validated result of the standard library `urlparse`,
                  the `url_adapter` is the URL map bound to the host, scheme
                  and path of that URL, and the `query_args` is the value
                  decoded from the query string by werkzeug.
        """
        parsed = self.validate_url(urllib.parse.urlparse(url_string))
        adapter = self.url_map.bind(server_name=parsed.hostname,
                                    url_scheme=parsed.scheme,
                                    path_info=parsed.path)
        return parsed, adapter, url_decode(parsed.query)

    def validate_url(self, url):
        """Normalize and check a :class:`~urllib.parse.ParseResult` object.

        The path and the query are percent-quoted, then the URL is rejected
        unless it has a hostname, uses the "http" or "https" scheme and has
        a path which is either empty or starts with "/". This keeps
        :meth:`~brownant.app.Brownant.parse_url` working as expected even if
        it meets an unexpected URL string.

        :param url: the parsed url.
        :type url: :class:`~urllib.parse.ParseResult`
        :returns: a new parse result with quoted path and query.
        :raises NotSupported: if one of the checks above failed.
        """
        # quote the non-ascii characters of the path and of the query
        quoted_path = urllib.parse.quote(to_bytes_safe(url.path),
                                         safe=b"/%")
        quoted_query = urllib.parse.quote(to_bytes_safe(url.query),
                                          safe=b"?=&")
        url = urllib.parse.ParseResult(url.scheme, url.netloc, quoted_path,
                                       url.params, quoted_query,
                                       url.fragment)

        # collect the reasons which make the URL unusable
        hostname_missing = not url.hostname
        scheme_unsupported = url.scheme not in ("http", "https")
        path_relative = bool(url.path) and not url.path.startswith("/")

        if hostname_missing or scheme_unsupported or path_relative:
            raise NotSupported("invalid url: %s" % repr(url))
        return url

    def dispatch_url(self, url_string):
        """Find the endpoint of the URL string and call it.

        A redirect raised while matching is followed with the original
        query arguments appended; a redirect raised by the handler itself
        (see :func:`redirect`) is followed with its target URL as it is.

        :param url_string: the origin URL string.
        :returns: the return value of the dispatched handler.
        :raises NotSupported: if no URL rule matched the URL.
        """
        url, url_adapter, query_args = self.parse_url(url_string)

        try:
            endpoint, kwargs = url_adapter.match()
        except NotFound:
            raise NotSupported(url_string)
        except RequestRedirect as redirection:
            target = "{0.new_url}?{1}".format(redirection,
                                              url_encode(query_args))
            return self.dispatch_url(target)

        try:
            # the endpoint is imported lazily, right before calling it
            handler = import_string(endpoint)
            return handler(Request(url=url, args=query_args), **kwargs)
        except RequestRedirect as redirection:
            return self.dispatch_url(redirection.new_url)

    def mount_site(self, site):
        """Replay the recorded actions of a supported site on this app.

        :param site: the site instance to be mounted, or a string which
                     could be imported as the site instance.
        """
        mounted = import_string(site) if isinstance(site, string_types) \
            else site
        mounted.play_actions(target=self)
31 | 32 | In order to work with unnamed properties such as the instances of 33 | :class:`~brownant.pipeline.base.PipelineProperty`, the meta class 34 | :class:`~brownant.dinergate.DinergateType` will scan subclasses of this 35 | class and name all unnamed members which are instances of 36 | :class:`~werkzeug.utils.cached_property`. 37 | 38 | :param request: the standard parameter passed by app. 39 | :type request: :class:`~brownant.request.Request` 40 | :param http_client: the session instance of python-requests. 41 | :type http_client: :class:`requests.Session` 42 | :param kwargs: other arguments from the URL pattern. 43 | """ 44 | 45 | #: the URL template string for generating crawled target. the `self` could 46 | #: be referenced in the template. 47 | #: (e.g. `"http://www.example.com/items/{self.item_id}?page={self.page}"`) 48 | URL_TEMPLATE = None 49 | 50 | http_client = HTTPClientProperty() 51 | 52 | def __init__(self, request, http_client=None, **kwargs): 53 | self.request = request 54 | if http_client: 55 | self.http_client = http_client 56 | # assign arguments from URL pattern 57 | vars(self).update(kwargs) 58 | 59 | @property 60 | def url(self): 61 | """The fetching target URL. 62 | 63 | The default behavior of this property is build URL string with the 64 | :const:`~brownant.dinergate.Dinergate.URL_TEMPLATE`. 65 | 66 | The subclasses could override 67 | :const:`~brownant.dinergate.Dinergate.URL_TEMPLATE` or use a different 68 | implementation. 
class PipelineProperty(cached_property):
    """The common base class of pipeline properties.

    The keyword arguments of the constructor are sorted into three kinds,
    checked in this order:

    - The attr_name. A keyword argument whose name ends with `_attr`, such
      as `text_attr`, is stored in
      :attr:`~brownant.pipeline.base.PipelineProperty.attr_names`.

    - The required attribute. A keyword argument whose name is listed in
      :attr:`~brownant.pipeline.base.PipelineProperty.required_attrs` is
      assigned as an attribute of the property instance.

    - The option. Any other keyword argument is stored in the instance
      owned :class:`dict` named
      :attr:`~brownant.pipeline.base.PipelineProperty.options`.

    A workable subclass should implement the abstract method
    `provide_value`, which accepts one argument, the instance of
    :class:`~brownant.dinergate.Dinergate` owning the property. Overriding
    :meth:`~brownant.pipeline.base.PipelineProperty.prepare` is optional.

    :param kwargs: the parameters of the three kinds.
    :raises TypeError: if a required attribute has not been given.
    """

    #: the names of required attributes.
    required_attrs = set()

    def __init__(self, **kwargs):
        super(PipelineProperty, self).__init__(self.provide_value)
        # the property is not created by decorating a function, so it is
        # left unnamed here.
        self.__name__ = None
        self.__module__ = None
        self.__doc__ = None

        #: the definition of attr_names
        self.attr_names = {}
        #: the definition of options
        self.options = {}

        for key, val in kwargs.items():
            if key.endswith("_attr"):
                # names of attrs
                self.attr_names[key] = val
            elif key in self.required_attrs:
                # required attrs
                setattr(self, key, val)
            else:
                # optional attrs
                self.options[key] = val

        absent = self.required_attrs - set(kwargs)
        if absent:
            raise TypeError("missing %r" % ", ".join(absent))

        self.prepare()

    def prepare(self):
        """The hook called at the end of the instance initialization.

        The default implementation does nothing. Subclasses could override
        it, in general to give default values to the options and to the
        members of
        :attr:`~brownant.pipeline.base.PipelineProperty.attr_names`.

        Example:

        .. code-block:: python

            def prepare(self):
                self.attr_names.setdefault("text_attr", "text")
                self.options.setdefault("use_proxy", False)
        """

    def get_attr(self, obj, name):
        """Read an attribute of the target object through the indirection
        of :attr:`~brownant.pipeline.base.PipelineProperty.attr_names`.

        :param obj: the target object.
        :type obj: :class:`~brownant.dinergate.Dinergate`
        :param name: the internal name used as the key of
                     :attr:`~brownant.pipeline.base.PipelineProperty.attr_names`.
                     (e.g. `"text_attr"`)
        :returns: the attribute of `obj` whose name is configured under the
                  key `name`.
        """
        return getattr(obj, self.attr_names[name])
class XPathTextProperty(PipelineProperty):
    """The text picked out of an element tree property with XPath. An
    example for usage::

        class MySite(Dinergate):
            # omit page_etree
            title = XPathTextProperty(xpath=".//h1[@id='title']/text()",
                                      etree_attr="page_etree",
                                      strip_spaces=True,
                                      pick_mode="first")
            links = XPathTextProperty(xpath=".//*[@id='links']/a/@href",
                                      etree_attr="page_etree",
                                      strip_spaces=True,
                                      pick_mode="join",
                                      joiner="|")

    :param xpath: required. the xpath expression for extracting text.
    :param etree_attr: optional. default: `"etree"`.
    :param strip_spaces: optional. default: `False`. if it is `True`, every
                         text is stripped and the texts which are empty
                         after stripping are dropped.
    :param pick_mode: optional. default: `"join"`. it could be `"join"`,
                      `"first"` or `"keep"`. with `"join"` the texts are
                      joined into one string. with `"first"` only the first
                      text is picked, or an empty string if there is none.
                      with `"keep"` the value is returned as it is.
    :param joiner: optional. default is a space string. it is only used
                   while the `pick_mode` is `"join"`.

    .. versionadded:: 0.1.4
       The new option value `"keep"` of the `pick_mode` parameter.
    """

    required_attrs = {"xpath"}

    def prepare(self):
        self.attr_names.setdefault("etree_attr", "etree")
        defaults = (("strip_spaces", False),
                    ("pick_mode", "join"),
                    ("joiner", " "))
        for option, default in defaults:
            self.options.setdefault(option, default)

    def choice_pick_impl(self):
        """Return the picking method selected by the `pick_mode` option.

        :raises ValueError: if the `pick_mode` option is unknown.
        """
        mode = self.options["pick_mode"]
        pickers = {
            "join": self.pick_joining,
            "first": self.pick_first,
            "keep": self.keep_value,
        }
        if mode not in pickers:
            raise ValueError("%r is not valid pick mode" % mode)
        return pickers[mode]

    def pick_joining(self, value):
        """Join all texts with the `joiner` option."""
        return self.options["joiner"].join(value)

    def pick_first(self, value):
        """Return the first text, or an empty string for an empty value."""
        if value:
            return value[0]
        return ""

    def keep_value(self, value):
        """Return the value untouched."""
        return value

    def provide_value(self, obj):
        tree = self.get_attr(obj, "etree_attr")
        matched = tree.xpath(self.xpath)
        # the pick mode is validated after the xpath has been evaluated
        picker = self.choice_pick_impl()

        if self.options["strip_spaces"]:
            stripped = (text.strip() for text in matched)
            matched = [text for text in stripped if text]

        return picker(matched)
class HTTPClientProperty(PipelineProperty):
    """The property which provides a python-requests session.

    A new session is created by calling the configured class without
    arguments each time the value is provided.

    :param session_class: optional. the class of the session instance.
                          default: :class:`~requests.Session`.
    """

    def prepare(self):
        if "session_class" not in self.options:
            self.options["session_class"] = Session

    def provide_value(self, obj):
        make_session = self.options["session_class"]
        return make_session()
class TextResponseProperty(ResponseProperty):
    """The text response which is returned by fetching a network resource.

    Getting this property is a network I/O operation the first time. The
    http request implementations are all provided by :mod:`requests`.

    The provided value is the ``text`` attribute of the response, i.e. the
    decoded body instead of the raw bytes, so it can be consumed by text
    based properties such as
    :class:`~brownant.pipeline.html.ElementTreeProperty`, which may call
    ``encode`` on it.

    The usage example::

        class MySite(Dinergate):
            foo_http = requests.Session()
            foo_url = "http://example.com"
            foo_text = TextResponseProperty(url_attr="foo_url",
                                            http_client_attr="foo_http",
                                            proxies=PROXIES)

    :param url_attr: optional. default: `"url"`. it points to the property
                     which provides the url to be fetched.
    :param http_client_attr: optional. default: `"http_client"`. it points
                             to an http client property which is an
                             instance of :class:`requests.Session`.
    :param method: optional. default: `"GET"`. the request method which is
                   used by the http client.
    :param kwargs: the optional arguments which will be passed to
                   :meth:`requests.Session.request`. Note that only the
                   names ending with `_attr` are treated as attribute
                   names; any other keyword is forwarded to the request.
    """

    def prepare(self):
        super(TextResponseProperty, self).prepare()
        # ``text`` is the decoded body of the response; ``content`` would
        # give raw bytes, which breaks the consumers expecting a string.
        self.attr_names.setdefault("content_method", "text")
class Site(object):
    """A mountable description of one supported site.

    A site does not talk to an app directly. It keeps a list of method calls
    (``actions``) which are replayed on an app instance later.

    :param name: the name of the supported site.
    """

    def __init__(self, name):
        self.actions = []
        self.name = name

    def record_action(self, method_name, *args, **kwargs):
        """Remember one method call so it can be replayed later.

        :param method_name: the name of the method to be called.
        :param args: the positional arguments of the call.
        :param kwargs: the keyword arguments of the call.
        """
        entry = (method_name, args, kwargs)
        self.actions.append(entry)

    def play_actions(self, target):
        """Replay every recorded action, in recording order, on the target.

        :param target: the object which receives all recorded actions,
                       normally a brown ant app instance.
        :type target: :class:`~brownant.app.Brownant`
        """
        for recorded in self.actions:
            name, positional, keywords = recorded
            getattr(target, name)(*positional, **keywords)

    def route(self, host, rule, **options):
        """Return a decorator which registers the wrapped object as a handler.

        All optional parameters of this method are compatible with the
        :meth:`~brownant.app.Brownant.add_url_rule`.

        Only the qualified name (``module:name``) of the wrapped object is
        recorded, so it must be import-able by that name. This differs from
        :class:`~flask.Flask` and works like a lazy-loading mode: registered
        objects are loaded just before their first use.

        The right way::

            @site.route("www.example.com", "/item/<item_id>")
            def spam(request, item_id):
                pass

        The wrong way::

            def egg():
                # the function could not be imported by its qualified name
                @site.route("www.example.com", "/item/<item_id>")
                def spam(request, item_id):
                    pass

            egg()

        :param host: the limited host name.
        :param rule: the URL path rule as string.
        :param options: the options to be forwarded to the
                        :class:`werkzeug.routing.Rule` object.
        """
        def register(func):
            # The decorated object itself is returned untouched; only the
            # "add_url_rule" call is queued for play_actions().
            endpoint = "{0}:{1}".format(func.__module__, func.__name__)
            self.record_action("add_url_rule", host, rule, endpoint, **options)
            return func

        return register
16 | 17 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 18 | 19 | help: 20 | @echo "Please use \`make <target>' where <target> is one of" 21 | @echo " html to make standalone HTML files" 22 | @echo " dirhtml to make HTML files named index.html in directories" 23 | @echo " singlehtml to make a single large HTML file" 24 | @echo " pickle to make pickle files" 25 | @echo " json to make JSON files" 26 | @echo " htmlhelp to make HTML files and a HTML help project" 27 | @echo " qthelp to make HTML files and a qthelp project" 28 | @echo " devhelp to make HTML files and a Devhelp project" 29 | @echo " epub to make an epub" 30 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 31 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 32 | @echo " text to make text files" 33 | @echo " man to make manual pages" 34 | @echo " texinfo to make Texinfo files" 35 | @echo " info to make Texinfo files and run them through makeinfo" 36 | @echo " gettext to make PO message catalogs" 37 | @echo " changes to make an overview of all changed/added/deprecated items" 38 | @echo " linkcheck to check all external links for integrity" 39 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 40 | 41 | clean: 42 | -rm -rf $(BUILDDIR)/* 43 | 44 | html: 45 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 46 | @echo 47 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 48 | 49 | dirhtml: 50 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 51 | @echo 52 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 53 | 54 | singlehtml: 55 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 56 | @echo 57 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 
58 | 59 | pickle: 60 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 61 | @echo 62 | @echo "Build finished; now you can process the pickle files." 63 | 64 | json: 65 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 66 | @echo 67 | @echo "Build finished; now you can process the JSON files." 68 | 69 | htmlhelp: 70 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 71 | @echo 72 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 73 | ".hhp project file in $(BUILDDIR)/htmlhelp." 74 | 75 | qthelp: 76 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 77 | @echo 78 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 79 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 80 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/brownant.qhcp" 81 | @echo "To view the help file:" 82 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/brownant.qhc" 83 | 84 | devhelp: 85 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 86 | @echo 87 | @echo "Build finished." 88 | @echo "To view the help file:" 89 | @echo "# mkdir -p $$HOME/.local/share/devhelp/brownant" 90 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/brownant" 91 | @echo "# devhelp" 92 | 93 | epub: 94 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 95 | @echo 96 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 97 | 98 | latex: 99 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 100 | @echo 101 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 102 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 103 | "(use \`make latexpdf' here to do that automatically)." 104 | 105 | latexpdf: 106 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 107 | @echo "Running LaTeX files through pdflatex..." 108 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 109 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 
110 | 111 | text: 112 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 113 | @echo 114 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 115 | 116 | man: 117 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 118 | @echo 119 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 120 | 121 | texinfo: 122 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 123 | @echo 124 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 125 | @echo "Run \`make' in that directory to run these through makeinfo" \ 126 | "(use \`make info' here to do that automatically)." 127 | 128 | info: 129 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 130 | @echo "Running Texinfo files through makeinfo..." 131 | make -C $(BUILDDIR)/texinfo info 132 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 133 | 134 | gettext: 135 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 136 | @echo 137 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 138 | 139 | changes: 140 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 141 | @echo 142 | @echo "The overview file is in $(BUILDDIR)/changes." 143 | 144 | linkcheck: 145 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 146 | @echo 147 | @echo "Link check complete; look for any errors in the above output " \ 148 | "or in $(BUILDDIR)/linkcheck/output.txt." 149 | 150 | doctest: 151 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 152 | @echo "Testing of doctests in the sources finished, look at the " \ 153 | "results in $(BUILDDIR)/doctest/output.txt." 
154 | -------------------------------------------------------------------------------- /docs/_static/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/douban/brownant/3c7e6d30f67b8f0f8ca1f823ea3daed74e8725cd/docs/_static/.gitkeep -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | .. _api: 2 | 3 | Basic API 4 | ========= 5 | 6 | The basic API included the application framework and routing system (provided 7 | by :mod:`werkzeug.routing`) of Brownant. 8 | 9 | brownant.app 10 | ------------ 11 | 12 | .. autoclass:: brownant.app.Brownant 13 | :members: 14 | :inherited-members: 15 | 16 | .. autofunction:: brownant.app.redirect 17 | 18 | brownant.request 19 | ---------------- 20 | 21 | .. autoclass:: brownant.request.Request 22 | :members: 23 | :inherited-members: 24 | 25 | brownant.site 26 | ------------- 27 | 28 | .. autoclass:: brownant.site.Site 29 | :members: 30 | :inherited-members: 31 | 32 | brownant.exceptions 33 | ------------------- 34 | 35 | .. autoexception:: brownant.exceptions.BrownantException 36 | 37 | .. autoexception:: brownant.exceptions.NotSupported 38 | :show-inheritance: 39 | 40 | brownant.utils 41 | -------------- 42 | 43 | .. autofunction:: brownant.utils.to_bytes_safe 44 | 45 | Declarative API 46 | =============== 47 | 48 | The declarative API is around the "dinergate" and "pipeline property". 49 | 50 | brownant.dinergate 51 | ------------------ 52 | 53 | .. autoclass:: brownant.dinergate.Dinergate 54 | :members: 55 | :inherited-members: 56 | 57 | .. autoclass:: brownant.dinergate.DinergateType 58 | :show-inheritance: 59 | 60 | brownant.pipeline.base 61 | ---------------------- 62 | 63 | .. autoclass:: brownant.pipeline.base.PipelineProperty 64 | :members: 65 | :inherited-members: 66 | :show-inheritance: 67 | 68 | .. 
method:: provide_value(obj) 69 | 70 | The abstract method which should be implemented by subclasses. It provides 71 | the value expected by us from the subject instance. 72 | 73 | :param obj: the subject instance. 74 | :type obj: :class:`~brownant.dinergate.Dinergate` 75 | 76 | brownant.pipeline.network 77 | ------------------------- 78 | 79 | .. autoclass:: brownant.pipeline.network.URLQueryProperty 80 | :members: 81 | 82 | .. autoclass:: brownant.pipeline.network.TextResponseProperty 83 | :members: 84 | 85 | brownant.pipeline.html 86 | ---------------------- 87 | 88 | .. autoclass:: brownant.pipeline.html.ElementTreeProperty 89 | :members: 90 | 91 | .. autoclass:: brownant.pipeline.html.XPathTextProperty 92 | :members: 93 | -------------------------------------------------------------------------------- /docs/changes.rst: -------------------------------------------------------------------------------- 1 | .. _changes: 2 | 3 | .. include:: ../CHANGES 4 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # brownant documentation build configuration file, created by 4 | # sphinx-quickstart on Sun Sep 29 00:53:05 2013. 5 | # 6 | # This file is execfile()d with the current directory set to its containing dir. 7 | # 8 | # Note that not all possible configuration values are present in this 9 | # autogenerated file. 10 | # 11 | # All configuration values have a default; values that are commented out 12 | # serve to show the default. 13 | 14 | import sys, os 15 | 16 | import alabaster 17 | 18 | # If extensions (or modules to document with autodoc) are in another directory, 19 | # add these directories to sys.path here. If the directory is relative to the 20 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
21 | sys.path.insert(0, os.path.abspath('..')) 22 | 23 | # -- General configuration ----------------------------------------------------- 24 | 25 | # If your documentation needs a minimal Sphinx version, state it here. 26 | #needs_sphinx = '1.0' 27 | 28 | # Add any Sphinx extension module names here, as strings. They can be extensions 29 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 30 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.intersphinx'] 31 | 32 | # Add any paths that contain templates here, relative to this directory. 33 | templates_path = ['_templates'] 34 | 35 | # The suffix of source filenames. 36 | source_suffix = '.rst' 37 | 38 | # The encoding of source files. 39 | #source_encoding = 'utf-8-sig' 40 | 41 | # The master toctree document. 42 | master_doc = 'index' 43 | 44 | # General information about the project. 45 | project = u'Brownant' 46 | copyright = u'2014, Douban Inc.' 47 | 48 | # The version info for the project you're documenting, acts as replacement for 49 | # |version| and |release|, also used in various other places throughout the 50 | # built documents. 51 | # 52 | # The short X.Y version. 53 | version = '0.1.7' 54 | # The full version, including alpha/beta/rc tags. 55 | release = '0.1.7' 56 | 57 | # The language for content autogenerated by Sphinx. Refer to documentation 58 | # for a list of supported languages. 59 | #language = None 60 | 61 | # There are two options for replacing |today|: either, you set today to some 62 | # non-false value, then it is used: 63 | #today = '' 64 | # Else, today_fmt is used as the format for a strftime call. 65 | #today_fmt = '%B %d, %Y' 66 | 67 | # List of patterns, relative to source directory, that match files and 68 | # directories to ignore when looking for source files. 69 | exclude_patterns = ['_build', '_static'] 70 | 71 | # The reST default role (used for this markup: `text`) to use for all documents. 
72 | #default_role = None 73 | 74 | # If true, '()' will be appended to :func: etc. cross-reference text. 75 | #add_function_parentheses = True 76 | 77 | # If true, the current module name will be prepended to all description 78 | # unit titles (such as .. function::). 79 | #add_module_names = True 80 | 81 | # If true, sectionauthor and moduleauthor directives will be shown in the 82 | # output. They are ignored by default. 83 | #show_authors = False 84 | 85 | # The name of the Pygments (syntax highlighting) style to use. 86 | pygments_style = 'sphinx' 87 | 88 | # A list of ignored prefixes for module index sorting. 89 | #modindex_common_prefix = [] 90 | 91 | 92 | # -- Options for HTML output --------------------------------------------------- 93 | 94 | # The theme to use for HTML and HTML Help pages. See the documentation for 95 | # a list of builtin themes. 96 | html_theme = 'alabaster' 97 | 98 | # Theme options are theme-specific and customize the look and feel of a theme 99 | # further. For a list of options available for each theme, see the 100 | # documentation. 101 | html_theme_options = { 102 | 'github_user': 'douban', 103 | 'github_repo': 'brownant', 104 | } 105 | 106 | # Add any paths that contain custom themes here, relative to this directory. 107 | html_theme_path = [alabaster.get_path()] 108 | 109 | # The name for this set of Sphinx documents. If None, it defaults to 110 | # " v documentation". 111 | #html_title = None 112 | 113 | # A shorter title for the navigation bar. Default is the same as html_title. 114 | #html_short_title = None 115 | 116 | # The name of an image file (relative to this directory) to place at the top 117 | # of the sidebar. 118 | #html_logo = None 119 | 120 | # The name of an image file (within the static path) to use as favicon of the 121 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 122 | # pixels large. 
123 | #html_favicon = None 124 | 125 | # Add any paths that contain custom static files (such as style sheets) here, 126 | # relative to this directory. They are copied after the builtin static files, 127 | # so a file named "default.css" will overwrite the builtin "default.css". 128 | html_static_path = ['_static'] 129 | 130 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 131 | # using the given strftime format. 132 | #html_last_updated_fmt = '%b %d, %Y' 133 | 134 | # If true, SmartyPants will be used to convert quotes and dashes to 135 | # typographically correct entities. 136 | #html_use_smartypants = True 137 | 138 | # Custom sidebar templates, maps document names to template names. 139 | html_sidebars = { 140 | '**': [ 141 | 'about.html', 142 | 'localtoc.html', 143 | 'relations.html', 144 | 'sourcelink.html', 145 | 'searchbox.html' 146 | ] 147 | } 148 | 149 | # Additional templates that should be rendered to pages, maps page names to 150 | # template names. 151 | #html_additional_pages = {} 152 | 153 | # If false, no module index is generated. 154 | #html_domain_indices = True 155 | 156 | # If false, no index is generated. 157 | #html_use_index = True 158 | 159 | # If true, the index is split into individual pages for each letter. 160 | #html_split_index = False 161 | 162 | # If true, links to the reST sources are added to the pages. 163 | #html_show_sourcelink = True 164 | 165 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 166 | #html_show_sphinx = True 167 | 168 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 169 | #html_show_copyright = True 170 | 171 | # If true, an OpenSearch description file will be output, and all pages will 172 | # contain a tag referring to it. The value of this option must be the 173 | # base URL from which the finished HTML is served. 174 | #html_use_opensearch = '' 175 | 176 | # This is the file name suffix for HTML files (e.g. 
".xhtml"). 177 | #html_file_suffix = None 178 | 179 | # Output file base name for HTML help builder. 180 | htmlhelp_basename = 'brownantdoc' 181 | 182 | 183 | # -- Options for LaTeX output -------------------------------------------------- 184 | 185 | latex_elements = { 186 | # The paper size ('letterpaper' or 'a4paper'). 187 | #'papersize': 'letterpaper', 188 | 189 | # The font size ('10pt', '11pt' or '12pt'). 190 | #'pointsize': '10pt', 191 | 192 | # Additional stuff for the LaTeX preamble. 193 | #'preamble': '', 194 | } 195 | 196 | # Grouping the document tree into LaTeX files. List of tuples 197 | # (source start file, target name, title, author, documentclass [howto/manual]). 198 | latex_documents = [ 199 | ('index', 'brownant.tex', u'Brownant Documentation', 200 | u'Douban Inc.', 'manual'), 201 | ] 202 | 203 | # The name of an image file (relative to this directory) to place at the top of 204 | # the title page. 205 | #latex_logo = None 206 | 207 | # For "manual" documents, if this is true, then toplevel headings are parts, 208 | # not chapters. 209 | #latex_use_parts = False 210 | 211 | # If true, show page references after internal links. 212 | #latex_show_pagerefs = False 213 | 214 | # If true, show URL addresses after external links. 215 | #latex_show_urls = False 216 | 217 | # Documents to append as an appendix to all manuals. 218 | #latex_appendices = [] 219 | 220 | # If false, no module index is generated. 221 | #latex_domain_indices = True 222 | 223 | 224 | # -- Options for manual page output -------------------------------------------- 225 | 226 | # One entry per manual page. List of tuples 227 | # (source start file, name, description, authors, manual section). 228 | man_pages = [ 229 | ('index', 'brownant', u'Brownant Documentation', 230 | [u'Douban Inc.'], 1) 231 | ] 232 | 233 | # If true, show URL addresses after external links. 
234 | #man_show_urls = False 235 | 236 | 237 | # -- Options for Texinfo output ------------------------------------------------ 238 | 239 | # Grouping the document tree into Texinfo files. List of tuples 240 | # (source start file, target name, title, author, 241 | # dir menu entry, description, category) 242 | texinfo_documents = [ 243 | ('index', 'brownant', u'Brownant Documentation', 244 | u'Douban Inc.', 'brownant', 'One line description of project.', 245 | 'Miscellaneous'), 246 | ] 247 | 248 | # Documents to append as an appendix to all manuals. 249 | #texinfo_appendices = [] 250 | 251 | # If false, no module index is generated. 252 | #texinfo_domain_indices = True 253 | 254 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 255 | #texinfo_show_urls = 'footnote' 256 | 257 | 258 | # -- Options for Epub output --------------------------------------------------- 259 | 260 | # Bibliographic Dublin Core info. 261 | epub_title = u'Brownant' 262 | epub_author = u'Douban Inc.' 263 | epub_publisher = u'Douban Inc.' 264 | epub_copyright = u'2014, Douban Inc.' 265 | 266 | # The language of the text. It defaults to the language option 267 | # or en if the language is not set. 268 | #epub_language = '' 269 | 270 | # The scheme of the identifier. Typical schemes are ISBN or URL. 271 | #epub_scheme = '' 272 | 273 | # The unique identifier of the text. This can be a ISBN number 274 | # or the project homepage. 275 | #epub_identifier = '' 276 | 277 | # A unique identification for the text. 278 | #epub_uid = '' 279 | 280 | # A tuple containing the cover image and cover page html template filenames. 281 | #epub_cover = () 282 | 283 | # HTML files that should be inserted before the pages created by sphinx. 284 | # The format is a list of tuples containing the path and title. 285 | #epub_pre_files = [] 286 | 287 | # HTML files that should be inserted after the pages created by sphinx. 288 | # The format is a list of tuples containing the path and title. 
289 | #epub_post_files = [] 290 | 291 | # A list of files that should not be packed into the epub file. 292 | #epub_exclude_files = [] 293 | 294 | # The depth of the table of contents in toc.ncx. 295 | #epub_tocdepth = 3 296 | 297 | # Allow duplicate toc entries. 298 | #epub_tocdup = True 299 | 300 | 301 | # Example configuration for intersphinx: refer to the Python standard library. 302 | intersphinx_mapping = { 303 | 'http://docs.python.org/dev': None, 304 | 'http://docs.python-requests.org/en/latest/': None, 305 | 'http://werkzeug.pocoo.org/docs/': None, 306 | 'http://flask.pocoo.org/docs/': None, 307 | } 308 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to Brownant 2 | =================== 3 | 4 | Brownant is a lightweight web data extracting framework. 5 | 6 | 7 | User's Guide 8 | ------------ 9 | 10 | .. toctree:: 11 | :maxdepth: 2 12 | 13 | userguide/introduction 14 | 15 | .. toctree:: 16 | :maxdepth: 2 17 | 18 | userguide/quickstart 19 | 20 | 21 | API Reference 22 | ------------- 23 | 24 | .. toctree:: 25 | :maxdepth: 2 26 | 27 | api 28 | 29 | 30 | Release Changes 31 | --------------- 32 | 33 | .. toctree:: 34 | :maxdepth: 2 35 | 36 | changes 37 | 38 | 39 | Author & Contributor 40 | -------------------- 41 | 42 | .. 
include:: ../AUTHORS 43 | :start-line: 2 44 | 45 | 46 | Indices and tables 47 | ================== 48 | 49 | * :ref:`genindex` 50 | * :ref:`modindex` 51 | * :ref:`search` 52 | 53 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | alabaster==0.4.1 2 | Sphinx==1.2.2 3 | -------------------------------------------------------------------------------- /docs/userguide/introduction.rst: -------------------------------------------------------------------------------- 1 | .. _introduction: 2 | 3 | Introduction 4 | ============ 5 | 6 | .. include:: ../../README.rst 7 | :start-line: 5 8 | :end-line: 36 9 | -------------------------------------------------------------------------------- /docs/userguide/quickstart.rst: -------------------------------------------------------------------------------- 1 | .. _quickstart: 2 | 3 | Quick Start 4 | =========== 5 | 6 | There are some simple examples built with Brownant. 7 | 8 | 9 | The Minimal Demo 10 | ---------------- 11 | 12 | This demo could get the download link from the PyPI home page of given 13 | project. 14 | 15 | .. 
code-block:: python 16 | 17 | # example.py 18 | from brownant import Brownant, Site 19 | from lxml import html 20 | from requests import Session 21 | 22 | site = Site(name="pypi") 23 | http = Session() 24 | 25 | 26 | @site.route("pypi.python.org", "/pypi/<name>", defaults={"version": None}) 27 | @site.route("pypi.python.org", "/pypi/<name>/<version>") 28 | def pypi_info(request, name, version): 29 | url = request.url.geturl() 30 | etree = html.fromstring(http.get(url).content) 31 | download_url = etree.xpath(".//div[@id='download-button']/a/@href")[0] 32 | 33 | return {"name": name, "version": version, "download_url": download_url} 34 | 35 | app = Brownant() 36 | app.mount_site(site) 37 | 38 | if __name__ == "__main__": 39 | from pprint import pprint 40 | pprint(app.dispatch_url("https://pypi.python.org/pypi/Werkzeug/0.9.4")) 41 | 42 | And run it, we will get the output:: 43 | 44 | $ python example.py 45 | {'download_url': 'https://.../source/W/Werkzeug/Werkzeug-0.9.4.tar.gz', 46 | 'name': u'Werkzeug', 47 | 'version': u'0.9.4'} 48 | 49 | 50 | The Declarative Demo 51 | -------------------- 52 | 53 | With the declarative usage, the workflow will be flexible and readable. 54 | 55 | First, we define the "dinergate" in a site supported module: 56 | 57 | .. 
code-block:: python 58 | 59 | # sites/pypi.py 60 | from brownant.site import Site 61 | from brownant.dinergate import Dinergate 62 | from brownant.pipeline.network import TextResponseProperty 63 | from brownant.pipeline.html import ElementTreeProperty, XPathTextProperty 64 | 65 | site = Site(name="pypi") 66 | 67 | 68 | @site.route("pypi.python.org", "/pypi/<name>/<version>") 69 | class PythonPackageInfo(Dinergate): 70 | 71 | URL_TEMPLATE = "http://pypi.python.org/pypi/{self.name}/{self.version}" 72 | 73 | text_response = TextResponseProperty() 74 | etree = ElementTreeProperty() 75 | download_url = XPathTextProperty( 76 | xpath=".//div[@id='download-button']/a/@href", 77 | strip_spaces=True, pick_mode="first") 78 | 79 | @property 80 | def info(self): 81 | return {"name": self.name, "version": self.version, 82 | "download_url": self.download_url} 83 | 84 | And then we define an application instance and mount the site. 85 | 86 | .. code-block:: python 87 | 88 | # app.py 89 | from brownant import Brownant 90 | 91 | app = Brownant() 92 | app.mount_site("sites.pypi:site") 93 | 94 | 95 | if __name__ == "__main__": 96 | from pprint import pprint 97 | pkg = app.dispatch_url("https://pypi.python.org/pypi/Werkzeug/0.9.4") 98 | pprint(pkg.info) 99 | 100 | And run it, we will get the same output. 
"""Packaging script for brownant."""
import io
from setuptools import setup, find_packages
from os.path import dirname, realpath, join

current_dir = dirname(realpath(__file__))

# Decode the README explicitly instead of relying on the locale's default
# encoding; ``io.open`` accepts ``encoding`` on both Python 2 and Python 3.
# NOTE(review): assumes README.rst is stored as UTF-8 (or plain ASCII).
readme_path = join(current_dir, "README.rst")
with io.open(readme_path, encoding="utf-8") as long_description_file:
    long_description = long_description_file.read()

install_requires = [
    "Werkzeug >= 0.8",
    "requests >= 1.0",
    "lxml >= 3.7.3",
    "six",
]

setup(
    name="brownant",
    packages=find_packages(exclude=["tests", "docs"]),
    version="0.1.7",
    description="A lightweight web data extracting framework.",
    long_description=long_description,
    author="Subject-Dev Team, Douban Inc.",
    author_email="subject-dev@douban.com",
    url="https://github.com/douban/brownant",
    license="BSD",
    keywords=["extract", "web data"],
    classifiers=[
        "Programming Language :: Python",
        "Programming Language :: Python :: 2.7",
        "Programming Language :: Python :: 3.3",
        "Programming Language :: Python :: 3.4",
        # 3.5 and 3.6 are declared as supported by the 0.1.7 changelog and
        # are part of the CI matrix.
        "Programming Language :: Python :: 3.5",
        "Programming Language :: Python :: 3.6",
        "Programming Language :: Python :: Implementation :: PyPy",
        "Development Status :: 3 - Alpha",
        "License :: OSI Approved :: BSD License",
        "Intended Audience :: Developers",
        "Operating System :: OS Independent",
        "Environment :: Other Environment",
        "Topic :: Software Development :: Libraries :: Python Modules",
    ],
    install_requires=install_requires,
)
class StubEndpoint(object):
    """Minimal endpoint stub which keeps the request and id it was given."""

    # Dotted path of this class inside the current module.
    name = "{0}.{1}".format(__name__, "StubEndpoint")

    def __init__(self, request, id_, **kwargs):
        # Any extra keyword arguments are accepted and ignored.
        self.id_, self.request = id_, request
@fixture
def app():
    """Return a fresh :class:`Brownant` app with two item routes registered.

    Both rules capture the last path segment as ``id_``, the argument that
    ``StubEndpoint.__init__`` requires.
    """
    _app = Brownant()
    # ``int`` converter: the tests compare ``stub.id_`` against integers.
    _app.add_url_rule("m.example.com", "/item/<int:id_>", StubEndpoint.name)
    # Plain string segment: this host is dispatched with non-numeric ids.
    _app.add_url_rule("m.example.co.jp", "/item/<id_>", StubEndpoint.name)
    return _app
StubEndpoint.name, 86 | redirect_to="item/42") 87 | 88 | stub = app.dispatch_url("http://m.example.com/item/42/?page=6") 89 | assert stub.id_ == 42 90 | assert stub.request.args.get("page", type=int) == 6 91 | 92 | stub = app.dispatch_url("http://m.example.com/42?page=6") 93 | assert stub.id_ == 42 94 | assert stub.request.args.get("page", type=int) == 6 95 | 96 | stub = app.dispatch_url("http://m.example.com/item/42/") 97 | assert stub.id_ == 42 98 | with raises(KeyError): 99 | stub.request.args["page"] 100 | 101 | stub = app.dispatch_url("http://m.example.com/42") 102 | assert stub.id_ == 42 103 | with raises(KeyError): 104 | stub.request.args["page"] 105 | 106 | 107 | def test_match_url_and_handle_user_redirect(app): 108 | domain = "redirect.example.com" 109 | app.add_url_rule(domain, "/", redirect_endpoint.__qualname__) 110 | 111 | kwargs, request = app.dispatch_url("http://{0}/123?id=5".format(domain)) 112 | assert kwargs == {"id": "123"} 113 | assert request.args["id"] == "5" 114 | 115 | kwargs, request = app.dispatch_url("http://{0}/1?id=5&r=1".format(domain)) 116 | assert kwargs == {"id": "42"} 117 | assert request.args["id"] == "24" 118 | 119 | 120 | def test_match_non_ascii_url(app): 121 | url = u"http://m.example.co.jp/item/\u30de\u30a4\u30f3\u30c9" 122 | stub = app.dispatch_url(url) 123 | 124 | encoded_path = "/item/%E3%83%9E%E3%82%A4%E3%83%B3%E3%83%89" 125 | assert stub.request.url.scheme == "http" 126 | assert stub.request.url.hostname == "m.example.co.jp" 127 | assert stub.request.url.path == encoded_path 128 | 129 | 130 | def test_match_non_ascii_query(app): 131 | url = u"http://m.example.co.jp/item/test?src=\u63a2\u9669&r=1" 132 | stub = app.dispatch_url(url) 133 | 134 | assert stub.request.url.scheme == "http" 135 | assert stub.request.url.hostname == "m.example.co.jp" 136 | assert stub.request.url.path == "/item/test" 137 | assert stub.request.url.query == "src=%E6%8E%A2%E9%99%A9&r=1" 138 | 139 | assert set(stub.request.args) == {"src", "r"} 140 
| assert stub.request.args["src"] == u"\u63a2\u9669" 141 | assert stub.request.args["r"] == "1" 142 | 143 | 144 | def test_match_unexcepted_url(app): 145 | unexcepted_url = "http://m.example.com/category/19352" 146 | 147 | with raises(NotSupported) as error: 148 | app.dispatch_url(unexcepted_url) 149 | 150 | # ensure the exception information is useful 151 | assert unexcepted_url in str(error) 152 | 153 | # ensure the rule could be added in runtime 154 | app.add_url_rule("m.example.com", "/category/", StubEndpoint.name) 155 | stub = app.dispatch_url(unexcepted_url) 156 | assert stub.id_ == 19352 157 | assert len(stub.request.args) == 0 158 | 159 | 160 | def test_match_invalid_url(app): 161 | # empty string 162 | with raises(NotSupported) as error: 163 | app.dispatch_url("") 164 | assert "invalid" in str(error) 165 | 166 | # has not hostname 167 | with raises(NotSupported) as error: 168 | app.dispatch_url("/") 169 | assert "invalid" in str(error) 170 | 171 | # has not hostname and path 172 | with raises(NotSupported) as error: 173 | app.dispatch_url("\\") 174 | assert "invalid" in str(error) 175 | 176 | # not http scheme 177 | with raises(NotSupported) as error: 178 | app.dispatch_url("ftp://example.com") 179 | assert "invalid" in str(error) 180 | 181 | # valid input 182 | with raises(NotSupported) as error: 183 | app.dispatch_url("http://example.com") 184 | assert "invalid" not in str(error) 185 | 186 | with raises(NotSupported) as error: 187 | app.dispatch_url("https://example.com") 188 | assert "invalid" not in str(error) 189 | 190 | 191 | foo_site = object() 192 | 193 | 194 | def test_mount_site(app): 195 | foo_site_name = __name__ + ".foo_site" 196 | with patch(foo_site_name): 197 | app.mount_site(foo_site) 198 | foo_site.play_actions.assert_called_with(target=app) 199 | 200 | 201 | def test_mount_site_by_string_name(app): 202 | foo_site_name = __name__ + ".foo_site" 203 | with patch(foo_site_name): 204 | app.mount_site(foo_site_name) 205 | 
foo_site.play_actions.assert_called_with(target=app) 206 | -------------------------------------------------------------------------------- /tests/test_deprecation.py: -------------------------------------------------------------------------------- 1 | from brownant import Brownant, BrownAnt 2 | 3 | 4 | def test_deprecation(recwarn): 5 | app = BrownAnt() 6 | warning = recwarn.pop(DeprecationWarning) 7 | 8 | assert isinstance(app, Brownant) 9 | assert issubclass(warning.category, DeprecationWarning) 10 | assert "Brownant" in str(warning.message) 11 | assert "app.py" in warning.filename 12 | assert warning.lineno 13 | -------------------------------------------------------------------------------- /tests/test_dinergate.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, unicode_literals 2 | 3 | from mock import Mock 4 | from pytest import raises 5 | 6 | from brownant import Dinergate 7 | 8 | 9 | def test_basic(): 10 | from requests import Session 11 | from werkzeug.utils import cached_property 12 | 13 | @cached_property 14 | def func_without_name(self): 15 | return [self] 16 | func_without_name.__name__ = None 17 | 18 | class FooDinergate(Dinergate): 19 | bar = func_without_name 20 | 21 | assert FooDinergate.bar.__name__ == "bar" 22 | 23 | mock_request = Mock() 24 | ant = FooDinergate(mock_request) 25 | 26 | assert ant.request is mock_request 27 | assert isinstance(ant.http_client, Session) 28 | assert ant.bar == [ant] 29 | 30 | 31 | def test_custom_kwargs(): 32 | mock_request = Mock() 33 | ant = Dinergate(mock_request, foo=42, bar="hello") 34 | assert ant.foo == 42 35 | assert ant.bar == "hello" 36 | 37 | 38 | def test_custom_http_client(): 39 | mock_request = Mock() 40 | mock_http_client = Mock() 41 | ant = Dinergate(mock_request, mock_http_client) 42 | 43 | ant.request.args.get("name", type=str) 44 | mock_request.args.get.assert_called_once_with("name", type=str) 45 | 46 | 
ant.http_client.post("http://example.com") 47 | mock_http_client.post.assert_called_once_with("http://example.com") 48 | 49 | 50 | def test_url_template(): 51 | class FooDinergate(Dinergate): 52 | foo = 42 53 | bar = "page" 54 | 55 | URL_TEMPLATE = "http://example.com/{self.bar}/{self.foo}" 56 | 57 | ant = FooDinergate(request=Mock(), http_client=Mock()) 58 | assert ant.url == "http://example.com/page/42" 59 | 60 | dead_ant = Dinergate(request=Mock(), http_client=Mock()) 61 | with raises(NotImplementedError): 62 | dead_ant.url 63 | -------------------------------------------------------------------------------- /tests/test_pipeline/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/douban/brownant/3c7e6d30f67b8f0f8ca1f823ea3daed74e8725cd/tests/test_pipeline/__init__.py -------------------------------------------------------------------------------- /tests/test_pipeline/test_base.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, unicode_literals 2 | 3 | from pytest import raises 4 | 5 | from brownant.pipeline.base import PipelineProperty 6 | 7 | 8 | def test_required_attrs(): 9 | class SpamProperty(PipelineProperty): 10 | required_attrs = {"egg"} 11 | 12 | def provide_value(self, obj): 13 | return obj 14 | 15 | # valid 16 | spam_property = SpamProperty(egg=42) 17 | assert spam_property.egg == 42 18 | assert "egg" not in spam_property.options 19 | assert "egg" not in spam_property.attr_names 20 | with raises(AttributeError): 21 | spam_property.foo 22 | 23 | # invalid 24 | with raises(TypeError) as excinfo: 25 | spam_property = SpamProperty(spam=42) 26 | assert "egg" in repr(excinfo.value) 27 | 28 | 29 | def test_attr_name(): 30 | class SpamProperty(PipelineProperty): 31 | def prepare(self): 32 | self.attr_names.setdefault("egg_attr", "egg") 33 | 34 | def provide_value(self, obj): 35 | return self.get_attr(obj, 
"egg_attr") 36 | 37 | class Spam(object): 38 | def __init__(self, **kwargs): 39 | vars(self).update(kwargs) 40 | 41 | spam_a = SpamProperty(egg=42) 42 | assert spam_a.attr_names["egg_attr"] == "egg" 43 | assert spam_a.provide_value(Spam(egg=1024)) == 1024 44 | 45 | spam_b = SpamProperty(egg=42, egg_attr="foo_egg") 46 | assert spam_b.attr_names["egg_attr"] == "foo_egg" 47 | assert spam_b.provide_value(Spam(foo_egg=2048)) == 2048 48 | 49 | 50 | def test_optional_attr(): 51 | class SpamProperty(PipelineProperty): 52 | required_attrs = {"egg"} 53 | 54 | def provide_value(self, obj): 55 | return obj 56 | 57 | spam = SpamProperty(egg=41, foo=42, bar=43, aha_attr=44) 58 | assert spam.options["foo"] == 42 59 | assert spam.options["bar"] == 43 60 | assert "egg" not in spam.options 61 | assert "aha_attr" not in spam.options 62 | -------------------------------------------------------------------------------- /tests/test_pipeline/test_html.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, unicode_literals 2 | 3 | from pytest import raises 4 | from mock import patch, Mock 5 | 6 | from brownant.pipeline.html import ElementTreeProperty, XPathTextProperty 7 | 8 | 9 | # ElementTreeProperty 10 | 11 | def test_etree_default_attr_name(): 12 | etree = ElementTreeProperty() 13 | assert etree.attr_names["text_response_attr"] == "text_response" 14 | 15 | 16 | def test_etree_default_encoding_show_be_none(): 17 | etree = ElementTreeProperty() 18 | assert etree.options["encoding"] is None 19 | 20 | 21 | @patch("lxml.html.fromstring") 22 | def test_etree_general_parse_with_default(fromstring): 23 | mock = Mock() 24 | etree = ElementTreeProperty() 25 | etree.provide_value(mock) 26 | fromstring.assert_called_once_with(mock.text_response) 27 | 28 | 29 | @patch("lxml.html.fromstring") 30 | def test_etree_general(fromstring): 31 | mock = Mock() 32 | etree = ElementTreeProperty(text_response_attr="foo") 33 | 
etree.provide_value(mock) 34 | fromstring.assert_called_once_with(mock.foo) 35 | 36 | 37 | @patch("lxml.html.fromstring") 38 | def test_etree_general_parse_with_encoding(fromstring): 39 | mock = Mock() 40 | etree = ElementTreeProperty(text_response_attr="foo", 41 | encoding="utf-8") 42 | etree.provide_value(mock) 43 | fromstring.assert_called_once_with(mock.foo.encode("utf-8")) 44 | 45 | 46 | # XPathTextProperty 47 | 48 | def test_xpath_default_attr_name(): 49 | with raises(TypeError): 50 | XPathTextProperty() 51 | 52 | text = XPathTextProperty(xpath="//path") 53 | assert text.xpath == "//path" 54 | assert text.attr_names["etree_attr"] == "etree" 55 | assert text.options["strip_spaces"] is False 56 | assert text.options["pick_mode"] == "join" 57 | assert text.options["joiner"] == " " 58 | 59 | 60 | def test_xpath_without_spaces(): 61 | mock = Mock() 62 | mock.tree.xpath.return_value = ["a", "b", "c"] 63 | 64 | # pick_mode: join 65 | text = XPathTextProperty(xpath="//path", etree_attr="tree", 66 | pick_mode="join", joiner="|") 67 | rv = text.provide_value(mock) 68 | mock.tree.xpath.assert_called_with("//path") 69 | assert rv == "a|b|c" 70 | 71 | # pick_mode: first 72 | text = XPathTextProperty(xpath="//another-path", etree_attr="tree", 73 | pick_mode="first") 74 | rv = text.provide_value(mock) 75 | mock.tree.xpath.assert_called_with("//another-path") 76 | assert rv == "a" 77 | 78 | 79 | def test_xpath_with_striping_spaces(): 80 | mock = Mock() 81 | mock.tree.xpath.return_value = [" a ", "\n b \n", "\n\n c \t"] 82 | 83 | # strip_spaces and join 84 | text = XPathTextProperty(xpath="//foo-path", etree_attr="tree", 85 | pick_mode="join", strip_spaces=True) 86 | rv = text.provide_value(mock) 87 | mock.tree.xpath.assert_called_with("//foo-path") 88 | assert rv == "a b c" 89 | 90 | # strip_spaces and first 91 | text = XPathTextProperty(xpath="//bar-path", etree_attr="tree", 92 | pick_mode="first", strip_spaces=True) 93 | rv = text.provide_value(mock) 94 | 
mock.tree.xpath.assert_called_with("//bar-path") 95 | assert rv == "a" 96 | 97 | 98 | def test_xpath_keep_pick_mode(): 99 | mock = Mock() 100 | value = ['a', 'b', 'c'] 101 | mock.tree.xpath.return_value = value 102 | 103 | text = XPathTextProperty(xpath="//foo-path", etree_attr="tree", 104 | pick_mode="keep") 105 | rv = text.provide_value(mock) 106 | mock.tree.xpath.assert_called_with("//foo-path") 107 | assert rv == value 108 | 109 | 110 | def test_xpath_invalid_pick_mode(): 111 | with raises(ValueError) as excinfo: 112 | text = XPathTextProperty(xpath="//foo-path", pick_mode="unknown") 113 | text.provide_value(Mock()) 114 | assert "unknown" in repr(excinfo.value) 115 | -------------------------------------------------------------------------------- /tests/test_pipeline/test_network.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, unicode_literals 2 | 3 | from mock import Mock, patch 4 | from pytest import raises 5 | 6 | from brownant.exceptions import NotSupported 7 | from brownant.pipeline.network import (HTTPClientProperty, URLQueryProperty, 8 | TextResponseProperty, ResponseProperty, 9 | JSONResponseProperty) 10 | 11 | 12 | def test_http_client(): 13 | dinergate = Mock() 14 | with patch("requests.Session") as Session: 15 | instance = Session.return_value 16 | http_client = HTTPClientProperty(session_class=Session) 17 | assert http_client.provide_value(dinergate) is instance 18 | Session.assert_called_once_with() 19 | 20 | 21 | def test_url_query(): 22 | mock = Mock() 23 | mock.request.args.get.return_value = "42" 24 | 25 | url_query = URLQueryProperty(name="value") 26 | rv = url_query.provide_value(mock) 27 | 28 | assert rv == "42" 29 | mock.request.args.get.assert_called_once_with("value", type=None) 30 | 31 | 32 | def test_url_query_type(): 33 | mock = Mock() 34 | mock.request.args.get.return_value = 42 35 | 36 | url_query = URLQueryProperty(name="value", type=int) 37 | rv = 
url_query.provide_value(mock) 38 | 39 | assert rv == 42 40 | mock.request.args.get.assert_called_once_with("value", type=int) 41 | 42 | 43 | def test_url_query_required(): 44 | mock = Mock() 45 | mock.request.args.get.return_value = None 46 | 47 | url_query = URLQueryProperty(name="value") # default be required 48 | with raises(NotSupported): 49 | url_query.provide_value(mock) 50 | 51 | 52 | def test_url_query_optional(): 53 | mock = Mock() 54 | mock.request.args.get.return_value = None 55 | 56 | url_query = URLQueryProperty(name="d", type=float, required=False) 57 | rv = url_query.provide_value(mock) 58 | 59 | assert rv is None 60 | mock.request.args.get.assert_called_once_with("d", type=float) 61 | 62 | 63 | def test_url_query_required_boundary_condition(): 64 | mock = Mock() 65 | mock.request.args.get.return_value = 0 66 | 67 | url_query = URLQueryProperty(name="num") 68 | rv = url_query.provide_value(mock) 69 | 70 | assert rv == 0 71 | mock.request.args.get.assert_called_once_with("num", type=None) 72 | 73 | 74 | def test_base_response(): 75 | response = Mock() 76 | response.text = "OK" 77 | 78 | mock = Mock() 79 | mock.url = "http://example.com" 80 | mock.http_client.request.return_value = response 81 | 82 | response = ResponseProperty() 83 | with raises(KeyError): 84 | response.provide_value(mock) 85 | 86 | 87 | def test_text_response(): 88 | class HTTPError(Exception): 89 | pass 90 | 91 | response = Mock() 92 | response.content = "OK" 93 | response.raise_for_status.side_effect = [None, HTTPError()] 94 | 95 | mock = Mock() 96 | mock.url = "http://example.com" 97 | mock.http_client.request.return_value = response 98 | 99 | text = TextResponseProperty(method="POST") 100 | rv = text.provide_value(mock) 101 | 102 | assert rv == "OK" 103 | response.raise_for_status.assert_called_once_with() 104 | mock.http_client.request.assert_called_once_with( 105 | method="POST", url="http://example.com") 106 | 107 | with raises(HTTPError): 108 | text.provide_value(mock) 109 | 
110 | 111 | def test_json_response(): 112 | class HTTPError(Exception): 113 | pass 114 | 115 | response = Mock() 116 | response.json.return_value = {'a': 1, 'b': {'c': 2, 'd': 3}} 117 | response.raise_for_status.side_effect = [None, HTTPError()] 118 | 119 | mock = Mock() 120 | mock.url = "http://example.com" 121 | mock.http_client.request.return_value = response 122 | 123 | json = JSONResponseProperty(method="POST") 124 | rv = json.provide_value(mock) 125 | 126 | assert rv == { 127 | 'a': 1, 128 | 'b': { 129 | 'c': 2, 130 | 'd': 3 131 | } 132 | } 133 | response.raise_for_status.assert_called_once_with() 134 | mock.http_client.request.assert_called_once_with( 135 | method="POST", url="http://example.com") 136 | 137 | with raises(HTTPError): 138 | json.provide_value(mock) 139 | -------------------------------------------------------------------------------- /tests/test_site.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, unicode_literals 2 | 3 | from pytest import fixture 4 | from mock import Mock 5 | 6 | from brownant import Site 7 | 8 | 9 | @fixture 10 | def sites(): 11 | _sites = { 12 | "s1": Site("s1"), 13 | "s2": Site("s2"), 14 | "s3": Site("s3"), 15 | } 16 | return _sites 17 | 18 | 19 | def test_new_site(sites): 20 | assert sites["s1"].name == "s1" 21 | assert sites["s2"].name == "s2" 22 | assert sites["s3"].name == "s3" 23 | 24 | assert sites["s1"].actions == [] 25 | assert sites["s2"].actions == [] 26 | assert sites["s3"].actions == [] 27 | 28 | 29 | def test_record_and_play_actions(sites): 30 | site = sites["s1"] 31 | 32 | mock = Mock() 33 | site.record_action("method_a", 10, "s", is_it=True) 34 | site.play_actions(target=mock) 35 | mock.method_a.assert_called_once_with(10, "s", is_it=True) 36 | 37 | 38 | def test_route(sites): 39 | site = sites["s1"] 40 | 41 | @site.route("m.example.com", "/article/") 42 | def handler(request, article_id): 43 | pass 44 | 45 | mock = Mock() 46 | 
site.play_actions(target=mock) 47 | mock.add_url_rule.assert_called_once_with( 48 | "m.example.com", 49 | "/article/", 50 | __name__ + ":handler" 51 | ) 52 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | from pytest import raises 2 | 3 | from brownant.utils import to_bytes_safe 4 | 5 | 6 | UNICODE_STRING_SAMPLE = u"\u5b89\u5168 SAFE" 7 | BYTES_SEQUENCE_SAMPLE = b"\xe5\xae\x89\xe5\x85\xa8 SAFE" 8 | 9 | 10 | def test_to_bytes_safe(): 11 | assert to_bytes_safe(UNICODE_STRING_SAMPLE) == BYTES_SEQUENCE_SAMPLE 12 | assert to_bytes_safe(BYTES_SEQUENCE_SAMPLE) == BYTES_SEQUENCE_SAMPLE 13 | assert to_bytes_safe(u"ABC") == b"ABC" 14 | assert to_bytes_safe(b"ABC") == b"ABC" 15 | 16 | assert type(to_bytes_safe(UNICODE_STRING_SAMPLE)) is bytes 17 | assert type(to_bytes_safe(BYTES_SEQUENCE_SAMPLE)) is bytes 18 | assert type(to_bytes_safe(u"ABC")) is bytes 19 | assert type(to_bytes_safe(b"ABC")) is bytes 20 | 21 | with raises(TypeError): 22 | to_bytes_safe(42) 23 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py27,py33,py34,pypy 3 | [testenv] 4 | deps = 5 | pytest 6 | pytest-cov 7 | pytest-pep8 8 | mock 9 | commands = 10 | py.test 11 | --------------------------------------------------------------------------------