├── .bumpversion.cfg
├── .editorconfig
├── .gitignore
├── .travis.yml
├── AUTHORS
├── CHANGES
├── LICENSE
├── MANIFEST.in
├── README.rst
├── brownant
│   ├── __init__.py
│   ├── app.py
│   ├── dinergate.py
│   ├── exceptions.py
│   ├── pipeline
│   │   ├── __init__.py
│   │   ├── base.py
│   │   ├── html.py
│   │   └── network.py
│   ├── request.py
│   ├── site.py
│   └── utils.py
├── docs
│   ├── Makefile
│   ├── _static
│   │   └── .gitkeep
│   ├── api.rst
│   ├── changes.rst
│   ├── conf.py
│   ├── index.rst
│   ├── requirements.txt
│   └── userguide
│       ├── introduction.rst
│       └── quickstart.rst
├── setup.cfg
├── setup.py
├── tests
│   ├── __init__.py
│   ├── test_app.py
│   ├── test_deprecation.py
│   ├── test_dinergate.py
│   ├── test_pipeline
│   │   ├── __init__.py
│   │   ├── test_base.py
│   │   ├── test_html.py
│   │   └── test_network.py
│   ├── test_site.py
│   └── test_utils.py
└── tox.ini
/.bumpversion.cfg:
--------------------------------------------------------------------------------
1 | [bumpversion]
2 | files = setup.py docs/conf.py brownant/__init__.py
3 | commit = True
4 | tag = False
5 | current_version = 0.1.7
6 |
7 |
--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
1 | # http://editorconfig.org
2 |
3 | root = true
4 |
5 | [*.py]
6 | indent_style = space
7 | indent_size = 4
8 | charset = utf-8
9 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.py[cod]
2 |
3 | # C extensions
4 | *.so
5 |
6 | # Packages
7 | *.egg
8 | *.egg-info
9 | dist
10 | build
11 | eggs
12 | parts
13 | bin
14 | var
15 | sdist
16 | develop-eggs
17 | .installed.cfg
18 | lib
19 | lib64
20 | __pycache__
21 |
22 | # Installer logs
23 | pip-log.txt
24 |
25 | # Unit test / coverage reports
26 | .coverage
27 | .tox
28 | .cache
29 |
30 | # Translations
31 | *.mo
32 |
33 | # Editor
34 | *.sw[po]
35 |
36 | # Sphinx
37 | docs/_build
38 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 | - "2.7"
4 | - "3.5"
5 | - "3.6"
6 | - "pypy"
7 | install:
8 | - "pip install ."
9 | - "pip install pytest>=2.4.2 -U"
10 | - "pip install pytest-cov pytest-pep8 mock coveralls"
11 | script: "py.test"
12 | after_success:
13 | coveralls
14 | branches:
15 | only:
16 | - master
17 | - develop
18 |
--------------------------------------------------------------------------------
/AUTHORS:
--------------------------------------------------------------------------------
1 | Authors & Contributors
2 | ======================
3 |
4 | - Jiangge Zhang https://github.com/tonyseek
5 | - VeryCB https://github.com/VeryCB
6 | - dongweiming https://github.com/dongweiming
7 | - Caratpine https://github.com/Caratpine
8 |
--------------------------------------------------------------------------------
/CHANGES:
--------------------------------------------------------------------------------
1 | Release 0.1.7 (Mar 3, 2017)
2 | ===========================
3 |
4 | - Add support for lxml >= 3.7.3. (by Caratpine)
5 | - Compatible with pip > 7.1.2 and support Python 3.5/3.6. (by dongweiming)
6 |
7 | Release 0.1.6 (Jul 9, 2015)
8 | ===========================
9 |
10 | - Add JSONResponseProperty to support parsing JSON response. (by dongweiming)
11 |
12 |
13 | Release 0.1.5 (Apr 8, 2014)
14 | ===========================
15 |
16 | Some APIs will be changed without backward compatibility in the next major release.
17 |
18 | - Add support for redirecting while executing the request handler.
19 | - Add support for running on Python 3.4 without any modification.
20 | - Refactor the ``http_client`` into a pipeline property.
21 | - Remove the magic arguments of ``http_client``
22 | - Expose the classes in the top-level package. We can import all from
23 | ``brownant`` and ``brownant.pipeline`` now.
24 | - Rename the ``BrownAnt`` into ``Brownant``. The ``BrownAnt`` is still usable
25 | but not recommended.
26 | - Fix the unicode error for URLs which include a non-ascii query.
27 | - Fix the lxml compatibility problem with PyPy.
28 |
29 |
30 | Release 0.1.4 (Oct 24, 2013)
31 | ============================
32 |
33 | - Fix the RequestRedirect raised problem.
34 | - Add the new pick mode "keep" of XPathTextProperty. (by VeryCB)
35 | - Add the encoding parameter of the ElementTreeProperty. That could let the
36 | property provide bytes instead of unicode string. (by VeryCB)
37 |
38 |
39 | Release 0.1.3 (Oct 19, 2013)
40 | ============================
41 |
42 | - Fix the broken CI (travis-ci).
43 |
44 |
45 | Release 0.1.2 (Oct 19, 2013)
46 | ============================
47 |
48 | - Fix some unicode compatibility problems for URL strings.
49 | - Prevent the invalid URL string input.
50 | - Change the theme of the document to the built-in one named "nature".
51 |
52 |
53 | Release 0.1.1 (Sep 30, 2013)
54 | ============================
55 |
56 | - Refine the documents and give an example in the Quick Start section.
57 |
58 |
59 | Release 0.1.0 (Sep 29, 2013)
60 | ============================
61 |
62 | - First public release.
63 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2014, Douban Inc.
2 | All rights reserved.
3 |
4 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
5 |
6 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
7 |
8 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
9 |
10 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
11 |
12 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
13 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include AUTHORS CHANGES LICENSE README.rst tox.ini
2 | recursive-include tests *
3 | recursive-include docs *
4 | recursive-exclude docs *.pyc
5 | recursive-exclude docs *.pyo
6 | recursive-exclude tests *.pyc
7 | recursive-exclude tests *.pyo
8 | prune docs/_build
9 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | |Build Status| |Coverage Status| |PyPI Version| |PyPI Downloads| |Wheel Status|
2 |
3 | Brownant
4 | ========
5 |
6 | Brownant is a lightweight web data extracting framework.
7 |
8 |
9 | Who uses it?
10 | ------------
11 |
12 | At the moment, `dongxi.douban.com <http://dongxi.douban.com/>`_
13 | (a.k.a. Douban Dongxi) uses Brownant in a production environment.
14 |
15 |
16 | Installation
17 | ------------
18 |
19 | ::
20 |
21 | $ pip install brownant
22 |
23 |
24 | Links
25 | -----
26 |
27 | - `Document `_
28 | - `Issue Track <https://github.com/douban/brownant/issues>`_
29 |
30 |
31 | Issues
32 | ------
33 |
34 | If you want to report bugs or request features, please create issues on
35 | `GitHub Issues <https://github.com/douban/brownant/issues>`_.
36 |
37 |
38 | Contributing
39 | ------------
40 |
41 | You can send a pull request on
42 | `GitHub <https://github.com/douban/brownant>`_.
43 |
44 |
45 | .. |Build Status| image:: https://travis-ci.org/douban/brownant.svg?branch=master,develop
46 | :target: https://travis-ci.org/douban/brownant
47 | :alt: Build Status
48 | .. |Coverage Status| image:: https://img.shields.io/coveralls/douban/brownant/develop.svg
49 | :target: https://coveralls.io/r/douban/brownant
50 | :alt: Coverage Status
51 | .. |Wheel Status| image:: https://img.shields.io/pypi/wheel/brownant.svg
52 | :target: https://pypi.python.org/pypi/brownant
53 | :alt: Wheel Status
54 | .. |PyPI Version| image:: https://img.shields.io/pypi/v/brownant.svg
55 | :target: https://pypi.python.org/pypi/brownant
56 | :alt: PyPI Version
57 | .. |PyPI Downloads| image:: https://img.shields.io/pypi/dm/brownant.svg
58 | :target: https://pypi.python.org/pypi/brownant
59 | :alt: Downloads
60 |
--------------------------------------------------------------------------------
/brownant/__init__.py:
--------------------------------------------------------------------------------
1 | from .app import Brownant, BrownAnt, redirect
2 | from .dinergate import Dinergate
3 | from .site import Site
4 |
5 |
6 | __version__ = "0.1.7"
7 | __all__ = ["Brownant", "BrownAnt", "redirect", "Dinergate", "Site"]
8 |
--------------------------------------------------------------------------------
/brownant/app.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, unicode_literals
2 |
3 | from warnings import warn
4 |
5 | from six import string_types
6 | from six.moves import urllib
7 | from werkzeug.utils import import_string
8 | from werkzeug.urls import url_decode, url_encode
9 | from werkzeug.routing import Map, Rule, NotFound, RequestRedirect
10 |
11 | from .request import Request
12 | from .exceptions import NotSupported
13 | from .utils import to_bytes_safe
14 |
15 |
16 | class Brownant(object):
17 | """The app which could manage whole crawler system."""
18 |
19 | def __init__(self):
20 | self.url_map = Map(strict_slashes=False, host_matching=True,
21 | redirect_defaults=False)
22 |
23 | def add_url_rule(self, host, rule_string, endpoint, **options):
24 | """Add a url rule to the app instance.
25 |
26 | The url rule is the same with Flask apps and other Werkzeug apps.
27 |
28 | :param host: the matched hostname. e.g. "www.python.org"
29 | :param rule_string: the matched path pattern. e.g. "/news/"
30 | :param endpoint: the endpoint name as a dispatching key such as the
31 | qualified name of the object.
32 | """
33 | rule = Rule(rule_string, host=host, endpoint=endpoint, **options)
34 | self.url_map.add(rule)
35 |
36 | def parse_url(self, url_string):
37 | """Parse the URL string with the url map of this app instance.
38 |
39 | :param url_string: the origin URL string.
40 | :returns: the tuple as `(url, url_adapter, query_args)`, the url is
41 | parsed by the standard library `urlparse`, the url_adapter is
42 | from the werkzeug bound URL map, the query_args is a
43 | multidict from the werkzeug.
44 | """
45 | url = urllib.parse.urlparse(url_string)
46 | url = self.validate_url(url)
47 | url_adapter = self.url_map.bind(server_name=url.hostname,
48 | url_scheme=url.scheme,
49 | path_info=url.path)
50 | query_args = url_decode(url.query)
51 | return url, url_adapter, query_args
52 |
53 | def validate_url(self, url):
54 | """Validate the :class:`~urllib.parse.ParseResult` object.
55 |
56 | This method makes sure that :meth:`~brownant.app.BrownAnt.parse_url`
57 | works as expected even when it meets an unexpected URL string.
58 |
59 | :param url: the parsed url.
60 | :type url: :class:`~urllib.parse.ParseResult`
61 | """
62 | # fix up the non-ascii path
63 | url_path = to_bytes_safe(url.path)
64 | url_path = urllib.parse.quote(url_path, safe=b"/%")
65 |
66 | # fix up the non-ascii query
67 | url_query = to_bytes_safe(url.query)
68 | url_query = urllib.parse.quote(url_query, safe=b"?=&")
69 |
70 | url = urllib.parse.ParseResult(url.scheme, url.netloc, url_path,
71 | url.params, url_query, url.fragment)
72 |
73 | # validate the components of URL
74 | has_hostname = url.hostname is not None and len(url.hostname) > 0
75 | has_http_scheme = url.scheme in ("http", "https")
76 | has_path = not len(url.path) or url.path.startswith("/")
77 |
78 | if not (has_hostname and has_http_scheme and has_path):
79 | raise NotSupported("invalid url: %s" % repr(url))
80 |
81 | return url
82 |
83 | def dispatch_url(self, url_string):
84 | """Dispatch the URL string to the target endpoint function.
85 |
86 | :param url_string: the origin URL string.
87 | :returns: the return value of calling dispatched function.
88 | """
89 | url, url_adapter, query_args = self.parse_url(url_string)
90 |
91 | try:
92 | endpoint, kwargs = url_adapter.match()
93 | except NotFound:
94 | raise NotSupported(url_string)
95 | except RequestRedirect as e:
96 | new_url = "{0.new_url}?{1}".format(e, url_encode(query_args))
97 | return self.dispatch_url(new_url)
98 |
99 | try:
100 | handler = import_string(endpoint)
101 | request = Request(url=url, args=query_args)
102 | return handler(request, **kwargs)
103 | except RequestRedirect as e:
104 | return self.dispatch_url(e.new_url)
105 |
106 | def mount_site(self, site):
107 | """Mount a supported site to this app instance.
108 |
109 | :param site: the site instance to be mounted.
110 | """
111 | if isinstance(site, string_types):
112 | site = import_string(site)
113 | site.play_actions(target=self)
114 |
115 |
116 | class BrownAnt(Brownant):
117 | def __init__(self, *args, **kwargs):
118 | warn("The class name 'BrownAnt' has been deprecated. Please use "
119 | "'Brownant' instead.", DeprecationWarning)
120 | super(BrownAnt, self).__init__(*args, **kwargs)
121 |
122 |
123 | def redirect(url):
124 | """Raise the :class:`~werkzeug.routing.RequestRedirect` exception to lead
125 | the app dispatching current request to another URL.
126 |
127 | :param url: the target URL.
128 | """
129 | raise RequestRedirect(url)
130 |
--------------------------------------------------------------------------------
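A minimal dispatching sketch based on the module above (the ``handlers`` module
and its ``product_page`` function are hypothetical names used for illustration)::

    # handlers.py
    def product_page(request, item_id):
        # request.url is a ParseResult, request.args is a werkzeug MultiDict
        return {"item_id": item_id, "page": request.args.get("page", type=int)}

    # main.py
    from brownant import Brownant

    app = Brownant()
    # the endpoint is the qualified name of the handler; it is imported lazily
    app.add_url_rule("www.example.com", "/items/<int:item_id>",
                     "handlers:product_page")

    print(app.dispatch_url("http://www.example.com/items/42?page=3"))
    # -> {'item_id': 42, 'page': 3}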
/brownant/dinergate.py:
--------------------------------------------------------------------------------
1 | from six import with_metaclass
2 | from werkzeug.utils import cached_property
3 |
4 | from brownant.pipeline.network import HTTPClientProperty
5 |
6 |
7 | class DinergateType(type):
8 | """The metaclass of :class:`~brownant.dinergate.Dinergate` and its
9 | subclasses.
10 |
11 | This metaclass gives default names to all members which are instances of
12 | :class:`~werkzeug.utils.cached_property`. It is needed because many
13 | pipeline properties are subclasses of
14 | :class:`~werkzeug.utils.cached_property` but are not created by decorating
15 | functions, so they have no built-in :attr:`__name__`, which may prevent
16 | them from caching values as expected.
17 | """
18 |
19 | def __new__(metacls, name, bases, members):
20 | cls = type.__new__(metacls, name, bases, members)
21 | for name in dir(cls):
22 | value = getattr(cls, name)
23 | if isinstance(value, cached_property) and not value.__name__:
24 | value.__name__ = name
25 | value.__module__ = cls.__module__
26 | return cls
27 |
28 |
29 | class Dinergate(with_metaclass(DinergateType)):
30 | """The simple classify crawler.
31 |
32 | In order to work with unnamed properties such as the instances of
33 | :class:`~brownant.pipeline.base.PipelineProperty`, the metaclass
34 | :class:`~brownant.dinergate.DinergateType` will scan subclasses of this
35 | class and name all unnamed members which are instances of
36 | :class:`~werkzeug.utils.cached_property`.
37 |
38 | :param request: the standard parameter passed by app.
39 | :type request: :class:`~brownant.request.Request`
40 | :param http_client: the session instance of python-requests.
41 | :type http_client: :class:`requests.Session`
42 | :param kwargs: other arguments from the URL pattern.
43 | """
44 |
45 | #: the URL template string for generating crawled target. the `self` could
46 | #: be referenced in the template.
47 | #: (e.g. `"http://www.example.com/items/{self.item_id}?page={self.page}"`)
48 | URL_TEMPLATE = None
49 |
50 | http_client = HTTPClientProperty()
51 |
52 | def __init__(self, request, http_client=None, **kwargs):
53 | self.request = request
54 | if http_client:
55 | self.http_client = http_client
56 | # assign arguments from URL pattern
57 | vars(self).update(kwargs)
58 |
59 | @property
60 | def url(self):
61 | """The fetching target URL.
62 |
63 | The default behavior of this property is to build the URL string with the
64 | :const:`~brownant.dinergate.Dinergate.URL_TEMPLATE`.
65 |
66 | The subclasses could override
67 | :const:`~brownant.dinergate.Dinergate.URL_TEMPLATE` or use a different
68 | implementation.
69 | """
70 | if not self.URL_TEMPLATE:
71 | raise NotImplementedError
72 | return self.URL_TEMPLATE.format(self=self)
73 |
--------------------------------------------------------------------------------
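A small usage sketch of the class above (``ItemPage``, the example.com URL and
the ``item_id`` argument are assumptions for illustration; ``request`` would be
supplied by the dispatching app)::

    from brownant import Dinergate
    from brownant.pipeline import TextResponseProperty

    class ItemPage(Dinergate):
        # "self" can be referenced in the template; item_id comes from the URL rule
        URL_TEMPLATE = "http://www.example.com/items/{self.item_id}"

        # fetched lazily with the inherited http_client session
        text_response = TextResponseProperty()

    page = ItemPage(request, item_id=42)
    print(page.url)            # http://www.example.com/items/42
    body = page.text_response  # the HTTP request happens on first access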
/brownant/exceptions.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, unicode_literals
2 |
3 |
4 | class BrownantException(Exception):
5 | """The base exception of the Brownant framework."""
6 |
7 |
8 | class NotSupported(BrownantException):
9 | """The given URL or other identity is from a platform which not support.
10 |
11 | This exception means any url rules of the app which matched the URL could
12 | not be found.
13 | """
14 |
--------------------------------------------------------------------------------
/brownant/pipeline/__init__.py:
--------------------------------------------------------------------------------
1 | from .base import PipelineProperty
2 | from .html import ElementTreeProperty, XPathTextProperty
3 | from .network import (HTTPClientProperty, URLQueryProperty,
4 | TextResponseProperty, JSONResponseProperty)
5 |
6 |
7 | __all__ = ["PipelineProperty", "ElementTreeProperty", "XPathTextProperty",
8 | "HTTPClientProperty", "URLQueryProperty", "TextResponseProperty",
9 | "JSONResponseProperty"]
10 |
--------------------------------------------------------------------------------
/brownant/pipeline/base.py:
--------------------------------------------------------------------------------
1 | from werkzeug.utils import cached_property
2 |
3 |
4 | class PipelineProperty(cached_property):
5 | """The base class of pipeline properties.
6 |
7 | There are three kinds of initial parameters.
8 |
9 | - The required attribute. If a keyword argument's name was defined in
10 | :attr:`~brownant.pipeline.base.PipelineProperty.required_attrs`, it will
11 | be assigned as an instance attribute.
12 |
13 | - The attr_name. It is the member of
14 | :attr:`~brownant.pipeline.base.PipelineProperty.attr_names`, whose name
15 | always ends with `_attr`, such as `text_attr`.
16 |
17 | - The option. It will be placed at an instance owned :class:`dict` named
18 | :attr:`~brownant.pipeline.base.PipelineProperty.options`. The subclasses
19 | could set default option value in the
20 | :meth:`~brownant.pipeline.base.PipelineProperty.prepare`.
21 |
22 | A workable subclass of :class:`~brownant.pipeline.base.PipelineProperty`
23 | should implement the abstract method
24 | :meth:`~PipelineProperty.provide_value`, which accepts one argument, the
25 | instance of :class:`~brownant.dinergate.Dinergate`.
26 |
27 | Overriding :meth:`~brownant.pipeline.base.PipelineProperty.prepare` is
28 | optional in subclasses.
29 |
30 | :param kwargs: the parameters with the three kinds.
31 | """
32 |
33 | #: the names of required attributes.
34 | required_attrs = set()
35 |
36 | def __init__(self, **kwargs):
37 | super(PipelineProperty, self).__init__(self.provide_value)
38 | self.__name__ = None
39 | self.__module__ = None
40 | self.__doc__ = None
41 |
42 | #: the definition of attr_names
43 | self.attr_names = {}
44 | #: the definition of options
45 | self.options = {}
46 |
47 | assigned_attrs = set()
48 | for name, value in kwargs.items():
49 | assigned_attrs.add(name)
50 |
51 | # names of attrs
52 | if name.endswith("_attr"):
53 | self.attr_names[name] = value
54 | # required attrs
55 | elif name in self.required_attrs:
56 | setattr(self, name, value)
57 | # optional attrs
58 | else:
59 | self.options[name] = value
60 | missing_attrs = self.required_attrs - assigned_attrs
61 | if missing_attrs:
62 | raise TypeError("missing %r" % ", ".join(missing_attrs))
63 |
64 | self.prepare()
65 |
66 | def prepare(self):
67 | """This method will be called after instance ininialized. The
68 | subclasses could override the implementation.
69 |
70 | In general, the implementation of this method should give default
71 | values to the options and the members of
72 | :attr:`~brownant.pipeline.base.PipelineProperty.attr_names`.
73 |
74 | Example:
75 |
76 | .. code-block:: python
77 |
78 | def prepare(self):
79 | self.attr_names.setdefault("text_attr", "text")
80 | self.options.setdefault("use_proxy", False)
81 | """
82 |
83 | def get_attr(self, obj, name):
84 | """Get attribute of the target object with the configured attribute
85 | name in the :attr:`~brownant.pipeline.base.PipelineProperty.attr_names`
86 | of this instance.
87 |
88 | :param obj: the target object.
89 | :type obj: :class:`~brownant.dinergate.Dinergate`
90 | :param name: the internal name used in the
91 | :attr:`~brownant.pipeline.base.PipelineProperty.attr_names`.
92 | (e.g. `"text_attr"`)
93 | """
94 | attr_name = self.attr_names[name]
95 | return getattr(obj, attr_name)
96 |
--------------------------------------------------------------------------------
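A sketch of a custom pipeline property built on the base class above
(``WordCountProperty`` and its attributes are hypothetical; it only illustrates
the three kinds of parameters)::

    from brownant import Dinergate
    from brownant.pipeline import PipelineProperty

    class WordCountProperty(PipelineProperty):
        # "word" is a required attribute and becomes an instance attribute
        required_attrs = {"word"}

        def prepare(self):
            # "text_attr" is an attr_name; "lowercase" is an option
            self.attr_names.setdefault("text_attr", "text_response")
            self.options.setdefault("lowercase", False)

        def provide_value(self, obj):
            text = self.get_attr(obj, "text_attr")
            if self.options["lowercase"]:
                text = text.lower()
            return text.count(self.word)

    class MySite(Dinergate):
        text_response = "Spam spam SPAM"
        spam_count = WordCountProperty(word="spam", lowercase=True)
        # MySite(request).spam_count would evaluate to 3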
/brownant/pipeline/html.py:
--------------------------------------------------------------------------------
1 | import lxml.html
2 |
3 | from brownant.pipeline.base import PipelineProperty
4 |
5 |
6 | class ElementTreeProperty(PipelineProperty):
7 | """The element tree built from a text response property. There is an usage
8 | example::
9 |
10 | class MySite(Dinergate):
11 | text_response = ""
12 | div_response = ""
13 | xml_response = (u""
14 | u"\u6d4b\u8bd5")
15 | etree = ElementTreeProperty()
16 | div_etree = ElementTreeProperty(text_response_attr="div_response")
17 | xml_etree = ElementTreeProperty(text_response_attr="xml_response",
18 | encoding="utf-8")
19 |
20 | site = MySite(request)
21 | print(site.etree) # output: <Element html at 0x...>
22 | print(site.div_etree) # output: <Element div at 0x...>
23 | print(site.xml_etree) # output: <Element html at 0x...>
24 |
25 | :param text_response_attr: optional. default: `"text_response"`.
26 | :param encoding: optional. default: `None`. The input text will be
27 | encoded to the specified encoding before parsing.
28 |
29 | .. versionadded:: 0.1.4
30 | The `encoding` optional parameter.
31 | """
32 |
33 | def prepare(self):
34 | self.attr_names.setdefault("text_response_attr", "text_response")
35 | self.options.setdefault("encoding", None)
36 |
37 | def provide_value(self, obj):
38 | text_response = self.get_attr(obj, "text_response_attr")
39 | if self.options["encoding"]:
40 | text_response = text_response.encode(self.options["encoding"])
41 | return lxml.html.fromstring(text_response)
42 |
43 |
44 | class XPathTextProperty(PipelineProperty):
45 | """The text extracted from a element tree property by XPath. There is an
46 | example for usage::
47 |
48 | class MySite(Dinergate):
49 | # omit page_etree
50 | title = XPathTextProperty(xpath=".//h1[@id='title']/text()",
51 | etree_attr="page_etree",
52 | strip_spaces=True,
53 | pick_mode="first")
54 | links = XPathTextProperty(xpath=".//*[@id='links']/a/@href",
55 | etree_attr="page_etree",
56 | strip_spaces=True,
57 | pick_mode="join",
58 | joiner="|")
59 |
60 | :param xpath: the xpath expression for extracting text.
61 | :param etree_attr: optional. default: `"etree"`.
62 | :param strip_spaces: optional. default: `False`. If it is `True`, the
63 | spaces at the beginning and the end of texts will
64 | be stripped.
65 | :param pick_mode: optional. default: `"join"`. could be "join", "first"
66 | or "keep". With `"join"`, the texts will be joined
67 | into one string. With `"first"`, only the first text
68 | will be picked. With `"keep"`, the original value
69 | will be kept as-is.
70 | :param joiner: optional. default is a single space. Assigning this
71 | parameter has no effect unless the `pick_mode` is
72 | `"join"`, in which case the texts will be joined by
73 | this string.
74 |
75 | .. versionadded:: 0.1.4
76 | The new option value `"keep"` of the `pick_mode` parameter.
77 | """
78 |
79 | required_attrs = {"xpath"}
80 |
81 | def prepare(self):
82 | self.attr_names.setdefault("etree_attr", "etree")
83 | self.options.setdefault("strip_spaces", False)
84 | self.options.setdefault("pick_mode", "join")
85 | self.options.setdefault("joiner", " ")
86 |
87 | def choice_pick_impl(self):
88 | pick_mode = self.options["pick_mode"]
89 | impl = {
90 | "join": self.pick_joining,
91 | "first": self.pick_first,
92 | "keep": self.keep_value,
93 | }.get(pick_mode)
94 |
95 | if not impl:
96 | raise ValueError("%r is not valid pick mode" % pick_mode)
97 | return impl
98 |
99 | def pick_joining(self, value):
100 | joiner = self.options["joiner"]
101 | return joiner.join(value)
102 |
103 | def pick_first(self, value):
104 | return value[0] if value else ""
105 |
106 | def keep_value(self, value):
107 | return value
108 |
109 | def provide_value(self, obj):
110 | etree = self.get_attr(obj, "etree_attr")
111 | value = etree.xpath(self.xpath)
112 | pick_value = self.choice_pick_impl()
113 |
114 | if self.options["strip_spaces"]:
115 | value = [v.strip() for v in value if v.strip()]
116 |
117 | return pick_value(value)
118 |
--------------------------------------------------------------------------------
/brownant/pipeline/network.py:
--------------------------------------------------------------------------------
1 | from requests import Session
2 |
3 | from brownant.pipeline.base import PipelineProperty
4 | from brownant.exceptions import NotSupported
5 |
6 |
7 | class HTTPClientProperty(PipelineProperty):
8 | """The python-requests session property.
9 |
10 | :param session_class: the class of the session instance. defaults to
11 | :class:`~requests.Session`.
12 | """
13 |
14 | def prepare(self):
15 | self.options.setdefault("session_class", Session)
16 |
17 | def provide_value(self, obj):
18 | session_class = self.options["session_class"]
19 | session = session_class()
20 | return session
21 |
22 |
23 | class URLQueryProperty(PipelineProperty):
24 | """The query argument property. The usage is simple::
25 |
26 | class MySite(Dinergate):
27 | item_id = URLQueryProperty(name="item_id", type=int)
28 |
29 | It equals to this::
30 |
31 | class MySite(Dinergate):
32 | @cached_property
33 | def item_id(self):
34 | value = self.request.args.get("item_id", type=int)
35 | if not value:
36 | raise NotSupported
37 | return value
38 |
39 | A failed conversion with the given type (a :exc:`ValueError` being raised)
40 | will make the value fall back to :obj:`None`. This is the same as the
41 | behavior of :class:`~werkzeug.datastructures.MultiDict`.
42 |
43 | :param name: the query argument name.
44 | :param request_attr: optional. default: `"request"`.
45 | :param type: optional. default: `None`. this value will be passed to
46 | :meth:`~werkzeug.datastructures.MultiDict.get`.
47 | :param required: optional. default: `True`. When this value is true, a
48 | :exc:`~brownant.exceptions.NotSupported` will be raised
49 | if an empty value is met.
50 | """
51 |
52 | required_attrs = {"name"}
53 |
54 | def prepare(self):
55 | self.attr_names.setdefault("request_attr", "request")
56 | self.options.setdefault("type", None)
57 | self.options.setdefault("required", True)
58 |
59 | def provide_value(self, obj):
60 | request = self.get_attr(obj, "request_attr")
61 | value = request.args.get(self.name, type=self.options["type"])
62 | if self.options["required"] and value is None:
63 | raise NotSupported
64 | return value
65 |
66 |
67 | class ResponseProperty(PipelineProperty):
68 | """The base class of response properties.
69 |
70 | You can't use this class directly.
71 |
72 | :param content_method: required. it points to the response content method.
73 | """
74 | def prepare(self):
75 | self.attr_names.setdefault("url_attr", "url")
76 | self.attr_names.setdefault("http_client_attr", "http_client")
77 | self.options.setdefault("method", "GET")
78 |
79 | def provide_value(self, obj):
80 | if "content_method" not in self.attr_names:
81 | raise KeyError("You need create a subclass which inheritance "
82 | "ResponseProperty, and assign `content_method` "
83 | "into self.attr_names")
84 | url = self.get_attr(obj, "url_attr")
85 | http_client = self.get_attr(obj, "http_client_attr")
86 | content_method = self.attr_names.get("content_method")
87 | response = http_client.request(url=url, **self.options)
88 | response.raise_for_status()
89 | content = getattr(response, content_method)
90 | if callable(content):
91 | content = content()
92 | return content
93 |
94 |
95 | class TextResponseProperty(ResponseProperty):
96 | """The text response which returned by fetching network resource.
97 |
98 | Getting this property is network I/O operation in the first time. The
99 | http request implementations are all provided by :mod:`requests`.
100 |
101 | The usage example::
102 |
103 | class MySite(Dinergate):
104 | foo_http = requests.Session()
105 | foo_url = "http://example.com"
106 | foo_text = TextResponseProperty(url_attr="foo_url",
107 | http_client="foo_http",
108 | proxies=PROXIES)
109 |
110 | :param url_attr: optional. default: `"url"`. it points to the property
111 | which provides the fetched URL.
112 | :param http_client_attr: optional. default: `"http_client"`. it points to
113 | an http client property which is an instance of
114 | :class:`requests.Session`
115 | :param method: optional. default: `"GET"`. the request method which is
116 | used by the http_client.
117 | :param kwargs: the optional arguments which will be passed to
118 | :meth:`requests.Session.request`
119 | """
120 |
121 | def prepare(self):
122 | super(TextResponseProperty, self).prepare()
123 | self.attr_names.setdefault("content_method", "content")
124 |
125 |
126 | class JSONResponseProperty(ResponseProperty):
127 | """The json response which returned by fetching network resource.
128 |
129 | Getting this property is network I/O operation in the first time. The
130 | http request implementations are all provided by :mod:`requests`.
131 |
132 | The usage example::
133 |
134 | class MySite(Dinergate):
135 | foo_http = requests.Session()
136 | foo_url = "http://example.com"
137 | foo_json = JSONResponseProperty(url_attr="foo_url",
138 | http_client="foo_http",
139 | proxies=PROXIES)
140 |
141 | :param url_attr: optional. default: `"url"`. it points to the property
142 | which provides the fetched URL.
143 | :param http_client_attr: optional. default: `"http_client"`. it points to
144 | an http client property which is an instance of
145 | :class:`requests.Session`
146 | :param method: optional. default: `"GET"`. the request method which is
147 | used by the http_client.
148 | :param kwargs: the optional arguments which will be passed to
149 | :meth:`requests.Session.request`
150 | """
151 |
152 | def prepare(self):
153 | super(JSONResponseProperty, self).prepare()
154 | self.attr_names.setdefault("content_method", "json")
155 |
--------------------------------------------------------------------------------
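A sketch showing how the network properties cooperate inside a dinergate
(``SearchPage``, the example.com URL and the ``keyword`` argument are
illustrative assumptions)::

    from brownant import Dinergate
    from brownant.pipeline import URLQueryProperty, TextResponseProperty

    class SearchPage(Dinergate):
        # the URL_TEMPLATE may reference other pipeline properties via "self"
        URL_TEMPLATE = "http://www.example.com/search?q={self.keyword}"

        # read "?keyword=..." from the dispatched URL; NotSupported if missing
        keyword = URLQueryProperty(name="keyword")
        # fetch the page body lazily with the inherited http_client
        text_response = TextResponseProperty()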
/brownant/request.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, unicode_literals
2 |
3 |
4 | class Request(object):
5 | """The request object.
6 |
7 | :param url: the raw URL inputted from the dispatching app.
8 | :type url: :class:`urllib.parse.ParseResult`
9 | :param args: the query arguments decoded from query string of the URL.
10 | :type args: :class:`werkzeug.datastructures.MultiDict`
11 | """
12 |
13 | def __init__(self, url, args):
14 | self.url = url
15 | self.args = args
16 |
17 | def __repr__(self):
18 | return "Request(url={self.url}, args={self.args})".format(self=self)
19 |
--------------------------------------------------------------------------------
/brownant/site.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, unicode_literals
2 |
3 |
4 | class Site(object):
5 | """The site supported object which could be mounted to app instance.
6 |
7 | :param name: the name of the supported site.
8 | """
9 |
10 | def __init__(self, name):
11 | self.name = name
12 | self.actions = []
13 |
14 | def record_action(self, method_name, *args, **kwargs):
15 | """Record the method-calling action.
16 |
17 | The actions are expected to be played on a target object.
18 |
19 | :param method_name: the name of the called method.
20 | :param args: the general arguments for calling the method.
21 | :param kwargs: the keyword arguments for calling the method.
22 | """
23 | self.actions.append((method_name, args, kwargs))
24 |
25 | def play_actions(self, target):
26 | """Play record actions on the target object.
27 |
28 | :param target: the target which recive all record actions, is a brown
29 | ant app instance normally.
30 | :type target: :class:`~brownant.app.Brownant`
31 | """
32 | for method_name, args, kwargs in self.actions:
33 | method = getattr(target, method_name)
34 | method(*args, **kwargs)
35 |
36 | def route(self, host, rule, **options):
37 | """The decorator to register wrapped function as the brown ant app.
38 |
39 | All optional parameters of this method are compatible with the
40 | :meth:`~brownant.app.Brownant.add_url_rule`.
41 |
42 | Registered functions or classes must be importable by their qualified
43 | names. This is different from :class:`~flask.Flask`, and works like a
44 | lazy-loading mode: registered objects are only loaded just before the
45 | first use.
46 |
47 | The right way::
48 |
49 | @site.route("www.example.com", "/item/")
50 | def spam(request, item_id):
51 | pass
52 |
53 | The wrong way::
54 |
55 | def egg():
56 | # the function could not be imported by its qualified name
57 | @site.route("www.example.com", "/item/")
58 | def spam(request, item_id):
59 | pass
60 |
61 | egg()
62 |
63 | :param host: the limited host name.
64 | :param rule: the URL path rule as string.
65 | :param options: the options to be forwarded to the
66 | :class:`werkzeug.routing.Rule` object.
67 | """
68 | def decorator(func):
69 | endpoint = "{func.__module__}:{func.__name__}".format(func=func)
70 | self.record_action("add_url_rule", host, rule, endpoint, **options)
71 | return func
72 | return decorator
73 |
--------------------------------------------------------------------------------
/brownant/utils.py:
--------------------------------------------------------------------------------
1 | from six import text_type
2 |
3 |
4 | def to_bytes_safe(text, encoding="utf-8"):
5 | """Convert the input value into bytes type.
6 |
7 | If the input value is a text string, it will be encoded with the given
8 | encoding and the encoded bytes will be returned. If it is already bytes,
9 | the original value will be returned unchanged.
10 |
11 | :param text: the input value which could be string or bytes.
12 | :param encoding: the expected encoding be used while converting the string
13 | input into bytes.
14 | :rtype: :class:`~__builtin__.bytes`
15 | """
16 | if not isinstance(text, (bytes, text_type)):
17 | raise TypeError("must be string type")
18 |
19 | if isinstance(text, text_type):
20 | return text.encode(encoding)
21 |
22 | return text
23 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | PAPER =
8 | BUILDDIR = _build
9 |
10 | # Internal variables.
11 | PAPEROPT_a4 = -D latex_paper_size=a4
12 | PAPEROPT_letter = -D latex_paper_size=letter
13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
14 | # the i18n builder cannot share the environment and doctrees with the others
15 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
16 |
17 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
18 |
19 | help:
20 | @echo "Please use \`make ' where is one of"
21 | @echo " html to make standalone HTML files"
22 | @echo " dirhtml to make HTML files named index.html in directories"
23 | @echo " singlehtml to make a single large HTML file"
24 | @echo " pickle to make pickle files"
25 | @echo " json to make JSON files"
26 | @echo " htmlhelp to make HTML files and a HTML help project"
27 | @echo " qthelp to make HTML files and a qthelp project"
28 | @echo " devhelp to make HTML files and a Devhelp project"
29 | @echo " epub to make an epub"
30 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
31 | @echo " latexpdf to make LaTeX files and run them through pdflatex"
32 | @echo " text to make text files"
33 | @echo " man to make manual pages"
34 | @echo " texinfo to make Texinfo files"
35 | @echo " info to make Texinfo files and run them through makeinfo"
36 | @echo " gettext to make PO message catalogs"
37 | @echo " changes to make an overview of all changed/added/deprecated items"
38 | @echo " linkcheck to check all external links for integrity"
39 | @echo " doctest to run all doctests embedded in the documentation (if enabled)"
40 |
41 | clean:
42 | -rm -rf $(BUILDDIR)/*
43 |
44 | html:
45 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
46 | @echo
47 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
48 |
49 | dirhtml:
50 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
51 | @echo
52 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
53 |
54 | singlehtml:
55 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
56 | @echo
57 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
58 |
59 | pickle:
60 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
61 | @echo
62 | @echo "Build finished; now you can process the pickle files."
63 |
64 | json:
65 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
66 | @echo
67 | @echo "Build finished; now you can process the JSON files."
68 |
69 | htmlhelp:
70 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
71 | @echo
72 | @echo "Build finished; now you can run HTML Help Workshop with the" \
73 | ".hhp project file in $(BUILDDIR)/htmlhelp."
74 |
75 | qthelp:
76 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
77 | @echo
78 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \
79 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
80 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/brownant.qhcp"
81 | @echo "To view the help file:"
82 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/brownant.qhc"
83 |
84 | devhelp:
85 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
86 | @echo
87 | @echo "Build finished."
88 | @echo "To view the help file:"
89 | @echo "# mkdir -p $$HOME/.local/share/devhelp/brownant"
90 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/brownant"
91 | @echo "# devhelp"
92 |
93 | epub:
94 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
95 | @echo
96 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub."
97 |
98 | latex:
99 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
100 | @echo
101 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
102 | @echo "Run \`make' in that directory to run these through (pdf)latex" \
103 | "(use \`make latexpdf' here to do that automatically)."
104 |
105 | latexpdf:
106 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
107 | @echo "Running LaTeX files through pdflatex..."
108 | $(MAKE) -C $(BUILDDIR)/latex all-pdf
109 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
110 |
111 | text:
112 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
113 | @echo
114 | @echo "Build finished. The text files are in $(BUILDDIR)/text."
115 |
116 | man:
117 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
118 | @echo
119 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man."
120 |
121 | texinfo:
122 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
123 | @echo
124 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
125 | @echo "Run \`make' in that directory to run these through makeinfo" \
126 | "(use \`make info' here to do that automatically)."
127 |
128 | info:
129 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
130 | @echo "Running Texinfo files through makeinfo..."
131 | make -C $(BUILDDIR)/texinfo info
132 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
133 |
134 | gettext:
135 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
136 | @echo
137 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
138 |
139 | changes:
140 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
141 | @echo
142 | @echo "The overview file is in $(BUILDDIR)/changes."
143 |
144 | linkcheck:
145 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
146 | @echo
147 | @echo "Link check complete; look for any errors in the above output " \
148 | "or in $(BUILDDIR)/linkcheck/output.txt."
149 |
150 | doctest:
151 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
152 | @echo "Testing of doctests in the sources finished, look at the " \
153 | "results in $(BUILDDIR)/doctest/output.txt."
154 |
--------------------------------------------------------------------------------
/docs/_static/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/douban/brownant/3c7e6d30f67b8f0f8ca1f823ea3daed74e8725cd/docs/_static/.gitkeep
--------------------------------------------------------------------------------
/docs/api.rst:
--------------------------------------------------------------------------------
1 | .. _api:
2 |
3 | Basic API
4 | =========
5 |
6 | The basic API includes the application framework and routing system (provided
7 | by :mod:`werkzeug.routing`) of Brownant.
8 |
9 | brownant.app
10 | ------------
11 |
12 | .. autoclass:: brownant.app.Brownant
13 | :members:
14 | :inherited-members:
15 |
16 | .. autofunction:: brownant.app.redirect
17 |
18 | brownant.request
19 | ----------------
20 |
21 | .. autoclass:: brownant.request.Request
22 | :members:
23 | :inherited-members:
24 |
25 | brownant.site
26 | -------------
27 |
28 | .. autoclass:: brownant.site.Site
29 | :members:
30 | :inherited-members:
31 |
32 | brownant.exceptions
33 | -------------------
34 |
35 | .. autoexception:: brownant.exceptions.BrownantException
36 |
37 | .. autoexception:: brownant.exceptions.NotSupported
38 | :show-inheritance:
39 |
40 | brownant.utils
41 | --------------
42 |
43 | .. autofunction:: brownant.utils.to_bytes_safe
44 |
45 | Declarative API
46 | ===============
47 |
48 | The declarative API is around the "dinergate" and "pipeline property".
49 |
50 | brownant.dinergate
51 | ------------------
52 |
53 | .. autoclass:: brownant.dinergate.Dinergate
54 | :members:
55 | :inherited-members:
56 |
57 | .. autoclass:: brownant.dinergate.DinergateType
58 | :show-inheritance:
59 |
60 | brownant.pipeline.base
61 | ----------------------
62 |
63 | .. autoclass:: brownant.pipeline.base.PipelineProperty
64 | :members:
65 | :inherited-members:
66 | :show-inheritance:
67 |
68 | .. method:: provide_value(obj)
69 |
70 | The abstract method which should be implemented by subclasses. It provides
71 | the value expected from the subject instance.
72 |
73 | :param obj: the subject instance.
74 | :type obj: :class:`~brownant.dinergate.Dinergate`
75 |
76 | brownant.pipeline.network
77 | -------------------------
78 |
79 | .. autoclass:: brownant.pipeline.network.URLQueryProperty
80 | :members:
81 |
82 | .. autoclass:: brownant.pipeline.network.TextResponseProperty
83 | :members:
84 |
85 | brownant.pipeline.html
86 | ----------------------
87 |
88 | .. autoclass:: brownant.pipeline.html.ElementTreeProperty
89 | :members:
90 |
91 | .. autoclass:: brownant.pipeline.html.XPathTextProperty
92 | :members:
93 |
--------------------------------------------------------------------------------
/docs/changes.rst:
--------------------------------------------------------------------------------
1 | .. _changes:
2 |
3 | .. include:: ../CHANGES
4 |
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # brownant documentation build configuration file, created by
4 | # sphinx-quickstart on Sun Sep 29 00:53:05 2013.
5 | #
6 | # This file is execfile()d with the current directory set to its containing dir.
7 | #
8 | # Note that not all possible configuration values are present in this
9 | # autogenerated file.
10 | #
11 | # All configuration values have a default; values that are commented out
12 | # serve to show the default.
13 |
14 | import sys, os
15 |
16 | import alabaster
17 |
18 | # If extensions (or modules to document with autodoc) are in another directory,
19 | # add these directories to sys.path here. If the directory is relative to the
20 | # documentation root, use os.path.abspath to make it absolute, like shown here.
21 | sys.path.insert(0, os.path.abspath('..'))
22 |
23 | # -- General configuration -----------------------------------------------------
24 |
25 | # If your documentation needs a minimal Sphinx version, state it here.
26 | #needs_sphinx = '1.0'
27 |
28 | # Add any Sphinx extension module names here, as strings. They can be extensions
29 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
30 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.intersphinx']
31 |
32 | # Add any paths that contain templates here, relative to this directory.
33 | templates_path = ['_templates']
34 |
35 | # The suffix of source filenames.
36 | source_suffix = '.rst'
37 |
38 | # The encoding of source files.
39 | #source_encoding = 'utf-8-sig'
40 |
41 | # The master toctree document.
42 | master_doc = 'index'
43 |
44 | # General information about the project.
45 | project = u'Brownant'
46 | copyright = u'2014, Douban Inc.'
47 |
48 | # The version info for the project you're documenting, acts as replacement for
49 | # |version| and |release|, also used in various other places throughout the
50 | # built documents.
51 | #
52 | # The short X.Y version.
53 | version = '0.1.7'
54 | # The full version, including alpha/beta/rc tags.
55 | release = '0.1.7'
56 |
57 | # The language for content autogenerated by Sphinx. Refer to documentation
58 | # for a list of supported languages.
59 | #language = None
60 |
61 | # There are two options for replacing |today|: either, you set today to some
62 | # non-false value, then it is used:
63 | #today = ''
64 | # Else, today_fmt is used as the format for a strftime call.
65 | #today_fmt = '%B %d, %Y'
66 |
67 | # List of patterns, relative to source directory, that match files and
68 | # directories to ignore when looking for source files.
69 | exclude_patterns = ['_build', '_static']
70 |
71 | # The reST default role (used for this markup: `text`) to use for all documents.
72 | #default_role = None
73 |
74 | # If true, '()' will be appended to :func: etc. cross-reference text.
75 | #add_function_parentheses = True
76 |
77 | # If true, the current module name will be prepended to all description
78 | # unit titles (such as .. function::).
79 | #add_module_names = True
80 |
81 | # If true, sectionauthor and moduleauthor directives will be shown in the
82 | # output. They are ignored by default.
83 | #show_authors = False
84 |
85 | # The name of the Pygments (syntax highlighting) style to use.
86 | pygments_style = 'sphinx'
87 |
88 | # A list of ignored prefixes for module index sorting.
89 | #modindex_common_prefix = []
90 |
91 |
92 | # -- Options for HTML output ---------------------------------------------------
93 |
94 | # The theme to use for HTML and HTML Help pages. See the documentation for
95 | # a list of builtin themes.
96 | html_theme = 'alabaster'
97 |
98 | # Theme options are theme-specific and customize the look and feel of a theme
99 | # further. For a list of options available for each theme, see the
100 | # documentation.
101 | html_theme_options = {
102 | 'github_user': 'douban',
103 | 'github_repo': 'brownant',
104 | }
105 |
106 | # Add any paths that contain custom themes here, relative to this directory.
107 | html_theme_path = [alabaster.get_path()]
108 |
109 | # The name for this set of Sphinx documents. If None, it defaults to
110 | # " v documentation".
111 | #html_title = None
112 |
113 | # A shorter title for the navigation bar. Default is the same as html_title.
114 | #html_short_title = None
115 |
116 | # The name of an image file (relative to this directory) to place at the top
117 | # of the sidebar.
118 | #html_logo = None
119 |
120 | # The name of an image file (within the static path) to use as favicon of the
121 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
122 | # pixels large.
123 | #html_favicon = None
124 |
125 | # Add any paths that contain custom static files (such as style sheets) here,
126 | # relative to this directory. They are copied after the builtin static files,
127 | # so a file named "default.css" will overwrite the builtin "default.css".
128 | html_static_path = ['_static']
129 |
130 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
131 | # using the given strftime format.
132 | #html_last_updated_fmt = '%b %d, %Y'
133 |
134 | # If true, SmartyPants will be used to convert quotes and dashes to
135 | # typographically correct entities.
136 | #html_use_smartypants = True
137 |
138 | # Custom sidebar templates, maps document names to template names.
139 | html_sidebars = {
140 | '**': [
141 | 'about.html',
142 | 'localtoc.html',
143 | 'relations.html',
144 | 'sourcelink.html',
145 | 'searchbox.html'
146 | ]
147 | }
148 |
149 | # Additional templates that should be rendered to pages, maps page names to
150 | # template names.
151 | #html_additional_pages = {}
152 |
153 | # If false, no module index is generated.
154 | #html_domain_indices = True
155 |
156 | # If false, no index is generated.
157 | #html_use_index = True
158 |
159 | # If true, the index is split into individual pages for each letter.
160 | #html_split_index = False
161 |
162 | # If true, links to the reST sources are added to the pages.
163 | #html_show_sourcelink = True
164 |
165 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
166 | #html_show_sphinx = True
167 |
168 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
169 | #html_show_copyright = True
170 |
171 | # If true, an OpenSearch description file will be output, and all pages will
172 | # contain a <link> tag referring to it. The value of this option must be the
173 | # base URL from which the finished HTML is served.
174 | #html_use_opensearch = ''
175 |
176 | # This is the file name suffix for HTML files (e.g. ".xhtml").
177 | #html_file_suffix = None
178 |
179 | # Output file base name for HTML help builder.
180 | htmlhelp_basename = 'brownantdoc'
181 |
182 |
183 | # -- Options for LaTeX output --------------------------------------------------
184 |
185 | latex_elements = {
186 | # The paper size ('letterpaper' or 'a4paper').
187 | #'papersize': 'letterpaper',
188 |
189 | # The font size ('10pt', '11pt' or '12pt').
190 | #'pointsize': '10pt',
191 |
192 | # Additional stuff for the LaTeX preamble.
193 | #'preamble': '',
194 | }
195 |
196 | # Grouping the document tree into LaTeX files. List of tuples
197 | # (source start file, target name, title, author, documentclass [howto/manual]).
198 | latex_documents = [
199 | ('index', 'brownant.tex', u'Brownant Documentation',
200 | u'Douban Inc.', 'manual'),
201 | ]
202 |
203 | # The name of an image file (relative to this directory) to place at the top of
204 | # the title page.
205 | #latex_logo = None
206 |
207 | # For "manual" documents, if this is true, then toplevel headings are parts,
208 | # not chapters.
209 | #latex_use_parts = False
210 |
211 | # If true, show page references after internal links.
212 | #latex_show_pagerefs = False
213 |
214 | # If true, show URL addresses after external links.
215 | #latex_show_urls = False
216 |
217 | # Documents to append as an appendix to all manuals.
218 | #latex_appendices = []
219 |
220 | # If false, no module index is generated.
221 | #latex_domain_indices = True
222 |
223 |
224 | # -- Options for manual page output --------------------------------------------
225 |
226 | # One entry per manual page. List of tuples
227 | # (source start file, name, description, authors, manual section).
228 | man_pages = [
229 | ('index', 'brownant', u'Brownant Documentation',
230 | [u'Douban Inc.'], 1)
231 | ]
232 |
233 | # If true, show URL addresses after external links.
234 | #man_show_urls = False
235 |
236 |
237 | # -- Options for Texinfo output ------------------------------------------------
238 |
239 | # Grouping the document tree into Texinfo files. List of tuples
240 | # (source start file, target name, title, author,
241 | # dir menu entry, description, category)
242 | texinfo_documents = [
243 | ('index', 'brownant', u'Brownant Documentation',
244 | u'Douban Inc.', 'brownant', 'One line description of project.',
245 | 'Miscellaneous'),
246 | ]
247 |
248 | # Documents to append as an appendix to all manuals.
249 | #texinfo_appendices = []
250 |
251 | # If false, no module index is generated.
252 | #texinfo_domain_indices = True
253 |
254 | # How to display URL addresses: 'footnote', 'no', or 'inline'.
255 | #texinfo_show_urls = 'footnote'
256 |
257 |
258 | # -- Options for Epub output ---------------------------------------------------
259 |
260 | # Bibliographic Dublin Core info.
261 | epub_title = u'Brownant'
262 | epub_author = u'Douban Inc.'
263 | epub_publisher = u'Douban Inc.'
264 | epub_copyright = u'2014, Douban Inc.'
265 |
266 | # The language of the text. It defaults to the language option
267 | # or en if the language is not set.
268 | #epub_language = ''
269 |
270 | # The scheme of the identifier. Typical schemes are ISBN or URL.
271 | #epub_scheme = ''
272 |
273 | # The unique identifier of the text. This can be a ISBN number
274 | # or the project homepage.
275 | #epub_identifier = ''
276 |
277 | # A unique identification for the text.
278 | #epub_uid = ''
279 |
280 | # A tuple containing the cover image and cover page html template filenames.
281 | #epub_cover = ()
282 |
283 | # HTML files that should be inserted before the pages created by sphinx.
284 | # The format is a list of tuples containing the path and title.
285 | #epub_pre_files = []
286 |
287 | # HTML files shat should be inserted after the pages created by sphinx.
288 | # The format is a list of tuples containing the path and title.
289 | #epub_post_files = []
290 |
291 | # A list of files that should not be packed into the epub file.
292 | #epub_exclude_files = []
293 |
294 | # The depth of the table of contents in toc.ncx.
295 | #epub_tocdepth = 3
296 |
297 | # Allow duplicate toc entries.
298 | #epub_tocdup = True
299 |
300 |
301 | # Example configuration for intersphinx: refer to the Python standard library.
302 | intersphinx_mapping = {
303 | 'http://docs.python.org/dev': None,
304 | 'http://docs.python-requests.org/en/latest/': None,
305 | 'http://werkzeug.pocoo.org/docs/': None,
306 | 'http://flask.pocoo.org/docs/': None,
307 | }
308 |
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | Welcome to Brownant
2 | ===================
3 |
4 | Brownant is a lightweight web data extracting framework.
5 |
6 |
7 | User's Guide
8 | ------------
9 |
10 | .. toctree::
11 | :maxdepth: 2
12 |
13 | userguide/introduction
14 |
15 | .. toctree::
16 | :maxdepth: 2
17 |
18 | userguide/quickstart
19 |
20 |
21 | API Reference
22 | -------------
23 |
24 | .. toctree::
25 | :maxdepth: 2
26 |
27 | api
28 |
29 |
30 | Release Changes
31 | ---------------
32 |
33 | .. toctree::
34 | :maxdepth: 2
35 |
36 | changes
37 |
38 |
39 | Authors & Contributors
40 | ----------------------
41 |
42 | .. include:: ../AUTHORS
43 | :start-line: 2
44 |
45 |
46 | Indices and tables
47 | ==================
48 |
49 | * :ref:`genindex`
50 | * :ref:`modindex`
51 | * :ref:`search`
52 |
53 |
--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | alabaster==0.4.1
2 | Sphinx==1.2.2
3 |
--------------------------------------------------------------------------------
/docs/userguide/introduction.rst:
--------------------------------------------------------------------------------
1 | .. _introduction:
2 |
3 | Introduction
4 | ============
5 |
6 | .. include:: ../../README.rst
7 | :start-line: 5
8 | :end-line: 36
9 |
--------------------------------------------------------------------------------
/docs/userguide/quickstart.rst:
--------------------------------------------------------------------------------
1 | .. _quickstart:
2 |
3 | Quick Start
4 | ===========
5 |
6 | Here are some simple examples built with Brownant.
7 |
8 |
9 | The Minimal Demo
10 | ----------------
11 |
12 | This demo fetches the download link from the PyPI home page of a given
13 | project.
14 |
15 | .. code-block:: python
16 |
17 | # example.py
18 | from brownant import Brownant, Site
19 | from lxml import html
20 | from requests import Session
21 |
22 | site = Site(name="pypi")
23 | http = Session()
24 |
25 |
26 |     @site.route("pypi.python.org", "/pypi/<name>", defaults={"version": None})
27 |     @site.route("pypi.python.org", "/pypi/<name>/<version>")
28 | def pypi_info(request, name, version):
29 | url = request.url.geturl()
30 | etree = html.fromstring(http.get(url).content)
31 | download_url = etree.xpath(".//div[@id='download-button']/a/@href")[0]
32 |
33 | return {"name": name, "version": version, "download_url": download_url}
34 |
35 | app = Brownant()
36 | app.mount_site(site)
37 |
38 | if __name__ == "__main__":
39 | from pprint import pprint
40 | pprint(app.dispatch_url("https://pypi.python.org/pypi/Werkzeug/0.9.4"))
41 |
42 | Run it, and we will get the following output::
43 |
44 | $ python example.py
45 | {'download_url': 'https://.../source/W/Werkzeug/Werkzeug-0.9.4.tar.gz',
46 | 'name': u'Werkzeug',
47 | 'version': u'0.9.4'}
48 |
49 |
50 | The Declarative Demo
51 | --------------------
52 |
53 | With declarative usage, the workflow becomes more flexible and readable.
54 |
55 | First, we define the "dinergate" in a site-specific module:
56 |
57 | .. code-block:: python
58 |
59 | # sites/pypi.py
60 | from brownant.site import Site
61 | from brownant.dinergate import Dinergate
62 | from brownant.pipeline.network import TextResponseProperty
63 | from brownant.pipeline.html import ElementTreeProperty, XPathTextProperty
64 |
65 | site = Site(name="pypi")
66 |
67 |
68 |     @site.route("pypi.python.org", "/pypi/<name>/<version>")
69 | class PythonPackageInfo(Dinergate):
70 |
71 | URL_TEMPLATE = "http://pypi.python.org/pypi/{self.name}/{self.version}"
72 |
73 | text_response = TextResponseProperty()
74 | etree = ElementTreeProperty()
75 | download_url = XPathTextProperty(
76 | xpath=".//div[@id='download-button']/a/@href",
77 | strip_spaces=True, pick_mode="first")
78 |
79 | @property
80 | def info(self):
81 | return {"name": self.name, "version": self.version,
82 | "download_url": self.download_url}
83 |
84 | Then we define an application instance and mount the site:
85 |
86 | .. code-block:: python
87 |
88 | # app.py
89 | from brownant import Brownant
90 |
91 | app = Brownant()
92 | app.mount_site("sites.pypi:site")
93 |
94 |
95 | if __name__ == "__main__":
96 | from pprint import pprint
97 | pkg = app.dispatch_url("https://pypi.python.org/pypi/Werkzeug/0.9.4")
98 | pprint(pkg.info)
99 |
100 | Run it, and we will get the same output as before.
101 |
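102 | 
103 | A JSON-Based Sketch
104 | -------------------
105 | 
106 | The network pipeline also provides JSONResponseProperty and
107 | URLQueryProperty. The snippet below is only a minimal sketch: the host
108 | api.example.com, the URL rule, and every attribute name in it are
109 | hypothetical and exist for illustration only.
110 | 
111 | .. code-block:: python
112 | 
113 |     # sites/example_api.py -- hypothetical module, for illustration only
114 |     from brownant.site import Site
115 |     from brownant.dinergate import Dinergate
116 |     from brownant.pipeline.network import (JSONResponseProperty,
117 |                                             URLQueryProperty)
118 | 
119 |     site = Site(name="example_api")
120 | 
121 | 
122 |     @site.route("api.example.com", "/items/<int:item_id>")
123 |     class ItemInfo(Dinergate):
124 | 
125 |         # URL requested by the pipeline properties below
126 |         URL_TEMPLATE = "http://api.example.com/items/{self.item_id}"
127 | 
128 |         # optional "?page=n" query argument of the dispatched URL
129 |         page = URLQueryProperty(name="page", type=int, required=False)
130 | 
131 |         # parsed JSON body of the HTTP response
132 |         data = JSONResponseProperty(method="GET")
133 | 
134 |         @property
135 |         def info(self):
136 |             return {"id": self.item_id, "page": self.page,
137 |                     "data": self.data}
138 | 
139 | Mounting and dispatching work exactly as in the declarative demo above,
140 | via app.mount_site("sites.example_api:site") and app.dispatch_url.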
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [pytest]
2 | addopts = --cov brownant --pep8
3 | pep8ignore =
4 | docs/conf.py ALL
5 | docs/_themes/* ALL
6 | [bdist_wheel]
7 | universal = 1
8 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 | from os.path import dirname, realpath, join
3 |
4 | current_dir = dirname(realpath(__file__))
5 |
6 | with open(join(current_dir, "README.rst")) as long_description_file:
7 | long_description = long_description_file.read()
8 |
9 | install_requires = [
10 | "Werkzeug >= 0.8",
11 | "requests >= 1.0",
12 | "lxml >= 3.7.3",
13 | "six",
14 | ]
15 |
16 | setup(
17 | name="brownant",
18 | packages=find_packages(exclude=["tests", "docs"]),
19 | version="0.1.7",
20 | description="A lightweight web data extracting framework.",
21 | long_description=long_description,
22 | author="Subject-Dev Team, Douban Inc.",
23 | author_email="subject-dev@douban.com",
24 | url="https://github.com/douban/brownant",
25 | license="BSD",
26 | keywords=["extract", "web data"],
27 | classifiers=[
28 | "Programming Language :: Python",
29 | "Programming Language :: Python :: 2.7",
30 | "Programming Language :: Python :: 3.3",
31 | "Programming Language :: Python :: 3.4",
32 | "Programming Language :: Python :: Implementation :: PyPy",
33 | "Development Status :: 3 - Alpha",
34 | "License :: OSI Approved :: BSD License",
35 | "Intended Audience :: Developers",
36 | "Operating System :: OS Independent",
37 | "Environment :: Other Environment",
38 | "Topic :: Software Development :: Libraries :: Python Modules",
39 | ],
40 | install_requires=install_requires,
41 | )
42 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/douban/brownant/3c7e6d30f67b8f0f8ca1f823ea3daed74e8725cd/tests/__init__.py
--------------------------------------------------------------------------------
/tests/test_app.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, unicode_literals
2 |
3 | from pytest import fixture, raises
4 | from mock import patch
5 |
6 | from brownant import Brownant, redirect
7 | from brownant.exceptions import NotSupported
8 |
9 |
10 | class StubEndpoint(object):
11 |
12 | name = __name__ + ".StubEndpoint"
13 |
14 | def __init__(self, request, id_, **kwargs):
15 | self.request = request
16 | self.id_ = id_
17 |
18 |
19 | def redirect_endpoint(request, **kwargs):
20 | should_redirect = (request.args.get("r") == "1")
21 | if should_redirect:
22 | return redirect("http://redirect.example.com/42?id=24")
23 | return kwargs, request
24 |
25 |
26 | redirect_endpoint.__qualname__ = __name__ + "." + redirect_endpoint.__name__
27 |
28 |
29 | @fixture
30 | def app():
31 | _app = Brownant()
32 |     _app.add_url_rule("m.example.com", "/item/<int:id_>", StubEndpoint.name)
33 |     _app.add_url_rule("m.example.co.jp", "/item/<id_>", StubEndpoint.name)
34 | return _app
35 |
36 |
37 | def test_new_app(app):
38 | assert isinstance(app, Brownant)
39 | assert callable(app.add_url_rule)
40 | assert callable(app.dispatch_url)
41 | assert callable(app.mount_site)
42 |
43 |
44 | def test_match_url(app):
45 | stub = app.dispatch_url("http://m.example.com/item/289263?page=1&q=t")
46 |
47 | assert stub.id_ == 289263
48 | assert stub.request.args["page"] == "1"
49 | assert stub.request.args["q"] == "t"
50 |
51 | with raises(KeyError):
52 | stub.request.args["other"]
53 |
54 | assert repr(stub.request).startswith("Request(")
55 | assert repr(stub.request).endswith(")")
56 | assert "url=" in repr(stub.request)
57 | assert "m.example.com" in repr(stub.request)
58 | assert "/item/289263" in repr(stub.request)
59 | assert "args=" in repr(stub.request)
60 |
61 | assert stub.request.url.scheme == "http"
62 | assert stub.request.url.hostname == "m.example.com"
63 | assert stub.request.url.path == "/item/289263"
64 |
65 | assert stub.request.args.get("page", type=int) == 1
66 | assert stub.request.args["q"] == "t"
67 |
68 |
69 | def test_match_url_without_redirect(app):
70 |     app.add_url_rule("detail.example.com", "/item/<int:id_>",
71 |                      StubEndpoint.name, defaults={"p": "a"})
72 |     app.add_url_rule("mdetail.example.com", "/item/<int:id_>",
73 |                      StubEndpoint.name, defaults={"p": "a"})
74 |
75 | stub = app.dispatch_url("http://detail.example.com/item/12346?page=6")
76 | assert stub.id_ == 12346
77 | assert stub.request.args.get("page", type=int) == 6
78 |
79 | stub = app.dispatch_url("http://mdetail.example.com/item/12346?page=6")
80 | assert stub.id_ == 12346
81 | assert stub.request.args.get("page", type=int) == 6
82 |
83 |
84 | def test_match_url_with_redirect(app):
85 | app.add_url_rule("m.example.com", "/42", StubEndpoint.name,
86 | redirect_to="item/42")
87 |
88 | stub = app.dispatch_url("http://m.example.com/item/42/?page=6")
89 | assert stub.id_ == 42
90 | assert stub.request.args.get("page", type=int) == 6
91 |
92 | stub = app.dispatch_url("http://m.example.com/42?page=6")
93 | assert stub.id_ == 42
94 | assert stub.request.args.get("page", type=int) == 6
95 |
96 | stub = app.dispatch_url("http://m.example.com/item/42/")
97 | assert stub.id_ == 42
98 | with raises(KeyError):
99 | stub.request.args["page"]
100 |
101 | stub = app.dispatch_url("http://m.example.com/42")
102 | assert stub.id_ == 42
103 | with raises(KeyError):
104 | stub.request.args["page"]
105 |
106 |
107 | def test_match_url_and_handle_user_redirect(app):
108 | domain = "redirect.example.com"
109 |     app.add_url_rule(domain, "/<id>", redirect_endpoint.__qualname__)
110 |
111 | kwargs, request = app.dispatch_url("http://{0}/123?id=5".format(domain))
112 | assert kwargs == {"id": "123"}
113 | assert request.args["id"] == "5"
114 |
115 | kwargs, request = app.dispatch_url("http://{0}/1?id=5&r=1".format(domain))
116 | assert kwargs == {"id": "42"}
117 | assert request.args["id"] == "24"
118 |
119 |
120 | def test_match_non_ascii_url(app):
121 | url = u"http://m.example.co.jp/item/\u30de\u30a4\u30f3\u30c9"
122 | stub = app.dispatch_url(url)
123 |
124 | encoded_path = "/item/%E3%83%9E%E3%82%A4%E3%83%B3%E3%83%89"
125 | assert stub.request.url.scheme == "http"
126 | assert stub.request.url.hostname == "m.example.co.jp"
127 | assert stub.request.url.path == encoded_path
128 |
129 |
130 | def test_match_non_ascii_query(app):
131 | url = u"http://m.example.co.jp/item/test?src=\u63a2\u9669&r=1"
132 | stub = app.dispatch_url(url)
133 |
134 | assert stub.request.url.scheme == "http"
135 | assert stub.request.url.hostname == "m.example.co.jp"
136 | assert stub.request.url.path == "/item/test"
137 | assert stub.request.url.query == "src=%E6%8E%A2%E9%99%A9&r=1"
138 |
139 | assert set(stub.request.args) == {"src", "r"}
140 | assert stub.request.args["src"] == u"\u63a2\u9669"
141 | assert stub.request.args["r"] == "1"
142 |
143 |
144 | def test_match_unexpected_url(app):
145 |     unexpected_url = "http://m.example.com/category/19352"
146 | 
147 |     with raises(NotSupported) as error:
148 |         app.dispatch_url(unexpected_url)
149 | 
150 |     # ensure the exception message is useful
151 |     assert unexpected_url in str(error)
152 | 
153 |     # ensure a rule can still be added at runtime
154 |     app.add_url_rule("m.example.com", "/category/<int:id_>", StubEndpoint.name)
155 |     stub = app.dispatch_url(unexpected_url)
156 | assert stub.id_ == 19352
157 | assert len(stub.request.args) == 0
158 |
159 |
160 | def test_match_invalid_url(app):
161 | # empty string
162 | with raises(NotSupported) as error:
163 | app.dispatch_url("")
164 | assert "invalid" in str(error)
165 |
166 |     # no hostname
167 | with raises(NotSupported) as error:
168 | app.dispatch_url("/")
169 | assert "invalid" in str(error)
170 |
171 |     # no hostname or path
172 | with raises(NotSupported) as error:
173 | app.dispatch_url("\\")
174 | assert "invalid" in str(error)
175 |
176 |     # non-HTTP scheme
177 | with raises(NotSupported) as error:
178 | app.dispatch_url("ftp://example.com")
179 | assert "invalid" in str(error)
180 |
181 |     # valid URLs that match no rule
182 | with raises(NotSupported) as error:
183 | app.dispatch_url("http://example.com")
184 | assert "invalid" not in str(error)
185 |
186 | with raises(NotSupported) as error:
187 | app.dispatch_url("https://example.com")
188 | assert "invalid" not in str(error)
189 |
190 |
191 | foo_site = object()
192 |
193 |
194 | def test_mount_site(app):
195 | foo_site_name = __name__ + ".foo_site"
196 | with patch(foo_site_name):
197 | app.mount_site(foo_site)
198 | foo_site.play_actions.assert_called_with(target=app)
199 |
200 |
201 | def test_mount_site_by_string_name(app):
202 | foo_site_name = __name__ + ".foo_site"
203 | with patch(foo_site_name):
204 | app.mount_site(foo_site_name)
205 | foo_site.play_actions.assert_called_with(target=app)
206 |
--------------------------------------------------------------------------------
/tests/test_deprecation.py:
--------------------------------------------------------------------------------
1 | from brownant import Brownant, BrownAnt
2 |
3 |
4 | def test_deprecation(recwarn):
5 | app = BrownAnt()
6 | warning = recwarn.pop(DeprecationWarning)
7 |
8 | assert isinstance(app, Brownant)
9 | assert issubclass(warning.category, DeprecationWarning)
10 | assert "Brownant" in str(warning.message)
11 | assert "app.py" in warning.filename
12 | assert warning.lineno
13 |
--------------------------------------------------------------------------------
/tests/test_dinergate.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, unicode_literals
2 |
3 | from mock import Mock
4 | from pytest import raises
5 |
6 | from brownant import Dinergate
7 |
8 |
9 | def test_basic():
10 | from requests import Session
11 | from werkzeug.utils import cached_property
12 |
13 | @cached_property
14 | def func_without_name(self):
15 | return [self]
16 | func_without_name.__name__ = None
17 |
18 | class FooDinergate(Dinergate):
19 | bar = func_without_name
20 |
21 | assert FooDinergate.bar.__name__ == "bar"
22 |
23 | mock_request = Mock()
24 | ant = FooDinergate(mock_request)
25 |
26 | assert ant.request is mock_request
27 | assert isinstance(ant.http_client, Session)
28 | assert ant.bar == [ant]
29 |
30 |
31 | def test_custom_kwargs():
32 | mock_request = Mock()
33 | ant = Dinergate(mock_request, foo=42, bar="hello")
34 | assert ant.foo == 42
35 | assert ant.bar == "hello"
36 |
37 |
38 | def test_custom_http_client():
39 | mock_request = Mock()
40 | mock_http_client = Mock()
41 | ant = Dinergate(mock_request, mock_http_client)
42 |
43 | ant.request.args.get("name", type=str)
44 | mock_request.args.get.assert_called_once_with("name", type=str)
45 |
46 | ant.http_client.post("http://example.com")
47 | mock_http_client.post.assert_called_once_with("http://example.com")
48 |
49 |
50 | def test_url_template():
51 | class FooDinergate(Dinergate):
52 | foo = 42
53 | bar = "page"
54 |
55 | URL_TEMPLATE = "http://example.com/{self.bar}/{self.foo}"
56 |
57 | ant = FooDinergate(request=Mock(), http_client=Mock())
58 | assert ant.url == "http://example.com/page/42"
59 |
60 | dead_ant = Dinergate(request=Mock(), http_client=Mock())
61 | with raises(NotImplementedError):
62 | dead_ant.url
63 |
--------------------------------------------------------------------------------
/tests/test_pipeline/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/douban/brownant/3c7e6d30f67b8f0f8ca1f823ea3daed74e8725cd/tests/test_pipeline/__init__.py
--------------------------------------------------------------------------------
/tests/test_pipeline/test_base.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, unicode_literals
2 |
3 | from pytest import raises
4 |
5 | from brownant.pipeline.base import PipelineProperty
6 |
7 |
8 | def test_required_attrs():
9 | class SpamProperty(PipelineProperty):
10 | required_attrs = {"egg"}
11 |
12 | def provide_value(self, obj):
13 | return obj
14 |
15 | # valid
16 | spam_property = SpamProperty(egg=42)
17 | assert spam_property.egg == 42
18 | assert "egg" not in spam_property.options
19 | assert "egg" not in spam_property.attr_names
20 | with raises(AttributeError):
21 | spam_property.foo
22 |
23 | # invalid
24 | with raises(TypeError) as excinfo:
25 | spam_property = SpamProperty(spam=42)
26 | assert "egg" in repr(excinfo.value)
27 |
28 |
29 | def test_attr_name():
30 | class SpamProperty(PipelineProperty):
31 | def prepare(self):
32 | self.attr_names.setdefault("egg_attr", "egg")
33 |
34 | def provide_value(self, obj):
35 | return self.get_attr(obj, "egg_attr")
36 |
37 | class Spam(object):
38 | def __init__(self, **kwargs):
39 | vars(self).update(kwargs)
40 |
41 | spam_a = SpamProperty(egg=42)
42 | assert spam_a.attr_names["egg_attr"] == "egg"
43 | assert spam_a.provide_value(Spam(egg=1024)) == 1024
44 |
45 | spam_b = SpamProperty(egg=42, egg_attr="foo_egg")
46 | assert spam_b.attr_names["egg_attr"] == "foo_egg"
47 | assert spam_b.provide_value(Spam(foo_egg=2048)) == 2048
48 |
49 |
50 | def test_optional_attr():
51 | class SpamProperty(PipelineProperty):
52 | required_attrs = {"egg"}
53 |
54 | def provide_value(self, obj):
55 | return obj
56 |
57 | spam = SpamProperty(egg=41, foo=42, bar=43, aha_attr=44)
58 | assert spam.options["foo"] == 42
59 | assert spam.options["bar"] == 43
60 | assert "egg" not in spam.options
61 | assert "aha_attr" not in spam.options
62 |
--------------------------------------------------------------------------------
/tests/test_pipeline/test_html.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, unicode_literals
2 |
3 | from pytest import raises
4 | from mock import patch, Mock
5 |
6 | from brownant.pipeline.html import ElementTreeProperty, XPathTextProperty
7 |
8 |
9 | # ElementTreeProperty
10 |
11 | def test_etree_default_attr_name():
12 | etree = ElementTreeProperty()
13 | assert etree.attr_names["text_response_attr"] == "text_response"
14 |
15 |
16 | def test_etree_default_encoding_should_be_none():
17 | etree = ElementTreeProperty()
18 | assert etree.options["encoding"] is None
19 |
20 |
21 | @patch("lxml.html.fromstring")
22 | def test_etree_general_parse_with_default(fromstring):
23 | mock = Mock()
24 | etree = ElementTreeProperty()
25 | etree.provide_value(mock)
26 | fromstring.assert_called_once_with(mock.text_response)
27 |
28 |
29 | @patch("lxml.html.fromstring")
30 | def test_etree_general(fromstring):
31 | mock = Mock()
32 | etree = ElementTreeProperty(text_response_attr="foo")
33 | etree.provide_value(mock)
34 | fromstring.assert_called_once_with(mock.foo)
35 |
36 |
37 | @patch("lxml.html.fromstring")
38 | def test_etree_general_parse_with_encoding(fromstring):
39 | mock = Mock()
40 | etree = ElementTreeProperty(text_response_attr="foo",
41 | encoding="utf-8")
42 | etree.provide_value(mock)
43 | fromstring.assert_called_once_with(mock.foo.encode("utf-8"))
44 |
45 |
46 | # XPathTextProperty
47 |
48 | def test_xpath_default_attr_name():
49 | with raises(TypeError):
50 | XPathTextProperty()
51 |
52 | text = XPathTextProperty(xpath="//path")
53 | assert text.xpath == "//path"
54 | assert text.attr_names["etree_attr"] == "etree"
55 | assert text.options["strip_spaces"] is False
56 | assert text.options["pick_mode"] == "join"
57 | assert text.options["joiner"] == " "
58 |
59 |
60 | def test_xpath_without_spaces():
61 | mock = Mock()
62 | mock.tree.xpath.return_value = ["a", "b", "c"]
63 |
64 | # pick_mode: join
65 | text = XPathTextProperty(xpath="//path", etree_attr="tree",
66 | pick_mode="join", joiner="|")
67 | rv = text.provide_value(mock)
68 | mock.tree.xpath.assert_called_with("//path")
69 | assert rv == "a|b|c"
70 |
71 | # pick_mode: first
72 | text = XPathTextProperty(xpath="//another-path", etree_attr="tree",
73 | pick_mode="first")
74 | rv = text.provide_value(mock)
75 | mock.tree.xpath.assert_called_with("//another-path")
76 | assert rv == "a"
77 |
78 |
79 | def test_xpath_with_striping_spaces():
80 | mock = Mock()
81 | mock.tree.xpath.return_value = [" a ", "\n b \n", "\n\n c \t"]
82 |
83 | # strip_spaces and join
84 | text = XPathTextProperty(xpath="//foo-path", etree_attr="tree",
85 | pick_mode="join", strip_spaces=True)
86 | rv = text.provide_value(mock)
87 | mock.tree.xpath.assert_called_with("//foo-path")
88 | assert rv == "a b c"
89 |
90 | # strip_spaces and first
91 | text = XPathTextProperty(xpath="//bar-path", etree_attr="tree",
92 | pick_mode="first", strip_spaces=True)
93 | rv = text.provide_value(mock)
94 | mock.tree.xpath.assert_called_with("//bar-path")
95 | assert rv == "a"
96 |
97 |
98 | def test_xpath_keep_pick_mode():
99 | mock = Mock()
100 | value = ['a', 'b', 'c']
101 | mock.tree.xpath.return_value = value
102 |
103 | text = XPathTextProperty(xpath="//foo-path", etree_attr="tree",
104 | pick_mode="keep")
105 | rv = text.provide_value(mock)
106 | mock.tree.xpath.assert_called_with("//foo-path")
107 | assert rv == value
108 |
109 |
110 | def test_xpath_invalid_pick_mode():
111 | with raises(ValueError) as excinfo:
112 | text = XPathTextProperty(xpath="//foo-path", pick_mode="unknown")
113 | text.provide_value(Mock())
114 | assert "unknown" in repr(excinfo.value)
115 |
--------------------------------------------------------------------------------
/tests/test_pipeline/test_network.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, unicode_literals
2 |
3 | from mock import Mock, patch
4 | from pytest import raises
5 |
6 | from brownant.exceptions import NotSupported
7 | from brownant.pipeline.network import (HTTPClientProperty, URLQueryProperty,
8 | TextResponseProperty, ResponseProperty,
9 | JSONResponseProperty)
10 |
11 |
12 | def test_http_client():
13 | dinergate = Mock()
14 | with patch("requests.Session") as Session:
15 | instance = Session.return_value
16 | http_client = HTTPClientProperty(session_class=Session)
17 | assert http_client.provide_value(dinergate) is instance
18 | Session.assert_called_once_with()
19 |
20 |
21 | def test_url_query():
22 | mock = Mock()
23 | mock.request.args.get.return_value = "42"
24 |
25 | url_query = URLQueryProperty(name="value")
26 | rv = url_query.provide_value(mock)
27 |
28 | assert rv == "42"
29 | mock.request.args.get.assert_called_once_with("value", type=None)
30 |
31 |
32 | def test_url_query_type():
33 | mock = Mock()
34 | mock.request.args.get.return_value = 42
35 |
36 | url_query = URLQueryProperty(name="value", type=int)
37 | rv = url_query.provide_value(mock)
38 |
39 | assert rv == 42
40 | mock.request.args.get.assert_called_once_with("value", type=int)
41 |
42 |
43 | def test_url_query_required():
44 | mock = Mock()
45 | mock.request.args.get.return_value = None
46 |
47 | url_query = URLQueryProperty(name="value") # default be required
48 | with raises(NotSupported):
49 | url_query.provide_value(mock)
50 |
51 |
52 | def test_url_query_optional():
53 | mock = Mock()
54 | mock.request.args.get.return_value = None
55 |
56 | url_query = URLQueryProperty(name="d", type=float, required=False)
57 | rv = url_query.provide_value(mock)
58 |
59 | assert rv is None
60 | mock.request.args.get.assert_called_once_with("d", type=float)
61 |
62 |
63 | def test_url_query_required_boundary_condition():
64 | mock = Mock()
65 | mock.request.args.get.return_value = 0
66 |
67 | url_query = URLQueryProperty(name="num")
68 | rv = url_query.provide_value(mock)
69 |
70 | assert rv == 0
71 | mock.request.args.get.assert_called_once_with("num", type=None)
72 |
73 |
74 | def test_base_response():
75 | response = Mock()
76 | response.text = "OK"
77 |
78 | mock = Mock()
79 | mock.url = "http://example.com"
80 | mock.http_client.request.return_value = response
81 |
82 |     prop = ResponseProperty()
83 |     with raises(KeyError):
84 |         prop.provide_value(mock)
85 |
86 |
87 | def test_text_response():
88 | class HTTPError(Exception):
89 | pass
90 |
91 | response = Mock()
92 | response.content = "OK"
93 | response.raise_for_status.side_effect = [None, HTTPError()]
94 |
95 | mock = Mock()
96 | mock.url = "http://example.com"
97 | mock.http_client.request.return_value = response
98 |
99 | text = TextResponseProperty(method="POST")
100 | rv = text.provide_value(mock)
101 |
102 | assert rv == "OK"
103 | response.raise_for_status.assert_called_once_with()
104 | mock.http_client.request.assert_called_once_with(
105 | method="POST", url="http://example.com")
106 |
107 | with raises(HTTPError):
108 | text.provide_value(mock)
109 |
110 |
111 | def test_json_response():
112 | class HTTPError(Exception):
113 | pass
114 |
115 | response = Mock()
116 | response.json.return_value = {'a': 1, 'b': {'c': 2, 'd': 3}}
117 | response.raise_for_status.side_effect = [None, HTTPError()]
118 |
119 | mock = Mock()
120 | mock.url = "http://example.com"
121 | mock.http_client.request.return_value = response
122 |
123 | json = JSONResponseProperty(method="POST")
124 | rv = json.provide_value(mock)
125 |
126 | assert rv == {
127 | 'a': 1,
128 | 'b': {
129 | 'c': 2,
130 | 'd': 3
131 | }
132 | }
133 | response.raise_for_status.assert_called_once_with()
134 | mock.http_client.request.assert_called_once_with(
135 | method="POST", url="http://example.com")
136 |
137 | with raises(HTTPError):
138 | json.provide_value(mock)
139 |
--------------------------------------------------------------------------------
/tests/test_site.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, unicode_literals
2 |
3 | from pytest import fixture
4 | from mock import Mock
5 |
6 | from brownant import Site
7 |
8 |
9 | @fixture
10 | def sites():
11 | _sites = {
12 | "s1": Site("s1"),
13 | "s2": Site("s2"),
14 | "s3": Site("s3"),
15 | }
16 | return _sites
17 |
18 |
19 | def test_new_site(sites):
20 | assert sites["s1"].name == "s1"
21 | assert sites["s2"].name == "s2"
22 | assert sites["s3"].name == "s3"
23 |
24 | assert sites["s1"].actions == []
25 | assert sites["s2"].actions == []
26 | assert sites["s3"].actions == []
27 |
28 |
29 | def test_record_and_play_actions(sites):
30 | site = sites["s1"]
31 |
32 | mock = Mock()
33 | site.record_action("method_a", 10, "s", is_it=True)
34 | site.play_actions(target=mock)
35 | mock.method_a.assert_called_once_with(10, "s", is_it=True)
36 |
37 |
38 | def test_route(sites):
39 | site = sites["s1"]
40 |
41 |     @site.route("m.example.com", "/article/<article_id>")
42 | def handler(request, article_id):
43 | pass
44 |
45 | mock = Mock()
46 | site.play_actions(target=mock)
47 | mock.add_url_rule.assert_called_once_with(
48 | "m.example.com",
49 |         "/article/<article_id>",
50 | __name__ + ":handler"
51 | )
52 |
--------------------------------------------------------------------------------
/tests/test_utils.py:
--------------------------------------------------------------------------------
1 | from pytest import raises
2 |
3 | from brownant.utils import to_bytes_safe
4 |
5 |
6 | UNICODE_STRING_SAMPLE = u"\u5b89\u5168 SAFE"
7 | BYTES_SEQUENCE_SAMPLE = b"\xe5\xae\x89\xe5\x85\xa8 SAFE"
8 |
9 |
10 | def test_to_bytes_safe():
11 | assert to_bytes_safe(UNICODE_STRING_SAMPLE) == BYTES_SEQUENCE_SAMPLE
12 | assert to_bytes_safe(BYTES_SEQUENCE_SAMPLE) == BYTES_SEQUENCE_SAMPLE
13 | assert to_bytes_safe(u"ABC") == b"ABC"
14 | assert to_bytes_safe(b"ABC") == b"ABC"
15 |
16 | assert type(to_bytes_safe(UNICODE_STRING_SAMPLE)) is bytes
17 | assert type(to_bytes_safe(BYTES_SEQUENCE_SAMPLE)) is bytes
18 | assert type(to_bytes_safe(u"ABC")) is bytes
19 | assert type(to_bytes_safe(b"ABC")) is bytes
20 |
21 | with raises(TypeError):
22 | to_bytes_safe(42)
23 |
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | envlist = py27,py33,py34,pypy
3 | [testenv]
4 | deps =
5 | pytest
6 | pytest-cov
7 | pytest-pep8
8 | mock
9 | commands =
10 | py.test
11 |
--------------------------------------------------------------------------------