├── tests ├── __init__.py ├── extractors │ ├── __init__.py │ └── test_extractor.py ├── test_quantity.py └── test_parsers.py ├── requirements.txt ├── xextract ├── __init__.py ├── extractors │ ├── __init__.py │ ├── extractor_list.py │ └── lxml_extractor.py ├── quantity.py └── parsers.py ├── MANIFEST.in ├── CHANGELOG ├── .travis.yml ├── LICENSE ├── .gitignore ├── setup.py └── README.rst /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/extractors/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | lxml 2 | cssselect 3 | -------------------------------------------------------------------------------- /xextract/__init__.py: -------------------------------------------------------------------------------- 1 | from .parsers import * 2 | 3 | __version__ = '0.1.9' 4 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.rst 2 | include LICENSE 3 | include MANIFEST.in 4 | include requirements.txt 5 | -------------------------------------------------------------------------------- /CHANGELOG: -------------------------------------------------------------------------------- 1 | * Argument `quant` is deprecated. Use `count` instead. 2 | * Dropped support for Python 2. 
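The `count` argument mentioned in the CHANGELOG follows the quantity syntax implemented in xextract/quantity.py further below (`'*'`, `'+'`, `'?'`, a single integer, or a `'lower, upper'` range). A stdlib-only sketch of the same matching rules — an illustration, not the library's own code; the function name `check_count` is invented here:

```python
import re

def check_count(spec, n):
    # '*' -> any number; '+' -> at least one; '?' -> zero or one;
    # an int (or digit string) -> exactly that many;
    # 'lower, upper' or a (lower, upper) pair -> inclusive range
    if isinstance(spec, int):
        return n == spec
    if isinstance(spec, (list, tuple)) and len(spec) == 2:
        lower, upper = spec
        return lower <= n <= upper
    spec = spec.strip()
    if spec == '*':
        return n >= 0
    if spec == '+':
        return n >= 1
    if spec == '?':
        return n in (0, 1)
    m = re.match(r'^(\d+)\s*,\s*(\d+)$', spec)
    if m:
        return int(m.group(1)) <= n <= int(m.group(2))
    return n == int(spec)
```

The real `Quantity` class additionally validates the spec itself up front (raising `ValueError` for malformed values such as `'3,2'` or `-1`); this sketch omits that step.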
3 | -------------------------------------------------------------------------------- /xextract/extractors/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ['XPathExtractor', 'XmlXPathExtractor', 'HtmlXPathExtractor'] 2 | 3 | 4 | from .lxml_extractor import XPathExtractor, XmlXPathExtractor, HtmlXPathExtractor 5 | -------------------------------------------------------------------------------- /xextract/extractors/extractor_list.py: -------------------------------------------------------------------------------- 1 | class XPathExtractorList(list): 2 | def __getitem__(self, item): 3 | # __getslice__ (used here originally) is never called on Python 3; 4 | # re-wrap slices so they keep the subclass type 5 | result = list.__getitem__(self, item) 6 | return self.__class__(result) if isinstance(item, slice) else result 7 | 8 | def select(self, xpath): 9 | return self.__class__(node for extractor in self for node in extractor.select(xpath)) 10 | 11 | def extract(self): 12 | return [x.extract() for x in self] 13 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | python: 4 | - "3.5" 5 | - "3.6" 6 | - "3.7" 7 | - "3.8" 8 | - "3.9" 9 | - "3.10" 10 | - "3.11" 11 | - "pypy3" 12 | 13 | # Use container-based infrastructure 14 | sudo: false 15 | 16 | install: 17 | - pip install . 18 | - pip install -r requirements.txt 19 | - pip install coverage 20 | 21 | script: 22 | - coverage run --source=xextract -m unittest discover 23 | 24 | after_success: 25 | - pip install coveralls 26 | - coveralls 27 | 28 | after_script: 29 | - coverage report 30 | - pip install pep8 pyflakes 31 | - pyflakes .| tee >(wc -l) 32 | - pep8 --statistics --count .
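`XPathExtractorList` keeps its own type across chained `select()` calls so that `.extract()` stays available on the result. The same slice-preserving idea in a self-contained stdlib sketch (the class name `TypedList` is made up for illustration; on Python 3 the hook is `__getitem__`, since `__getslice__` is consulted only by Python 2):

```python
class TypedList(list):
    # Minimal illustration of a list subclass that survives slicing.
    def __getitem__(self, item):
        result = list.__getitem__(self, item)
        # plain list.__getitem__ returns a bare list for slices; re-wrap it
        return self.__class__(result) if isinstance(item, slice) else result
```

Without the override, `TypedList([1, 2, 3])[:2]` would come back as a plain `list` and any subclass methods would be lost on the sliced value.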
33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2016 Michal "Mimino" Danilak 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .cache 40 | nosetests.xml 41 | coverage.xml 42 | 43 | # Translations 44 | *.mo 45 | *.pot 46 | 47 | # Django stuff: 48 | *.log 49 | 50 | # Sphinx documentation 51 | docs/_build/ 52 | 53 | # PyBuilder 54 | target/ 55 | 56 | .DS_Store 57 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | 4 | with open('README.rst', 'r') as f: 5 | readme = f.read() 6 | 7 | 8 | setup( 9 | name='xextract', 10 | version='0.1.9', 11 | description='Extract structured data from HTML and XML documents like a boss.', 12 | long_description=readme, 13 | long_description_content_type='text/x-rst', 14 | author='Michal "Mimino" Danilak', 15 | author_email='michal.danilak@gmail.com', 16 | url='https://github.com/Mimino666/python-xextract', 17 | keywords='HTML parse parsing extraction extract crawl', 18 | packages=['xextract', 19 | 'xextract.extractors'], 20 | package_data={'': ['LICENSE']}, 21 | include_package_data=True, 22 | install_requires=['lxml', 'cssselect'], 23 | test_suite='tests', 24 | license='MIT', 25 | zip_safe=False, 26 | classifiers=( 27 | 'Development Status :: 4 - Beta', 28 | 'Intended Audience :: Developers', 29 | 'Natural Language :: English', 30 | 'License :: OSI Approved :: MIT License', 31 | 'Programming Language :: Python', 32 | 'Programming Language :: Python :: 3', 33 | 'Programming Language :: Python :: 3.5', 34 | 'Programming Language :: Python :: 3.6', 35 | 'Programming Language :: Python :: 3.7', 36 | 'Programming Language :: Python :: 3.8', 37 | 'Programming Language :: Python :: 3.9', 38 | 'Programming Language :: Python :: 3.10', 39 | 'Programming Language :: Python :: 3.11', 40 | 'Programming Language :: Python :: 
3 :: Only', 41 | 'Programming Language :: Python :: Implementation :: CPython', 42 | 'Programming Language :: Python :: Implementation :: PyPy', 43 | ), 44 | ) 45 | -------------------------------------------------------------------------------- /tests/test_quantity.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from xextract.quantity import Quantity 4 | 5 | 6 | class TestQuantity(unittest.TestCase): 7 | def test_create(self): 8 | self.assertEqual(Quantity().raw_quantity, '*') 9 | good = [' *', ' + ', '? ', ' 1321 ', '007', 10 | ' 8800, 9231 ', '1,2', '9999', '5,5', 9999, '0', '10000', 0, 10000, 11 | (1, 2), (0, 0)] 12 | for g in good: 13 | quantity = Quantity(g) 14 | self.assertIsNotNone(quantity._check_quantity_func) 15 | 16 | bad = ['', None, ' * * ', '+*', ' ', '1 2', '1,2,3', '+2', '-2', '3,2', 1.0, 17 | -1, (3, 2), (-1, 5)] 18 | for b in bad: 19 | self.assertRaises(ValueError, Quantity, b) 20 | 21 | def test_err(self): 22 | q = Quantity('*') 23 | err = ['0', [0], None, 'help'] 24 | for e in err: 25 | self.assertRaises(ValueError, q.check_quantity, e) 26 | 27 | def test_star(self): 28 | q = Quantity('*') 29 | self._test_good(q, [0, 1, 2, 5, 10, 1000, 2**30]) 30 | self._test_bad(q, [-1, -2]) 31 | 32 | def test_plus(self): 33 | q = Quantity('+') 34 | self._test_good(q, [1, 2, 5, 10, 1000, 2**30]) 35 | self._test_bad(q, [0, -1, -2]) 36 | 37 | def test_ques(self): 38 | q = Quantity('?') 39 | self._test_good(q, [0, 1]) 40 | self._test_bad(q, [-2, -1, 2, 3, 10, 100]) 41 | 42 | def test_1d(self): 43 | q = Quantity('47') 44 | self._test_good(q, [47]) 45 | self._test_bad(q, [0, 1, -1, -47, 46, 48, 100]) 46 | 47 | q = Quantity(47) 48 | self._test_good(q, [47]) 49 | self._test_bad(q, [0, 1, -1, -47, 46, 48, 100]) 50 | 51 | def test_2d(self): 52 | q = Quantity('5, 10') 53 | self._test_good(q, [5, 6, 7, 8, 9, 10]) 54 | self._test_bad(q, [0, 1, 2, 3, 4, 11, 12, 13, -5, -10]) 55 | 56 | q = Quantity((5, 10)) 57 
| self._test_good(q, [5, 6, 7, 8, 9, 10]) 58 | self._test_bad(q, [0, 1, 2, 3, 4, 11, 12, 13, -5, -10]) 59 | 60 | def _test_good(self, q, good): 61 | for g in good: 62 | self.assertTrue(q.check_quantity(g)) 63 | 64 | def _test_bad(self, q, bad): 65 | for b in bad: 66 | self.assertFalse(q.check_quantity(b)) 67 | -------------------------------------------------------------------------------- /xextract/extractors/lxml_extractor.py: -------------------------------------------------------------------------------- 1 | from lxml import etree 2 | 3 | from .extractor_list import XPathExtractorList 4 | 5 | 6 | class XPathExtractor(object): 7 | _parser = etree.HTMLParser 8 | _tostring_method = 'html' 9 | 10 | def __init__(self, body=None, namespaces=None, _root=None): 11 | self.namespaces = namespaces 12 | if _root is None: 13 | self._root = self._get_root(body) 14 | else: 15 | self._root = _root 16 | 17 | def _get_root(self, body, encoding=None): 18 | body = body.strip() or self._empty_doc 19 | if isinstance(body, str): 20 | body = body.encode('utf-8') 21 | encoding = 'utf-8' 22 | parser = self._parser(recover=True, encoding=encoding) 23 | return etree.fromstring(body, parser=parser) 24 | 25 | def select(self, xpath): 26 | if not hasattr(self._root, 'xpath'): 27 | return XPathExtractorList([]) 28 | 29 | if isinstance(xpath, etree.XPath): 30 | result = xpath(self._root) 31 | else: 32 | result = self._root.xpath(xpath, namespaces=self.namespaces) 33 | 34 | if not isinstance(result, list): 35 | result = [result] 36 | 37 | return XPathExtractorList(self.__class__(_root=x, namespaces=self.namespaces) for x in result) 38 | 39 | def extract(self): 40 | try: 41 | return etree.tostring(self._root, method=self._tostring_method, 42 | encoding=str, with_tail=False) 43 | except (AttributeError, TypeError): 44 | if self._root is True: 45 | return '1' 46 | elif self._root is False: 47 | return '0' 48 | else: 49 | return str(self._root) 50 | 51 | def register_namespace(self, prefix, uri): 
52 | if self.namespaces is None: 53 | self.namespaces = {} 54 | self.namespaces[prefix] = uri 55 | 56 | def __bool__(self):  # renamed from __nonzero__, which Python 3 never calls 57 | return bool(self.extract()) 58 | 59 | def __str__(self): 60 | data = repr(self.extract()[:40]) 61 | return '<%s data=%s>' % (type(self).__name__, data) 62 | 63 | __repr__ = __str__ 64 | 65 | 66 | class XmlXPathExtractor(XPathExtractor): 67 | _parser = etree.XMLParser 68 | _tostring_method = 'xml' 69 | _empty_doc = '' 70 | 71 | 72 | class HtmlXPathExtractor(XPathExtractor): 73 | _parser = etree.HTMLParser 74 | _tostring_method = 'html' 75 | _empty_doc = '' 76 | -------------------------------------------------------------------------------- /xextract/quantity.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | class Quantity(object): 5 | ''' 6 | Quantity is used to verify that the number of items satisfies the 7 | expected quantity, which you specify with a regexp-like syntax. 8 | 9 | Syntax: 10 | * - zero or more items 11 | + - one or more items 12 | ? - zero or one item 13 | count - specified number of items (0 <= count) 14 | lower, upper - number of items in interval [lower, upper] (0 <= lower <= upper). 15 | ''' 16 | 17 | def __init__(self, quantity='*'): 18 | self.raw_quantity = quantity 19 | self.lower = self.upper = 0 # lower and upper bounds on quantity 20 | self._check_quantity_func = self._parse_quantity(quantity) 21 | 22 | def check_quantity(self, n): 23 | '''Return True if `n` matches the specified quantity.''' 24 | 25 | if not isinstance(n, int): 26 | raise ValueError( 27 | 'Invalid argument for `check_quantity()`. 
' 28 | 'Integer expected, %s received: %s' % (type(n), repr(n))) 29 | return self._check_quantity_func(n) 30 | 31 | def _check_star(self, n): 32 | return n >= 0 33 | 34 | def _check_plus(self, n): 35 | return n >= 1 36 | 37 | def _check_question_mark(self, n): 38 | return n == 0 or n == 1 39 | 40 | def _check_1d(self, n): 41 | return n == self.upper 42 | 43 | def _check_2d(self, n): 44 | return self.lower <= n <= self.upper 45 | 46 | _quantity_parsers = ( 47 | # regex, check_funcname 48 | (re.compile(r'^\s*\*\s*$'), '_check_star'), 49 | (re.compile(r'^\s*\+\s*$'), '_check_plus'), 50 | (re.compile(r'^\s*\?\s*$'), '_check_question_mark'), 51 | (re.compile(r'^\s*(?P<upper>\d+)\s*$'), '_check_1d'), 52 | (re.compile(r'^\s*(?P<lower>\d+)\s*,\s*(?P<upper>\d+)\s*$'), '_check_2d')) 53 | 54 | def _parse_quantity(self, quantity): 55 | ''' 56 | If `quantity` represents a valid quantity expression, return the 57 | method that checks for the specified quantity. 58 | 59 | Otherwise raise ValueError. 60 | ''' 61 | 62 | # quantity is specified as a single integer 63 | if isinstance(quantity, int): 64 | self.upper = quantity 65 | if 0 <= self.upper: 66 | return self._check_1d 67 | else: 68 | raise ValueError('Invalid quantity: %s' % repr(quantity)) 69 | 70 | # quantity is specified as a pair of integers 71 | if isinstance(quantity, (list, tuple)) and len(quantity) == 2: 72 | self.lower, self.upper = quantity 73 | if (isinstance(self.lower, int) and 74 | isinstance(self.upper, int) and 75 | 0 <= self.lower <= self.upper): 76 | return self._check_2d 77 | else: 78 | raise ValueError('Invalid quantity: %s' % repr(quantity)) 79 | 80 | # quantity is specified as a string 81 | if isinstance(quantity, str): 82 | for parser, check_funcname in self._quantity_parsers: 83 | match = parser.search(quantity) 84 | if match: 85 | # set lower and upper values 86 | gd = match.groupdict() 87 | self.lower = int(gd.get('lower', 0)) 88 | self.upper = int(gd.get('upper', 0)) 89 | # check lower/upper bounds 90 | if self.lower
<= self.upper: 91 | return getattr(self, check_funcname) 92 | else: 93 | raise ValueError('Invalid quantity: %s' % repr(quantity)) 94 | 95 | # quantity is of a bad type 96 | raise ValueError('Invalid quantity: %s' % repr(quantity)) 97 | 98 | @property 99 | def is_single(self): 100 | '''True, if the quantity represents a single element.''' 101 | 102 | return ( 103 | self._check_quantity_func == self._check_question_mark or 104 | (self._check_quantity_func == self._check_1d and self.upper <= 1)) 105 | -------------------------------------------------------------------------------- /tests/extractors/test_extractor.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from xextract.extractors.lxml_extractor import XPathExtractor, XmlXPathExtractor, HtmlXPathExtractor 4 | 5 | 6 | class TestXpathExtractor(unittest.TestCase): 7 | xs_cls = XPathExtractor 8 | hxs_cls = HtmlXPathExtractor 9 | xxs_cls = XmlXPathExtractor 10 | 11 | def test_extractor_simple(self): 12 | text = '
<input name="a" value="1"/><input name="b" value="2"/>
' 13 | xpath = self.hxs_cls(text) 14 | 15 | xl = xpath.select('//input') 16 | self.assertEqual(2, len(xl)) 17 | for x in xl: 18 | self.assertIsInstance(x, self.hxs_cls) 19 | 20 | self.assertEqual(xpath.select('//input').extract(), 21 | [x.extract() for x in xpath.select('//input')]) 22 | 23 | self.assertEqual([x.extract() for x in xpath.select('//input[@name="a"]/@name')], 24 | ['a']) 25 | self.assertEqual([x.extract() for x in xpath.select('number(concat(//input[@name="a"]/@value, //input[@name="b"]/@value))')], 26 | ['12.0']) 27 | 28 | self.assertEqual(xpath.select('concat("xpath", "rules")').extract(), 29 | ['xpathrules']) 30 | self.assertEqual([x.extract() for x in xpath.select('concat(//input[@name="a"]/@value, //input[@name="b"]/@value)')], 31 | ['12']) 32 | 33 | def test_extractor_unicode_query(self): 34 | text = '
<input name="\xa9" value="1"/>
' 35 | xpath = self.hxs_cls(text) 36 | self.assertEqual(xpath.select('//input[@name="\xa9"]/@value').extract(), ['1']) 37 | 38 | def test_extractor_same_type(self): 39 | '''Test XPathExtractor returning the same type in x() method.''' 40 | text = '
<p>test</p>
' 41 | self.assertIsInstance(self.xxs_cls(text).select('//p')[0], 42 | self.xxs_cls) 43 | self.assertIsInstance(self.hxs_cls(text).select('//p')[0], 44 | self.hxs_cls) 45 | 46 | def test_extractor_boolean_result(self): 47 | text = '
<input name="a"/>
' 48 | xs = self.hxs_cls(text) 49 | self.assertEqual(xs.select('//input[@name="a"]/@name="a"').extract(), ['1']) 50 | self.assertEqual(xs.select('//input[@name="a"]/@name="n"').extract(), ['0']) 51 | 52 | def test_extractor_xml_html(self): 53 | '''Test that XML and HTML XPathExtractor's behave differently.''' 54 | # some text which is parsed differently by XML and HTML flavors 55 | text = '

Hello

' 56 | 57 | self.assertEqual(self.xxs_cls(text).select('//div').extract(), 58 | ['

Hello

']) 59 | 60 | self.assertEqual(self.hxs_cls(text).select('//div').extract(), 61 | ['

Hello

']) 62 | 63 | def test_extractor_nested(self): 64 | '''Nested extractor tests.''' 65 | text = ''' 66 |
<div class="one"> <ul> <li>one</li> <li>two</li> </ul> </div> <div class="two"> <ul> <li>four</li> <li>five</li> <li>six</li> </ul> </div> 76 | ''' 77 | 78 | x = self.hxs_cls(text) 79 | divtwo = x.select('//div[@class="two"]') 80 | self.assertEqual(list(map(str.strip, divtwo.select('//li').extract())), 81 | ['<li>one</li>', '<li>two</li>', '<li>four</li>', '<li>five</li>', '<li>six</li>']) 82 | self.assertEqual(list(map(str.strip, divtwo.select('./ul/li').extract())), 83 | ['<li>four</li>', '<li>five</li>', '<li>six</li>']) 84 | self.assertEqual(list(map(str.strip, divtwo.select('.//li').extract())), 85 | ['<li>four</li>', '<li>five</li>', '<li>six</li>']) 86 | self.assertEqual(divtwo.select('./li').extract(), 87 | []) 88 | 89 | def test_dont_strip(self): 90 | hxs = self.hxs_cls('
<div>fff: <a>zzz</a></div>
    ') 91 | self.assertEqual(hxs.select('//text()').extract(), ['fff: ', 'zzz']) 92 | 93 | def test_extractor_namespaces_simple(self): 94 | text = ''' 95 | 96 | take this 97 | found 98 | 99 | ''' 100 | x = self.xxs_cls(text) 101 | x.register_namespace('somens', 'http://github.com/') 102 | self.assertEqual(x.select('//somens:a/text()').extract(), ['take this']) 103 | 104 | def test_extractor_namespaces_multiple(self): 105 | text = ''' 106 | 109 | hello 110 | value 111 | iron90Dried Rose 112 | 113 | ''' 114 | x = self.xxs_cls(text) 115 | 116 | x.register_namespace('xmlns', 'http://webservices.amazon.com/AWSECommerceService/2005-10-05') 117 | x.register_namespace('p', 'http://www.github.com/product') 118 | x.register_namespace('b', 'http://somens.com') 119 | self.assertEqual(len(x.select('//xmlns:TestTag')), 1) 120 | self.assertEqual(x.select('//b:Operation/text()').extract()[0], 'hello') 121 | self.assertEqual(x.select('//xmlns:TestTag/@b:att').extract()[0], 'value') 122 | self.assertEqual(x.select('//p:SecondTestTag/xmlns:price/text()').extract()[0], '90') 123 | self.assertEqual(x.select('//p:SecondTestTag').select('./xmlns:price/text()')[0].extract(), '90') 124 | self.assertEqual(x.select('//p:SecondTestTag/xmlns:material/text()').extract()[0], 'iron') 125 | 126 | def test_extractor_over_text(self): 127 | hxs = self.hxs_cls('lala') 128 | self.assertEqual(hxs.extract(), 129 | 'lala') 130 | 131 | xxs = self.xxs_cls('lala') 132 | self.assertEqual(xxs.extract(), 133 | 'lala') 134 | 135 | xxs = self.xxs_cls('lala') 136 | self.assertEqual(xxs.select('.').extract(), 137 | ['lala']) 138 | 139 | def test_extractor_invalid_xpath(self): 140 | x = self.hxs_cls('') 141 | xpath = '//test[@foo="bar]' 142 | self.assertRaises(Exception, x.select, xpath) 143 | 144 | def test_empty_bodies(self): 145 | # shouldn't raise errors 146 | self.hxs_cls('').select('//text()').extract() 147 | self.xxs_cls('').select('//text()').extract() 148 | 149 | def test_null_bytes(self): 150 | # shouldn't 
raise errors 151 | text = 'pre\x00post' 152 | self.hxs_cls(text).select('//text()').extract() 153 | self.xxs_cls(text).select('//text()').extract() 154 | 155 | def test_select_on_unevaluable_nodes(self): 156 | r = self.hxs_cls('<span class="big">some text</span>') 157 | # Text node 158 | x1 = r.select('//text()') 159 | self.assertEqual(x1.extract(), ['some text']) 160 | self.assertEqual(x1.select('.//b').extract(), []) 161 | # Tag attribute 162 | x1 = r.select('//span/@class') 163 | self.assertEqual(x1.extract(), ['big']) 164 | self.assertEqual(x1.select('.//text()').extract(), []) 165 | 166 | def test_select_on_text_nodes(self): 167 | r = self.hxs_cls('
<div><b>Options:</b>opt1</div><div><b>Other</b>opt2</div>
') 168 | x1 = r.select('//div/descendant::text()[preceding-sibling::b[contains(text(), "Options")]]') 169 | self.assertEqual(x1.extract(), ['opt1']) 170 | 171 | x1 = r.select('//div/descendant::text()/preceding-sibling::b[contains(text(), "Options")]') 172 | self.assertEqual(x1.extract(), ['<b>Options:</b>']) 173 | -------------------------------------------------------------------------------- /xextract/parsers.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from urllib.parse import urljoin 3 | 4 | from cssselect import GenericTranslator 5 | from lxml import etree 6 | 7 | from .extractors import XPathExtractor, HtmlXPathExtractor, XmlXPathExtractor 8 | from .quantity import Quantity 9 | 10 | 11 | __all__ = ['ParserError', 'ParsingError', 12 | 'Prefix', 'Group', 'Element', 'String', 'Url', 'DateTime', 'Date'] 13 | 14 | 15 | class ParserError(Exception): 16 | '''Parser is badly initialized.''' 17 | 18 | 19 | class ParsingError(Exception): 20 | '''Number of parsed elements doesn't match the expected quantity.''' 21 | 22 | 23 | class BaseParser(object): 24 | def __init__(self, css=None, xpath=None, namespaces=None): 25 | if xpath and css: 26 | raise ParserError('At most one of "xpath" or "css" attributes can be specified.') 27 | 28 | if xpath: 29 | self.raw_xpath = xpath 30 | elif css: 31 | self.raw_xpath = GenericTranslator().css_to_xpath(css) 32 | else: 33 | self.raw_xpath = 'self::*' 34 | 35 | self.namespaces = namespaces 36 | self._compiled_xpath = None # compile xpath lazily 37 | 38 | def __call__(self, body, url=None): 39 | return self.parse(body, url) 40 | 41 | def parse(self, body, url=None): 42 | if isinstance(body, XPathExtractor): 43 | extractor = body 44 | else: 45 | if ' 56 |
<ul> <li>a</li> <li>b</li> </ul>
  • 58 | 59 | ''' 60 | 61 | # xpath 62 | self.assertEqual(len(MockParser(xpath='//ul').parse(html)), 1) 63 | self.assertEqual(len(MockParser(xpath='ul').parse(html)), 0) 64 | self.assertEqual(len(MockParser(xpath='/ul').parse(html)), 0) 65 | 66 | self.assertEqual(len(MockParser(xpath='//li').parse(html)), 2) 67 | self.assertEqual(len(MockParser(xpath='li').parse(html)), 0) 68 | self.assertEqual(len(MockParser(xpath='/li').parse(html)), 0) 69 | 70 | self.assertEqual(len(MockParser(xpath='//ul/li').parse(html)), 2) 71 | self.assertEqual(len(MockParser(xpath='//ul//li').parse(html)), 2) 72 | 73 | # css 74 | self.assertEqual(len(MockParser(css='ol').parse(html)), 0) 75 | self.assertEqual(len(MockParser(css='ul').parse(html)), 1) 76 | self.assertEqual(len(MockParser(css='li').parse(html)), 2) 77 | 78 | def test_xml_extraction(self): 79 | xml = ''' 80 | 81 | 82 |
<root xmlns:a="http://test.com/" xmlns:b="http://www.w3.org/2001/XMLSchema-instance"> <a:ul> <a:li>a</a:li> <b:li>b</b:li> </a:ul> </root>
    86 | 87 | ''' 88 | 89 | namespaces = {'a': 'http://test.com/', 'b': 'http://www.w3.org/2001/XMLSchema-instance'} 90 | # xpath 91 | self.assertEqual(len(MockParser(xpath='//a:ul', namespaces=namespaces).parse(xml)), 1) 92 | self.assertEqual(len(MockParser(xpath='a:ul', namespaces=namespaces).parse(xml)), 1) 93 | self.assertEqual(len(MockParser(xpath='/a:ul', namespaces=namespaces).parse(xml)), 0) 94 | 95 | self.assertEqual(len(MockParser(xpath='//a:li', namespaces=namespaces).parse(xml)), 1) 96 | self.assertEqual(len(MockParser(xpath='a:li', namespaces=namespaces).parse(xml)), 0) 97 | self.assertEqual(len(MockParser(xpath='/a:li', namespaces=namespaces).parse(xml)), 0) 98 | 99 | self.assertEqual(len(MockParser(xpath='//b:li', namespaces=namespaces).parse(xml)), 1) 100 | self.assertEqual(len(MockParser(xpath='b:li', namespaces=namespaces).parse(xml)), 0) 101 | self.assertEqual(len(MockParser(xpath='/b:li', namespaces=namespaces).parse(xml)), 0) 102 | 103 | 104 | class MockNamedParser(BaseNamedParser): 105 | def _process_named_nodes(self, nodes, context): 106 | return [node.extract() for node in nodes] 107 | 108 | 109 | class TestBaseNamedParser(TestBaseParser): 110 | parser_class = MockNamedParser 111 | parser_kwargs = {'name': 'val'} 112 | return_value_type = str 113 | 114 | html = ''' 115 |
<ul> <li>a</li> <li>b</li> </ul>
    119 | ''' 120 | 121 | def test_check_quantity(self): 122 | self.assertRaises(ParsingError, self.parser_class(css='li', count=0, **self.parser_kwargs).parse, self.html) 123 | self.assertRaises(ParsingError, self.parser_class(css='li', count=1, **self.parser_kwargs).parse, self.html) 124 | self.assertRaises(ParsingError, self.parser_class(css='ul', count=2, **self.parser_kwargs).parse, self.html) 125 | self.assertRaises(ParsingError, self.parser_class(css='ul', count=(2, 3), **self.parser_kwargs).parse, self.html) 126 | self.assertRaises(ParsingError, self.parser_class(css='li', count='?', **self.parser_kwargs).parse, self.html) 127 | self.assertRaises(ParsingError, self.parser_class(css='ol', count='+', **self.parser_kwargs).parse, self.html) 128 | 129 | def test_check_quantity_return_type(self): 130 | self.assertIsNone(self.parser_class(css='ol', count=0, **self.parser_kwargs).parse(self.html)['val']) 131 | self.assertIsInstance(self.parser_class(css='ul', count=1, **self.parser_kwargs).parse(self.html)['val'], self.return_value_type) 132 | self.assertIsInstance(self.parser_class(css='li', count=2, **self.parser_kwargs).parse(self.html)['val'], list) 133 | 134 | self.assertIsInstance(self.parser_class(css='ol', count=(0, 0), **self.parser_kwargs).parse(self.html)['val'], list) 135 | self.assertIsInstance(self.parser_class(css='ul', count=(1, 1), **self.parser_kwargs).parse(self.html)['val'], list) 136 | self.assertIsInstance(self.parser_class(css='li', count=(1, 2), **self.parser_kwargs).parse(self.html)['val'], list) 137 | 138 | self.assertIsNone(self.parser_class(css='ol', count='?', **self.parser_kwargs).parse(self.html)['val']) 139 | self.assertIsInstance(self.parser_class(css='ul', count='?', **self.parser_kwargs).parse(self.html)['val'], self.return_value_type) 140 | 141 | self.assertIsInstance(self.parser_class(css='ol', count='*', **self.parser_kwargs).parse(self.html)['val'], list) 142 | self.assertIsInstance(self.parser_class(css='ul', count='*', 
**self.parser_kwargs).parse(self.html)['val'], list) 143 | self.assertIsInstance(self.parser_class(css='li', count='*', **self.parser_kwargs).parse(self.html)['val'], list) 144 | 145 | self.assertIsInstance(self.parser_class(css='ul', count='+', **self.parser_kwargs).parse(self.html)['val'], list) 146 | self.assertIsInstance(self.parser_class(css='li', count='+', **self.parser_kwargs).parse(self.html)['val'], list) 147 | 148 | def test_missing_name(self): 149 | no_name_parser_kwargs = copy.copy(self.parser_kwargs) 150 | del no_name_parser_kwargs['name'] 151 | self.assertIsNone(self.parser_class(css='ol', count=0, **no_name_parser_kwargs).parse(self.html)) 152 | self.assertIsInstance(self.parser_class(css='ul', count=1, **no_name_parser_kwargs).parse(self.html), self.return_value_type) 153 | self.assertIsInstance(self.parser_class(css='li', count=2, **no_name_parser_kwargs).parse(self.html), list) 154 | 155 | 156 | class TestString(TestBaseNamedParser): 157 | parser_class = String 158 | 159 | def test_basic(self): 160 | html = '<span data-val="rocks">Hello <b>world</b>!</span>' 
161 | 162 | # by default extract _text 163 | self.assertEqual(String(name='val', css='span', count=1).parse(html)['val'], 'Hello !') 164 | 165 | self.assertEqual(String(name='val', css='span', count=1, attr='_text').parse(html)['val'], 'Hello !') 166 | self.assertEqual(String(name='val', css='span', count=1, attr='_all_text').parse(html)['val'], 'Hello world!') 167 | self.assertEqual(String(name='val', css='span', count=1, attr='data-val').parse(html)['val'], 'rocks') 168 | self.assertEqual(String(name='val', css='span', count=1, attr='data-invalid').parse(html)['val'], '') 169 | 170 | def test_callback(self): 171 | html = '<span>1</span><span>2</span>' 172 | self.assertListEqual(String(css='span').parse(html), ['1', '2']) 173 | self.assertListEqual(String(css='span', callback=int).parse(html), [1, 2]) 174 | self.assertEqual(String(css='span:first-child', callback=int, count=1).parse(html), 1) 175 | self.assertListEqual(String(css='div', callback=int).parse(html), []) 176 | 177 | 178 | class TestUrl(TestBaseNamedParser): 179 | parser_class = Url 180 | 181 | def test_basic(self): 182 | html = '<a href="/test?a=b" data-val="/val">Hello world!</a>' 
183 | 184 | # by default extract href 185 | self.assertEqual(Url(name='val', css='a', count=1).parse(html)['val'], '/test?a=b') 186 | self.assertEqual(Url(name='val', css='a', count=1).parse(html, url='http://example.com/a/b/c')['val'], 'http://example.com/test?a=b') 187 | 188 | self.assertEqual(Url(name='val', css='a', count=1, attr='data-val').parse(html)['val'], '/val') 189 | self.assertEqual(Url(name='val', css='a', count=1, attr='data-val').parse(html, url='http://example.com/a/b/c')['val'], 'http://example.com/val') 190 | 191 | def test_callback(self): 192 | def _parse_scheme(url): 193 | return urlparse(url).scheme 194 | html = '<a href="/test"></a>' 195 | self.assertEqual(Url(css='a', count=1, callback=_parse_scheme).parse(html), '') 196 | self.assertEqual(Url(css='a', count=1, callback=_parse_scheme).parse(html, url='http://example.com/a/b/c'), 'http') 197 | 198 | 199 | class TestDateTime(TestBaseNamedParser): 200 | parser_class = DateTime 201 | parser_kwargs = {'name': 'val', 'format': '%d.%m.%Y %H:%M'} 202 | return_value_type = datetime 203 | html = '''
    • 1.1.2001 22:14
    • 2.1.2001 12:12
    • 20.3.2002 0:0
    ''' 204 | 205 | def test_basic(self): 206 | html = '24.11.2015 10:12' 207 | val = DateTime(name='val', css='span', count=1, format='%d.%m.%Y %H:%M').parse(html)['val'] 208 | self.assertEqual(val, datetime(year=2015, month=11, day=24, hour=10, minute=12)) 209 | 210 | val = DateTime(name='val', css='span', count=1, format='%d.%m.%Y', attr='data-val').parse(html)['val'] 211 | self.assertEqual(val, datetime(year=2001, month=1, day=1)) 212 | 213 | # invalid format 214 | self.assertRaises(ValueError, DateTime(name='val', css='span', count=1, format='%d').parse, html) 215 | 216 | def test_callback(self): 217 | def _get_day(dt): 218 | return dt.day 219 | html = '24.11.2015' 220 | self.assertEqual( 221 | DateTime(css='span', count=1, format='%d.%m.%Y', callback=_get_day).parse(html), 222 | 24) 223 | 224 | 225 | class TestDate(TestBaseNamedParser): 226 | parser_class = Date 227 | parser_kwargs = {'name': 'val', 'format': '%d.%m.%Y'} 228 | return_value_type = date 229 | html = '''
    • 1.1.2001
    • 2.1.2001
    • 20.3.2002
    ''' 230 | 231 | def test_basic(self): 232 | html = '24.11.2015' 233 | val = Date(name='val', css='span', count=1, format='%d.%m.%Y').parse(html)['val'] 234 | self.assertEqual(val, date(year=2015, month=11, day=24)) 235 | 236 | val = Date(name='val', css='span', count=1, format='%d.%m.%Y', attr='data-val').parse(html)['val'] 237 | self.assertEqual(val, date(year=2001, month=1, day=1)) 238 | 239 | # invalid format 240 | self.assertRaises(ValueError, Date(name='val', css='span', count=1, format='%d').parse, html) 241 | 242 | def test_callback(self): 243 | def _get_day(dt): 244 | return dt.day 245 | html = '24.11.2015' 246 | self.assertEqual( 247 | Date(css='span', count=1, format='%d.%m.%Y', callback=_get_day).parse(html), 248 | 24) 249 | 250 | 251 | class TestElement(TestBaseNamedParser): 252 | parser_class = Element 253 | parser_kwargs = {'name': 'val'} 254 | return_value_type = etree._Element 255 | 256 | def test_basic(self): 257 | html = 'Hello world!' 258 | 259 | val = Element(name='val', css='span', count=1).parse(html)['val'] 260 | self.assertEqual(val.tag, 'span') 261 | val = Element(name='val', css='b', count=1).parse(html)['val'] 262 | self.assertEqual(val.tag, 'b') 263 | 264 | def test_callback(self): 265 | html = 'Hello world!' 266 | val = Element(css='b', count=1, callback=lambda el: el.text).parse(html) 267 | self.assertEqual(val, 'world') 268 | 269 | def test_text_extract(self): 270 | html = 'Hello
<span> world</span> nothing to see <span>!</span>
' 271 | val = Element(xpath='//span/text()').parse(html) 272 | self.assertListEqual(val, ['Hello', ' world', '!']) 273 | 274 | html = '<span class="nice"></span>' 275 | val = Element(xpath='//span/@class', count=1).parse(html) 276 | self.assertEqual(val, 'nice') 277 | 278 | 279 | class TestGroup(TestBaseNamedParser): 280 | parser_class = Group 281 | parser_kwargs = {'name': 'val', 'children': []} 282 | return_value_type = dict 283 | html = ''' 284 |
<ul> 285 |
<li> 286 | <span>Mike</span> 287 |
</li> 288 |
<li> 289 | <span>John</span> 290 | <a href="test">link</a> 291 |
</li> 292 | </ul>
    ''' 293 | 294 | def test_basic(self): 295 | extracted = {'val': [ 296 | {'name': 'Mike', 'link': None}, 297 | {'name': 'John', 'link': 'http://example.com/test'}]} 298 | 299 | # css 300 | val = Group(name='val', css='li', count=2, children=[ 301 | String(name='name', css='span', count=1), 302 | Url(name='link', css='a', count='?') 303 | ]).parse(self.html, url='http://example.com/') 304 | self.assertDictEqual(val, extracted) 305 | 306 | # xpath 307 | val = Group(name='val', css='li', count=2, children=[ 308 | String(name='name', xpath='span', count=1), 309 | Url(name='link', xpath='a', count='?') 310 | ]).parse(self.html, url='http://example.com/') 311 | self.assertDictEqual(val, extracted) 312 | 313 | val = Group(name='val', css='li', count=2, children=[ 314 | String(name='name', xpath='descendant::span', count=1), 315 | Url(name='link', xpath='descendant::a', count='?') 316 | ]).parse(self.html, url='http://example.com/') 317 | self.assertDictEqual(val, extracted) 318 | 319 | def test_callback(self): 320 | val = Group(css='li', count=2, callback=lambda d: d['name'], children=[ 321 | String(name='name', css='span', count=1), 322 | ]).parse(self.html) 323 | self.assertListEqual(val, ['Mike', 'John']) 324 | 325 | 326 | class TestPrefix(TestBaseParser): 327 | parser_class = Prefix 328 | parser_kwargs = {'children': []} 329 | html = ''' 330 |
<ul> 331 |
<li> 332 | <span>Mike</span> 333 |
</li> 334 | 335 |
<li> 336 | <span>John</span> 337 |
</li> 338 | </ul>
    339 | ''' 340 | 341 | def test_basic(self): 342 | # css 343 | val = Prefix(css='li', children=[ 344 | String(name='name', css='span', count=2) 345 | ]).parse(self.html) 346 | self.assertDictEqual(val, {'name': ['Mike', 'John']}) 347 | 348 | # xpath 349 | val = Prefix(xpath='//li', children=[ 350 | String(name='name', xpath='span', count=2) 351 | ]).parse(self.html) 352 | self.assertDictEqual(val, {'name': ['Mike', 'John']}) 353 | 354 | def test_callback(self): 355 | val = Prefix(xpath='//li', callback=lambda d: d['name'], children=[ 356 | String(name='name', css='span', count=2), 357 | ]).parse(self.html) 358 | self.assertListEqual(val, ['Mike', 'John']) 359 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ******** 2 | xextract 3 | ******** 4 | 5 | Extract structured data from HTML and XML documents like a boss. 6 | 7 | **xextract** is simple enough for writing a one-line parser, yet powerful enough to be used in a big project. 8 | 9 | 10 | **Features** 11 | 12 | - Parsing of HTML and XML documents 13 | - Supports **xpath** and **css** selectors 14 | - Simple declarative style of parsers 15 | - Built-in self-validation to let you know when the structure of the website has changed 16 | - Speed - under the hood the library uses `lxml library `_ with compiled xpath selectors 17 | 18 | 19 | **Table of Contents** 20 | 21 | .. contents:: 22 | :local: 23 | :depth: 2 24 | :backlinks: none 25 | 26 | 27 | ==================== 28 | A little taste of it 29 | ==================== 30 | 31 | Let's parse `The Shawshank Redemption `_'s IMDB page: 32 | 33 | .. 
code-block:: python 34 | 35 | # fetch the website 36 | >>> import requests 37 | >>> response = requests.get('http://www.imdb.com/title/tt0111161/') 38 | 39 | # parse like a boss 40 | >>> from xextract import String, Group 41 | 42 | # extract title with css selector 43 | >>> String(css='h1[itemprop="name"]', count=1).parse(response.text) 44 | 'The Shawshank Redemption' 45 | 46 | # extract release year with xpath selector 47 | >>> String(xpath='//*[@id="titleYear"]/a', count=1, callback=int).parse(response.text) 48 | 1994 49 | 50 | # extract structured data 51 | >>> Group(css='.cast_list tr:not(:first-child)', children=[ 52 | ... String(name='name', css='[itemprop="actor"]', attr='_all_text', count=1), 53 | ... String(name='character', css='.character', attr='_all_text', count=1) 54 | ... ]).parse(response.text) 55 | [ 56 | {'name': 'Tim Robbins', 'character': 'Andy Dufresne'}, 57 | {'name': 'Morgan Freeman', 'character': "Ellis Boyd 'Red' Redding"}, 58 | ... 59 | ] 60 | 61 | 62 | ============ 63 | Installation 64 | ============ 65 | 66 | To install **xextract**, simply run: 67 | 68 | .. code-block:: bash 69 | 70 | $ pip install xextract 71 | 72 | Requirements: lxml, cssselect 73 | 74 | Supported Python versions are 3.5 - 3.11. 75 | 76 | Windows users can download lxml binary `here `_. 77 | 78 | 79 | ======= 80 | Parsers 81 | ======= 82 | 83 | ------ 84 | String 85 | ------ 86 | 87 | **Parameters**: `name`_ (optional), `css / xpath`_ (optional, default ``"self::*"``), `count`_ (optional, default ``"*"``), `attr`_ (optional, default ``"_text"``), `callback`_ (optional), `namespaces`_ (optional) 88 | 89 | Extract string data from the matched element(s). 90 | Extracted value is always unicode. 91 | 92 | By default, ``String`` extracts the text content of only the matched element, but not its descendants. 
93 | To extract and concatenate the text out of every descendant element, use the ``attr`` parameter with the special value ``"_all_text"``. 94 | 95 | Use the ``attr`` parameter to extract the data from an HTML/XML attribute. 96 | 97 | Use the ``callback`` parameter to post-process extracted values. 98 | 99 | Example: 100 | 101 | .. code-block:: python 102 | 103 | >>> from xextract import String 104 | >>> String(css='span', count=1).parse('<span>Hello <b>world</b>!</span>') 105 | 'Hello !' 106 | 107 | >>> String(css='span', count=1, attr='class').parse('<span class="text-success"></span>') 108 | 'text-success' 109 | 110 | # use special `attr` value `_all_text` to extract and concatenate text out of all descendants 111 | >>> String(css='span', count=1, attr='_all_text').parse('<span>Hello <b>world</b>!</span>') 112 | 'Hello world!' 113 | 114 | # use special `attr` value `_name` to extract tag name of the matched element 115 | >>> String(css='span', count=1, attr='_name').parse('<span>hello</span>') 116 | 'span' 117 | 118 | >>> String(css='span', callback=int).parse('<span>1</span><span>2</span>') 119 | [1, 2] 120 | 121 | --- 122 | Url 123 | --- 124 | 125 | **Parameters**: `name`_ (optional), `css / xpath`_ (optional, default ``"self::*"``), `count`_ (optional, default ``"*"``), `attr`_ (optional, default ``"href"``), `callback`_ (optional), `namespaces`_ (optional) 126 | 127 | Behaves like the ``String`` parser, but with two exceptions: 128 | 129 | * default value for the ``attr`` parameter is ``"href"`` 130 | * if you pass the ``url`` parameter to the ``parse()`` method, the absolute url will be constructed and returned 131 | 132 | If ``callback`` is specified, it is called *after* the absolute urls are constructed. 133 | 134 | Example: 135 | 136 | .. code-block:: python 137 | 138 | >>> from xextract import Url, Prefix 139 | >>> content = '<div id="main"><a href="/test">test</a></div>' 140 | 141 | >>> Url(css='a', count=1).parse(content) 142 | '/test' 143 | 144 | >>> Url(css='a', count=1).parse(content, url='http://github.com/Mimino666') 145 | 'http://github.com/test' # absolute url address. Told ya! 
146 | 147 | >>> Prefix(css='#main', children=[ 148 | ... Url(css='a', count=1) 149 | ... ]).parse(content, url='http://github.com/Mimino666') # you can pass url also to ancestor's parse(). It will propagate down. 150 | 'http://github.com/test' 151 | 152 | 153 | -------- 154 | DateTime 155 | -------- 156 | 157 | **Parameters**: `name`_ (optional), `css / xpath`_ (optional, default ``"self::*"``), ``format`` (**required**), `count`_ (optional, default ``"*"``), `attr`_ (optional, default ``"_text"``), `callback`_ (optional), `namespaces`_ (optional) 158 | 159 | Returns the ``datetime.datetime`` object constructed out of the extracted data: ``datetime.strptime(extracted_data, format)``. 160 | 161 | ``format`` syntax is described in the `Python documentation `_. 162 | 163 | If ``callback`` is specified, it is called *after* the datetime objects are constructed. 164 | 165 | Example: 166 | 167 | .. code-block:: python 168 | 169 | >>> from xextract import DateTime 170 | >>> DateTime(css='span', count=1, format='%d.%m.%Y %H:%M').parse('<span>24.12.2015 5:30</span>') 171 | datetime.datetime(2015, 12, 24, 5, 30) 172 | 173 | 174 | ---- 175 | Date 176 | ---- 177 | 178 | **Parameters**: `name`_ (optional), `css / xpath`_ (optional, default ``"self::*"``), ``format`` (**required**), `count`_ (optional, default ``"*"``), `attr`_ (optional, default ``"_text"``), `callback`_ (optional), `namespaces`_ (optional) 179 | 180 | Returns the ``datetime.date`` object constructed out of the extracted data: ``datetime.strptime(extracted_data, format).date()``. 181 | 182 | ``format`` syntax is described in the `Python documentation `_. 183 | 184 | If ``callback`` is specified, it is called *after* the datetime objects are constructed. 185 | 186 | Example: 187 | 188 | .. 
code-block:: python 189 | 190 | >>> from xextract import Date 191 | >>> Date(css='span', count=1, format='%d.%m.%Y').parse('<span>24.12.2015</span>') 192 | datetime.date(2015, 12, 24) 193 | 194 | 195 | ------- 196 | Element 197 | ------- 198 | 199 | **Parameters**: `name`_ (optional), `css / xpath`_ (optional, default ``"self::*"``), `count`_ (optional, default ``"*"``), `callback`_ (optional), `namespaces`_ (optional) 200 | 201 | Returns the lxml instance (``lxml.etree._Element``) of the matched element(s). 202 | If you use an xpath expression and match the text content of the element (e.g. ``text()`` or ``@attr``), a string is returned. 203 | 204 | If ``callback`` is specified, it is called with the ``lxml.etree._Element`` instance. 205 | 206 | Example: 207 | 208 | .. code-block:: python 209 | 210 | >>> from xextract import Element 211 | >>> Element(css='span', count=1).parse('<span>Hello</span>') 212 | <Element span at 0x...> 213 | 214 | >>> Element(css='span', count=1, callback=lambda el: el.text).parse('<span>Hello</span>') 215 | 'Hello' 216 | 217 | # same as above 218 | >>> Element(xpath='//span/text()', count=1).parse('<span>Hello</span>') 219 | 'Hello' 220 | 221 | 222 | ----- 223 | Group 224 | ----- 225 | 226 | **Parameters**: `name`_ (optional), `css / xpath`_ (optional, default ``"self::*"``), `children`_ (**required**), `count`_ (optional, default ``"*"``), `callback`_ (optional), `namespaces`_ (optional) 227 | 228 | For each element matched by the css/xpath selector, returns the dictionary containing the data extracted by the parsers listed in the ``children`` parameter. 229 | All parsers listed in the ``children`` parameter **must** have ``name`` specified - this is then used as the key in the dictionary. 230 | 231 | Typical use case for this parser is when you want to parse structured data, e.g. a list of user profiles, where each profile contains fields like name, address, etc. Use the ``Group`` parser to group the fields of each user profile together. 232 | 233 | If ``callback`` is specified, it is called with the dictionary of parsed children values. 
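Conceptually, ``Group`` yields one dictionary per matched element, keyed by each child parser's ``name``. A stdlib-only sketch of that merging step (the data below is hypothetical pre-extracted child output, not xextract's actual implementation):

```python
# Hypothetical values that two child parsers might have extracted, one value
# per element matched by the Group's css/xpath selector.
children_results = {
    'name': ['Mike', 'John'],   # from a String(name='name', ...) child
    'link': [None, '/test'],    # from a Url(name='link', count='?') child
}

num_matched = 2  # number of elements matched by the Group selector

# For each matched element, collect the i-th value of every child under its name.
grouped = [
    {name: values[i] for name, values in children_results.items()}
    for i in range(num_matched)
]
print(grouped)
# [{'name': 'Mike', 'link': None}, {'name': 'John', 'link': '/test'}]
```

This is only the shape of the result; the real parser extracts the child values relative to each matched element.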
234 | 235 | Example: 236 | 237 | .. code-block:: python 238 | 239 | >>> from xextract import Group, String 240 | >>> content = '<ul><li id="id1">michal</li><li id="id2">peter</li></ul>' 241 | 242 | >>> Group(css='li', count=2, children=[ 243 | ... String(name='id', xpath='self::*', count=1, attr='id'), 244 | ... String(name='name', xpath='self::*', count=1) 245 | ... ]).parse(content) 246 | [{'name': 'michal', 'id': 'id1'}, 247 | {'name': 'peter', 'id': 'id2'}] 248 | 249 | 250 | ------ 251 | Prefix 252 | ------ 253 | 254 | **Parameters**: `css / xpath`_ (optional, default ``"self::*"``), `children`_ (**required**), `namespaces`_ (optional) 255 | 256 | This parser doesn't actually parse any data on its own. Instead, use it when many of your parsers share the same css/xpath selector prefix. 257 | 258 | The ``Prefix`` parser always returns a single dictionary containing the data extracted by the parsers listed in the ``children`` parameter. 259 | All parsers listed in the ``children`` parameter **must** have ``name`` specified - this is then used as the key in the dictionary. 260 | 261 | Example: 262 | 263 | .. code-block:: python 264 | 265 | # instead of... 266 | >>> String(css='#main .name').parse(...) 267 | >>> String(css='#main .date').parse(...) 268 | 269 | # ...you can use 270 | >>> from xextract import Prefix 271 | >>> Prefix(css='#main', children=[ 272 | ... String(name='name', css='.name'), 273 | ... String(name='date', css='.date') 274 | ... ]).parse(...) 275 | 276 | 277 | ================= 278 | Parser parameters 279 | ================= 280 | 281 | ---- 282 | name 283 | ---- 284 | 285 | **Parsers**: `String`_, `Url`_, `DateTime`_, `Date`_, `Element`_, `Group`_ 286 | 287 | **Default value**: ``None`` 288 | 289 | If specified, then the extracted data will be returned in a dictionary, with the ``name`` as the key and the data as the value. 290 | 291 | All parsers listed in the ``children`` parameter of a ``Group`` or ``Prefix`` parser **must** have ``name`` specified. 292 | If multiple children parsers have the same ``name``, the behavior is undefined. 293 | 294 | Example: 295 | 296 | .. 
code-block:: python 297 | 298 | # when `name` is not specified, raw value is returned 299 | >>> String(css='span', count=1).parse('<span>Hello!</span>') 300 | 'Hello!' 301 | 302 | # when `name` is specified, dictionary is returned with `name` as the key 303 | >>> String(name='message', css='span', count=1).parse('<span>Hello!</span>') 304 | {'message': 'Hello!'} 305 | 306 | 307 | ----------- 308 | css / xpath 309 | ----------- 310 | 311 | **Parsers**: `String`_, `Url`_, `DateTime`_, `Date`_, `Element`_, `Group`_, `Prefix`_ 312 | 313 | **Default value (xpath)**: ``"self::*"`` 314 | 315 | Use either ``css`` or ``xpath`` parameter (but not both) to select the elements from which to extract the data. 316 | 317 | Under the hood css selectors are translated into equivalent xpath selectors. 318 | 319 | For the children of ``Prefix`` or ``Group`` parsers, the elements are selected relative to the elements matched by the parent parser. 320 | 321 | Example: 322 | 323 | .. code-block:: python 324 | 325 | Prefix(xpath='//*[@id="profile"]', children=[ 326 | # equivalent to: //*[@id="profile"]/descendant-or-self::*[@class="name"] 327 | String(name='name', css='.name', count=1), 328 | 329 | # equivalent to: //*[@id="profile"]/*[@class="title"] 330 | String(name='title', xpath='*[@class="title"]', count=1), 331 | 332 | # equivalent to: //*[@class="subtitle"] 333 | String(name='subtitle', xpath='//*[@class="subtitle"]', count=1) 334 | ]) 335 | 336 | 337 | ----- 338 | count 339 | ----- 340 | 341 | **Parsers**: `String`_, `Url`_, `DateTime`_, `Date`_, `Element`_, `Group`_ 342 | 343 | **Default value**: ``"*"`` 344 | 345 | ``count`` specifies the expected number of elements to be matched with the css/xpath selector. It serves two purposes: 346 | 347 | 1. Number of matched elements is checked against the ``count`` parameter. If the number of elements doesn't match the expected count, ``xextract.parsers.ParsingError`` exception is raised. This way you will be notified, when the website has changed its structure. 
348 | 2. It tells the parser whether to return a single extracted value or a list of values. See the table below. 349 | 350 | Syntax for ``count`` mimics the regular expressions. 351 | You can either pass the value as a string, single integer or tuple of two integers. 352 | 353 | Depending on the value of ``count``, the parser returns either a single extracted value or a list of values. 354 | 355 | +-------------------+-----------------------------------------------+-----------------------------+ 356 | | Value of ``count``| Meaning | Extracted data | 357 | +===================+===============================================+=============================+ 358 | | ``"*"`` (default) | Zero or more elements. | List of values | 359 | +-------------------+-----------------------------------------------+-----------------------------+ 360 | | ``"+"`` | One or more elements. | List of values | 361 | +-------------------+-----------------------------------------------+-----------------------------+ 362 | | ``"?"`` | Zero or one element. | Single value or ``None`` | 363 | +-------------------+-----------------------------------------------+-----------------------------+ 364 | | ``num`` | Exactly ``num`` elements. | ``num`` == 0: ``None`` | 365 | | | | | 366 | | | You can pass either string or integer. | ``num`` == 1: Single value | 367 | | | | | 368 | | | | ``num`` > 1: List of values | 369 | +-------------------+-----------------------------------------------+-----------------------------+ 370 | | ``(num1, num2)`` | Number of elements has to be between | List of values | 371 | | | ``num1`` and ``num2``, inclusive. | | 372 | | | | | 373 | | | You can pass either a string or 2-tuple. | | 374 | +-------------------+-----------------------------------------------+-----------------------------+ 375 | 376 | Example: 377 | 378 | .. 
code-block:: python 379 | 380 | >>> String(css='.full-name', count=1).parse(content) # return single value 381 | 'John Rambo' 382 | 383 | >>> String(css='.full-name', count='1').parse(content) # same as above 384 | 'John Rambo' 385 | 386 | >>> String(css='.full-name', count=(1,2)).parse(content) # return list of values 387 | ['John Rambo'] 388 | 389 | >>> String(css='.full-name', count='1,2').parse(content) # same as above 390 | ['John Rambo'] 391 | 392 | >>> String(css='.middle-name', count='?').parse(content) # return single value or None 393 | None 394 | 395 | >>> String(css='.job-titles', count='+').parse(content) # return list of values 396 | ['President', 'US Senator', 'State Senator', 'Senior Lecturer in Law'] 397 | 398 | >>> String(css='.friends', count='*').parse(content) # return possibly empty list of values 399 | [] 400 | 401 | >>> String(css='.friends', count='+').parse(content) # raise exception, when no elements are matched 402 | xextract.parsers.ParsingError: Parser String matched 0 elements ("+" expected). 403 | 404 | 405 | ---- 406 | attr 407 | ---- 408 | 409 | **Parsers**: `String`_, `Url`_, `DateTime`_, `Date`_ 410 | 411 | **Default value**: ``"href"`` for ``Url`` parser. ``"_text"`` otherwise. 412 | 413 | Use ``attr`` parameter to specify what data to extract from the matched element. 414 | 415 | +-------------------+-----------------------------------------------------+ 416 | | Value of ``attr`` | Meaning | 417 | +===================+=====================================================+ 418 | | ``"_text"`` | Extract the text content of the matched element. | 419 | +-------------------+-----------------------------------------------------+ 420 | | ``"_all_text"`` | Extract and concatenate the text content of | 421 | | | the matched element and all its descendants. | 422 | +-------------------+-----------------------------------------------------+ 423 | | ``"_name"`` | Extract tag name of the matched element. 
| 424 | +-------------------+-----------------------------------------------------+ 425 | | ``att_name`` | Extract the value out of ``att_name`` attribute of | 426 | | | the matched element. | 427 | | | | 428 | | | If such attribute doesn't exist, empty string is | 429 | | | returned. | 430 | +-------------------+-----------------------------------------------------+ 431 | 432 | Example: 433 | 434 | .. code-block:: python 435 | 436 | >>> from xextract import String, Url 437 | >>> content = '<span class="name">Barack<b> Obama</b> III.</span> <a href="/test">Link</a>' 438 | 439 | >>> String(css='.name', count=1).parse(content) # default attr is "_text" 440 | 'Barack III.' 441 | 442 | >>> String(css='.name', count=1, attr='_text').parse(content) # same as above 443 | 'Barack III.' 444 | 445 | >>> String(css='.name', count=1, attr='_all_text').parse(content) # all text 446 | 'Barack Obama III.' 447 | 448 | >>> String(css='.name', count=1, attr='_name').parse(content) # tag name 449 | 'span' 450 | 451 | >>> Url(css='a', count='1').parse(content) # Url extracts href by default 452 | '/test' 453 | 454 | >>> String(css='a', count='1', attr='id').parse(content) # non-existent attributes return empty string 455 | '' 456 | 457 | 458 | -------- 459 | callback 460 | -------- 461 | 462 | **Parsers**: `String`_, `Url`_, `DateTime`_, `Date`_, `Element`_, `Group`_ 463 | 464 | Provides an easy way to post-process extracted values. 465 | It should be a function that takes a single argument, the extracted value, and returns the post-processed value. 466 | 467 | Example: 468 | 469 | .. code-block:: python 470 | 471 | >>> String(css='span', callback=int).parse('<span>1</span><span>2</span>') 472 | [1, 2] 473 | 474 | >>> Element(css='span', count=1, callback=lambda el: el.text).parse('<span>Hello</span>') 475 | 'Hello' 476 | 477 | -------- 478 | children 479 | -------- 480 | 481 | **Parsers**: `Group`_, `Prefix`_ 482 | 483 | Specifies the children parsers for the ``Group`` and ``Prefix`` parsers. 
484 | All parsers listed in the ``children`` parameter **must** have ``name`` specified. 485 | 486 | Css/xpath selectors in the children parsers are relative to the selectors specified in the parent parser. 487 | 488 | Example: 489 | 490 | .. code-block:: python 491 | 492 | Prefix(xpath='//*[@id="profile"]', children=[ 493 | # equivalent to: //*[@id="profile"]/descendant-or-self::*[@class="name"] 494 | String(name='name', css='.name', count=1), 495 | 496 | # equivalent to: //*[@id="profile"]/*[@class="title"] 497 | String(name='title', xpath='*[@class="title"]', count=1), 498 | 499 | # equivalent to: //*[@class="subtitle"] 500 | String(name='subtitle', xpath='//*[@class="subtitle"]', count=1) 501 | ]) 502 | 503 | ---------- 504 | namespaces 505 | ---------- 506 | 507 | **Parsers**: `String`_, `Url`_, `DateTime`_, `Date`_, `Element`_, `Group`_, `Prefix`_ 508 | 509 | When parsing XML documents containing namespace prefixes, pass the dictionary mapping namespace prefixes to namespace URIs. 510 | Then use the fully qualified name for elements in the xpath selector, in the form ``"prefix:element"``. 511 | 512 | For the moment, you **cannot use the default namespace** for parsing (see `lxml docs `_ for more information). Just use an arbitrary prefix. 513 | 514 | Example: 515 | 516 | .. code-block:: python 517 | 518 | >>> content = ''' 519 | ... <movie xmlns="http://imdb.com/ns/"> 520 | ... <title>The Shawshank Redemption</title> 521 | ... <year>1994</year> 522 | ... </movie>''' 523 | >>> nsmap = {'imdb': 'http://imdb.com/ns/'} # use arbitrary prefix for default namespace 524 | 525 | >>> Prefix(xpath='//imdb:movie', namespaces=nsmap, children=[ # pass namespaces to the outermost parser 526 | ... String(name='title', xpath='imdb:title', count=1), 527 | ... String(name='year', xpath='imdb:year', count=1) 528 | ... ]).parse(content) 529 | {'title': 'The Shawshank Redemption', 'year': '1994'} 530 | 531 | 532 | ==================== 533 | HTML vs. 
XML parsing 534 | ==================== 535 | 536 | To extract data from an HTML or XML document, simply call the ``parse()`` method of the parser: 537 | 538 | .. code-block:: python 539 | 540 | >>> from xextract import * 541 | >>> parser = Prefix(..., children=[...]) 542 | >>> extracted_data = parser.parse(content) 543 | 544 | 545 | ``content`` is a string containing the content of the document. 546 | 547 | Under the hood **xextract** uses either ``lxml.etree.XMLParser`` or ``lxml.etree.HTMLParser`` to parse the document. 548 | To select the parser, **xextract** looks for the ``<?xml`` declaration at the beginning of the content: if it is present, the document is parsed as XML, otherwise as HTML. 549 | 550 | To force a specific parser, call the ``parse_html()`` or ``parse_xml()`` method instead: 551 | 552 | .. code-block:: python 553 | 554 | >>> parser.parse_html(content) # force lxml.etree.HTMLParser 555 | >>> parser.parse_xml(content) # force lxml.etree.XMLParser 556 | --------------------------------------------------------------------------------
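Several of the behaviors documented above can be illustrated with the standard library alone: absolute-URL construction (as in ``Url``), ``strptime``-based parsing (as in ``DateTime`` and ``Date``), and sniffing for the ``<?xml`` declaration. This is only a sketch of the documented semantics, not xextract's actual implementation; ``looks_like_xml`` is a hypothetical helper:

```python
from datetime import datetime
from urllib.parse import urljoin

# Url: a relative href resolved against the `url` argument of parse(),
# equivalent in effect to urllib.parse.urljoin.
absolute = urljoin('http://github.com/Mimino666', '/test')
print(absolute)  # http://github.com/test

# DateTime / Date: the extracted text is run through datetime.strptime.
dt = datetime.strptime('24.12.2015 5:30', '%d.%m.%Y %H:%M')
print(dt)  # 2015-12-24 05:30:00
d = datetime.strptime('24.12.2015', '%d.%m.%Y').date()
print(d)   # 2015-12-24

# HTML vs. XML: a hypothetical sniffer mirroring the documented rule --
# content starting with '<?xml' is treated as XML, everything else as HTML.
def looks_like_xml(content: str) -> bool:
    return content.lstrip().startswith('<?xml')

print(looks_like_xml('<?xml version="1.0"?><movie/>'))  # True
print(looks_like_xml('<html><body></body></html>'))     # False
```

The sketch mirrors the documented outputs (e.g. ``'http://github.com/test'`` in the ``Url`` example), but the library itself should be consulted for the exact resolution rules.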