├── hq
├── __init__.py
├── hquery
│ ├── __init__.py
│ ├── functions
│ │ ├── __init__.py
│ │ ├── extend_node_set.py
│ │ ├── core_boolean.py
│ │ ├── core_node_set.py
│ │ ├── extend_string.py
│ │ ├── core_string.py
│ │ └── core_number.py
│ ├── computed_constructors
│ │ ├── __init__.py
│ │ ├── hash_key_value.py
│ │ ├── html_attribute.py
│ │ ├── html_element.py
│ │ ├── json_array.py
│ │ └── json_hash.py
│ ├── syntax_error.py
│ ├── evaluation_error.py
│ ├── variables.py
│ ├── evaluation_in_context.py
│ ├── sequences.py
│ ├── axis.py
│ ├── expression_context.py
│ ├── function_support.py
│ ├── union_decomposition.py
│ ├── object_type.py
│ ├── equality_operators.py
│ ├── relational_operators.py
│ ├── flwor.py
│ ├── node_test.py
│ ├── string_interpolation.py
│ ├── location_path.py
│ └── tokens.py
├── config.py
├── __main__.py
├── string_util.py
├── verbosity.py
├── output.py
├── hq.py
└── soup_util.py
├── test
├── __init__.py
├── hquery
│ ├── __init__.py
│ ├── test_strings.py
│ ├── hquery_test_util.py
│ ├── test_if_then_else.py
│ ├── test_expressions.py
│ ├── test_union_decomposition.py
│ ├── test_sequences_and_ranges.py
│ ├── test_name_tests.py
│ ├── test_arithmetic_operators.py
│ ├── test_computed_html_construction.py
│ ├── test_node_tests.py
│ ├── test_equality_operators.py
│ ├── test_relational_operators.py
│ ├── test_interpolated_strings.py
│ ├── test_extended_functions.py
│ ├── test_location_paths.py
│ ├── test_flwor.py
│ ├── test_computed_json_construction.py
│ ├── test_core_functions.py
│ ├── test_xpath1_abbreviated_samples.py
│ ├── test_axes.py
│ └── test_xpath1_unabbreviated_samples.py
├── conftest.py
├── common_test_util.py
├── test_unicode_support.py
└── test_cli.py
├── MANIFEST.in
├── requirements
├── base.txt
└── dev.txt
├── .travis.yml
├── hq_runner.py
├── tox.ini
├── .gitignore
├── LICENSE
├── setup.py
└── README.md
/hq/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/hq/hquery/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/hquery/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 |
--------------------------------------------------------------------------------
/hq/hquery/functions/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/hq/hquery/computed_constructors/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/hq/config.py:
--------------------------------------------------------------------------------
1 |
class settings:
    """Process-wide configuration flags, kept as plain class attributes."""

    # Debug output stays off until something switches this flag on.
    VERBOSE = False
4 |
--------------------------------------------------------------------------------
/hq/hquery/syntax_error.py:
--------------------------------------------------------------------------------
1 |
class HquerySyntaxError(ValueError):
    """``ValueError`` subclass used to report syntax errors in a query."""
4 |
--------------------------------------------------------------------------------
/requirements/base.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4==4.11.1
2 | docopt==0.6.2
3 | wheel==0.37.1
4 |
--------------------------------------------------------------------------------
/requirements/dev.txt:
--------------------------------------------------------------------------------
1 | -r base.txt
2 |
3 | pytest==7.1.3
4 | pytest-cov==3.0.0
5 | pytest-mock==3.8.2
6 | tox==3.26.0
7 |
8 | hq~=0.0.4
9 | setuptools~=65.4.0
10 |
--------------------------------------------------------------------------------
/hq/__main__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | hq.__main__: executed when hq module is executed as script. (Based on Jan-Philip Gehrcke's python-cmdline-bootstrap.)
5 | """
6 |
7 |
# Relative import: this module is meant to run as part of the hq package
# (``python -m hq``), and the command-line entry point is invoked right away.
from .hq import main
main()
10 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | git:
2 | depth: 3
3 | language: python
4 | python:
5 | - "2.7"
6 | - "3.4"
7 | - "3.5"
8 | install:
9 | - pip install -r requirements/dev.txt
10 | - pip install coveralls
11 | script: py.test --cov=hq
12 | after_success:
13 | coveralls
14 |
--------------------------------------------------------------------------------
/hq_runner.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | """
4 | Convenience wrapper for running hq directly from source tree. (Based on Jan-Philip Gehrcke's python-cmdline-bootstrap.)
5 | """
6 |
7 | from hq.hq import main
8 |
9 |
# Start the command-line entry point only when this file is executed
# directly, not when it is imported.
if __name__ == '__main__':
    main()
12 |
--------------------------------------------------------------------------------
/test/conftest.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | sys.path.insert(0, os.path.abspath('..'))
5 |
6 | from hq.verbosity import set_verbosity
7 |
8 |
def pytest_addoption(parser):
    """Register the ``--gabby`` flag (a store-true switch) with *parser*."""
    option_settings = {
        'action': 'store_true',
        'help': 'Print verbose (debug) information to stderr',
    }
    parser.addoption('--gabby', **option_settings)
13 |
14 |
def pytest_configure(config):
    """Turn hq's verbose output on or off according to the ``--gabby`` flag.

    Args:
        config: the pytest configuration object handed to this hook.
    """
    # getoption() is the supported accessor; getvalue() is its deprecated alias.
    set_verbosity(bool(config.getoption('--gabby')))
18 |
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | # Tox (http://tox.testrun.org/) is a tool for running tests
2 | # in multiple virtualenvs. This configuration file will run the
3 | # test suite on all supported python versions. To use it, "pip install tox"
4 | # and then run "tox" from this directory.
5 |
6 | [tox]
7 | envlist = python3.5, python3.6, python3.7, py38, py39
8 |
9 | [testenv]
10 | commands = py.test
11 | deps =
12 | beautifulsoup4
13 | docopt
14 | mock
15 | pytest-mock
16 |
--------------------------------------------------------------------------------
/test/hquery/test_strings.py:
--------------------------------------------------------------------------------
1 | from test.common_test_util import expected_result
2 | from test.hquery.hquery_test_util import query_html_doc
3 |
4 |
def test_escapes_work_in_string_literals():
    # Checks double-quoted, single-quoted and back-quoted query literals in turn.
    # NOTE(review): each query literal below is split across two physical
    # lines, presumably where an escape or character reference was decoded in
    # this copy of the file — TODO confirm against version control.
    assert query_html_doc('', '"foo
bar"') == expected_result("""
foo
bar""")
    assert query_html_doc('', "'foo
bar'") == expected_result("""
foo
bar""")
    assert query_html_doc('', '`foo
bar`') == expected_result("""
foo
bar""")
15 |
--------------------------------------------------------------------------------
/hq/hquery/evaluation_error.py:
--------------------------------------------------------------------------------
1 | from hq.hquery.object_type import is_node_set, is_sequence
2 |
3 |
class HqueryEvaluationError(RuntimeError):
    """Runtime error for query evaluation, with type-checking helpers."""

    @classmethod
    def must_be_node_set(cls, obj):
        """Raise ``HqueryEvaluationError`` unless *obj* is a node set."""
        if is_node_set(obj):
            return
        found = obj.__class__.__name__
        raise HqueryEvaluationError('Expected a node set, but found a(n) {0}'.format(found))

    @classmethod
    def must_be_node_set_or_sequence(cls, obj):
        """Raise ``HqueryEvaluationError`` unless *obj* is a node set or a sequence."""
        if is_node_set(obj) or is_sequence(obj):
            return
        found = obj.__class__.__name__
        raise HqueryEvaluationError('Expected a node set or sequence, but found a(n) {0}'.format(found))
17 |
--------------------------------------------------------------------------------
/hq/string_util.py:
--------------------------------------------------------------------------------
1 | import re
2 | from html.entities import name2codepoint
3 |
4 |
def html_entity_decode(s):
    """Replace HTML character references in *s* with the characters they name.

    Two forms are decoded: named references listed in
    ``html.entities.name2codepoint`` (``&amp;`` becomes ``&``) and decimal
    references of two or three digits (``&#65;`` becomes ``A``). All other
    text is returned unchanged.

    Args:
        s: the text to decode.

    Returns:
        The decoded string.
    """
    named_reference = '&(%s);' % '|'.join(name2codepoint)
    result = re.sub(named_reference, lambda m: chr(name2codepoint[m.group(1)]), s)
    # The "&#" prefix is required: without it any "NN;" in ordinary text
    # (for example "12; apples") would be mistaken for a character reference.
    result = re.sub(r'&#(\d{2,3});', lambda m: chr(int(m.group(1))), result)
    return result
9 |
10 |
def is_a_string(obj):
    """Report whether the class name of *obj* ends in ``str`` or ``unicode``.

    The decision is made on the class *name*, not with ``isinstance``.
    """
    return obj.__class__.__name__.endswith(('str', 'unicode'))
14 |
15 |
def truncate_string(s, length, one_line=True, suffix='...'):
    """Shorten *s* when it is longer than *length* characters.

    A string that is too long is cut to its first ``length + 1`` characters,
    trimmed back to the last space in that window (if any), and given
    *suffix*. With *one_line* set, newlines are rendered as a literal
    backslash-n so the result stays on one line.
    """
    shortened = s
    if len(s) > length:
        window = s[:length + 1]
        shortened = window.rsplit(' ', 1)[0] + suffix
    return shortened.replace('\n', '\\n') if one_line else shortened
24 |
--------------------------------------------------------------------------------
/hq/verbosity.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | from .config import settings
4 | from .string_util import is_a_string
5 |
6 |
# Number of spaces put in front of each verbose message; changed in steps of
# two by push_indent() and pop_indent().
indent_level = 0
8 |
9 |
def set_verbosity(verbose):
    """Store *verbose* in the shared ``settings.VERBOSE`` flag."""
    settings.VERBOSE = verbose
12 |
13 |
def push_indent():
    """Increase the module-level indentation by two spaces."""
    global indent_level
    indent_level = indent_level + 2
17 |
def pop_indent():
    """Decrease the module-level indentation by two spaces."""
    global indent_level
    indent_level = indent_level - 2
21 |
22 |
def verbose_print(text, indent_after=False, outdent_before=False):
    """Write an indented debug message to stderr when verbose mode is on.

    Args:
        text: the message, or a zero-argument callable producing it; the
            callable is only invoked when verbose mode is on.
        indent_after: increase the indentation once the message is printed.
        outdent_before: decrease the indentation before the message is printed.
    """
    if not settings.VERBOSE:
        return
    if outdent_before:
        pop_indent()
    message = text if is_a_string(text) else text()
    padding = ' ' * indent_level
    print(u'{0}{1}'.format(padding, message), file=sys.stderr)
    if indent_after:
        push_indent()
32 |
--------------------------------------------------------------------------------
/hq/hquery/functions/extend_node_set.py:
--------------------------------------------------------------------------------
1 | from hq.hquery.evaluation_error import HqueryEvaluationError
2 | from hq.hquery.expression_context import get_context_node, peek_context
3 | from hq.hquery.functions.core_boolean import boolean
4 |
# Names of the functions this module offers; presumably read by a function
# registry elsewhere — TODO confirm against the loader.
exports = ('class_', 'even', 'odd')
6 |
7 |
def class_(*args):
    """Tell whether an element carries a given CSS class name.

    With one argument (``name``) the context node is inspected; with two
    (``node_set, name``) the first node of the node set is inspected.

    Returns:
        A ``boolean``. An element without a ``class`` attribute, or an empty
        node set, yields false.

    Raises:
        HqueryEvaluationError: on any other argument count, or when the first
            of two arguments is not a node set.
    """
    if len(args) == 1:
        tag = get_context_node()
        name = args[0]
    elif len(args) == 2:
        HqueryEvaluationError.must_be_node_set(args[0])
        if len(args[0]) == 0:
            # No node to inspect, so the class cannot be present.
            return boolean(False)
        tag = args[0][0]
        name = args[1]
    else:
        raise HqueryEvaluationError('class() expects one or two arguments; got {0}'.format(len(args)))

    try:
        classes = tag['class']
    except KeyError:
        # The element has no class attribute at all.
        classes = ()
    return boolean(name in classes)
20 |
21 |
def even():
    """Return a ``boolean`` that is true when the context position is even."""
    position = peek_context().position
    return boolean(position % 2 == 0)
24 |
25 |
def odd():
    """Return a ``boolean`` that is true when the context position is odd."""
    position = peek_context().position
    return boolean(position % 2 == 1)
28 |
--------------------------------------------------------------------------------
/hq/hquery/computed_constructors/hash_key_value.py:
--------------------------------------------------------------------------------
1 | from hq.hquery.object_type import debug_dump_anything
2 | from hq.verbosity import verbose_print
3 |
4 |
class ComputedHashKeyValueConstructor:
    """Pairs a hash key with a value expression that is evaluated later."""

    def __init__(self, key):
        self.value_fn = None
        self.key = key

    def set_value(self, fn):
        """Remember *fn*, the zero-argument callable that yields the value."""
        self.value_fn = fn

    def evaluate(self):
        """Call the stored value function and return a ``HashKeyValue``."""
        opening = 'Evaluating value expression for constructed hash key "{0}"'.format(self.key)
        verbose_print(opening, indent_after=True)

        value = self.value_fn()

        def closing():
            # Built lazily: only formatted if verbose_print decides to print.
            template = u'Finished evaluating; value of constructed hash key "{0}" is {1}'
            return template.format(self.key, debug_dump_anything(value))

        verbose_print(closing, outdent_before=True)

        return HashKeyValue(self.key, value)
25 |
26 |
27 |
class HashKeyValue:
    """Plain holder for one hash key and its evaluated value."""

    def __init__(self, key, value):
        self.key, self.value = key, value
33 |
--------------------------------------------------------------------------------
/test/hquery/hquery_test_util.py:
--------------------------------------------------------------------------------
1 | from hq.output import convert_results_to_output_text
2 | from hq.soup_util import make_soup, is_any_node, root_tag_from_soup
3 | from hq.hquery.hquery_processor import HqueryProcessor
4 | from test.common_test_util import soup_with_body, eliminate_blank_lines
5 |
6 |
def query_html_doc(html_body, hquery, preserve_space=False, wrap_body=True):
    """Run *hquery* against an HTML document and return its output text.

    *html_body* goes through ``soup_with_body`` when *wrap_body* is set,
    otherwise it is parsed as given. The converted output is stripped and
    blank lines are removed.
    """
    if wrap_body:
        soup = soup_with_body(html_body)
    else:
        soup = make_soup(html_body)
    processor = HqueryProcessor(hquery, preserve_space=preserve_space)
    raw_result = processor.query(soup)
    text = convert_results_to_output_text(raw_result, preserve_space=preserve_space)
    return eliminate_blank_lines(text.strip())
11 |
12 |
def query_context_node(node_or_source, hquery):
    """Run *hquery* with a node as the query target and return its output text.

    When *node_or_source* is not already a node it is parsed and the root tag
    of the resulting soup is used instead.
    """
    target = node_or_source
    if not is_any_node(target):
        target = root_tag_from_soup(make_soup(target))
    raw_result = HqueryProcessor(hquery).query(target)
    text = convert_results_to_output_text(raw_result)
    return eliminate_blank_lines(text.strip())
18 |
--------------------------------------------------------------------------------
/test/hquery/test_if_then_else.py:
--------------------------------------------------------------------------------
1 | from test.common_test_util import expected_result
2 | from test.hquery.hquery_test_util import query_html_doc
3 |
4 |
def test_if_then_else_works_with_literal_conditions():
    """A boolean, an empty string and a non-zero number each pick a branch."""
    cases = (
        ('if (true()) then "foo" else "bar"', 'foo'),
        ('if ("") then "foo" else "bar"', 'bar'),
        ('if (0.001) then "foo" else "bar"', 'foo'),
    )
    for query, expected in cases:
        assert query_html_doc('', query) == expected
9 |
10 |
def test_if_then_else_works_with_node_sets():
    # Uses a node set as the condition: one query that matches, one that does not.
    # NOTE(review): the HTML markup in the fixture and in the first expectation
    # appears to have been stripped from this copy of the file — restore it
    # from version control before trusting this test.
    html_body = """

eekaboo
"""
    assert query_html_doc(html_body, 'if (//p) then //p else 1 to 3') == expected_result("""

eekaboo

""")
    assert query_html_doc(html_body, 'if (//div) then //p else 1 to 3') == expected_result("""
1
2
3""")
22 |
23 |
def test_if_then_else_works_with_variables_in_a_flwor():
    """The condition and both branches may refer to a ``let`` variable."""
    query = 'let $x := 0.1 return if ($x - 0.1) then $x else $x + 1'
    assert query_html_doc('', query) == '1.1'
26 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 |
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 |
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 |
48 | # Translations
49 | *.mo
50 | *.pot
51 |
52 | # Django stuff:
53 | *.log
54 |
55 | # Sphinx documentation
56 | docs/_build/
57 |
58 | # PyBuilder
59 | target/
60 |
61 | #Ipython Notebook
62 | .ipynb_checkpoints
63 |
64 | # JetBrains
65 | .idea
66 |
--------------------------------------------------------------------------------
/hq/hquery/functions/core_boolean.py:
--------------------------------------------------------------------------------
1 | from math import isnan
2 |
3 | from hq.hquery.object_type import is_node_set, is_number, is_boolean
4 |
5 |
# Names of the functions this module offers; presumably read by a function
# registry elsewhere — TODO confirm against the loader.
exports = ('boolean', 'false', 'not_', 'true')
7 |
8 |
class boolean:
    """Boolean value type; the Python ``bool`` is kept in ``self.value``.

    Conversion rules applied by the constructor:
      * node set: true when it has at least one member;
      * number: true when it is neither zero nor NaN;
      * anything else: ordinary Python truthiness.
    """

    def __init__(self, obj):
        if is_node_set(obj):
            self.value = len(obj) > 0
        elif is_number(obj):
            f = float(obj)
            # NaN is truthy in Python, so it has to be excluded explicitly.
            self.value = bool(f) and not isnan(f)
        else:
            self.value = bool(obj)

    def __bool__(self):
        return self.value

    def __nonzero__(self):
        # Python 2 spelling of __bool__.
        return self.__bool__()

    def __str__(self):
        return str(self.value).lower()

    def __eq__(self, other):
        return is_boolean(other) and self.value == other.value

    def __hash__(self):
        # Defining __eq__ alone makes instances unhashable in Python 3; hash
        # on the wrapped value so that equal booleans hash equally.
        return hash(self.value)

    def __repr__(self):
        return 'boolean({0})'.format(self.value)
34 |
35 |
def false():
    """Return the ``boolean`` false value."""
    falsehood = boolean(False)
    return falsehood
38 |
39 |
def not_(value):
    """Return the negation of *value* after converting it to a ``boolean``."""
    converted = boolean(value)
    return boolean(not converted)
42 |
43 |
def true():
    """Return the ``boolean`` true value."""
    truth = boolean(True)
    return truth
46 |
--------------------------------------------------------------------------------
/hq/output.py:
--------------------------------------------------------------------------------
1 | from builtins import str
2 |
3 | from .hquery.object_type import is_sequence
4 | from .soup_util import is_text_node, is_attribute_node, is_comment_node, is_tag_node, derive_text_from_node, \
5 | is_root_node
6 |
7 |
def convert_results_to_output_text(results, pretty=True, preserve_space=False):
    """Render query results as text.

    A sequence is rendered item by item and joined with newlines; any other
    value is rendered on its own.
    """
    if not is_sequence(results):
        return value_object_to_text(results, pretty, preserve_space)
    rendered = [value_object_to_text(item, pretty, preserve_space) for item in results]
    return '\n'.join(rendered)
13 |
14 |
def value_object_to_text(obj, pretty, preserve_space):
    """Render a single result object as text.

    Args:
        obj: the value to render.
        pretty: for tag and root nodes, use ``prettify()`` instead of ``str``.
        preserve_space: passed to ``derive_text_from_node`` for attribute and
            text nodes.

    Returns:
        Comment nodes as an HTML comment, tag and root nodes as markup,
        attribute nodes as ``name="value"``, text nodes as their text, and
        anything else as ``str(obj)``.
    """
    if is_comment_node(obj):
        # The template needs a placeholder; formatting into an empty string
        # would discard the comment text altogether.
        return u'<!-- {0} -->'.format(str(obj).strip())
    elif is_tag_node(obj) or is_root_node(obj):
        return obj.prettify().rstrip(' \t\n') if pretty else str(obj)
    elif is_attribute_node(obj):
        return u'{0}="{1}"'.format(obj.name, derive_text_from_node(obj, preserve_space=preserve_space))
    elif is_text_node(obj):
        return derive_text_from_node(obj, preserve_space=preserve_space)
    else:
        return str(obj)
26 |
--------------------------------------------------------------------------------
/hq/hquery/variables.py:
--------------------------------------------------------------------------------
1 | from hq.hquery.object_type import debug_dump_anything
2 | from hq.verbosity import verbose_print
3 |
# Stack of (name, value) tuples; the most recently pushed binding is last.
variable_stack = []
# Positions of the name and the value inside each stack entry.
NAME, VALUE = range(2)
6 |
7 |
class variable_scope:
    """Context manager that discards variables pushed inside its ``with`` body."""

    def __enter__(self):
        # Record how tall the stack is on entry.
        self.mark = len(variable_stack)

    def __exit__(self, *args):
        # Remove every entry added since the recorded height.
        variable_stack[self.mark:] = []
14 |
15 |
def push_variable(name, value):
    """Append the binding ``(name, value)`` to the variable stack."""
    def describe():
        # Formatted lazily, only when verbose output is actually printed.
        return u'Pushing variable onto stack: let ${0} := {1}'.format(name, debug_dump_anything(value))

    verbose_print(describe)
    variable_stack.append((name, value))
20 |
21 |
def value_of_variable(name):
    """Return the value of the most recently pushed variable called *name*.

    The stack is searched from the newest entry to the oldest. ``None`` is
    returned when no entry has that name.
    """
    for depth, binding in enumerate(reversed(variable_stack)):
        if binding[NAME] == name:
            verbose_print('Variable "${0}" found on stack (position {1}).'.format(name, depth))
            return binding[VALUE]

    verbose_print('Variable "${0}" NOT FOUND on variable stack.'.format(name))
    return None
32 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2016 Richard B. Winslow
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/hq/hquery/evaluation_in_context.py:
--------------------------------------------------------------------------------
1 | from hq.hquery.evaluation_error import HqueryEvaluationError
2 | from hq.hquery.expression_context import push_context, pop_context
3 | from hq.hquery.sequences import make_node_set
4 | from hq.soup_util import is_any_node, debug_dump_long_string
5 |
6 |
def evaluate_across_contexts(node_set, expression_fn):
    """Evaluate *expression_fn* once per node and merge the results.

    Each node of *node_set* becomes the context node in turn, with a 1-based
    position and the node set's length as the size. All per-node results are
    flattened and handed to ``make_node_set``.

    Raises:
        HqueryEvaluationError: if *node_set* is not a node set.
    """
    HqueryEvaluationError.must_be_node_set(node_set)

    total = len(node_set)
    per_node = []
    for position, node in enumerate(node_set, start=1):
        per_node.append(evaluate_in_context(node, expression_fn, position=position, size=total))

    flattened = []
    for partial in per_node:
        flattened.extend(partial)
    return make_node_set(flattened)
14 |
15 |
def evaluate_in_context(node, expression_fn, position=1, size=1, preserve_space=None):
    """Evaluate *expression_fn* with *node* pushed as the context node.

    Args:
        node: the node to use as context; must satisfy ``is_any_node``.
        expression_fn: zero-argument callable to evaluate.
        position, size, preserve_space: passed through to ``push_context``.

    Returns:
        Whatever *expression_fn* returns.

    Raises:
        HqueryEvaluationError: if *node* is not a node.
    """
    if not is_any_node(node):
        raise HqueryEvaluationError('cannot use {0} "{1}" as context node'.format(type(node),
                                                                                 debug_dump_long_string(str(node))))
    push_context(node, position, size, preserve_space)
    try:
        return expression_fn()
    finally:
        # Pop even when the expression raises, so that a failed evaluation
        # does not leave a stale context behind.
        pop_context()
24 |
--------------------------------------------------------------------------------
/test/hquery/test_expressions.py:
--------------------------------------------------------------------------------
1 | from test.common_test_util import expected_result
2 | from test.hquery.hquery_test_util import query_html_doc
3 |
4 |
def test_parentheses_boost_precedence():
    """Parenthesised sub-expressions are evaluated before the multiplication."""
    cases = (
        ('(2+3)*3', '15'),
        ('3*(3+2)', '15'),
        ('2+3*3 != (2+3)*3', 'true'),
    )
    for query, expected in cases:
        assert query_html_doc('', query) == expected_result(expected)
9 |
10 |
def test_union_operator_combines_node_sets():
    # Evaluates the union query '//div | //p' against the fixture.
    # NOTE(review): the HTML markup in the fixture and the expectation appears
    # to have been stripped from this copy of the file — restore it from
    # version control before trusting this test.
    html_body = """
one
two
three
"""
    assert query_html_doc(html_body, '//div | //p') == expected_result("""

one


two


three

""")
26 |
27 |
def test_union_operator_produces_node_set_sorted_in_document_order():
    # Same union as '//div | //p' but with the operands swapped.
    # NOTE(review): the HTML markup in the fixture and the expectation appears
    # to have been stripped from this copy of the file — restore it from
    # version control before trusting this test.
    html_body = """
one
two
three
"""
    assert query_html_doc(html_body, '//p | //div') == expected_result("""

one


two


three

""")
43 |
--------------------------------------------------------------------------------
/test/hquery/test_union_decomposition.py:
--------------------------------------------------------------------------------
1 |
2 | from test.common_test_util import expected_result
3 | from test.hquery.hquery_test_util import query_html_doc
4 |
5 |
def test_union_decomposition_with_parentheses():
    # The right-hand side of '=>' is a parenthesised list of clauses.
    # NOTE(review): the HTML markup in the fixture appears to have been
    # stripped from this copy of the file — restore it from version control.
    html_body = """
heading
content
another heading """
    assert query_html_doc(html_body, '(//h1 | //p) => ("fizz" | "buzz")') == expected_result("""
fizz
buzz
fizz""")
15 |
16 |
def test_union_decomposition_naked():
    # The right-hand side of '=>' lists its clauses without parentheses.
    # NOTE(review): the HTML markup in the fixture appears to have been
    # stripped from this copy of the file — restore it from version control.
    html_body = """
heading
content
another heading """
    assert query_html_doc(html_body, '(//h1 | //p) => `h1 $_` | `p $_`') == expected_result("""
h1 heading
p content
h1 another heading""")
26 |
27 |
def test_union_decomposition_applies_first_matching_clause():
    # Three union operands are paired with three result clauses.
    # NOTE(review): the HTML markup in the fixture appears to have been
    # stripped from this copy of the file, and two lines of the fixture are
    # missing from the listing — restore it from version control.
    html_body = """
div1
p1
"""
    query = '(//p | /html/body/div | /html/body//*) => "one" | "two" | "three"'
    assert query_html_doc(html_body, query) == expected_result("""
two
one
two
one""")
41 |
--------------------------------------------------------------------------------
/hq/hquery/sequences.py:
--------------------------------------------------------------------------------
1 | from itertools import filterfalse
2 |
3 | from hq.hquery.evaluation_error import HqueryEvaluationError
4 | from hq.hquery.object_type import object_type_name
5 | from hq.soup_util import is_any_node
6 |
7 |
def make_node_set(node_set, reverse=False):
    """Build a node set: unique nodes sorted by document index.

    Args:
        node_set: a list of nodes, or a single node (wrapped in a list).
        reverse: sort in descending instead of ascending document order.

    Returns:
        A new list with duplicate objects (same identity) removed, ordered by
        each node's ``hq_doc_index``.

    Raises:
        HqueryEvaluationError: if any member is not a node.
    """
    if not isinstance(node_set, list):
        node_set = [node_set]

    # A dedicated sentinel is needed here: a default of False combined with a
    # truthiness test would let falsy non-node members (0, '', an empty list)
    # slip past validation.
    nothing_found = object()
    non_node_member = next(filterfalse(is_any_node, node_set), nothing_found)
    if non_node_member is not nothing_found:
        format_str = 'Constructed node set that includes {0} object "{1}"'
        raise HqueryEvaluationError(format_str.format(object_type_name(non_node_member), non_node_member))

    # Uniqueness is by object identity, keeping the first occurrence.
    seen_ids = set()
    unique_nodes = []
    for node in node_set:
        node_id = id(node)
        if node_id not in seen_ids:
            seen_ids.add(node_id)
            unique_nodes.append(node)

    return sorted(unique_nodes, key=lambda n: n.hq_doc_index, reverse=reverse)
30 |
31 |
def make_sequence(sequence):
    """Return *sequence* itself when it is a list, else a new one-item list."""
    if not isinstance(sequence, list):
        sequence = [sequence]
    return sequence


def sequence_concat(first, second):
    """Return a new list with the items of *first* followed by those of *second*.

    Either argument may be a list or a single value; single values are treated
    as one-item sequences. Neither argument is modified.
    """
    # Build a fresh list: extending `first` in place would silently change a
    # list that the caller may still be holding.
    return make_sequence(first) + make_sequence(second)
42 |
--------------------------------------------------------------------------------
/test/common_test_util.py:
--------------------------------------------------------------------------------
1 | from textwrap import dedent
2 |
3 | from hq.soup_util import make_soup
4 |
5 |
def capture_console_output(capsys, strip=True):
    """Read captured stdout and stderr from *capsys* as an ``(output, errors)`` pair.

    Trailing newlines are always removed from the output. With *strip* set the
    output is also stripped and has its blank lines removed; the errors always
    get that treatment.
    """
    output, errors = capsys.readouterr()
    output = output.rstrip('\n')
    if strip:
        output = eliminate_blank_lines(output.strip())
    return output, eliminate_blank_lines(errors.strip())
10 |
11 |
def eliminate_blank_lines(s):
    """Return *s* without the lines that are empty or whitespace-only."""
    kept = []
    for line in s.split('\n'):
        if line.strip() != '':
            kept.append(line)
    return '\n'.join(kept)
14 |
15 |
def expected_result(contents):
    """Drop leading newlines from *contents*, then remove common indentation."""
    without_leading_newlines = contents.lstrip('\n')
    return dedent(without_leading_newlines)
18 |
19 |
def simulate_args_dict(**kwargs):
    """Build a command-line argument dictionary with overridable defaults.

    Each keyword argument overrides one entry: ``expression`` maps to
    ``<expression>``, a one-letter name to ``-x``, and any other name to
    ``--name``.

    Returns:
        The argument dictionary.
    """
    args = {
        # Must be the same key the ``expression`` override writes below;
        # otherwise the default would linger next to the overridden value.
        '<expression>': '',
        '-f': False,
        '--file': False,
        '--preserve': False,
        '--program': '',
        '-u': False,
        '--ugly': False,
        '-v': False,
        '--verbose': False
    }
    for key, value in kwargs.items():
        if key == 'expression':
            format_string = '<{0}>'
        elif len(key) == 1:
            format_string = '-{0}'
        else:
            format_string = '--{0}'
        args[format_string.format(key)] = value
    return args
41 |
42 |
def soup_with_body(contents):
    """Pass *contents* through ``wrap_html_body`` and parse the result into a soup."""
    wrapped = wrap_html_body(contents)
    return make_soup(wrapped)
45 |
46 |
def wrap_html_body(contents):
    """Return *contents* wrapped in ``<html><body>`` ... ``</body></html>``."""
    # The template has to carry the wrapper markup; a bare placeholder would
    # return the contents unwrapped.
    return u'<html><body>{0}</body></html>'.format(contents)
49 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import os
5 | import re
6 | from setuptools import setup, find_packages
7 |
# Read the version from the source file so that it is defined in one place.
# The pattern is a raw string (so "\s" is not an invalid string escape) and
# the file is closed as soon as it has been read.
with open('hq/hq.py') as source_file:
    version = re.search(r"^__version__\s*=\s*'(.*)'", source_file.read(), re.M).group(1)


# Fall back to a one-line description when README.md is not present.
long_description = 'Powerful HTML slicing and dicing at the command line.'
if os.path.exists('README.md'):
    with open('README.md', 'rb') as f:
        long_description = f.read().decode('utf-8')


classifiers = [
    'Development Status :: 2 - Pre-Alpha',
    'Intended Audience :: Developers',
    'Environment :: Console',
    'License :: OSI Approved :: MIT License',
    'Programming Language :: Python',
    'Topic :: Text Processing :: Markup :: HTML',
]


setup(name='hq',
      packages=find_packages(exclude=['test']),
      entry_points={'console_scripts': ['hq = hq.hq:main']},
      version=version,
      description='Command-line tool for querying, slicing & dicing HTML using the XPath/XQuery derivative HQuery.',
      long_description=long_description,
      author='Richard B. Winslow',
      author_email='richard.b.winslow@gmail.com',
      license='MIT',
      url='https://github.com/rbwinslow/hq',
      keywords='html xpath query xquery hquery jq cmdline cli',
      classifiers=classifiers,
      install_requires=['beautifulsoup4', 'docopt', 'wheel'])
40 |
--------------------------------------------------------------------------------
/test/hquery/test_sequences_and_ranges.py:
--------------------------------------------------------------------------------
1 | from test.common_test_util import expected_result
2 | from test.hquery.hquery_test_util import query_html_doc
3 |
4 |
def test_range_expression_produces_expected_sequence():
    """A parenthesised range yields each integer on its own output line."""
    expected = expected_result("""
        1
        2
        3""")
    assert query_html_doc('', '(1 to 3)') == expected
10 |
11 |
def test_range_expression_works_without_parentheses():
    """A bare range yields each integer on its own output line."""
    expected = expected_result("""
        1
        2
        3""")
    assert query_html_doc('', '1 to 3') == expected
17 |
18 |
def test_range_operator_is_interpreted_as_name_test_in_appropriate_contexts():
    # Queries '//to', where "to" has to be read as an element name.
    # NOTE(review): the HTML markup in the fixture and the expectation appears
    # to have been stripped from this copy of the file — restore it from
    # version control before trusting this test.
    html_body = 'from '
    assert query_html_doc(html_body, '//to') == expected_result("""

from
""")
25 |
26 |
def test_range_within_sequence_constructor_collapses_into_sequence():
    """A range nested in a sequence constructor is flattened into the sequence."""
    expected = expected_result("""
        1
        2
        3
        4""")
    assert query_html_doc('', '(1, 2 to 4)') == expected
33 |
34 |
def test_sequences_collapse():
    """A sequence nested in a sequence constructor is flattened."""
    expected = expected_result("""
        1
        2
        3
        4""")
    assert query_html_doc('', '(1, (2, 3), 4)') == expected
41 |
42 |
def test_string_value_of_a_sequence_is_concatenation_of_all_items_unlike_node_set():
    # Compares string() applied to a node set with string() applied to a sequence.
    # NOTE(review): the HTML markup in the fixture appears to have been
    # stripped from this copy of the file — restore it from version control.
    html_body = """
one
two
"""

    assert query_html_doc(html_body, 'let $_ := //p/text() return string($_)') == 'one'
    assert query_html_doc(html_body, 'let $_ := ("one", "two") return string($_)') == 'onetwo'
50 |
--------------------------------------------------------------------------------
/hq/hquery/axis.py:
--------------------------------------------------------------------------------
1 |
2 | from enum import Enum
3 |
4 |
class Axis(Enum):
    """Axes available in location paths: the standard ones plus ``css_class``."""

    # standard
    ancestor = 1
    ancestor_or_self = 2
    attribute = 3
    child = 4
    descendant = 5
    descendant_or_self = 6
    following = 7
    following_sibling = 8
    parent = 9
    preceding = 10
    preceding_sibling = 11
    self = 12
    # extended
    css_class = 13

    def is_reverse_order(self):
        """Tell whether this axis is listed in ``reverse_order_axes``."""
        return self in reverse_order_axes

    def token(self):
        """Return the member name with underscores turned into hyphens."""
        return '-'.join(self.name.split('_'))

    @classmethod
    def abbreviations(cls):
        """Return the abbreviated axis spellings."""
        return _abbreviations.keys()

    @classmethod
    def canonicalize(cls, name):
        """Expand an abbreviation, or turn hyphens in *name* into underscores."""
        if name in _abbreviations:
            return _abbreviations[name]
        return name.replace('-', '_')
40 |
41 |
# Maps each abbreviated axis spelling to the name of an Axis member; read by
# Axis.abbreviations() and Axis.canonicalize().
_abbreviations = {
    '^': 'ancestor',
    '^^': 'ancestor_or_self',
    '@': 'attribute',
    '.': 'css_class',
    'class': 'css_class',
    '~': 'descendant',
    '>>': 'following',
    '>': 'following_sibling',
    '<<': 'preceding',
    '<': 'preceding_sibling',
}
54 |
55 |
# Axes for which Axis.is_reverse_order() returns True.
reverse_order_axes = {Axis.ancestor, Axis.ancestor_or_self, Axis.preceding, Axis.preceding_sibling}
57 |
--------------------------------------------------------------------------------
/test/test_unicode_support.py:
--------------------------------------------------------------------------------
1 | from hq.hq import main
2 | from test.common_test_util import expected_result, wrap_html_body, simulate_args_dict, capture_console_output
3 |
4 |
def test_tolerates_latin_characters_in_element_contents(capsys, mocker):
    # Runs main() with docopt and stdin patched, then compares captured output.
    # NOTE(review): the HTML markup in the input and the expectation appears
    # to have been stripped from this copy of the file — restore it from
    # version control before trusting this test.
    mocker.patch('hq.hq.docopt').return_value = simulate_args_dict(expression='//div')
    mocker.patch('sys.stdin.read').return_value = wrap_html_body(u"""

T\xeate\xa0\xe0\xa0t\xeate

""")

    main()

    actual, _ = capture_console_output(capsys)
    assert actual == expected_result(u"""

T\xeate\xa0\xe0\xa0t\xeate

""")
19 |
20 |
def test_tolerates_latin_characters_in_attribute_contents(capsys, mocker):
    # Runs main() with docopt and stdin patched, then compares captured output.
    # NOTE(review): the HTML input passed to wrap_html_body appears to have
    # been stripped from this copy of the file — restore it from version
    # control before trusting this test.
    mocker.patch('hq.hq.docopt').return_value = simulate_args_dict(expression='//div/@role')
    mocker.patch('sys.stdin.read').return_value = wrap_html_body(u"""


""")

    main()

    actual, _ = capture_console_output(capsys)
    assert actual == expected_result(u'role="prim\xe4r"')
31 |
32 |
33 | def test_tolerates_latin_characters_in_comments(capsys, mocker):
34 | mocker.patch('hq.hq.docopt').return_value = simulate_args_dict(expression='//comment()')
35 | mocker.patch('sys.stdin.read').return_value = wrap_html_body(u"""
36 | """)
37 |
38 | main()
39 |
40 | actual, _ = capture_console_output(capsys)
41 | assert actual == expected_result(u'')
42 |
--------------------------------------------------------------------------------
/hq/hquery/functions/core_node_set.py:
--------------------------------------------------------------------------------
1 | from hq.hquery.evaluation_error import HqueryEvaluationError
2 | from hq.hquery.expression_context import peek_context, get_context_node
3 | from hq.hquery.functions.core_number import number
4 | from hq.hquery.object_type import string_value, is_sequence, object_type_name
5 | from hq.hquery.sequences import make_node_set
6 | from hq.soup_util import root_tag_from_any_tag, is_tag_node
7 |
8 | exports = ('count', 'id', 'last', 'name', 'position')
9 |
10 |
def count(sequence):
    """Return the number of items in a node set or sequence as a ``number``.

    The argument is validated by
    ``HqueryEvaluationError.must_be_node_set_or_sequence`` first.
    """
    HqueryEvaluationError.must_be_node_set_or_sequence(sequence)
    size = len(sequence)
    return number(size)
14 |
15 |
def id(ids):
    """Select the tags under the document root whose ``id`` is one of ``ids``.

    ``ids`` is either a sequence (the string value of each item is one wanted
    id) or a single value whose string value is split on whitespace.
    """
    if is_sequence(ids):
        wanted = set(string_value(item) for item in ids)
    else:
        wanted = set(string_value(ids).split())

    root = root_tag_from_any_tag(get_context_node())
    found = [
        node for node in root.descendants
        if is_tag_node(node) and 'id' in node.attrs and node['id'] in wanted
    ]
    return make_node_set(found)
26 |
27 |
def last():
    """Return the ``size`` of the current expression context as a ``number``."""
    context = peek_context()
    return number(context.size)
30 |
31 |
def name(*args):
    """Return the tag name of a node, or ``''`` when there is none.

    With no argument the context node is examined. With an argument, that
    value is examined; if it is a sequence, its first item is used. An empty
    sequence yields ``''`` rather than raising ``IndexError``, matching the
    XPath rule that ``name()`` of an empty node-set is the empty string.
    """
    if len(args) > 0:
        node = args[0]
        if is_sequence(node):
            if len(node) == 0:
                return ''
            node = node[0]
    else:
        node = get_context_node()

    return node.name if is_tag_node(node) else ''
47 |
48 |
def position():
    """Return the ``position`` of the current expression context as a ``number``."""
    context = peek_context()
    return number(context.position)
51 |
--------------------------------------------------------------------------------
/test/hquery/test_name_tests.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | from hq.output import convert_results_to_output_text
5 | from hq.soup_util import make_soup
6 | from hq.hquery.hquery_processor import HqueryProcessor
7 |
8 | sys.path.insert(0, os.path.abspath('../..'))
9 |
10 | from ..common_test_util import expected_result
11 | from test.hquery.hquery_test_util import query_html_doc
12 |
13 |
14 | def test_name_test_is_case_insensitive():
15 | html_body = """
16 | one
17 | two
18 | three """
19 | actual = query_html_doc(html_body, '/html/body/SpAn')
20 | assert actual == expected_result("""
21 |
22 | one
23 |
24 |
25 | two
26 |
27 |
28 | three
29 | """)
30 |
31 |
32 | def test_name_test_at_root_ignores_all_but_root_element():
33 | html = """
34 |
35 |
36 |
37 | """
38 | raw_result = HqueryProcessor('/html').query(make_soup(html))
39 | actual = convert_results_to_output_text(raw_result)
40 | assert actual == expected_result("""
41 |
42 | """)
43 |
44 |
45 | def test_name_test_tolerates_hyphens_in_element_names():
46 | html_body = " "
47 | assert query_html_doc(html_body, '//special-name') == expected_result("""
48 |
49 | """)
50 |
51 |
52 | def test_name_test_tolerates_hyphens_in_attribute_names():
53 | html_body = "
"
54 | assert query_html_doc(html_body, '//div/@special-name') == expected_result('special-name="special-value"')
55 |
--------------------------------------------------------------------------------
/hq/hquery/expression_context.py:
--------------------------------------------------------------------------------
1 | from hq.verbosity import verbose_print
2 | from ..soup_util import debug_dump_node
3 |
4 | context_stack = []
5 |
6 |
class ExpressionContext:
    """One entry of the expression context stack: a node plus position/size.

    ``preserve_space`` is taken from the argument when given; otherwise it is
    copied from the context currently on top of the stack, or set to False
    when the stack is empty.
    """

    def __init__(self, node, position=1, size=1, preserve_space=None):
        self.node = node
        self.position = position
        self.size = size

        if preserve_space is None:
            # Not specified: inherit from the enclosing context, if any.
            try:
                preserve_space = peek_context().preserve_space
            except ExpressionStackEmptyError:
                preserve_space = False
        self.preserve_space = preserve_space

    def __str__(self):
        node_text = str(self.node)
        return 'context(node={0})'.format(node_text)
23 |
24 |
class ExpressionStackEmptyError(RuntimeError):
    """Raised by peek_context() when the expression context stack is empty."""
    pass
27 |
28 |
29 |
def get_context_node():
    """Return the ``node`` of the context on top of the stack.

    Propagates ``ExpressionStackEmptyError`` from ``peek_context()``.
    """
    current = peek_context()
    return current.node
32 |
33 |
def peek_context():
    """Return the context on top of the stack without removing it.

    Raises:
        ExpressionStackEmptyError: if the stack holds no contexts.
    """
    if len(context_stack) == 0:
        raise ExpressionStackEmptyError('tried to peek while expression stack was empty')
    return context_stack[-1]
39 |
40 |
def pop_context():
    """Remove and return the context on top of the stack.

    The popped context is reported through ``verbose_print``; the message is
    built lazily via a lambda. ``list.pop()`` raises ``IndexError`` if the
    stack is empty.
    """
    result = context_stack.pop()
    # The closing parenthesis after the size was missing in the original text.
    msg = u'Popping (node={0}, position={1}, size={2}) off of context stack.'
    verbose_print(lambda: msg.format(debug_dump_node(result.node), result.position, result.size))
    return result
46 |
47 |
def push_context(node, position=1, size=1, preserve_space=None):
    """Push a new ``ExpressionContext`` for ``node`` onto the context stack.

    The push is reported through ``verbose_print`` (message built lazily).
    ``preserve_space`` is passed straight through to ``ExpressionContext``.
    """
    # The closing parenthesis after the size was missing in the original text.
    msg = u'Pushing (node={0}, position={1}, size={2}) on context stack.'
    verbose_print(lambda: msg.format(debug_dump_node(node), position, size))
    context_stack.append(ExpressionContext(node=node, position=position, size=size, preserve_space=preserve_space))
52 |
--------------------------------------------------------------------------------
/hq/hquery/computed_constructors/html_attribute.py:
--------------------------------------------------------------------------------
1 | from hq.hquery.evaluation_error import HqueryEvaluationError
2 | from hq.hquery.object_type import is_string, is_number, is_boolean, object_type_name, string_value
3 | from hq.hquery.sequences import make_sequence
4 | from hq.hquery.syntax_error import HquerySyntaxError
5 | from hq.soup_util import debug_dump_node, is_any_node, AttributeNode, is_attribute_node, is_tag_node
6 |
7 |
class ComputedHtmlAttributeConstructor:
    """Evaluates a computed attribute constructor into an ``AttributeNode``.

    The attribute value is the space-separated concatenation of the items
    produced by the content expression.
    """

    def __init__(self, name):
        self.name = name
        self.contents = None

    def set_content(self, expression_fn):
        """Attach the content expression; it may only be set once."""
        if self.contents is not None:
            raise HquerySyntaxError('Computed attribute constructor already has contents')
        self.contents = expression_fn

    def evaluate(self):
        """Return an ``AttributeNode`` named ``self.name`` built from the contents.

        Strings, numbers and booleans contribute ``str(item)``, attribute nodes
        their ``value`` and tag nodes their string value. Any other item raises
        ``HqueryEvaluationError``.
        """
        if self.contents is None:
            items = []
        else:
            items = make_sequence(self.contents())

        text = ''
        for item in items:
            if is_string(item) or is_number(item) or is_boolean(item):
                piece = str(item)
            elif is_attribute_node(item):
                piece = item.value
            elif is_tag_node(item):
                piece = string_value(item)
            else:
                if is_any_node(item):
                    description = debug_dump_node(item)
                else:
                    description = object_type_name(item)
                raise HqueryEvaluationError(
                    'Cannot use {0} as a content object in a computed attribute constructor'.format(description)
                )
            text = self._append_to_contents(text, piece)

        return AttributeNode(self.name, text)

    def _append_to_contents(self, so_far, more_content):
        """Join ``more_content`` onto ``so_far``, space-separated unless ``so_far`` is empty."""
        separator = ' ' if len(so_far) > 0 else ''
        return '{0}{1}{2}'.format(so_far, separator, more_content)
42 |
--------------------------------------------------------------------------------
/hq/hquery/computed_constructors/html_element.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 | from hq.hquery.evaluation_error import HqueryEvaluationError
3 | from hq.hquery.object_type import is_string, object_type_name, is_number, is_boolean
4 | from hq.hquery.sequences import make_node_set, make_sequence
5 | from hq.hquery.syntax_error import HquerySyntaxError
6 | from hq.soup_util import debug_dump_node, is_any_node, is_tag_node, is_attribute_node
7 |
8 |
class ComputedHtmlElementConstructor:
    """Evaluates a computed element constructor into a new HTML element."""

    def __init__(self, name):
        self.name = name
        self.contents = None

    def set_content(self, expression_fn):
        """Attach the content expression; it may only be set once."""
        if self.contents is not None:
            raise HquerySyntaxError('Computed element constructor already has contents')
        self.contents = expression_fn

    def evaluate(self):
        """Build the element and return it wrapped by ``make_node_set``.

        Tag items are cloned and appended as children, attribute items become
        attributes of the new element, and strings, numbers and booleans are
        appended as ``str(value)``. Any other item raises
        ``HqueryEvaluationError``.
        """
        # Parse a properly closed, empty element. The previous template,
        # '<{0}>{0}>', produced an element that already contained stray text.
        soup = BeautifulSoup('<{0}></{0}>'.format(self.name), 'html.parser')
        # find(True) returns the first tag regardless of its name, so element
        # names that collide with BeautifulSoup attributes (e.g. "name",
        # "contents") are not resolved to those attributes by mistake.
        result = soup.find(True)

        for value in make_sequence(self.contents()) if self.contents is not None else []:
            if is_tag_node(value):
                result.append(self._clone_tag(value))
            elif is_attribute_node(value):
                result[value.name] = value.value
            elif is_string(value) or is_number(value) or is_boolean(value):
                result.append(str(value))
            else:
                value_desc = debug_dump_node(value) if is_any_node(value) else object_type_name(value)
                raise HqueryEvaluationError(
                    'Cannot use {0} as a content object in a computed element constructor'.format(value_desc)
                )

        return make_node_set(result)

    def _clone_tag(self, tag):
        """Return a copy of ``tag`` made by re-parsing its markup."""
        soup = BeautifulSoup(str(tag), 'html.parser')
        # First tag in the re-parsed markup is the clone itself.
        return soup.find(True)
46 |
--------------------------------------------------------------------------------
/hq/hquery/function_support.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | from inspect import isclass, isfunction
4 | from pkgutil import iter_modules
5 |
6 | from hq.hquery.evaluation_error import HqueryEvaluationError
7 | from hq.verbosity import verbose_print
8 |
9 |
class FunctionSupport:
    """Looks up and calls the functions exported by the ``functions`` package.

    Function modules are discovered lazily on the first call. A module takes
    part by defining an ``exports`` tuple naming its public callables.
    """

    # Name -> callable table; None until the function modules have been loaded.
    all_functions = None

    def call_function(self, name, *args):
        """Call the function known by the query-language ``name`` with ``args``.

        Hyphens in ``name`` are mapped to underscores before the lookup.

        Raises:
            HqueryEvaluationError: if the name is unknown, or if the call
                failed with a ``TypeError`` whose message looks like an
                argument-count complaint.
        """
        self._load_all_functions()

        py_name = name.replace('-', '_')

        try:
            fn = self.all_functions[py_name]
        except KeyError:
            raise HqueryEvaluationError('Unknown function name "{0}"'.format(name))

        try:
            return fn(*args)
        except TypeError as err:
            # Only argument-count errors are translated; guard against a
            # TypeError raised without arguments or with a non-string one.
            if err.args and re.search(r'\d+ (?:.+ )?argument', str(err.args[0])):
                raise HqueryEvaluationError(err.args[0])
            else:
                raise

    def _load_all_functions(self):
        """Populate ``self.all_functions`` from the function modules, once."""
        if self.all_functions is not None:
            return

        # Imported locally so this block does not depend on a file-level import.
        from importlib import import_module

        # Collected locally and published only after every module loaded, so a
        # failure part-way through does not leave a partial table behind.
        loaded = dict()
        my_package_dir = os.path.dirname(__file__)
        verbose_print('FunctionSupport loading all function modules in {0}.'.format(my_package_dir),
                      indent_after=True)
        for _, modname, _ in iter_modules([os.path.join(my_package_dir, 'functions')]):
            verbose_print('Found candidate module {0} -- loading.'.format(modname))
            # import_module replaces importer.find_module().load_module(),
            # which newer Python versions no longer provide, and reuses the
            # already-imported package module instead of loading a second copy
            # under a bare name.
            module = import_module('hq.hquery.functions.{0}'.format(modname))

            if hasattr(module, 'exports'):
                # A trailing underscore lets a module export a name that would
                # otherwise clash in Python; it is dropped from the public name.
                exports = {name.rstrip('_'): getattr(module, name) for name in getattr(module, 'exports')}
                verbose_print('Module {0} exports are: {1}'.format(modname, exports.keys()))
                if any(not (isclass(obj) or isfunction(obj)) for obj in exports.values()):
                    raise RuntimeError('Non-class/function export(s) loaded from module {0}'.format(modname))
                loaded.update(exports)
            else:
                verbose_print('Module {0} defined no exports.'.format(modname))

        self.all_functions = loaded
        verbose_print('Finished loading function modules.', outdent_before=True)
53 |
--------------------------------------------------------------------------------
/hq/hquery/union_decomposition.py:
--------------------------------------------------------------------------------
1 | from hq.hquery.evaluation_error import HqueryEvaluationError
2 | from hq.hquery.object_type import debug_dump_anything
3 | from hq.hquery.sequences import make_sequence, sequence_concat
4 | from hq.hquery.variables import push_variable, variable_scope
5 | from hq.verbosity import verbose_print
6 |
7 |
class UnionDecomposition:
    """Maps each item of a union result through the generator for its clause.

    Each item produced by the union expression is expected to carry a
    ``union_index`` attribute; that index selects the mapping generator to
    evaluate for the item.
    """

    def __init__(self):
        self.mapping_generators = None
        self.union_expression = None

    def __str__(self):
        # One placeholder per clause. The previous code joined the characters
        # of an empty string, so it always rendered as ' => ', and it raised
        # TypeError while the generators were still unset.
        clause_count = len(self.mapping_generators or ())
        union_str = ' | '.join(['<expr>'] * clause_count)
        return '{0} => {0}'.format(union_str)

    def evaluate(self):
        """Evaluate the union, map every item, and return the concatenated results.

        While an item is being mapped it is bound to the variable ``_`` inside
        its own variable scope.

        Raises:
            HqueryEvaluationError: if an item has no ``union_index`` or the
                index has no corresponding mapping generator.
        """
        verbose_print('Evaluating union decomposition ({} clauses)'.format(len(self.mapping_generators)),
                      indent_after=True)

        sequence = make_sequence(self.union_expression())
        result = []

        for item in sequence:
            verbose_print(lambda: u'Visiting item {0}'.format(debug_dump_anything(item)), indent_after=True)

            with variable_scope():
                push_variable('_', make_sequence(item))
                if not hasattr(item, 'union_index'):
                    raise HqueryEvaluationError(
                        "Union decomposition applied to something that wasn't produced by a union"
                    )
                if item.union_index >= len(self.mapping_generators):
                    raise HqueryEvaluationError("Decomposed union had more clauses than its mapping")
                this_result = make_sequence(self.mapping_generators[item.union_index]())
                verbose_print(
                    'Mapping yielded {0} results for this visit'.format(
                        len(this_result)))
                result = sequence_concat(result, this_result)

            verbose_print('Visit finished', outdent_before=True)

        verbose_print('Union decomposition completed', outdent_before=True)
        return result

    def set_mapping_generators(self, mgs):
        """Store the list of per-clause mapping generators."""
        self.mapping_generators = mgs

    def set_union_expression(self, ug):
        """Store the callable that produces the union's items."""
        self.union_expression = ug
56 |
--------------------------------------------------------------------------------
/test/hquery/test_arithmetic_operators.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | sys.path.insert(0, os.path.abspath('../..'))
5 |
6 | from ..common_test_util import expected_result
7 | from test.hquery.hquery_test_util import query_html_doc
8 |
9 |
10 | def test_the_sum_of_decimals_is_a_decimal():
11 | assert query_html_doc('', '90+8.6') == expected_result('98.6')
12 | assert query_html_doc('', '-0.2 + 0.1') == expected_result('-0.1')
13 |
14 |
15 | def test_the_sum_of_integers_is_an_integer():
16 | assert query_html_doc('', '40+2') == expected_result('42')
17 | assert query_html_doc('', '-1 + 1') == expected_result('0')
18 |
19 |
20 | def test_integer_result_of_adding_decimals_is_an_integer():
21 | assert query_html_doc('', '41.5 + 0.5') == expected_result('42')
22 |
23 |
24 | def test_subtraction_operator():
25 | assert query_html_doc('', '43.5 - 1.5') == expected_result('42')
26 |
27 |
28 | def test_multiplication_operator():
29 | assert query_html_doc('', '3 * 3.1') == expected_result('9.3')
30 |
31 |
32 | def test_div_operator():
33 | assert query_html_doc('', '6div2') == expected_result('3')
34 |
35 |
36 | def test_mod_operator():
37 | assert query_html_doc('', '11 mod 5') == expected_result('1')
38 |
39 |
40 | def test_interpretation_of_div_and_mod_and_other_arithmetic_operators_as_operators_vs_node_tests():
41 | div = """
42 |
43 |
"""
44 | mod = """
45 |
46 | """
47 |
48 | assert query_html_doc(div, 'div', wrap_body=False) == expected_result(div)
49 | assert query_html_doc(mod, '/ mod', wrap_body=False) == expected_result(mod)
50 | assert query_html_doc(div, 'boolean(div)', wrap_body=False) == 'true'
51 | assert query_html_doc(mod, 'boolean(div)', wrap_body=False) == 'false'
52 |
53 | div_with_text = 'bar
'
54 | query_with_div_after_comma = 'starts-with(concat("foo ", div), "foo ba")'
55 | assert query_html_doc(div_with_text, query_with_div_after_comma, wrap_body=False) == 'true'
56 |
57 | assert query_html_doc(div, 'number("84")div2') == '42'
58 | assert query_html_doc(div, 'let $x := 4 return $x div 2') == '2'
59 |
60 | rect = ' '
61 | assert query_html_doc(rect, 'let $r := //rect return $r/@height * $r/@width') == '20'
62 |
63 | num_in_text = """
64 | not selected
65 | 42 """
66 | assert query_html_doc(num_in_text, '//span[@id="foo"] mod 10') == '2'
67 |
--------------------------------------------------------------------------------
/hq/hquery/computed_constructors/json_array.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | from hq.hquery.evaluation_error import HqueryEvaluationError
4 | from hq.hquery.object_type import string_value, is_string, debug_dump_anything, is_hash, \
5 | is_boolean, is_number
6 | from hq.hquery.sequences import make_sequence
7 | from hq.hquery.syntax_error import HquerySyntaxError
8 | from hq.soup_util import is_tag_node, is_text_node
9 | from hq.verbosity import verbose_print
10 |
11 |
class JsonArray:
    """A JSON array value wrapping a Python list."""

    def __init__(self, contents):
        if isinstance(contents, list):
            self.contents = contents
        else:
            raise HqueryEvaluationError('Attempted to construct a JSON array based on a(n) {0} object'.format(
                contents.__class__.__name__))

    def __repr__(self):
        inner = repr(self.contents)
        return 'ARRAY {0}'.format(inner)

    def __str__(self):
        # The string form is the JSON serialization of the wrapped list.
        serialized = json.dumps(self.contents)
        return serialized
27 |
28 |
29 |
class ComputedJsonArrayConstructor:
    """Evaluates a computed JSON array constructor into a ``JsonArray``."""

    def __init__(self):
        self.contents = None

    def set_contents(self, expression_fn):
        """Attach the content expression; it may only be set once."""
        if self.contents is not None:
            raise HquerySyntaxError('computed JSON array constructor already has contents')
        self.contents = expression_fn

    def evaluate(self):
        """Return a ``JsonArray`` holding one converted item per content item.

        A constructor without contents yields an empty array, mirroring how
        the HTML constructors treat missing contents.
        """
        items = make_sequence(self.contents()) if self.contents is not None else []
        return JsonArray([self._make_array_item(item) for item in items])

    def _make_array_item(self, value):
        """Convert one content item into a JSON-serializable Python value.

        Raises:
            HqueryEvaluationError: if the item is of an unsupported kind.
        """
        if is_tag_node(value):
            self._gab(lambda: 'appending text contents of element "{0}" to array'.format(debug_dump_anything(value)))
            return string_value(value)
        elif is_text_node(value) or is_string(value):
            value = string_value(value)
            self._gab(lambda: u'appending text "{0}" to array'.format(debug_dump_anything(value)))
            return value
        elif is_boolean(value) or is_number(value):
            self._gab(lambda: 'appending {0} to array'.format(debug_dump_anything(value)))
            return value.value
        elif is_hash(value):
            self._gab(lambda: u'appending JSON {0} to array'.format(debug_dump_anything(value)))
            return value.contents
        else:
            raise HqueryEvaluationError("Can't use {0} as contents in a computed JSON array constructor".format(
                debug_dump_anything(value)))

    def _gab(self, message):
        """Emit a verbose trace line; ``message`` is text or a callable returning text."""
        # Callers pass lambdas so the text is only built when it is needed.
        # The previous code formatted the lambda object itself, so the trace
        # showed a function repr instead of the message.
        verbose_print(lambda: u'JSON array constructor {0}'.format(message() if callable(message) else message))
67 |
--------------------------------------------------------------------------------
/hq/hquery/functions/extend_string.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | from hq.hquery.evaluation_error import HqueryEvaluationError
4 | from hq.hquery.expression_context import get_context_node
5 | from hq.hquery.functions.core_boolean import boolean
6 | from hq.hquery.object_type import string_value
7 |
8 | exports = ('lower_case', 'matches', 'replace', 'string_join', 'tokenize', 'upper_case')
9 |
10 |
def lower_case(value):
    """Return the string value of ``value`` converted to lower case."""
    text = string_value(value)
    return text.lower()
13 |
14 |
def matches(*args):
    """Report whether a regular expression matches anywhere in a string.

    Accepted forms:
        matches(pattern)               -- searches the context node's string value
        matches(input, pattern)
        matches(input, pattern, flags) -- flags as understood by
                                          ``_xpath_flags_to_re_flags``

    Raises:
        HqueryEvaluationError: if called with fewer than one or more than
            three arguments.
    """
    argc = len(args)
    if argc < 1 or argc > 3:
        raise HqueryEvaluationError('matches() called with {0} arguments; expected one, two or three.'.format(argc))

    if argc == 1:
        subject = string_value(get_context_node())
        pattern = args[0]
    else:
        subject = string_value(args[0])
        pattern = args[1]

    flags = _xpath_flags_to_re_flags(args[2]) if argc == 3 else 0

    # Pass a plain bool, as contains() does, rather than a Match object/None.
    return boolean(bool(re.search(pattern, subject, flags)))
32 |
33 |
def replace(*args):
    """Substitute regular-expression matches in a string via ``re.sub``.

    Called as replace(input, pattern, replacement[, flags]).

    Raises:
        HqueryEvaluationError: unless three or four arguments are given.
    """
    argc = len(args)
    if not 3 <= argc <= 4:
        raise HqueryEvaluationError('replace() expects 3 or 4 arguments; was passed {0}'.format(argc))

    subject = string_value(args[0])
    pattern, replacement = args[1], args[2]
    re_flags = _xpath_flags_to_re_flags(args[3]) if argc == 4 else 0

    return re.sub(pattern, replacement, subject, flags=re_flags)
48 |
49 |
def string_join(sequence, *args):
    """Join the string values of the items in ``sequence``.

    The optional second argument is the delimiter; it defaults to ``''``.
    """
    delimiter = args[0] if len(args) > 0 else ''
    pieces = [string_value(item) for item in sequence]
    return delimiter.join(pieces)
56 |
57 |
def tokenize(*args):
    """Split a string on a regular expression via ``re.split``.

    Called as tokenize(input, pattern[, flags]).

    Raises:
        HqueryEvaluationError: unless two or three arguments are given.
    """
    argc = len(args)
    if argc < 2 or argc > 3:
        # The message previously named replace(), a copy-paste slip.
        raise HqueryEvaluationError('tokenize() expects 2 or 3 arguments; was passed {0}'.format(argc))

    subject = string_value(args[0])
    pattern = args[1]
    flags = _xpath_flags_to_re_flags(args[2]) if argc == 3 else 0

    return re.split(pattern, subject, flags=flags)
71 |
72 |
def upper_case(value):
    """Return the string value of ``value`` converted to upper case."""
    text = string_value(value)
    return text.upper()
75 |
76 |
77 | def _xpath_flags_to_re_flags(flags):
78 | re_flags_map = {
79 | 'i': re.IGNORECASE,
80 | 'm': re.MULTILINE,
81 | 's': re.DOTALL,
82 | 'x': re.VERBOSE,
83 | }
84 |
85 | try:
86 | result = 0
87 | for flag in flags:
88 | result |= re_flags_map[flag]
89 | return result
90 | except KeyError as e:
91 | raise HqueryEvaluationError('Unexpected regular expression flag "{0}"'.format(e.args[0]))
92 |
--------------------------------------------------------------------------------
/test/hquery/test_computed_html_construction.py:
--------------------------------------------------------------------------------
1 | from test.common_test_util import expected_result
2 | from test.hquery.hquery_test_util import query_html_doc
3 |
4 |
5 | def test_simple_element_construction_with_string_content():
6 | assert query_html_doc('', 'element foo { "bar" }') == expected_result("""
7 |
8 | bar
9 | """)
10 |
11 |
12 | def test_element_constructor_accepts_numbers_and_booleans():
13 | assert query_html_doc('', 'element test { 98.6 }') == expected_result("""
14 |
15 | 98.6
16 | """)
17 |
18 | assert query_html_doc('', 'element test { false() }') == expected_result("""
19 |
20 | false
21 | """)
22 |
23 |
24 | def test_construction_of_elements_containing_content_queried_from_original_document():
25 | html_body = """
26 |
27 |
Hello, world!
28 |
other div
29 |
"""
30 | assert query_html_doc(html_body, 'element hello { //div }') == expected_result("""
31 |
32 |
33 |
34 | Hello, world!
35 |
36 |
37 | other div
38 |
39 |
40 |
41 | other div
42 |
43 | """)
44 |
45 |
46 | def test_element_constructor_accepts_attributes_from_original_document_including_multi_values_like_classes():
47 | html_body = """
48 |
49 | contents
50 |
"""
51 |
52 | assert query_html_doc(html_body, 'element test { //p/@* }') == expected_result("""
53 |
54 | """)
55 |
56 | assert query_html_doc(html_body, 'element test { //p/@three, //p }') == expected_result("""
57 |
58 |
59 | contents
60 |
61 | """)
62 |
63 |
64 | def test_element_constructor_can_be_nested():
65 | assert query_html_doc('', 'element moe {element larry {}, element curly {"Hey, Moe!"}}') == expected_result("""
66 |
67 |
68 |
69 |
70 | Hey, Moe!
71 |
72 | """)
73 |
74 |
75 | def test_attribute_constructor_adds_attributes_to_an_element():
76 | html_body = '
'
77 | assert query_html_doc(html_body, 'element model { attribute name { //p[1]/@ng-bind } }') == expected_result("""
78 |
79 | """)
80 |
81 |
82 | def test_attribute_constructor_takes_string_value_of_tag_nodes_in_content_sequence_and_separates_with_spaces():
83 | html_body = 'Easy as
'
84 | assert query_html_doc(html_body, 'element foo {attribute bar {//p, 1 to 3}}') == expected_result("""
85 |
86 | """)
87 |
--------------------------------------------------------------------------------
/hq/hquery/functions/core_string.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | from hq.hquery.evaluation_error import HqueryEvaluationError
4 | from hq.hquery.expression_context import get_context_node
5 | from hq.hquery.functions.core_boolean import boolean
6 | from hq.hquery.functions.core_number import number
7 | from hq.hquery.functions.extend_string import _xpath_flags_to_re_flags
8 | from hq.hquery.object_type import string_value, normalize_content, is_string
9 |
10 | exports = ('concat', 'contains', 'normalize_space', 'starts_with', 'string', 'string_length', 'substring',
11 | 'substring_after', 'substring_before')
12 |
13 |
def concat(*args):
    """Return the string values of all arguments joined without a separator."""
    pieces = [string_value(arg) for arg in args]
    return ''.join(pieces)
16 |
17 |
def contains(*args):
    """Report whether the first string contains the second, literally.

    Called as contains(to_search, substring[, flags]). The substring is
    escaped with ``re.escape`` so it is matched as plain text; the optional
    flags are translated by ``_xpath_flags_to_re_flags``.

    Raises:
        HqueryEvaluationError: unless two or three arguments are given.
    """
    argc = len(args)
    if not 2 <= argc <= 3:
        raise HqueryEvaluationError('contains() function expects two or three arguments; {0} passed'.format(argc))

    flags = args[2] if argc == 3 else ''

    pattern = re.escape(string_value(args[1]))
    to_search = string_value(args[0])
    found = re.search(pattern, to_search, flags=_xpath_flags_to_re_flags(flags))
    return boolean(bool(found))
30 |
31 |
def normalize_space(*args):
    """Return ``normalize_content`` of the single argument, else of the context node."""
    target = args[0] if len(args) == 1 else get_context_node()
    return normalize_content(target)
37 |
38 |
def starts_with(left, right):
    """Report whether the string value of ``left`` starts with that of ``right``."""
    text = string_value(left)
    prefix = string_value(right)
    return boolean(text.startswith(prefix))
41 |
42 |
def string(*args):
    """Return the string value of the single argument, else of the context node."""
    target = args[0] if len(args) == 1 else get_context_node()
    return string_value(target)
48 |
49 |
def string_length(*args):
    """Return the length of a string as a ``number``.

    With exactly one argument that argument is measured; otherwise the string
    value of the context node is used.

    Raises:
        HqueryEvaluationError: if the value being measured is not a string.
    """
    if len(args) == 1:
        value = args[0]
    else:
        value = string_value(get_context_node())

    if is_string(value):
        return number(len(value))
    raise HqueryEvaluationError('string_length() expecting a string, got a {0}'.format(value.__class__.__name__))
55 |
56 |
def substring(*args):
    """Return part of a string using 1-based XPath positions.

    Called as substring(value, start[, length]). ``start`` and ``length`` are
    objects exposing a numeric ``value`` attribute; both are rounded. Without
    ``length`` the result runs to the end of the string.

    Raises:
        HqueryEvaluationError: if fewer than two arguments are passed.
    """
    if len(args) < 2:
        raise HqueryEvaluationError('substring() expects at least 2 arguments; {0} were passed'.format(len(args)))

    value = args[0]
    # Convert the 1-based start position into a 0-based slice index.
    start = int(round(args[1].value)) - 1
    if len(args) >= 3:
        end = start + int(round(args[2].value))
    else:
        # No length given: take everything up to the end of the string. The
        # old formula (len - start + 1) cut the result short, or emptied it,
        # once start moved past the first characters.
        end = len(value)

    # Clamp both bounds at zero so positions before the string select nothing
    # instead of being treated as Python's count-from-the-end indices.
    return value[max(start, 0):max(end, 0)]
68 |
69 |
def substring_after(first, second):
    """Return the part of ``first`` following the first occurrence of ``second``.

    Both arguments are converted with ``string_value``. Returns ``''`` when
    ``second`` does not occur in ``first``.
    """
    first = string_value(first)
    second = string_value(second)
    index = first.find(second)
    if index < 0:
        return ''
    # Skip the whole matched substring; the old code skipped one character
    # only, which was wrong for any ``second`` not exactly one character long.
    return first[index + len(second):]
77 |
78 |
def substring_before(first, second):
    """Return the part of ``first`` preceding the first occurrence of ``second``.

    Both arguments are converted with ``string_value``, so ``second`` gets the
    same conversion as ``first``. Returns ``''`` when ``second`` does not
    occur in ``first``.
    """
    first = string_value(first)
    second = string_value(second)
    index = first.find(second)
    if index < 0:
        return ''
    return first[:index]
86 |
--------------------------------------------------------------------------------
/hq/hq.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
"""hq - Powerful HTML querying, filtering, slicing and dicing!

Usage:
  hq.py [options] <expression>
  hq.py [options] -p <file>
  hq.py --version
  hq.py (-h | --help)

Options:
  -f, --file <file>     Read HTML input from a file rather than stdin.
  --preserve            Preserve extra whitespace in string values derived
                        from HTML contents. The default behavior is to
                        automatically apply normalize-string to all string
                        values derived from HTML elements and attributes, and
                        to convert non-breaking spaces into plain spaces.
  -p, --program <file>  Read HQuery expression from a file instead of the
                        command line.
  -u, --ugly            Do not pretty-print HTML markup on output.
  -v, --verbose         Print verbose query parsing and evaluation information
                        to stderr.
  --version             Display the installed HQ version.

HTML is read from stdin.

"""
28 |
29 | from docopt import docopt
30 |
31 | from .hquery.evaluation_error import HqueryEvaluationError
32 | from .hquery.hquery_processor import HqueryProcessor, HquerySyntaxError
33 | from .output import convert_results_to_output_text
34 | from .soup_util import make_soup
35 | from .verbosity import verbose_print, set_verbosity
36 |
37 | __version__ = '0.0.4'
38 |
39 |
def main():
    """Command-line entry point: read HTML, run the query, print the result.

    Arguments are parsed by docopt from the module docstring. HTML comes from
    ``--file`` or stdin; the expression comes from ``--program`` or the
    command line. An empty expression prints the parsed document itself.
    Syntax and evaluation errors are reported on stderr.
    """
    from sys import stderr, stdin  # So py.tests have a chance to hook stdout & stderr

    args = docopt(__doc__, version='HQ {0}'.format(__version__))
    preserve_space = bool(args['--preserve'])
    set_verbosity(bool(args['--verbose']))

    try:
        if args['--file']:
            with open(args['--file']) as source_file:
                source = source_file.read()
        else:
            source = stdin.read()
        verbose_print('Read {0} characters of input'.format(len(source)))
        soup = make_soup(source)

        if args['--program']:
            with open(args['--program']) as program_file:
                expression = program_file.read()
        else:
            # The positional argument is stored under its usage-pattern name.
            # A missing or None value is treated as an empty expression.
            expression = args.get('<expression>') or ''
        if len(expression) > 0:
            result = HqueryProcessor(expression, preserve_space).query(soup)
        else:
            result = [soup]

        print(convert_results_to_output_text(result, pretty=(not args['--ugly']), preserve_space=preserve_space))

    except HquerySyntaxError as error:
        print('\nSYNTAX ERROR: {0}\n'.format(str(error)), file=stderr)
    except HqueryEvaluationError as error:
        print('\nQUERY ERROR: {0}\n'.format(str(error)), file=stderr)
73 |
74 | if __name__ == '__main__':
75 | main()
76 |
--------------------------------------------------------------------------------
/test/hquery/test_node_tests.py:
--------------------------------------------------------------------------------
1 |
2 | import os
3 | import sys
4 |
5 | sys.path.insert(0, os.path.abspath('../..'))
6 |
7 | from ..common_test_util import expected_result
8 | from .hquery_test_util import query_html_doc
9 |
10 |
11 | def test_any_node_test_selects_all_node_types():
12 | html_body = """Has he ever whaled it any?
13 |
14 |
15 |
"""
16 | assert query_html_doc(html_body, '/html/body/node()') == expected_result("""
17 | Has he ever whaled it any?
18 |
19 |
20 |
21 |
22 |
""")
23 |
24 |
25 | def test_tag_node_test_selects_tag_children_but_not_other_stuff():
26 | html_body = """
27 |
28 |
29 | Has he ever whaled it any?
30 |
"""
31 | actual = query_html_doc(html_body, '/html/body/*')
32 | assert actual == expected_result("""
33 |
34 |
35 |
36 |
""")
37 |
38 |
39 | def test_tag_node_test_selects_descendants():
40 | html_body = """
41 |
42 | """
45 | actual = query_html_doc(html_body, '/html/body/descendant::*')
46 | assert actual == expected_result("""
47 |
52 |
53 | text
54 |
""")
55 |
56 |
57 | def test_tag_node_test_selects_parent():
58 | html_body = """
59 | """
64 | actual = query_html_doc(html_body, '/html/body/section/div/p/parent::*')
65 | assert actual == expected_result("""
66 | """)
70 |
71 |
72 | def test_tag_node_test_selects_ancestors():
73 | html_body = """
74 | """
77 | actual = query_html_doc(html_body, '/html/body/div/p/ancestor::*')
78 | assert actual == expected_result("""
79 |
80 |
81 |
85 |
86 |
87 |
88 |
92 |
93 | """)
97 |
98 |
99 | def test_text_node_test_selects_disjoint_text_nodes():
100 | html_body = """onetwo three
"""
101 | actual = query_html_doc(html_body, '/html/body/p/text()')
102 | assert actual == expected_result("""
103 | one
104 | three""")
105 |
106 |
107 | def test_comment_node_test_selects_comments():
108 | html_body = """
109 |
110 |
111 |
112 |
"""
113 | assert query_html_doc(html_body, '//comment()') == expected_result("""
114 |
115 | """)
116 |
--------------------------------------------------------------------------------
/test/hquery/test_equality_operators.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | sys.path.insert(0, os.path.abspath('../..'))
5 |
6 | from ..common_test_util import expected_result
7 | from test.hquery.hquery_test_util import query_html_doc
8 |
9 |
10 | def test_node_set_equality_is_based_on_text_contents():
11 | html_body = """
12 | foo
13 | foo
"""
14 | actual = query_html_doc(html_body, '//p = //div')
15 | assert actual == expected_result('true')
16 |
17 |
18 | def test_node_sets_are_equal_if_string_value_of_any_one_node_matches_string_value_of_any_from_other_set():
19 | html_body = """
20 |
21 | one
22 | two
23 |
24 |
25 | two
26 | three
27 |
"""
28 | actual = query_html_doc(html_body, '//div/span = //p/span')
29 | assert actual == expected_result('true')
30 |
31 |
32 | def test_equals_operator_compares_numbers():
33 | actual = query_html_doc('', '2.0 != 2.1')
34 | assert actual == expected_result('true')
35 |
36 |
def test_equals_operator_interprets_integer_and_fractional_numbers_correctly():
    """An integer literal and the same value written with a fraction part are equal."""
    assert query_html_doc('', '101.0 != 101') == expected_result('false')
40 |
41 |
42 | def test_equals_operator_compares_string_value_of_node_converted_to_number_with_number():
43 | actual = query_html_doc('042.0
', '//p = 42')
44 | assert actual == expected_result('true')
45 |
46 |
47 | def test_equals_operator_compares_boolean_coercion_of_node_set_with_boolean():
48 | html_body = '
'
49 | actual = query_html_doc(html_body, '//p = false()')
50 | assert actual == expected_result('false')
51 |
52 |
53 | def test_equals_operator_compares_text_node_contents_with_string():
54 | html_body = """
55 |
58 | """
61 | actual = query_html_doc(html_body, '/html/body/div[p/text() = "two"]')
62 | assert actual == expected_result("""
63 | """)
68 |
69 |
def test_equals_operator_converts_non_node_sets_to_boolean_when_comparing_to_a_boolean():
    """Numbers and strings are coerced to booleans before being compared with a boolean."""
    cases = (
        ('1 = true()', 'true'),
        ('0 != false()', 'false'),
        ('"" = false()', 'true'),
        ('" " = true()', 'true'),
    )
    for query, outcome in cases:
        assert query_html_doc('', query) == expected_result(outcome)
75 |
76 |
def test_equals_operator_converts_non_node_sets_to_number_when_comparing_to_a_number():
    """Strings are coerced to numbers before being compared with a number."""
    cases = (
        ('0.1 = "0"', 'false'),
        ('"42" = 42.0', 'true'),
        ('"foo" = 0', 'false'),  # It's NaN, not zero.
    )
    for query, outcome in cases:
        assert query_html_doc('', query) == expected_result(outcome)
81 |
82 |
83 | def test_equals_operator_works_with_node_sets_containing_attributes():
84 | html_body = """
85 |
86 |
"""
87 | assert query_html_doc(html_body, '//div/attribute::id = "two"') == expected_result('true')
88 |
--------------------------------------------------------------------------------
/test/hquery/test_relational_operators.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | sys.path.insert(0, os.path.abspath('../..'))
5 |
6 | from ..common_test_util import expected_result
7 | from test.hquery.hquery_test_util import query_html_doc
8 |
9 |
def test_relational_comparison_of_numbers():
    """Numeric literals compare by value for >, <, >= and <=, whatever the spacing."""
    cases = (
        # greater-than
        ('1.01>1', 'true'),
        ('1 > 2', 'false'),
        ('2>2', 'false'),
        # less-than
        ('1 < 2', 'true'),
        ('2<1.9999', 'false'),
        ('42 <42', 'false'),
        # greater-than-or-equal
        ('3>=3', 'true'),
        ('3>= 3.01', 'false'),
        # less-than-or-equal
        ('2 <=2', 'true'),
        ('1.999<= 2', 'true'),
        ('2.001 <= 2', 'false'),
    )
    for query, outcome in cases:
        assert query_html_doc('', query) == outcome
25 |
26 |
def test_relational_comparison_of_booleans_with_one_another_and_with_other_non_node_set_primitives():
    """When either operand is a boolean, both sides are compared as booleans (true > false)."""
    cases = (
        ('true() <= false()', 'false'),
        ('true() <= 0', 'false'),
        ('1 > false()', 'true'),
        ('true() >= 25', 'true'),
        ('true() > "0"', 'false'),
    )
    for query, outcome in cases:
        assert query_html_doc('', query) == outcome
33 |
34 |
def test_relational_comparison_of_numbers_with_non_boolean_non_numeric_primitives_aka_strings():
    """A string compared with a number is converted to a number first."""
    cases = (
        ('"5" < 4', 'false'),
        ('5 > "4"', 'true'),
        ('"foo" >= 1', 'false'),
    )
    for query, outcome in cases:
        assert query_html_doc('', query) == outcome
39 |
40 |
def test_relational_comparison_of_non_boolean_non_numeric_primitives_aka_strings_with_one_another():
    """Relational operators applied to two string operands."""
    cases = (
        ('"low" > "high"', 'true'),
        ('"1.0" >= "1.1"', 'false'),
        ('"1.1" >= "1.1"', 'true'),
    )
    for query, outcome in cases:
        assert query_html_doc('', query) == outcome
45 |
46 |
47 | def test_relational_comparison_involving_two_node_sets():
48 | html_body = """
49 | 9
50 | 10
51 | 10
52 | 11
"""
53 |
54 | assert query_html_doc(html_body, '//p > //div') == 'false'
55 | assert query_html_doc(html_body, '//p >= //div') == 'true'
56 | assert query_html_doc(html_body, '//div[position()=1] <= //p') == 'true'
57 |
58 |
59 | def test_relational_comparison_between_a_node_set_and_a_number():
60 | html_body = """
61 | 9.9
62 | 10.1
"""
63 | assert query_html_doc(html_body, '//div > 10') == 'true'
64 | assert query_html_doc(html_body, '10.1 < //div') == 'false'
65 | assert query_html_doc(html_body, '//div <= 9.9') == 'true'
66 |
67 |
68 | def test_relational_comparison_between_a_node_set_and_a_string():
69 | html_body = """
70 | 9.9
71 | 10.1
"""
72 | assert query_html_doc(html_body, '//div > "10"') == 'true'
73 | assert query_html_doc(html_body, '"10.1" < //div') == 'false'
74 | assert query_html_doc(html_body, '//div <= "9.9"') == 'true'
75 |
76 |
77 | def test_relational_comparison_between_a_node_set_and_a_boolean_value():
78 | html_body = """
79 | 2
80 | 1
"""
81 | assert query_html_doc(html_body, '//div <= false()') == 'false'
82 | assert query_html_doc(html_body, 'true() >= //div') == 'true'
83 |
--------------------------------------------------------------------------------
/test/hquery/test_interpolated_strings.py:
--------------------------------------------------------------------------------
1 | from test.common_test_util import expected_result
2 | from test.hquery.hquery_test_util import query_html_doc
3 |
4 |
5 | def test_location_path_works_as_interpolated_string_expression():
6 | assert query_html_doc("world
", '`Hello, ${//div/text()}!`') == expected_result('Hello, world!')
7 |
8 |
9 | def test_element_node_becomes_normalized_text_contents_in_interpolated_string():
10 | html_body = """
11 |
12 | foo bar
13 |
"""
14 | assert query_html_doc(html_body, '`-->${//p}<--`') == expected_result('-->foo bar<--')
15 |
16 |
17 | def test_text_between_embedded_expressions_gets_picked_up():
18 | html_body = """
19 | one
20 | two
21 | three
"""
22 | assert query_html_doc(html_body, 'let $_ := 2 return `${//p[1]}, $_, ${//p[3]}`') == 'one, 2, three'
23 |
24 |
25 | def test_join_filter_joins_string_values_from_node_set():
26 | html_body = """
27 | one
28 | two
29 | three
"""
30 | assert query_html_doc(html_body, '`${j:,://p}`') == expected_result('one,two,three')
31 |
32 |
33 | def test_join_filter_defaults_to_empty_string_delimiter():
34 | html_body = """
35 | one
36 | two
"""
37 | assert query_html_doc(html_body, '`${j:://p}`') == expected_result('onetwo')
38 |
39 |
40 | def test_truncate_filter_elides_contents():
41 | html_body = 'The quick brown fox jumped over the lazy dog.
'
42 | assert query_html_doc(html_body, '`${tru:23:?://p}`') == expected_result('The quick brown fox?')
43 |
44 |
45 | def test_truncate_filter_defaults_to_no_suffix():
46 | html_body = 'short, sharp shock
'
47 | assert query_html_doc(html_body, '`${tru:15:://p}`') == expected_result('short, sharp')
48 |
49 |
50 | def test_regex_replace_filter_replaces_stuff_with_other_stuff():
51 | html_body = 'May 25, 1979'
52 | assert query_html_doc(html_body, r'`${rr:(\w+) (\d+)(, \d+):\2th of \1\3:://span}`') == '25th of May, 1979'
53 |
54 |
55 | def test_use_of_escapes_for_forbidden_characters_in_regex_replace_patterns():
56 | assert query_html_doc('', r"""`it's ${rr:\w{3}:dog::"a cat's"} life`""") == "it's a dog's life"
57 | assert query_html_doc('', r'`${rr:: ::: let $x := "re: " return concat($x, "search")}`') == 'research'
58 |
59 |
60 | def test_regex_replace_filter_can_be_used_to_replace_unicode_characters():
61 | assert query_html_doc('', u'`${rr: : :: "non-breaking\u00a0space"}`') == 'non-breaking space'
62 |
63 |
64 | def test_filters_chain_left_to_right():
65 | html_body = """
66 | one
67 | two
68 | three
"""
69 | assert query_html_doc(html_body, '`${j:, :tru:12: ...://p} whatever!`') == 'one, two, ... whatever!'
70 |
71 |
72 | def test_character_escape_is_not_prematurely_decoded_in_interpolated_string():
73 | query = 'let $x := "foo" return `Variable "$x" contains value $x`'
74 | assert query_html_doc('', query) == 'Variable "$x" contains value foo' # Not 'Variable "foo" contains...'
75 |
76 |
77 | def test_filters_are_applied_to_all_items_in_sequence_when_input_is_not_atomic():
78 | html_body = """
79 | Hello, world!
80 | Goodbye, world!
"""
81 | assert query_html_doc(html_body, '`${tru:8:://p}`') == 'Hello,Goodbye,'
82 | assert query_html_doc(html_body, '`${rr:world:test:://p}`') == 'Hello, test!Goodbye, test!'
83 |
--------------------------------------------------------------------------------
/hq/hquery/object_type.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | from hq.hquery.expression_context import peek_context
4 | from hq.string_util import truncate_string, is_a_string
5 |
6 | from ..verbosity import verbose_print
7 | from ..soup_util import is_any_node, is_tag_node, is_text_node, is_attribute_node, debug_dump_node, \
8 | debug_dump_long_string, derive_text_from_node
9 |
10 |
# Integer type codes returned by object_type().
BOOLEAN = 0
SEQUENCE = 1
NUMBER = 2
STRING = 3

# Display names, ordered so that TYPE_NAMES[code] is the name of the type with that code.
TYPE_NAMES = ('BOOLEAN', 'SEQUENCE', 'NUMBER', 'STRING')
13 |
14 |
def debug_dump_anything(obj):
    """Return a debugging description of any value the query engine can produce.

    Nodes are dumped with debug_dump_node, booleans/numbers/hashes/arrays with repr(), and
    node sets and sequences as a comma-separated list of their members, each truncated to 20
    characters. The final text is passed through debug_dump_long_string.

    Raises:
        RuntimeError: if obj is none of the recognized kinds.
    """
    if is_any_node(obj):
        text = debug_dump_node(obj)
    elif any(check(obj) for check in (is_boolean, is_number, is_hash, is_array)):
        text = repr(obj)
    elif is_string(obj):
        text = u'string("{0}")'.format(obj)
    elif is_node_set(obj):
        members = ', '.join(truncate_string(debug_dump_node(node), 20) for node in obj)
        text = u'node-set({0})'.format(members)
    elif is_sequence(obj):
        members = ', '.join(truncate_string(debug_dump_anything(item), 20) for item in obj)
        text = u'sequence({0})'.format(members)
    else:
        raise RuntimeError("debug_dump_anything doesn't know how to handle {0}".format(obj.__class__.__name__))
    return debug_dump_long_string(text)
29 |
30 |
def is_array(obj):
    """Report whether obj's class is named 'JsonArray' (the check is by class name only)."""
    class_name = obj.__class__.__name__
    return class_name == 'JsonArray'
33 |
34 |
def is_boolean(obj):
    """Report whether obj's class is named 'boolean' (the check is by class name only)."""
    class_name = obj.__class__.__name__
    return class_name == 'boolean'
37 |
38 |
def is_hash(obj):
    """Report whether obj's class is named 'JsonHash' (the check is by class name only)."""
    class_name = obj.__class__.__name__
    return class_name == 'JsonHash'
41 |
42 |
def is_node_set(obj):
    """Report whether obj is a list whose members are all nodes.

    An empty list counts as a node set.
    """
    if not isinstance(obj, list):
        return False
    return all(is_any_node(member) for member in obj)
45 |
46 |
def is_number(obj):
    """Report whether obj's class is named 'number' (the check is by class name only)."""
    class_name = obj.__class__.__name__
    return class_name == 'number'
49 |
50 |
def is_sequence(obj):
    """Report whether obj is a sequence, i.e. a Python list (node sets included)."""
    result = isinstance(obj, list)
    return result
53 |
54 |
def is_string(obj):
    """Report whether obj is a string, as decided by string_util.is_a_string."""
    result = is_a_string(obj)
    return result
57 |
58 |
def normalize_content(value):
    """Return the string value of value with each run of whitespace collapsed to one space."""
    text = string_value(value)
    return re.sub(r'\s+', ' ', text)
61 |
62 |
def object_type(obj):
    """Classify obj as BOOLEAN, SEQUENCE, NUMBER or STRING.

    Returns:
        The matching type code, or None (after a verbose-mode warning) when obj is none of
        those kinds.
    """
    # Node sets are lists too, so the single list check covers node sets and other sequences.
    classifiers = (
        (is_boolean, BOOLEAN),
        (is_sequence, SEQUENCE),
        (is_number, NUMBER),
        (is_string, STRING),
    )
    for matches, type_code in classifiers:
        if matches(obj):
            return type_code

    verbose_print('UH-OH! Returning None from object_type({0})'.format(obj.__class__.__name__))
    return None
77 |
78 |
def object_type_name(obj):
    """Return the display name of a type code, or of the type of an arbitrary object.

    Args:
        obj: Either an integer type code (BOOLEAN, SEQUENCE, NUMBER, STRING) or any object,
            which is classified with object_type() first.

    Returns:
        The entry of TYPE_NAMES for the type, or 'NULL OR UNKNOWN TYPE' when obj is None,
        when object_type() cannot classify it, or when an integer is not a valid type code.
    """
    unknown = 'NULL OR UNKNOWN TYPE'
    if obj is None:
        return unknown

    index = obj if isinstance(obj, int) else object_type(obj)

    # object_type() returns None for unclassifiable objects; indexing TYPE_NAMES with that
    # (or with a stray integer) used to raise instead of producing a name for the message.
    if index is None or not 0 <= index < len(TYPE_NAMES):
        return unknown
    return TYPE_NAMES[index]
90 |
91 |
def string_value(obj):
    """Return the string value of a node, primitive, node set or sequence.

    Tag, attribute and text nodes yield their derived text (honouring the current context's
    preserve_space setting); numbers and booleans are passed through str(); a node set yields
    the string value of its first node, or '' when empty; any other sequence yields the
    concatenated string values of its items; a string is returned unchanged.

    Raises:
        NotImplementedError: for any other kind of object.
    """
    if is_tag_node(obj) or is_attribute_node(obj) or is_text_node(obj):
        return derive_text_from_node(obj, peek_context().preserve_space)
    if is_number(obj) or is_boolean(obj):
        return str(obj)
    if is_node_set(obj):
        if len(obj) > 0:
            return string_value(obj[0])
        return ''
    if is_sequence(obj):
        return ''.join(string_value(item) for item in obj)
    if is_string(obj):
        return obj
    raise NotImplementedError('string_value not implemented for type "{0}"'.format(obj.__class__.__name__))
105 |
--------------------------------------------------------------------------------
/hq/hquery/equality_operators.py:
--------------------------------------------------------------------------------
1 | from hq.verbosity import verbose_print
2 | from hq.hquery.functions.core_boolean import boolean
3 | from hq.hquery.functions.core_number import number
4 | from hq.hquery.object_type import object_type, string_value, object_type_name
5 | from hq.hquery.evaluation_error import HqueryEvaluationError
6 |
7 |
def _eq_bool_vs_primitive(bool_val, other_val):
    """Compare a boolean with a number or string by coercing the latter to a boolean."""
    message = 'Comparing boolean value {0} with non-node-set value {1} (coerced to {2})'.format(
        bool_val, other_val, boolean(other_val))
    verbose_print(message)
    return bool_val == boolean(other_val)
11 |
12 |
13 | def _eq_native(first, second):
14 | return first == second
15 |
16 |
def _eq_node_sets(first, second):
    """Return True when any node in first has the same string value as any node in second."""
    left_strings = {string_value(node) for node in first}
    right_strings = {string_value(node) for node in second}

    verbose_print('Comparing two nodes sets (size {0} and {1}).'.format(len(left_strings), len(right_strings)))

    for candidate in left_strings:
        if candidate not in right_strings:
            continue
        verbose_print(u'Found value "{0}" from first node set in second node set'.format(candidate))
        return True

    verbose_print('Found no matching nodes between node sets.')
    return False
30 |
31 |
def _eq_node_set_vs_bool(bool_val, nodes_val):
    """Compare a boolean with the boolean coercion of a node set."""
    nodes_as_bool = boolean(nodes_val)
    return bool_val == nodes_as_bool
34 |
35 |
def _eq_node_set_vs_number(nodes_val, num_val):
    """Return True when any node's string value, converted to a number, equals num_val.

    Args:
        nodes_val: The node set operand.
        num_val: The numeric operand.
    """
    verbose_print('(=) comparing number {0} to {1} nodes'.format(num_val, len(nodes_val)))

    for node in nodes_val:
        node_num_val = number(string_value(node))
        matched = node_num_val == num_val
        # The trace previously said " not" for matching values; it now agrees with the outcome
        # and with the wording used by _eq_node_set_vs_string.
        verbose_print('(=) node string value "{0}" is{1} equal to "{2}"'.format(
            node_num_val,
            ('' if matched else ' not'),
            num_val))

        if matched:
            return True

    return False
51 |
52 |
def _eq_node_set_vs_string(nodes_val, string_val):
    """Return True when any node in nodes_val has a string value equal to string_val.

    Args:
        nodes_val: The node set operand.
        string_val: The string operand; it is passed through str() before comparing.
    """
    string_val = str(string_val)
    # This trace used to say "comparing number", copied from the numeric variant.
    verbose_print(u'(=) comparing string "{0}" to {1} nodes'.format(string_val, len(nodes_val)))

    for node in nodes_val:
        node_val_string = string_value(node)
        matched = node_val_string == string_val
        verbose_print(u'(=) node string value "{0}" is{1} equal to "{2}"'.format(
            node_val_string,
            ('' if matched else ' not'),
            string_val))

        if matched:
            return True

    return False
68 |
69 |
def _eq_num_vs_string(num_val, string_val):
    """Compare a number with a string by converting the string to a number first."""
    string_as_number = number(string_val)
    return num_val == string_as_number
72 |
73 |
# Comparison functions for "=", indexed as [row type code][column type code] in the order
# BOOLEAN, SEQUENCE, NUMBER, STRING. Only entries whose column code is >= the row code are
# filled in; equals() swaps its operands so that the smaller type code selects the row.
equality_ops_table = (
    # row: BOOLEAN
    (_eq_native, _eq_node_set_vs_bool, _eq_bool_vs_primitive, _eq_bool_vs_primitive),
    # row: SEQUENCE
    (None, _eq_node_sets, _eq_node_set_vs_number, _eq_node_set_vs_string),
    # row: NUMBER
    (None, None, _eq_native, _eq_num_vs_string),
    # row: STRING
    (None, None, None, _eq_native),
)
81 |
82 |
def equals(first, second):
    """Evaluate the "=" operator for two operands of any supported types.

    The operands are ordered so that the one with the smaller type code comes first, then the
    comparison function is looked up in equality_ops_table and its result wrapped in boolean().

    Raises:
        HqueryEvaluationError: when a TypeError occurs while selecting or running the
            comparison (for example, an operand whose type could not be determined).
    """
    first_type = object_type(first)
    second_type = object_type(second)
    try:
        if first_type > second_type:
            row, column = second_type, first_type
            left, right = second, first
        else:
            row, column = first_type, second_type
            left, right = first, second
        op = equality_ops_table[row][column]
        return boolean(op(left, right))
    except TypeError:
        msg = 'type mismatch comparing {0} and {1} for equality'
        raise HqueryEvaluationError(msg.format(object_type_name(first_type), object_type_name(second_type)))
93 |
94 |
def not_equals(first, second):
    """Evaluate the "!=" operator as the logical negation of equals()."""
    are_equal = bool(equals(first, second))
    return boolean(not are_equal)
97 |
--------------------------------------------------------------------------------
/hq/hquery/functions/core_number.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | from hq.hquery.evaluation_error import HqueryEvaluationError
4 | from hq.soup_util import is_any_node
5 | from hq.hquery.object_type import is_number, is_node_set, string_value, is_boolean
6 | from hq.hquery.sequences import make_sequence
7 |
# Names of the functions this module exports.
exports = (
    'ceiling',
    'floor',
    'number',
    'round_',
    'sum',
)
9 |
10 |
class number:
    """Numeric value used by HQuery expressions.

    The underlying Python value lives in ``self.value``. Whole-valued floats are stored as
    ints, and input that cannot be parsed as a number is stored as NaN.
    """

    def __init__(self, obj):
        """Convert obj to a number.

        Args:
            obj: Another number (copied), a boolean (1 or 0), a node or node set (converted
                through its string value), or anything accepted by float().
        """
        if isinstance(obj, number):
            self.value = obj.value
        elif is_boolean(obj):
            self.value = 1 if obj else 0
        else:
            # Nodes and node sets are converted through their string value.
            raw = string_value(obj) if (is_node_set(obj) or is_any_node(obj)) else obj
            try:
                self.value = self._int_or_float(float(raw))
            except ValueError:
                # Non-numeric text (including the empty string of an empty node set) is NaN.
                # Previously only plain strings got this treatment and nodes raised.
                self.value = float('nan')

    def __float__(self):
        return float(self.value)

    def __int__(self):
        return int(self.value)

    def __str__(self):
        # Python spells it 'nan'; the query language's spelling is 'NaN'.
        result = str(self.value)
        if result == 'nan':
            result = 'NaN'
        return result

    def __hash__(self):
        return self.value.__hash__()

    def __add__(self, other):
        return number(self.value + self._value_of_other_operand(other))

    def __sub__(self, other):
        return number(self.value - self._value_of_other_operand(other))

    def __neg__(self):
        return number(-self.value)

    def __mul__(self, other):
        return number(self.value * self._value_of_other_operand(other))

    def __div__(self, other):
        # Python 2 spelling of the division operator.
        return self.__truediv__(other)

    def __truediv__(self, other):
        """Divide, yielding NaN instead of raising when the divisor is zero."""
        other = self._value_of_other_operand(other)
        if other == 0:
            return number(float('nan'))
        else:
            return number(self.value / other)

    def __mod__(self, other):
        """Take the remainder, yielding NaN instead of raising when the divisor is zero."""
        # NOTE(review): Python's % takes the sign of the divisor, whereas XPath's mod takes
        # the sign of the dividend; confirm which behavior is wanted for negative operands.
        other = self._value_of_other_operand(other)
        if other == 0:
            return number(float('nan'))
        return number(self.value % other)

    def __eq__(self, other):
        return self.value == self._value_of_other_operand(other)

    def __ge__(self, other):
        return self.value >= self._value_of_other_operand(other)

    def __gt__(self, other):
        return self.value > self._value_of_other_operand(other)

    def __le__(self, other):
        return self.value <= self._value_of_other_operand(other)

    def __lt__(self, other):
        return self.value < self._value_of_other_operand(other)

    def __repr__(self):
        return 'number({0})'.format(str(self.value))

    @staticmethod
    def _int_or_float(numeric_value):
        """Return numeric_value as an int when it is whole-valued, otherwise unchanged."""
        if isinstance(numeric_value, int) or numeric_value % 1 != 0:
            return numeric_value
        else:
            return int(numeric_value)

    @staticmethod
    def _value_of_other_operand(other):
        """Unwrap other when it is a number; plain Python values pass through."""
        return other.value if is_number(other) else other
94 |
95 |
def ceiling(value):
    """Return the smallest whole number that is not less than value.

    Args:
        value: A number; its ``value`` attribute holds the underlying Python number.

    Returns:
        A new number, or value itself when it is NaN or infinite (math.ceil raises on those,
        and round_ likewise passes NaN through).
    """
    raw = value.value
    if math.isnan(raw) or math.isinf(raw):
        return value
    return number(math.ceil(raw))
98 |
99 |
def floor(value):
    """Return the largest whole number that is not greater than value.

    Args:
        value: A number; its ``value`` attribute holds the underlying Python number.

    Returns:
        A new number, or value itself when it is NaN or infinite (math.floor raises on those,
        and round_ likewise passes NaN through).
    """
    raw = value.value
    if math.isnan(raw) or math.isinf(raw):
        return value
    return number(math.floor(raw))
102 |
103 |
def round_(*args):
    """Round a number to the nearest whole number, or to a given number of digits.

    Args:
        *args: The number to round, optionally followed by a number giving the digits of
            precision.

    Returns:
        NaN and (for the one-argument form) infinite values are returned unchanged. With one
        argument, the value is rounded to the nearest whole number with ties going toward
        positive infinity, as XPath specifies (2.5 -> 3, -1.5 -> -1). With a precision
        argument, Python's round() is used.

    Raises:
        HqueryEvaluationError: when called without arguments.
    """
    if len(args) == 0:
        raise HqueryEvaluationError('round() function requires at least one argument')
    value = args[0]
    raw = value.value
    if math.isnan(raw):
        return value
    if len(args) >= 2:
        return number(round(raw, args[1].value))
    if math.isinf(raw):
        return value

    # Python 3's round() sends ties to the nearest even number; XPath sends them upward.
    # Comparing the fractional part avoids the precision loss of the floor(x + 0.5) shortcut.
    whole = math.floor(raw)
    return number(whole if raw - whole < 0.5 else whole + 1)
112 |
113 |
def sum(*args):
    """Add up the items of a sequence as numbers.

    Args:
        *args: Optionally the sequence to total, optionally followed by the value to return
            when that sequence is empty (number(0) when not given).

    Returns:
        The second argument (or number(0)) for an empty sequence; otherwise the total of
        number(item) over all items.
    """
    sequence = make_sequence(args[0] if len(args) >= 1 else [])
    zero = args[1] if len(args) >= 2 else number(0)

    if len(sequence) == 0:
        return zero

    total = number(0)
    for item in sequence:
        total += number(item)
    return total
131 |
--------------------------------------------------------------------------------
/hq/hquery/relational_operators.py:
--------------------------------------------------------------------------------
1 | from operator import gt, lt, ge, le
2 |
3 | from hq.hquery.functions.core_string import string
4 | from hq.verbosity import verbose_print
5 | from hq.hquery.functions.core_boolean import boolean
6 | from hq.hquery.functions.core_number import number
7 | from hq.hquery.object_type import object_type, is_boolean, is_number
8 | from hq.hquery.syntax_error import HquerySyntaxError
9 |
10 |
class RelationalOperator:
    """One of the relational operators >, >=, < or <=, ready to be applied to two operands."""

    def __init__(self, op):
        """Select the comparison function for the operator token op.

        Raises:
            HquerySyntaxError: when op is not one of '>', '>=', '<' or '<='.
        """
        for symbol, function in (('>', gt), ('>=', ge), ('<', lt), ('<=', le)):
            if op == symbol:
                self.base_op = function
                break
        else:
            raise HquerySyntaxError('unexpected relational operator "{0}"'.format(op))

    def evaluate(self, first, second):
        """Compare first with second using the method chosen by their types; return a boolean."""
        cmp = comparison_method_table[object_type(first)][object_type(second)]
        outcome = cmp(self.base_op, first, second)
        return boolean(outcome)

    @property
    def name(self):
        """Name of the underlying comparison function ('gt', 'ge', 'lt' or 'le')."""
        return self.base_op.__name__
36 |
37 |
38 |
def _cmp_node_sets(base_op, first, second):
    """Return True when base_op holds for any pair of node values, one from each node set.

    Every node is converted to a number before comparing.
    """
    left_numbers = {number(node) for node in first}
    right_numbers = {number(node) for node in second}

    verbose_print('Comparing two nodes sets (size {0} and {1}).'.format(len(left_numbers), len(right_numbers)))

    for left in left_numbers:
        for right in right_numbers:
            if not base_op(left, right):
                continue
            msg = 'Comparison succeeded for "{0}" from first node set and "{1}" in second node set'
            verbose_print(msg.format(left, right))
            return True

    verbose_print('Comparison failed for all nodes in both node sets.')
    return False
54 |
55 |
def _cmp_nodes_to_value(base_op, first, second):
    """Return True when base_op(node value, second) holds for any node in the node set first.

    Both the nodes and the other operand are converted to numbers before comparing.
    """
    candidates = {number(node) for node in first}
    second = number(second)
    verbose_print('Comparing {0} nodes in node set to value {1}'.format(len(candidates), second))

    for candidate in candidates:
        if not base_op(candidate, second):
            continue
        verbose_print('Comparison succeeded for node value "{0}" and value "{1}"'.format(candidate, second))
        return True

    verbose_print('Comparison failed for all nodes in the node set.')
    return False
68 |
69 |
def _cmp_value_to_nodes(base_op, first, second):
    """Return True when base_op(first, node value) holds for any node in the node set second.

    Both the nodes and the other operand are converted to numbers before comparing.

    Args:
        base_op: The two-argument comparison function to apply.
        first: The non-node-set operand (left-hand side).
        second: The node set operand (right-hand side).
    """
    node_values = set([number(node) for node in second])
    first = number(first)
    verbose_print('Comparing {0} nodes in node set to value "{1}"'.format(len(node_values), first))

    for node_value in node_values:
        if base_op(first, node_value):
            # The closing quote after {1} was missing from this trace message.
            verbose_print('Comparison succeeded for value "{0}" and node value "{1}"'.format(first, node_value))
            return True

    verbose_print('Comparison failed for all nodes in the node set.')
    return False
82 |
83 |
def _cmp_values(base_op, first, second):
    """Apply base_op to two non-node-set operands after bringing them to a common type.

    If either operand is a boolean, both are compared as 1/0; otherwise, if either is a
    number, both are compared as numbers; otherwise both are compared as strings.
    """
    if is_boolean(first) or is_boolean(second):
        left = 1 if boolean(first) else 0
        right = 1 if boolean(second) else 0
        return base_op(left, right)
    if is_number(first) or is_number(second):
        return base_op(number(first), number(second))
    return base_op(string(first), string(second))
91 |
92 |
# Comparison strategies for the relational operators, indexed as
# [type code of first operand][type code of second operand] in the order
# BOOLEAN, SEQUENCE, NUMBER, STRING.
comparison_method_table = (
    # first operand: BOOLEAN
    (_cmp_values, _cmp_value_to_nodes, _cmp_values, _cmp_values),
    # first operand: SEQUENCE
    (_cmp_nodes_to_value, _cmp_node_sets, _cmp_nodes_to_value, _cmp_nodes_to_value),
    # first operand: NUMBER
    (_cmp_values, _cmp_value_to_nodes, _cmp_values, _cmp_values),
    # first operand: STRING
    (_cmp_values, _cmp_value_to_nodes, _cmp_values, _cmp_values),
)
100 |
--------------------------------------------------------------------------------
/test/test_cli.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | try:
4 | from mock import mock_open
5 | except ImportError:
6 | from unittest.mock import mock_open
7 |
8 | from hq.hq import main
9 | from test.common_test_util import simulate_args_dict, wrap_html_body, capture_console_output
10 |
11 |
12 | def test_preserve_space_flag_turns_off_space_normalization(capsys, mocker):
13 | hquery = '`${//p}`'
14 | content_with_spaces = ' PyCharm rocks! '
15 | mocker.patch('sys.stdin.read').return_value = wrap_html_body('{0}
'.format(content_with_spaces))
16 |
17 | mocker.patch('hq.hq.docopt').return_value = simulate_args_dict(expression=hquery, preserve='s')
18 | main()
19 | actual, _ = capture_console_output(capsys, strip=False)
20 | assert actual == content_with_spaces
21 |
22 | mocker.patch('hq.hq.docopt').return_value = simulate_args_dict(expression=hquery, preserve='')
23 | main()
24 | actual, _ = capture_console_output(capsys, strip=False)
25 | assert actual == 'PyCharm rocks!'
26 |
27 |
28 | def test_preserve_space_flag_causes_non_breaking_spaces_to_be_how_shall_we_say_preserved(capsys, mocker):
29 | mocker.patch('sys.stdin.read').return_value = wrap_html_body(u'non\u00a0breaking spaces?
')
30 |
31 | mocker.patch('hq.hq.docopt').return_value = simulate_args_dict(expression='//p/text()', preserve='s')
32 | main()
33 | actual, _ = capture_console_output(capsys)
34 | assert actual == u'non\u00a0breaking\u00a0spaces?'
35 |
36 | mocker.patch('hq.hq.docopt').return_value = simulate_args_dict(expression='//p/text()', preserve='')
37 | main()
38 | actual, _ = capture_console_output(capsys)
39 | assert actual == u'non breaking spaces?'
40 |
41 |
42 | def test_ugly_flag_preserves_markup_formatting(capsys, mocker):
43 | expected = 'I, too, enjoy PyCharm.
'
44 | mocker.patch('hq.hq.docopt').return_value = simulate_args_dict(expression='//p', ugly=True)
45 | mocker.patch('sys.stdin.read').return_value = wrap_html_body(expected)
46 |
47 | main()
48 |
49 | actual, _ = capture_console_output(capsys, strip=False)
50 | assert actual == expected
51 |
52 |
def test_syntax_error_prints_proper_error_message(capsys, mocker):
    """A malformed expression makes main() report a syntax error saying what it expected and got."""
    bad_expression_args = simulate_args_dict(expression='child:://')
    mocker.patch('hq.hq.docopt').return_value = bad_expression_args
    mocker.patch('sys.stdin.read').return_value = wrap_html_body('')

    main()

    captured = capture_console_output(capsys)
    message = captured[1].lower()
    assert re.match(r'^syntax error.+expected.+name.+got.+slash', message)
61 |
62 |
def test_query_error_prints_proper_error_message(capsys, mocker):
    """Calling an unknown function makes main() report a query error naming that function."""
    unknown_function_args = simulate_args_dict(expression='no-such-function()')
    mocker.patch('hq.hq.docopt').return_value = unknown_function_args
    mocker.patch('sys.stdin.read').return_value = wrap_html_body('')

    main()

    captured = capture_console_output(capsys)
    message = captured[1].lower()
    assert re.match(r'^query error.+unknown function.+no-such-function', message)
71 |
72 |
73 | def test_reading_input_from_a_file_instead_of_stdin(capsys, mocker):
74 | expected_filename = 'filename.html'
75 | mocked_open = mock_open(read_data=wrap_html_body('foo
'))
76 | mocker.patch('hq.hq.docopt').return_value = simulate_args_dict(
77 | expression='//p/text()', file=expected_filename)
78 | mocker.patch('hq.hq.open', mocked_open, create=True)
79 |
80 | main()
81 |
82 | actual, _ = capture_console_output(capsys)
83 | mocked_open.assert_called_with(expected_filename)
84 | assert actual == 'foo'
85 |
86 |
87 | def test_program_flag_reads_hquery_program_from_file(capsys, mocker):
88 | expected_filename = 'filename.hq'
89 | mocked_open = mock_open(read_data='''
90 | //p
91 | ->
92 | $_/text()''')
93 | mocker.patch('hq.hq.docopt').return_value = simulate_args_dict(
94 | program=expected_filename)
95 | mocker.patch('sys.stdin.read').return_value = wrap_html_body('foo
')
96 | mocker.patch('hq.hq.open', mocked_open, create=True)
97 |
98 | main()
99 |
100 | actual, _ = capture_console_output(capsys)
101 | mocked_open.assert_called_with(expected_filename)
102 | assert actual == 'foo'
103 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # hq
2 | **Powerful HTML slicing and dicing at the command line.**
3 |
4 | [Build status](https://travis-ci.org/rbwinslow/hq) [Coverage status](https://coveralls.io/github/rbwinslow/hq?branch=master)
5 |
6 | `hq` is a Python-based command-line tool for querying HTML, manipulating data and producing results as HTML, JSON or any other format. It's based on a compact, flexible expression language that started out as an XPath implementation but ended up going a few different places, so I'm going ahead and calling it HQuery.
7 |
8 | HQuery is 99% compliant with the [XPath 1.0](https://www.w3.org/TR/xpath/) standard, minus some features not applicable to HTML. That's nice for querying, but you need more power to take control of the shape and format of the data you produce. To that end, HQuery also includes...
9 |
10 | * **Nuggets of XQuery** — only a few of the good parts! Just enough for iteration, branching and the like.
11 | * **XPath expansions for HTML** — including a `class::` axis and `class()` function, plus abbreviated axes to keep things terse.
12 | * **Super-charged string interpolation** — with powerful filters that you can chain together to transform data as you produce it.
13 | * **Computed constructors for HTML and JSON** — so you can programmatically assemble and output new HTML or JSON objects and arrays.
14 | * **Out-of-left-field union decomposition** — enabling amazingly terse and powerful mappings from clauses in a union to different expressions.
15 |
16 | ## Installing `hq`
17 |
18 | pip install hq
19 |
20 | ## Running `hq`
21 |
22 | cat /path/to/file.html | hq '`Hello, ${/html/head/title}!`'
23 |
24 | ...or...
25 |
26 | hq -f /path/to/file.html '`Hello, ${/html/head/title}!`'
27 |
28 | To print usage information:
29 |
30 | hq --help
31 |
32 | ## Running `hq` in a container
33 |
34 | There's a Docker image [project](https://github.com/frioux/hq.dkr) available that makes it super-easy to try out `hq` without installing any software (aside from Docker):
35 |
36 | cat /path/to/file.html | docker run -i frew/hq '//some/hquery'
37 |
38 | Thanks, Frew!
39 |
40 | ## Learning `hq`
41 |
42 | The [wiki](https://github.com/rbwinslow/hq/wiki) discusses the [motivations](https://github.com/rbwinslow/hq/wiki/Why-HQuery%3F) guiding the HQuery language's design and provides a [language reference](https://github.com/rbwinslow/hq/wiki/Language-Reference).
43 |
44 | ## Contributing to `hq`
45 |
46 | `hq` is tested against Pythons 3.5 through 3.9. The file structure and `setup.py` script for the project are based on [this blog post](https://gehrcke.de/2014/02/distributing-a-python-command-line-application/).
47 |
48 | `hq`'s dependencies are split into a "base" file, the subset needed to run the application, and a "dev" file providing the tools necessary to run tests and the like. To do development:
49 |
50 | pip install -r requirements/dev.txt
51 |
52 | The parsing logic in `hquery_processor.py` is based on the [top-down operator precedence](https://www.crockford.com/javascript/tdop/tdop.html) approach.
53 |
54 | ### Running Tests
55 |
56 | py.test
57 |
58 | The "dev.txt" dependencies also include [pytest-cov](https://pypi.python.org/pypi/pytest-cov), so you can generate a nice coverage report (which you'll find in the `htmlcov` directory):
59 |
60 | py.test --cov=hq --cov-report html
61 |
62 | If you want to turn verbosity on to figure out what's going on in a test, you need the `--gabby` flag (since `py.test` owns its own `-v` flag). You'll probably also want to run just one test at a time, because `--gabby` is way gabby:
63 |
64 | py.test --gabby -vv -k some_particular_test_function
65 |
66 | ### Uploading to PyPI
67 |
68 | This and other aspects of project setup, including running the CLI locally and using setup.py, are covered in the blog post linked above. I'm copying the PyPI upload stuff here for my own convenience, but I ask, of course, that you please submit pull requests rather than uploading to PyPI yourself:
69 |
70 | $ python setup.py sdist
71 | $ ls dist
72 | hq-0.0.4.tar.gz
73 |
74 | $ pip install twine
75 | $ twine upload dist/hq-0.0.4.tar.gz
76 | Uploading distributions to https://pypi.python.org/pypi
77 | Uploading hq-0.0.4.tar.gz
78 | Finished
79 |
--------------------------------------------------------------------------------
/hq/hquery/flwor.py:
--------------------------------------------------------------------------------
1 | from hq.hquery.object_type import debug_dump_anything
2 | from hq.hquery.sequences import make_sequence, sequence_concat
3 | from hq.hquery.syntax_error import HquerySyntaxError
4 | from hq.hquery.variables import push_variable, variable_scope
5 | from hq.soup_util import debug_dump_long_string
6 | from hq.verbosity import verbose_print
7 |
8 |
class Flwor:
    """A FLWOR expression assembled clause by clause and evaluated on demand.

    ``let`` clauses appended before the ``for`` clause are stored as "global"
    variables (pushed once per evaluation); ``let`` clauses appended after it
    are stored as per-iteration variables (pushed again for every item of the
    iterated sequence). Every clause expression is held as a zero-argument
    callable and only invoked during :meth:`evaluate`.
    """

    def __init__(self):
        # (name, expression_fn) tuples from "let" clauses seen before any "for".
        self.global_variables = []
        # (name, expression_fn) tuples from "let" clauses seen after the "for".
        self.per_iteration_variables = []
        self.return_expression = None
        self.sequence_expression = None
        self.sequence_variable = None


    def __str__(self):
        """Return a short debugging outline of the clauses (names only, no expressions)."""
        return '{0}{1}{2}return '.format(
            # Lets declared before the "for" used to be left out of the outline entirely.
            self._describe_lets(self.global_variables),
            '' if self.sequence_expression is None else 'for ${0}:= '.format(self.sequence_variable),
            self._describe_lets(self.per_iteration_variables)
        )


    @staticmethod
    def _describe_lets(lets):
        """Format a list of (name, expression_fn) tuples as ``let $name := `` fragments."""
        if not lets:
            return ''
        return ' '.join('let ${0} := '.format(name) for name, _ in lets) + ' '


    def append_let(self, variable_name, expression_fn):
        """Record a ``let`` clause; its scope depends on whether a ``for`` clause has been set yet."""
        var_tuple = (variable_name, expression_fn)
        if self.sequence_expression is None:
            self.global_variables.append(var_tuple)
        else:
            self.per_iteration_variables.append(var_tuple)


    def debug_dump(self):
        """Return the string form of this FLWOR passed through ``debug_dump_long_string``."""
        return debug_dump_long_string(str(self))


    def evaluate(self):
        """Evaluate the FLWOR, iterating when a ``for`` clause is present, and return the result."""
        verbose_print(lambda: 'Evaluating FLWOR {0}'.format(self), indent_after=True)

        if self.sequence_expression is not None:
            result = self._evaluate_iteration()
        else:
            result = self._evaluate_without_iteration()

        verbose_print(lambda: 'FLWOR evaluation completed; returning {0}'.format(debug_dump_anything(result)),
                      outdent_before=True)
        return result


    def set_iteration_expression(self, variable_name, expression_fn):
        """Set the single ``for`` clause.

        Raises:
            HquerySyntaxError: if a ``for`` clause was already set.
        """
        if self.sequence_expression is not None:
            raise HquerySyntaxError('More than one "for" clause found in FLWOR "{0}"'.format(self.debug_dump()))
        self.sequence_variable = variable_name
        self.sequence_expression = expression_fn


    def set_return_expression(self, expression_fn):
        """Set the ``return`` clause.

        Raises:
            HquerySyntaxError: if a ``return`` clause was already set.
        """
        if self.return_expression is not None:
            raise HquerySyntaxError('More than one return clause found for FLWOR {0}'.format(self.debug_dump()))
        self.return_expression = expression_fn


    def _evaluate_iteration(self):
        """Evaluate the return clause once per item of the ``for`` sequence and concatenate the results."""
        with variable_scope():
            self._push_global_variables()

            sequence = make_sequence(self.sequence_expression())
            verbose_print('Iterating over sequence containing {0} items'.format(len(sequence)))
            result = []

            for item in sequence:
                verbose_print(lambda: u'Visiting item {0}'.format(debug_dump_anything(item)), indent_after=True)

                # A nested scope per item, so the iteration variable and
                # per-iteration lets are pushed afresh on every pass.
                with variable_scope():
                    push_variable(self.sequence_variable, make_sequence(item))
                    self._push_iteration_variables()
                    this_result = make_sequence(self.return_expression())
                    verbose_print('Return clause yielded {0} results for this visit'.format(len(this_result)))
                    result = sequence_concat(result, this_result)

                verbose_print('Visit finished', outdent_before=True)

        return result


    def _evaluate_without_iteration(self):
        """Evaluate the return clause once, with only the global lets in scope."""
        with variable_scope():
            self._push_global_variables()
            verbose_print('Evaluating return expression.', indent_after=True)
            result = self.return_expression()
            # Deferred: stringifying the whole result is wasted work unless the
            # message is actually going to be printed.
            verbose_print(lambda: 'Return expression produced {0}'.format(str(result)), outdent_before=True)
        return result


    def _push_variables(self, lets):
        """Evaluate each (name, expression_fn) tuple in order and push the value under that name."""
        for name, expression_fn in lets:
            verbose_print('Evaluating let {0} := '.format(name))
            push_variable(name, expression_fn())


    def _push_global_variables(self):
        """Push the lets that were declared before the ``for`` clause."""
        self._push_variables(self.global_variables)


    def _push_iteration_variables(self):
        """Push the lets that were declared after the ``for`` clause."""
        self._push_variables(self.per_iteration_variables)
106 |
--------------------------------------------------------------------------------
/hq/hquery/node_test.py:
--------------------------------------------------------------------------------
1 | from hq.hquery.axis import Axis
2 |
3 | from ..soup_util import is_root_node, is_tag_node, is_text_node, AttributeNode, is_attribute_node, is_any_node, root_tag_from_soup, \
4 | is_comment_node
5 |
6 |
def _accept_principal_node_type(node, axis=None):
    """Accept attribute nodes on the attribute axis and tag nodes on every other axis."""
    if axis == Axis.attribute:
        return is_attribute_node(node)
    return is_tag_node(node)
9 |
10 |
11 | def _make_axis_agnostic_accept_fn(fn):
12 | def evaluate(node, axis=None):
13 | return fn(node)
14 | return evaluate
15 |
16 |
17 | def _make_name_accept_fn(value):
18 | def evaluate(node, axis=None):
19 | if axis == Axis.css_class:
20 | return is_tag_node(node) and 'class' in node.attrs and value in node['class']
21 | else:
22 | type_fn = is_attribute_node if axis == Axis.attribute else is_tag_node
23 | return type_fn(node) and node.name.lower() == value
24 | return evaluate
25 |
26 |
class NodeTest:
    """A node test (or name test) plus the per-axis logic for gathering candidate nodes.

    ``apply`` looks up the ``gather_<axis name>`` method for the requested
    axis, collects the candidate nodes it yields for a context node, and keeps
    those the configured accept function approves.
    """

    def __init__(self, value, name_test=False):
        value = value.lower()
        self.is_name_test = name_test

        if name_test:
            self.accept_fn = _make_name_accept_fn(value)
        elif value == '*':
            self.accept_fn = _accept_principal_node_type
        elif value == 'node':
            self.accept_fn = _make_axis_agnostic_accept_fn(is_any_node)
        elif value == 'text':
            self.accept_fn = _make_axis_agnostic_accept_fn(is_text_node)
        elif value == 'comment':
            self.accept_fn = _make_axis_agnostic_accept_fn(is_comment_node)

        # Name tests and "*" print bare; every other node test prints with "()".
        suffix = '' if name_test or value == '*' else '()'
        self.repr = '{0}{1}'.format(value, suffix)


    def __repr__(self):
        return self.repr


    def apply(self, axis, node):
        """Return the nodes along ``axis`` from ``node`` that pass this test."""
        gather = getattr(self, 'gather_{0}'.format(axis.name))
        return [candidate for candidate in gather(node) if self.accept_fn(candidate, axis=axis)]


    def gather_ancestor(self, node):
        """Return ``node.parents`` as a list, or an empty list when the node has no such attribute."""
        return list(node.parents) if hasattr(node, 'parents') else []


    def gather_ancestor_or_self(self, node):
        """Return the node itself followed by its ancestors."""
        return self.gather_self(node) + self.gather_ancestor(node)


    def gather_attribute(self, node):
        """Return the attribute nodes enumerated for ``node``."""
        return list(AttributeNode.enumerate(node))


    def gather_child(self, node):
        """Return the root tag for the document root, a tag's contents for a tag, otherwise nothing."""
        if is_root_node(node):
            return [root_tag_from_soup(node)]
        if is_tag_node(node):
            return node.contents
        return []


    def gather_css_class(self, node):
        """The CSS class axis draws its candidates from the child axis."""
        return self.gather_child(node)


    def gather_descendant(self, node):
        """Return ``node.descendants`` as a list, or an empty list when the node has no such attribute."""
        return list(node.descendants) if hasattr(node, 'descendants') else []


    def gather_descendant_or_self(self, node):
        """Return the node itself followed by its descendants."""
        return self.gather_self(node) + self.gather_descendant(node)


    def gather_following(self, node):
        """Collect later siblings (each with its descendants) of the node and of each enclosing tag."""
        collected = []
        current = node
        while is_tag_node(current):
            for sibling in current.next_siblings:
                collected.append(sibling)
                collected.extend(self.gather_descendant(sibling))
            current = current.parent
        return collected


    def gather_following_sibling(self, node):
        """Return ``node.next_siblings`` as a list, or an empty list when the node has no such attribute."""
        return list(node.next_siblings) if hasattr(node, 'next_siblings') else []


    def gather_parent(self, node):
        """Return a one-item list holding the node's parent, or an empty list when there is none."""
        if not hasattr(node, 'parent') or node.parent is None:
            return []
        return [node.parent]


    def gather_preceding(self, node):
        """Collect earlier siblings (each with its descendants) of the node and of each enclosing tag."""
        collected = []
        current = node
        while is_tag_node(current):
            for sibling in current.previous_siblings:
                collected.append(sibling)
                collected.extend(self.gather_descendant(sibling))
            current = current.parent
        return collected


    def gather_preceding_sibling(self, node):
        """Return ``node.previous_siblings`` as a list, or an empty list when the node has no such attribute."""
        return list(node.previous_siblings) if hasattr(node, 'previous_siblings') else []


    def gather_self(self, node):
        """Return a one-item list holding the node itself."""
        return [node]
143 |
--------------------------------------------------------------------------------
/test/hquery/test_extended_functions.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | from test.common_test_util import expected_result
4 | from test.hquery.hquery_test_util import query_html_doc
5 |
6 |
7 | def test_class_function_returns_true_when_element_has_name_in_class_attribute():
8 | html_body = """
9 | not selected
10 | expected
"""
11 |
12 | assert query_html_doc(html_body, 'class(//p[1], "foo")') == 'false'
13 | assert query_html_doc(html_body, 'class(//p[2], "foo")') == 'true'
14 | assert query_html_doc(html_body, '//p[class("bar")]/text()') == 'expected'
15 |
16 |
17 | def test_even_and_odd_functions_select_the_appropriate_elements_based_on_position():
18 | html_body = """
19 | You
20 | I
21 | are
22 | am
23 | odd.
24 | even.
"""
25 |
26 | assert query_html_doc(html_body, '//p[even()]/text()') == expected_result("""
27 | I
28 | am
29 | even.""")
30 | assert query_html_doc(html_body, '//p[odd()]/text()') == expected_result("""
31 | You
32 | are
33 | odd.""")
34 |
35 |
def test_lower_case_and_upper_case_change_string_case_as_expected():
    """lower-case() and upper-case() fold the case of a mixed-case string literal."""
    cases = [
        ('lower-case("Foo BAR")', 'foo bar'),
        ('upper-case("fOO bar")', 'FOO BAR'),
    ]
    for hquery, expected in cases:
        assert query_html_doc('', hquery) == expected
39 |
40 |
41 | def test_matches_function_performs_regex_matching_as_per_xpath_30_functions_spec():
42 | html_body = """
43 | moe
44 | larry
45 | curly
"""
46 |
47 | assert query_html_doc(html_body, '//p[matches(text(), "^l[ary]+")]/text()') == expected_result('larry')
48 | assert query_html_doc(html_body, '//p[matches(text(), ".URL.", "i")]/text()') == expected_result('curly')
49 |
50 |
51 | def test_matches_function_supports_a_subset_of_xpath_30_flag_values():
52 | html_body = """
53 | first
54 | second one
55 |
56 | multiple
57 | lines
58 | of
59 | text
60 |
"""
61 | multiline_pattern = r'.+multiple.+text.+'
62 |
63 | assert query_html_doc(html_body, r'//p[matches(text(), "\w+RST", "i")]/text()') == expected_result('first')
64 | assert query_html_doc(html_body, r'//p[matches(text(), ".+lines.+text")]', preserve_space=True) == ''
65 | assert re.match(
66 | multiline_pattern,
67 | query_html_doc(html_body, r'//p[matches(text(), ".+lines.+text", "s")]', preserve_space=True),
68 | re.S
69 | )
70 | assert query_html_doc(html_body, r'//p[matches(text(), "^ *lines$")]', preserve_space=True) == ''
71 | assert re.match(
72 | multiline_pattern,
73 | query_html_doc(html_body, r'//p[matches(text(), "^\s*lines$", "m")]', preserve_space=True),
74 | re.S
75 | )
76 | assert query_html_doc(html_body, r'//p[matches(text(), "sec ond\sone")]/text()') == ''
77 | assert query_html_doc(html_body, r'//p[matches(text(), "sec ond\sone", "x")]/text()') == 'second one'
78 |
79 |
80 | def test_matches_function_extends_to_using_context_node_when_passed_no_input_string():
81 | html_body = """
82 | bar
83 | foo
"""
84 |
85 | assert query_html_doc(html_body, '//p[matches("^f.+")]/text()') == expected_result('foo')
86 |
87 |
def test_replace_function_performs_regex_replacement_as_per_xpath_30_functions_spec():
    """replace() with an anchored pattern rewrites only the match at the start of the input."""
    actual = query_html_doc('', 'replace("dog mattress dog", "^dog", "cat")')
    assert actual == 'cat mattress dog'
90 |
91 |
92 | def test_replace_function_extends_standard_by_taking_string_value_of_any_type_of_input_object():
93 | assert query_html_doc('hello
', 'replace(//p, "h", "j")') == 'jello'
94 |
95 |
def test_string_join_function_accepts_sequence_as_first_parameter_and_delimiter_as_second():
    """string-join() places the delimiter between the items of the sequence."""
    actual = query_html_doc('', 'string-join(1 to 3, ", ")')
    assert actual == '1, 2, 3'
98 |
99 |
def test_string_join_second_argument_is_optional():
    """string-join() without a delimiter concatenates the items directly."""
    actual = query_html_doc('', 'string-join(1 to 2)')
    assert actual == '12'
102 |
103 |
def test_tokenize_function_breaks_up_strings_as_per_xpath_30_functions_spec():
    """tokenize() splits on a regex, honours the "i" flag, and yields a countable sequence."""
    # Raw literal: "\." is not a recognised escape in a normal string literal,
    # so the non-raw spelling draws an invalid-escape warning on newer Pythons.
    # The text handed to the query processor is unchanged.
    hquery = r'tokenize("Moe:Larry:..Curly", ":\.*")'
    assert query_html_doc('', hquery) == expected_result("\nMoe\nLarry\nCurly")

    hquery = 'tokenize("HaxtaXpatience", "x", "i")'
    assert query_html_doc('', hquery) == expected_result("\nHa\nta\npatience")

    assert query_html_doc('', 'count(tokenize("haxtaxstax", "x"))') == '4'
114 |
115 |
116 | def test_tokenize_function_extends_standard_by_supporting_any_object_as_input():
117 | assert query_html_doc('foo,bar
', 'tokenize(//p, ",")') == expected_result("""
118 | foo
119 | bar""")
120 |
--------------------------------------------------------------------------------
/test/hquery/test_location_paths.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | sys.path.insert(0, os.path.abspath('../..'))
5 |
6 | from hq.output import convert_results_to_output_text
7 | from hq.soup_util import make_soup
8 | from hq.hquery.hquery_processor import HqueryProcessor
9 |
10 | from ..common_test_util import expected_result
11 | from test.hquery.hquery_test_util import query_html_doc
12 |
13 |
14 | def test_absolute_location_path_should_find_multiple_grandchildren():
15 | actual = query_html_doc('one
not a div
two
', '/html/body/div')
16 | assert actual == expected_result("""
17 |
18 | one
19 |
20 |
21 | two
22 |
""")
23 |
24 |
25 | def test_path_to_root_tag_succeeds_despite_other_root_level_objects():
26 | html = """
27 |
28 |
29 |
30 |
31 | """
32 | raw_result = HqueryProcessor('/*').query(make_soup(html))
33 | actual = convert_results_to_output_text(raw_result)
34 | assert actual == expected_result("""
35 |
36 |
37 | """)
38 |
39 |
40 | def test_relative_location_path_as_predicate():
41 | html_body = """
42 |
43 | one
44 |
45 |
48 |
49 | three
50 |
"""
51 | actual = query_html_doc(html_body, '/html/body/div[span]')
52 | assert actual == expected_result("""
53 |
54 |
55 | one
56 |
57 |
58 |
59 |
60 | three
61 |
62 |
""")
63 |
64 |
65 | def test_abbreviated_context_node_works_in_path():
66 | html_body = """
67 |
70 | two
71 | """
74 | actual = query_html_doc(html_body, '/html/body/div/./p')
75 | assert actual == expected_result("""
76 |
77 | one
78 |
79 |
80 | three
81 |
""")
82 |
83 |
84 | def test_abbreviated_context_node_works_in_predicate():
85 | html_body = """
86 |
89 | two
90 |
91 | three
92 |
93 |
96 | """
97 | actual = query_html_doc(html_body, '/html/body/node()[./p]')
98 | assert actual == expected_result("""
99 |
100 |
101 | one
102 |
103 |
104 |
105 |
106 | four
107 |
108 |
""")
109 |
110 |
111 | def test_abbreviated_parent_node_works_in_path():
112 | html_body = """
113 |
114 | one
115 |
116 |
117 |
118 | two
119 |
"""
120 | actual = query_html_doc(html_body, '//p/br/../span')
121 | assert actual == expected_result("""
122 |
123 | two
124 | """)
125 |
126 |
127 | def test_abbreviated_parent_node_works_in_predicate():
128 | html_body = """
129 |
130 |
131 | one
132 |
133 |
134 | two
135 |
136 |
137 |
138 | three
139 |
"""
140 | actual = query_html_doc(html_body, '//span[../br]')
141 | assert actual == expected_result("""
142 |
143 | one
144 |
145 |
146 | three
147 | """)
148 |
149 |
150 | def test_double_slash_works_within_path():
151 | html_body = """
152 |
161 | joe besser
162 | """
165 | assert query_html_doc(html_body, '//section//p') == expected_result("""
166 |
167 | moe
168 |
169 |
170 | larry
171 |
172 |
173 | curly
174 |
175 |
176 | shemp
177 |
""")
178 |
179 |
180 | def test_predicate_can_be_applied_to_variable_containing_node_set():
181 | html_body = """
182 | not selected
183 | selected
"""
184 | assert query_html_doc(html_body, 'let $x := //p return $x[@id="foo"]') == expected_result("""
185 |
186 | selected
187 |
""")
188 |
189 |
190 | def test_no_space_between_text_runs_crossing_element_boundaries_in_element_string_value_if_there_was_none_in_doc():
191 | html_body = """"so-called " Klingon
"""
192 | assert query_html_doc(html_body, 'string(//p)') == '"so-called" Klingon'
193 | assert query_html_doc('one two
', 'string(//p)') == 'one two'
194 |
--------------------------------------------------------------------------------
/hq/hquery/string_interpolation.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | from hq.hquery.functions.extend_string import _xpath_flags_to_re_flags, string_join
4 | from hq.hquery.object_type import string_value, is_sequence
5 | from hq.hquery.syntax_error import HquerySyntaxError
6 | from hq.soup_util import debug_dump_long_string
7 | from hq.string_util import truncate_string, html_entity_decode
8 | from hq.verbosity import verbose_print
9 |
10 |
11 | clauses_pattern = re.compile(r'(\$\{[^\}]+\})|(\$[a-zA-Z_]\w*)|((?:[^\$]+))')
12 |
13 |
14 | def _join_filter_link(arguments):
15 | if arguments is None or len(arguments) == 0:
16 | delimiter = ''
17 | else:
18 | delimiter = arguments[0]
19 |
20 | def construct(eval_fn):
21 | return lambda: string_join(eval_fn(), delimiter)
22 |
23 | return construct
24 |
25 |
26 | def _regex_replace_filter_link(arguments):
27 | if arguments is None or len(arguments) < 2:
28 | msg = 'interpolated string regex replace filter expects three arguments; got {0}'
29 | raise HquerySyntaxError(msg.format(arguments))
30 |
31 | if len(arguments) == 3:
32 | flags = _xpath_flags_to_re_flags(arguments[2])
33 | else:
34 | flags = 0
35 |
36 | def construct(eval_fn):
37 | def evaluate():
38 | value = eval_fn()
39 | if is_sequence(value):
40 | return [re.sub(arguments[0], arguments[1], string_value(item), flags=flags) for item in value]
41 | else:
42 | return re.sub(arguments[0], arguments[1], string_value(value), flags=flags)
43 | return evaluate
44 |
45 | return construct
46 |
47 |
48 | def _truncate_filter_link(arguments):
49 |
50 | def construct(eval_fn):
51 | length = int(arguments[0])
52 | if len(arguments) == 1:
53 | suffix = ''
54 | else:
55 | suffix = arguments[1]
56 |
57 | def evaluate():
58 | value = eval_fn()
59 | if is_sequence(value):
60 | return [truncate_string(string_value(item), length, suffix=suffix) for item in value]
61 | else:
62 | return truncate_string(string_value(value), length, suffix=suffix)
63 |
64 | return evaluate
65 |
66 | return construct
67 |
68 |
69 | filters = {
70 | r'j:([^:]*):': _join_filter_link,
71 | r'rr:([^:]+):([^:]*):([i]*):': _regex_replace_filter_link,
72 | r'tru:(\d+):([^:]*):': _truncate_filter_link,
73 | }
74 |
75 |
def reduce_filters_and_expression(remainder, parse_interface, chain=None):
    """Peel leading filter clauses off ``remainder`` and wrap the parsed expression in them.

    While any pattern in ``filters`` matches at the start of ``remainder``,
    the matching filter is constructed from its (entity-decoded) captured
    groups and stacked on top of the filters found so far. What is left is
    parsed through ``parse_interface.parse_in_new_processor`` and, when any
    filter was found, passed through the stacked filters.
    """
    def stack(outer, inner):
        # A helper function (not an inline lambda) so each composition keeps
        # its own ``outer``/``inner`` rather than the loop's latest values.
        return lambda eval_fn: outer(inner(eval_fn))

    found_filter = True
    while found_filter:
        found_filter = False
        for pattern in filters:
            match = re.match(pattern, remainder)
            if match is None:
                continue
            link = filters[pattern]([html_entity_decode(arg) for arg in match.groups()])
            remainder = remainder[match.end():]
            chain = link if chain is None else stack(link, chain)
            # Start over from the first pattern against the shortened remainder.
            found_filter = True
            break

    eval_fn = parse_interface.parse_in_new_processor(remainder)
    return eval_fn if chain is None else chain(eval_fn)
94 |
95 |
def parse_interpolated_string(source, parse_interface):
    """Parse the contents of an interpolated string into a zero-argument evaluation function.

    ``source`` is split by ``clauses_pattern`` into ``${...}`` embedded
    expressions (which may start with filter clauses), ``$name`` variable
    references, and literal runs. Each piece becomes a callable; the returned
    function calls them in order and joins their string values.
    """
    verbose_print(u'Parsing interpolated string contents `{0}`'.format(source), indent_after=True)

    expressions = []
    for embedded_expr, embedded_var, literal in clauses_pattern.findall(source):
        if embedded_expr:
            verbose_print(u'Adding embedded expression: {0}'.format(embedded_expr))
            # Strip the leading "${" and trailing "}" before parsing.
            expressions.append(reduce_filters_and_expression(embedded_expr[2:-1], parse_interface))
        elif embedded_var:
            verbose_print('Adding embedded variable reference: {0}'.format(embedded_var))
            expressions.append(parse_interface.parse_in_new_processor(embedded_var))
        else:
            verbose_print(u'Adding literal string contents `{0}`'.format(literal))
            expressions.append(_make_literal_identity_closure(literal))

    def evaluate():
        chunks = [string_value(exp()) for exp in expressions]

        def describe_chunks():
            # Built only on demand: the interpolated string may be evaluated
            # many times and this message joins every chunk.
            dump = u' ("{0}")'.format(u'", "'.join(chunks)) if chunks else ''
            return u'Interpolated string evaluation assembling {0} chunks{1}.'.format(len(chunks), dump)

        verbose_print(describe_chunks)
        return ''.join(chunks)

    verbose_print(
        u'Finished parsing interpolated string `{0}` ({1} chunk(s) found)'.format(debug_dump_long_string(source),
                                                                                 len(expressions)),
        outdent_before=True
    )
    return evaluate
125 |
126 |
127 | def _make_literal_identity_closure(value):
128 | return lambda: html_entity_decode(value)
129 |
--------------------------------------------------------------------------------
/test/hquery/test_flwor.py:
--------------------------------------------------------------------------------
1 | from hq.hquery.syntax_error import HquerySyntaxError
2 | from pytest import raises
3 | from test.common_test_util import expected_result
4 | from test.hquery.hquery_test_util import query_html_doc
5 |
6 |
def test_variable_declaration_and_reference_in_a_flwor():
    """A variable bound by ``let`` can be read back in the return clause."""
    value = 'bar'
    hquery = 'let $foo := "{0}" return $foo'.format(value)
    assert query_html_doc('', hquery) == value
10 |
11 |
def test_variable_declarations_are_processed_in_order():
    """A later ``let`` can refer to a variable bound by an earlier one."""
    hquery = (
        'let $hello := "hello, " '
        'let $whole-phrase := concat($hello, "world!") '
        'return $whole-phrase'
    )
    actual = query_html_doc('', hquery)
    assert actual == 'hello, world!'
15 |
16 |
def test_variable_is_accessible_inside_interpolated_string():
    """FLWOR variables resolve inside interpolated strings, bare and behind a filter."""
    cases = [
        ('let $foo := "bar" return `foo is $foo`', 'foo is bar'),
        ('let $foo := (1 to 3) return `${j:, :$foo}`', '1, 2, 3'),
    ]
    for hquery, expected in cases:
        assert query_html_doc('', hquery) == expected
20 |
21 |
def test_multiple_return_clauses_are_not_allowed():
    """A second ``return`` clause is rejected as a syntax error."""
    hquery = 'let $x := "whatever" return $x return "uh-oh"'
    with raises(HquerySyntaxError):
        query_html_doc('', hquery)
25 |
26 |
def test_that_no_other_clauses_are_allowed_after_a_return():
    """A ``let`` clause following the ``return`` clause is rejected as a syntax error."""
    hquery = 'let $x := "whatevs" return $x let $uh-oh := "oh no"'
    with raises(HquerySyntaxError):
        query_html_doc('', hquery)
30 |
31 |
32 | def test_iteration_using_for():
33 | html_body = """
34 | one
35 | two
36 | three
"""
37 | assert query_html_doc(html_body, 'for $x in //p return $x/text()') == expected_result("""
38 | one
39 | two
40 | three""")
41 |
42 |
def test_flwor_variable_declaration_within_iteration():
    """A ``let`` after the ``for`` clause is re-evaluated for every item."""
    hquery = 'for $x in (1 to 2) let $y := concat("Thing ", string($x)) return $y'
    actual = query_html_doc('', hquery)
    assert actual == expected_result("\nThing 1\nThing 2")
48 |
49 |
50 | def test_rooted_location_paths_work_with_both_kinds_of_slash():
51 | html_body = """
52 |
57 | """
62 |
63 | assert query_html_doc(html_body, 'for $x in //section return $x/div') == expected_result("""
64 |
69 | """)
74 |
75 | assert query_html_doc(html_body, 'for $x in //section return $x//div') == expected_result("""
76 |
81 |
82 | foo
83 |
84 |
89 |
90 | bar
91 |
""")
92 |
93 |
def test_variables_before_for_have_global_scope_and_within_for_have_iteration_scope():
    """Lets before ``for`` keep their value; lets after it are rebound on each iteration."""
    clauses = [
        'let $x := 2',
        'let $z := $x',
        'for $_ in (1, $x)',
        'let $y := $_',
        'let $x := $_',
        'return ($x, $z, $x = $y)',
    ]
    # The leading empty item reproduces the single space the query starts with.
    hquery = ' '.join([''] + clauses)

    actual = query_html_doc('', hquery)
    assert actual == expected_result("\n1\n2\ntrue\n2\n2\ntrue")
110 |
111 |
def test_flwor_with_multiple_for_clauses_is_a_syntax_error():
    """A second ``for`` clause in one FLWOR is rejected as a syntax error."""
    hquery = 'for $x in (1, 2) let $y := 0 for $z in (3, 4) return $z'
    with raises(HquerySyntaxError):
        query_html_doc('', hquery)
115 |
116 |
def test_flwor_with_multiple_return_clauses_is_a_syntax_error():
    """Two ``return`` clauses in one FLWOR are rejected as a syntax error."""
    hquery = 'let $x := 0 return $x return $x + 1'
    with raises(HquerySyntaxError):
        query_html_doc('', hquery)
120 |
121 |
122 | def test_abbreviated_flowr_provides_expected_iteration_variable_in_value_clause():
123 | html_body = """
124 | one
125 | two
126 | three
"""
127 |
128 | assert query_html_doc(html_body, '//p -> $_/text()') == expected_result("""
129 | one
130 | two
131 | three""")
132 |
133 |
134 | def test_nested_abbreviated_flwors_evaluate_as_expected():
135 | html_body = """
136 |
137 |
one
138 |
two
139 |
140 |
141 |
three
142 |
four
143 |
five
144 |
"""
145 |
146 | assert query_html_doc(html_body, '//div -> $_/p[odd()] -> $_/text()') == expected_result("""
147 | one
148 | three
149 | five""")
150 |
151 |
def test_comma_as_sequence_cat_operator_does_not_bind_at_end_of_return_clause():
    """A comma after a return clause ends the FLWOR rather than extending its return expression."""
    iterated = query_html_doc('', 'for $x in (1 to 2) return $x, "!"')
    assert iterated == expected_result("\n1\n2\n!")

    for hquery in ('sum(for $x in //span return $x, "zero")', 'sum(//span -> $_, "zero")'):
        assert query_html_doc('', hquery) == 'zero'
159 |
160 |
def test_lack_of_return_at_end_of_flwor_is_a_syntax_error():
    """A FLWOR that ends without a ``return`` clause is rejected as a syntax error."""
    hquery = 'let $nil := "nothing"'
    with raises(HquerySyntaxError):
        query_html_doc('', hquery)
164 |
165 |
def test_comma_can_be_used_to_declare_multiple_variables_in_a_let_clause():
    """One ``let`` clause can bind several comma-separated variables."""
    hquery = 'let $foo := "foo", $bar := "bar" return string-join(($foo, $bar), " ")'
    actual = query_html_doc('', hquery)
    assert actual == 'foo bar'
168 |
--------------------------------------------------------------------------------
/hq/soup_util.py:
--------------------------------------------------------------------------------
1 | import re
2 | from builtins import str
3 | from bs4 import BeautifulSoup
4 |
5 | from .string_util import truncate_string
6 | from .verbosity import verbose_print
7 |
8 |
class AttributeNode:
    """A name/value pair standing in for an element attribute as a node of its own."""

    def __init__(self, name, value):
        self.name = name
        # List values are flattened to one space-separated string.
        if isinstance(value, list):
            value = ' '.join(value)
        self.value = value

    def __repr__(self):
        return 'AttributeNode("{0}", "{1}")'.format(self.name, self.value)

    @classmethod
    def enumerate(cls, node):
        """Return ``node.hq_attrs`` when the node has a usable one, otherwise an empty list."""
        usable = hasattr(node, 'hq_attrs') and _isnt_root_with_odd_ghost_hq_attrs_on_it_for_reasons_i_dont_understand(node)
        return node.hq_attrs if usable else []
24 |
25 |
def debug_dump_long_string(s, length=50, one_line=True, suffix='...'):
    """Shorten ``s`` for debug output by delegating to ``truncate_string``.

    All four arguments are forwarded positionally and unchanged; the defaults
    here are a length of 50, ``one_line=True`` and a ``'...'`` suffix.
    """
    return truncate_string(s, length, one_line, suffix)
28 |
29 |
def debug_dump_node(obj):
    """Return a one-line debug label for ``obj``, chosen by the kind of node it is."""
    if is_root_node(obj):
        return 'ROOT DOCUMENT'
    if is_tag_node(obj):
        return u'ELEMENT {0}'.format(debug_dump_long_string(str(obj)))
    if is_attribute_node(obj):
        return 'ATTRIBUTE {0}="{1}"'.format(obj.name, debug_dump_long_string(obj.value))
    if is_text_node(obj):
        return u'TEXT "{0}"'.format(debug_dump_long_string(obj.string))
    if is_comment_node(obj):
        return u'COMMENT "{0}"'.format(debug_dump_long_string(obj.string))
    # Anything unrecognised is labelled with its class name.
    return 'NODE type {0}'.format(obj.__class__.__name__)
43 |
44 |
def derive_text_from_node(obj, preserve_space=False):
    """Return the text content of a root, tag, attribute, or text node.

    For a root or tag node the text runs beneath it are concatenated. With
    ``preserve_space`` the raw strings are joined untouched. Without it the
    stripped runs are used, a single space is put in front of a run when the
    raw text around it had whitespace there, and the final result has
    non-breaking spaces turned into spaces, whitespace collapsed, and the ends
    stripped. Attribute nodes yield their value; text nodes their string form.

    Raises:
        RuntimeError: for any other object, which includes comment nodes since
            none of the three node checks used here accept them.
    """
    if is_tag_node(obj) or is_root_node(obj):
        result = u''
        strings = list(obj.strings)
        cursor = 0
        for run in (strings if preserve_space else obj.stripped_strings):
            if preserve_space:
                add_space = False
            else:
                # Advance to the raw string that this stripped run came from.
                while cursor < len(strings) and run not in strings[cursor]:
                    cursor += 1
                if cursor < len(strings):
                    # Slices rather than [0] / [-1] so an empty string cannot
                    # raise IndexError; it simply counts as "no whitespace".
                    add_space = (strings[cursor][:1].isspace()
                                 or (cursor > 0 and strings[cursor - 1][-1:].isspace()))
                else:
                    add_space = False
            result += u'{0}{1}'.format(' ' if add_space else '', run)
    elif is_attribute_node(obj):
        result = obj.value
    elif is_text_node(obj):
        result = str(obj)
    else:
        raise RuntimeError("don't know how to derive text from {0}".format(debug_dump_node(obj)))

    if not preserve_space:
        result = re.sub(u'\u00a0', ' ', result)
        result = re.sub(r'\s+', ' ', result).strip()

    return result
76 |
77 |
def is_any_node(obj):
    """Return True when ``obj`` is a root, tag, attribute, text, or comment node."""
    checks = (is_root_node, is_tag_node, is_attribute_node, is_text_node, is_comment_node)
    return any(check(obj) for check in checks)
80 |
81 |
def is_attribute_node(obj):
    """Return True when ``obj`` is an ``AttributeNode`` instance."""
    return isinstance(obj, AttributeNode)
84 |
85 |
def is_comment_node(obj):
    """Return True when the class of ``obj`` is named exactly ``Comment``."""
    class_name = obj.__class__.__name__
    return class_name == 'Comment'
88 |
89 |
def is_root_node(obj):
    """Return True when the class of ``obj`` is named exactly ``BeautifulSoup``."""
    class_name = obj.__class__.__name__
    return class_name == 'BeautifulSoup'
92 |
93 |
def is_tag_node(obj):
    """Return True when the class of ``obj`` is named exactly ``Tag``."""
    class_name = obj.__class__.__name__
    return class_name == 'Tag'
96 |
97 |
def is_text_node(obj):
    """Return True when the class of ``obj`` is named exactly ``NavigableString``."""
    class_name = obj.__class__.__name__
    return class_name == 'NavigableString'
100 |
101 |
def make_soup(source):
    """Parse ``source`` with the ``html.parser`` backend and index every node in preorder.

    Each visited node receives an ``hq_doc_index`` counter value. Tag nodes
    additionally receive ``hq_attrs``: a list of ``AttributeNode`` objects for
    their attributes, ordered by lower-cased attribute name.

    Returns:
        The parsed ``BeautifulSoup`` object.
    """
    soup = BeautifulSoup(source, 'html.parser')
    # A one-item list so the nested function can update the count in place.
    counter = [0]

    def visit_node(node):
        node.hq_doc_index = counter[0]
        counter[0] += 1
        if is_tag_node(node):
            attr_names = sorted(node.attrs.keys(), key=lambda name: name.lower())
            # The attribute nodes are not indexed here: the traversal below
            # walks a node's hq_attrs right after visiting the node, so each
            # attribute is visited (and counted) exactly once, after its
            # element and before the element's children.
            node.hq_attrs = [AttributeNode(name, node.attrs[name]) for name in attr_names]

    preorder_traverse_node_tree(soup, visit_node, filter=is_any_node)
    verbose_print('Loaded HTML document containing {0} indexed nodes.'.format(counter[0]))
    return soup
118 |
119 |
def preorder_traverse_node_tree(node, fn, filter=lambda n: is_tag_node(n) or is_root_node(n)):
    """Call ``fn`` on ``node`` and everything beneath it, in preorder, where ``filter`` approves.

    A node rejected by ``filter`` is not passed to ``fn``, but its attribute
    nodes and children are still traversed. Attribute nodes (``hq_attrs``) are
    walked before children.
    """
    if filter(node):
        fn(node)

    # hq_attrs may be present yet None; treat that the same as absent.
    attrs = getattr(node, 'hq_attrs', None)
    if attrs is not None:
        for attr in attrs:
            preorder_traverse_node_tree(attr, fn, filter)

    for child in getattr(node, 'children', ()):
        preorder_traverse_node_tree(child, fn, filter)
129 |
130 |
def root_tag_from_any_tag(obj):
    """Return the root tag of the document that ``obj`` belongs to."""
    soup = soup_from_any_tag(obj)
    return root_tag_from_soup(soup)
133 |
134 |
def root_tag_from_soup(soup):
    """Return the first child of ``soup`` that is a tag node.

    Raises:
        StopIteration: if none of the children is a tag node.
    """
    top_level_tags = (child for child in soup.children if is_tag_node(child))
    return next(top_level_tags)
137 |
138 |
def soup_from_any_tag(obj):
    """Follow ``parent`` links upward from ``obj`` and return the topmost object."""
    current = obj
    while True:
        parent = current.parent
        if parent is None:
            return current
        current = parent
143 |
144 |
def _isnt_root_with_odd_ghost_hq_attrs_on_it_for_reasons_i_dont_understand(node):
    """Return True when ``node.hq_attrs`` is something other than None.

    Callers use this right after a ``hasattr(node, 'hq_attrs')`` check to
    reject nodes whose ``hq_attrs`` lookup succeeds but yields None.
    """
    # NOTE(review): presumably the "ghost" is BeautifulSoup resolving an unknown
    # attribute name as a child-tag lookup that finds nothing -- confirm.
    return node.hq_attrs is not None
147 |
--------------------------------------------------------------------------------
/hq/hquery/location_path.py:
--------------------------------------------------------------------------------
1 | from hq.hquery.axis import Axis
2 | from hq.hquery.syntax_error import HquerySyntaxError
3 | from hq.soup_util import debug_dump_node, soup_from_any_tag, debug_dump_long_string
4 | from hq.verbosity import verbose_print
5 | from hq.hquery.expression_context import get_context_node, peek_context
6 | from hq.hquery.evaluation_in_context import evaluate_across_contexts, evaluate_in_context
7 | from hq.hquery.functions.core_number import number
8 | from hq.hquery.object_type import is_number
9 | from hq.hquery.sequences import make_node_set
10 |
11 |
12 | class LocationPath:
13 |
14 | def __init__(self, first_axis, first_node_test, first_predicates, absolute=False, root_expression=None):
15 | self.absolute = absolute
16 | self.root_expression = root_expression
17 | self.steps = []
18 | self.append_step(first_axis, first_node_test, first_predicates)
19 | if self.absolute and self.root_expression is not None:
20 | raise HquerySyntaxError('internal error forming location path; it looks both rooted and absolute')
21 |
22 |
23 | def __len__(self):
24 | return len(self.steps)
25 |
26 |
27 | def __str__(self):
28 | return '{0}{1}{2}'.format('' if self.root_expression is None else '/',
29 | '/' if self.absolute else '',
30 | '/'.join([str(step) for step in self.steps]))
31 |
32 |
33 | def append_step(self, axis, node_test, predicates):
34 | if axis == Axis.css_class and not node_test.is_name_test:
35 | raise HquerySyntaxError('CSS class axis must be followed by a name test, not a node test')
36 | self.steps.append(LocationPathStep(axis, node_test, predicates))
37 |
38 |
39 | def debug_dump(self):
40 | return debug_dump_long_string(str(self))
41 |
42 |
43 | def evaluate(self):
44 | verbose_print(lambda: 'Evaluating location path {0}'.format(self.debug_dump()), indent_after=True)
45 |
46 | if self.absolute:
47 | verbose_print('Switching context to root because this path is absolute.')
48 | results = evaluate_in_context(soup_from_any_tag(get_context_node()),
49 | lambda: self._evaluate_steps(self.steps))
50 | elif self.root_expression is not None:
51 | results = evaluate_across_contexts(self.root_expression(), lambda: self._evaluate_steps(self.steps))
52 | else:
53 | results = self._evaluate_steps(self.steps)
54 |
55 | verbose_print('Evaluation completed; location path selected {0} nodes'.format(len(results)),
56 | outdent_before=True)
57 | return make_node_set(results, reverse=False)
58 |
59 |
def _evaluate_steps(self, remaining_steps):
    """
    Evaluate the first of ``remaining_steps`` against the current context node, then recurse into the rest.

    The step's node test is applied along its axis, each predicate filters the surviving nodes in turn, and
    any further steps are evaluated with each surviving node as context.

    :param remaining_steps: non-empty sequence of steps still to be evaluated.
    :return: the nodes selected by this step and all the steps after it.
    """
    current_step = remaining_steps[0]
    verbose_print(lambda: 'Evaluating step {0}'.format(current_step), indent_after=True)

    candidates = current_step.node_test.apply(current_step.axis, get_context_node())
    matches = make_node_set(candidates, reverse=current_step.axis.is_reverse_order())
    verbose_print(lambda: 'Axis and node test produced {0} matching nodes'.format(len(matches)))

    for ordinal, predicate_fn in enumerate(current_step.predicates, 1):
        def accept_context_node():
            focus = peek_context()

            template = u'Evaluating predicate expression for context node at position {0} of {1}: {2}.'
            verbose_print(lambda: template.format(focus.position, focus.size, debug_dump_node(focus.node)))

            outcome = predicate_fn()
            # A numeric predicate is compared against the context position; anything else is coerced to bool.
            accepted = (number(focus.position) == outcome) if is_number(outcome) else bool(outcome)

            verbose_print(lambda: u'{0} node {1}'.format('Accepted' if accepted else 'Rejected',
                                                         debug_dump_node(focus.node)))
            return [focus.node] if accepted else []

        verbose_print(lambda: 'Evaluating predicate #{0} against {1} nodes'.format(ordinal, len(matches)),
                      indent_after=True)
        matches = evaluate_across_contexts(matches, accept_context_node)
        verbose_print(
            lambda: 'Evaluation of predicate #{0} complete; accepted {1} nodes.'.format(ordinal, len(matches)),
            outdent_before=True)

    if len(remaining_steps) > 1:
        matches = evaluate_across_contexts(matches, lambda: self._evaluate_steps(remaining_steps[1:]))

    verbose_print(lambda: 'Step evaluation completed; returning {0} nodes.'.format(len(matches)),
                  outdent_before=True)
    return matches
98 |
99 |
100 |
class LocationPathStep:
    """One step of a location path: an axis, a node test and zero or more predicates."""

    def __init__(self, axis, node_test, predicates):
        self.axis, self.node_test, self.predicates = axis, node_test, predicates

    def __str__(self):
        """Render as ``axis::node-test`` followed by one ``[predicate]`` marker per predicate."""
        predicate_markers = '[predicate]' * len(self.predicates)
        return '{axis}::{test}{markers}'.format(axis=self.axis.name,
                                                test=repr(self.node_test),
                                                markers=predicate_markers)
110 |
--------------------------------------------------------------------------------
/hq/hquery/computed_constructors/json_hash.py:
--------------------------------------------------------------------------------
1 | import json
2 | import re
3 |
4 | from hq.hquery.computed_constructors.hash_key_value import HashKeyValue
5 | from hq.hquery.evaluation_error import HqueryEvaluationError
6 | from hq.hquery.expression_context import peek_context
7 | from hq.hquery.functions.core_number import number
8 | from hq.hquery.object_type import string_value, object_type_name, is_string, is_number, is_boolean, \
9 | is_hash, is_array, is_sequence
10 | from hq.hquery.sequences import make_sequence
11 | from hq.hquery.syntax_error import HquerySyntaxError
12 | from hq.soup_util import is_tag_node, debug_dump_node, is_any_node, is_text_node, debug_dump_long_string
13 | from hq.verbosity import verbose_print
14 |
15 |
class JsonHash:
    """Wrapper around a ``dict`` that marks it as a constructed JSON hash."""

    def __init__(self, contents):
        """
        :param contents: the dictionary to wrap.
        :raises HqueryEvaluationError: if ``contents`` is not a ``dict``.
        """
        if isinstance(contents, dict):
            self.contents = contents
            return

        type_name = contents.__class__.__name__
        raise HqueryEvaluationError('Attempted to construct a JSON hash based on a(n) {0} object'.format(type_name))

    def __repr__(self):
        """Return ``HASH`` followed by the repr of the wrapped dictionary."""
        return 'HASH {0!r}'.format(self.contents)

    def __str__(self):
        """Return the wrapped dictionary serialized as JSON text."""
        serialized = json.dumps(self.contents)
        return serialized
31 |
32 |
33 |
def _construct_array_filter(tag_names):
    """
    Build a filter that wraps the values of the named attributes in a list.

    :param tag_names: comma-separated attribute names.
    :return: a function that takes the hash (a ``dict``) and modifies it in place.
    """
    array_keys = tag_names.split(',')

    def evaluate(hash):
        for key, value in hash.items():
            # Values that are already lists, and keys that were not named, are left alone.
            if key not in array_keys or isinstance(value, list):
                continue
            verbose_print('JSON hash constructor array filter converting attribute "{0}" to array'.format(key))
            hash[key] = [value]

    return evaluate
45 |
46 |
def _construct_map_filter(mappings):
    """
    Build a filter that renames attributes of a constructed hash.

    :param mappings: comma-separated ``old>new`` pairs, e.g. ``"p>paragraph,div>other"``.
    :return: a function that takes the hash (a ``dict``) and renames matching keys in place.
    """
    # Empty entries (e.g. from a trailing comma) carry no mapping and are ignored.
    renames = {old: new for (old, _, new) in [m.partition('>') for m in mappings.split(',') if m]}

    def evaluate(target):
        to_replace = [key for key in target if key in renames]
        for key in to_replace:
            verbose_print(
                'JSON hash constructor mapping filter converting attribute name "{0}" to "{1}"'.format(
                    key, renames[key]
                )
            )

        # Remove every old key before inserting any new one, so that chained or swapped mappings
        # (a>b together with b>c) rename the original values instead of overwriting one another.
        renamed = [(renames[key], target.pop(key)) for key in to_replace]
        for new_key, value in renamed:
            target[new_key] = value

    return evaluate
62 |
63 |
def _construct_number_filter(tag_names):
    """
    Build a filter that converts the values of the named attributes to numbers.

    :param tag_names: comma-separated attribute names.
    :return: a function that takes the hash (a ``dict``) and modifies it in place.
    """
    numeric_keys = tag_names.split(',')

    def evaluate(hash):
        for key, value in hash.items():
            if key not in numeric_keys:
                continue

            verbose_print(
                'JSON hash constructor number filter converting attribute "{0}" value(s) to numbers'.format(key)
            )
            # A list value is converted element by element; any other value is converted as a whole.
            hash[key] = [number(v).value for v in value] if isinstance(value, list) else number(value).value

    return evaluate
79 |
80 |
# Matches a comma-separated list of names. The outer group captures the whole list; the inner group is
# only there for repetition.
_name_list_arg_regex = r'(([a-zA-Z]\w*,?)+)'


def _skip_over_embedded_groups_from_list_matches(groups):
    """Return every other element of ``groups``, starting with the first."""
    every_other = slice(0, None, 2)
    return groups[every_other]
85 |
86 |
# Maps the regular expression recognizing each filter's source text to the function that builds the filter.
_array_filter_regex = r'a:{0}:'.format(_name_list_arg_regex)
_map_filter_regex = r'm:(([a-zA-Z]\w*>[a-zA-Z]\w*,?)+):'
_number_filter_regex = r'n:{0}:'.format(_name_list_arg_regex)

_filter_map = {
    _array_filter_regex: _construct_array_filter,
    _map_filter_regex: _construct_map_filter,
    _number_filter_regex: _construct_number_filter,
}
92 |
93 |
class ComputedJsonHashConstructor:
    """
    Builds a ``JsonHash`` from the items produced by a contents expression.

    Key/value items become attributes directly, elements become attributes named after their tag, and text
    is accumulated under the ``text`` attribute. Filters, if any, are applied to the finished dictionary.
    """

    def __init__(self):
        self.contents = None
        self.filters = []


    def set_contents(self, expression_fn):
        """
        Set the callable that produces the hash contents.

        :raises HquerySyntaxError: if contents have already been set.
        """
        if self.contents is not None:
            raise HquerySyntaxError('computed JSON hash constructor already has contents')
        self.contents = expression_fn


    def set_filters(self, source):
        """
        Parse a filter clause and append the resulting filter functions, in order, to ``self.filters``.

        :param source: the filter clause text, e.g. ``"a:p,h1:n:div,h1:"``.
        :raises HquerySyntaxError: if the remaining text does not start with any known filter.
        """
        while source:
            for regex, constructor in _filter_map.items():
                match = re.match(regex, source)
                if match:
                    filter_fn = constructor(*_skip_over_embedded_groups_from_list_matches(match.groups()))
                    self.filters.append(filter_fn)
                    # Consume the matched filter and carry on with whatever follows it.
                    source = source[match.end():]
                    break
            else:
                raise HquerySyntaxError(
                    'Malformed filter "{0}" in computed JSON hash constructor filter clause'.format(source)
                )


    def evaluate(self):
        """
        Evaluate the contents expression and return the assembled ``JsonHash``.

        :raises HqueryEvaluationError: if an item cannot be used as hash content.
        """
        result = dict()

        items = make_sequence(self.contents()) if self.contents is not None else []
        for item in items:
            if isinstance(item, HashKeyValue):
                # Work on a local so that the item handed to us is not modified.
                value = item.value
                if is_sequence(value) and len(value) == 1:
                    value = value[0]

                if is_number(value) or is_boolean(value):
                    result[item.key] = value.value
                elif is_hash(value) or is_array(value):
                    result[item.key] = value.contents
                else:
                    result[item.key] = string_value(value)
            elif is_tag_node(item):
                self._gab('adding element "{0}" to contents'.format(item.name))
                self._process_tag(result, item)
            elif is_text_node(item) or is_string(item):
                self._gab('adding text "{0}" to contents'.format(debug_dump_long_string(string_value(item))))
                result['text'] = self._append_to_text(result.get('text', ''), string_value(item))
            else:
                value_desc = debug_dump_node(item) if is_any_node(item) else object_type_name(item)
                raise HqueryEvaluationError(
                    'Cannot use {0} as a content object in a computed JSON hash constructor'.format(value_desc)
                )

        self._process_filters(result)

        return JsonHash(result)


    def _append_to_text(self, so_far, more_content):
        """Append ``more_content`` to ``so_far``, separated by a space unless ``so_far`` is empty."""
        return '{0}{1}{2}'.format(so_far, ' ' if len(so_far) > 0 else '', more_content)


    def _gab(self, message):
        """Emit ``message`` as verbose output, prefixed to identify this constructor."""
        verbose_print('JSON hash constructor {0}'.format(message))


    def _process_filters(self, result):
        """Apply every registered filter, in registration order, to ``result`` in place."""
        for filter_fn in self.filters:
            filter_fn(result)


    def _process_tag(self, result, value):
        """Store the string value of element ``value`` under its tag name, collecting repeats into a list."""
        new_value = string_value(value)
        if value.name in result:
            if isinstance(result[value.name], list):
                result[value.name].append(new_value)
            else:
                result[value.name] = [result[value.name], new_value]
        else:
            result[value.name] = new_value
176 |
--------------------------------------------------------------------------------
/test/hquery/test_computed_json_construction.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | from test.hquery.hquery_test_util import query_html_doc
4 |
5 |
6 | def test_hash_constructor_turns_tags_into_tag_name_keys_with_tag_content_values():
7 | html_body = """
8 | foo
9 | bar
"""
10 | actual = json.loads(query_html_doc(html_body, 'hash { /html/body/* }'))
11 | assert actual['p'] == 'foo'
12 | assert actual['div'] == 'bar'
13 |
14 |
15 | def test_hash_constructor_turns_text_into_attribute_named_text():
16 | html_body = 'Hello, world!
'
17 | expected = '{"text": "Hello, world!"}'
18 | assert query_html_doc(html_body, 'hash { //p/text() }') == expected
19 | assert query_html_doc('', 'hash { "Hello, world!" }') == expected
20 |
21 |
22 | def test_hash_constructor_joins_discontinuous_text_from_content_sequence_with_spaces_in_between():
23 | html_body = 'vidi
'
24 | assert query_html_doc(html_body, 'hash { "veni", //p/text(), "vici" }') == '{"text": "veni vidi vici"}'
25 |
26 |
27 | def test_hash_constructor_coalesces_like_elements_into_an_array_by_default():
28 | html_body = """
29 | one
30 | two
31 | three
"""
32 |
33 | actual = json.loads(query_html_doc(html_body, 'hash { /html/body/* }'))
34 | assert isinstance(actual['p'], list)
35 | assert len(actual['p']) == 2
36 | assert actual['p'][1] == 'three'
37 | assert actual['div'] == 'two'
38 |
39 |
40 | def test_hash_constructor_array_filter_causes_matching_elements_to_be_put_in_an_array():
41 | html_body = """
42 | zero
43 | one
"""
44 | actual = json.loads(query_html_doc(html_body, 'hash {a:h1:} { /html/body/* }'))
45 |
46 | assert actual['p'] == 'one'
47 | assert isinstance(actual['h1'], list)
48 | assert len(actual['h1']) == 1
49 | assert actual['h1'][0] == 'zero'
50 |
51 |
52 | def test_hash_constructor_number_filter_causes_contents_of_matching_elements_to_be_interpreted_as_numbers():
53 | html_body = """
54 | 20
55 | 20
56 | 20.20 """
57 |
58 | actual = json.loads(query_html_doc(html_body, 'hash {n:div,h1:} { /html/body/* }'))
59 |
60 | assert actual['p'] == '20'
61 | assert actual['div'] == 20
62 | assert actual['h1'] == 20.2
63 |
64 |
65 | def test_hash_constructor_filters_can_be_combined():
66 | html_body = """
67 | 20
68 | 20
69 | 20.20 """
70 |
71 | actual = json.loads(query_html_doc(html_body, 'hash {a:p,h1:n:div,h1:} { /html/body/* }'))
72 | assert isinstance(actual['p'], list)
73 | assert isinstance(actual['h1'], list)
74 | assert actual['p'][0] == '20'
75 | assert actual['div'] == 20
76 | assert actual['h1'][0] == 20.2
77 |
78 | actual = json.loads(query_html_doc(html_body, 'hash {n:div,h1:a:p,h1:} { /html/body/* }'))
79 | assert isinstance(actual['p'], list)
80 | assert isinstance(actual['h1'], list)
81 | assert actual['p'][0] == '20'
82 | assert actual['div'] == 20
83 | assert actual['h1'][0] == 20.2
84 |
85 |
86 | def test_hash_constructor_mapping_filter_renames_attributes_derived_from_element_content():
87 | html_body = """
88 | foo
89 | bar
"""
90 |
91 | actual = json.loads(query_html_doc(html_body, 'hash {m:p>paragraph,div>other:} { /html/body/* }'))
92 |
93 | assert 'paragraph' in actual
94 | assert 'other' in actual
95 | assert 'p' not in actual
96 | assert 'div' not in actual
97 | assert actual['paragraph'] == 'foo'
98 | assert actual['other'] == 'bar'
99 |
100 |
101 | def test_hash_constructor_can_contain_a_sequence_assembled_from_node_sets():
102 | html_body = """
103 | foo
104 | bar
"""
105 |
106 | actual = json.loads(query_html_doc(html_body, 'hash { /html/body/p, /html/body/div }'))
107 |
108 | assert 'p' in actual
109 | assert 'div' in actual
110 | assert actual['p'] == 'foo'
111 | assert actual['div'] == 'bar'
112 |
113 |
def test_hash_keys_can_be_used_to_define_attributes_in_a_constructed_hash():
    """Each ``key: value`` pair in a hash constructor becomes an attribute of the resulting JSON object."""
    rendered = query_html_doc('', 'hash {foo: "bar", moe: "larry"}')
    actual = json.loads(rendered)

    for key, expected in (('foo', 'bar'), ('moe', 'larry')):
        assert key in actual
        assert actual[key] == expected
121 |
122 |
123 | def test_hash_keys_can_be_mixed_with_other_types_of_content_in_a_constructed_hash():
124 | html_body = """
125 | Wake up and go back to sleep!
126 | I'm trying to think, but nothing happens! """
127 |
128 | actual = json.loads(query_html_doc(html_body, 'hash {//moe, larry: "The pain goes away on payday.", //curly}'))
129 |
130 | assert 'moe' in actual
131 | assert 'larry' in actual
132 | assert 'curly' in actual
133 | assert actual['moe'] == 'Wake up and go back to sleep!'
134 | assert actual['larry'] == 'The pain goes away on payday.'
135 | assert actual['curly'] == "I'm trying to think, but nothing happens!"
136 |
137 |
def test_non_string_types_survive_conversion_to_json():
    """Numbers and booleans used as hash values come out as JSON numbers and booleans, not strings."""
    rendered = query_html_doc('', 'hash { integer: 1, float: 1.1, boolean: true() }')
    actual = json.loads(rendered)

    expected_types = (('integer', int), ('float', float), ('boolean', bool))
    assert all(name in actual for name, _ in expected_types)
    for name, json_type in expected_types:
        assert isinstance(actual[name], json_type)
145 |
146 |
def test_hash_can_contain_key_values_that_are_other_computed_json_objects():
    """Nested hash and array constructors used as values are embedded as JSON objects and arrays."""
    query = 'hash {a_hash: hash {foo: "bar"}, an_array: array {"one", 2}}'
    actual = json.loads(query_html_doc('', query))

    assert 'a_hash' in actual
    assert 'an_array' in actual

    nested_hash = actual['a_hash']
    nested_array = actual['an_array']
    assert isinstance(nested_hash, dict)
    assert isinstance(nested_array, list)

    assert 'foo' in nested_hash
    assert nested_hash['foo'] == 'bar'

    assert len(nested_array) == 2
    assert nested_array[0] == 'one'
    assert nested_array[1] == 2
159 |
160 |
def test_element_value_in_hash_key_is_transformed_into_string_value_by_default():
    """An element used as a hash key's value contributes its string value to the resulting JSON."""
    # NOTE(review): fixture markup reconstructed as a single <p> element; confirm against the repository.
    html_body = '<p>you are here</p>'

    actual = json.loads(query_html_doc(html_body, 'hash { placement: //p }'))

    # The original compared the whole parsed object to a string and never asserted the result.
    assert actual['placement'] == 'you are here'
165 |
166 |
167 | def test_array_constructor_uses_string_value_of_elements_when_given_node_sets_as_contents():
168 | html_body = """
169 | one
170 | two
171 | three
"""
172 |
173 | actual = json.loads(query_html_doc(html_body, 'array { //p, //div }'))
174 |
175 | assert len(actual) == 3
176 | assert actual[0] == 'one'
177 | assert actual[1] == 'three'
178 | assert actual[2] == 'two'
179 |
180 |
def test_array_constructor_properly_handles_hash_constructors_as_contents():
    """An array built from a sequence of hash constructors contains one JSON object per sequence item."""
    rendered = query_html_doc('', 'array { (0 to 2) -> hash {value: $_} }')
    actual = json.loads(rendered)

    assert len(actual) == 3
    assert all('value' in entry for entry in actual)
    assert all(entry['value'] == position for position, entry in enumerate(actual))
187 |
188 |
189 | def test_text_content_normalization_is_applied_to_attribute_values_in_hash_constructor():
190 | preserved = u'\u00a0non\u00a0breaking\u00a0spaces '
191 | html_body = u'{0}
'.format(preserved)
192 |
193 | actual = json.loads(query_html_doc(html_body, 'hash {para: //p/text()}'))
194 | assert actual['para'] == 'non breaking spaces'
195 |
196 | actual = json.loads(query_html_doc(html_body, 'hash {para: //p/text()}', preserve_space=True))
197 | assert actual['para'] == preserved
198 |
--------------------------------------------------------------------------------
/test/hquery/test_core_functions.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | sys.path.insert(0, os.path.abspath('../..'))
5 |
6 | from ..common_test_util import expected_result
7 | from test.hquery.hquery_test_util import query_html_doc
8 |
9 |
def test_boolean_function_converts_numbers_according_to_w3c_rules():
    """Zero, negative zero and NaN convert to false; other numbers convert to true."""
    cases = (
        ('boolean(0)', 'false'),
        ('boolean(-0)', 'false'),
        ('boolean(1)', 'true'),
        ('boolean(-1)', 'true'),
        ('false() = boolean(false())', 'true'),
        ('boolean(0 div 0)', 'false'),
    )
    for query, outcome in cases:
        assert query_html_doc('', query) == expected_result(outcome)
17 |
18 |
19 | def test_boolean_function_converts_node_sets_according_to_w3c_rules():
20 | assert query_html_doc('
', 'boolean(//div)') == expected_result('true')
21 | assert query_html_doc('
', 'boolean(//p)') == expected_result('false')
22 |
23 |
def test_boolean_function_converts_strings_according_to_w3c_rules():
    """Only the empty string converts to false; a string of whitespace is still true."""
    for query, outcome in (('boolean("")', 'false'), ('boolean(" ")', 'true')):
        assert query_html_doc('', query) == expected_result(outcome)
27 |
28 |
def test_ceiling_returns_expected_integer_values_baserd_on_xpath_3_examples():
    """``ceiling`` rounds up towards positive infinity for both positive and negative input."""
    for query, outcome in (('ceiling(10.5)', '11'), ('ceiling(-10.5)', '-10')):
        assert query_html_doc('', query) == outcome
32 |
33 |
def test_floor_returns_expected_integer_values_baserd_on_xpath_3_examples():
    """``floor`` rounds down towards negative infinity for both positive and negative input."""
    for query, outcome in (('floor(10.5)', '10'), ('floor(-10.5)', '-11')):
        assert query_html_doc('', query) == outcome
37 |
38 |
39 | def test_id_function_returns_node_set_where_node_ids_match_any_names_in_whitespace_separated_list():
40 | html_body = """
41 | one
42 | two
43 | three
"""
44 | assert query_html_doc(html_body, 'id("one")') == expected_result("""
45 |
46 | one
47 |
""")
48 | assert query_html_doc(html_body, 'id("one 3")') == expected_result("""
49 |
50 | one
51 |
52 |
53 | three
54 |
""")
55 | assert query_html_doc(html_body, 'id(3)') == expected_result("""
56 |
57 | three
58 |
""")
59 |
60 |
61 | def test_id_function_crazy_use_case_where_id_values_are_derived_from_string_values_of_nodes_in_node_set():
62 | html_body = """
63 |
67 | one
68 | two
"""
69 | assert query_html_doc(html_body, 'id(//li)') == expected_result("""
70 |
71 | one
72 |
73 |
74 | two
75 |
""")
76 |
77 |
78 | def test_name_function_returns_tag_name_of_given_element_or_first_element_if_given_a_node_set():
79 | html_body = '
'
80 | assert query_html_doc(html_body, 'name(/html/body/*)') == 'div'
81 |
82 |
83 | def test_name_function_returns_name_of_context_node_if_passed_no_argument():
84 | html_body = """
85 | not selected
86 | selected
"""
87 | assert query_html_doc(html_body, '/html/body/*[name() = "p"]') == expected_result("""
88 |
89 | selected
90 |
""")
91 |
92 |
93 | def test_name_function_returns_empty_string_if_passed_a_node_that_is_not_an_element():
94 | html_body = 'Text comes first then element '
95 | assert query_html_doc(html_body, 'name(/html/body/node()[1])') == ''
96 | assert query_html_doc(html_body, 'name(/html/body/node()[2])') == 'span'
97 |
98 |
def test_not_function_produces_expected_results():
    """``not`` negates booleans and the boolean interpretation of numbers."""
    cases = (
        ('not(false())', 'true'),
        ('not(not("foo" = "bar"))', 'false'),
        ('not(0)', 'true'),
        ('not(10000)', 'false'),
    )
    for query, outcome in cases:
        assert query_html_doc('', query) == expected_result(outcome)
104 |
105 |
def test_number_function_converts_string_to_number():
    """Strings holding integers or decimals convert to numbers that take part in arithmetic."""
    cases = (
        ('number("43") + number("-1")', '42'),
        ('number("10") + number("1.11")', '11.11'),
    )
    for query, outcome in cases:
        assert query_html_doc('', query) == expected_result(outcome)
109 |
110 |
def test_number_function_converts_boolean_values_to_one_and_zero():
    """``number`` maps true to 1 and false to 0."""
    for query, outcome in (('number(true())', '1'), ('number(false())', '0')):
        assert query_html_doc('', query) == expected_result(outcome)
114 |
115 |
116 | def test_number_function_converts_node_set_based_on_string_value_of_first_node_in_doc_order():
117 | html_body = """
118 |
123 | 24
"""
124 | assert query_html_doc(html_body, 'number(//p)') == expected_result('98.6')
125 |
126 |
def test_round_function_follows_xpath_1_rules_for_positive_numbers_but_python_rules_for_negative_ones():
    """
    Positive and negative infinity, zero, and the numeric type rules of the XPath 3.0 functions spec are
    deliberately not covered. As the test name says, negative numbers follow Python's rounding (away from
    zero) rather than the XPath 1 rules, since HQuery is not intended as an execution target for existing
    XPath code.
    """
    cases = (
        ('round(5.49)', '5'),
        ('round(5.5)', '6'),
        ('round(1 div 0)', 'NaN'),
        ('round(-5.5)', '-6'),
        ('round(-5.49)', '-5'),
    )
    for query, outcome in cases:
        assert query_html_doc('', query) == outcome
139 |
140 |
def test_round_function_supports_an_optional_precision_argument():
    """A second argument to ``round`` gives the number of decimal places to keep."""
    cases = (
        ('round(3.456)', '3'),
        ('round(3.456, 1)', '3.5'),
        ('round(3.456, 2)', '3.46'),
        ('round(3.456, 3)', '3.456'),
    )
    for query, outcome in cases:
        assert query_html_doc('', query) == outcome
146 |
147 |
def test_substring_function_behaves_reasonably_and_lets_agree_to_ignore_all_that_NaN_crap():
    """``substring`` handles fractional, zero, negative and end-of-string start positions."""
    cases = (
        ('substring("12345", 1.5, 2.6)', '234'),
        ('substring("12345", 0, 3)', '12'),
        ('substring("12345", -1, 3)', '1'),
        ('substring("12345", 5, 2)', '5'),
    )
    for query, outcome in cases:
        assert query_html_doc('', query) == outcome
153 |
154 |
def test_substring_after_and_substring_before_work_per_spec():
    """Both functions split on the first occurrence of the separator and return '' when it is absent."""
    cases = (
        ('substring-after("1999/04/01", "/")', '04/01'),
        ('substring-after("1999/04/01", ":")', ''),
        ('substring-before("1999/04/01", "/")', '1999'),
        ('substring-before("1999/04/01", ":")', ''),
    )
    for query, outcome in cases:
        assert query_html_doc('', query) == outcome
160 |
161 |
def test_true_and_false_functions_return_expected_values():
    """``true()`` and ``false()`` produce distinct boolean values that compare as expected."""
    cases = (
        ('false()', 'false'),
        ('true()', 'true'),
        ('true() = false()', 'false'),
        ('true() != false()', 'true'),
    )
    for query, outcome in cases:
        assert query_html_doc('', query) == expected_result(outcome)
167 |
168 |
169 | def test_position_function_in_predicate_applies_to_current_step_only():
170 | html_body = """
171 |
172 |
173 | one
174 | two
175 |
176 |
177 | uno
178 | dos
179 |
180 |
181 | ichi
182 | ni
183 |
184 |
"""
185 | assert query_html_doc(html_body, '//tr[@class="select-me"]/td[position()=2]') == expected_result("""
186 |
187 | two
188 |
189 |
190 | ni
191 | """)
192 |
193 |
194 | def test_position_function_in_second_predicate_applies_to_results_from_first_predicate():
195 | html_body = """
196 |
197 |
198 | one
199 | two
200 |
201 |
202 | uno
203 | dos
204 |
205 |
206 | ichi
207 | ni
208 |
209 |
"""
210 | assert query_html_doc(html_body, '//td[../@class="select-me"][position()=1]') == expected_result("""
211 |
212 | one
213 |
214 |
215 | ichi
216 | """)
217 |
218 |
219 | def test_string_function_returns_expected_results_for_various_objects():
220 | html_body = """
221 | one
222 | two
"""
223 |
224 | assert query_html_doc(html_body, 'string(//p)') == expected_result('one')
225 | assert query_html_doc('', 'string(2 div 0)') == expected_result('NaN')
226 | assert query_html_doc('', 'string(-0)') == expected_result('0')
227 | assert query_html_doc('', 'string(-9)') == expected_result('-9')
228 | assert query_html_doc('', 'string(98.6)') == expected_result('98.6')
229 | assert query_html_doc('', 'string(true())') == expected_result('true')
230 | assert query_html_doc('', 'string(1 = -1)') == expected_result('false')
231 |
232 |
233 | def test_string_value_of_an_element_with_mixed_content_inserts_proper_spaces_between_text_runs():
234 | html_body = 'once twice thrice
'
235 | assert query_html_doc(html_body, 'string(//p)') == expected_result('once twice thrice')
236 |
237 |
def test_string_length_function_returns_expected_values():
    """``string-length`` counts the characters of its argument, giving 0 for the empty string."""
    for query, outcome in (('string-length("foo")', '3'), ('string-length("")', '0')):
        assert query_html_doc('', query) == expected_result(outcome)
241 |
242 |
243 | def test_sum_function_sums_number_interpretation_of_items_in_sequence():
244 | html_body = """
245 | 30
246 |
247 | 2 """
248 |
249 | assert query_html_doc(html_body, 'sum(//span)') == '32'
250 | assert query_html_doc(html_body, 'sum((//span, //div/@value))') == '42.42'
251 |
252 |
def test_sum_function_supports_zero_value_for_empty_sequence_as_second_argument():
    """With nothing to sum, ``sum`` returns its second argument."""
    empty_sequence_sum = query_html_doc('', 'sum(//span, "zero")')
    assert empty_sequence_sum == 'zero'
255 |
256 |
257 | def test_various_functions_use_context_node_when_no_argument_passed():
258 | html_body = """
259 | first
260 | foo bar
261 | last
"""
262 |
263 | assert query_html_doc(html_body, '//p[string() = "first"]/text()') == expected_result('first')
264 | assert query_html_doc(html_body, '//p[normalize-space() = "foo bar"]/text()', preserve_space=True) == \
265 | expected_result('foo bar')
266 | assert query_html_doc(html_body, '//p[string-length() = 4]/text()') == expected_result('last')
267 |
--------------------------------------------------------------------------------
/test/hquery/test_xpath1_abbreviated_samples.py:
--------------------------------------------------------------------------------
1 | from hq.soup_util import make_soup
2 | from test.common_test_util import expected_result
3 | from test.hquery.hquery_test_util import query_context_node
4 |
5 |
6 | def test_selects_the_para_element_children_of_the_context_node():
7 | html = """
8 |
9 | selected
10 | not selected
11 | also selected
12 | """
13 | assert query_context_node(html, 'para') == expected_result("""
14 |
15 | selected
16 |
17 |
18 | also selected
19 | """)
20 |
21 |
22 | def test_selects_all_element_children_of_the_context_node():
23 | html = """
24 |
25 |
26 | selected
27 | non-selected text
28 | also selected
29 | """
30 | assert query_context_node(html, '*') == expected_result("""
31 |
32 | selected
33 |
34 |
35 | also selected
36 | """)
37 |
38 |
39 | def test_selects_all_text_node_children_of_the_context_node():
40 | html = """
41 |
42 | first
43 | second
44 | third
45 | """
46 | actual = query_context_node(html, 'text()')
47 | assert 'first' in actual
48 | assert 'second' not in actual
49 | assert 'third' in actual
50 |
51 |
52 | def test_selects_the_name_attribute_of_the_context_node():
53 | html = 'not value '
54 | assert query_context_node(html, '@name') == expected_result('name="value"')
55 |
56 |
57 | def test_selects_all_the_attributes_of_the_context_node():
58 | html = ' '
59 | assert query_context_node(html, '@*') == expected_result('''
60 | first="first value"
61 | second="second value"
62 | third="third value"''')
63 |
64 |
65 | def test_selects_the_first_para_child_of_the_context_node():
66 | html = """
67 |
68 | selected
69 | not selected
70 | """
71 | assert query_context_node(html, 'para[1]') == expected_result("""
72 |
73 | selected
74 | """)
75 |
76 |
77 | def test_selects_the_last_para_child_of_the_context_node():
78 | html = """
79 |
80 | not selected
81 | also not selected
82 | selected
83 | """
84 | assert query_context_node(html, 'para[last()]') == expected_result("""
85 |
86 | selected
87 | """)
88 |
89 |
90 | def test_selects_all_para_grandchildren_of_the_context_node():
91 | html = """
92 |
93 |
94 | not selected
95 | selected
96 | also selected
97 |
98 | """
99 | assert query_context_node(html, '*/para') == expected_result("""
100 |
101 | selected
102 |
103 |
104 | also selected
105 | """)
106 |
107 |
108 | def test_selects_the_second_section_of_the_fifth_chapter_of_the_doc():
109 | html = """
110 |
111 | one
112 | two
113 | three
114 | four
115 |
116 |
117 |
118 |
119 | """
120 | assert query_context_node(html, '/doc/chapter[5]/section[2]') == expected_result("""
121 |
122 | five point two
123 | """)
124 |
125 |
126 | def test_selects_the_para_element_descendants_of_the_chapter_element_children_of_the_context_node():
127 | html = """
128 |
129 | not selected
130 |
131 |
132 | selected
133 |
134 |
135 | """
136 | assert query_context_node(html, 'chapter//para') == expected_result("""
137 |
138 |
139 | selected
140 |
141 |
142 |
143 | selected
144 | """)
145 |
146 |
147 | def test_selects_all_the_para_descendants_of_the_document_root_and_thus_selects_all_para_elements_in_the_same_document_as_the_context_node():
148 | html = """
149 |
150 |
151 | selected
152 |
153 |
154 | also selected
155 | """
156 | soup = make_soup(html)
157 | assert query_context_node(soup.root.context, '//para') == expected_result("""
158 |
159 |
160 | selected
161 |
162 |
163 |
164 | selected
165 |
166 |
167 | also selected
168 | """)
169 |
170 |
171 | def test_selects_all_the_item_elements_in_the_same_document_as_the_context_node_that_have_an_olist_parent():
172 | html = """
173 |
174 | no items
175 | - not selected
176 |
177 |
178 | - first
179 |
180 | -
181 |
182 | - second
183 |
184 |
185 | """
186 | soup = make_soup(html)
187 | assert query_context_node(soup.root.context, '//olist/item') == expected_result("""
188 | -
189 | first
190 |
191 | -
192 | second
193 |
""")
194 |
195 |
196 | def test_selects_the_context_node():
197 | html = """
198 |
199 | selected
200 | """
201 | assert query_context_node(html, '.') == expected_result("""
202 |
203 | selected
204 | """)
205 |
206 |
207 | def test_selects_the_para_element_descendants_of_the_context_node():
208 | html = """
209 |
210 |
211 | selected
212 | not selected
213 |
214 | also selected
215 |
216 |
217 | """
218 | soup = make_soup(html)
219 | assert query_context_node(soup.para.context, './/para') == expected_result("""
220 |
221 | selected
222 |
223 |
224 |
225 | also selected
226 |
227 |
228 |
229 | also selected
230 | """)
231 |
232 |
233 | def test_selects_the_parent_of_the_context_node():
234 | html = """
235 |
236 |
237 | """
238 | soup = make_soup(html)
239 | assert query_context_node(html, '..') == expected_result("""
240 |
241 |
242 |
243 | """)
244 |
245 |
246 | def test_selects_the_lang_attribute_of_the_parent_of_the_context_node():
247 | html = """
248 |
249 |
250 | """
251 | soup = make_soup(html)
252 | assert query_context_node(soup.root.context, '../@lang') == expected_result('lang="English"')
253 |
254 |
255 | def test_selects_all_para_children_of_the_context_node_that_have_a_type_attribute_with_value_warning():
256 | html = """
257 |
258 | not selected
259 | selected
260 | not selected
261 | also selected
262 | """
263 | assert query_context_node(html, 'para[@type="warning"]') == expected_result("""
264 |
265 | selected
266 |
267 |
268 | also selected
269 | """)
270 |
271 |
272 | def test_selects_the_fifth_para_child_of_the_context_node_that_has_a_type_attribute_with_value_warning():
273 | html = """
274 |
275 | first error
276 | first warning
277 | second error
278 | second warning
279 | third error
280 | third warning
281 | fourth error
282 | fourth warning
283 | fifth error
284 | fifth warning
285 | """
286 | assert query_context_node(html, 'para[@type="warning"][5]') == expected_result("""
287 |
288 | fifth warning
289 | """)
290 |
291 |
292 | def test_selects_the_fifth_para_child_of_the_context_node_if_that_child_has_a_type_attribute_with_value_warning():
293 | html = """
294 |
295 | not selected
296 | not selected
297 | not selected
298 | not selected
299 | selected
300 | """
301 | assert query_context_node(html, 'para[5][@type="warning"]') == expected_result("")
302 | assert query_context_node(html.replace('error', 'warning'), 'para[5][@type="warning"]') == expected_result("""
303 |
304 | selected
305 | """)
306 |
307 |
308 | def test_selects_the_chapter_children_of_the_context_node_that_have_one_or_more_title_children_with_string_value_equal_to_Introduction():
309 | html = """
310 |
311 |
312 | Introduction
313 |
314 | not selected
315 |
316 | Author's Note
317 |
318 |
319 | Introduction
320 | Hello, I'm chapter.
321 |
322 | """
323 | assert query_context_node(html, 'chapter[title="Introduction"]') == expected_result("""
324 |
325 |
326 | Introduction
327 |
328 |
329 |
330 |
331 | Introduction
332 |
333 |
334 | Hello, I'm chapter.
335 |
336 | """)
337 |
338 |
339 | def test_selects_the_chapter_children_of_the_context_node_that_have_one_or_more_title_children():
340 | html = """
341 |
342 |
343 | selected
344 |
345 |
346 |
347 |
348 |
349 | also selected
350 |
351 | """
352 | assert query_context_node(html, 'chapter[title]') == expected_result("""
353 |
354 |
355 | selected
356 |
357 |
358 |
359 |
360 | also selected
361 |
362 | """)
363 |
364 |
365 | def test_selects_all_the_employee_children_of_the_context_node_that_have_both_a_secretary_attribute_and_an_assistant_attribute():
366 | html = """
367 |
368 |
369 | selected
370 |
371 |
372 | """
373 | assert query_context_node(html, 'employee[@secretary and @assistant]') == expected_result("""
374 |
375 | selected
376 |
377 |
378 | """)
379 |
--------------------------------------------------------------------------------
/test/hquery/test_axes.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import sys
4 |
5 | from hq.hquery.syntax_error import HquerySyntaxError
6 | from pytest import raises
7 |
8 | sys.path.insert(0, os.path.abspath('../..'))
9 |
10 | from ..common_test_util import expected_result
11 | from test.hquery.hquery_test_util import query_html_doc
12 |
13 |
14 | def test_explicit_child_axis():
15 | html_body = """
16 | """
19 | assert query_html_doc(html_body, '//div/child::p') == expected_result("""
20 |
21 | foo
22 |
""")
23 |
24 |
25 | def test_child_axis_selects_only_immediate_children():
26 | html_body = """
27 | uncle
28 |
29 |
niece
30 |
nephew
31 |
"""
32 | assert query_html_doc(html_body, '/html/body/child::p') == expected_result("""
33 |
34 | uncle
35 |
""")
36 |
37 |
38 | def test_descendant_axis_selects_from_descendants_not_ancestors():
39 | html_body = """
40 | """
48 | actual = query_html_doc(html_body, '/html/body/div/descendant::div')
49 | assert actual == expected_result("""
50 |
51 | uncle
52 |
53 |
54 | niece
55 |
""")
56 |
57 |
58 | def test_descendant_axis_returns_all_descendants_and_only_descendants_of_nodes_matching_node_test():
59 | html_body = """
60 |
65 |
66 | not selected
67 | not selected
"""
68 | expected = expected_result("""
69 |
70 |
71 | selected
72 |
73 |
74 |
75 | selected
76 |
""")
77 |
78 | assert query_html_doc(html_body, '/html/body/div/descendant::div') == expected
79 | assert query_html_doc(html_body, '/html/body/div/~::div') == expected
80 |
81 |
82 | def test_descendant_or_self_axis_returns_all_descendants_and_context_node_if_it_matches_node_test():
83 | html_body = """
84 |
87 | bar
"""
88 | assert query_html_doc(html_body, '/html/body/descendant-or-self::div') == expected_result("""
89 |
94 |
95 | foo
96 |
97 |
98 | bar
99 |
""")
100 |
101 |
102 | def test_descendant_or_self_axis_does_not_produce_self_if_node_test_does_not_match():
103 | html_body = """
104 | """
107 | assert query_html_doc(html_body, '//div/descendant-or-self::p') == expected_result("""
108 |
109 | foo
110 |
""")
111 |
112 |
113 | def test_parent_axis_returns_parent_of_tag_node():
114 | assert query_html_doc('
', '//div/parent::*') == expected_result("""
115 |
116 |
117 |
118 | """)
119 |
120 |
121 | def test_parent_axis_selects_only_the_immediate_parent():
122 | html_body = """
123 | """
128 | actual = query_html_doc(html_body, '//p/parent::div')
129 | assert actual == expected_result("""
130 |
131 |
132 | daughter
133 |
134 |
""")
135 |
136 |
137 | def test_parent_axis_returns_parents_for_multiple_matching_nodes():
138 | html_body = """
139 |
143 | """
147 | assert query_html_doc(html_body, '//p/parent::*') == expected_result(html_body)
148 |
149 |
150 | def test_parent_axis_produces_nothing_for_root_element():
151 | assert query_html_doc('', '/html/parent::*') == expected_result('')
152 | assert query_html_doc('
', 'div/parent::*', wrap_body=False) == expected_result('')
153 |
154 |
155 | def test_ancestor_axis_selects_all_matching_ancestors():
156 | html_body = """
157 | """
164 | expected = expected_result("""
165 |
166 |
167 |
168 |
169 | text
170 |
171 |
172 |
173 |
174 |
175 |
176 | text
177 |
178 |
""")
179 |
180 | assert query_html_doc(html_body, '//p/ancestor::div') == expected
181 | assert query_html_doc(html_body, '//p/^::div') == expected
182 |
183 |
184 | def test_ancestor_axis_produces_all_ancestors_and_only_ancestors():
185 | html_body = """
186 |
187 |
188 |
189 |
190 |
191 |
192 | """
193 | assert query_html_doc(html_body, '//div/ancestor::*', wrap_body=False) == expected_result("""
194 |
195 |
196 |
197 |
198 |
199 |
200 |
201 |
202 |
203 |
204 |
205 |
206 |
207 |
208 |
209 | """)
210 |
211 |
212 | def test_ancestor_or_self_axis_produces_ancestors_and_self_when_node_test_is_a_match():
213 | html_body = """
214 | """
217 | expected = expected_result("""
218 |
219 |
220 | foo
221 |
222 |
223 |
224 | foo
225 |
""")
226 |
227 | assert query_html_doc(html_body, '/html/body/div/div/ancestor-or-self::div') == expected
228 | assert query_html_doc(html_body, '/html/body/div/div/^^::div') == expected
229 |
230 |
231 | def test_following_sibling_axis_selects_all_following_siblings_and_only_following_siblings_that_match_name_test():
232 | html_body = """
233 |
234 |
235 |
236 | moe
237 |
238 |
239 | larry
240 |
241 | curly
242 | """
243 | expected = expected_result("""
244 |
245 | moe
246 |
247 |
248 | curly
249 |
""")
250 |
251 | assert query_html_doc(html_body, '//div/following-sibling::p') == expected
252 | assert query_html_doc(html_body, '//div/>::p') == expected
253 |
254 |
255 | def test_following_sibling_axis_works_with_node_test():
256 | html_body = """
257 |
258 | foo
259 |
260 | bar
261 |
"""
262 | assert query_html_doc(html_body, '//p/following-sibling::text()') == expected_result('bar')
263 | assert query_html_doc('
foo
', '//div/following-sibling::*') == expected_result("""
264 |
265 | foo
266 |
""")
267 |
268 |
269 | def test_preceding_sibling_axis_works_with_name_test():
270 | html_body = """
271 | foo
272 |
273 | bar
"""
274 | expected = expected_result("""
275 |
276 | foo
277 |
""")
278 |
279 | assert query_html_doc(html_body, '//div/preceding-sibling::p') == expected
280 | assert query_html_doc(html_body, '//div/<::p') == expected
281 |
282 |
283 | def test_preceding_sibling_axis_works_with_node_test():
284 | html_body = """
285 | foo
286 | bar
287 |
288 | nothing
"""
289 | assert query_html_doc(html_body, '//div/preceding-sibling::node()') == expected_result("""
290 |
291 | foo
292 |
293 |
294 | bar
295 |
""")
296 |
297 |
298 | def test_preceding_sibling_axis_returns_nodes_in_document_order():
299 | """Node sets are unordered, but people really seem to like these being in document order."""
300 | html_body = """
301 | foo
302 | bar
303 |
"""
304 | assert query_html_doc(html_body, '//div/preceding-sibling::p') == expected_result("""
305 |
306 | foo
307 |
308 |
309 | bar
310 |
""")
311 |
312 |
313 | def test_following_axis_finds_all_following_nodes_that_match():
314 | html_body = """
315 |
316 | moe
317 |
320 |
323 |
324 | shemp
"""
325 | expected = expected_result("""
326 |
327 | curly
328 |
329 |
330 | shemp
331 |
""")
332 |
333 | assert query_html_doc(html_body, '//aside/following::p') == expected
334 | assert query_html_doc(html_body, '//aside/>>::p') == expected
335 |
336 |
337 | def test_preceding_axis_finds_all_preceding_nodes_that_match_node_test():
338 | html_body = """
339 | foo
340 |
343 | """
344 | actual = query_html_doc(html_body, '//span/preceding::text()')
345 | actual = re.sub(r'\W+', ' ', actual)
346 | assert actual == 'foo bar'
347 |
348 |
349 | def test_preceding_axis_finds_all_preceding_nodes_that_match():
350 | html_body = """
351 | moe
352 |
353 |
356 |
359 | shemp
360 | """
361 | expected = expected_result("""
362 |
363 | moe
364 |
365 |
366 | larry
367 |
""")
368 |
369 | assert query_html_doc(html_body, '//aside/preceding::p') == expected
370 | assert query_html_doc(html_body, '//aside/<<::p') == expected
371 |
372 |
373 | def test_preceding_axis_produces_results_in_document_order_and_also_works_with_node_test():
374 | html_body = """
375 | moe
376 |
377 |
382 |
385 | shemp
386 |
387 | """
388 | assert query_html_doc(html_body, '//script/preceding::p/text()') == expected_result("""
389 | moe
390 | larry
391 | curly
392 | shemp""")
393 |
394 |
395 | def test_attribute_axis_in_full_and_abbreviated_form_selects_named_attributes_or_all_attributes():
396 | html_body = """
397 |
398 |
"""
399 | expected_ids_result = expected_result('''
400 | id="one"
401 | id="two"''')
402 | expected_all_result = expected_result('''
403 | id="one"
404 | class="three"
405 | id="two"''')
406 | assert query_html_doc(html_body, '//div/attribute::id') == expected_ids_result
407 | assert query_html_doc(html_body, '//div/@id') == expected_ids_result
408 | assert query_html_doc(html_body, '//attribute::*') == expected_all_result
409 | assert query_html_doc(html_body, '//@*') == expected_all_result
410 |
411 |
412 | def test_attribute_axis_matching_any_attribute_produces_attributes_from_each_element_in_alphabetical_order():
413 | html_body = """
414 |
415 | """
416 | actual = query_html_doc(html_body, '//span/@*')
417 | assert re.sub(r'\w+="(\d)"\n?', r'\1', actual) == '123456'
418 |
419 |
420 | def test_self_axis_applies_only_to_self():
421 | html_body = """
422 | """
427 | assert query_html_doc(html_body, '/html/body/div/div/self::div') == expected_result("""
428 | """)
432 |
433 |
434 | def test_css_class_axis_finds_elements_based_on_their_css_classes():
435 | html_body = """
436 | foo
437 | foo bar
438 | bar
"""
439 | expected = expected_result("""
440 |
441 | foo bar
442 |
443 |
444 | bar
445 |
""")
446 |
447 | assert query_html_doc(html_body, '//class::bar') == expected
448 | assert query_html_doc(html_body, '//.::bar') == expected
449 |
450 |
451 | def test_css_class_axis_can_only_be_followed_by_name_test():
452 | with raises(HquerySyntaxError):
453 | assert query_html_doc('', '/.::node()')
454 |
--------------------------------------------------------------------------------
/test/hquery/test_xpath1_unabbreviated_samples.py:
--------------------------------------------------------------------------------
1 | """
2 | These tests verify results from the samples of unabbreviated location paths in the W3C XPath 1.0 specification (chapter
3 | 2, Location Paths).
4 |
5 | https://www.w3.org/TR/xpath/#location-paths
6 | """
7 |
8 | import os
9 | import sys
10 |
11 | from hq.soup_util import make_soup
12 |
13 | sys.path.insert(0, os.path.abspath('../..'))
14 |
15 | from ..common_test_util import expected_result
16 | from test.hquery.hquery_test_util import query_html_doc, query_context_node
17 |
18 |
19 | def test_selects_the_para_element_children_of_the_context_node():
20 | assert query_context_node(" ", 'child::para') == expected_result("""
21 |
22 | """)
23 |
24 |
25 | def test_selects_all_text_node_children_of_the_context_node():
26 | html = "beforeduring after "
27 | assert query_context_node(html, 'child::text()') == expected_result("""
28 | before
29 | after""")
30 |
31 |
32 | def test_selects_all_the_children_of_the_context_node_whatever_their_node_type():
33 | html = """
34 | selected text
35 |
36 |
37 | """
38 | assert query_context_node(html, 'child::node()') == expected_result("""
39 | selected text
40 |
41 |
42 | """)
43 |
44 |
45 | def test_selects_the_name_attribute_of_the_context_node():
46 | html = " "
47 | assert query_context_node(html, 'attribute::name') == expected_result('name="selected"')
48 |
49 |
50 | def test_selects_all_the_attributes_of_the_context_node():
51 | html = " "
52 | assert query_context_node(html, 'attribute::*') == expected_result('''
53 | bar="foo"
54 | foo="bar"''')
55 |
56 |
57 | def test_selects_the_para_element_descendants_of_the_context_node():
58 | html = """
59 |
60 |
61 |
62 |
63 | """
64 | assert query_context_node(html, 'descendant::para') == expected_result("""
65 |
66 |
67 |
68 |
69 |
70 | """)
71 |
72 |
73 | def test_selects_all_div_ancestors_of_the_context_node():
74 | html = """
75 |
76 |
77 |
"""
78 | assert query_context_node(make_soup(html).div.notdiv, 'ancestor::div') == expected_result("""
79 |
80 |
81 |
82 |
""")
83 |
84 |
85 | def test_selects_the_div_ancestors_of_the_context_node_and_if_the_context_node_is_a_div_element_the_context_node_as_well():
86 | html = """
87 | """
91 | soup = make_soup(html)
92 | assert query_context_node(soup.div.div, 'ancestor-or-self::div') == expected_result("""
93 |
99 |
100 |
""")
101 | assert query_context_node(soup.div.notdiv, 'ancestor-or-self::div') == expected_result("""
102 |
103 |
104 |
105 |
106 |
107 |
""")
108 |
109 |
110 | def test_selects_the_para_element_descendants_of_the_context_node_and_if_the_context_node_is_a_para_element_the_context_node_as_well():
111 | context_is_para = """
112 |
113 | foo
114 | bar
115 | """
116 | context_is_not_para = """
117 |
118 | foo
119 | bar
120 | """
121 | assert query_context_node(context_is_para, 'descendant-or-self::para') == expected_result("""
122 |
123 |
124 | foo
125 |
126 |
127 | bar
128 |
129 |
130 |
131 | foo
132 |
133 |
134 | bar
135 | """)
136 | assert query_context_node(context_is_not_para, 'descendant-or-self::para') == expected_result("""
137 |
138 | foo
139 |
140 |
141 | bar
142 | """)
143 |
144 |
145 | def test_selects_the_context_node_if_it_is_a_para_element_and_otherwise_selects_nothing():
146 | is_para = " "
147 | is_not_para = " "
148 | assert query_context_node(is_para, 'self::para') == expected_result("""
149 |
150 | """)
151 | assert query_context_node(is_not_para, 'self::para') == ''
152 |
153 |
154 | def test_selects_the_para_element_descendants_of_the_chapter_element_children_of_the_context_node():
155 | html = """
156 |
157 |
158 | not selected
159 |
160 |
161 |
164 |
165 | """
166 | assert query_context_node(html, 'child::chapter/descendant::para') == expected_result("""
167 |
168 | selected
169 | """)
170 |
171 |
172 | def test_selects_all_para_grandchildren_of_the_context_node():
173 | html = """
174 |
175 | not selected
176 |
177 | not selected
178 | selected
179 |
180 |
181 | also selected
182 |
183 | """
184 | assert query_context_node(html, 'child::*/child::para') == expected_result("""
185 |
186 | selected
187 |
188 |
189 | also selected
190 | """)
191 |
192 |
193 | def test_selects_the_document_root_which_is_always_the_parent_of_the_document_element():
194 | html = """
195 |
196 |
197 | """
198 | assert query_context_node(html, '/') == expected_result(html)
199 |
200 |
201 | def test_selects_all_the_para_elements_in_the_same_document_as_the_context_node():
202 | html = """
203 |
204 |
205 | selected
206 | """
207 | soup = make_soup(html)
208 | assert query_context_node(soup.root.notpara, '/descendant::para') == expected_result("""
209 |
210 | selected
211 | """)
212 |
213 |
214 | def test_selects_all_the_item_elements_that_have_an_olist_parent_and_that_are_in_the_same_document_as_the_context_node():
215 | html = """
216 |
217 |
218 |
219 | not selected
220 | - selected
221 |
222 | """
223 | soup = make_soup(html)
224 | assert query_context_node(soup.root.notolist, '/descendant::olist/child::item') == expected_result("""
225 | -
226 | selected
227 |
""")
228 |
229 |
230 | def test_selects_the_first_para_child_of_the_context_node():
231 | html = """
232 |
233 | selected
234 | not selected
235 | """
236 | assert query_context_node(html, 'child::para[position()=1]') == expected_result("""
237 |
238 | selected
239 | """)
240 |
241 |
242 | def test_selects_the_last_para_child_of_the_context_node():
243 | html = """
244 |
245 | not selected
246 | selected
247 | """
248 | assert query_context_node(html, 'child::para[position()=last()]') == expected_result("""
249 |
250 | selected
251 | """)
252 |
253 |
254 | def test_selects_the_last_but_one_para_child_of_the_context_node():
255 | html = """
256 |
257 | not selected
258 | selected
259 | also not selected
260 | """
261 | assert query_context_node(html, 'child::para[position()=last()-1]') == expected_result("""
262 |
263 | selected
264 | """)
265 |
266 |
267 | def test_selects_all_the_para_children_of_the_context_node_other_than_the_first_para_child_of_the_context_node():
268 | html = """
269 |
270 | not selected
271 | selected
272 | also selected
273 | """
274 | assert query_context_node(html, 'child::para[position()>1]') == expected_result("""
275 |
276 | selected
277 |
278 |
279 | also selected
280 | """)
281 |
282 |
283 | def test_selects_the_next_chapter_sibling_of_the_context_node():
284 | html = """
285 |
286 |
287 | selected
288 | not selected
289 | """
290 | soup = make_soup(html)
291 | assert query_context_node(soup.root.context, 'following-sibling::chapter[position()=1]') == expected_result("""
292 |
293 | selected
294 | """)
295 |
296 |
297 | def test_selects_the_previous_chapter_sibling_of_the_context_node():
298 | html = """
299 |
300 | not selected
301 | selected
302 |
303 | """
304 | soup = make_soup(html)
305 | assert query_context_node(soup.root.context, 'preceding-sibling::chapter[position()=1]') == expected_result("""
306 |
307 | selected
308 | """)
309 |
310 |
311 | def test_selects_the_forty_second_figure_element_in_the_document():
312 | html_body = ''.join('{0} '.format(n) for n in range(1, 43))
313 | assert query_html_doc(html_body, '/descendant::figure[position()=42]') == expected_result("""
314 |
315 | 42
316 | """)
317 |
318 |
319 | def test_selects_the_second_section_of_the_fifth_chapter_of_the_doc_document_element():
320 | html_body = """
321 |
322 |
323 |
324 |
325 |
326 |
327 |
328 |
329 |
330 |
331 |
332 | """
333 | actual = query_html_doc(html_body,
334 | '/child::doc/child::chapter[position()=5]/child::section[position()=2]',
335 | wrap_body=False)
336 | assert actual == expected_result("""
337 |
338 | chapter 5, section 2
339 | """)
340 |
341 |
342 | def test_selects_all_para_children_of_the_context_node_that_have_a_type_attribute_with_value_warning():
343 | html = """
344 |
345 | no type
346 | warning one
347 | error type
348 | warning two
349 | """
350 | assert query_context_node(html, 'child::para[attribute::type="warning"]') == expected_result("""
351 |
352 | warning one
353 |
354 |
355 | warning two
356 | """)
357 |
358 |
359 | def test_selects_the_fifth_para_child_of_the_context_node_that_has_a_type_attribute_with_value_warning():
360 | html = """
361 |
362 | warning one
363 | error type
364 | warning two
365 | warning three
366 | warning four
367 | warning five
368 | """
369 | assert query_context_node(html, "child::para[attribute::type='warning'][position()=5]") == expected_result("""
370 |
371 | warning five
372 | """)
373 |
374 |
375 | def test_selects_the_fifth_para_child_of_the_context_node_if_that_child_has_a_type_attribute_with_value_warning():
376 | html_with = """
377 |
378 | para one
379 | para two
380 | para three
381 | para four
382 | para five
383 | """
384 | html_without = """
385 |
386 | para one
387 | para two
388 | para three
389 | para four
390 | para five
391 | """
392 |
393 | actual_with = query_context_node(html_with, 'child::para[position()=5][attribute::type="warning"]')
394 | actual_without = query_context_node(html_without, 'child::para[position()=5][attribute::type="warning"]')
395 |
396 | assert actual_with == expected_result("""
397 |
398 | para five
399 | """)
400 | assert actual_without == expected_result('')
401 |
402 |
403 | def test_selects_the_chapter_children_of_the_context_node_that_have_one_or_more_title_children_with_string_value_equal_to_Introduction():
404 | html = """
405 |
406 | No Title
407 |
408 | Wrong Title
409 |
410 |
411 | Introduction
412 |
413 | """
414 | assert query_context_node(html, "child::chapter[child::title='Introduction']") == expected_result("""
415 |
416 |
417 | Introduction
418 |
419 | """)
420 |
421 |
422 | def test_selects_the_chapter_children_of_the_context_node_that_have_one_or_more_title_children():
423 | html = """
424 |
425 | not selected
426 |
427 | """
428 | assert query_context_node(html, 'child::chapter[child::title]') == expected_result("""
429 |
430 |
431 |
432 | """)
433 |
434 |
435 | def test_selects_the_chapter_and_appendix_children_of_the_context_node():
436 | html = """
437 |
438 | not selected
439 |
440 |
441 | """
442 | assert query_context_node(html, 'child::*[self::chapter or self::appendix]') == expected_result("""
443 |
444 |
445 |
446 | """)
447 |
448 |
449 | def test_selects_the_last_chapter_or_appendix_child_of_the_context_node():
450 | html_with_last_chapter = """
451 |
452 |
453 |
454 | selected
455 | """
456 | html_with_last_appendix = """
457 |
458 |
459 |
460 | selected
461 | """
462 | actual_last_chapter = query_context_node(html_with_last_chapter,
463 | 'child::*[self::chapter or self::appendix][position()=last()]')
464 | actual_last_appendix = query_context_node(html_with_last_appendix,
465 | 'child::*[self::chapter or self::appendix][position()=last()]')
466 | assert actual_last_chapter == expected_result("""
467 |
468 | selected
469 | """)
470 | assert actual_last_appendix == expected_result("""
471 |
472 | selected
473 | """)
474 |
--------------------------------------------------------------------------------
/hq/hquery/tokens.py:
--------------------------------------------------------------------------------
1 | from hq.hquery.computed_constructors.hash_key_value import ComputedHashKeyValueConstructor
2 | from hq.hquery.equality_operators import equals, not_equals
3 | from hq.hquery.flwor import Flwor
4 | from hq.hquery.function_support import FunctionSupport
5 | from hq.hquery.functions.core_boolean import boolean
6 | from hq.hquery.functions.core_number import number
7 | from hq.hquery.node_test import NodeTest
8 | from hq.hquery.object_type import object_type_name, debug_dump_anything
9 | from hq.hquery.sequences import make_node_set, sequence_concat
10 | from hq.hquery.relational_operators import RelationalOperator
11 | from hq.hquery.string_interpolation import parse_interpolated_string
12 | from hq.hquery.syntax_error import HquerySyntaxError
13 | from hq.hquery.union_decomposition import UnionDecomposition
14 | from hq.hquery.variables import value_of_variable
15 | from hq.soup_util import soup_from_any_tag
16 | from hq.string_util import html_entity_decode
17 | from hq.verbosity import verbose_print
18 |
19 | from .axis import Axis
20 | from .expression_context import get_context_node
21 |
# Module-wide FunctionSupport instance; FunctionCallToken invokes named
# functions through its call_function method at evaluation time.
function_support = FunctionSupport()
23 |
24 |
25 |
class LBP:
    """Left-binding precedence values.

    Every name below is a class attribute holding one of the consecutive
    integers 0 through 16, in the order listed.
    """
    nothing = 0
    sequence = 1
    union_decomp = 2
    union = 3
    range = 4
    abbrev_flwor = 5
    or_op = 6
    and_op = 7
    equality_op = 8
    relational_op = 9
    add_or_subtract = 10
    mult_or_div = 11
    prefix_op = 12
    function_call = 13
    location_step = 14
    node_test = 15
    parenthesized_expr = 16


# Import-time sanity check: "sequence" must be numbered directly after "nothing".
assert LBP.sequence == LBP.nothing + 1
35 |
36 |
37 |
class Token(object):
    """Base class for the parser's tokens.

    Holds the parse interface and the token's value, and provides helpers that
    evaluate operand generators and write verbose trace output.
    """

    def __init__(self, parse_interface, value=None, **kwargs):
        """Store the parse interface and value; extra keyword arguments are accepted but unused."""
        self.value = value
        self.parse_interface = parse_interface

    def _evaluate_binary_operands(self,
                                  left_generator,
                                  right_generator,
                                  constructor=lambda v: v,
                                  type_name='xpath object'):
        """Evaluate both operand generators and return ``(left, right)``.

        Each generator is called with no arguments and its result is passed
        through ``constructor``. A TypeError raised anywhere in the process is
        reported as an HquerySyntaxError that mentions ``type_name``.
        """
        try:
            self._gab('operator evaluation...', indent_after=True)

            self._gab('evaluating left-hand side.', indent_after=True)
            lhs = constructor(left_generator())

            self._gab('evaluating right-hand side.', outdent_before=True, indent_after=True)
            rhs = constructor(right_generator())

            self._gab('operand evaluation complete', outdent_before=True)
            trace = 'evaluating expression {0} {1} {2}'.format(lhs, self, rhs)
            self._gab(trace, outdent_before=True)
        except TypeError:
            raise HquerySyntaxError('evaluated against a non-{0} operand'.format(type_name))
        return lhs, rhs

    def _evaluate_unary_operand(self, operand_generator, constructor=lambda v: v, type_name='xpath object'):
        """Evaluate a single operand generator and return its constructed value.

        A TypeError raised along the way is reported as an HquerySyntaxError
        that mentions ``type_name``.
        """
        try:
            self._gab('evaluating operand.', indent_after=True)
            operand = constructor(operand_generator())
            self._gab('operand evaluation complete', outdent_before=True)
        except TypeError:
            raise HquerySyntaxError('evaluated against a non-{0} operand'.format(type_name))
        return operand

    def _gab(self, msg, **kwargs):
        """Send ``msg``, prefixed with this token's string form, to verbose_print."""
        line = u'{0} {1}'.format(self, msg)
        verbose_print(line, **kwargs)
75 |
76 |
77 |
class AbbreviatedFlworOperatorToken(Token):
    """Infix operator token that builds a Flwor from its two operands."""
    lbp = LBP.abbrev_flwor

    def __str__(self):
        return '(abbreviated-FLWOR-operator)'

    def led(self, left):
        """Parse the right operand and return the evaluate method of the assembled Flwor.

        The left operand is registered as the iteration expression under the
        name "_"; the right operand becomes the return expression.
        """
        return_expression = self.parse_interface.expression(LBP.sequence)

        abbreviated = Flwor()
        abbreviated.set_iteration_expression('_', left)
        abbreviated.set_return_expression(return_expression)
        return abbreviated.evaluate
91 |
92 |
class AddOrSubtractOperatorToken(Token):
    """Token for the "+" and "-" operators.

    As an infix operator (led) it adds or subtracts its two operands; as a
    prefix operator (nud) only "-" is accepted and it negates its operand.
    Operands are converted with number() before the arithmetic is applied.
    """
    lbp = LBP.add_or_subtract

    def __str__(self):
        return '(plus)' if self.value == '+' else '(minus)'

    def led(self, left):
        """Parse the right operand and return a callable that computes the sum or difference.

        The returned callable raises HquerySyntaxError if converting an
        operand raises TypeError (see Token._evaluate_binary_operands).
        """
        right = self.parse_interface.expression(self.lbp)

        def evaluate():
            left_value, right_value = self._evaluate_binary_operands(left, right, constructor=number, type_name='number')
            result = left_value + right_value if self.value == '+' else left_value - right_value
            self._gab('returning {0}'.format(result))
            return result

        return evaluate

    def nud(self):
        """Parse a unary minus and return a callable that negates its operand.

        Raises HquerySyntaxError when the token is "+", which has no prefix form.
        """
        if self.value != '-':
            # The placeholder used to be emitted literally; fill it in so the
            # message names the offending token.
            raise HquerySyntaxError('unexpected {0} at beginning of an expression'.format(self))

        right = self.parse_interface.expression(LBP.prefix_op)

        def evaluate():
            right_value = self._evaluate_unary_operand(right, constructor=number, type_name='number')
            result = -right_value
            self._gab('returning {0}'.format(result))
            return result

        return evaluate
123 |
124 |
125 |
class AndOperator(Token):
    """Infix boolean "and" operator.

    Both operands are evaluated and converted with boolean(); the result is
    True only when both are truthy.
    """
    # Use the dedicated "and" level, which LBP numbers one above or_op, so that
    # "and" can bind more tightly than "or" as XPath requires. This class
    # previously reused LBP.or_op.
    lbp = LBP.and_op

    def __str__(self):
        return '(operator "and")'

    def led(self, left):
        """Parse the right operand and return a callable that computes ``left and right``.

        The returned callable raises HquerySyntaxError if converting an
        operand raises TypeError (see Token._evaluate_binary_operands).
        """
        right = self.parse_interface.expression(self.lbp)

        def evaluate():
            left_value, right_value = self._evaluate_binary_operands(left,
                                                                     right,
                                                                     constructor=boolean,
                                                                     type_name='boolean')
            result = bool(left_value) and bool(right_value)
            self._gab('returning {0}'.format(result))
            return result

        return evaluate
145 |
146 |
147 |
class AssignmentOperatorToken(Token):
    """Token for the assignment operator; it defines no nud or led of its own."""
    lbp = LBP.nothing

    def __str__(self):
        return '(assignment-operator)'
153 |
154 |
155 |
class AxisToken(Token):
    """Token naming an axis; its value is stored in canonical form."""
    lbp = LBP.nothing

    def __init__(self, parse_interface, value, **kwargs):
        """Canonicalize the axis name, store it as the value and look up the matching Axis member."""
        canonical_name = Axis.canonicalize(value)
        super(AxisToken, self).__init__(parse_interface, canonical_name, **kwargs)
        self.axis = Axis[self.value]

    def __str__(self):
        return '(axis "{0}")'.format(self.value)

    def nud(self):
        """Start a location path at this token and return its evaluate method."""
        path = self.parse_interface.location_path(self)
        return path.evaluate
168 |
169 |
170 |
class CloseCurlyBraceToken(Token):
    """Token for a closing curly brace; it defines no nud or led of its own."""
    lbp = LBP.nothing

    def __str__(self):
        return '(close-curly-brace)'
176 |
177 |
178 |
class CloseParenthesisToken(Token):
    """Token for a closing parenthesis; it defines no nud or led of its own."""
    lbp = LBP.nothing

    def __str__(self):
        return '(close-parenthesis)'
184 |
185 |
186 |
class CloseSquareBraceToken(Token):
    """Token for a closing square brace; it defines no nud or led of its own."""
    lbp = LBP.nothing

    def __str__(self):
        # NOTE(review): the label is "right-brace" rather than a "close-..." name
        # like the other closing tokens in this file; confirm nothing depends on
        # the text before renaming it.
        return '(right-brace)'
192 |
193 |
194 |
class CommaToken(Token):
    """Infix comma operator: joins the values of its two operands with sequence_concat."""
    lbp = LBP.sequence

    def __str__(self):
        return '(comma)'

    def led(self, left):
        """Parse the right operand and return a callable that concatenates both operand values."""
        right = self.parse_interface.expression(self.lbp)

        def evaluate():
            operands = self._evaluate_binary_operands(left, right)
            return sequence_concat(*operands)

        return evaluate
209 |
210 |
211 |
class ComputedConstructorFiltersToken(Token):
    """Token carrying computed-constructor filter text."""
    lbp = LBP.nothing

    def __init__(self, parse_interface, value, **kwargs):
        """Store the raw token text with its first and last characters removed."""
        inner_text = value[1:-1]
        super(ComputedConstructorFiltersToken, self).__init__(parse_interface, inner_text, **kwargs)

    def __str__(self):
        return '(computed-constructor-filters "{0}")'.format(self.value)
220 |
221 |
class ConstructorReservedWordToken(Token):
    """Token for a reserved word that introduces a computed constructor."""
    lbp = LBP.nothing

    def __str__(self):
        return '(constructor-keyword "{0}")'.format(self.value)

    def nud(self):
        """Have the parse interface build the computed constructor and return its evaluate method."""
        constructor = self.parse_interface.computed_constructor(self)
        return constructor.evaluate
230 |
231 |
232 |
class ContextNodeToken(Token):
    """Token for the context node; it begins a location path."""
    lbp = LBP.node_test

    def __str__(self):
        return '(context-node)'

    def nud(self):
        """Start a location path at this token and return its evaluate method."""
        path = self.parse_interface.location_path(self)
        return path.evaluate
241 |
242 |
243 |
class DivOrModOperatorToken(Token):
    """Infix token for the "div" and "mod" operators.

    Operands are converted with number(); "div" applies ``/`` and any other
    value of the token applies ``%``.
    """
    lbp = LBP.mult_or_div

    def __str__(self):
        return '(operator "{0}")'.format(self.value)

    def led(self, left):
        """Parse the right operand and return a callable that computes the quotient or remainder.

        The returned callable raises HquerySyntaxError if converting an
        operand raises TypeError (see Token._evaluate_binary_operands).
        """
        right = self.parse_interface.expression(self.lbp)

        def evaluate():
            left_value, right_value = self._evaluate_binary_operands(left, right, constructor=number, type_name='number')
            result = left_value / right_value if self.value == 'div' else left_value % right_value
            # _gab already prefixes the message with this token, so it is not
            # repeated here (matches the other operator tokens).
            self._gab('returning {0}'.format(result))
            return result

        return evaluate
260 |
261 |
262 |
class DoubleSlashToken(Token):
    """Token for "//"; usable both at the start of a path and after a root expression."""
    lbp = LBP.location_step
    evaluating_message = 'evaluating remainder of path for node "{0}" and all of its descendants.'

    def __str__(self):
        return '(double-slash)'

    def led(self, left):
        """Continue a location path whose root expression is ``left``; return its evaluate method."""
        path = self.parse_interface.location_path(self, root_expression=left)
        return path.evaluate

    def nud(self):
        """Start a location path at this token and return its evaluate method."""
        path = self.parse_interface.location_path(self)
        return path.evaluate
275 |
276 |
277 |
class EndToken(Token):
    """Token with no behavior of its own; presumably marks the end of the token stream (confirm against the lexer)."""
    lbp = LBP.nothing
280 |
281 |
282 |
class EqualityOperatorToken(Token):
    """Infix token for the equality operators; "=" uses equals(), anything else uses not_equals()."""
    lbp = LBP.equality_op

    def __str__(self):
        return '(equality-operator "{0}")'.format(self.value)

    def led(self, left):
        """Parse the right operand and return a callable that compares both operand values."""
        right = self.parse_interface.expression(self.lbp)

        def evaluate():
            left_value, right_value = self._evaluate_binary_operands(left, right)
            if self.value == '=':
                result = equals(left_value, right_value)
            else:
                result = not_equals(left_value, right_value)
            self._gab('returning {0}'.format(result))
            return result

        return evaluate
299 |
300 |
301 |
class FlworReservedWordToken(Token):
    """Token for a FLWOR reserved word; the word is stored in lower case."""
    lbp = LBP.nothing

    def __init__(self, parse_interface, value, **kwargs):
        """Lower-case the reserved word before handing it to the base class."""
        lowered = value.lower()
        super(FlworReservedWordToken, self).__init__(parse_interface, lowered, **kwargs)

    def __str__(self):
        return '({0})'.format(self.value)

    def nud(self):
        """Have the parse interface build the FLWOR expression and return its evaluate method."""
        flwor = self.parse_interface.flwor(self)
        return flwor.evaluate
313 |
314 |
315 |
class FunctionCallToken(Token):
    """Token for a function call; the token's value is the function name."""
    lbp = LBP.function_call

    def __str__(self):
        return '(function call "{0}")'.format(self.value)

    def nud(self):
        """Parse the argument list up to the closing parenthesis and return a callable that performs the call.

        Arguments are evaluated only when the returned callable runs, and the
        call is dispatched through the module-level function_support object.
        """
        arg_generators = []

        while True:
            if isinstance(self.parse_interface.peek(), CloseParenthesisToken):
                break
            arg_generators.append(self.parse_interface.expression(LBP.sequence))
            # A comma between arguments is consumed when present.
            self.parse_interface.advance_if(CommaToken)

        self.parse_interface.advance(CloseParenthesisToken)

        def evaluate():
            self._gab('evaluating argument list for function "{0}."'.format(self.value))
            arguments = []
            for generate in arg_generators:
                arguments.append(generate())
            arg_types = ','.join([object_type_name(argument) for argument in arguments])
            self._gab('calling {0}({1}).'.format(self.value, arg_types))
            return function_support.call_function(self.value, *arguments)

        return evaluate
339 |
340 |
341 |
class HashKeyToken(Token):
    """Token for a hash key; its value is the key name."""
    # The attribute was misspelled "lpb", which left this class without the
    # "lbp" that every other token defines.
    lbp = LBP.nothing
    # Misspelled name kept as an alias in case anything reads it.
    lpb = lbp

    def __str__(self):
        return '(hash-key "{0}")'.format(self.value)

    def nud(self):
        """Parse the expression after the key and return the key/value constructor's evaluate method."""
        constructor = ComputedHashKeyValueConstructor(self.value)
        constructor.set_value(self.parse_interface.expression(LBP.sequence))
        return constructor.evaluate
352 |
353 |
354 |
class IfElseToken(Token):
    """Token for the "if" reserved word."""
    lbp = LBP.nothing

    def __str__(self):
        return '(if-reserved-word)'

    def nud(self):
        """Delegate to the parse interface's if_then_else() and return whatever it produces."""
        conditional = self.parse_interface.if_then_else()
        return conditional
363 |
364 |
365 |
class InterpolatedStringToken(Token):
    """Token for an interpolated string."""
    lbp = LBP.nothing

    def __init__(self, parse_interface, value, **kwargs):
        """Store the raw token text with its first and last characters removed."""
        inner_text = value[1:-1]
        super(InterpolatedStringToken, self).__init__(parse_interface, inner_text, **kwargs)

    def __str__(self):
        return u'(interpolated-string `{0}`)'.format(self.value)

    def nud(self):
        """Hand the stored text and the parse interface to parse_interpolated_string and return its result."""
        parsed = parse_interpolated_string(self.value, self.parse_interface)
        return parsed
377 |
378 |
379 |
class LiteralNumberToken(Token):
    """Token for a numeric literal."""

    lbp = LBP.nothing

    def __str__(self):
        return '(literal-number {0})'.format(self.value)

    def nud(self):
        """Return a callable that converts the token value with ``number`` each time it is called."""

        def evaluate():
            return number(self.value)

        return evaluate
388 |
389 |
390 |
class LiteralStringToken(Token):
    """Token for a quoted string literal; the stored value is unquoted and entity-decoded."""

    lbp = LBP.nothing

    def __init__(self, parse_interface, value, **kwargs):
        # Strip the first and last characters (the quotes), then decode HTML entities.
        unquoted = value[1:-1]
        decoded = html_entity_decode(unquoted)
        super(LiteralStringToken, self).__init__(parse_interface, decoded, **kwargs)

    def __str__(self):
        return u'(literal-string "{0}")'.format(self.value)

    def nud(self):
        """Return a callable that yields the stored string."""

        def evaluate():
            return self.value

        return evaluate
402 |
403 |
404 |
class MultiplyOperatorToken(Token):
    """Token for the multiplication operator."""

    lbp = LBP.mult_or_div

    def __str__(self):
        return '(times)'

    def led(self, left):
        """Parse the right operand and return a callable that multiplies both operands as numbers."""
        right = self.parse_interface.expression(self.lbp)

        def evaluate():
            operands = self._evaluate_binary_operands(left, right, constructor=number, type_name='number')
            multiplicand, multiplier = operands
            product = multiplicand * multiplier
            self._gab('{0} returning {1}'.format(self, product))
            return product

        return evaluate
421 |
422 |
423 |
class NameTestToken(Token):
    """Token for a name test; carries a ``NodeTest`` built with ``name_test=True``."""

    lbp = LBP.node_test

    def __init__(self, *args, **kwargs):
        super(NameTestToken, self).__init__(*args, **kwargs)
        # Built after the base constructor runs, because it reads self.value.
        self.node_test = NodeTest(self.value, name_test=True)

    def __str__(self):
        return '(name-test "{0}")'.format(self.value)

    def nud(self):
        """Start a location path at this token and return the path's ``evaluate``."""
        path = self.parse_interface.location_path(self)
        return path.evaluate
436 |
437 |
438 |
class NodeTestToken(Token):
    """Token for a node test; carries a ``NodeTest`` built from its value."""

    lbp = LBP.node_test

    def __init__(self, *args, **kwargs):
        super(NodeTestToken, self).__init__(*args, **kwargs)
        # Built after the base constructor runs, because it reads self.value.
        self.node_test = NodeTest(self.value)

    def __str__(self):
        return '(node-test "{0}")'.format(self._dump_value())

    def nud(self):
        """Start a location path at this token and return the path's ``evaluate``."""
        path = self.parse_interface.location_path(self)
        return path.evaluate

    def _dump_value(self):
        """Return the value followed by ``()``, except for ``*`` which is shown bare."""
        if self.value != '*':
            suffix = '()'
        else:
            suffix = ''
        return '{0}{1}'.format(self.value, suffix)
454 |
455 |
456 |
class OpenCurlyBraceToken(Token):
    """Token for an opening curly brace; defines no ``nud`` or ``led`` of its own."""

    lbp = LBP.nothing

    def __str__(self):
        return '(open-curly-brace)'
462 |
463 |
464 |
class OpenParenthesisToken(Token):
    """Token for an opening parenthesis that starts a grouped expression."""

    lbp = LBP.parenthesized_expr

    def __str__(self):
        return '(open-parenthesis)'

    def nud(self):
        """Parse the enclosed expression, require the closing parenthesis, and return the expression."""
        parser = self.parse_interface
        inner_expression = parser.expression(LBP.nothing)
        parser.advance(CloseParenthesisToken)
        return inner_expression
475 |
476 |
477 |
class OpenSquareBraceToken(Token):
    """Token for an opening square brace following an expression."""

    lbp = LBP.location_step

    def __str__(self):
        return '(left-brace)'

    def led(self, left):
        """Ask the parser for a location path using ``left`` as its root expression."""
        parser = self.parse_interface
        return parser.location_path(self, root_expression=left).evaluate
487 |
488 |
489 |
class OrOperatorToken(Token):
    """Token for the boolean ``or`` operator."""

    lbp = LBP.or_op

    def __str__(self):
        return '(operator "or")'

    def led(self, left):
        """Parse the right operand and return a callable computing the boolean disjunction.

        Both operand values are obtained from ``_evaluate_binary_operands``
        before the result is computed.
        """
        right = self.parse_interface.expression(self.lbp)

        def evaluate():
            operands = self._evaluate_binary_operands(left, right, constructor=boolean, type_name='boolean')
            lhs, rhs = operands
            outcome = bool(lhs) or bool(rhs)
            self._gab('returning {0}'.format(outcome))
            return outcome

        return evaluate
509 |
510 |
511 |
class ParentNodeToken(Token):
    """Token for the parent-node abbreviation."""

    lbp = LBP.node_test

    def __str__(self):
        return '(parent-node)'

    def nud(self):
        """Start a location path at this token and return the path's ``evaluate``."""
        path = self.parse_interface.location_path(self)
        return path.evaluate
520 |
521 |
522 |
class RangeOperatorToken(Token):
    """Token for the range operator."""

    lbp = LBP.range

    def __str__(self):
        return '(range-operator)'

    def led(self, left):
        """Parse the right operand and return a callable producing the inclusive range as a list of numbers."""
        right = self.parse_interface.expression(self.lbp)

        def evaluate():
            first, last = self._evaluate_binary_operands(left, right, constructor=number, type_name='number')
            start = int(first)
            # One is added before truncation so that the upper bound is included.
            stop = int(last + 1)
            return [number(n) for n in range(start, stop)]

        return evaluate
540 |
541 |
542 |
class RelationalOperatorToken(Token):
    """Token for a relational operator; ``value`` selects the ``RelationalOperator``."""

    lbp = LBP.relational_op

    def __str__(self):
        operator = RelationalOperator(self.value)
        return '(operator {0})'.format(operator.name)

    def led(self, left):
        """Parse the right operand and return a callable applying the relational comparison."""
        right = self.parse_interface.expression(self.lbp)

        def evaluate():
            lhs, rhs = self._evaluate_binary_operands(left, right)
            operator = RelationalOperator(self.value)
            outcome = operator.evaluate(lhs, rhs)
            self._gab('returning {0}'.format(outcome))
            return outcome

        return evaluate
559 |
560 |
561 |
class SlashToken(Token):
    """Token for ``/``, either between steps (``led``) or at the start of a path (``nud``)."""

    lbp = LBP.location_step

    def __str__(self):
        return '(slash)'

    def led(self, left):
        """Ask the parser for a location path using ``left`` as its root expression."""
        parser = self.parse_interface
        return parser.location_path(self, root_expression=left).evaluate

    def nud(self):
        """Handle a leading slash.

        When the next token can follow the slash in an absolute path, the parser
        builds a location path. Otherwise the slash stands alone and evaluates to a
        node set made from ``soup_from_any_tag`` applied to the context node.
        """
        absolute_path_followup_tokens = (AxisToken, ContextNodeToken, NameTestToken, NodeTestToken, ParentNodeToken)
        upcoming = self.parse_interface.peek()

        if not isinstance(upcoming, absolute_path_followup_tokens):
            def evaluate():
                return make_node_set(soup_from_any_tag(get_context_node()))

            return evaluate

        return self.parse_interface.location_path(self).evaluate
581 |
582 |
583 |
class UnionDecompositionToken(Token):
    """Token for the union decomposition operator."""

    lbp = LBP.union_decomp

    def __str__(self):
        return '(union decomposition)'

    def led(self, left):
        """Collect the mapping expressions on the right and return the decomposition's ``evaluate``.

        The mapping list may be bare, in which case it continues for as long as union
        operators follow. It may also be parenthesized, in which case it runs to the
        closing parenthesis.
        """
        decomp = UnionDecomposition()
        parser = self.parse_interface
        mapping_generators = []

        parenthesized = parser.advance_if(OpenParenthesisToken) is not None
        if parenthesized:
            while not isinstance(parser.peek(), CloseParenthesisToken):
                mapping_generators.append(parser.expression(LBP.union))
                # A union operator between mappings is consumed when present.
                parser.advance_if(UnionOperatorToken)
            parser.advance(CloseParenthesisToken)
        else:
            while True:
                mapping_generators.append(parser.expression(LBP.union))
                if parser.advance_if(UnionOperatorToken) is None:
                    break

        decomp.set_union_expression(left)
        decomp.set_mapping_generators(mapping_generators)
        return decomp.evaluate
608 |
609 |
610 |
class UnionOperatorToken(Token):
    """Token for the union operator; tags items with the position of the operand they came from."""

    lbp = LBP.union

    def __str__(self):
        return '(union operator)'

    @staticmethod
    def _tag_untagged(items, index):
        """Set ``union_index`` on every item that does not already carry an integer one."""
        for item in items:
            if not isinstance(getattr(item, 'union_index', None), int):
                item.union_index = index

    def led(self, left):
        """Parse the right operand and return a callable that merges both node sets.

        The returned callable carries a ``union_index`` attribute one greater than
        that of ``left`` (or 1 when ``left`` has none), so chained unions number
        their operands consecutively.
        """
        left_union_index = getattr(left, 'union_index', 0)
        right_union_index = left_union_index + 1

        right = self.parse_interface.expression(self.lbp)

        def evaluate():
            left_value, right_value = self._evaluate_binary_operands(left, right, type_name='node set')
            self._tag_untagged(right_value, right_union_index)
            # Only the leftmost operand of a chain is tagged here; later left operands
            # are results of earlier unions whose items were already tagged.
            if left_union_index == 0:
                self._tag_untagged(left_value, left_union_index)
            # NOTE: this extends left_value in place, exactly as before.
            left_value.extend(right_value)
            merged = make_node_set(left_value)
            self._gab('returning node set with {0} nodes'.format(len(merged)))
            return merged

        evaluate.union_index = right_union_index

        return evaluate
643 |
644 |
645 |
class VariableToken(Token):
    """Token for a variable reference; ``value`` is the variable name without the ``$``."""

    lbp = LBP.nothing

    def __str__(self):
        return '(variable ${0})'.format(self.value)

    def nud(self):
        """Return a callable that looks the variable up each time it is called."""

        def evaluate():
            looked_up = value_of_variable(self.value)

            # Passed as a callable, so the dump is only built if _gab calls it.
            def describe():
                return u'reference evaluating to value {0}'.format(debug_dump_anything(looked_up))

            self._gab(describe)
            return looked_up

        return evaluate
660 |
--------------------------------------------------------------------------------