├── hq ├── __init__.py ├── hquery │ ├── __init__.py │ ├── functions │ │ ├── __init__.py │ │ ├── extend_node_set.py │ │ ├── core_boolean.py │ │ ├── core_node_set.py │ │ ├── extend_string.py │ │ ├── core_string.py │ │ └── core_number.py │ ├── computed_constructors │ │ ├── __init__.py │ │ ├── hash_key_value.py │ │ ├── html_attribute.py │ │ ├── html_element.py │ │ ├── json_array.py │ │ └── json_hash.py │ ├── syntax_error.py │ ├── evaluation_error.py │ ├── variables.py │ ├── evaluation_in_context.py │ ├── sequences.py │ ├── axis.py │ ├── expression_context.py │ ├── function_support.py │ ├── union_decomposition.py │ ├── object_type.py │ ├── equality_operators.py │ ├── relational_operators.py │ ├── flwor.py │ ├── node_test.py │ ├── string_interpolation.py │ ├── location_path.py │ └── tokens.py ├── config.py ├── __main__.py ├── string_util.py ├── verbosity.py ├── output.py ├── hq.py └── soup_util.py ├── test ├── __init__.py ├── hquery │ ├── __init__.py │ ├── test_strings.py │ ├── hquery_test_util.py │ ├── test_if_then_else.py │ ├── test_expressions.py │ ├── test_union_decomposition.py │ ├── test_sequences_and_ranges.py │ ├── test_name_tests.py │ ├── test_arithmetic_operators.py │ ├── test_computed_html_construction.py │ ├── test_node_tests.py │ ├── test_equality_operators.py │ ├── test_relational_operators.py │ ├── test_interpolated_strings.py │ ├── test_extended_functions.py │ ├── test_location_paths.py │ ├── test_flwor.py │ ├── test_computed_json_construction.py │ ├── test_core_functions.py │ ├── test_xpath1_abbreviated_samples.py │ ├── test_axes.py │ └── test_xpath1_unabbreviated_samples.py ├── conftest.py ├── common_test_util.py ├── test_unicode_support.py └── test_cli.py ├── MANIFEST.in ├── requirements ├── base.txt └── dev.txt ├── .travis.yml ├── hq_runner.py ├── tox.ini ├── .gitignore ├── LICENSE ├── setup.py └── README.md /hq/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hq/hquery/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/hquery/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | -------------------------------------------------------------------------------- /hq/hquery/functions/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hq/hquery/computed_constructors/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hq/config.py: -------------------------------------------------------------------------------- 1 | 2 | class settings: 3 | VERBOSE = False 4 | -------------------------------------------------------------------------------- /hq/hquery/syntax_error.py: -------------------------------------------------------------------------------- 1 | 2 | class HquerySyntaxError(ValueError): 3 | pass 4 | -------------------------------------------------------------------------------- /requirements/base.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4==4.11.1 2 | docopt==0.6.2 3 | wheel==0.37.1 4 | -------------------------------------------------------------------------------- 
/requirements/dev.txt: -------------------------------------------------------------------------------- 1 | -r base.txt 2 | 3 | pytest==7.1.3 4 | pytest-cov==3.0.0 5 | pytest-mock==3.8.2 6 | tox==3.26.0 7 | 8 | hq~=0.0.4 9 | setuptools~=65.4.0 10 | -------------------------------------------------------------------------------- /hq/__main__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | hq.__main__: executed when hq module is executed as script. (Based on Jan-Philip Gehrcke's python-cmdline-bootstrap.) 5 | """ 6 | 7 | 8 | from .hq import main 9 | main() 10 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | git: 2 | depth: 3 3 | language: python 4 | python: 5 | - "2.7" 6 | - "3.4" 7 | - "3.5" 8 | install: 9 | - pip install -r requirements/dev.txt 10 | - pip install coveralls 11 | script: py.test --cov=hq 12 | after_success: 13 | coveralls 14 | -------------------------------------------------------------------------------- /hq_runner.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Convenience wrapper for running hq directly from source tree. (Based on Jan-Philip Gehrcke's python-cmdline-bootstrap.) 
5 | """ 6 | 7 | from hq.hq import main 8 | 9 | 10 | if __name__ == '__main__': 11 | main() 12 | -------------------------------------------------------------------------------- /test/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | sys.path.insert(0, os.path.abspath('..')) 5 | 6 | from hq.verbosity import set_verbosity 7 | 8 | 9 | def pytest_addoption(parser): 10 | parser.addoption("--gabby", 11 | action="store_true", 12 | help="Print verbose (debug) information to stderr") 13 | 14 | 15 | def pytest_configure(config): 16 | set_verbosity(bool(config.getvalue('--gabby'))) 17 | pass 18 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | # Tox (http://tox.testrun.org/) is a tool for running tests 2 | # in multiple virtualenvs. This configuration file will run the 3 | # test suite on all supported python versions. To use it, "pip install tox" 4 | # and then run "tox" from this directory. 
# Matches decimal (&#233;), hexadecimal (&#xE9;) and named (&eacute;) character references.
_ENTITY_PATTERN = re.compile(r'&(?:#([0-9]+)|#[xX]([0-9a-fA-F]+)|([A-Za-z][A-Za-z0-9]*));')


def html_entity_decode(s):
    """Replace HTML character references in ``s`` with the characters they denote.

    Named references are looked up in ``html.entities.name2codepoint``; numeric
    references may be decimal or hexadecimal and of any length. Everything is
    decoded in a single pass, so text produced by one substitution is never
    decoded a second time (``&amp;#65;`` becomes ``&#65;``, not ``A``).

    References with an unknown name, or a code point that ``chr`` rejects, are
    left in the result exactly as they appeared.
    """
    def decode(match):
        decimal, hexadecimal, name = match.groups()
        try:
            if decimal is not None:
                return chr(int(decimal))
            if hexadecimal is not None:
                return chr(int(hexadecimal, 16))
            return chr(name2codepoint[name])
        except (KeyError, ValueError, OverflowError):
            # Unknown entity name or out-of-range code point: keep the original text.
            return match.group(0)

    return _ENTITY_PATTERN.sub(decode, s)


def is_a_string(obj):
    """Return True when the name of ``obj``'s class ends in ``str`` or ``unicode``.

    The check is made on the class *name* rather than with ``isinstance``.
    """
    class_name = obj.__class__.__name__
    return class_name.endswith(('str', 'unicode'))


def truncate_string(s, length, one_line=True, suffix='...'):
    """Shorten ``s`` to at most ``length`` characters plus ``suffix``.

    Strings that already fit are returned without a suffix. Longer strings are
    cut back to the last space found in the first ``length + 1`` characters, so
    a word that ends exactly at ``length`` is kept whole. When that window has
    no space at all, the string is cut hard at ``length`` characters.

    When ``one_line`` is true, newlines in the result are replaced with the
    two-character sequence ``\\n``.
    """
    if len(s) <= length:
        result = s
    else:
        # Look one character past the limit so a space right after a word still counts.
        head, space, _ = s[:length + 1].rpartition(' ')
        result = (head if space else s[:length]) + suffix
    if one_line:
        result = result.replace('\n', '\\n')
    return result
= args[0][0] 15 | name = args[1] 16 | else: 17 | raise HqueryEvaluationError('class() expects one or two arguments; got {0}'.format(len(args))) 18 | 19 | return boolean(name in tag['class']) 20 | 21 | 22 | def even(): 23 | return boolean(peek_context().position % 2 == 0) 24 | 25 | 26 | def odd(): 27 | return boolean(peek_context().position % 2 == 1) 28 | -------------------------------------------------------------------------------- /hq/hquery/computed_constructors/hash_key_value.py: -------------------------------------------------------------------------------- 1 | from hq.hquery.object_type import debug_dump_anything 2 | from hq.verbosity import verbose_print 3 | 4 | 5 | class ComputedHashKeyValueConstructor: 6 | 7 | def __init__(self, key): 8 | self.key = key 9 | self.value_fn = None 10 | 11 | 12 | def set_value(self, fn): 13 | self.value_fn = fn 14 | 15 | 16 | def evaluate(self): 17 | verbose_print('Evaluating value expression for constructed hash key "{0}"'.format(self.key), indent_after=True) 18 | 19 | value = self.value_fn() 20 | 21 | msg = u'Finished evaluating; value of constructed hash key "{0}" is {1}' 22 | verbose_print(lambda: msg.format(self.key, debug_dump_anything(value)), outdent_before=True) 23 | 24 | return HashKeyValue(self.key, value) 25 | 26 | 27 | 28 | class HashKeyValue: 29 | 30 | def __init__(self, key, value): 31 | self.key = key 32 | self.value = value 33 | -------------------------------------------------------------------------------- /test/hquery/hquery_test_util.py: -------------------------------------------------------------------------------- 1 | from hq.output import convert_results_to_output_text 2 | from hq.soup_util import make_soup, is_any_node, root_tag_from_soup 3 | from hq.hquery.hquery_processor import HqueryProcessor 4 | from test.common_test_util import soup_with_body, eliminate_blank_lines 5 | 6 | 7 | def query_html_doc(html_body, hquery, preserve_space=False, wrap_body=True): 8 | soup = soup_with_body(html_body) if 
wrap_body else make_soup(html_body) 9 | raw_result = HqueryProcessor(hquery, preserve_space=preserve_space).query(soup) 10 | return eliminate_blank_lines(convert_results_to_output_text(raw_result, preserve_space=preserve_space).strip()) 11 | 12 | 13 | def query_context_node(node_or_source, hquery): 14 | if not is_any_node(node_or_source): 15 | node_or_source = root_tag_from_soup(make_soup(node_or_source)) 16 | raw_result = HqueryProcessor(hquery).query(node_or_source) 17 | return eliminate_blank_lines(convert_results_to_output_text(raw_result).strip()) 18 | -------------------------------------------------------------------------------- /test/hquery/test_if_then_else.py: -------------------------------------------------------------------------------- 1 | from test.common_test_util import expected_result 2 | from test.hquery.hquery_test_util import query_html_doc 3 | 4 | 5 | def test_if_then_else_works_with_literal_conditions(): 6 | assert query_html_doc('', 'if (true()) then "foo" else "bar"') == 'foo' 7 | assert query_html_doc('', 'if ("") then "foo" else "bar"') == 'bar' 8 | assert query_html_doc('', 'if (0.001) then "foo" else "bar"') == 'foo' 9 | 10 | 11 | def test_if_then_else_works_with_node_sets(): 12 | html_body = """ 13 |

eekaboo

""" 14 | assert query_html_doc(html_body, 'if (//p) then //p else 1 to 3') == expected_result(""" 15 |

16 | eekaboo 17 |

""") 18 | assert query_html_doc(html_body, 'if (//div) then //p else 1 to 3') == expected_result(""" 19 | 1 20 | 2 21 | 3""") 22 | 23 | 24 | def test_if_then_else_works_with_variables_in_a_flwor(): 25 | assert query_html_doc('', 'let $x := 0.1 return if ($x - 0.1) then $x else $x + 1') == '1.1' 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
exports = ('boolean', 'false', 'not_', 'true')


class boolean:
    """Boolean value object produced by converting an arbitrary query value.

    Conversion rules applied by the constructor:

    * a node set is true when it contains at least one item;
    * a number is true when it is neither zero nor NaN;
    * anything else uses Python truthiness.

    Instances compare equal only to other boolean objects holding the same
    value, and hash consistently with that equality so they can be used in
    sets and as dictionary keys.
    """

    def __init__(self, obj):
        if is_node_set(obj):
            self.value = len(obj) > 0
        elif is_number(obj):
            f = float(obj)
            self.value = bool(f) and not isnan(f)
        else:
            self.value = bool(obj)

    def __bool__(self):
        return self.value

    def __nonzero__(self):
        # Python 2 spelling of __bool__.
        return self.__bool__()

    def __str__(self):
        # Renders as "true" / "false" rather than Python's "True" / "False".
        return str(self.value).lower()

    def __eq__(self, other):
        return is_boolean(other) and self.value == other.value

    def __hash__(self):
        # Defining __eq__ alone makes a class unhashable on Python 3; hash on the
        # same attribute that __eq__ compares.
        return hash(self.value)

    def __repr__(self):
        return 'boolean({0})'.format(self.value)


def false():
    """Return the boolean object for false."""
    return boolean(False)


def not_(value):
    """Convert ``value`` to a boolean and return its negation."""
    return boolean(not boolean(value))


def true():
    """Return the boolean object for true."""
    return boolean(True)
is_attribute_node, is_comment_node, is_tag_node, derive_text_from_node, \ 5 | is_root_node 6 | 7 | 8 | def convert_results_to_output_text(results, pretty=True, preserve_space=False): 9 | if is_sequence(results): 10 | return '\n'.join(value_object_to_text(object, pretty, preserve_space) for object in results) 11 | else: 12 | return value_object_to_text(results, pretty, preserve_space) 13 | 14 | 15 | def value_object_to_text(obj, pretty, preserve_space): 16 | if is_comment_node(obj): 17 | return u''.format(str(obj).strip()) 18 | elif is_tag_node(obj) or is_root_node(obj): 19 | return obj.prettify().rstrip(' \t\n') if pretty else str(obj) 20 | elif is_attribute_node(obj): 21 | return u'{0}="{1}"'.format(obj.name, derive_text_from_node(obj, preserve_space=preserve_space)) 22 | elif is_text_node(obj): 23 | return derive_text_from_node(obj, preserve_space=preserve_space) 24 | else: 25 | return str(obj) 26 | -------------------------------------------------------------------------------- /hq/hquery/variables.py: -------------------------------------------------------------------------------- 1 | from hq.hquery.object_type import debug_dump_anything 2 | from hq.verbosity import verbose_print 3 | 4 | variable_stack = [] 5 | NAME, VALUE = range(2) 6 | 7 | 8 | class variable_scope: 9 | def __enter__(self): 10 | self.mark = len(variable_stack) 11 | 12 | def __exit__(self, *args): 13 | del variable_stack[self.mark:] 14 | 15 | 16 | def push_variable(name, value): 17 | global variable_stack 18 | verbose_print(lambda: u'Pushing variable onto stack: let ${0} := {1}'.format(name, debug_dump_anything(value))) 19 | variable_stack.append((name, value)) 20 | 21 | 22 | def value_of_variable(name): 23 | if len(variable_stack) > 0: 24 | for index in range(len(variable_stack) - 1, -1, -1): 25 | if variable_stack[index][NAME] == name: 26 | reverse_index = len(variable_stack) - (index + 1) 27 | verbose_print('Variable "${0}" found on stack (position {1}).'.format(name, reverse_index)) 28 | 
return variable_stack[index][VALUE] 29 | 30 | verbose_print('Variable "${0}" NOT FOUND on variable stack.'.format(name)) 31 | return None 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Richard B. Winslow 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
def evaluate_across_contexts(node_set, expression_fn):
    """Evaluate ``expression_fn`` once per node in ``node_set`` and merge the results.

    Each node becomes the context node in turn, with a 1-based ``position`` and
    ``size`` equal to the length of ``node_set``. The per-node results are
    flattened into one list and passed through ``make_node_set``.

    Raises ``HqueryEvaluationError`` (via ``must_be_node_set``) when
    ``node_set`` is not a node set.
    """
    HqueryEvaluationError.must_be_node_set(node_set)

    node_set_len = len(node_set)
    ragged = [evaluate_in_context(node, expression_fn, position=index + 1, size=node_set_len)
              for index, node in enumerate(node_set)]
    return make_node_set([item for sublist in ragged for item in sublist])


def evaluate_in_context(node, expression_fn, position=1, size=1, preserve_space=None):
    """Call ``expression_fn`` with ``node`` pushed as the current context.

    The context is popped again whether ``expression_fn`` returns or raises, so
    a failed evaluation does not leave its context behind for later callers.

    Raises ``HqueryEvaluationError`` when ``node`` is not a node.
    """
    if not is_any_node(node):
        raise HqueryEvaluationError('cannot use {0} "{1}" as context node'.format(type(node),
                                                                                  debug_dump_long_string(str(node))))
    # Push outside the try block: if the push itself fails there is nothing to pop.
    push_context(node, position, size, preserve_space)
    try:
        return expression_fn()
    finally:
        pop_context()
one
14 |
two
15 |

three

""" 16 | assert query_html_doc(html_body, '//div | //p') == expected_result(""" 17 |
18 | one 19 |
20 |
21 | two 22 |
23 |

24 | three 25 |

""") 26 | 27 | 28 | def test_union_operator_produces_node_set_sorted_in_document_order(): 29 | html_body = """ 30 |
one
31 |

two

32 |
three
""" 33 | assert query_html_doc(html_body, '//p | //div') == expected_result(""" 34 |
35 | one 36 |
37 |

38 | two 39 |

40 |
41 | three 42 |
""") 43 | -------------------------------------------------------------------------------- /test/hquery/test_union_decomposition.py: -------------------------------------------------------------------------------- 1 | 2 | from test.common_test_util import expected_result 3 | from test.hquery.hquery_test_util import query_html_doc 4 | 5 | 6 | def test_union_decomposition_with_parentheses(): 7 | html_body = """ 8 |

heading

9 |

content

10 |

another heading

""" 11 | assert query_html_doc(html_body, '(//h1 | //p) => ("fizz" | "buzz")') == expected_result(""" 12 | fizz 13 | buzz 14 | fizz""") 15 | 16 | 17 | def test_union_decomposition_naked(): 18 | html_body = """ 19 |

heading

20 |

content

21 |

another heading

""" 22 | assert query_html_doc(html_body, '(//h1 | //p) => `h1 $_` | `p $_`') == expected_result(""" 23 | h1 heading 24 | p content 25 | h1 another heading""") 26 | 27 | 28 | def test_union_decomposition_applies_first_matching_clause(): 29 | html_body = """ 30 |
div1
31 |

p1

32 |
33 |

p2

# Sentinel meaning "every member is a node". A plain False default cannot be told
# apart from a falsy non-node member such as 0, '' or an object that is falsy.
_ALL_NODES = object()


def make_node_set(node_set, reverse=False):
    """Build a node set: a list of distinct nodes sorted by ``hq_doc_index``.

    A value that is not a list is wrapped in a one-item list. Duplicates are
    removed by object identity, keeping the first occurrence, and the
    remaining nodes are sorted on their ``hq_doc_index`` attribute, in
    descending order when ``reverse`` is true.

    Raises ``HqueryEvaluationError`` when any member is not a node, including
    members that are falsy.
    """
    if not isinstance(node_set, list):
        node_set = [node_set]

    non_node_member = next(filterfalse(is_any_node, node_set), _ALL_NODES)
    if non_node_member is not _ALL_NODES:
        format_str = 'Constructed node set that includes {0} object "{1}"'
        raise HqueryEvaluationError(format_str.format(object_type_name(non_node_member), non_node_member))

    seen_ids = set()
    unique_nodes = []
    for node in node_set:
        node_id = id(node)
        if node_id not in seen_ids:
            seen_ids.add(node_id)
            unique_nodes.append(node)

    return sorted(unique_nodes, key=lambda n: n.hq_doc_index, reverse=reverse)


def make_sequence(sequence):
    """Return ``sequence`` unchanged if it is a list, otherwise wrap it in a one-item list."""
    if not isinstance(sequence, list):
        sequence = [sequence]
    return sequence


def sequence_concat(first, second):
    """Return a new list holding the items of ``first`` followed by those of ``second``.

    Either argument may be a single value instead of a list. Neither argument
    is modified, so a list that is still referenced elsewhere is not extended
    behind the caller's back.
    """
    return make_sequence(first) + make_sequence(second)
# Read the version from hq/hq.py so that it is declared in only one place.
# The pattern is a raw string: "\s" in a plain string literal is an invalid
# escape sequence.
with open('hq/hq.py') as version_source:
    version_match = re.search(r"^__version__\s*=\s*'(.*)'", version_source.read(), re.M)
if version_match is None:
    raise RuntimeError("Unable to find a __version__ = '...' line in hq/hq.py")
version = version_match.group(1)


# Fall back to a one-line description when the README is not available.
long_description = 'Powerful HTML slicing and dicing at the command line.'
if os.path.exists('README.md'):
    with open('README.md', 'rb') as f:
        long_description = f.read().decode('utf-8')


classifiers = [
    'Development Status :: 2 - Pre-Alpha',
    'Intended Audience :: Developers',
    'Environment :: Console',
    'License :: OSI Approved :: MIT License',
    'Programming Language :: Python',
    'Topic :: Text Processing :: Markup :: HTML',
]


setup(name='hq',
      packages=find_packages(exclude=['test']),
      entry_points={'console_scripts': ['hq = hq.hq:main']},
      version=version,
      description='Command-line tool for querying, slicing & dicing HTML using the XPath/XQuery derivative HQuery.',
      long_description=long_description,
      author='Richard B. Winslow',
      author_email='richard.b.winslow@gmail.com',
      license='MIT',
      url='https://github.com/rbwinslow/hq',
      keywords='html xpath query xquery hquery jq cmdline cli',
      classifiers=classifiers,
      install_requires=['beautifulsoup4', 'docopt', 'wheel'])
'(1, 2 to 4)') == expected_result(""" 29 | 1 30 | 2 31 | 3 32 | 4""") 33 | 34 | 35 | def test_sequences_collapse(): 36 | assert query_html_doc('', '(1, (2, 3), 4)') == expected_result(""" 37 | 1 38 | 2 39 | 3 40 | 4""") 41 | 42 | 43 | def test_string_value_of_a_sequence_is_concatenation_of_all_items_unlike_node_set(): 44 | html_body = """ 45 |

one

46 |

class Axis(Enum):
    """Axes that a location step can name: the standard ones plus ``css_class``.

    Member names use underscores; ``token`` gives the hyphenated spelling and
    ``canonicalize`` maps a hyphenated name or an abbreviation back to a
    member name.
    """
    # standard
    ancestor = 1
    ancestor_or_self = 2
    attribute = 3
    child = 4
    descendant = 5
    descendant_or_self = 6
    following = 7
    following_sibling = 8
    parent = 9
    preceding = 10
    preceding_sibling = 11
    self = 12
    # extended
    css_class = 13

    def is_reverse_order(self):
        """Return True when this axis is one of ``reverse_order_axes``."""
        return self in reverse_order_axes

    def token(self):
        """Return the member name with underscores replaced by hyphens."""
        return self.name.replace('_', '-')

    @classmethod
    def abbreviations(cls):
        """Return the abbreviated axis spellings (the keys of the abbreviation table)."""
        return _abbreviations.keys()

    @classmethod
    def canonicalize(cls, name):
        """Translate an abbreviation or hyphenated axis name into a member name.

        Known abbreviations map to their full member name; any other name is
        returned with hyphens replaced by underscores. The result is a string
        and is not checked against the actual members.
        """
        return _abbreviations.get(name, name.replace('-', '_'))


# Abbreviated spelling -> Axis member name.
_abbreviations = {
    '^': 'ancestor',
    '^^': 'ancestor_or_self',
    '@': 'attribute',
    '.': 'css_class',
    'class': 'css_class',
    '~': 'descendant',
    '>>': 'following',
    '>': 'following_sibling',
    '<<': 'preceding',
    '<': 'preceding_sibling',
}


# Axes for which is_reverse_order() answers True.
reverse_order_axes = {Axis.ancestor, Axis.ancestor_or_self, Axis.preceding, Axis.preceding_sibling}
test_tolerates_latin_characters_in_element_contents(capsys, mocker): 6 | mocker.patch('hq.hq.docopt').return_value = simulate_args_dict(expression='//div') 7 | mocker.patch('sys.stdin.read').return_value = wrap_html_body(u""" 8 |
9 | T\xeate\xa0\xe0\xa0t\xeate 10 |
""") 11 | 12 | main() 13 | 14 | actual, _ = capture_console_output(capsys) 15 | assert actual == expected_result(u""" 16 |
17 | T\xeate\xa0\xe0\xa0t\xeate 18 |
""") 19 | 20 | 21 | def test_tolerates_latin_characters_in_attribute_contents(capsys, mocker): 22 | mocker.patch('hq.hq.docopt').return_value = simulate_args_dict(expression='//div/@role') 23 | mocker.patch('sys.stdin.read').return_value = wrap_html_body(u""" 24 |
25 |
def name(*args):
    """Return the tag name of a node, or '' when there is no tag to name.

    With no arguments the current context node is examined. Otherwise the
    first argument is examined; when it is a sequence, its first item is
    used. Any further arguments are ignored.

    An empty sequence yields '' rather than raising ``IndexError``, matching
    the XPath rule that ``name()`` of an empty node-set is the empty string.
    """
    if len(args) > 0:
        value = args[0]
        if is_sequence(value):
            if len(value) == 0:
                # Nothing to take the name of.
                return ''
            value = value[0]
    else:
        value = get_context_node()

    # Only tag nodes carry a name; everything else maps to ''.
    return value.name if is_tag_node(value) else ''
class ExpressionContext:
    """One entry of the expression context stack.

    Holds the context node together with its position and the context size.
    When ``preserve_space`` is not supplied, the value is inherited from the
    context currently on top of the stack, or defaults to False when the
    stack is empty.
    """

    def __init__(self, node, position=1, size=1, preserve_space=None):
        self.node, self.position, self.size = node, position, size

        if preserve_space is None:
            # Inherit from the enclosing context, if there is one.
            try:
                preserve_space = peek_context().preserve_space
            except ExpressionStackEmptyError:
                preserve_space = False
        self.preserve_space = preserve_space

    def __str__(self):
        node_text = str(self.node)
        return 'context(node={0})'.format(node_text)
class ComputedHtmlAttributeConstructor:
    """Builds an attribute node whose value is computed from an expression.

    The content expression is supplied once through ``set_content`` and is
    invoked by ``evaluate``; each item it yields contributes one piece of
    text to the attribute value.
    """

    def __init__(self, name):
        self.name = name
        self.contents = None

    def set_content(self, expression_fn):
        """Install the content expression; a second call is a syntax error."""
        if self.contents is not None:
            raise HquerySyntaxError('Computed attribute constructor already has contents')
        self.contents = expression_fn

    def evaluate(self):
        """Evaluate the content expression and return the resulting AttributeNode.

        Strings, numbers and booleans contribute ``str(item)``, attribute
        nodes contribute their value, and tag nodes contribute their string
        value. Anything else raises HqueryEvaluationError.
        """
        items = [] if self.contents is None else make_sequence(self.contents())
        text = ''

        for item in items:
            if is_string(item) or is_number(item) or is_boolean(item):
                piece = str(item)
            elif is_attribute_node(item):
                piece = item.value
            elif is_tag_node(item):
                piece = string_value(item)
            else:
                item_desc = debug_dump_node(item) if is_any_node(item) else object_type_name(item)
                raise HqueryEvaluationError(
                    'Cannot use {0} as a content object in a computed attribute constructor'.format(item_desc)
                )
            text = self._append_to_contents(text, piece)

        return AttributeNode(self.name, text)

    def _append_to_contents(self, so_far, more_content):
        """Join ``more_content`` onto ``so_far``, space-separated unless ``so_far`` is empty."""
        separator = ' ' if len(so_far) > 0 else ''
        return '{0}{1}{2}'.format(so_far, separator, more_content)
class FunctionSupport:
    """Loads the function modules in the ``functions`` directory and dispatches calls to them.

    Every module found in the ``functions`` directory next to this file is
    loaded on first use. A module publishes callables by listing their
    attribute names in a module-level ``exports`` iterable; trailing
    underscores are stripped from those names when they are registered.
    """

    # Registry of callable name -> function/class; None until first loaded.
    all_functions = None

    def call_function(self, name, *args):
        """Call the registered function ``name`` with ``args`` and return its result.

        Hyphens in ``name`` are translated to underscores before lookup.

        Raises:
            HqueryEvaluationError: if no such function is registered, or if
                the call fails with a TypeError whose message looks like an
                argument-count complaint. Other TypeErrors propagate as-is.
        """
        self._load_all_functions()

        py_name = name.replace('-', '_')

        try:
            fn = self.all_functions[py_name]
        except KeyError:
            raise HqueryEvaluationError('Unknown function name "{0}"'.format(name))

        try:
            return fn(*args)
        except TypeError as err:
            # str(err) is safe even when the TypeError carries no args or a
            # non-string first arg, unlike indexing err.args[0] directly.
            message = str(err)
            if re.search(r'\d+ (?:.+ )?argument', message):
                raise HqueryEvaluationError(message)
            else:
                raise

    def _load_all_functions(self):
        """Populate ``self.all_functions`` from the function modules, once.

        Raises:
            RuntimeError: if a module exports something that is neither a
                class nor a function.
        """
        if self.all_functions is not None:
            return

        # Imported locally so this block brings its own dependency into scope.
        import importlib.util

        # Build into a local dict and publish only on success, so a failure
        # part-way through does not leave a partial registry that later calls
        # would treat as complete.
        loaded = dict()
        my_package_dir = os.path.dirname(__file__)
        verbose_print('FunctionSupport loading all function modules in {0}.'.format(my_package_dir),
                      indent_after=True)

        for finder, modname, ispkg in iter_modules([os.path.join(my_package_dir, 'functions')]):
            verbose_print('Found candidate module {0} -- loading.'.format(modname))
            # Finder.find_module() was removed in Python 3.12; the spec-based
            # API is the supported way to load a module from a finder.
            spec = finder.find_spec(modname)
            module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(module)

            if hasattr(module, 'exports'):
                exports = {name.rstrip('_'): getattr(module, name) for name in getattr(module, 'exports')}
                verbose_print('Module {0} exports are: {1}'.format(modname, exports.keys()))
                if any(not (isclass(obj) or isfunction(obj)) for obj in exports.values()):
                    raise RuntimeError('Non-class/function export(s) loaded from module {0}'.format(modname))
                loaded.update(exports)
            else:
                verbose_print('Module {0} defined no exports.'.format(modname))

        verbose_print('Finished loading function modules.', outdent_before=True)
        self.all_functions = loaded
-------------------------------------------------------------------------------- /hq/hquery/union_decomposition.py: -------------------------------------------------------------------------------- 1 | from hq.hquery.evaluation_error import HqueryEvaluationError 2 | from hq.hquery.object_type import debug_dump_anything 3 | from hq.hquery.sequences import make_sequence, sequence_concat 4 | from hq.hquery.variables import push_variable, variable_scope 5 | from hq.verbosity import verbose_print 6 | 7 | 8 | class UnionDecomposition: 9 | 10 | def __init__(self): 11 | self.mapping_generators = None 12 | self.union_expression = None 13 | 14 | 15 | def __str__(self): 16 | union_str = ' | '.join('' * len(self.mapping_generators)) 17 | return '{0} => {0}'.format(union_str) 18 | 19 | 20 | def evaluate(self): 21 | verbose_print('Evaluating union decomposition ({} clauses)'.format(len(self.mapping_generators)), 22 | indent_after=True) 23 | 24 | sequence = make_sequence(self.union_expression()) 25 | result = [] 26 | 27 | for item in sequence: 28 | verbose_print(lambda: u'Visiting item {0}'.format(debug_dump_anything(item)), indent_after=True) 29 | 30 | with variable_scope(): 31 | push_variable('_', make_sequence(item)) 32 | if not hasattr(item, 'union_index'): 33 | raise HqueryEvaluationError( 34 | "Union decomposition applied to something that wasn't produced by a union" 35 | ) 36 | if item.union_index >= len(self.mapping_generators): 37 | raise HqueryEvaluationError("Decomposed union had more clauses than its mapping") 38 | this_result = make_sequence(self.mapping_generators[item.union_index]()) 39 | verbose_print( 40 | 'Mapping yielded {0} results for this visit'.format( 41 | len(this_result))) 42 | result = sequence_concat(result, this_result) 43 | 44 | verbose_print('Visit finished', outdent_before=True) 45 | 46 | verbose_print('Union decomposition completed', outdent_before=True) 47 | return result 48 | 49 | 50 | def set_mapping_generators(self, mgs): 51 | 
self.mapping_generators = mgs 52 | 53 | 54 | def set_union_expression(self, ug): 55 | self.union_expression = ug 56 | -------------------------------------------------------------------------------- /test/hquery/test_arithmetic_operators.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | sys.path.insert(0, os.path.abspath('../..')) 5 | 6 | from ..common_test_util import expected_result 7 | from test.hquery.hquery_test_util import query_html_doc 8 | 9 | 10 | def test_the_sum_of_decimals_is_a_decimal(): 11 | assert query_html_doc('', '90+8.6') == expected_result('98.6') 12 | assert query_html_doc('', '-0.2 + 0.1') == expected_result('-0.1') 13 | 14 | 15 | def test_the_sum_of_integers_is_an_integer(): 16 | assert query_html_doc('', '40+2') == expected_result('42') 17 | assert query_html_doc('', '-1 + 1') == expected_result('0') 18 | 19 | 20 | def test_integer_result_of_adding_decimals_is_an_integer(): 21 | assert query_html_doc('', '41.5 + 0.5') == expected_result('42') 22 | 23 | 24 | def test_subtraction_operator(): 25 | assert query_html_doc('', '43.5 - 1.5') == expected_result('42') 26 | 27 | 28 | def test_multiplication_operator(): 29 | assert query_html_doc('', '3 * 3.1') == expected_result('9.3') 30 | 31 | 32 | def test_div_operator(): 33 | assert query_html_doc('', '6div2') == expected_result('3') 34 | 35 | 36 | def test_mod_operator(): 37 | assert query_html_doc('', '11 mod 5') == expected_result('1') 38 | 39 | 40 | def test_interpretation_of_div_and_mod_and_other_arithmetic_operators_as_operators_vs_node_tests(): 41 | div = """ 42 |
43 |
""" 44 | mod = """ 45 | 46 | """ 47 | 48 | assert query_html_doc(div, 'div', wrap_body=False) == expected_result(div) 49 | assert query_html_doc(mod, '/ mod', wrap_body=False) == expected_result(mod) 50 | assert query_html_doc(div, 'boolean(div)', wrap_body=False) == 'true' 51 | assert query_html_doc(mod, 'boolean(div)', wrap_body=False) == 'false' 52 | 53 | div_with_text = '
bar
class ComputedJsonArrayConstructor:
    """Builds a JsonArray from the items produced by a content expression."""

    def __init__(self):
        self.contents = None

    def set_contents(self, expression_fn):
        """Install the content expression; a second call is a syntax error."""
        if self.contents is not None:
            raise HquerySyntaxError('computed JSON array constructor already has contents')
        self.contents = expression_fn

    def evaluate(self):
        """Evaluate the content expression and return the resulting JsonArray.

        A constructor that was never given contents produces an empty array,
        mirroring how the computed HTML constructors treat missing contents.
        """
        items = make_sequence(self.contents()) if self.contents is not None else []
        return JsonArray([self._make_array_item(item) for item in items])

    def _make_array_item(self, value):
        """Convert one evaluated item into the plain value stored in the array.

        Tag nodes, text nodes and strings become their string value; booleans
        and numbers contribute their ``.value``; hashes contribute their
        ``.contents``.

        Raises:
            HqueryEvaluationError: for any other kind of item.
        """
        if is_tag_node(value):
            self._gab(lambda: 'appending text contents of element "{0}" to array'.format(debug_dump_anything(value)))
            return string_value(value)
        elif is_text_node(value) or is_string(value):
            value = string_value(value)
            self._gab(lambda: u'appending text "{0}" to array'.format(debug_dump_anything(value)))
            return value
        elif is_boolean(value) or is_number(value):
            self._gab(lambda: 'appending {0} to array'.format(debug_dump_anything(value)))
            return value.value
        elif is_hash(value):
            self._gab(lambda: u'appending JSON {0} to array'.format(debug_dump_anything(value)))
            return value.contents
        else:
            raise HqueryEvaluationError("Can't use {0} as contents in a computed JSON array constructor".format(
                debug_dump_anything(value)))

    def _gab(self, message):
        """Emit a verbose-mode message; ``message`` may be a string or a zero-argument callable.

        Callers pass lambdas so the text is built only when needed. The
        callable must be invoked here, otherwise the log would show the
        function object's repr instead of the intended text.
        """
        verbose_print(
            lambda: 'JSON array constructor {0}'.format(message() if callable(message) else message)
        )
def tokenize(*args):
    """Split a string on a regular expression and return the pieces.

    Arguments (positional): the value to split (converted with
    ``string_value``), the regular expression pattern, and optionally a
    string of regular expression flag letters.

    Raises:
        HqueryEvaluationError: if called with fewer than 2 or more than 3
            arguments, or if an unknown flag letter is supplied.
    """
    argc = len(args)
    if argc < 2 or argc > 3:
        # The message previously named replace(), a copy-paste slip.
        raise HqueryEvaluationError('tokenize() expects 2 or 3 arguments; was passed {0}'.format(argc))

    text = string_value(args[0])
    pattern = args[1]
    flags = _xpath_flags_to_re_flags(args[2]) if argc == 3 else 0

    return re.split(pattern, text, flags=flags)
| from test.hquery.hquery_test_util import query_html_doc 3 | 4 | 5 | def test_simple_element_construction_with_string_content(): 6 | assert query_html_doc('', 'element foo { "bar" }') == expected_result(""" 7 | 8 | bar 9 | """) 10 | 11 | 12 | def test_element_constructor_accepts_numbers_and_booleans(): 13 | assert query_html_doc('', 'element test { 98.6 }') == expected_result(""" 14 | 15 | 98.6 16 | """) 17 | 18 | assert query_html_doc('', 'element test { false() }') == expected_result(""" 19 | 20 | false 21 | """) 22 | 23 | 24 | def test_construction_of_elements_containing_content_queried_from_original_document(): 25 | html_body = """ 26 |
27 |

Hello, world!

28 |
other div
29 |
""" 30 | assert query_html_doc(html_body, 'element hello { //div }') == expected_result(""" 31 | 32 |
33 |

34 | Hello, world! 35 |

36 |
37 | other div 38 |
39 |
40 |
41 | other div 42 |
43 |
""") 44 | 45 | 46 | def test_element_constructor_accepts_attributes_from_original_document_including_multi_values_like_classes(): 47 | html_body = """ 48 |

49 | contents 50 |

""" 51 | 52 | assert query_html_doc(html_body, 'element test { //p/@* }') == expected_result(""" 53 | 54 | """) 55 | 56 | assert query_html_doc(html_body, 'element test { //p/@three, //p }') == expected_result(""" 57 | 58 |

59 | contents 60 |

61 |
""") 62 | 63 | 64 | def test_element_constructor_can_be_nested(): 65 | assert query_html_doc('', 'element moe {element larry {}, element curly {"Hey, Moe!"}}') == expected_result(""" 66 | 67 | 68 | 69 | 70 | Hey, Moe! 71 | 72 | """) 73 | 74 | 75 | def test_attribute_constructor_adds_attributes_to_an_element(): 76 | html_body = '

' 77 | assert query_html_doc(html_body, 'element model { attribute name { //p[1]/@ng-bind } }') == expected_result(""" 78 | 79 | """) 80 | 81 | 82 | def test_attribute_constructor_takes_string_value_of_tag_nodes_in_content_sequence_and_separates_with_spaces(): 83 | html_body = '

Easy as

' 84 | assert query_html_doc(html_body, 'element foo {attribute bar {//p, 1 to 3}}') == expected_result(""" 85 | 86 | """) 87 | -------------------------------------------------------------------------------- /hq/hquery/functions/core_string.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from hq.hquery.evaluation_error import HqueryEvaluationError 4 | from hq.hquery.expression_context import get_context_node 5 | from hq.hquery.functions.core_boolean import boolean 6 | from hq.hquery.functions.core_number import number 7 | from hq.hquery.functions.extend_string import _xpath_flags_to_re_flags 8 | from hq.hquery.object_type import string_value, normalize_content, is_string 9 | 10 | exports = ('concat', 'contains', 'normalize_space', 'starts_with', 'string', 'string_length', 'substring', 11 | 'substring_after', 'substring_before') 12 | 13 | 14 | def concat(*args): 15 | return ''.join(string_value(arg) for arg in args) 16 | 17 | 18 | def contains(*args): 19 | argc = len(args) 20 | if argc < 2 or argc > 3: 21 | raise HqueryEvaluationError('contains() function expects two or three arguments; {0} passed'.format(argc)) 22 | if argc == 3: 23 | flags = args[2] 24 | else: 25 | flags = '' 26 | 27 | pattern = re.escape(string_value(args[1])) 28 | to_search = string_value(args[0]) 29 | return boolean(bool(re.search(pattern, to_search, flags=_xpath_flags_to_re_flags(flags)))) 30 | 31 | 32 | def normalize_space(*args): 33 | if len(args) == 1: 34 | return normalize_content(args[0]) 35 | else: 36 | return normalize_content(get_context_node()) 37 | 38 | 39 | def starts_with(left, right): 40 | return boolean(string_value(left).startswith(string_value(right))) 41 | 42 | 43 | def string(*args): 44 | if len(args) == 1: 45 | return string_value(args[0]) 46 | else: 47 | return string_value(get_context_node()) 48 | 49 | 50 | def string_length(*args): 51 | value = args[0] if len(args) == 1 else string_value(get_context_node()) 52 | 
def substring(*args):
    """Return part of a string, using 1-based, rounded XPath-style positions.

    Arguments (positional): the source value, a start position and an
    optional length. The start and length are read from the ``.value``
    attribute of their arguments and rounded to integers.
    NOTE(review): the source value is sliced directly, so it is assumed to
    already be a string -- confirm against callers.

    Without a length, everything from the start position to the end of the
    value is returned. Positions before the beginning of the value are
    clamped rather than wrapping around to the tail.

    Raises:
        HqueryEvaluationError: if fewer than 2 arguments are passed.
    """
    if len(args) < 2:
        raise HqueryEvaluationError('substring() expects at least 2 arguments; {0} were passed'.format(len(args)))

    value = args[0]
    start = int(round(args[1].value)) - 1  # convert 1-based to 0-based
    if len(args) >= 3:
        end = start + int(round(args[2].value))
    else:
        end = len(value)

    # Clamp both bounds at zero: a negative slice bound would otherwise be
    # interpreted relative to the end of the string.
    return value[max(start, 0):max(end, 0)]


def substring_after(first, second):
    """Return the part of ``first`` that follows the first occurrence of ``second``.

    Both arguments are converted with ``string_value``. Returns '' when
    ``second`` does not occur in ``first``.
    """
    first = string_value(first)
    second = string_value(second)
    index = first.find(second)
    if index < 0:
        return ''
    # Skip the whole separator, not just its first character.
    return first[index + len(second):]


def substring_before(first, second):
    """Return the part of ``first`` that precedes the first occurrence of ``second``.

    Both arguments are converted with ``string_value``. Returns '' when
    ``second`` does not occur in ``first``.
    """
    first = string_value(first)
    second = string_value(second)
    index = first.find(second)
    if index < 0:
        return ''
    return first[:index]
23 | --version Display the installed HQ version. 24 | 25 | HTML is read from stdin. 26 | 27 | """ 28 | 29 | from docopt import docopt 30 | 31 | from .hquery.evaluation_error import HqueryEvaluationError 32 | from .hquery.hquery_processor import HqueryProcessor, HquerySyntaxError 33 | from .output import convert_results_to_output_text 34 | from .soup_util import make_soup 35 | from .verbosity import verbose_print, set_verbosity 36 | 37 | __version__ = '0.0.4' 38 | 39 | 40 | def main(): 41 | from sys import stderr, stdin # So py.tests have a chance to hook stdout & stderr 42 | 43 | args = docopt(__doc__, version='HQ {0}'.format(__version__)) 44 | preserve_space = bool(args['--preserve']) 45 | set_verbosity(bool(args['--verbose'])) 46 | 47 | try: 48 | if args['--file']: 49 | with open(args['--file']) as file: 50 | source = file.read() 51 | else: 52 | source = stdin.read() 53 | verbose_print('Read {0} characters of input'.format(len(source))) 54 | soup = make_soup(source) 55 | 56 | if args['--program']: 57 | with open(args['--program']) as file: 58 | expression = file.read() 59 | else: 60 | expression = args[''] 61 | if len(expression) > 0: 62 | result = HqueryProcessor(expression, preserve_space).query(soup) 63 | else: 64 | result = [soup] 65 | 66 | print(convert_results_to_output_text(result, pretty=(not args['--ugly']), preserve_space=preserve_space)) 67 | 68 | except HquerySyntaxError as error: 69 | print('\nSYNTAX ERROR: {0}\n'.format(str(error)), file=stderr) 70 | except HqueryEvaluationError as error: 71 | print('\nQUERY ERROR: {0}\n'.format(str(error)), file=stderr) 72 | 73 | 74 | if __name__ == '__main__': 75 | main() 76 | -------------------------------------------------------------------------------- /test/hquery/test_node_tests.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import sys 4 | 5 | sys.path.insert(0, os.path.abspath('../..')) 6 | 7 | from ..common_test_util import expected_result 8 | 
from .hquery_test_util import query_html_doc 9 | 10 | 11 | def test_any_node_test_selects_all_node_types(): 12 | html_body = """Has he ever whaled it any?

13 |

14 | 15 |

""" 16 | assert query_html_doc(html_body, '/html/body/node()') == expected_result(""" 17 | Has he ever whaled it any? 18 |

19 |

20 | 21 |

22 |

""") 23 | 24 | 25 | def test_tag_node_test_selects_tag_children_but_not_other_stuff(): 26 | html_body = """ 27 |

28 | 29 | Has he ever whaled it any? 30 |

""" 31 | actual = query_html_doc(html_body, '/html/body/*') 32 | assert actual == expected_result(""" 33 |

34 |

35 |

36 |

""") 37 | 38 | 39 | def test_tag_node_test_selects_descendants(): 40 | html_body = """ 41 | 42 |
43 |

text

44 |
""" 45 | actual = query_html_doc(html_body, '/html/body/descendant::*') 46 | assert actual == expected_result(""" 47 |
48 |

49 | text 50 |

51 |
52 |

53 | text 54 |

""") 55 | 56 | 57 | def test_tag_node_test_selects_parent(): 58 | html_body = """ 59 |
60 |
61 |

62 |
63 |
""" 64 | actual = query_html_doc(html_body, '/html/body/section/div/p/parent::*') 65 | assert actual == expected_result(""" 66 |
67 |

68 |

69 |
""") 70 | 71 | 72 | def test_tag_node_test_selects_ancestors(): 73 | html_body = """ 74 |
75 |

76 |
""" 77 | actual = query_html_doc(html_body, '/html/body/div/p/ancestor::*') 78 | assert actual == expected_result(""" 79 | 80 | 81 |
82 |

83 |

84 |
85 | 86 | 87 | 88 |
89 |

90 |

91 |
92 | 93 |
94 |

95 |

96 |
""") 97 | 98 | 99 | def test_text_node_test_selects_disjoint_text_nodes(): 100 | html_body = """

onetwothree

""" 101 | actual = query_html_doc(html_body, '/html/body/p/text()') 102 | assert actual == expected_result(""" 103 | one 104 | three""") 105 | 106 | 107 | def test_comment_node_test_selects_comments(): 108 | html_body = """ 109 | 110 |
111 | 112 |
""" 113 | assert query_html_doc(html_body, '//comment()') == expected_result(""" 114 | 115 | """) 116 | -------------------------------------------------------------------------------- /test/hquery/test_equality_operators.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | sys.path.insert(0, os.path.abspath('../..')) 5 | 6 | from ..common_test_util import expected_result 7 | from test.hquery.hquery_test_util import query_html_doc 8 | 9 | 10 | def test_node_set_equality_is_based_on_text_contents(): 11 | html_body = """ 12 |

foo

13 |
foo
""" 14 | actual = query_html_doc(html_body, '//p = //div') 15 | assert actual == expected_result('true') 16 | 17 | 18 | def test_node_sets_are_equal_if_string_value_of_any_one_node_matches_string_value_of_any_from_other_set(): 19 | html_body = """ 20 |
21 | one 22 | two 23 |
24 |

25 | two 26 | three 27 |

""" 28 | actual = query_html_doc(html_body, '//div/span = //p/span') 29 | assert actual == expected_result('true') 30 | 31 | 32 | def test_equals_operator_compares_numbers(): 33 | actual = query_html_doc('', '2.0 != 2.1') 34 | assert actual == expected_result('true') 35 | 36 | 37 | def test_equals_operator_interprets_integer_and_fractional_numbers_correctly(): 38 | actual = query_html_doc('', '101.0 != 101') 39 | assert actual == expected_result('false') 40 | 41 | 42 | def test_equals_operator_compares_string_value_of_node_converted_to_number_with_number(): 43 | actual = query_html_doc('

042.0

', '//p = 42') 44 | assert actual == expected_result('true') 45 | 46 | 47 | def test_equals_operator_compares_boolean_coercion_of_node_set_with_boolean(): 48 | html_body = '

' 49 | actual = query_html_doc(html_body, '//p = false()') 50 | assert actual == expected_result('false') 51 | 52 | 53 | def test_equals_operator_compares_text_node_contents_with_string(): 54 | html_body = """ 55 |
56 |

one

57 |
58 |
59 |

two

60 |
""" 61 | actual = query_html_doc(html_body, '/html/body/div[p/text() = "two"]') 62 | assert actual == expected_result(""" 63 |
64 |

65 | two 66 |

67 |
""") 68 | 69 | 70 | def test_equals_operator_converts_non_node_sets_to_boolean_when_comparing_to_a_boolean(): 71 | assert query_html_doc('', '1 = true()') == expected_result('true') 72 | assert query_html_doc('', '0 != false()') == expected_result('false') 73 | assert query_html_doc('', '"" = false()') == expected_result('true') 74 | assert query_html_doc('', '" " = true()') == expected_result('true') 75 | 76 | 77 | def test_equals_operator_converts_non_node_sets_to_number_when_comparing_to_a_number(): 78 | assert query_html_doc('', '0.1 = "0"') == expected_result('false') 79 | assert query_html_doc('', '"42" = 42.0') == expected_result('true') 80 | assert query_html_doc('', '"foo" = 0') == expected_result('false') # It's NaN, not zero. 81 | 82 | 83 | def test_equals_operator_works_with_node_sets_containing_attributes(): 84 | html_body = """ 85 |
86 |
""" 87 | assert query_html_doc(html_body, '//div/attribute::id = "two"') == expected_result('true') 88 | -------------------------------------------------------------------------------- /test/hquery/test_relational_operators.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | sys.path.insert(0, os.path.abspath('../..')) 5 | 6 | from ..common_test_util import expected_result 7 | from test.hquery.hquery_test_util import query_html_doc 8 | 9 | 10 | def test_relational_comparison_of_numbers(): 11 | assert query_html_doc('', '1.01>1') == 'true' 12 | assert query_html_doc('', '1 > 2') == 'false' 13 | assert query_html_doc('', '2>2') == 'false' 14 | 15 | assert query_html_doc('', '1 < 2') == 'true' 16 | assert query_html_doc('', '2<1.9999') == 'false' 17 | assert query_html_doc('', '42 <42') == 'false' 18 | 19 | assert query_html_doc('', '3>=3') == 'true' 20 | assert query_html_doc('', '3>= 3.01') == 'false' 21 | 22 | assert query_html_doc('', '2 <=2') == 'true' 23 | assert query_html_doc('', '1.999<= 2') == 'true' 24 | assert query_html_doc('', '2.001 <= 2') == 'false' 25 | 26 | 27 | def test_relational_comparison_of_booleans_with_one_another_and_with_other_non_node_set_primitives(): 28 | assert query_html_doc('', 'true() <= false()') == 'false' 29 | assert query_html_doc('', 'true() <= 0') == 'false' 30 | assert query_html_doc('', '1 > false()') == 'true' 31 | assert query_html_doc('', 'true() >= 25') == 'true' 32 | assert query_html_doc('', 'true() > "0"') == 'false' 33 | 34 | 35 | def test_relational_comparison_of_numbers_with_non_boolean_non_numeric_primitives_aka_strings(): 36 | assert query_html_doc('', '"5" < 4') == 'false' 37 | assert query_html_doc('', '5 > "4"') == 'true' 38 | assert query_html_doc('', '"foo" >= 1') == 'false' 39 | 40 | 41 | def test_relational_comparison_of_non_boolean_non_numeric_primitives_aka_strings_with_one_another(): 42 | assert query_html_doc('', '"low" > "high"') == 
'true' 43 | assert query_html_doc('', '"1.0" >= "1.1"') == 'false' 44 | assert query_html_doc('', '"1.1" >= "1.1"') == 'true' 45 | 46 | 47 | def test_relational_comparison_involving_two_node_sets(): 48 | html_body = """ 49 |

9

50 |

10

51 |
10
52 |
11
""" 53 | 54 | assert query_html_doc(html_body, '//p > //div') == 'false' 55 | assert query_html_doc(html_body, '//p >= //div') == 'true' 56 | assert query_html_doc(html_body, '//div[position()=1] <= //p') == 'true' 57 | 58 | 59 | def test_relational_comparison_between_a_node_set_and_a_number(): 60 | html_body = """ 61 |
9.9
62 |
10.1
""" 63 | assert query_html_doc(html_body, '//div > 10') == 'true' 64 | assert query_html_doc(html_body, '10.1 < //div') == 'false' 65 | assert query_html_doc(html_body, '//div <= 9.9') == 'true' 66 | 67 | 68 | def test_relational_comparison_between_a_node_set_and_a_string(): 69 | html_body = """ 70 |
9.9
71 |
10.1
""" 72 | assert query_html_doc(html_body, '//div > "10"') == 'true' 73 | assert query_html_doc(html_body, '"10.1" < //div') == 'false' 74 | assert query_html_doc(html_body, '//div <= "9.9"') == 'true' 75 | 76 | 77 | def test_relational_comparison_between_a_node_set_and_a_boolean_value(): 78 | html_body = """ 79 |
2
80 |
1
""" 81 | assert query_html_doc(html_body, '//div <= false()') == 'false' 82 | assert query_html_doc(html_body, 'true() >= //div') == 'true' 83 | -------------------------------------------------------------------------------- /test/hquery/test_interpolated_strings.py: -------------------------------------------------------------------------------- 1 | from test.common_test_util import expected_result 2 | from test.hquery.hquery_test_util import query_html_doc 3 | 4 | 5 | def test_location_path_works_as_interpolated_string_expression(): 6 | assert query_html_doc("
world
", '`Hello, ${//div/text()}!`') == expected_result('Hello, world!') 7 | 8 | 9 | def test_element_node_becomes_normalized_text_contents_in_interpolated_string(): 10 | html_body = """ 11 |

12 | foo bar 13 |

""" 14 | assert query_html_doc(html_body, '`-->${//p}<--`') == expected_result('-->foo bar<--') 15 | 16 | 17 | def test_text_between_embedded_expressions_gets_picked_up(): 18 | html_body = """ 19 |

one

20 |

two

21 |

three

""" 22 | assert query_html_doc(html_body, 'let $_ := 2 return `${//p[1]}, $_, ${//p[3]}`') == 'one, 2, three' 23 | 24 | 25 | def test_join_filter_joins_string_values_from_node_set(): 26 | html_body = """ 27 |

one

28 |

two

29 |

three

""" 30 | assert query_html_doc(html_body, '`${j:,://p}`') == expected_result('one,two,three') 31 | 32 | 33 | def test_join_filter_defaults_to_empty_string_delimiter(): 34 | html_body = """ 35 |

one

36 |

two

""" 37 | assert query_html_doc(html_body, '`${j:://p}`') == expected_result('onetwo') 38 | 39 | 40 | def test_truncate_filter_elides_contents(): 41 | html_body = '

The quick brown fox jumped over the lazy dog.

' 42 | assert query_html_doc(html_body, '`${tru:23:?://p}`') == expected_result('The quick brown fox?') 43 | 44 | 45 | def test_truncate_filter_defaults_to_no_suffix(): 46 | html_body = '

short, sharp shock

' 47 | assert query_html_doc(html_body, '`${tru:15:://p}`') == expected_result('short, sharp') 48 | 49 | 50 | def test_regex_replace_filter_replaces_stuff_with_other_stuff(): 51 | html_body = 'May 25, 1979' 52 | assert query_html_doc(html_body, r'`${rr:(\w+) (\d+)(, \d+):\2th of \1\3:://span}`') == '25th of May, 1979' 53 | 54 | 55 | def test_use_of_escapes_for_forbidden_characters_in_regex_replace_patterns(): 56 | assert query_html_doc('', r"""`it's ${rr:\w{3}:dog::"a cat's"} life`""") == "it's a dog's life" 57 | assert query_html_doc('', r'`${rr:: ::: let $x := "re: " return concat($x, "search")}`') == 'research' 58 | 59 | 60 | def test_regex_replace_filter_can_be_used_to_replace_unicode_characters(): 61 | assert query_html_doc('', u'`${rr: : :: "non-breaking\u00a0space"}`') == 'non-breaking space' 62 | 63 | 64 | def test_filters_chain_left_to_right(): 65 | html_body = """ 66 |

one

67 |

two

68 |

three

""" 69 | assert query_html_doc(html_body, '`${j:, :tru:12: ...://p} whatever!`') == 'one, two, ... whatever!' 70 | 71 | 72 | def test_character_escape_is_not_prematurely_decoded_in_interpolated_string(): 73 | query = 'let $x := "foo" return `Variable "$x" contains value $x`' 74 | assert query_html_doc('', query) == 'Variable "$x" contains value foo' # Not 'Variable "foo" contains...' 75 | 76 | 77 | def test_filters_are_applied_to_all_items_in_sequence_when_input_is_not_atomic(): 78 | html_body = """ 79 |

Hello, world!

80 |

Goodbye, world!

""" 81 | assert query_html_doc(html_body, '`${tru:8:://p}`') == 'Hello,Goodbye,' 82 | assert query_html_doc(html_body, '`${rr:world:test:://p}`') == 'Hello, test!Goodbye, test!' 83 | -------------------------------------------------------------------------------- /hq/hquery/object_type.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from hq.hquery.expression_context import peek_context 4 | from hq.string_util import truncate_string, is_a_string 5 | 6 | from ..verbosity import verbose_print 7 | from ..soup_util import is_any_node, is_tag_node, is_text_node, is_attribute_node, debug_dump_node, \ 8 | debug_dump_long_string, derive_text_from_node 9 | 10 | 11 | BOOLEAN, SEQUENCE, NUMBER, STRING = range(4) 12 | TYPE_NAMES = ('BOOLEAN', 'SEQUENCE', 'NUMBER', 'STRING') 13 | 14 | 15 | def debug_dump_anything(obj): 16 | if is_any_node(obj): 17 | result = debug_dump_node(obj) 18 | elif is_boolean(obj) or is_number(obj) or is_hash(obj) or is_array(obj): 19 | result = repr(obj) 20 | elif is_string(obj): 21 | result = u'string("{0}")'.format(obj) 22 | elif is_node_set(obj): 23 | result = u'node-set({0})'.format(', '.join(truncate_string(debug_dump_node(node), 20) for node in obj)) 24 | elif is_sequence(obj): 25 | result = u'sequence({0})'.format(', '.join(truncate_string(debug_dump_anything(item), 20) for item in obj)) 26 | else: 27 | raise RuntimeError("debug_dump_anything doesn't know how to handle {0}".format(obj.__class__.__name__)) 28 | return debug_dump_long_string(result) 29 | 30 | 31 | def is_array(obj): 32 | return obj.__class__.__name__ == 'JsonArray' 33 | 34 | 35 | def is_boolean(obj): 36 | return obj.__class__.__name__ == 'boolean' 37 | 38 | 39 | def is_hash(obj): 40 | return obj.__class__.__name__ == 'JsonHash' 41 | 42 | 43 | def is_node_set(obj): 44 | return isinstance(obj, list) and all(is_any_node(x) for x in obj) 45 | 46 | 47 | def is_number(obj): 48 | return obj.__class__.__name__ == 'number' 49 | 50 | 
51 | def is_sequence(obj): 52 | return isinstance(obj, list) 53 | 54 | 55 | def is_string(obj): 56 | return is_a_string(obj) 57 | 58 | 59 | def normalize_content(value): 60 | return re.sub(r'\s+', ' ', string_value(value)) 61 | 62 | 63 | def object_type(obj): 64 | if is_boolean(obj): 65 | return BOOLEAN 66 | elif is_node_set(obj): 67 | return SEQUENCE 68 | elif is_sequence(obj): 69 | return SEQUENCE 70 | elif is_number(obj): 71 | return NUMBER 72 | elif is_string(obj): 73 | return STRING 74 | else: 75 | verbose_print('UH-OH! Returning None from object_type({0})'.format(obj.__class__.__name__)) 76 | return None 77 | 78 | 79 | def object_type_name(obj): 80 | result = 'NULL OR UNKNOWN TYPE' 81 | 82 | if obj is not None: 83 | if isinstance(obj, int): 84 | index = obj 85 | else: 86 | index = object_type(obj) 87 | result = TYPE_NAMES[index] 88 | 89 | return result 90 | 91 | 92 | def string_value(obj): 93 | if is_tag_node(obj) or is_attribute_node(obj) or is_text_node(obj): 94 | return derive_text_from_node(obj, peek_context().preserve_space) 95 | elif is_number(obj) or is_boolean(obj): 96 | return str(obj) 97 | elif is_node_set(obj): 98 | return string_value(obj[0]) if len(obj) > 0 else '' 99 | elif is_sequence(obj): 100 | return ''.join(string_value(item) for item in obj) 101 | elif is_string(obj): 102 | return obj 103 | else: 104 | raise NotImplementedError('string_value not implemented for type "{0}"'.format(obj.__class__.__name__)) 105 | -------------------------------------------------------------------------------- /hq/hquery/equality_operators.py: -------------------------------------------------------------------------------- 1 | from hq.verbosity import verbose_print 2 | from hq.hquery.functions.core_boolean import boolean 3 | from hq.hquery.functions.core_number import number 4 | from hq.hquery.object_type import object_type, string_value, object_type_name 5 | from hq.hquery.evaluation_error import HqueryEvaluationError 6 | 7 | 8 | def 
_eq_bool_vs_primitive(bool_val, other_val): 9 | verbose_print('Comparing boolean value {0} with non-node-set value {1} (coerced to {2})'.format(bool_val, other_val, boolean(other_val))) 10 | return bool_val == boolean(other_val) 11 | 12 | 13 | def _eq_native(first, second): 14 | return first == second 15 | 16 | 17 | def _eq_node_sets(first, second): 18 | first_values = set([string_value(node) for node in first]) 19 | second_values = set([string_value(node) for node in second]) 20 | 21 | verbose_print('Comparing two nodes sets (size {0} and {1}).'.format(len(first_values), len(second_values))) 22 | 23 | for first_value in first_values: 24 | if first_value in second_values: 25 | verbose_print(u'Found value "{0}" from first node set in second node set'.format(first_value)) 26 | return True 27 | 28 | verbose_print('Found no matching nodes between node sets.') 29 | return False 30 | 31 | 32 | def _eq_node_set_vs_bool(bool_val, nodes_val): 33 | return bool_val == boolean(nodes_val) 34 | 35 | 36 | def _eq_node_set_vs_number(nodes_val, num_val): 37 | verbose_print('(=) comparing number {0} to {1} nodes'.format(num_val, len(nodes_val))) 38 | 39 | for node in nodes_val: 40 | node_str_val = string_value(node) 41 | node_num_val = number(node_str_val) 42 | verbose_print('(=) node string value "{0}" is{1} equal to "{2}"'.format( 43 | node_num_val, 44 | (' not' if node_num_val == num_val else ''), 45 | num_val)) 46 | 47 | if node_num_val == num_val: 48 | return True 49 | 50 | return False 51 | 52 | 53 | def _eq_node_set_vs_string(nodes_val, string_val): 54 | string_val = str(string_val) 55 | verbose_print(u'(=) comparing number "{0}" to {1} nodes'.format(string_val, len(nodes_val))) 56 | 57 | for node in nodes_val: 58 | node_val_string = string_value(node) 59 | verbose_print(u'(=) node string value "{0}" is{1} equal to "{2}"'.format( 60 | node_val_string, 61 | ('' if node_val_string == string_val else ' not'), 62 | string_val)) 63 | 64 | if node_val_string == string_val: 65 | 
return True 66 | 67 | return False 68 | 69 | 70 | def _eq_num_vs_string(num_val, string_val): 71 | return num_val == number(string_val) 72 | 73 | 74 | equality_ops_table = ( 75 | # BOOLEAN, SEQUENCE, NUMBER, STRING 76 | (_eq_native, _eq_node_set_vs_bool, _eq_bool_vs_primitive, _eq_bool_vs_primitive), # BOOLEAN 77 | (None, _eq_node_sets, _eq_node_set_vs_number, _eq_node_set_vs_string), # SEQUENCE 78 | (None, None, _eq_native, _eq_num_vs_string), # NUMBER 79 | (None, None, None, _eq_native), # STRING 80 | ) 81 | 82 | 83 | def equals(first, second): 84 | first_type = object_type(first) 85 | second_type = object_type(second) 86 | try: 87 | reverse = first_type > second_type 88 | op = equality_ops_table[first_type if not reverse else second_type][second_type if not reverse else first_type] 89 | return boolean(op(first if not reverse else second, second if not reverse else first)) 90 | except TypeError: 91 | msg = 'type mismatch comparing {0} and {1} for equality' 92 | raise HqueryEvaluationError(msg.format(object_type_name(first_type), object_type_name(second_type))) 93 | 94 | 95 | def not_equals(first, second): 96 | return boolean(not bool(equals(first, second))) 97 | -------------------------------------------------------------------------------- /hq/hquery/functions/core_number.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | from hq.hquery.evaluation_error import HqueryEvaluationError 4 | from hq.soup_util import is_any_node 5 | from hq.hquery.object_type import is_number, is_node_set, string_value, is_boolean 6 | from hq.hquery.sequences import make_sequence 7 | 8 | exports = ('ceiling', 'floor', 'number', 'round_', 'sum') 9 | 10 | 11 | class number: 12 | 13 | def __init__(self, obj): 14 | if isinstance(obj, number): 15 | self.value = obj.value 16 | elif is_boolean(obj): 17 | self.value = 1 if obj else 0 18 | elif is_node_set(obj) or is_any_node(obj): 19 | self.value = 
self._int_or_float(float(string_value(obj))) 20 | else: 21 | try: 22 | self.value = self._int_or_float(float(obj)) 23 | except ValueError: 24 | self.value = float('nan') 25 | 26 | def __float__(self): 27 | return float(self.value) 28 | 29 | def __int__(self): 30 | return int(self.value) 31 | 32 | def __str__(self): 33 | result = str(self.value) 34 | if result == 'nan': 35 | result = 'NaN' 36 | return result 37 | 38 | def __hash__(self): 39 | return self.value.__hash__() 40 | 41 | def __add__(self, other): 42 | return number(self.value + self._value_of_other_operand(other)) 43 | 44 | def __sub__(self, other): 45 | return number(self.value - self._value_of_other_operand(other)) 46 | 47 | def __neg__(self): 48 | return number(-self.value) 49 | 50 | def __mul__(self, other): 51 | return number(self.value * self._value_of_other_operand(other)) 52 | 53 | def __div__(self, other): 54 | return self.__truediv__(other) 55 | 56 | def __truediv__(self, other): 57 | other = self._value_of_other_operand(other) 58 | if other == 0: 59 | return number(float('nan')) 60 | else: 61 | return number(self.value / other) 62 | 63 | def __mod__(self, other): 64 | return number(self.value % self._value_of_other_operand(other)) 65 | 66 | def __eq__(self, other): 67 | return self.value == self._value_of_other_operand(other) 68 | 69 | def __ge__(self, other): 70 | return self.value >= self._value_of_other_operand(other) 71 | 72 | def __gt__(self, other): 73 | return self.value > self._value_of_other_operand(other) 74 | 75 | def __le__(self, other): 76 | return self.value <= self._value_of_other_operand(other) 77 | 78 | def __lt__(self, other): 79 | return self.value < self._value_of_other_operand(other) 80 | 81 | def __repr__(self): 82 | return 'number({0})'.format(str(self.value)) 83 | 84 | @staticmethod 85 | def _int_or_float(numeric_value): 86 | if isinstance(numeric_value, int) or numeric_value % 1 != 0: 87 | return numeric_value 88 | else: 89 | return int(numeric_value) 90 | 91 | 
@staticmethod 92 | def _value_of_other_operand(other): 93 | return other.value if is_number(other) else other 94 | 95 | 96 | def ceiling(value): 97 | return number(math.ceil(value.value)) 98 | 99 | 100 | def floor(value): 101 | return number(math.floor(value.value)) 102 | 103 | 104 | def round_(*args): 105 | if len(args) == 0: 106 | raise HqueryEvaluationError('round() function requires at least one argument') 107 | value = args[0] 108 | if math.isnan(value.value): 109 | return value 110 | else: 111 | return number(round(value.value, 0 if len(args) < 2 else args[1].value)) 112 | 113 | 114 | def sum(*args): 115 | if len(args) >= 1: 116 | sequence = make_sequence(args[0]) 117 | else: 118 | sequence = make_sequence([]) 119 | if len(args) >= 2: 120 | zero = args[1] 121 | else: 122 | zero = number(0) 123 | 124 | if len(sequence) == 0: 125 | return zero 126 | else: 127 | result = number(0) 128 | for item in sequence: 129 | result += number(item) 130 | return result 131 | -------------------------------------------------------------------------------- /hq/hquery/relational_operators.py: -------------------------------------------------------------------------------- 1 | from operator import gt, lt, ge, le 2 | 3 | from hq.hquery.functions.core_string import string 4 | from hq.verbosity import verbose_print 5 | from hq.hquery.functions.core_boolean import boolean 6 | from hq.hquery.functions.core_number import number 7 | from hq.hquery.object_type import object_type, is_boolean, is_number 8 | from hq.hquery.syntax_error import HquerySyntaxError 9 | 10 | 11 | class RelationalOperator: 12 | 13 | def __init__(self, op): 14 | if op == '>': 15 | self.base_op = gt 16 | elif op == '>=': 17 | self.base_op = ge 18 | elif op == '<': 19 | self.base_op = lt 20 | elif op == '<=': 21 | self.base_op = le 22 | else: 23 | raise HquerySyntaxError('unexpected relational operator "{0}"'.format(op)) 24 | 25 | 26 | def evaluate(self, first, second): 27 | first_type = object_type(first) 28 | 
second_type = object_type(second) 29 | cmp = comparison_method_table[first_type][second_type] 30 | return boolean(cmp(self.base_op, first, second)) 31 | 32 | 33 | @property 34 | def name(self): 35 | return self.base_op.__name__ 36 | 37 | 38 | 39 | def _cmp_node_sets(base_op, first, second): 40 | first_values = set([number(node) for node in first]) 41 | second_values = set([number(node) for node in second]) 42 | 43 | verbose_print('Comparing two nodes sets (size {0} and {1}).'.format(len(first_values), len(second_values))) 44 | 45 | for first_value in first_values: 46 | for second_value in second_values: 47 | if base_op(first_value, second_value): 48 | msg = 'Comparison succeeded for "{0}" from first node set and "{1}" in second node set' 49 | verbose_print(msg.format(first_value, second_value)) 50 | return True 51 | 52 | verbose_print('Comparison failed for all nodes in both node sets.') 53 | return False 54 | 55 | 56 | def _cmp_nodes_to_value(base_op, first, second): 57 | node_values = set([number(node) for node in first]) 58 | second = number(second) 59 | verbose_print('Comparing {0} nodes in node set to value {1}'.format(len(node_values), second)) 60 | 61 | for node_value in node_values: 62 | if base_op(node_value, second): 63 | verbose_print('Comparison succeeded for node value "{0}" and value "{1}"'.format(node_value, second)) 64 | return True 65 | 66 | verbose_print('Comparison failed for all nodes in the node set.') 67 | return False 68 | 69 | 70 | def _cmp_value_to_nodes(base_op, first, second): 71 | node_values = set([number(node) for node in second]) 72 | first = number(first) 73 | verbose_print('Comparing {0} nodes in node set to value "{1}"'.format(len(node_values), first)) 74 | 75 | for node_value in node_values: 76 | if base_op(first, node_value): 77 | verbose_print('Comparison succeeded for value "{0}" and node value "{1}'.format(first, node_value)) 78 | return True 79 | 80 | verbose_print('Comparison failed for all nodes in the node set.') 81 | 
return False 82 | 83 | 84 | def _cmp_values(base_op, first, second): 85 | if is_boolean(first) or is_boolean(second): 86 | return base_op(1 if boolean(first) else 0, 1 if boolean(second) else 0) 87 | elif is_number(first) or is_number(second): 88 | return base_op(number(first), number(second)) 89 | else: 90 | return base_op(string(first), string(second)) 91 | 92 | 93 | comparison_method_table = ( 94 | # BOOLEAN, SEQUENCE, NUMBER, STRING 95 | (_cmp_values, _cmp_value_to_nodes, _cmp_values, _cmp_values), # BOOLEAN 96 | (_cmp_nodes_to_value, _cmp_node_sets, _cmp_nodes_to_value, _cmp_nodes_to_value), # SEQUENCE 97 | (_cmp_values, _cmp_value_to_nodes, _cmp_values, _cmp_values), # NUMBER 98 | (_cmp_values, _cmp_value_to_nodes, _cmp_values, _cmp_values), # STRING 99 | ) 100 | -------------------------------------------------------------------------------- /test/test_cli.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | try: 4 | from mock import mock_open 5 | except ImportError: 6 | from unittest.mock import mock_open 7 | 8 | from hq.hq import main 9 | from test.common_test_util import simulate_args_dict, wrap_html_body, capture_console_output 10 | 11 | 12 | def test_preserve_space_flag_turns_off_space_normalization(capsys, mocker): 13 | hquery = '`${//p}`' 14 | content_with_spaces = ' PyCharm rocks! ' 15 | mocker.patch('sys.stdin.read').return_value = wrap_html_body('

{0}

'.format(content_with_spaces)) 16 | 17 | mocker.patch('hq.hq.docopt').return_value = simulate_args_dict(expression=hquery, preserve='s') 18 | main() 19 | actual, _ = capture_console_output(capsys, strip=False) 20 | assert actual == content_with_spaces 21 | 22 | mocker.patch('hq.hq.docopt').return_value = simulate_args_dict(expression=hquery, preserve='') 23 | main() 24 | actual, _ = capture_console_output(capsys, strip=False) 25 | assert actual == 'PyCharm rocks!' 26 | 27 | 28 | def test_preserve_space_flag_causes_non_breaking_spaces_to_be_how_shall_we_say_preserved(capsys, mocker): 29 | mocker.patch('sys.stdin.read').return_value = wrap_html_body(u'

non\u00a0breaking spaces?

') 30 | 31 | mocker.patch('hq.hq.docopt').return_value = simulate_args_dict(expression='//p/text()', preserve='s') 32 | main() 33 | actual, _ = capture_console_output(capsys) 34 | assert actual == u'non\u00a0breaking\u00a0spaces?' 35 | 36 | mocker.patch('hq.hq.docopt').return_value = simulate_args_dict(expression='//p/text()', preserve='') 37 | main() 38 | actual, _ = capture_console_output(capsys) 39 | assert actual == u'non breaking spaces?' 40 | 41 | 42 | def test_ugly_flag_preserves_markup_formatting(capsys, mocker): 43 | expected = '

I, too, enjoy PyCharm.

' 44 | mocker.patch('hq.hq.docopt').return_value = simulate_args_dict(expression='//p', ugly=True) 45 | mocker.patch('sys.stdin.read').return_value = wrap_html_body(expected) 46 | 47 | main() 48 | 49 | actual, _ = capture_console_output(capsys, strip=False) 50 | assert actual == expected 51 | 52 | 53 | def test_syntax_error_prints_proper_error_message(capsys, mocker): 54 | mocker.patch('hq.hq.docopt').return_value = simulate_args_dict(expression='child:://') 55 | mocker.patch('sys.stdin.read').return_value = wrap_html_body('') 56 | 57 | main() 58 | 59 | _, actual = capture_console_output(capsys) 60 | assert re.match(r'^syntax error.+expected.+name.+got.+slash', actual.lower()) 61 | 62 | 63 | def test_query_error_prints_proper_error_message(capsys, mocker): 64 | mocker.patch('hq.hq.docopt').return_value = simulate_args_dict(expression='no-such-function()') 65 | mocker.patch('sys.stdin.read').return_value = wrap_html_body('') 66 | 67 | main() 68 | 69 | _, actual = capture_console_output(capsys) 70 | assert re.match(r'^query error.+unknown function.+no-such-function', actual.lower()) 71 | 72 | 73 | def test_reading_input_from_a_file_instead_of_stdin(capsys, mocker): 74 | expected_filename = 'filename.html' 75 | mocked_open = mock_open(read_data=wrap_html_body('

foo

')) 76 | mocker.patch('hq.hq.docopt').return_value = simulate_args_dict( 77 | expression='//p/text()', file=expected_filename) 78 | mocker.patch('hq.hq.open', mocked_open, create=True) 79 | 80 | main() 81 | 82 | actual, _ = capture_console_output(capsys) 83 | mocked_open.assert_called_with(expected_filename) 84 | assert actual == 'foo' 85 | 86 | 87 | def test_program_flag_reads_hquery_program_from_file(capsys, mocker): 88 | expected_filename = 'filename.hq' 89 | mocked_open = mock_open(read_data=''' 90 | //p 91 | -> 92 | $_/text()''') 93 | mocker.patch('hq.hq.docopt').return_value = simulate_args_dict( 94 | program=expected_filename) 95 | mocker.patch('sys.stdin.read').return_value = wrap_html_body('

foo

') 96 | mocker.patch('hq.hq.open', mocked_open, create=True) 97 | 98 | main() 99 | 100 | actual, _ = capture_console_output(capsys) 101 | mocked_open.assert_called_with(expected_filename) 102 | assert actual == 'foo' 103 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # hq 2 | **Powerful HTML slicing and dicing at the command line.** 3 | 4 | [![Build Status](https://travis-ci.org/rbwinslow/hq.svg?branch=master)](https://travis-ci.org/rbwinslow/hq) [![Coverage Status](https://coveralls.io/repos/github/rbwinslow/hq/badge.svg?branch=master)](https://coveralls.io/github/rbwinslow/hq?branch=master) 5 | 6 | `hq` is a Python-based command-line tool for querying HTML, manipulating data and producing results as HTML, JSON or any other format. It's based on a compact, flexible expression language that started out as an XPath implementation but ended up going a few different places, so I'm going ahead and calling it HQuery. 7 | 8 | HQuery is 99% compliant with the [XPath 1.0](https://www.w3.org/TR/xpath/) standard, minus some features not applicable to HTML. That's nice for querying, but you need more power to take control of the shape and format of the data you produce. To that end, HQuery also includes... 9 | 10 | * **Nuggets of XQuery** — only a few of the good parts! Just enough for iteration, branching and the like. 11 | * **XPath expansions for HTML** — including a `class::` axis and `class()` function, plus abbreviated axes to keep things terse. 12 | * **Super-charged string interpolation** — with powerful filters that you can chain together to transform data as you produce it. 13 | * **Computed constructors for HTML and JSON** — so you can programmatically assemble and output new HTML or JSON objects and arrays. 
14 | * **Out-of-left-field union decomposition** — enabling amazingly terse and powerful mappings from clauses in a union to different expressions. 15 | 16 | ## Installing `hq` 17 | 18 | pip install hq 19 | 20 | ## Running `hq` 21 | 22 | cat /path/to/file.html | hq '`Hello, ${/html/head/title}!`' 23 | 24 | ...or... 25 | 26 | hq -f /path/to/file.html '`Hello, ${/html/head/title}!`' 27 | 28 | To print usage information: 29 | 30 | hq --help 31 | 32 | ## Running `hq` in a container 33 | 34 | There's a Docker image [project](https://github.com/frioux/hq.dkr) available that makes it super-easy to try out `hq` without installing any software (aside from Docker): 35 | 36 | cat /path/to/file.html | docker run -i frew/hq '//some/hquery' 37 | 38 | Thanks, Frew! 39 | 40 | ## Learning `hq` 41 | 42 | The [wiki](https://github.com/rbwinslow/hq/wiki) discusses the [motivations](https://github.com/rbwinslow/hq/wiki/Why-HQuery%3F) guiding the HQuery language's design and provides a [language reference](https://github.com/rbwinslow/hq/wiki/Language-Reference). 43 | 44 | ## Contributing to `hq` 45 | 46 | `hq` is tested against Pythons 3.5 through 3.9. The file structure and `setup.py` script for the project are based on [this blog post](https://gehrcke.de/2014/02/distributing-a-python-command-line-application/). 47 | 48 | `hq`'s dependencies are split into a "base" file, the subset needed to run the application, and a "dev" file providing the tools necessary to run tests and the like. To do development: 49 | 50 | pip install -r requirements/dev.txt 51 | 52 | The parsing logic in `hquery_processor.py` is based on the [top-down operator precedence](https://www.crockford.com/javascript/tdop/tdop.html) approach. 
53 | 54 | ### Running Tests 55 | 56 | py.test 57 | 58 | The "dev.txt" dependencies also include [pytest-cov](https://pypi.python.org/pypi/pytest-cov), so you can generate a nice coverage report (which you'll find in the `htmlcov` directory): 59 | 60 | py.test --cov=hq --cov-report html 61 | 62 | If you want to turn verbosity on to figure out what's going on in a test, you need the `--gabby` flag (since `py.test` owns its own `-v` flag). You'll probably also want to run just one test at a time, because `--gabby` is way gabby: 63 | 64 | py.test --gabby -vv -k some_particular_test_function 65 | 66 | ### Uploading to PyPI 67 | 68 | This and other aspects of project setup, including running the CLI locally and using setup.py, are covered in the blog post linked above. I'm copying the PyPI upload stuff here for my own convenience, but I ask, of course, that you please submit pull requests rather than uploading to PyPI yourself: 69 | 70 | $ python setup.py sdist 71 | $ ls dist 72 | hq-0.0.4.tar.gz 73 | 74 | $ pip install twine 75 | $ twine upload dist/hq-0.0.4.tar.gz 76 | Uploading distributions to https://pypi.python.org/pypi 77 | Uploading hq-0.0.4.tar.gz 78 | Finished 79 | -------------------------------------------------------------------------------- /hq/hquery/flwor.py: -------------------------------------------------------------------------------- 1 | from hq.hquery.object_type import debug_dump_anything 2 | from hq.hquery.sequences import make_sequence, sequence_concat 3 | from hq.hquery.syntax_error import HquerySyntaxError 4 | from hq.hquery.variables import push_variable, variable_scope 5 | from hq.soup_util import debug_dump_long_string 6 | from hq.verbosity import verbose_print 7 | 8 | 9 | class Flwor: 10 | 11 | def __init__(self): 12 | self.global_variables = [] 13 | self.per_iteration_variables = [] 14 | self.return_expression = None 15 | self.sequence_expression = None 16 | self.sequence_variable = None 17 | 18 | 19 | def __str__(self): 20 | return 
'{0}{1}return '.format( 21 | '' if self.sequence_expression is None else 'for ${0}:= '.format(self.sequence_variable), 22 | (' '.join('let ${0} := '.format(v[0]) for v in self.per_iteration_variables) + ' ') if len(self.per_iteration_variables) else '' 23 | ) 24 | 25 | 26 | def append_let(self, variable_name, expression_fn): 27 | var_tuple = (variable_name, expression_fn) 28 | if self.sequence_expression is None: 29 | self.global_variables.append(var_tuple) 30 | else: 31 | self.per_iteration_variables.append(var_tuple) 32 | 33 | 34 | def debug_dump(self): 35 | return debug_dump_long_string(str(self)) 36 | 37 | 38 | def evaluate(self): 39 | verbose_print('Evaluating FLWOR {0}'.format(self), indent_after=True) 40 | 41 | if self.sequence_expression is not None: 42 | result = self._evaluate_iteration() 43 | else: 44 | result = self._evaluate_without_iteration() 45 | 46 | verbose_print(lambda: 'FLWOR evaluation completed; returning {0}'.format(debug_dump_anything(result)), 47 | outdent_before=True) 48 | return result 49 | 50 | 51 | def set_iteration_expression(self, variable_name, expression_fn): 52 | if self.sequence_expression is not None: 53 | raise HquerySyntaxError('More than one "for" clause found in FLWOR "{0}"'.format(self.debug_dump())) 54 | self.sequence_variable = variable_name 55 | self.sequence_expression = expression_fn 56 | 57 | 58 | def set_return_expression(self, expression_fn): 59 | if self.return_expression is not None: 60 | raise HquerySyntaxError('More than one return clause found for FLWOR {0}'.format(self.debug_dump())) 61 | self.return_expression = expression_fn 62 | 63 | 64 | def _evaluate_iteration(self): 65 | with variable_scope(): 66 | self._push_global_variables() 67 | 68 | sequence = make_sequence(self.sequence_expression()) 69 | verbose_print('Iterating over sequence containing {0} items'.format(len(sequence))) 70 | result = [] 71 | 72 | for item in sequence: 73 | verbose_print(lambda: u'Visiting item 
{0}'.format(debug_dump_anything(item)), indent_after=True) 74 | 75 | with variable_scope(): 76 | push_variable(self.sequence_variable, make_sequence(item)) 77 | self._push_iteration_variables() 78 | this_result = make_sequence(self.return_expression()) 79 | verbose_print('Return clause yielded {0} results for this visit'.format(len(this_result))) 80 | result = sequence_concat(result, this_result) 81 | 82 | verbose_print('Visit finished', outdent_before=True) 83 | 84 | return result 85 | 86 | 87 | def _evaluate_without_iteration(self): 88 | with variable_scope(): 89 | self._push_global_variables() 90 | verbose_print('Evaluating return expression.', indent_after=True) 91 | result = self.return_expression() 92 | verbose_print('Return expression produced {0}'.format(str(result)), outdent_before=True) 93 | return result 94 | 95 | 96 | def _push_global_variables(self): 97 | for let in self.global_variables: 98 | verbose_print('Evaluating let {0} := '.format(let[0])) 99 | push_variable(let[0], let[1]()) 100 | 101 | 102 | def _push_iteration_variables(self): 103 | for let in self.per_iteration_variables: 104 | verbose_print('Evaluating let {0} := '.format(let[0])) 105 | push_variable(let[0], let[1]()) 106 | -------------------------------------------------------------------------------- /hq/hquery/node_test.py: -------------------------------------------------------------------------------- 1 | from hq.hquery.axis import Axis 2 | 3 | from ..soup_util import is_root_node, is_tag_node, is_text_node, AttributeNode, is_attribute_node, is_any_node, root_tag_from_soup, \ 4 | is_comment_node 5 | 6 | 7 | def _accept_principal_node_type(node, axis=None): 8 | return is_attribute_node(node) if axis == Axis.attribute else is_tag_node(node) 9 | 10 | 11 | def _make_axis_agnostic_accept_fn(fn): 12 | def evaluate(node, axis=None): 13 | return fn(node) 14 | return evaluate 15 | 16 | 17 | def _make_name_accept_fn(value): 18 | def evaluate(node, axis=None): 19 | if axis == Axis.css_class: 20 
| return is_tag_node(node) and 'class' in node.attrs and value in node['class'] 21 | else: 22 | type_fn = is_attribute_node if axis == Axis.attribute else is_tag_node 23 | return type_fn(node) and node.name.lower() == value 24 | return evaluate 25 | 26 | 27 | class NodeTest: 28 | 29 | def __init__(self, value, name_test=False): 30 | value = value.lower() 31 | self.repr = value 32 | self.is_name_test = name_test 33 | 34 | if name_test: 35 | self.accept_fn = _make_name_accept_fn(value) 36 | elif value == '*': 37 | self.accept_fn = _accept_principal_node_type 38 | elif value == 'node': 39 | self.accept_fn = _make_axis_agnostic_accept_fn(is_any_node) 40 | elif value == 'text': 41 | self.accept_fn = _make_axis_agnostic_accept_fn(is_text_node) 42 | elif value == 'comment': 43 | self.accept_fn = _make_axis_agnostic_accept_fn(is_comment_node) 44 | 45 | self.repr = '{0}{1}'.format(self.repr, '' if name_test or value == '*' else '()') 46 | 47 | 48 | def __repr__(self): 49 | return self.repr 50 | 51 | 52 | def apply(self, axis, node): 53 | nodes = getattr(self, 'gather_{0}'.format(axis.name))(node) 54 | return [node for node in nodes if self.accept_fn(node, axis=axis)] 55 | 56 | 57 | def gather_ancestor(self, node): 58 | if hasattr(node, 'parents'): 59 | return list(node.parents) 60 | else: 61 | return [] 62 | 63 | 64 | def gather_ancestor_or_self(self, node): 65 | result = self.gather_self(node) 66 | result.extend(self.gather_ancestor(node)) 67 | return result 68 | 69 | 70 | def gather_attribute(self, node): 71 | return list(AttributeNode.enumerate(node)) 72 | 73 | 74 | def gather_child(self, node): 75 | if is_root_node(node): 76 | return [root_tag_from_soup(node)] 77 | elif is_tag_node(node): 78 | return node.contents 79 | else: 80 | return [] 81 | 82 | 83 | def gather_css_class(self, node): 84 | return self.gather_child(node) 85 | 86 | 87 | def gather_descendant(self, node): 88 | if hasattr(node, 'descendants'): 89 | return list(node.descendants) 90 | else: 91 | return [] 
92 | 93 | 94 | def gather_descendant_or_self(self, node): 95 | result = self.gather_self(node) 96 | result.extend(self.gather_descendant(node)) 97 | return result 98 | 99 | 100 | def gather_following(self, node): 101 | result = [] 102 | while is_tag_node(node): 103 | for sibling in node.next_siblings: 104 | result.append(sibling) 105 | result.extend(self.gather_descendant(sibling)) 106 | node = node.parent 107 | return result 108 | 109 | 110 | def gather_following_sibling(self, node): 111 | if hasattr(node, 'next_siblings'): 112 | return list(node.next_siblings) 113 | else: 114 | return [] 115 | 116 | 117 | def gather_parent(self, node): 118 | if hasattr(node, 'parent') and node.parent is not None: 119 | return [node.parent] 120 | else: 121 | return [] 122 | 123 | 124 | def gather_preceding(self, node): 125 | result = [] 126 | while is_tag_node(node): 127 | for sibling in node.previous_siblings: 128 | result.append(sibling) 129 | result.extend(self.gather_descendant(sibling)) 130 | node = node.parent 131 | return result 132 | 133 | 134 | def gather_preceding_sibling(self, node): 135 | if hasattr(node, 'previous_siblings'): 136 | return list(node.previous_siblings) 137 | else: 138 | return [] 139 | 140 | 141 | def gather_self(self, node): 142 | return [node] 143 | -------------------------------------------------------------------------------- /test/hquery/test_extended_functions.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from test.common_test_util import expected_result 4 | from test.hquery.hquery_test_util import query_html_doc 5 | 6 | 7 | def test_class_function_returns_true_when_element_has_name_in_class_attribute(): 8 | html_body = """ 9 |

not selected

10 |

expected

""" 11 | 12 | assert query_html_doc(html_body, 'class(//p[1], "foo")') == 'false' 13 | assert query_html_doc(html_body, 'class(//p[2], "foo")') == 'true' 14 | assert query_html_doc(html_body, '//p[class("bar")]/text()') == 'expected' 15 | 16 | 17 | def test_even_and_odd_functions_select_the_appropriate_elements_based_on_position(): 18 | html_body = """ 19 |

You

20 |

I

21 |

are

22 |

am

23 |

odd.

24 |

even.

""" 25 | 26 | assert query_html_doc(html_body, '//p[even()]/text()') == expected_result(""" 27 | I 28 | am 29 | even.""") 30 | assert query_html_doc(html_body, '//p[odd()]/text()') == expected_result(""" 31 | You 32 | are 33 | odd.""") 34 | 35 | 36 | def test_lower_case_and_upper_case_change_string_case_as_expected(): 37 | assert query_html_doc('', 'lower-case("Foo BAR")') == 'foo bar' 38 | assert query_html_doc('', 'upper-case("fOO bar")') == 'FOO BAR' 39 | 40 | 41 | def test_matches_function_performs_regex_matching_as_per_xpath_30_functions_spec(): 42 | html_body = """ 43 |

moe

44 |

larry

45 |

curly

""" 46 | 47 | assert query_html_doc(html_body, '//p[matches(text(), "^l[ary]+")]/text()') == expected_result('larry') 48 | assert query_html_doc(html_body, '//p[matches(text(), ".URL.", "i")]/text()') == expected_result('curly') 49 | 50 | 51 | def test_matches_function_supports_a_subset_of_xpath_30_flag_values(): 52 | html_body = """ 53 |

first

54 |

second one

55 |

56 | multiple 57 | lines 58 | of 59 | text 60 |

""" 61 | multiline_pattern = r'.+multiple.+text.+' 62 | 63 | assert query_html_doc(html_body, r'//p[matches(text(), "\w+RST", "i")]/text()') == expected_result('first') 64 | assert query_html_doc(html_body, r'//p[matches(text(), ".+lines.+text")]', preserve_space=True) == '' 65 | assert re.match( 66 | multiline_pattern, 67 | query_html_doc(html_body, r'//p[matches(text(), ".+lines.+text", "s")]', preserve_space=True), 68 | re.S 69 | ) 70 | assert query_html_doc(html_body, r'//p[matches(text(), "^ *lines$")]', preserve_space=True) == '' 71 | assert re.match( 72 | multiline_pattern, 73 | query_html_doc(html_body, r'//p[matches(text(), "^\s*lines$", "m")]', preserve_space=True), 74 | re.S 75 | ) 76 | assert query_html_doc(html_body, r'//p[matches(text(), "sec ond\sone")]/text()') == '' 77 | assert query_html_doc(html_body, r'//p[matches(text(), "sec ond\sone", "x")]/text()') == 'second one' 78 | 79 | 80 | def test_matches_function_extends_to_using_context_node_when_passed_no_input_string(): 81 | html_body = """ 82 |

bar

83 |

foo

""" 84 | 85 | assert query_html_doc(html_body, '//p[matches("^f.+")]/text()') == expected_result('foo') 86 | 87 | 88 | def test_replace_function_performs_regex_replacement_as_per_xpath_30_functions_spec(): 89 | assert query_html_doc('', 'replace("dog mattress dog", "^dog", "cat")') == 'cat mattress dog' 90 | 91 | 92 | def test_replace_function_extends_standard_by_taking_string_value_of_any_type_of_input_object(): 93 | assert query_html_doc('

hello

', 'replace(//p, "h", "j")') == 'jello' 94 | 95 | 96 | def test_string_join_function_accepts_sequence_as_first_parameter_and_delimiter_as_second(): 97 | assert query_html_doc('', 'string-join(1 to 3, ", ")') == '1, 2, 3' 98 | 99 | 100 | def test_string_join_second_argument_is_optional(): 101 | assert query_html_doc('', 'string-join(1 to 2)') == '12' 102 | 103 | 104 | def test_tokenize_function_breaks_up_strings_as_per_xpath_30_functions_spec(): 105 | assert query_html_doc('', 'tokenize("Moe:Larry:..Curly", ":\.*")') == expected_result(""" 106 | Moe 107 | Larry 108 | Curly""") 109 | assert query_html_doc('', 'tokenize("HaxtaXpatience", "x", "i")') == expected_result(""" 110 | Ha 111 | ta 112 | patience""") 113 | assert query_html_doc('', 'count(tokenize("haxtaxstax", "x"))') == '4' 114 | 115 | 116 | def test_tokenize_function_extends_standard_by_supporting_any_object_as_input(): 117 | assert query_html_doc('

foo,bar

', 'tokenize(//p, ",")') == expected_result(""" 118 | foo 119 | bar""") 120 | -------------------------------------------------------------------------------- /test/hquery/test_location_paths.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | sys.path.insert(0, os.path.abspath('../..')) 5 | 6 | from hq.output import convert_results_to_output_text 7 | from hq.soup_util import make_soup 8 | from hq.hquery.hquery_processor import HqueryProcessor 9 | 10 | from ..common_test_util import expected_result 11 | from test.hquery.hquery_test_util import query_html_doc 12 | 13 | 14 | def test_absolute_location_path_should_find_multiple_grandchildren(): 15 | actual = query_html_doc('
one

not a div

two
', '/html/body/div') 16 | assert actual == expected_result(""" 17 |
18 | one 19 |
20 |
21 | two 22 |
""") 23 | 24 | 25 | def test_path_to_root_tag_succeeds_despite_other_root_level_objects(): 26 | html = """ 27 | 28 | 29 | 30 | 31 | """ 32 | raw_result = HqueryProcessor('/*').query(make_soup(html)) 33 | actual = convert_results_to_output_text(raw_result) 34 | assert actual == expected_result(""" 35 | 36 | 37 | """) 38 | 39 | 40 | def test_relative_location_path_as_predicate(): 41 | html_body = """ 42 |
43 | one 44 |
45 |
46 |

two

47 |
48 |
49 | three 50 |
""" 51 | actual = query_html_doc(html_body, '/html/body/div[span]') 52 | assert actual == expected_result(""" 53 |
54 | 55 | one 56 | 57 |
58 |
59 | 60 | three 61 | 62 |
""") 63 | 64 | 65 | def test_abbreviated_context_node_works_in_path(): 66 | html_body = """ 67 |
68 |

one

69 |
70 |

two

71 |
72 |

three

73 |
""" 74 | actual = query_html_doc(html_body, '/html/body/div/./p') 75 | assert actual == expected_result(""" 76 |

77 | one 78 |

79 |

80 | three 81 |

""") 82 | 83 | 84 | def test_abbreviated_context_node_works_in_predicate(): 85 | html_body = """ 86 |
87 |

one

88 |
89 |

two

90 |
91 | three 92 |
93 |
94 |

four

95 |
96 | """ 97 | actual = query_html_doc(html_body, '/html/body/node()[./p]') 98 | assert actual == expected_result(""" 99 |
100 |

101 | one 102 |

103 |
104 |
105 |

106 | four 107 |

108 |
""") 109 | 110 | 111 | def test_abbreviated_parent_node_works_in_path(): 112 | html_body = """ 113 |

114 | one 115 |

116 |

117 |
118 | two 119 |

""" 120 | actual = query_html_doc(html_body, '//p/br/../span') 121 | assert actual == expected_result(""" 122 | 123 | two 124 | """) 125 | 126 | 127 | def test_abbreviated_parent_node_works_in_predicate(): 128 | html_body = """ 129 |

130 |
131 | one 132 |

133 |

134 | two 135 |

136 |

137 |
138 | three 139 |

""" 140 | actual = query_html_doc(html_body, '//span[../br]') 141 | assert actual == expected_result(""" 142 | 143 | one 144 | 145 | 146 | three 147 | """) 148 | 149 | 150 | def test_double_slash_works_within_path(): 151 | html_body = """ 152 |
153 |

moe

154 |
155 |
156 |

larry

157 |
158 |

curly

159 |
160 |
161 |

joe besser

162 |
163 |

shemp

164 |
""" 165 | assert query_html_doc(html_body, '//section//p') == expected_result(""" 166 |

167 | moe 168 |

169 |

170 | larry 171 |

172 |

173 | curly 174 |

175 |

176 | shemp 177 |

""") 178 | 179 | 180 | def test_predicate_can_be_applied_to_variable_containing_node_set(): 181 | html_body = """ 182 |

not selected

183 |

selected

""" 184 | assert query_html_doc(html_body, 'let $x := //p return $x[@id="foo"]') == expected_result(""" 185 |

186 | selected 187 |

""") 188 | 189 | 190 | def test_no_space_between_text_runs_crossing_element_boundaries_in_element_string_value_if_there_was_none_in_doc(): 191 | html_body = """

"so-called" Klingon

""" 192 | assert query_html_doc(html_body, 'string(//p)') == '"so-called" Klingon' 193 | assert query_html_doc('

one two

', 'string(//p)') == 'one two' 194 | -------------------------------------------------------------------------------- /hq/hquery/string_interpolation.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from hq.hquery.functions.extend_string import _xpath_flags_to_re_flags, string_join 4 | from hq.hquery.object_type import string_value, is_sequence 5 | from hq.hquery.syntax_error import HquerySyntaxError 6 | from hq.soup_util import debug_dump_long_string 7 | from hq.string_util import truncate_string, html_entity_decode 8 | from hq.verbosity import verbose_print 9 | 10 | 11 | clauses_pattern = re.compile(r'(\$\{[^\}]+\})|(\$[a-zA-Z_]\w*)|((?:[^\$]+))') 12 | 13 | 14 | def _join_filter_link(arguments): 15 | if arguments is None or len(arguments) == 0: 16 | delimiter = '' 17 | else: 18 | delimiter = arguments[0] 19 | 20 | def construct(eval_fn): 21 | return lambda: string_join(eval_fn(), delimiter) 22 | 23 | return construct 24 | 25 | 26 | def _regex_replace_filter_link(arguments): 27 | if arguments is None or len(arguments) < 2: 28 | msg = 'interpolated string regex replace filter expects three arguments; got {0}' 29 | raise HquerySyntaxError(msg.format(arguments)) 30 | 31 | if len(arguments) == 3: 32 | flags = _xpath_flags_to_re_flags(arguments[2]) 33 | else: 34 | flags = 0 35 | 36 | def construct(eval_fn): 37 | def evaluate(): 38 | value = eval_fn() 39 | if is_sequence(value): 40 | return [re.sub(arguments[0], arguments[1], string_value(item), flags=flags) for item in value] 41 | else: 42 | return re.sub(arguments[0], arguments[1], string_value(value), flags=flags) 43 | return evaluate 44 | 45 | return construct 46 | 47 | 48 | def _truncate_filter_link(arguments): 49 | 50 | def construct(eval_fn): 51 | length = int(arguments[0]) 52 | if len(arguments) == 1: 53 | suffix = '' 54 | else: 55 | suffix = arguments[1] 56 | 57 | def evaluate(): 58 | value = eval_fn() 59 | if is_sequence(value): 60 | return 
[truncate_string(string_value(item), length, suffix=suffix) for item in value] 61 | else: 62 | return truncate_string(string_value(value), length, suffix=suffix) 63 | 64 | return evaluate 65 | 66 | return construct 67 | 68 | 69 | filters = { 70 | r'j:([^:]*):': _join_filter_link, 71 | r'rr:([^:]+):([^:]*):([i]*):': _regex_replace_filter_link, 72 | r'tru:(\d+):([^:]*):': _truncate_filter_link, 73 | } 74 | 75 | 76 | def reduce_filters_and_expression(remainder, parse_interface, chain=None): 77 | for pattern in filters: 78 | match = re.match(pattern, remainder) 79 | if match is not None: 80 | filter_constructor = filters[pattern]([html_entity_decode(arg) for arg in match.groups()]) 81 | remainder = remainder[match.span()[1]:] 82 | if chain is None: 83 | return reduce_filters_and_expression(remainder, parse_interface, filter_constructor) 84 | else: 85 | return reduce_filters_and_expression(remainder, 86 | parse_interface, 87 | lambda eval_fn: filter_constructor(chain(eval_fn))) 88 | 89 | eval_fn = parse_interface.parse_in_new_processor(remainder) 90 | if chain is None: 91 | return eval_fn 92 | else: 93 | return chain(eval_fn) 94 | 95 | 96 | def parse_interpolated_string(source, parse_interface): 97 | verbose_print(u'Parsing interpolated string contents `{0}`'.format(source), indent_after=True) 98 | 99 | expressions = [] 100 | for embedded_expr, embedded_var, literal in clauses_pattern.findall(source): 101 | if embedded_expr: 102 | verbose_print(u'Adding embedded expression: {0}'.format(embedded_expr)) 103 | expressions.append(reduce_filters_and_expression(embedded_expr[2:-1], parse_interface)) 104 | elif embedded_var: 105 | verbose_print('Adding embedded variable reference: {0}'.format(embedded_var)) 106 | expressions.append(parse_interface.parse_in_new_processor(embedded_var)) 107 | else: 108 | verbose_print(u'Adding literal string contents `{0}`'.format(literal)) 109 | expressions.append(_make_literal_identity_closure(literal)) 110 | 111 | def evaluate(): 112 | chunks 
= [string_value(exp()) for exp in expressions] 113 | verbose_print(u'Interpolated string evaluation assembling {0} chunks{1}.'.format( 114 | len(chunks), 115 | '' if len(chunks) == 0 else u' ("{0}")'.format(u'", "'.join(chunks))) 116 | ) 117 | return ''.join(chunks) 118 | 119 | verbose_print( 120 | u'Finished parsing interpolated string `{0}` ({1} chunk(s) found)'.format(debug_dump_long_string(source), 121 | len(expressions)), 122 | outdent_before=True 123 | ) 124 | return evaluate 125 | 126 | 127 | def _make_literal_identity_closure(value): 128 | return lambda: html_entity_decode(value) 129 | -------------------------------------------------------------------------------- /test/hquery/test_flwor.py: -------------------------------------------------------------------------------- 1 | from hq.hquery.syntax_error import HquerySyntaxError 2 | from pytest import raises 3 | from test.common_test_util import expected_result 4 | from test.hquery.hquery_test_util import query_html_doc 5 | 6 | 7 | def test_variable_declaration_and_reference_in_a_flwor(): 8 | expected = 'bar' 9 | assert query_html_doc('', 'let $foo := "{0}" return $foo'.format(expected)) == expected 10 | 11 | 12 | def test_variable_declarations_are_processed_in_order(): 13 | hquery = 'let $hello := "hello, " let $whole-phrase := concat($hello, "world!") return $whole-phrase' 14 | assert query_html_doc('', hquery) == 'hello, world!' 
15 | 16 | 17 | def test_variable_is_accessible_inside_interpolated_string(): 18 | assert query_html_doc('', 'let $foo := "bar" return `foo is $foo`') == 'foo is bar' 19 | assert query_html_doc('', 'let $foo := (1 to 3) return `${j:, :$foo}`') == '1, 2, 3' 20 | 21 | 22 | def test_multiple_return_clauses_are_not_allowed(): 23 | with raises(HquerySyntaxError): 24 | query_html_doc('', 'let $x := "whatever" return $x return "uh-oh"') 25 | 26 | 27 | def test_that_no_other_clauses_are_allowed_after_a_return(): 28 | with raises(HquerySyntaxError): 29 | query_html_doc('', 'let $x := "whatevs" return $x let $uh-oh := "oh no"') 30 | 31 | 32 | def test_iteration_using_for(): 33 | html_body = """ 34 |

one

35 |

two

36 |

three

""" 37 | assert query_html_doc(html_body, 'for $x in //p return $x/text()') == expected_result(""" 38 | one 39 | two 40 | three""") 41 | 42 | 43 | def test_flwor_variable_declaration_within_iteration(): 44 | query = 'for $x in (1 to 2) let $y := concat("Thing ", string($x)) return $y' 45 | assert query_html_doc('', query) == expected_result(""" 46 | Thing 1 47 | Thing 2""") 48 | 49 | 50 | def test_rooted_location_paths_work_with_both_kinds_of_slash(): 51 | html_body = """ 52 |
53 |
54 |
foo
55 |
56 |
57 |
58 |
59 |
bar
60 |
61 |
""" 62 | 63 | assert query_html_doc(html_body, 'for $x in //section return $x/div') == expected_result(""" 64 |
65 |
66 | foo 67 |
68 |
69 |
70 |
71 | bar 72 |
73 |
""") 74 | 75 | assert query_html_doc(html_body, 'for $x in //section return $x//div') == expected_result(""" 76 |
77 |
78 | foo 79 |
80 |
81 |
82 | foo 83 |
84 |
85 |
86 | bar 87 |
88 |
89 |
90 | bar 91 |
""") 92 | 93 | 94 | def test_variables_before_for_have_global_scope_and_within_for_have_iteration_scope(): 95 | query = """ 96 | let $x := 2 97 | let $z := $x 98 | for $_ in (1, $x) 99 | let $y := $_ 100 | let $x := $_ 101 | return ($x, $z, $x = $y)""" 102 | 103 | assert query_html_doc('', ' '.join(query.split('\n'))) == expected_result(""" 104 | 1 105 | 2 106 | true 107 | 2 108 | 2 109 | true""") 110 | 111 | 112 | def test_flwor_with_multiple_for_clauses_is_a_syntax_error(): 113 | with raises(HquerySyntaxError): 114 | query_html_doc('', 'for $x in (1, 2) let $y := 0 for $z in (3, 4) return $z') 115 | 116 | 117 | def test_flwor_with_multiple_return_clauses_is_a_syntax_error(): 118 | with raises(HquerySyntaxError): 119 | query_html_doc('', 'let $x := 0 return $x return $x + 1') 120 | 121 | 122 | def test_abbreviated_flowr_provides_expected_iteration_variable_in_value_clause(): 123 | html_body = """ 124 |

one

125 |

two

126 |

three

""" 127 | 128 | assert query_html_doc(html_body, '//p -> $_/text()') == expected_result(""" 129 | one 130 | two 131 | three""") 132 | 133 | 134 | def test_nested_abbreviated_flwors_evaluate_as_expected(): 135 | html_body = """ 136 |
137 |

one

138 |

two

139 |
140 |
141 |

three

142 |

four

143 |

five

144 |
class AttributeNode:
    """
    Stand-in node object for an HTML attribute, holding the attribute's name and a single string value.

    A list value (as supplied for multi-valued attributes) is flattened into one space-separated string.
    """

    def __init__(self, name, value):
        self.name = name
        if isinstance(value, list):
            value = ' '.join(value)
        self.value = value

    def __repr__(self):
        return 'AttributeNode("{0}", "{1}")'.format(self.name, self.value)

    @classmethod
    def enumerate(cls, node):
        """Return the attribute nodes recorded on ``node`` as ``hq_attrs``, or an empty list if it has none."""
        if not hasattr(node, 'hq_attrs'):
            return []
        if not _isnt_root_with_odd_ghost_hq_attrs_on_it_for_reasons_i_dont_understand(node):
            return []
        return node.hq_attrs
def derive_text_from_node(obj, preserve_space=False):
    """
    Produce the text content of a node.

    For an element or the document root, the descendant strings are concatenated. With ``preserve_space`` they are
    joined verbatim. Otherwise each stripped run is emitted, preceded by a single space whenever the string it came
    from began with whitespace or the string before it ended with whitespace, and the final text has non-breaking
    spaces converted to ordinary spaces, whitespace runs collapsed and the ends trimmed. Attribute nodes yield their
    value and text nodes their string form, subject to the same normalization.

    :param obj: root, element, attribute or text node
    :param preserve_space: when True, skip all whitespace normalization
    :return: the derived text
    :raises RuntimeError: if ``obj`` is none of the supported node kinds
    """
    if is_tag_node(obj) or is_root_node(obj):
        strings = list(obj.strings)
        if preserve_space:
            result = u''.join(strings)
        else:
            pieces = []
            cursor = 0
            for run in obj.stripped_strings:
                # Locate the original (unstripped) string this run was derived from.
                while cursor < len(strings) and run not in strings[cursor]:
                    cursor += 1

                add_space = False
                if cursor < len(strings):
                    current = strings[cursor]
                    previous = strings[cursor - 1] if cursor > 0 else u''
                    # Slices rather than indexes so that an empty string cannot raise IndexError.
                    add_space = current[:1].isspace() or previous[-1:].isspace()
                    # Each run comes from its own string, so move past the one just consumed; otherwise a later
                    # run that happens to be a substring of this string would be matched against it again and
                    # pick up the wrong spacing.
                    cursor += 1

                pieces.append(u'{0}{1}'.format(' ' if add_space else '', run))
            result = u''.join(pieces)
    elif is_attribute_node(obj):
        result = obj.value
    elif is_text_node(obj):
        result = str(obj)
    else:
        raise RuntimeError("don't know how to derive text from {0}".format(debug_dump_node(obj)))

    if not preserve_space:
        result = result.replace(u'\u00a0', ' ')
        result = re.sub(r'\s+', ' ', result).strip()

    return result
def make_soup(source):
    """
    Parse HTML source with the ``html.parser`` backend and annotate every node for later querying.

    Each root, element, attribute, text and comment node receives an ``hq_doc_index`` reflecting pre-order document
    position. Each element additionally receives ``hq_attrs``, a list of AttributeNode objects sorted by
    case-insensitive attribute name.

    :param source: HTML text to parse
    :return: the annotated BeautifulSoup document
    """
    soup = BeautifulSoup(source, 'html.parser')
    counter = [0]

    def visit_node(node):
        node.hq_doc_index = counter[0]
        counter[0] += 1
        if is_tag_node(node):
            attr_names = sorted(node.attrs.keys(), key=lambda name: name.lower())
            # Only attach the attribute nodes here. preorder_traverse_node_tree walks hq_attrs right after calling
            # this function, so indexing them here as well would visit (and count) every attribute twice.
            node.hq_attrs = [AttributeNode(name, node.attrs[name]) for name in attr_names]

    preorder_traverse_node_tree(soup, visit_node, filter=is_any_node)
    verbose_print('Loaded HTML document containing {0} indexed nodes.'.format(counter[0]))
    return soup
class LocationPath:
    """
    An ordered list of location steps, optionally anchored at the document root (``absolute``) or at the result of
    another expression (``root_expression``).
    """

    def __init__(self, first_axis, first_node_test, first_predicates, absolute=False, root_expression=None):
        """
        Create a path containing a single initial step.

        :param first_axis: axis of the first step
        :param first_node_test: node test of the first step
        :param first_predicates: predicate expression functions of the first step
        :param absolute: True if the path starts from the document root
        :param root_expression: callable producing the starting contexts, or None
        :raises HquerySyntaxError: if the path is both absolute and rooted in an expression, or if the first step
            is rejected by append_step
        """
        self.absolute = absolute
        self.root_expression = root_expression
        self.steps = []
        self.append_step(first_axis, first_node_test, first_predicates)
        if self.absolute and self.root_expression is not None:
            raise HquerySyntaxError('internal error forming location path; it looks both rooted and absolute')


    def __len__(self):
        """Return the number of steps in the path."""
        return len(self.steps)


    def __str__(self):
        """Render the steps joined by slashes, with a leading slash for a rooted or absolute path."""
        return '{0}{1}{2}'.format('' if self.root_expression is None else '/',
                                  '/' if self.absolute else '',
                                  '/'.join([str(step) for step in self.steps]))


    def append_step(self, axis, node_test, predicates):
        """
        Add another step to the end of the path.

        :raises HquerySyntaxError: if the CSS class axis is paired with something other than a name test
        """
        if axis == Axis.css_class and not node_test.is_name_test:
            raise HquerySyntaxError('CSS class axis must be followed by a name test, not a node test')
        self.steps.append(LocationPathStep(axis, node_test, predicates))


    def debug_dump(self):
        """Return the string form of the path, shortened for verbose output."""
        return debug_dump_long_string(str(self))


    def evaluate(self):
        """
        Evaluate every step of the path and return the selected nodes as a node set.

        An absolute path is evaluated with the context switched to the topmost ancestor of the current context
        node; a rooted path is evaluated across whatever ``root_expression`` produces; otherwise the current
        context is used as-is.
        """
        verbose_print(lambda: 'Evaluating location path {0}'.format(self.debug_dump()), indent_after=True)

        if self.absolute:
            verbose_print('Switching context to root because this path is absolute.')
            results = evaluate_in_context(soup_from_any_tag(get_context_node()),
                                          lambda: self._evaluate_steps(self.steps))
        elif self.root_expression is not None:
            results = evaluate_across_contexts(self.root_expression(), lambda: self._evaluate_steps(self.steps))
        else:
            results = self._evaluate_steps(self.steps)

        verbose_print('Evaluation completed; location path selected {0} nodes'.format(len(results)),
                      outdent_before=True)
        return make_node_set(results, reverse=False)


    def _evaluate_steps(self, remaining_steps):
        """
        Evaluate the first of ``remaining_steps`` against the current context node, then recurse on the rest.

        The axis and node test produce a candidate set, each predicate narrows it in turn, and any further steps
        are evaluated across the surviving nodes.
        """
        step = remaining_steps[0]
        verbose_print(lambda: 'Evaluating step {0}'.format(remaining_steps[0]), indent_after=True)

        result_set = make_node_set(step.node_test.apply(step.axis, get_context_node()),
                                   reverse=step.axis.is_reverse_order())
        verbose_print(lambda: 'Axis and node test produced {0} matching nodes'.format(len(result_set)))

        for index, expression_fn in enumerate(step.predicates):
            # Defined inside the loop so that it closes over this iteration's expression_fn; it is handed to
            # evaluate_across_contexts below, within the same iteration.
            def accept_context_node():
                context = peek_context()

                format_str = u'Evaluating predicate expression for context node at position {0} of {1}: {2}.'
                verbose_print(lambda: format_str.format(context.position, context.size, debug_dump_node(context.node)))

                value = expression_fn()
                if is_number(value):
                    # A numeric predicate selects by position rather than by truthiness.
                    accept = number(context.position) == value
                else:
                    accept = bool(value)

                verbose_print(lambda: u'{0} node {1}'.format('Accepted' if accept else 'Rejected',
                                                              debug_dump_node(context.node)))
                return [context.node] if accept else []

            verbose_print(lambda: 'Evaluating predicate #{0} against {1} nodes'.format(index + 1, len(result_set)),
                          indent_after=True)
            result_set = evaluate_across_contexts(result_set, accept_context_node)
            verbose_print(
                lambda: 'Evaluation of predicate #{0} complete; accepted {1} nodes.'.format(index + 1, len(result_set)),
                outdent_before=True)

        if len(remaining_steps) > 1:
            result_set = evaluate_across_contexts(result_set, lambda: self._evaluate_steps(remaining_steps[1:]))

        verbose_print(lambda: 'Step evaluation completed; returning {0} nodes.'.format(len(result_set)),
                      outdent_before=True)
        return result_set
verbose_print('JSON hash constructor array filter converting attribute "{0}" to array'.format(key)) 42 | hash[key] = [value] 43 | 44 | return evaluate 45 | 46 | 47 | def _construct_map_filter(mappings): 48 | mappings = {old: new for (old, _, new) in [m.partition('>') for m in mappings.split(',')]} 49 | 50 | def evaluate(hash): 51 | to_replace = [] 52 | for key, value in hash.items(): 53 | if key in mappings: 54 | verbose_print('JSON hash constructor mapping filter converting attribute name "{0}" to "{1}"'.format(key, value)) 55 | to_replace.append(key) 56 | 57 | for key in to_replace: 58 | hash[mappings[key]] = hash[key] 59 | del hash[key] 60 | 61 | return evaluate 62 | 63 | 64 | def _construct_number_filter(tag_names): 65 | tag_names = tag_names.split(',') 66 | 67 | def evaluate(hash): 68 | for key, value in hash.items(): 69 | if key in tag_names: 70 | verbose_print( 71 | 'JSON hash constructor number filter converting attribute "{0}" value(s) to numbers'.format(key) 72 | ) 73 | if isinstance(value, list): 74 | hash[key] = [number(v).value for v in value] 75 | else: 76 | hash[key] = number(value).value 77 | 78 | return evaluate 79 | 80 | 81 | _name_list_arg_regex = r'(([a-zA-Z]\w*,?)+)' 82 | 83 | def _skip_over_embedded_groups_from_list_matches(groups): 84 | return groups[::2] 85 | 86 | 87 | _filter_map = { 88 | r'a:{0}:'.format(_name_list_arg_regex): _construct_array_filter, 89 | r'm:(([a-zA-Z]\w*>[a-zA-Z]\w*,?)+):': _construct_map_filter, 90 | r'n:{0}:'.format(_name_list_arg_regex): _construct_number_filter, 91 | } 92 | 93 | 94 | class ComputedJsonHashConstructor: 95 | 96 | def __init__(self): 97 | self.contents = None 98 | self.filters = [] 99 | 100 | 101 | def set_contents(self, expression_fn): 102 | if self.contents is not None: 103 | raise HquerySyntaxError('computed JSON hash constructor already has contents') 104 | self.contents = expression_fn 105 | 106 | 107 | def set_filters(self, source): 108 | while len(source) > 0: 109 | match = None 110 | for regex, 
def evaluate(self):
    """
    Evaluate the constructor's contents and assemble them into a JsonHash.

    Key/value items become entries under their key: a one-item sequence value is first replaced by its only item,
    then numbers and booleans contribute their underlying value, hashes and arrays their contents, and anything
    else its string value. Elements are merged in through ``_process_tag``. Text nodes and strings are accumulated
    under the ``text`` key. The configured filters are applied to the finished dictionary.

    :raises HqueryEvaluationError: if a content item is none of the supported kinds
    """
    result = dict()
    items = make_sequence(self.contents()) if self.contents is not None else []

    for item in items:
        if isinstance(item, HashKeyValue):
            if is_sequence(item.value) and len(item.value) == 1:
                item.value = item.value[0]

            if is_number(item.value) or is_boolean(item.value):
                entry = item.value.value
            elif is_hash(item.value) or is_array(item.value):
                entry = item.value.contents
            else:
                entry = string_value(item.value)
            result[item.key] = entry
            continue

        if is_tag_node(item):
            self._gab('adding element "{0}" to contents'.format(item.name))
            self._process_tag(result, item)
            continue

        if is_text_node(item) or is_string(item):
            self._gab('adding text "{0}" to contents'.format(debug_dump_long_string(string_value(item))))
            result['text'] = self._append_to_text(result.get('text', ''), string_value(item))
            continue

        if is_any_node(item):
            value_desc = debug_dump_node(item)
        else:
            value_desc = object_type_name(item)
        raise HqueryEvaluationError(
            'Cannot use {0} as a content object in a computed JSON hash constructor'.format(value_desc)
        )

    self._process_filters(result)

    return JsonHash(result)
def _process_tag(self, result, value):
    """
    Record an element's string value in ``result`` under the element's tag name.

    The first occurrence of a tag name is stored as a plain value; a second occurrence converts the entry into a
    two-item list, and further occurrences are appended to that list.
    """
    text = string_value(value)
    tag = value.name

    if tag not in result:
        result[tag] = text
        return

    existing = result[tag]
    if isinstance(existing, list):
        existing.append(text)
    else:
        result[tag] = [existing, text]

foo

9 |
bar
""" 10 | actual = json.loads(query_html_doc(html_body, 'hash { /html/body/* }')) 11 | assert actual['p'] == 'foo' 12 | assert actual['div'] == 'bar' 13 | 14 | 15 | def test_hash_constructor_turns_text_into_attribute_named_text(): 16 | html_body = '

Hello, world!

' 17 | expected = '{"text": "Hello, world!"}' 18 | assert query_html_doc(html_body, 'hash { //p/text() }') == expected 19 | assert query_html_doc('', 'hash { "Hello, world!" }') == expected 20 | 21 | 22 | def test_hash_constructor_joins_discontinuous_text_from_content_sequence_with_spaces_in_between(): 23 | html_body = '

vidi

' 24 | assert query_html_doc(html_body, 'hash { "veni", //p/text(), "vici" }') == '{"text": "veni vidi vici"}' 25 | 26 | 27 | def test_hash_constructor_coalesces_like_elements_into_an_array_by_default(): 28 | html_body = """ 29 |

one

30 |
two
31 |

three

""" 32 | 33 | actual = json.loads(query_html_doc(html_body, 'hash { /html/body/* }')) 34 | assert isinstance(actual['p'], list) 35 | assert len(actual['p']) == 2 36 | assert actual['p'][1] == 'three' 37 | assert actual['div'] == 'two' 38 | 39 | 40 | def test_hash_constructor_array_filter_causes_matching_elements_to_be_put_in_an_array(): 41 | html_body = """ 42 |

zero

43 |

one

""" 44 | actual = json.loads(query_html_doc(html_body, 'hash {a:h1:} { /html/body/* }')) 45 | 46 | assert actual['p'] == 'one' 47 | assert isinstance(actual['h1'], list) 48 | assert len(actual['h1']) == 1 49 | assert actual['h1'][0] == 'zero' 50 | 51 | 52 | def test_hash_constructor_number_filter_causes_contents_of_matching_elements_to_be_interpreted_as_numbers(): 53 | html_body = """ 54 |

20

55 |
20
56 |

20.20

""" 57 | 58 | actual = json.loads(query_html_doc(html_body, 'hash {n:div,h1:} { /html/body/* }')) 59 | 60 | assert actual['p'] == '20' 61 | assert actual['div'] == 20 62 | assert actual['h1'] == 20.2 63 | 64 | 65 | def test_hash_constructor_filters_can_be_combined(): 66 | html_body = """ 67 |

20

68 |
20
69 |

20.20

""" 70 | 71 | actual = json.loads(query_html_doc(html_body, 'hash {a:p,h1:n:div,h1:} { /html/body/* }')) 72 | assert isinstance(actual['p'], list) 73 | assert isinstance(actual['h1'], list) 74 | assert actual['p'][0] == '20' 75 | assert actual['div'] == 20 76 | assert actual['h1'][0] == 20.2 77 | 78 | actual = json.loads(query_html_doc(html_body, 'hash {n:div,h1:a:p,h1:} { /html/body/* }')) 79 | assert isinstance(actual['p'], list) 80 | assert isinstance(actual['h1'], list) 81 | assert actual['p'][0] == '20' 82 | assert actual['div'] == 20 83 | assert actual['h1'][0] == 20.2 84 | 85 | 86 | def test_hash_constructor_mapping_filter_renames_attributes_derived_from_element_content(): 87 | html_body = """ 88 |

foo

89 |
bar
""" 90 | 91 | actual = json.loads(query_html_doc(html_body, 'hash {m:p>paragraph,div>other:} { /html/body/* }')) 92 | 93 | assert 'paragraph' in actual 94 | assert 'other' in actual 95 | assert 'p' not in actual 96 | assert 'div' not in actual 97 | assert actual['paragraph'] == 'foo' 98 | assert actual['other'] == 'bar' 99 | 100 | 101 | def test_hash_constructor_can_contain_a_sequence_assembled_from_node_sets(): 102 | html_body = """ 103 |

foo

104 |
bar
""" 105 | 106 | actual = json.loads(query_html_doc(html_body, 'hash { /html/body/p, /html/body/div }')) 107 | 108 | assert 'p' in actual 109 | assert 'div' in actual 110 | assert actual['p'] == 'foo' 111 | assert actual['div'] == 'bar' 112 | 113 | 114 | def test_hash_keys_can_be_used_to_define_attributes_in_a_constructed_hash(): 115 | actual = json.loads(query_html_doc('', 'hash {foo: "bar", moe: "larry"}')) 116 | 117 | assert 'foo' in actual 118 | assert actual['foo'] == 'bar' 119 | assert 'moe' in actual 120 | assert actual['moe'] == 'larry' 121 | 122 | 123 | def test_hash_keys_can_be_mixed_with_other_types_of_content_in_a_constructed_hash(): 124 | html_body = """ 125 | Wake up and go back to sleep! 126 | I'm trying to think, but nothing happens!""" 127 | 128 | actual = json.loads(query_html_doc(html_body, 'hash {//moe, larry: "The pain goes away on payday.", //curly}')) 129 | 130 | assert 'moe' in actual 131 | assert 'larry' in actual 132 | assert 'curly' in actual 133 | assert actual['moe'] == 'Wake up and go back to sleep!' 134 | assert actual['larry'] == 'The pain goes away on payday.' 135 | assert actual['curly'] == "I'm trying to think, but nothing happens!" 
def test_element_value_in_hash_key_is_transformed_into_string_value_by_default():
    """An element used as the value of a hash key contributes its string value to the constructed hash."""
    html_body = '<p>you are here</p>'

    actual = json.loads(query_html_doc(html_body, 'hash { placement: //p }'))

    # Previously the parsed dictionary was compared against a differently-capitalized string and the outcome was
    # thrown away, so this test could never fail. Check the value stored under the key instead.
    assert actual['placement'] == 'you are here'

one

170 |
two
171 |

three

""" 172 | 173 | actual = json.loads(query_html_doc(html_body, 'array { //p, //div }')) 174 | 175 | assert len(actual) == 3 176 | assert actual[0] == 'one' 177 | assert actual[1] == 'three' 178 | assert actual[2] == 'two' 179 | 180 | 181 | def test_array_constructor_properly_handles_hash_constructors_as_contents(): 182 | actual = json.loads(query_html_doc('', 'array { (0 to 2) -> hash {value: $_} }')) 183 | 184 | assert len(actual) == 3 185 | assert all('value' in hash for hash in actual) 186 | assert all(actual[i]['value'] == i for i in range(0, 3)) 187 | 188 | 189 | def test_text_content_normalization_is_applied_to_attribute_values_in_hash_constructor(): 190 | preserved = u'\u00a0non\u00a0breaking\u00a0spaces ' 191 | html_body = u'

{0}

'.format(preserved) 192 | 193 | actual = json.loads(query_html_doc(html_body, 'hash {para: //p/text()}')) 194 | assert actual['para'] == 'non breaking spaces' 195 | 196 | actual = json.loads(query_html_doc(html_body, 'hash {para: //p/text()}', preserve_space=True)) 197 | assert actual['para'] == preserved 198 | -------------------------------------------------------------------------------- /test/hquery/test_core_functions.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | sys.path.insert(0, os.path.abspath('../..')) 5 | 6 | from ..common_test_util import expected_result 7 | from test.hquery.hquery_test_util import query_html_doc 8 | 9 | 10 | def test_boolean_function_converts_numbers_according_to_w3c_rules(): 11 | assert query_html_doc('', 'boolean(0)') == expected_result('false') 12 | assert query_html_doc('', 'boolean(-0)') == expected_result('false') 13 | assert query_html_doc('', 'boolean(1)') == expected_result('true') 14 | assert query_html_doc('', 'boolean(-1)') == expected_result('true') 15 | assert query_html_doc('', 'false() = boolean(false())') == expected_result('true') 16 | assert query_html_doc('', 'boolean(0 div 0)') == expected_result('false') 17 | 18 | 19 | def test_boolean_function_converts_node_sets_according_to_w3c_rules(): 20 | assert query_html_doc('
', 'boolean(//div)') == expected_result('true') 21 | assert query_html_doc('
', 'boolean(//p)') == expected_result('false') 22 | 23 | 24 | def test_boolean_function_converts_strings_according_to_w3c_rules(): 25 | assert query_html_doc('', 'boolean("")') == expected_result('false') 26 | assert query_html_doc('', 'boolean(" ")') == expected_result('true') 27 | 28 | 29 | def test_ceiling_returns_expected_integer_values_baserd_on_xpath_3_examples(): 30 | assert query_html_doc('', 'ceiling(10.5)') == '11' 31 | assert query_html_doc('', 'ceiling(-10.5)') == '-10' 32 | 33 | 34 | def test_floor_returns_expected_integer_values_baserd_on_xpath_3_examples(): 35 | assert query_html_doc('', 'floor(10.5)') == '10' 36 | assert query_html_doc('', 'floor(-10.5)') == '-11' 37 | 38 | 39 | def test_id_function_returns_node_set_where_node_ids_match_any_names_in_whitespace_separated_list(): 40 | html_body = """ 41 |

one

42 |

two

43 |

three

""" 44 | assert query_html_doc(html_body, 'id("one")') == expected_result(""" 45 |

46 | one 47 |

""") 48 | assert query_html_doc(html_body, 'id("one 3")') == expected_result(""" 49 |

50 | one 51 |

52 |

53 | three 54 |

""") 55 | assert query_html_doc(html_body, 'id(3)') == expected_result(""" 56 |

57 | three 58 |

""") 59 | 60 | 61 | def test_id_function_crazy_use_case_where_id_values_are_derived_from_string_values_of_nodes_in_node_set(): 62 | html_body = """ 63 |
    64 |
  • one
  • 65 |
  • 2
  • 66 |
67 |

one

68 |

two

""" 69 | assert query_html_doc(html_body, 'id(//li)') == expected_result(""" 70 |

71 | one 72 |

73 |

74 | two 75 |

""") 76 | 77 | 78 | def test_name_function_returns_tag_name_of_given_element_or_first_element_if_given_a_node_set(): 79 | html_body = '

' 80 | assert query_html_doc(html_body, 'name(/html/body/*)') == 'div' 81 | 82 | 83 | def test_name_function_returns_name_of_context_node_if_passed_no_argument(): 84 | html_body = """ 85 |
not selected
86 |

selected

""" 87 | assert query_html_doc(html_body, '/html/body/*[name() = "p"]') == expected_result(""" 88 |

89 | selected 90 |

""") 91 | 92 | 93 | def test_name_function_returns_empty_string_if_passed_a_node_that_is_not_an_element(): 94 | html_body = 'Text comes first then element' 95 | assert query_html_doc(html_body, 'name(/html/body/node()[1])') == '' 96 | assert query_html_doc(html_body, 'name(/html/body/node()[2])') == 'span' 97 | 98 | 99 | def test_not_function_produces_expected_results(): 100 | assert query_html_doc('', 'not(false())') == expected_result('true') 101 | assert query_html_doc('', 'not(not("foo" = "bar"))') == expected_result('false') 102 | assert query_html_doc('', 'not(0)') == expected_result('true') 103 | assert query_html_doc('', 'not(10000)') == expected_result('false') 104 | 105 | 106 | def test_number_function_converts_string_to_number(): 107 | assert query_html_doc('', 'number("43") + number("-1")') == expected_result('42') 108 | assert query_html_doc('', 'number("10") + number("1.11")') == expected_result('11.11') 109 | 110 | 111 | def test_number_function_converts_boolean_values_to_one_and_zero(): 112 | assert query_html_doc('', 'number(true())') == expected_result('1') 113 | assert query_html_doc('', 'number(false())') == expected_result('0') 114 | 115 | 116 | def test_number_function_converts_node_set_based_on_string_value_of_first_node_in_doc_order(): 117 | html_body = """ 118 |
119 |
120 |

98.6

121 |
122 |
123 |

24

""" 124 | assert query_html_doc(html_body, 'number(//p)') == expected_result('98.6') 125 | 126 | 127 | def test_round_function_follows_xpath_1_rules_for_positive_numbers_but_python_rules_for_negative_ones(): 128 | """ 129 | Not fooling with positive or negative infinity or zero, nor the numeric type business in the XPath 3.0 functions 130 | spec. Also not, as the test name attests, respecting XPath 1 rules for negative numbers, as Python rounds away 131 | from zero and I anticipate some tiresome drudgery for no particular benefit (again, HQuery is not intended as an 132 | execution target for existing XPath code). 133 | """ 134 | assert query_html_doc('', 'round(5.49)') == '5' 135 | assert query_html_doc('', 'round(5.5)') == '6' 136 | assert query_html_doc('', 'round(1 div 0)') == 'NaN' 137 | assert query_html_doc('', 'round(-5.5)') == '-6' 138 | assert query_html_doc('', 'round(-5.49)') == '-5' 139 | 140 | 141 | def test_round_function_supports_an_optional_precision_argument(): 142 | assert query_html_doc('', 'round(3.456)') == '3' 143 | assert query_html_doc('', 'round(3.456, 1)') == '3.5' 144 | assert query_html_doc('', 'round(3.456, 2)') == '3.46' 145 | assert query_html_doc('', 'round(3.456, 3)') == '3.456' 146 | 147 | 148 | def test_substring_function_behaves_reasonably_and_lets_agree_to_ignore_all_that_NaN_crap(): 149 | assert query_html_doc('', 'substring("12345", 1.5, 2.6)') == '234' 150 | assert query_html_doc('', 'substring("12345", 0, 3)') == '12' 151 | assert query_html_doc('', 'substring("12345", -1, 3)') == '1' 152 | assert query_html_doc('', 'substring("12345", 5, 2)') == '5' 153 | 154 | 155 | def test_substring_after_and_substring_before_work_per_spec(): 156 | assert query_html_doc('', 'substring-after("1999/04/01", "/")') == '04/01' 157 | assert query_html_doc('', 'substring-after("1999/04/01", ":")') == '' 158 | assert query_html_doc('', 'substring-before("1999/04/01", "/")') == '1999' 159 | assert query_html_doc('', 
'substring-before("1999/04/01", ":")') == '' 160 | 161 | 162 | def test_true_and_false_functions_return_expected_values(): 163 | assert query_html_doc('', 'false()') == expected_result('false') 164 | assert query_html_doc('', 'true()') == expected_result('true') 165 | assert query_html_doc('', 'true() = false()') == expected_result('false') 166 | assert query_html_doc('', 'true() != false()') == expected_result('true') 167 | 168 | 169 | def test_position_function_in_predicate_applies_to_current_step_only(): 170 | html_body = """ 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 |
onetwo
unodos
ichini
""" 185 | assert query_html_doc(html_body, '//tr[@class="select-me"]/td[position()=2]') == expected_result(""" 186 | 187 | two 188 | 189 | 190 | ni 191 | """) 192 | 193 | 194 | def test_position_function_in_second_predicate_applies_to_results_from_first_predicate(): 195 | html_body = """ 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 |
onetwo
unodos
ichini
""" 210 | assert query_html_doc(html_body, '//td[../@class="select-me"][position()=1]') == expected_result(""" 211 | 212 | one 213 | 214 | 215 | ichi 216 | """) 217 | 218 | 219 | def test_string_function_returns_expected_results_for_various_objects(): 220 | html_body = """ 221 |

one

222 |

two

""" 223 | 224 | assert query_html_doc(html_body, 'string(//p)') == expected_result('one') 225 | assert query_html_doc('', 'string(2 div 0)') == expected_result('NaN') 226 | assert query_html_doc('', 'string(-0)') == expected_result('0') 227 | assert query_html_doc('', 'string(-9)') == expected_result('-9') 228 | assert query_html_doc('', 'string(98.6)') == expected_result('98.6') 229 | assert query_html_doc('', 'string(true())') == expected_result('true') 230 | assert query_html_doc('', 'string(1 = -1)') == expected_result('false') 231 | 232 | 233 | def test_string_value_of_an_element_with_mixed_content_inserts_proper_spaces_between_text_runs(): 234 | html_body = '

once twice thrice

' 235 | assert query_html_doc(html_body, 'string(//p)') == expected_result('once twice thrice') 236 | 237 | 238 | def test_string_length_function_returns_expected_values(): 239 | assert query_html_doc('', 'string-length("foo")') == expected_result('3') 240 | assert query_html_doc('', 'string-length("")') == expected_result('0') 241 | 242 | 243 | def test_sum_function_sums_number_interpretation_of_items_in_sequence(): 244 | html_body = """ 245 | 30 246 |
247 | 2""" 248 | 249 | assert query_html_doc(html_body, 'sum(//span)') == '32' 250 | assert query_html_doc(html_body, 'sum((//span, //div/@value))') == '42.42' 251 | 252 | 253 | def test_sum_function_supports_zero_value_for_empty_sequence_as_second_argument(): 254 | assert query_html_doc('', 'sum(//span, "zero")') == 'zero' 255 | 256 | 257 | def test_various_functions_use_context_node_when_no_argument_passed(): 258 | html_body = """ 259 |

first

260 |

foo bar

261 |

last

""" 262 | 263 | assert query_html_doc(html_body, '//p[string() = "first"]/text()') == expected_result('first') 264 | assert query_html_doc(html_body, '//p[normalize-space() = "foo bar"]/text()', preserve_space=True) == \ 265 | expected_result('foo bar') 266 | assert query_html_doc(html_body, '//p[string-length() = 4]/text()') == expected_result('last') 267 | -------------------------------------------------------------------------------- /test/hquery/test_xpath1_abbreviated_samples.py: -------------------------------------------------------------------------------- 1 | from hq.soup_util import make_soup 2 | from test.common_test_util import expected_result 3 | from test.hquery.hquery_test_util import query_context_node 4 | 5 | 6 | def test_selects_the_para_element_children_of_the_context_node(): 7 | html = """ 8 | 9 | selected 10 | not selected 11 | also selected 12 | """ 13 | assert query_context_node(html, 'para') == expected_result(""" 14 | 15 | selected 16 | 17 | 18 | also selected 19 | """) 20 | 21 | 22 | def test_selects_all_element_children_of_the_context_node(): 23 | html = """ 24 | 25 | 26 | selected 27 | non-selected text 28 | also selected 29 | """ 30 | assert query_context_node(html, '*') == expected_result(""" 31 | 32 | selected 33 | 34 | 35 | also selected 36 | """) 37 | 38 | 39 | def test_selects_all_text_node_children_of_the_context_node(): 40 | html = """ 41 | 42 | first 43 | second 44 | third 45 | """ 46 | actual = query_context_node(html, 'text()') 47 | assert 'first' in actual 48 | assert 'second' not in actual 49 | assert 'third' in actual 50 | 51 | 52 | def test_selects_the_name_attribute_of_the_context_node(): 53 | html = 'not value' 54 | assert query_context_node(html, '@name') == expected_result('name="value"') 55 | 56 | 57 | def test_selects_all_the_attributes_of_the_context_node(): 58 | html = '' 59 | assert query_context_node(html, '@*') == expected_result(''' 60 | first="first value" 61 | second="second value" 62 | third="third 
value"''') 63 | 64 | 65 | def test_selects_the_first_para_child_of_the_context_node(): 66 | html = """ 67 | 68 | selected 69 | not selected 70 | """ 71 | assert query_context_node(html, 'para[1]') == expected_result(""" 72 | 73 | selected 74 | """) 75 | 76 | 77 | def test_selects_the_last_para_child_of_the_context_node(): 78 | html = """ 79 | 80 | not selected 81 | also not selected 82 | selected 83 | """ 84 | assert query_context_node(html, 'para[last()]') == expected_result(""" 85 | 86 | selected 87 | """) 88 | 89 | 90 | def test_selects_all_para_grandchildren_of_the_context_node(): 91 | html = """ 92 | 93 | 94 | not selected 95 | selected 96 | also selected 97 | 98 | """ 99 | assert query_context_node(html, '*/para') == expected_result(""" 100 | 101 | selected 102 | 103 | 104 | also selected 105 | """) 106 | 107 | 108 | def test_selects_the_second_section_of_the_fifth_chapter_of_the_doc(): 109 | html = """ 110 | 111 | one 112 | two 113 | three 114 | four 115 | 116 |
five point one
117 |
five point two
118 |
119 |
""" 120 | assert query_context_node(html, '/doc/chapter[5]/section[2]') == expected_result(""" 121 |
122 | five point two 123 |
""") 124 | 125 | 126 | def test_selects_the_para_element_descendants_of_the_chapter_element_children_of_the_context_node(): 127 | html = """ 128 | 129 | not selected 130 | 131 | 132 | selected 133 | 134 | 135 | """ 136 | assert query_context_node(html, 'chapter//para') == expected_result(""" 137 | 138 | 139 | selected 140 | 141 | 142 | 143 | selected 144 | """) 145 | 146 | 147 | def test_selects_all_the_para_descendants_of_the_document_root_and_thus_selects_all_para_elements_in_the_same_document_as_the_context_node(): 148 | html = """ 149 | 150 | 151 | selected 152 | 153 | 154 | also selected 155 | """ 156 | soup = make_soup(html) 157 | assert query_context_node(soup.root.context, '//para') == expected_result(""" 158 | 159 | 160 | selected 161 | 162 | 163 | 164 | selected 165 | 166 | 167 | also selected 168 | """) 169 | 170 | 171 | def test_selects_all_the_item_elements_in_the_same_document_as_the_context_node_that_have_an_olist_parent(): 172 | html = """ 173 | 174 | no items 175 | not selected 176 | 177 | 178 | first 179 | 180 | 181 | 182 | second 183 | 184 | 185 | """ 186 | soup = make_soup(html) 187 | assert query_context_node(soup.root.context, '//olist/item') == expected_result(""" 188 | 189 | first 190 | 191 | 192 | second 193 | """) 194 | 195 | 196 | def test_selects_the_context_node(): 197 | html = """ 198 | 199 | selected 200 | """ 201 | assert query_context_node(html, '.') == expected_result(""" 202 | 203 | selected 204 | """) 205 | 206 | 207 | def test_selects_the_para_element_descendants_of_the_context_node(): 208 | html = """ 209 | 210 | 211 | selected 212 | not selected 213 | 214 | also selected 215 | 216 | 217 | """ 218 | soup = make_soup(html) 219 | assert query_context_node(soup.para.context, './/para') == expected_result(""" 220 | 221 | selected 222 | 223 | 224 | 225 | also selected 226 | 227 | 228 | 229 | also selected 230 | """) 231 | 232 | 233 | def test_selects_the_parent_of_the_context_node(): 234 | html = """ 235 | 236 | 237 | """ 238 | soup 
= make_soup(html) 239 | assert query_context_node(html, '..') == expected_result(""" 240 | 241 | 242 | 243 | """) 244 | 245 | 246 | def test_selects_the_lang_attribute_of_the_parent_of_the_context_node(): 247 | html = """ 248 | 249 | 250 | """ 251 | soup = make_soup(html) 252 | assert query_context_node(soup.root.context, '../@lang') == expected_result('lang="English"') 253 | 254 | 255 | def test_selects_all_para_children_of_the_context_node_that_have_a_type_attribute_with_value_warning(): 256 | html = """ 257 | 258 | not selected 259 | selected 260 | not selected 261 | also selected 262 | """ 263 | assert query_context_node(html, 'para[@type="warning"]') == expected_result(""" 264 | 265 | selected 266 | 267 | 268 | also selected 269 | """) 270 | 271 | 272 | def test_selects_the_fifth_para_child_of_the_context_node_that_has_a_type_attribute_with_value_warning(): 273 | html = """ 274 | 275 | first error 276 | first warning 277 | second error 278 | second warning 279 | third error 280 | third warning 281 | fourth error 282 | fourth warning 283 | fifth error 284 | fifth warning 285 | """ 286 | assert query_context_node(html, 'para[@type="warning"][5]') == expected_result(""" 287 | 288 | fifth warning 289 | """) 290 | 291 | 292 | def test_selects_the_fifth_para_child_of_the_context_node_if_that_child_has_a_type_attribute_with_value_warning(): 293 | html = """ 294 | 295 | not selected 296 | not selected 297 | not selected 298 | not selected 299 | selected 300 | """ 301 | assert query_context_node(html, 'para[5][@type="warning"]') == expected_result("") 302 | assert query_context_node(html.replace('error', 'warning'), 'para[5][@type="warning"]') == expected_result(""" 303 | 304 | selected 305 | """) 306 | 307 | 308 | def test_selects_the_chapter_children_of_the_context_node_that_have_one_or_more_title_children_with_string_value_equal_to_Introduction(): 309 | html = """ 310 | 311 | 312 | Introduction 313 | 314 | not selected 315 | 316 | Author's Note 317 | 318 | 319 | 
Introduction 320 | Hello, I'm chapter. 321 | 322 | """ 323 | assert query_context_node(html, 'chapter[title="Introduction"]') == expected_result(""" 324 | 325 | 326 | Introduction 327 | 328 | 329 | 330 | 331 | Introduction 332 | 333 | 334 | Hello, I'm chapter. 335 | 336 | """) 337 | 338 | 339 | def test_selects_the_chapter_children_of_the_context_node_that_have_one_or_more_title_children(): 340 | html = """ 341 | 342 | 343 | selected 344 | 345 | 346 | 347 | 348 | 349 | also selected 350 | 351 | """ 352 | assert query_context_node(html, 'chapter[title]') == expected_result(""" 353 | 354 | 355 | selected 356 | 357 | 358 | 359 | 360 | also selected 361 | 362 | """) 363 | 364 | 365 | def test_selects_all_the_employee_children_of_the_context_node_that_have_both_a_secretary_attribute_and_an_assistant_attribute(): 366 | html = """ 367 | 368 | 369 | selected 370 | 371 | 372 | """ 373 | assert query_context_node(html, 'employee[@secretary and @assistant]') == expected_result(""" 374 | 375 | selected 376 | 377 | 378 | """) 379 | -------------------------------------------------------------------------------- /test/hquery/test_axes.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import sys 4 | 5 | from hq.hquery.syntax_error import HquerySyntaxError 6 | from pytest import raises 7 | 8 | sys.path.insert(0, os.path.abspath('../..')) 9 | 10 | from ..common_test_util import expected_result 11 | from test.hquery.hquery_test_util import query_html_doc 12 | 13 | 14 | def test_explicit_child_axis(): 15 | html_body = """ 16 |
17 |

foo

18 |
""" 19 | assert query_html_doc(html_body, '//div/child::p') == expected_result(""" 20 |

21 | foo 22 |

""") 23 | 24 | 25 | def test_child_axis_selects_only_immediate_children(): 26 | html_body = """ 27 |

uncle

28 |
29 |

niece

30 |

nephew

31 |
""" 32 | assert query_html_doc(html_body, '/html/body/child::p') == expected_result(""" 33 |

34 | uncle 35 |

""") 36 | 37 | 38 | def test_descendant_axis_selects_from_descendants_not_ancestors(): 39 | html_body = """ 40 |
41 |
42 |
uncle
43 | 46 |
47 |
""" 48 | actual = query_html_doc(html_body, '/html/body/div/descendant::div') 49 | assert actual == expected_result(""" 50 |
51 | uncle 52 |
53 |
54 | niece 55 |
""") 56 | 57 | 58 | def test_descendant_axis_returns_all_descendants_and_only_descendants_of_nodes_matching_node_test(): 59 | html_body = """ 60 |
61 |
62 |
selected
63 |
64 |
65 | 66 |
not selected
67 |

not selected

""" 68 | expected = expected_result(""" 69 |
70 |
71 | selected 72 |
73 |
74 |
75 | selected 76 |
""") 77 | 78 | assert query_html_doc(html_body, '/html/body/div/descendant::div') == expected 79 | assert query_html_doc(html_body, '/html/body/div/~::div') == expected 80 | 81 | 82 | def test_descendant_or_self_axis_returns_all_descendants_and_context_node_if_it_matches_node_test(): 83 | html_body = """ 84 |
85 |
foo
86 |
87 |
bar
""" 88 | assert query_html_doc(html_body, '/html/body/descendant-or-self::div') == expected_result(""" 89 |
90 |
91 | foo 92 |
93 |
94 |
95 | foo 96 |
97 |
98 | bar 99 |
""") 100 | 101 | 102 | def test_descendant_or_self_axis_does_not_produce_self_if_node_test_does_not_match(): 103 | html_body = """ 104 |
105 |

foo

106 |
""" 107 | assert query_html_doc(html_body, '//div/descendant-or-self::p') == expected_result(""" 108 |

109 | foo 110 |

""") 111 | 112 | 113 | def test_parent_axis_returns_parent_of_tag_node(): 114 | assert query_html_doc('
', '//div/parent::*') == expected_result(""" 115 | 116 |
117 |
118 | """) 119 | 120 | 121 | def test_parent_axis_selects_only_the_immediate_parent(): 122 | html_body = """ 123 |
124 |
125 |

daughter

126 |
127 |
""" 128 | actual = query_html_doc(html_body, '//p/parent::div') 129 | assert actual == expected_result(""" 130 |
131 |

132 | daughter 133 |

134 |
""") 135 | 136 | 137 | def test_parent_axis_returns_parents_for_multiple_matching_nodes(): 138 | html_body = """ 139 |
140 |

141 |

142 |
143 |
144 |

145 |

146 |
""" 147 | assert query_html_doc(html_body, '//p/parent::*') == expected_result(html_body) 148 | 149 | 150 | def test_parent_axis_produces_nothing_for_root_element(): 151 | assert query_html_doc('', '/html/parent::*') == expected_result('') 152 | assert query_html_doc('
', 'div/parent::*', wrap_body=False) == expected_result('') 153 | 154 | 155 | def test_ancestor_axis_selects_all_matching_ancestors(): 156 | html_body = """ 157 |
158 |
159 |
160 |

text

161 |
162 |
163 |
""" 164 | expected = expected_result(""" 165 |
166 |
167 |
168 |

169 | text 170 |

171 |
172 |
173 |
174 |
175 |

176 | text 177 |

178 |
""") 179 | 180 | assert query_html_doc(html_body, '//p/ancestor::div') == expected 181 | assert query_html_doc(html_body, '//p/^::div') == expected 182 | 183 | 184 | def test_ancestor_axis_produces_all_ancestors_and_only_ancestors(): 185 | html_body = """ 186 | 187 | 188 | 189 |

190 |
191 | 192 | """ 193 | assert query_html_doc(html_body, '//div/ancestor::*', wrap_body=False) == expected_result(""" 194 | 195 | 196 | 197 |

198 |

199 |
200 |
201 | 202 | 203 | 204 | 205 |

206 |

207 |
208 |
209 | """) 210 | 211 | 212 | def test_ancestor_or_self_axis_produces_ancestors_and_self_when_node_test_is_a_match(): 213 | html_body = """ 214 |
215 |
foo
216 |
""" 217 | expected = expected_result(""" 218 |
219 |
220 | foo 221 |
222 |
223 |
224 | foo 225 |
""") 226 | 227 | assert query_html_doc(html_body, '/html/body/div/div/ancestor-or-self::div') == expected 228 | assert query_html_doc(html_body, '/html/body/div/div/^^::div') == expected 229 | 230 | 231 | def test_following_sibling_axis_selects_all_following_siblings_and_only_following_siblings_that_match_name_test(): 232 | html_body = """ 233 |
234 |
235 |

236 |

moe

237 |
238 |
239 |

larry

240 |

241 |

curly

242 |
""" 243 | expected = expected_result(""" 244 |

245 | moe 246 |

247 |

248 | curly 249 |

""") 250 | 251 | assert query_html_doc(html_body, '//div/following-sibling::p') == expected 252 | assert query_html_doc(html_body, '//div/>::p') == expected 253 | 254 | 255 | def test_following_sibling_axis_works_with_node_test(): 256 | html_body = """ 257 |
258 | foo 259 |

260 | bar 261 |
""" 262 | assert query_html_doc(html_body, '//p/following-sibling::text()') == expected_result('bar') 263 | assert query_html_doc('

foo

', '//div/following-sibling::*') == expected_result(""" 264 |

265 | foo 266 |

""") 267 | 268 | 269 | def test_preceding_sibling_axis_works_with_name_test(): 270 | html_body = """ 271 |

foo

272 |
273 |

bar

""" 274 | expected = expected_result(""" 275 |

276 | foo 277 |

""") 278 | 279 | assert query_html_doc(html_body, '//div/preceding-sibling::p') == expected 280 | assert query_html_doc(html_body, '//div/<::p') == expected 281 | 282 | 283 | def test_preceding_sibling_axis_works_with_node_test(): 284 | html_body = """ 285 |

foo

286 |

bar

287 |
288 |

nothing

""" 289 | assert query_html_doc(html_body, '//div/preceding-sibling::node()') == expected_result(""" 290 |

291 | foo 292 |

293 |

294 | bar 295 |

""") 296 | 297 | 298 | def test_preceding_sibling_axis_returns_nodes_in_document_order(): 299 | """Node sets are unordered, but people really seem to like these being in document order.""" 300 | html_body = """ 301 |

foo

302 |

bar

303 |
""" 304 | assert query_html_doc(html_body, '//div/preceding-sibling::p') == expected_result(""" 305 |

306 | foo 307 |

308 |

309 | bar 310 |

""") 311 | 312 | 313 | def test_following_axis_finds_all_following_nodes_that_match(): 314 | html_body = """ 315 |
316 |

moe

317 | 320 |
321 |

curly

322 |
323 |
324 |

shemp

""" 325 | expected = expected_result(""" 326 |

327 | curly 328 |

329 |

330 | shemp 331 |

""") 332 | 333 | assert query_html_doc(html_body, '//aside/following::p') == expected 334 | assert query_html_doc(html_body, '//aside/>>::p') == expected 335 | 336 | 337 | def test_preceding_axis_finds_all_preceding_nodes_that_match_node_test(): 338 | html_body = """ 339 | foo 340 |
341 |

bar

342 |
343 | """ 344 | actual = query_html_doc(html_body, '//span/preceding::text()') 345 | actual = re.sub(r'\W+', ' ', actual) 346 | assert actual == 'foo bar' 347 | 348 | 349 | def test_preceding_axis_finds_all_preceding_nodes_that_match(): 350 | html_body = """ 351 |

moe

352 |
353 |
354 |

larry

355 |
356 | 359 |

shemp

360 |
""" 361 | expected = expected_result(""" 362 |

363 | moe 364 |

365 |

366 | larry 367 |

""") 368 | 369 | assert query_html_doc(html_body, '//aside/preceding::p') == expected 370 | assert query_html_doc(html_body, '//aside/<<::p') == expected 371 | 372 | 373 | def test_preceding_axis_produces_results_in_document_order_and_also_works_with_node_test(): 374 | html_body = """ 375 |

moe

376 |
377 |
378 |
379 |

larry

380 |
381 |
382 | 385 |

shemp

386 |
387 | """ 388 | assert query_html_doc(html_body, '//script/preceding::p/text()') == expected_result(""" 389 | moe 390 | larry 391 | curly 392 | shemp""") 393 | 394 | 395 | def test_attribute_axis_in_full_and_abbreviated_form_selects_named_attributes_or_all_attributes(): 396 | html_body = """ 397 |
398 |
""" 399 | expected_ids_result = expected_result(''' 400 | id="one" 401 | id="two"''') 402 | expected_all_result = expected_result(''' 403 | id="one" 404 | class="three" 405 | id="two"''') 406 | assert query_html_doc(html_body, '//div/attribute::id') == expected_ids_result 407 | assert query_html_doc(html_body, '//div/@id') == expected_ids_result 408 | assert query_html_doc(html_body, '//attribute::*') == expected_all_result 409 | assert query_html_doc(html_body, '//@*') == expected_all_result 410 | 411 | 412 | def test_attribute_axis_matching_any_attribute_produces_attributes_from_each_element_in_alphabetical_order(): 413 | html_body = """ 414 | 415 | """ 416 | actual = query_html_doc(html_body, '//span/@*') 417 | assert re.sub(r'\w+="(\d)"\n?', r'\1', actual) == '123456' 418 | 419 | 420 | def test_self_axis_applies_only_to_self(): 421 | html_body = """ 422 |
423 |
424 |
425 |
426 |
""" 427 | assert query_html_doc(html_body, '/html/body/div/div/self::div') == expected_result(""" 428 |
429 |
430 |
431 |
""") 432 | 433 | 434 | def test_css_class_axis_finds_elements_based_on_their_css_classes(): 435 | html_body = """ 436 |

foo

437 |

foo bar

438 |

bar

""" 439 | expected = expected_result(""" 440 |

441 | foo bar 442 |

443 |

444 | bar 445 |

""") 446 | 447 | assert query_html_doc(html_body, '//class::bar') == expected 448 | assert query_html_doc(html_body, '//.::bar') == expected 449 | 450 | 451 | def test_css_class_axis_can_only_be_followed_by_name_test(): 452 | with raises(HquerySyntaxError): 453 | assert query_html_doc('', '/.::node()') 454 | -------------------------------------------------------------------------------- /test/hquery/test_xpath1_unabbreviated_samples.py: -------------------------------------------------------------------------------- 1 | """ 2 | These tests verify results from the samples of unabbreviated location paths in the W3C XPath 1.0 specification (chapter 3 | 2, Location Paths). 4 | 5 | https://www.w3.org/TR/xpath/#location-paths 6 | """ 7 | 8 | import os 9 | import sys 10 | 11 | from hq.soup_util import make_soup 12 | 13 | sys.path.insert(0, os.path.abspath('../..')) 14 | 15 | from ..common_test_util import expected_result 16 | from test.hquery.hquery_test_util import query_html_doc, query_context_node 17 | 18 | 19 | def test_selects_the_para_element_children_of_the_context_node(): 20 | assert query_context_node("", 'child::para') == expected_result(""" 21 | 22 | """) 23 | 24 | 25 | def test_selects_all_text_node_children_of_the_context_node(): 26 | html = "beforeduringafter" 27 | assert query_context_node(html, 'child::text()') == expected_result(""" 28 | before 29 | after""") 30 | 31 | 32 | def test_selects_all_the_children_of_the_context_node_whatever_their_node_type(): 33 | html = """ 34 | selected text 35 | 36 | 37 | """ 38 | assert query_context_node(html, 'child::node()') == expected_result(""" 39 | selected text 40 | 41 | 42 | """) 43 | 44 | 45 | def test_selects_the_name_attribute_of_the_context_node(): 46 | html = "" 47 | assert query_context_node(html, 'attribute::name') == expected_result('name="selected"') 48 | 49 | 50 | def test_selects_all_the_attributes_of_the_context_node(): 51 | html = "" 52 | assert query_context_node(html, 'attribute::*') == 
expected_result(''' 53 | bar="foo" 54 | foo="bar"''') 55 | 56 | 57 | def test_selects_the_para_element_descendants_of_the_context_node(): 58 | html = """ 59 | 60 | 61 | 62 | 63 | """ 64 | assert query_context_node(html, 'descendant::para') == expected_result(""" 65 | 66 | 67 | 68 | 69 | 70 | """) 71 | 72 | 73 | def test_selects_all_div_ancestors_of_the_context_node(): 74 | html = """ 75 |
76 | 77 |
""" 78 | assert query_context_node(make_soup(html).div.notdiv, 'ancestor::div') == expected_result(""" 79 |
80 | 81 | 82 |
""") 83 | 84 | 85 | def test_selects_the_div_ancestors_of_the_context_node_and_if_the_context_node_is_a_div_element_the_context_node_as_well(): 86 | html = """ 87 |
88 |
89 | 90 |
""" 91 | soup = make_soup(html) 92 | assert query_context_node(soup.div.div, 'ancestor-or-self::div') == expected_result(""" 93 |
94 |
95 |
96 | 97 | 98 |
99 |
100 |
""") 101 | assert query_context_node(soup.div.notdiv, 'ancestor-or-self::div') == expected_result(""" 102 |
103 |
104 |
105 | 106 | 107 |
""") 108 | 109 | 110 | def test_selects_the_para_element_descendants_of_the_context_node_and_if_the_context_node_is_a_para_element_the_context_node_as_well(): 111 | context_is_para = """ 112 | 113 | foo 114 | bar 115 | """ 116 | context_is_not_para = """ 117 | 118 | foo 119 | bar 120 | """ 121 | assert query_context_node(context_is_para, 'descendant-or-self::para') == expected_result(""" 122 | 123 | 124 | foo 125 | 126 | 127 | bar 128 | 129 | 130 | 131 | foo 132 | 133 | 134 | bar 135 | """) 136 | assert query_context_node(context_is_not_para, 'descendant-or-self::para') == expected_result(""" 137 | 138 | foo 139 | 140 | 141 | bar 142 | """) 143 | 144 | 145 | def test_selects_the_context_node_if_it_is_a_para_element_and_otherwise_selects_nothing(): 146 | is_para = "" 147 | is_not_para = "" 148 | assert query_context_node(is_para, 'self::para') == expected_result(""" 149 | 150 | """) 151 | assert query_context_node(is_not_para, 'self::para') == '' 152 | 153 | 154 | def test_selects_the_para_element_descendants_of_the_chapter_element_children_of_the_context_node(): 155 | html = """ 156 | 157 | 158 | not selected 159 | 160 | 161 |
162 | selected 163 |
164 |
165 |
""" 166 | assert query_context_node(html, 'child::chapter/descendant::para') == expected_result(""" 167 | 168 | selected 169 | """) 170 | 171 | 172 | def test_selects_all_para_grandchildren_of_the_context_node(): 173 | html = """ 174 | 175 | not selected 176 | 177 | not selected 178 | selected 179 | 180 | 181 | also selected 182 | 183 | """ 184 | assert query_context_node(html, 'child::*/child::para') == expected_result(""" 185 | 186 | selected 187 | 188 | 189 | also selected 190 | """) 191 | 192 | 193 | def test_selects_the_document_root_which_is_always_the_parent_of_the_document_element(): 194 | html = """ 195 | 196 | 197 | """ 198 | assert query_context_node(html, '/') == expected_result(html) 199 | 200 | 201 | def test_selects_all_the_para_elements_in_the_same_document_as_the_context_node(): 202 | html = """ 203 | 204 | 205 | selected 206 | """ 207 | soup = make_soup(html) 208 | assert query_context_node(soup.root.notpara, '/descendant::para') == expected_result(""" 209 | 210 | selected 211 | """) 212 | 213 | 214 | def test_selects_all_the_item_elements_that_have_an_olist_parent_and_that_are_in_the_same_document_as_the_context_node(): 215 | html = """ 216 | 217 | 218 | 219 | not selected 220 | selected 221 | 222 | """ 223 | soup = make_soup(html) 224 | assert query_context_node(soup.root.notolist, '/descendant::olist/child::item') == expected_result(""" 225 | 226 | selected 227 | """) 228 | 229 | 230 | def test_selects_the_first_para_child_of_the_context_node(): 231 | html = """ 232 | 233 | selected 234 | not selected 235 | """ 236 | assert query_context_node(html, 'child::para[position()=1]') == expected_result(""" 237 | 238 | selected 239 | """) 240 | 241 | 242 | def test_selects_the_last_para_child_of_the_context_node(): 243 | html = """ 244 | 245 | not selected 246 | selected 247 | """ 248 | assert query_context_node(html, 'child::para[position()=last()]') == expected_result(""" 249 | 250 | selected 251 | """) 252 | 253 | 254 | def 
test_selects_the_last_but_one_para_child_of_the_context_node(): 255 | html = """ 256 | 257 | not selected 258 | selected 259 | also not selected 260 | """ 261 | assert query_context_node(html, 'child::para[position()=last()-1]') == expected_result(""" 262 | 263 | selected 264 | """) 265 | 266 | 267 | def test_selects_all_the_para_children_of_the_context_node_other_than_the_first_para_child_of_the_context_node(): 268 | html = """ 269 | 270 | not selected 271 | selected 272 | also selected 273 | """ 274 | assert query_context_node(html, 'child::para[position()>1]') == expected_result(""" 275 | 276 | selected 277 | 278 | 279 | also selected 280 | """) 281 | 282 | 283 | def test_selects_the_next_chapter_sibling_of_the_context_node(): 284 | html = """ 285 | 286 | 287 | selected 288 | not selected 289 | """ 290 | soup = make_soup(html) 291 | assert query_context_node(soup.root.context, 'following-sibling::chapter[position()=1]') == expected_result(""" 292 | 293 | selected 294 | """) 295 | 296 | 297 | def test_selects_the_previous_chapter_sibling_of_the_context_node(): 298 | html = """ 299 | 300 | not selected 301 | selected 302 | 303 | """ 304 | soup = make_soup(html) 305 | assert query_context_node(soup.root.context, 'preceding-sibling::chapter[position()=1]') == expected_result(""" 306 | 307 | selected 308 | """) 309 | 310 | 311 | def test_selects_the_forty_second_figure_element_in_the_document(): 312 | html_body = ''.join('
{0}
'.format(n) for n in range(1, 43)) 313 | assert query_html_doc(html_body, '/descendant::figure[position()=42]') == expected_result(""" 314 |
315 | 42 316 |
""") 317 | 318 | 319 | def test_selects_the_second_section_of_the_fifth_chapter_of_the_doc_document_element(): 320 | html_body = """ 321 | 322 | 323 |
chapter 1, section 1
324 |
325 | 326 | 327 | 328 | 329 |
chapter 5, section 1
330 |
chapter 5, section 2
331 |
332 |
""" 333 | actual = query_html_doc(html_body, 334 | '/child::doc/child::chapter[position()=5]/child::section[position()=2]', 335 | wrap_body=False) 336 | assert actual == expected_result(""" 337 |
338 | chapter 5, section 2 339 |
""") 340 | 341 | 342 | def test_selects_all_para_children_of_the_context_node_that_have_a_type_attribute_with_value_warning(): 343 | html = """ 344 | 345 | no type 346 | warning one 347 | error type 348 | warning two 349 | """ 350 | assert query_context_node(html, 'child::para[attribute::type="warning"]') == expected_result(""" 351 | 352 | warning one 353 | 354 | 355 | warning two 356 | """) 357 | 358 | 359 | def test_selects_the_fifth_para_child_of_the_context_node_that_has_a_type_attribute_with_value_warning(): 360 | html = """ 361 | 362 | warning one 363 | error type 364 | warning two 365 | warning three 366 | warning four 367 | warning five 368 | """ 369 | assert query_context_node(html, "child::para[attribute::type='warning'][position()=5]") == expected_result(""" 370 | 371 | warning five 372 | """) 373 | 374 | 375 | def test_selects_the_fifth_para_child_of_the_context_node_if_that_child_has_a_type_attribute_with_value_warning(): 376 | html_with = """ 377 | 378 | para one 379 | para two 380 | para three 381 | para four 382 | para five 383 | """ 384 | html_without = """ 385 | 386 | para one 387 | para two 388 | para three 389 | para four 390 | para five 391 | """ 392 | 393 | actual_with = query_context_node(html_with, 'child::para[position()=5][attribute::type="warning"]') 394 | actual_without = query_context_node(html_without, 'child::para[position()=5][attribute::type="warning"]') 395 | 396 | assert actual_with == expected_result(""" 397 | 398 | para five 399 | """) 400 | assert actual_without == expected_result('') 401 | 402 | 403 | def test_selects_the_chapter_children_of_the_context_node_that_have_one_or_more_title_children_with_string_value_equal_to_Introduction(): 404 | html = """ 405 | 406 | No Title 407 | 408 | Wrong Title 409 | 410 | 411 | Introduction 412 | 413 | """ 414 | assert query_context_node(html, "child::chapter[child::title='Introduction']") == expected_result(""" 415 | 416 | 417 | Introduction 418 | 419 | """) 420 | 421 | 422 | def 
test_selects_the_chapter_children_of_the_context_node_that_have_one_or_more_title_children(): 423 | html = """ 424 | 425 | not selected 426 | 427 | """ 428 | assert query_context_node(html, 'child::chapter[child::title]') == expected_result(""" 429 | 430 | 431 | 432 | """) 433 | 434 | 435 | def test_selects_the_chapter_and_appendix_children_of_the_context_node(): 436 | html = """ 437 | 438 | not selected 439 | 440 | 441 | """ 442 | assert query_context_node(html, 'child::*[self::chapter or self::appendix]') == expected_result(""" 443 | 444 | 445 | 446 | """) 447 | 448 | 449 | def test_selects_the_last_chapter_or_appendix_child_of_the_context_node(): 450 | html_with_last_chapter = """ 451 | 452 | 453 | 454 | selected 455 | """ 456 | html_with_last_appendix = """ 457 | 458 | 459 | 460 | selected 461 | """ 462 | actual_last_chapter = query_context_node(html_with_last_chapter, 463 | 'child::*[self::chapter or self::appendix][position()=last()]') 464 | actual_last_appendix = query_context_node(html_with_last_appendix, 465 | 'child::*[self::chapter or self::appendix][position()=last()]') 466 | assert actual_last_chapter == expected_result(""" 467 | 468 | selected 469 | """) 470 | assert actual_last_appendix == expected_result(""" 471 | 472 | selected 473 | """) 474 | -------------------------------------------------------------------------------- /hq/hquery/tokens.py: -------------------------------------------------------------------------------- 1 | from hq.hquery.computed_constructors.hash_key_value import ComputedHashKeyValueConstructor 2 | from hq.hquery.equality_operators import equals, not_equals 3 | from hq.hquery.flwor import Flwor 4 | from hq.hquery.function_support import FunctionSupport 5 | from hq.hquery.functions.core_boolean import boolean 6 | from hq.hquery.functions.core_number import number 7 | from hq.hquery.node_test import NodeTest 8 | from hq.hquery.object_type import object_type_name, debug_dump_anything 9 | from hq.hquery.sequences import 
class LBP:
    """Left-binding precedence values, listed from loosest to tightest binding."""
    (
        nothing, sequence, union_decomp, union, range, abbrev_flwor, or_op,
        and_op, equality_op, relational_op, add_or_subtract, mult_or_div,
        prefix_op, function_call, location_step, node_test, parenthesized_expr
    ) = range(17)


# Sanity check that the tuple unpacking above produced consecutive integers.
assert LBP.sequence == LBP.nothing + 1


class Token(object):
    """Base class for parser tokens.

    Subclasses declare a class-level ``lbp`` (left-binding precedence) and
    implement ``nud`` (prefix position) and/or ``led`` (infix position). Both
    return a zero-argument callable that yields the value when invoked.
    """

    def __init__(self, parse_interface, value=None, **kwargs):
        """Remember the parser facade and the raw token value.

        Extra keyword arguments are accepted and ignored.
        """
        self.parse_interface = parse_interface
        self.value = value

    def _evaluate_binary_operands(self,
                                  left_generator,
                                  right_generator,
                                  constructor=lambda v: v,
                                  type_name='xpath object'):
        """Evaluate both operand generators and pass each result through ``constructor``.

        Returns a ``(left_value, right_value)`` tuple. A ``TypeError`` raised
        anywhere during evaluation or conversion is reported as an
        ``HquerySyntaxError`` that names ``type_name``.
        """
        try:
            self._gab('operator evaluation...', indent_after=True)
            self._gab('evaluating left-hand side.', indent_after=True)
            left_value = constructor(left_generator())
            self._gab('evaluating right-hand side.', outdent_before=True, indent_after=True)
            right_value = constructor(right_generator())
            self._gab('operand evaluation complete', outdent_before=True)
            self._gab('evaluating expression {0} {1} {2}'.format(left_value, self, right_value), outdent_before=True)
            return left_value, right_value
        except TypeError:
            raise HquerySyntaxError('evaluated against a non-{0} operand'.format(type_name))

    def _evaluate_unary_operand(self, operand_generator, constructor=lambda v: v, type_name='xpath object'):
        """Evaluate a single operand generator and pass the result through ``constructor``.

        A ``TypeError`` is reported as an ``HquerySyntaxError`` that names
        ``type_name``.
        """
        try:
            self._gab('evaluating operand.', indent_after=True)
            operand_value = constructor(operand_generator())
            self._gab('operand evaluation complete', outdent_before=True)
            return operand_value
        except TypeError:
            raise HquerySyntaxError('evaluated against a non-{0} operand'.format(type_name))

    def _gab(self, msg, **kwargs):
        """Emit a verbose-mode message prefixed with this token's string form.

        ``msg`` may be a string or a zero-argument callable producing one.
        Keyword arguments are forwarded to ``verbose_print``.
        """
        # Some callers pass a lambda; resolve it so the text is logged rather
        # than the function's repr.
        if callable(msg):
            msg = msg()
        verbose_print(u'{0} {1}'.format(self, msg), **kwargs)


class AbbreviatedFlworOperatorToken(Token):
    """Infix operator that builds a FLWOR from its left and right expressions."""
    lbp = LBP.abbrev_flwor

    def __str__(self):
        return '(abbreviated-FLWOR-operator)'

    def led(self, left):
        """Iterate over ``left``, binding each item to ``_``, and return ``right`` for each."""
        right = self.parse_interface.expression(LBP.sequence)

        flwor = Flwor()
        flwor.set_iteration_expression('_', left)
        flwor.set_return_expression(right)
        return flwor.evaluate


class AddOrSubtractOperatorToken(Token):
    """Binary ``+`` / ``-`` operator; ``-`` is also accepted as a prefix negation."""
    lbp = LBP.add_or_subtract

    def __str__(self):
        return '(plus)' if self.value == '+' else '(minus)'

    def led(self, left):
        """Return a callable that adds or subtracts the two numeric operands."""
        right = self.parse_interface.expression(self.lbp)

        def evaluate():
            left_value, right_value = self._evaluate_binary_operands(left, right, constructor=number, type_name='number')
            result = left_value + right_value if self.value == '+' else left_value - right_value
            self._gab('returning {0}'.format(result))
            return result

        return evaluate

    def nud(self):
        """Return a callable that negates the operand.

        Raises ``HquerySyntaxError`` when the token is ``+``, which is not
        accepted in prefix position.
        """
        if self.value != '-':
            # The placeholder was previously left unformatted in the message.
            raise HquerySyntaxError('unexpected {0} at beginning of an expression'.format(self))

        right = self.parse_interface.expression(LBP.prefix_op)

        def evaluate():
            right_value = self._evaluate_unary_operand(right, constructor=number, type_name='number')
            result = -right_value
            self._gab('returning {0}'.format(result))
            return result

        return evaluate
class AndOperator(Token):
    """Binary boolean ``and`` operator."""
    # ``and`` must bind tighter than ``or``. Sharing ``or``'s precedence made
    # ``a or b and c`` parse as ``(a or b) and c``.
    lbp = LBP.and_op

    def __str__(self):
        return '(operator "and")'

    def led(self, left):
        """Return a callable yielding the conjunction of both operands.

        Both operands are always evaluated; there is no short-circuiting.
        """
        right = self.parse_interface.expression(self.lbp)

        def evaluate():
            left_value, right_value = self._evaluate_binary_operands(left,
                                                                     right,
                                                                     constructor=boolean,
                                                                     type_name='boolean')
            result = bool(left_value) and bool(right_value)
            self._gab('returning {0}'.format(result))
            return result

        return evaluate


class AssignmentOperatorToken(Token):
    """Assignment operator; a marker with no binding power of its own."""
    lbp = LBP.nothing

    def __str__(self):
        return '(assignment-operator)'


class AxisToken(Token):
    """An explicit axis specifier that begins a location path."""
    lbp = LBP.nothing

    def __init__(self, parse_interface, value, **kwargs):
        """Store the canonical axis name as ``value`` and the matching ``Axis`` member as ``axis``."""
        super(AxisToken, self).__init__(parse_interface, Axis.canonicalize(value), **kwargs)
        self.axis = Axis[self.value]

    def __str__(self):
        return '(axis "{0}")'.format(self.value)

    def nud(self):
        return self.parse_interface.location_path(self).evaluate


class CloseCurlyBraceToken(Token):
    """Closing curly brace; a delimiter with no binding power."""
    lbp = LBP.nothing

    def __str__(self):
        return '(close-curly-brace)'


class CloseParenthesisToken(Token):
    """Closing parenthesis; a delimiter with no binding power."""
    lbp = LBP.nothing

    def __str__(self):
        return '(close-parenthesis)'


class CloseSquareBraceToken(Token):
    """Closing square bracket; a delimiter with no binding power."""
    lbp = LBP.nothing

    def __str__(self):
        return '(right-brace)'


class CommaToken(Token):
    """Comma operator that concatenates its operands into one sequence."""
    lbp = LBP.sequence

    def __str__(self):
        return '(comma)'

    def led(self, left):
        """Return a callable yielding ``sequence_concat`` of the two operand values."""
        right = self.parse_interface.expression(self.lbp)

        def evaluate():
            left_value, right_value = self._evaluate_binary_operands(left, right)
            return sequence_concat(left_value, right_value)

        return evaluate
class ComputedConstructorFiltersToken(Token):
    """Filter clause attached to a computed constructor."""
    lbp = LBP.nothing

    def __init__(self, parse_interface, value, **kwargs):
        # Drop the first and last characters (the delimiters) of the raw token text.
        super(ComputedConstructorFiltersToken, self).__init__(parse_interface, value[1:-1], **kwargs)

    def __str__(self):
        return '(computed-constructor-filters "{0}")'.format(self.value)


class ConstructorReservedWordToken(Token):
    """Reserved word that introduces a computed constructor."""
    lbp = LBP.nothing

    def __str__(self):
        return '(constructor-keyword "{0}")'.format(self.value)

    def nud(self):
        return self.parse_interface.computed_constructor(self).evaluate


class ContextNodeToken(Token):
    """Context-node token that begins a location path."""
    lbp = LBP.node_test

    def __str__(self):
        return '(context-node)'

    def nud(self):
        return self.parse_interface.location_path(self).evaluate


class DivOrModOperatorToken(Token):
    """Binary ``div`` / ``mod`` operator over numeric operands."""
    lbp = LBP.mult_or_div

    def __str__(self):
        return '(operator "{0}")'.format(self.value)

    def led(self, left):
        """Return a callable that divides for ``div`` and takes the remainder otherwise."""
        right = self.parse_interface.expression(self.lbp)

        def evaluate():
            left_value, right_value = self._evaluate_binary_operands(left, right, constructor=number, type_name='number')
            result = left_value / right_value if self.value == 'div' else left_value % right_value
            # _gab already prefixes the token, so do not repeat it in the message.
            self._gab('returning {0}'.format(result))
            return result

        return evaluate


class DoubleSlashToken(Token):
    """Double slash, usable at the start of a path or between two steps."""
    lbp = LBP.location_step
    evaluating_message = 'evaluating remainder of path for node "{0}" and all of its descendants.'

    def __str__(self):
        return '(double-slash)'

    def led(self, left):
        """Continue a location path rooted at the ``left`` expression."""
        return self.parse_interface.location_path(self, root_expression=left).evaluate

    def nud(self):
        return self.parse_interface.location_path(self).evaluate


class EndToken(Token):
    """End-of-input marker; it never binds."""
    lbp = LBP.nothing


class EqualityOperatorToken(Token):
    """Binary ``=`` / ``!=`` operator."""
    lbp = LBP.equality_op

    def __str__(self):
        return '(equality-operator "{0}")'.format(self.value)

    def led(self, left):
        """Return a callable applying ``equals`` for ``=`` and ``not_equals`` otherwise."""
        right = self.parse_interface.expression(self.lbp)

        def evaluate():
            left_value, right_value = self._evaluate_binary_operands(left, right)
            result = equals(left_value, right_value) if self.value == '=' else not_equals(left_value, right_value)
            self._gab('returning {0}'.format(result))
            return result

        return evaluate


class FlworReservedWordToken(Token):
    """FLWOR reserved word; the value is stored lower-cased."""
    lbp = LBP.nothing

    def __init__(self, parse_interface, value, **kwargs):
        super(FlworReservedWordToken, self).__init__(parse_interface, value.lower(), **kwargs)

    def __str__(self):
        return '({0})'.format(self.value)

    def nud(self):
        return self.parse_interface.flwor(self).evaluate


class FunctionCallToken(Token):
    """A function name; ``nud`` parses the argument list up to the closing parenthesis."""
    lbp = LBP.function_call

    def __str__(self):
        return '(function call "{0}")'.format(self.value)

    def nud(self):
        """Parse the arguments and return a callable that invokes the function."""
        arg_generators = []

        # Each argument is parsed at sequence precedence so a comma ends it
        # instead of being consumed as the sequence operator.
        while not isinstance(self.parse_interface.peek(), CloseParenthesisToken):
            arg_generators.append(self.parse_interface.expression(LBP.sequence))
            self.parse_interface.advance_if(CommaToken)

        self.parse_interface.advance(CloseParenthesisToken)

        def evaluate():
            self._gab('evaluating argument list for function "{0}."'.format(self.value))
            arguments = [gen() for gen in arg_generators]
            arg_types = ','.join(object_type_name(arg) for arg in arguments)
            self._gab('calling {0}({1}).'.format(self.value, arg_types))
            return function_support.call_function(self.value, *arguments)

        return evaluate
class HashKeyToken(Token):
    """Key of a key/value pair inside a computed hash constructor."""
    # Was misspelled ``lpb``, which left the class without any ``lbp`` attribute.
    lbp = LBP.nothing

    def __str__(self):
        return '(hash-key "{0}")'.format(self.value)

    def nud(self):
        """Parse the expression after the key and return the key/value evaluator."""
        constructor = ComputedHashKeyValueConstructor(self.value)
        constructor.set_value(self.parse_interface.expression(LBP.sequence))
        return constructor.evaluate


class IfElseToken(Token):
    """The ``if`` reserved word; parsing is delegated to the parse interface."""
    lbp = LBP.nothing

    def __str__(self):
        return '(if-reserved-word)'

    def nud(self):
        return self.parse_interface.if_then_else()


class InterpolatedStringToken(Token):
    """Interpolated string literal."""
    lbp = LBP.nothing

    def __init__(self, parse_interface, value, **kwargs):
        # Drop the first and last characters (the delimiters) of the raw token text.
        super(InterpolatedStringToken, self).__init__(parse_interface, value[1:-1], **kwargs)

    def __str__(self):
        return u'(interpolated-string `{0}`)'.format(self.value)

    def nud(self):
        return parse_interpolated_string(self.value, self.parse_interface)


class LiteralNumberToken(Token):
    """Numeric literal."""
    lbp = LBP.nothing

    def __str__(self):
        return '(literal-number {0})'.format(self.value)

    def nud(self):
        # Conversion through ``number`` is deferred until evaluation.
        return lambda: number(self.value)


class LiteralStringToken(Token):
    """Quoted string literal; quotes are stripped and HTML entities decoded on construction."""
    lbp = LBP.nothing

    def __init__(self, parse_interface, value, **kwargs):
        super(LiteralStringToken, self).__init__(parse_interface, html_entity_decode(value[1:-1]), **kwargs)

    def __str__(self):
        return u'(literal-string "{0}")'.format(self.value)

    def nud(self):
        return lambda: self.value


class MultiplyOperatorToken(Token):
    """Binary multiplication operator over numeric operands."""
    lbp = LBP.mult_or_div

    def __str__(self):
        return '(times)'

    def led(self, left):
        """Return a callable yielding the product of the two operands."""
        right = self.parse_interface.expression(self.lbp)

        def evaluate():
            left_value, right_value = self._evaluate_binary_operands(left, right, constructor=number, type_name='number')
            result = left_value * right_value
            # _gab already prefixes the token, so do not repeat it in the message.
            self._gab('returning {0}'.format(result))
            return result

        return evaluate


class NameTestToken(Token):
    """Name test that begins a location path."""
    lbp = LBP.node_test

    def __init__(self, *args, **kwargs):
        super(NameTestToken, self).__init__(*args, **kwargs)
        self.node_test = NodeTest(self.value, name_test=True)

    def __str__(self):
        return '(name-test "{0}")'.format(self.value)

    def nud(self):
        return self.parse_interface.location_path(self).evaluate


class NodeTestToken(Token):
    """Node test that begins a location path."""
    lbp = LBP.node_test

    def __init__(self, *args, **kwargs):
        super(NodeTestToken, self).__init__(*args, **kwargs)
        self.node_test = NodeTest(self.value)

    def __str__(self):
        return '(node-test "{0}")'.format(self._dump_value())

    def nud(self):
        return self.parse_interface.location_path(self).evaluate

    def _dump_value(self):
        """Render the test for display: ``*`` as is, anything else with ``()`` appended."""
        suffix = '()' if self.value != '*' else ''
        return '{0}{1}'.format(self.value, suffix)


class OpenCurlyBraceToken(Token):
    """Opening curly brace; a delimiter with no binding power."""
    lbp = LBP.nothing

    def __str__(self):
        return '(open-curly-brace)'


class OpenParenthesisToken(Token):
    """Opening parenthesis that begins a parenthesized expression."""
    lbp = LBP.parenthesized_expr

    def __str__(self):
        return '(open-parenthesis)'

    def nud(self):
        """Parse the enclosed expression and consume the closing parenthesis."""
        expr = self.parse_interface.expression(LBP.nothing)
        self.parse_interface.advance(CloseParenthesisToken)
        return expr


class OpenSquareBraceToken(Token):
    """Opening square bracket following an expression."""
    lbp = LBP.location_step

    def __str__(self):
        return '(left-brace)'

    def led(self, left):
        """Continue as a location path rooted at the ``left`` expression."""
        path = self.parse_interface.location_path(self, root_expression=left)
        return path.evaluate
class OrOperatorToken(Token):
    """Binary boolean ``or`` operator."""
    lbp = LBP.or_op

    def __str__(self):
        return '(operator "or")'

    def led(self, left):
        """Return a callable yielding the disjunction of both operands.

        Both operands are always evaluated; there is no short-circuiting.
        """
        right = self.parse_interface.expression(self.lbp)

        def evaluate():
            left_value, right_value = self._evaluate_binary_operands(left,
                                                                     right,
                                                                     constructor=boolean,
                                                                     type_name='boolean')
            result = bool(left_value) or bool(right_value)
            self._gab('returning {0}'.format(result))
            return result

        return evaluate


class ParentNodeToken(Token):
    """Parent-node token that begins a location path."""
    lbp = LBP.node_test

    def __str__(self):
        return '(parent-node)'

    def nud(self):
        return self.parse_interface.location_path(self).evaluate


class RangeOperatorToken(Token):
    """Range operator producing the numbers from the left to the right operand, inclusive."""
    lbp = LBP.range

    def __str__(self):
        return '(range-operator)'

    def led(self, left):
        """Return a callable yielding a list of ``number`` values."""
        right = self.parse_interface.expression(self.lbp)

        def evaluate():
            left_value, right_value = self._evaluate_binary_operands(left,
                                                                     right,
                                                                     constructor=number,
                                                                     type_name='number')
            # The + 1 makes the upper bound inclusive.
            return [number(x) for x in range(int(left_value), int(right_value + 1))]

        return evaluate


class RelationalOperatorToken(Token):
    """Relational comparison operator; evaluation is delegated to ``RelationalOperator``."""
    lbp = LBP.relational_op

    def __str__(self):
        return '(operator {0})'.format(RelationalOperator(self.value).name)

    def led(self, left):
        """Return a callable yielding the result of the comparison."""
        right = self.parse_interface.expression(self.lbp)

        def evaluate():
            left_value, right_value = self._evaluate_binary_operands(left, right)
            result = RelationalOperator(self.value).evaluate(left_value, right_value)
            self._gab('returning {0}'.format(result))
            return result

        return evaluate


class SlashToken(Token):
    """Single slash: a step separator, the start of an absolute path, or the root alone."""
    lbp = LBP.location_step

    def __str__(self):
        return '(slash)'

    def led(self, left):
        """Continue a location path rooted at the ``left`` expression."""
        path = self.parse_interface.location_path(self, root_expression=left)
        return path.evaluate

    def nud(self):
        """Start an absolute path, or select the document root when nothing path-like follows."""
        next_token = self.parse_interface.peek()
        absolute_path_followup_tokens = (AxisToken, ContextNodeToken, NameTestToken, NodeTestToken, ParentNodeToken)

        if isinstance(next_token, absolute_path_followup_tokens):
            path = self.parse_interface.location_path(self)
            return path.evaluate

        # A bare slash yields the soup that owns the context node.
        return lambda: make_node_set(soup_from_any_tag(get_context_node()))


class UnionDecompositionToken(Token):
    """Operator mapping the clauses of a union (on the left) to per-clause expressions."""
    lbp = LBP.union_decomp

    def __str__(self):
        return '(union decomposition)'

    def led(self, left):
        """Collect the mapping expressions, with or without surrounding parentheses."""
        decomp = UnionDecomposition()
        mapping_generators = []

        if self.parse_interface.advance_if(OpenParenthesisToken) is None:
            # Unparenthesized form: expressions separated by union operators.
            while True:
                mapping_generators.append(self.parse_interface.expression(LBP.union))
                if self.parse_interface.advance_if(UnionOperatorToken) is None:
                    break
        else:
            # Parenthesized form: read until the closing parenthesis.
            while not isinstance(self.parse_interface.peek(), CloseParenthesisToken):
                mapping_generators.append(self.parse_interface.expression(LBP.union))
                self.parse_interface.advance_if(UnionOperatorToken)
            self.parse_interface.advance(CloseParenthesisToken)

        decomp.set_union_expression(left)
        decomp.set_mapping_generators(mapping_generators)
        return decomp.evaluate


class UnionOperatorToken(Token):
    """Union operator; tags every item with the index of the clause that produced it."""
    lbp = LBP.union

    def __str__(self):
        return '(union operator)'

    def led(self, left):
        """Return a callable yielding the node set for ``left | right``.

        The callable carries a ``union_index`` attribute, so a further union
        chained on its right can number its own right-hand clause.
        """
        left_union_index = getattr(left, 'union_index', 0)
        right_union_index = left_union_index + 1

        right = self.parse_interface.expression(self.lbp)

        def _tag_untagged(items, index):
            # Items already holding an integer index came from an earlier clause.
            for item in items:
                if not isinstance(getattr(item, 'union_index', None), int):
                    setattr(item, 'union_index', index)

        def evaluate():
            left_value, right_value = self._evaluate_binary_operands(left, right, type_name='node set')
            _tag_untagged(right_value, right_union_index)
            if left_union_index == 0:
                _tag_untagged(left_value, left_union_index)
            # Build a new list rather than extending left_value in place: the
            # operand may be owned elsewhere (e.g. a variable's stored value).
            combined = list(left_value)
            combined.extend(right_value)
            result = make_node_set(combined)
            self._gab('returning node set with {0} nodes'.format(len(result)))
            return result

        setattr(evaluate, 'union_index', right_union_index)

        return evaluate


class VariableToken(Token):
    """Variable reference; evaluates to the variable's current value."""
    lbp = LBP.nothing

    def __str__(self):
        return '(variable ${0})'.format(self.value)

    def nud(self):
        """Return a callable that looks the variable up at evaluation time."""

        def evaluate():
            result = value_of_variable(self.value)
            # NOTE(review): the message is passed as a callable; confirm that
            # _gab / verbose_print resolve callables before formatting.
            self._gab(lambda: u'reference evaluating to value {0}'.format(debug_dump_anything(result)))
            return result

        return evaluate