├── .gitattributes ├── .gitignore ├── .travis.yml ├── COPYING.txt ├── MANIFEST.in ├── README.md ├── requirements.txt ├── setup.py ├── tests └── unit │ └── test_xpyth.py ├── tox.ini └── xpyth └── __init__.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .cache 40 | nosetests.xml 41 | coverage.xml 42 | 43 | # Translations 44 | *.mo 45 | *.pot 46 | 47 | # Django stuff: 48 | *.log 49 | 50 | # Sphinx documentation 51 | docs/_build/ 52 | 53 | # PyBuilder 54 | target/ 55 | 56 | # ========================= 57 | # Operating System Files 58 | # ========================= 59 | 60 | # OSX 61 | # ========================= 62 | 63 | .DS_Store 64 | .AppleDouble 65 | .LSOverride 66 | 67 | # Thumbnails 68 | ._* 69 | 70 | # Files that might appear on external disk 71 | .Spotlight-V100 72 | .Trashes 73 | 74 | # Directories potentially created on remote AFP share 75 | .AppleDB 76 | .AppleDesktop 77 | Network Trash Folder 78 | Temporary Items 79 | .apdisk 80 | 81 | # Windows 82 | # ========================= 83 | 84 | # Windows image file caches 85 | Thumbs.db 86 | ehthumbs.db 87 | 88 | # Folder config file 89 | Desktop.ini 90 | 91 | # Recycle Bin used on file shares 92 | $RECYCLE.BIN/ 93 | 94 | # Windows Installer files 95 | *.cab 96 | *.msi 97 | *.msm 98 | *.msp 99 | 100 | # Windows shortcuts 101 | *.lnk 102 | 103 | xpyth_env/ 104 | xpyth.pyproj 105 | *.sln 106 | xpyth.v12.suo 107 | .idea/ 108 | .pytest_cache/ -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false 2 | language: python 3 | python: 4 | - "2.7" 5 | - "3.6" 6 | install: pip install tox-travis 7 | script: tox 8 | -------------------------------------------------------------------------------- /COPYING.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Hodgdon Chase Stevens 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include COPYING.txt -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # xpyth 2 | 3 | [![Build Status](https://travis-ci.org/hchasestevens/xpyth.svg?branch=master)](https://travis-ci.org/hchasestevens/xpyth) 4 | [![PyPI version](https://badge.fury.io/py/xpyth.svg)](https://badge.fury.io/py/xpyth) 5 | ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/xpyth.svg) 6 | 7 | A module for querying the DOM tree and writing XPath expressions using native Python syntax. 8 | 9 | Example usage 10 | ------------- 11 | ```python 12 | >>> from xpyth import xpath, DOM, X 13 | 14 | >>> xpath(X for X in DOM if X.name == 'main') 15 | "//*[@name='main']" 16 | 17 | >>> xpath(span for div in DOM for span in div if div.id == 'main') 18 | "//div[@id='main']//span" 19 | 20 | >>> xpath(a for a in DOM if '.com' not in a.href) 21 | "//a[not(contains(@href, '.com'))]" 22 | 23 | >>> xpath(a.href for a in DOM if any(p for p in a.ancestors if p.id)) 24 | "//a[./ancestor::p[@id]]/@href" 25 | 26 | >>> xpath(X.data-bind for X in DOM if X.data-bind == '1') 27 | "//*[@data-bind='1']/@data-bind" 28 | 29 | >>> xpath( 30 | ... form.action 31 | ... for form in DOM 32 | ... if all( 33 | ... input 34 | ... for input in form.children 35 | ... if input.value == 'a' 36 | ... ) 37 | ... ) 38 | "//form[not(./input[not(@value='a')])]/@action" 39 | 40 | >>> allowed_ids = list('abc') 41 | >>> xpath(X for X in DOM if X.id in allowed_ids) 42 | "//*[@id='a' or @id='b' or @id='c']" 43 | ``` 44 | 45 | Motivation 46 | ---------- 47 | 48 | XPath is the de facto standard in querying XML and HTML documents. In Python (and most other languages), XPath expressions are represented as strings; this not only constitutes a potential security threat, but also means that developers are denied standard text-editor and IDE features such as syntax highlighting and autocomplete when writing XPaths. Furthermore, having to become familiar with XPath (or CSS selectors) presents a barrier to entry for developers who want to interact with the web. 49 | 50 | [Great inroads](https://msdn.microsoft.com/en-us/library/bb397933.aspx) have been made in various programming languages in allowing the use of native list-comprehension-like syntax to generate SQL queries. __xpyth__ piggybacks off one such effort, [Pony](http://ponyorm.com/), to extend this functionality to XPath. __Now anyone familiar with Python comprehension syntax can query XML/HTML documents quickly and easily__. Moreover, __xpyth__ integrates with the popular [lxml](http://lxml.de/) library to enable developers to go beyond the querying capabilities of XPath (when necessary). 51 | 52 | Installation 53 | ------------ 54 | 55 | ```bash 56 | pip install xpyth 57 | ``` 58 | 59 | 60 | Use with lxml 61 | ------------- 62 | 63 | __xpyth__ supports querying lxml ```ElementTree```s using the ```query``` function. For example, given a document 64 | ```html 65 | 66 |
67 | Google 68 | Not Google 69 |

Lorem ipsum

70 |

no numbers here

71 |

123

72 |
73 |
74 | Google Charity 75 | Broken link! 76 |
77 | 78 | ``` 79 | accessible as the ```ElementTree``` ```tree```, the following can be executed: 80 | ```python 81 | >>> len(query(a for a in tree)) 82 | 4 83 | >>> query(a for a in tree if 'Not Google' not in a.text)[0].attrib.get('href') 84 | "http://www.google.com" 85 | >>> next( 86 | ... node 87 | ... for node in 88 | ... query( 89 | ... p 90 | ... for p in 91 | ... tree 92 | ... if p.id 93 | ... ) 94 | ... if re.match(r'\D+', node.attrib.get('id')) 95 | ... ).text 96 | "123" 97 | ``` 98 | 99 | Known Issues 100 | ------------ 101 | 102 | * HTML tag names that contain special characters (dashes) cannot be selected, as they violate Python's generator comprehension syntax. HTML attributes containing dashes, e.g. ``data-bind``, work normally. 103 | * The use of ```all``` is quite buggy, e.g. the following return incorrect expressions: 104 | 105 | ```python 106 | >>> xpath(X for X in DOM if all(p.id in ('a', 'b') for p in X)) 107 | "//*[not(.//p/@id='a' or //p/@id='b')]" # expected "//*[not(.//p[./@id!='a' and ./@id!='b'])]" 108 | >>> xpath(X for X in DOM if all('x' in p.id for p in X)) 109 | "//*[not(.contains(@id, //p))]" # expected "//*[not(.//p[not(contains(@id, 'x'))])]" 110 | ``` 111 | 112 | Contacts 113 | -------- 114 | 115 | * Name: [H. Chase Stevens](http://www.chasestevens.com) 116 | * Twitter: [@hchasestevens](https://twitter.com/hchasestevens) 117 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | lxml==3.4.2 2 | pony==0.6.1 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name='xpyth', 5 | packages=['xpyth'], 6 | version='0.2.0', 7 | description='Generate XPath expressions from Python comprehensions', 8 | license='MIT', 9 | author='H. Chase Stevens', 10 | author_email='chase@chasestevens.com', 11 | url='https://github.com/hchasestevens/xpyth', 12 | install_requires=[ 13 | 'lxml>=4.1.1', 14 | 'pony>=0.7.3', 15 | ], 16 | tests_require=['pytest>=3.1.2'], 17 | extras_require={'dev': ['pytest>=3.1.2']}, 18 | keywords='xpath xml html', 19 | classifiers=[ 20 | 'Development Status :: 2 - Pre-Alpha', 21 | 'Programming Language :: Python', 22 | 'Programming Language :: Python :: 2', 23 | 'Programming Language :: Python :: 2.7', 24 | 'Programming Language :: Python :: 3', 25 | 'Programming Language :: Python :: 3.6', 26 | 'Intended Audience :: Developers', 27 | 'Operating System :: OS Independent', 28 | 'License :: OSI Approved :: MIT License', 29 | 'Natural Language :: English', 30 | 'Topic :: Internet :: WWW/HTTP', 31 | 'Topic :: Software Development :: Code Generators', 32 | ] 33 | ) -------------------------------------------------------------------------------- /tests/unit/test_xpyth.py: -------------------------------------------------------------------------------- 1 | """Unit tests for xpyth.""" 2 | 3 | import re 4 | 5 | import pytest 6 | 7 | from lxml import etree 8 | from pony.orm.decompiling import Decompiler 9 | 10 | from xpyth import xpath, DOM, X, query 11 | 12 | 13 | def test_iter_insertion(): 14 | """Ensure custom node inserted as comprehension iterator.""" 15 | assert (div for div in DOM).gi_frame.f_locals['.0'] is DOM 16 | 17 | 18 | @pytest.mark.parametrize('comprehension,expected_expression', ( 19 | ((div for div in DOM), '//div'), 20 | ((span for div in DOM for span in div), '//div//span'), 21 | ((span.cls for div in DOM for span in div), '//div//span/@class'), 22 | ((span.text for span in DOM), '//span/text()'), 23 | ((span for span in DOM if span.name == 'main'), "//span[@name='main']"), 24 | ((div for span in DOM if span.name == 'main' for div in span), "//span[@name='main']//div"), 25 | ((div for span in DOM for div in span if span.name == 'main'), "//span[@name='main']//div"), 26 | ((div for span in DOM if span.name == 'main' for div in span if div.cls == 'row'), "//span[@name='main']//div[@class='row']"), 27 | ((div for span in DOM for div in span if div.cls == 'row' and span.name == 'main'), "//span[@name='main']//div[@class='row']"), # tricky case - need to dissect And 28 | ((a for a in DOM if a.href == 'http://www.google.com' and a.name == 'goog'), "//a[@href='http://www.google.com' and @name='goog']"), 29 | ((a for a in DOM if '.com' in a.href), "//a[contains(@href, '.com')]"), 30 | ((a for a in DOM if '.com' not in a.href), "//a[not(contains(@href, '.com'))]"), 31 | ((a for a in DOM if not '.com' in a.href), "//a[not(contains(@href, '.com'))]"), 32 | ((div for div in DOM if div.id != 'main'), "//div[@id!='main']"), 33 | ((div for div in DOM if not div.id == 'main'), "//div[not(@id='main')]"), 34 | ((X for X in DOM if X.name == 'main'), "//*[@name='main']"), 35 | ((span for div in DOM for X in div.following_siblings for span in X.children), '//div/following-sibling::*/span'), 36 | ((a.href for a in DOM if any(p for p in a.following_siblings)), '//a[./following-sibling::p]/@href'), 37 | ((a.href for a in DOM if any(p for p in a.following_siblings if p.id)), '//a[./following-sibling::p[@id]]/@href'), 38 | ((X for X in DOM if any(p for p in DOM)), '//*[//p]'), 39 | ((span for div in DOM for span in div if div.id in ('main', 'other')), "//div[@id='main' or @id='other']//span"), 40 | ((X for X in DOM if X.name in ('a', 'b', 'c')), "//*[@name='a' or @name='b' or @name='c']"), 41 | ((X for X in DOM if all(p for p in X if p.id == 'a')), "//*[not(.//p[not(@id='a')])]"), 42 | ((X for X in DOM if all(p for p in DOM if p.id == 'a')), "//*[not(//p[not(@id='a')])]"), 43 | ((X for X in DOM if any(p.id == 'a' for p in X)), "//*[.//p/@id='a']"), 44 | ((X for X in DOM if all(not p.id == 'a' for p in X)), "//*[not(.//p/@id!='a')]"), 45 | ((X for X in DOM if all(not p.id != 'a' for p in X)), "//*[not(.//p/@id='a')]"), 46 | ((X for X in DOM if len(td for td in X.following_siblings) == 0), "//*[count(./following-sibling::td)=0]"), 47 | ((td.text for td in DOM if td.cls == 'wideonly' and len(td for td in td.following_siblings) == 0), "//td[@class='wideonly' and count(./following-sibling::td)=0]/text()"), 48 | ((X for X in DOM if X.data-bind == 'a'), "//*[@data-bind='a']"), 49 | ((X.data-bind for X in DOM), "//*/@data-bind"), 50 | 51 | pytest.mark.skip(((form.action for form in DOM if all(input.name == 'a' for input in form.children)), "//form[not(./input/@name!='a')]/@action")), 52 | pytest.mark.skip(((X for X in DOM if all(p.id in ('a', 'b') for p in X)), "//*[not(.//p[./@id!='a' and ./@id!='b'])]")), 53 | pytest.mark.skip(((X for X in DOM if all('x' in p.id for p in X)), "//*[not(.//p[not(contains(@id, 'x'))])]")), # Gives //*[not(.contains(@id, //p))] 54 | 55 | # TODO: position (e.g. xpath(a for a in (a for a in DOM)[:20]) ???) 56 | # TODO: position (e.g. xpath(a for X in DOM for a in X[20:]) ???) 57 | )) 58 | def test_expression_generation(comprehension, expected_expression): 59 | """Ensure comprehensions are transformed into expected XPath expressions.""" 60 | try: 61 | expr = xpath(comprehension) 62 | assert expr == expected_expression 63 | except AssertionError: 64 | ast = Decompiler(comprehension.gi_code).ast 65 | print(ast) 66 | print() 67 | raise 68 | 69 | 70 | def test_context(): 71 | """Ensure local context is handled correct when constructing expression.""" 72 | allowed_values = 'a b c'.split() 73 | comprehension = (X for X in DOM if X.name in allowed_values) 74 | expected_expression = "//*[@name='a' or @name='b' or @name='c']" 75 | assert xpath(comprehension) == expected_expression 76 | 77 | 78 | def test_lxml(): 79 | """Ensure lxml compatibility.""" 80 | tree = etree.fromstring(''' 81 | 82 |
83 | Google 84 | Not Google 85 |

Lorem ipsum

86 |

no numbers here

87 |

123

88 |
89 |
90 | Google Charity 91 | Broken link! 92 |
93 | 94 | ''') 95 | assert len(query(a for a in tree)) == 4 96 | assert query(a for a in tree if 'Not Google' in a.text)[0].attrib.get('href') != 'http://www.google.com' 97 | assert query(a for a in tree if 'Not Google' not in a.text)[0].attrib.get('href') == 'http://www.google.com' 98 | assert next( 99 | node 100 | for node in 101 | query( 102 | p 103 | for p in 104 | tree 105 | if node.id 106 | ) 107 | if re.match(r'\D+', node.attrib.get('id')) 108 | ).text == '123' 109 | assert query( # switch between xpyth and regular comprehensions 110 | a 111 | for a in 112 | next( 113 | node 114 | for node in 115 | query( 116 | div 117 | for div in 118 | tree 119 | ) 120 | if re.match(r'\d+', node.attrib.get('id')) 121 | ) 122 | if 'google' in a.href 123 | )[0].text == 'Google Charity' 124 | assert set(query( 125 | a.href 126 | for a in 127 | tree 128 | if any( 129 | p 130 | for p in 131 | a.following_siblings 132 | ) 133 | )) == {'http://www.google.com', 'http://www.chasestevens.com'} 134 | assert set(query( 135 | a.href 136 | for a in 137 | tree 138 | if not any( 139 | p 140 | for p in 141 | a.following_siblings 142 | ) 143 | )) == {'http://www.google.org', 'http://www.chasestevens.org'} 144 | assert set(query( 145 | a.href 146 | for a in 147 | tree 148 | if not any( 149 | p 150 | for p in 151 | a.following_siblings 152 | ) 153 | and any( 154 | p 155 | for p in 156 | a.following_siblings 157 | ) 158 | )) == set() 159 | assert set(query( 160 | a.href 161 | for a in 162 | tree 163 | if any( 164 | p 165 | for p in 166 | tree 167 | ) 168 | )) == {'http://www.google.com', 'http://www.chasestevens.com', 'http://www.google.org', 'http://www.chasestevens.org'} 169 | assert not query( 170 | a.href 171 | for a in 172 | tree 173 | if not any( 174 | p 175 | for p in 176 | tree 177 | ) 178 | ) 179 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | skip_missing_interpreters = True 3 | envlist = py27,py36 4 | 5 | [testenv] 6 | extras = dev 7 | commands = py.test ./tests 8 | -------------------------------------------------------------------------------- /xpyth/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | from functools import reduce 3 | except ImportError: 4 | pass 5 | 6 | from pony.orm.decompiling import Decompiler 7 | from pony.thirdparty.compiler.ast import * 8 | from lxml import etree 9 | 10 | import ctypes 11 | import collections 12 | import functools 13 | 14 | 15 | __all__ = 'DOM X xpath query'.split() 16 | __author__ = 'H. Chase Stevens' 17 | 18 | 19 | DEBUG = False 20 | 21 | 22 | class _DOM(object): 23 | def __iter__(self): 24 | return self 25 | 26 | def __next__(self): 27 | return self 28 | next = __next__ 29 | 30 | 31 | DOM = _DOM() 32 | 33 | 34 | class X: 35 | '''Wildcard, to give autocomplete suggestions.''' 36 | (text, 37 | ancestors, 38 | ancestors_or_self, 39 | children, 40 | descendants, 41 | descendants_or_self, 42 | following, 43 | followings, 44 | following_siblings, 45 | parent, 46 | parents, 47 | preceding, 48 | precedings, 49 | preceding_siblings, 50 | self) = [None] * 15 51 | 52 | 53 | def xpath(g): 54 | """Returns XPath expression corresponding to generator.""" 55 | assert g.gi_frame.f_locals['.0'] == DOM, "Only root-level expressions are supported." 56 | ast = Decompiler(g.gi_code).ast 57 | frame_locals = g.gi_frame.f_locals 58 | frame_globals = g.gi_frame.f_globals 59 | frame_globals.update(frame_locals) # Any danger in this? 60 | expression = _handle_genexpr(ast, frame_globals) 61 | try: 62 | etree.XPath(expression) # Verify syntax 63 | except etree.XPathSyntaxError: 64 | raise etree.XPathSyntaxError(expression) 65 | return expression 66 | 67 | 68 | def query(g): 69 | """Queries a DOM tree (lxml Element).""" 70 | try: 71 | dom = next(g.gi_frame.f_locals['.0']).getparent() # lxml # TODO: change for selenium etc. 72 | except StopIteration: 73 | return [] # copying what lxml does 74 | 75 | # Magic to convert our generator into a DOM-generator (http://pydev.blogspot.co.uk/2014/02/changing-locals-of-frame-frameflocals.html) 76 | g.gi_frame.f_locals['.0'] = DOM 77 | ctypes.pythonapi.PyFrame_LocalsToFast(ctypes.py_object(g.gi_frame), ctypes.c_int(0)) 78 | 79 | expression = '.' + xpath(g) 80 | 81 | method_names = ( 82 | 'xpath', # lxml ElementTree 83 | 'findall', # xml ElementTree 84 | 'find_elements_by_xpath', # selenium WebDriver/WebElement 85 | ) 86 | for method_name in method_names: 87 | try: 88 | xpath_method = getattr(dom, method_name) 89 | break 90 | except AttributeError: 91 | pass 92 | else: 93 | raise NotImplementedError(dom.__class__.__name__) 94 | 95 | return xpath_method(expression) 96 | 97 | 98 | _ATTR_REPLACEMENTS = { 99 | 'cls': 'class', 100 | '__class__': 'class', 101 | } 102 | 103 | _ATTR_FORMAT_OVERRIDES = { 104 | 'text': '{}()', 105 | } 106 | 107 | _COMPARE_OP_REPLACEMENTS = { 108 | '==': '=', 109 | 'in': 'contains', 110 | } 111 | 112 | _COMPARE_OP_FORMAT_OVERRIDES = { 113 | 'contains': '{1}({2}, {0})', 114 | 'not in': 'not(contains({2}, {0}))', 115 | } 116 | 117 | _COMPARE_OP_OPPOSITES = { 118 | '==': '!=', 119 | 'in': 'not in', 120 | '>': '<=', 121 | '<': '>=', 122 | } 123 | _COMPARE_OP_OPPOSITES.update({v: k for k, v in _COMPARE_OP_OPPOSITES.items()}) 124 | _COMPARE_OP_OPPOSITES['='] = '!=' 125 | 126 | _GENEXPRFOR_GETATTR_SEP_OVERRIDES = { 127 | 'ancestors': '/ancestor::', 128 | 'ancestors_or_self': '/ancestor-or-self::', 129 | 'children': '/', 130 | 'descendants': '/descendant::', 131 | 'descendants_or_self': '/descendant-or-self::', 132 | 'following': '/following::', 133 | 'followings': '/following::', 134 | 'following_siblings': '/following-sibling::', 135 | 'parent': '/parent::', 136 | 'parents': '/parent::', 137 | 'preceding': '/preceding::', 138 | 'precedings': '/preceding::', 139 | 'preceding_siblings': '/preceding-sibling::', 140 | 'self': '/self::', 141 | } 142 | 143 | 144 | def _root_level(genexpr, frame_locals): 145 | genexprfor_src = genexpr.code.quals[0].getChildren()[1] 146 | if genexprfor_src.__class__ == Name: 147 | name = genexprfor_src.name 148 | known_dom = name in ('DOM', '.0') 149 | return known_dom or isinstance(frame_locals.get(name), etree._Element) 150 | 151 | 152 | def _get_highest_src(if_, ranked_srcs): 153 | ntype = if_.__class__ 154 | 155 | if ntype == GenExprIf: 156 | return _get_highest_src(if_.test, ranked_srcs) 157 | 158 | if ntype in (Name, AssName): 159 | return [if_.name] 160 | 161 | if hasattr(if_, 'getChildren'): 162 | srcs = [ 163 | src 164 | for child in 165 | if_.getChildren() 166 | for src in 167 | _get_highest_src(child, ranked_srcs) 168 | if src in ranked_srcs 169 | ] 170 | if srcs: 171 | return [sorted(srcs, key=ranked_srcs.index)[0]] 172 | 173 | return [] 174 | 175 | 176 | def _subtree_handler_factory(): 177 | SUBTREE_HANDLERS = {} 178 | 179 | def _subtree_handler(*ntypes, **kwargs): 180 | supply_ast = kwargs.get('supply_ast', False) 181 | def decorator(f): 182 | @functools.wraps(f) 183 | def wrapper(ast_subtree, frame_locals, relative=False): 184 | children = ast_subtree.getChildren() 185 | result = f(ast_subtree if supply_ast else children, frame_locals, relative) 186 | if DEBUG: 187 | print(f.__name__) 188 | print(result) 189 | print() 190 | return result 191 | for ntype in ntypes: 192 | SUBTREE_HANDLERS[ntype] = wrapper 193 | return wrapper 194 | return decorator 195 | 196 | def _dispatch(subtree): 197 | """Choose appropriate subtree handler for subtree type""" 198 | ntype = subtree.__class__ 199 | try: 200 | return functools.partial(SUBTREE_HANDLERS[ntype], subtree) 201 | except KeyError: 202 | raise NotImplementedError(ntype.__name__) 203 | 204 | return _subtree_handler, _dispatch 205 | 206 | _subtree_handler, _dispatch = _subtree_handler_factory() 207 | 208 | 209 | @_subtree_handler(GenExpr) 210 | def _handle_genexpr(children, frame_locals, relative): 211 | child, = children 212 | rel = '.' if relative else '' 213 | assert child.__class__ == GenExprInner # TODO: remove 214 | return rel + _handle_genexprinner(child, frame_locals) 215 | 216 | 217 | @_subtree_handler(GenExprInner) 218 | def _handle_genexprinner(children, frame_locals, relative): 219 | name = children[0] 220 | fors = children[1:] 221 | rel = '.' if relative else '' 222 | 223 | # Rearrange tree if returning booleans, not nodes (all, any) 224 | return_type = name.__class__ 225 | if return_type in (Compare, Not, And, Or): 226 | if return_type in (And, Or): 227 | raise NotImplementedError("Conjunction and disjunction not supported as return type of generator.") 228 | if return_type == Not: 229 | name = name.expr 230 | assert name.__class__ == Compare 231 | ops = name.ops 232 | if ops: 233 | (op, val), = ops 234 | ops = [(_COMPARE_OP_OPPOSITES[op], val)] 235 | else: 236 | ops = name.ops 237 | new_tree = Compare( 238 | GenExprInner( 239 | name.expr, 240 | fors 241 | ), 242 | ops 243 | ) 244 | return rel + _dispatch(new_tree)(frame_locals) # TODO: replace with Compare, since we know this 245 | 246 | # Rearrange ifs 247 | for_srcs = {for_.assign.name: for_ for for_ in fors if for_.__class__} 248 | ranked_srcs = (for_.getChildren()[1] for for_ in fors) 249 | ranked_src_names = [ 250 | src.getChildren()[0].name 251 | if src.__class__ == Getattr 252 | else src.name 253 | for src in 254 | ranked_srcs 255 | ] 256 | for for_ in fors: 257 | for_src = for_.assign.name 258 | 259 | # decompose Ands 260 | ifs = for_.ifs[:] 261 | for if_ in ifs: 262 | try: 263 | test = if_.test 264 | except AttributeError: # e.g. Not has no test attr 265 | continue 266 | if isinstance(test, And): 267 | for_.ifs.remove(if_) 268 | for_.ifs.extend([GenExprIf(node) for node in test.nodes]) 269 | 270 | # shuffle conditionals around so that they test the appropriate level 271 | ifs = for_.ifs[:] 272 | for if_ in ifs: 273 | highest_src = _get_highest_src(if_, ranked_src_names) 274 | if not highest_src: 275 | continue 276 | highest_src, = highest_src 277 | if highest_src != for_src: 278 | for_srcs[highest_src].ifs.append(if_) 279 | try: 280 | for_.ifs.remove(if_) 281 | except ValueError: # we constructed this conditional artificially 282 | pass 283 | 284 | # conjoin any loose conditionals 285 | if len(for_.ifs) > 1: 286 | for_.ifs = [reduce(lambda x, y: And([x, y]), for_.ifs)] 287 | 288 | assert all(for_.__class__ == GenExprFor for for_ in fors) # TODO: remove 289 | fors = ''.join([_handle_genexprfor(for_, frame_locals) for for_ in fors]) 290 | if return_type in (Getattr, Sub): 291 | return '{}/{}'.format(fors, _dispatch(name)(frame_locals)) 292 | return fors 293 | 294 | 295 | @_subtree_handler(Name, AssName, supply_ast=True) 296 | def _handle_name(ast_subtree, frame_locals, relative=False): 297 | name = ast_subtree.name 298 | if name == '.0': 299 | return '.' 300 | if name == 'X': 301 | return '*' 302 | return name 303 | 304 | 305 | @_subtree_handler(GenExprFor) 306 | def _handle_genexprfor(children, frame_locals, relative): 307 | name, src = children[:2] 308 | conds = children[2:] 309 | sep = '//' 310 | if isinstance(src, Getattr): 311 | sep = _GENEXPRFOR_GETATTR_SEP_OVERRIDES.get(src.attrname, '//') 312 | if not conds: 313 | # TODO: determine type of name 314 | return '{}{}'.format(sep, _dispatch(name)(frame_locals)) # slashes are contingent on src 315 | # TODO: determine type of conds 316 | return '{}{}[{}]'.format(sep, _dispatch(name)(frame_locals), _dispatch(conds[0])(frame_locals)) # 0? 317 | 318 | 319 | @_subtree_handler(Getattr) 320 | def _handle_getattr(children, frame_locals, relative): 321 | name, attr = children 322 | attr = _ATTR_REPLACEMENTS.get(attr, attr) 323 | # this might need to be context-sensitive... Almost assuredly, actually 324 | # consider: .//div/@class, .//div[./@class='x'] 325 | return _ATTR_FORMAT_OVERRIDES.get(attr, '@{}').format(attr) 326 | 327 | 328 | @_subtree_handler(GenExprIf) 329 | def _handle_genexprif(children, frame_locals, relative): 330 | rel = '.' if relative else '' 331 | if len(children) == 1: 332 | return _dispatch(children[0])(frame_locals) # TODO: see if child type is consistent 333 | raise NotImplementedError(children) 334 | 335 | 336 | @_subtree_handler(Compare) 337 | def _handle_compare(children, frame_locals, relative): 338 | rel = '.' if relative else '' 339 | 340 | if len(children) == 3: 341 | n1, op, n2 = children 342 | if n2.__class__ == Name: 343 | # Special case - drag in from outer scope if we're checking inclusion of value in iterable 344 | local = frame_locals.get(n2.name) 345 | if isinstance(local, collections.Iterable) and op == 'in': 346 | n2 = Const(local) 347 | if op == 'in' and n2.__class__ == Const and not isinstance(n2.value, str): 348 | # Special case - checking whether value is in iterable 349 | comparisons = [Compare(n1, ('==', Const(val))) for val in n2.value] 350 | return rel + _handle_or(Or(comparisons), frame_locals) 351 | op = _COMPARE_OP_REPLACEMENTS.get(op, op) 352 | format_str = _COMPARE_OP_FORMAT_OVERRIDES.get(op, '{}{}{}') 353 | return format_str.format(rel + _dispatch(n1)(frame_locals), op, rel + _dispatch(n2)(frame_locals)) 354 | raise NotImplementedError(children) 355 | 356 | 357 | @_subtree_handler(Const, supply_ast=True) 358 | def _handle_const(ast_subtree, frame_locals, relative=False): 359 | return repr(ast_subtree.value) 360 | 361 | 362 | @_subtree_handler(And) 363 | def _handle_and(children, frame_locals, relative): 364 | rel = '.' if relative else '' 365 | return ' and '.join(rel + _dispatch(child)(frame_locals) for child in children) 366 | 367 | 368 | @_subtree_handler(Or) 369 | def _handle_or(children, frame_locals, relative): 370 | rel = '.' if relative else '' 371 | return ' or '.join(rel + _dispatch(child)(frame_locals) for child in children) 372 | 373 | 374 | @_subtree_handler(Not) 375 | def _handle_not(children, frame_locals, relative): 376 | child, = children 377 | rel = '.' if relative else '' 378 | return 'not({})'.format(rel + _dispatch(child)(frame_locals)) 379 | 380 | 381 | @_subtree_handler(Sub) 382 | def _handle_sub(children, frame_locals, relative): 383 | return '-'.join(_dispatch(child)(frame_locals) for child in children) 384 | 385 | 386 | @_subtree_handler(CallFunc) 387 | def _handle_callfunc(children, frame_locals, relative): 388 | rel = '.' if relative else '' 389 | if isinstance(children[0], Name): 390 | func_name = children[0].name 391 | is_relative = lambda: not _root_level(children[1], frame_locals) 392 | if func_name == 'any': 393 | return rel + _dispatch(children[1])(frame_locals, is_relative()) 394 | if func_name == 'len': 395 | return 'count({})'.format(rel + _dispatch(children[1])(frame_locals, is_relative())) 396 | elif func_name == 'all': 397 | # Need to change (\all x. P) to (\not \exists x. \not P) 398 | genexprinner = children[1].getChildren()[0] 399 | assert genexprinner.__class__ == GenExprInner 400 | name, genexprfor = genexprinner.getChildren() 401 | gef_assname, gef_name = genexprfor.getChildren()[:2] 402 | gef_ifs = genexprfor.ifs 403 | new_tree = Not( 404 | GenExpr( 405 | GenExprInner( 406 | name, 407 | [GenExprFor( 408 | gef_assname, 409 | gef_name, 410 | [Not(gef_ifs[0])] if gef_ifs else [] 411 | )] 412 | ) 413 | ) 414 | ) 415 | return rel + _handle_not(new_tree, frame_locals, is_relative()) 416 | raise NotImplementedError(children) 417 | --------------------------------------------------------------------------------