├── tests ├── __init__.py ├── test_html.py └── test_xml.py ├── xq ├── __init__.py └── __main__.py ├── setup.py ├── README.rst └── .gitignore /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /xq/__init__.py: -------------------------------------------------------------------------------- 1 | VERSION = '0.0.4' 2 | NAME = 'xq' 3 | DESCRIPTION = 'Like jq but for XML and XPath.' 4 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | from codecs import open 3 | 4 | from xq import VERSION, NAME, DESCRIPTION 5 | 6 | with open('README.rst', encoding='utf-8') as f: 7 | long_description = f.read() 8 | 9 | setup( 10 | name=NAME, 11 | description=DESCRIPTION, 12 | long_description=long_description, 13 | keywords='xml xpath text', 14 | version=VERSION, 15 | license='MPL 2.0', 16 | 17 | author='Ben Jeffrey', 18 | author_email='mail@benjeffrey.net', 19 | url='https://github.com/jeffbr13/xq', 20 | 21 | classifiers=[ 22 | 'Development Status :: 3 - Alpha', 23 | 'Programming Language :: Python :: 3', 24 | 'Intended Audience :: Developers', 25 | 'Environment :: Console', 26 | 'Topic :: Terminals', 27 | 'Topic :: Text Processing :: Markup :: XML', 28 | 'Topic :: Utilities', 29 | 'License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)', 30 | ], 31 | 32 | packages=find_packages(), 33 | 34 | install_requires=[ 35 | 'lxml', 36 | 'pygments' 37 | ], 38 | 39 | entry_points={ 40 | 'console_scripts': [ 41 | 'xq=xq.__main__:main', 42 | ], 43 | }, 44 | ) 45 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | xq 2 | == 3 | 4 | Apply XPath expressions to 
XML, like ``jq`` does for JSONPath and JSON. 5 | 6 | 7 | Installation 8 | ------------ 9 | 10 | Install with ``pip``:: 11 | 12 | pip install xq 13 | 14 | Or download the repo and install via ``setuptools``:: 15 | 16 | python setup.py install 17 | 18 | 19 | Usage 20 | ----- 21 | 22 | Extract download URLs from an RSS feed:: 23 | 24 | http get 'http://br-rss.jeffbr13.net/rss/channels/1/' | xq '//item/enclosure/@url' 25 | 26 | 27 | Extract all links from an HTML page footer:: 28 | 29 | http get 'http://br-rss.jeffbr13.net/' | xq '//footer//a/@href' 30 | 31 | 32 | Test 33 | ---- 34 | 35 | Run ``unittest`` in the root directory to autodetect and run tests:: 36 | 37 | python -m unittest 38 | 39 | 40 | Build 41 | ----- 42 | 43 | Increment ``xq.VERSION`` and run the following three commands 44 | to create a `source distribution `_, 45 | create a `universal wheel `_, 46 | and `upload to PyPI `_ :: 47 | 48 | python setup.py sdist 49 | python setup.py bdist_wheel --universal 50 | twine upload dist/* 51 | 52 | 53 | See Also 54 | -------- 55 | 56 | - `jq `_ 57 | - `hq `_ 58 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Python template 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | .static_storage/ 58 | .media/ 59 | local_settings.py 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # pyenv 78 | .python-version 79 | 80 | # celery beat schedule file 81 | celerybeat-schedule 82 | 83 | # SageMath parsed files 84 | *.sage.py 85 | 86 | # Environments 87 | .env 88 | .venv 89 | env/ 90 | venv/ 91 | ENV/ 92 | env.bak/ 93 | venv.bak/ 94 | 95 | # Spyder project settings 96 | .spyderproject 97 | .spyproject 98 | 99 | # Rope project settings 100 | .ropeproject 101 | 102 | # mkdocs documentation 103 | /site 104 | 105 | # mypy 106 | .mypy_cache/ 107 | 108 | -------------------------------------------------------------------------------- /xq/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | from typing import Union 4 | 5 | from lxml import etree 6 | from lxml.builder import E 7 | from pygments import highlight 8 | from pygments.formatters.other import NullFormatter 9 | from pygments.formatters.terminal import TerminalFormatter 10 | from pygments.lexers.html import XmlLexer 11 | 12 | from . 
import NAME, DESCRIPTION, VERSION 13 | 14 | 15 | def wrap_in_results(elements: [Union[etree._Element, etree._ElementUnicodeResult]]) -> etree._Element: 16 | results = E.results() 17 | for el in elements: 18 | results.append(E.result(el)) 19 | return results 20 | 21 | 22 | def apply_xpath(infile, xpath_query=None, colorize=False): 23 | try: 24 | parsed = etree.parse(infile, etree.XMLParser(remove_blank_text=True)) 25 | except etree.XMLSyntaxError: 26 | parsed = etree.parse(infile, etree.HTMLParser(remove_blank_text=True)) 27 | 28 | if xpath_query: 29 | matches = parsed.xpath(xpath_query) 30 | results = wrap_in_results(matches) 31 | output = etree.tostring(results, pretty_print=True) 32 | else: 33 | output = etree.tostring(parsed, pretty_print=True) 34 | 35 | formatter = TerminalFormatter() if colorize else NullFormatter() 36 | return highlight(output, XmlLexer(), formatter) 37 | 38 | 39 | def main(): 40 | parser = argparse.ArgumentParser( 41 | prog=NAME, 42 | description=DESCRIPTION, 43 | ) 44 | parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + VERSION) 45 | parser.add_argument( 46 | 'xpath_query', nargs='?', type=str, 47 | help='XPath query to apply to XML document.' 48 | ) 49 | parser.add_argument( 50 | 'file', nargs='?', type=argparse.FileType('r'), default=sys.stdin, 51 | help='XML file to process. Defaults to STDIN.', 52 | ) 53 | args = parser.parse_args() 54 | sys.stdout.write(apply_xpath(args.file, args.xpath_query, sys.stdout.isatty())) 55 | 56 | 57 | if __name__ == '__main__': 58 | main() 59 | -------------------------------------------------------------------------------- /tests/test_html.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from io import StringIO 3 | from lxml import etree 4 | from lxml.builder import E 5 | 6 | from xq.__main__ import apply_xpath 7 | 8 | SAMPLE_HTML = """ 9 | 10 | 11 | 12 | Page Title 13 | 14 | 15 | 16 | 17 | 18 |
19 |

Heading 1

20 |
21 |
22 |

Paragraph 1 with a link.

23 |

Heading 2

24 |

Paragraph 2.

25 |
26 |
27 | 30 | 31 | 32 | """ 33 | 34 | 35 | class TestHtmlXpathExpressions(unittest.TestCase): 36 | def setUp(self): 37 | self.test_input = StringIO(SAMPLE_HTML) 38 | 39 | def tearDown(self): 40 | self.test_input.close() 41 | 42 | def test_extract_elements(self): 43 | expected_output = ("\n" 44 | "

Paragraph 1 with a link.

\n" 45 | "
\n" 46 | "

Paragraph 2.

\n" 47 | "
\n" 48 | "

🛠

\n" 49 | "
\n" 50 | "
\n") 51 | 52 | self.assertEqual( 53 | apply_xpath(self.test_input, '//p', colorize=False), 54 | expected_output 55 | ) 56 | 57 | def test_extract_attributes(self): 58 | expected_output = etree.tounicode( 59 | E.results( 60 | E.result('/url/1'), 61 | E.result('/url/2'), 62 | ), 63 | pretty_print=True 64 | ) 65 | self.assertEqual( 66 | apply_xpath(self.test_input, '//a/@href', colorize=False), 67 | expected_output 68 | ) 69 | 70 | def test_extract_text(self): 71 | expected_output = etree.tounicode( 72 | E.results( 73 | E.result('Heading 1'), 74 | E.result('Heading 2'), 75 | ), 76 | pretty_print=True 77 | ) 78 | self.assertEqual( 79 | apply_xpath(self.test_input, '//h1/text()', colorize=False), 80 | expected_output 81 | ) 82 | -------------------------------------------------------------------------------- /tests/test_xml.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from io import StringIO 3 | from lxml import etree 4 | 5 | from lxml.builder import E 6 | 7 | from xq.__main__ import apply_xpath 8 | 9 | 10 | SAMPLE_XML = """ 11 | 12 | 13 | 14 | en-gb 15 | 16 | Example RSS feed 17 | This is a podcast RSS feed 18 | This is a podcast RSS feed 19 | 20 | 21 | yes 22 | Example 23 | 24 | 25 | http://example.com/rss/1 26 | Episode 1 27 | This is the first podcast episode. 28 | This is the first podcast episode 29 | Wed, 13 Sep 2017 10:49:56 +0000 30 | 2:00:00 31 | 32 | 33 | 34 | 35 | http://example.com/rss/2 36 | Episode 2 37 | This is the second podcast episode. 38 | This is the second podcast episode 39 | Mon, 11 Sep 2017 11:48:21 +0000 40 | 1:00:00 41 | 42 | 43 | 44 | 45 | http://example.com/rss/3 46 | Episode 3 47 | This is the third podcast episode. 
48 | This is the third podcast episode 49 | Thu, 24 Aug 2017 00:50:10 +0000 50 | 1:03:00 51 | 52 | 53 | 54 | 55 | """ 56 | 57 | 58 | class TestXmlXpathExpressions(unittest.TestCase): 59 | 60 | def setUp(self): 61 | self.test_input = StringIO(SAMPLE_XML) 62 | 63 | def tearDown(self): 64 | self.test_input.close() 65 | 66 | def test_extract_elements(self): 67 | expected_output = etree.tounicode( 68 | E.results( 69 | E.result( 70 | E.title('Episode 1') 71 | ), 72 | E.result( 73 | E.title('Episode 2') 74 | ), 75 | E.result( 76 | E.title('Episode 3') 77 | ), 78 | ), 79 | pretty_print=True 80 | ) 81 | self.assertEqual(expected_output, apply_xpath(self.test_input, './channel/item/title', colorize=False)) 82 | 83 | def test_extract_single_attribute(self): 84 | expected_output = etree.tounicode( 85 | E.results( 86 | E.result('http://example.com/rss/2/download'), 87 | ), 88 | pretty_print=True 89 | ) 90 | self.assertEqual(expected_output, apply_xpath(self.test_input, './channel/item[2]/enclosure/@url', colorize=False)) 91 | 92 | def test_extract_text(self): 93 | expected_output = etree.tounicode( 94 | E.results( 95 | E.result('Episode 1'), 96 | E.result('Episode 2'), 97 | E.result('Episode 3'), 98 | ), 99 | pretty_print=True 100 | ) 101 | self.assertEqual(expected_output, apply_xpath(self.test_input, './channel/item/title/text()', colorize=False)) 102 | --------------------------------------------------------------------------------