├── tests
│   ├── __init__.py
│   ├── test_html.py
│   └── test_xml.py
├── xq
│   ├── __init__.py
│   └── __main__.py
├── setup.py
├── README.rst
└── .gitignore
/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/xq/__init__.py:
--------------------------------------------------------------------------------
1 | VERSION = '0.0.4'
2 | NAME = 'xq'
3 | DESCRIPTION = 'Like jq but for XML and XPath.'
4 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 | from codecs import open
3 |
4 | from xq import VERSION, NAME, DESCRIPTION
5 |
6 | with open('README.rst', encoding='utf-8') as f:
7 |     long_description = f.read()
8 |
9 | setup(
10 |     name=NAME,
11 |     description=DESCRIPTION,
12 |     long_description=long_description,
13 |     keywords='xml xpath text',
14 |     version=VERSION,
15 |     license='MPL 2.0',
16 |
17 |     author='Ben Jeffrey',
18 |     author_email='mail@benjeffrey.net',
19 |     url='https://github.com/jeffbr13/xq',
20 |
21 |     classifiers=[
22 |         'Development Status :: 3 - Alpha',
23 |         'Programming Language :: Python :: 3',
24 |         'Intended Audience :: Developers',
25 |         'Environment :: Console',
26 |         'Topic :: Terminals',
27 |         'Topic :: Text Processing :: Markup :: XML',
28 |         'Topic :: Utilities',
29 |         'License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)',
30 |     ],
31 |
32 |     packages=find_packages(),
33 |
34 |     install_requires=[
35 |         'lxml',
36 |         'pygments'
37 |     ],
38 |
39 |     entry_points={
40 |         'console_scripts': [
41 |             'xq=xq.__main__:main',
42 |         ],
43 |     },
44 | )
45 |
--------------------------------------------------------------------------------
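
The ``console_scripts`` entry point above is what makes the installed ``xq`` command work: setuptools generates a small wrapper executable that imports ``xq.__main__`` and calls ``main()``. A minimal sketch of what that generated wrapper amounts to (illustrative only, not part of the package)::

    import sys

    from xq.__main__ import main

    # The generated console script does essentially this: call main() and
    # exit with its return value (None means exit status 0).
    if __name__ == '__main__':
        sys.exit(main())

Installing the package (including an editable install with ``pip install -e .`` during development) puts that wrapper on ``PATH`` as ``xq``.
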
/README.rst:
--------------------------------------------------------------------------------
1 | xq
2 | ==
3 |
4 | Apply XPath expressions to XML, like ``jq`` does for JSONPath and JSON.
5 |
6 |
7 | Installation
8 | ------------
9 |
10 | Install with ``pip``::
11 |
12 |     pip install xq
13 |
14 | Or download the repo and install via ``setuptools``::
15 |
16 |     python setup.py install
17 |
18 |
19 | Usage
20 | -----
21 |
22 | Extract download URLs from an RSS feed::
23 |
24 |     http get 'http://br-rss.jeffbr13.net/rss/channels/1/' | xq '//item/enclosure/@url'
25 |
26 |
27 | Extract all links from an HTML page footer::
28 |
29 |     http get 'http://br-rss.jeffbr13.net/' | xq '//footer//a/@href'
30 |
31 |
32 | Test
33 | ----
34 |
35 | Run ``unittest`` in the root directory to autodetect and run tests::
36 |
37 |     python -m unittest
38 |
39 |
40 | Build
41 | -----
42 |
43 | Increment ``xq.VERSION`` and run the following commands
44 | to create a `source distribution `_,
45 | create a `universal wheel `_,
46 | and `upload to PyPI `_ ::
47 |
48 |     python setup.py sdist
49 |     python setup.py bdist_wheel --universal
50 |     twine upload dist/*
51 |
52 |
53 | See Also
54 | --------
55 |
56 | - `jq `_
57 | - `hq `_
58 |
--------------------------------------------------------------------------------
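
The Usage examples in README.rst above pipe documents in on STDIN; ``xq`` also accepts a file path as a second positional argument (see the argparse setup in ``xq/__main__.py``). A small sketch of driving the same kind of query from Python with ``subprocess`` — here ``feed.xml`` is a hypothetical local file and ``xq`` is assumed to be installed::

    import subprocess

    # The XPath query comes first, then the optional input file
    # (omitting the file makes xq read STDIN instead).
    result = subprocess.run(
        ['xq', '//item/enclosure/@url', 'feed.xml'],
        capture_output=True, text=True, check=True,
    )
    print(result.stdout)
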
/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by .ignore support plugin (hsz.mobi)
2 | ### Python template
3 | # Byte-compiled / optimized / DLL files
4 | __pycache__/
5 | *.py[cod]
6 | *$py.class
7 |
8 | # C extensions
9 | *.so
10 |
11 | # Distribution / packaging
12 | .Python
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | .hypothesis/
50 |
51 | # Translations
52 | *.mo
53 | *.pot
54 |
55 | # Django stuff:
56 | *.log
57 | .static_storage/
58 | .media/
59 | local_settings.py
60 |
61 | # Flask stuff:
62 | instance/
63 | .webassets-cache
64 |
65 | # Scrapy stuff:
66 | .scrapy
67 |
68 | # Sphinx documentation
69 | docs/_build/
70 |
71 | # PyBuilder
72 | target/
73 |
74 | # Jupyter Notebook
75 | .ipynb_checkpoints
76 |
77 | # pyenv
78 | .python-version
79 |
80 | # celery beat schedule file
81 | celerybeat-schedule
82 |
83 | # SageMath parsed files
84 | *.sage.py
85 |
86 | # Environments
87 | .env
88 | .venv
89 | env/
90 | venv/
91 | ENV/
92 | env.bak/
93 | venv.bak/
94 |
95 | # Spyder project settings
96 | .spyderproject
97 | .spyproject
98 |
99 | # Rope project settings
100 | .ropeproject
101 |
102 | # mkdocs documentation
103 | /site
104 |
105 | # mypy
106 | .mypy_cache/
107 |
108 |
--------------------------------------------------------------------------------
/xq/__main__.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import sys
3 | from typing import List, Union
4 |
5 | from lxml import etree
6 | from lxml.builder import E
7 | from pygments import highlight
8 | from pygments.formatters.other import NullFormatter
9 | from pygments.formatters.terminal import TerminalFormatter
10 | from pygments.lexers.html import XmlLexer
11 |
12 | from . import NAME, DESCRIPTION, VERSION
13 |
14 |
15 | def wrap_in_results(elements: List[Union[etree._Element, etree._ElementUnicodeResult]]) -> etree._Element:
16 |     results = E.results()
17 |     for el in elements:
18 |         results.append(E.result(el))
19 |     return results
20 |
21 |
22 | def apply_xpath(infile, xpath_query=None, colorize=False):
23 |     try:
24 |         parsed = etree.parse(infile, etree.XMLParser(remove_blank_text=True))
25 |     except etree.XMLSyntaxError:
26 |         parsed = etree.parse(infile, etree.HTMLParser(remove_blank_text=True))
27 |
28 |     if xpath_query:
29 |         matches = parsed.xpath(xpath_query)
30 |         results = wrap_in_results(matches)
31 |         output = etree.tostring(results, pretty_print=True)
32 |     else:
33 |         output = etree.tostring(parsed, pretty_print=True)
34 |
35 |     formatter = TerminalFormatter() if colorize else NullFormatter()
36 |     return highlight(output, XmlLexer(), formatter)
37 |
38 |
39 | def main():
40 |     parser = argparse.ArgumentParser(
41 |         prog=NAME,
42 |         description=DESCRIPTION,
43 |     )
44 |     parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + VERSION)
45 |     parser.add_argument(
46 |         'xpath_query', nargs='?', type=str,
47 |         help='XPath query to apply to XML document.'
48 |     )
49 |     parser.add_argument(
50 |         'file', nargs='?', type=argparse.FileType('r'), default=sys.stdin,
51 |         help='XML file to process. Defaults to STDIN.',
52 |     )
53 |     args = parser.parse_args()
54 |     sys.stdout.write(apply_xpath(args.file, args.xpath_query, sys.stdout.isatty()))
55 |
56 |
57 | if __name__ == '__main__':
58 |     main()
59 |
--------------------------------------------------------------------------------
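
``apply_xpath()`` above is the whole pipeline: parse the input (XML first, falling back to HTML), evaluate the XPath expression, wrap each match in a ``<result>`` element, and pretty-print the ``<results>`` document. A quick sketch of calling it directly on an in-memory document, the same way the tests do (the sample XML here is purely illustrative)::

    from io import StringIO

    from xq.__main__ import apply_xpath

    doc = StringIO('<feed><entry>one</entry><entry>two</entry></feed>')
    # Text-node matches are wrapped one per <result> element.
    print(apply_xpath(doc, '//entry/text()', colorize=False))
    # Expected shape of the output:
    # <results>
    #   <result>one</result>
    #   <result>two</result>
    # </results>
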
/tests/test_html.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from io import StringIO
3 | from lxml import etree
4 | from lxml.builder import E
5 |
6 | from xq.__main__ import apply_xpath
7 |
8 | SAMPLE_HTML = """
9 | <!DOCTYPE html>
10 | <html lang="en">
11 | <head>
12 |   <title>Page Title</title>
13 |   <meta charset="utf-8">
14 | </head>
15 | <body>
16 |
17 |   <header>
18 |     <nav>
19 |       <h1>Heading 1</h1>
20 |     </nav>
21 |   </header>
22 |   <p>Paragraph 1 with <a href="/url/1">a link</a>.</p>
23 |   <h1>Heading 2</h1>
24 |   <p>Paragraph 2.</p>
25 |   <footer>
26 |     <p><a href="/url/2">&#128736;</a></p>
27 |   </footer>
28 |
29 |
30 | </body>
31 | </html>
32 | """
33 |
34 |
35 | class TestHtmlXpathExpressions(unittest.TestCase):
36 |     def setUp(self):
37 |         self.test_input = StringIO(SAMPLE_HTML)
38 |
39 |     def tearDown(self):
40 |         self.test_input.close()
41 |
42 |     def test_extract_elements(self):
43 |         expected_output = ("<results>\n"
44 |                            "  <result><p>Paragraph 1 with <a href=\"/url/1\">a link</a>.</p>\n"
45 |                            "  </result>\n"
46 |                            "  <result><p>Paragraph 2.</p>\n"
47 |                            "  </result>\n"
48 |                            "  <result><p><a href=\"/url/2\">&#128736;</a></p>\n"
49 |                            "  </result>\n"
50 |                            "</results>\n")
51 |
52 |         self.assertEqual(
53 |             apply_xpath(self.test_input, '//p', colorize=False),
54 |             expected_output
55 |         )
56 |
57 |     def test_extract_attributes(self):
58 |         expected_output = etree.tounicode(
59 |             E.results(
60 |                 E.result('/url/1'),
61 |                 E.result('/url/2'),
62 |             ),
63 |             pretty_print=True
64 |         )
65 |         self.assertEqual(
66 |             apply_xpath(self.test_input, '//a/@href', colorize=False),
67 |             expected_output
68 |         )
69 |
70 |     def test_extract_text(self):
71 |         expected_output = etree.tounicode(
72 |             E.results(
73 |                 E.result('Heading 1'),
74 |                 E.result('Heading 2'),
75 |             ),
76 |             pretty_print=True
77 |         )
78 |         self.assertEqual(
79 |             apply_xpath(self.test_input, '//h1/text()', colorize=False),
80 |             expected_output
81 |         )
82 |
--------------------------------------------------------------------------------
/tests/test_xml.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from io import StringIO
3 | from lxml import etree
4 |
5 | from lxml.builder import E
6 |
7 | from xq.__main__ import apply_xpath
8 |
9 |
10 | SAMPLE_XML = """
11 | <rss version="2.0" xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd">
12 |   <channel>
13 |
14 |     <language>en-gb</language>
15 |
16 |     <title>Example RSS feed</title>
17 |     <description>This is a podcast RSS feed</description>
18 |     <itunes:summary>This is a podcast RSS feed</itunes:summary>
19 |
20 |
21 |     <itunes:explicit>yes</itunes:explicit>
22 |     <itunes:author>Example</itunes:author>
23 |
24 |     <item>
25 |       <link>http://example.com/rss/1</link>
26 |       <title>Episode 1</title>
27 |       <description>This is the first podcast episode.</description>
28 |       <itunes:summary>This is the first podcast episode</itunes:summary>
29 |       <pubDate>Wed, 13 Sep 2017 10:49:56 +0000</pubDate>
30 |       <itunes:duration>2:00:00</itunes:duration>
31 |       <enclosure url="http://example.com/rss/1/download"/>
32 |     </item>
33 |
34 |     <item>
35 |       <link>http://example.com/rss/2</link>
36 |       <title>Episode 2</title>
37 |       <description>This is the second podcast episode.</description>
38 |       <itunes:summary>This is the second podcast episode</itunes:summary>
39 |       <pubDate>Mon, 11 Sep 2017 11:48:21 +0000</pubDate>
40 |       <itunes:duration>1:00:00</itunes:duration>
41 |       <enclosure url="http://example.com/rss/2/download"/>
42 |     </item>
43 |
44 |     <item>
45 |       <link>http://example.com/rss/3</link>
46 |       <title>Episode 3</title>
47 |       <description>This is the third podcast episode.</description>
48 |       <itunes:summary>This is the third podcast episode</itunes:summary>
49 |       <pubDate>Thu, 24 Aug 2017 00:50:10 +0000</pubDate>
50 |       <itunes:duration>1:03:00</itunes:duration>
51 |       <enclosure url="http://example.com/rss/3/download"/>
52 |     </item>
53 |   </channel>
54 | </rss>
55 | """
56 |
57 |
58 | class TestXmlXpathExpressions(unittest.TestCase):
59 |
60 |     def setUp(self):
61 |         self.test_input = StringIO(SAMPLE_XML)
62 |
63 |     def tearDown(self):
64 |         self.test_input.close()
65 |
66 |     def test_extract_elements(self):
67 |         expected_output = etree.tounicode(
68 |             E.results(
69 |                 E.result(
70 |                     E.title('Episode 1')
71 |                 ),
72 |                 E.result(
73 |                     E.title('Episode 2')
74 |                 ),
75 |                 E.result(
76 |                     E.title('Episode 3')
77 |                 ),
78 |             ),
79 |             pretty_print=True
80 |         )
81 |         self.assertEqual(expected_output, apply_xpath(self.test_input, './channel/item/title', colorize=False))
82 |
83 |     def test_extract_single_attribute(self):
84 |         expected_output = etree.tounicode(
85 |             E.results(
86 |                 E.result('http://example.com/rss/2/download'),
87 |             ),
88 |             pretty_print=True
89 |         )
90 |         self.assertEqual(expected_output, apply_xpath(self.test_input, './channel/item[2]/enclosure/@url', colorize=False))
91 |
92 |     def test_extract_text(self):
93 |         expected_output = etree.tounicode(
94 |             E.results(
95 |                 E.result('Episode 1'),
96 |                 E.result('Episode 2'),
97 |                 E.result('Episode 3'),
98 |             ),
99 |             pretty_print=True
100 |         )
101 |         self.assertEqual(expected_output, apply_xpath(self.test_input, './channel/item/title/text()', colorize=False))
102 |
--------------------------------------------------------------------------------