├── setup.cfg
├── MANIFEST.in
├── .gitmodules
├── .gitignore
├── setup.py
├── README.md
├── tests.py
└── pugixmltodict.pyx


/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md
3 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include pugixml/src *.hpp *.cpp
2 | include pugixmltodict.pyx
3 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "pugixml"]
2 | 	path = pugixml
3 | 	url = https://github.com/zeux/pugixml
4 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | 
 5 | # C extensions
 6 | *.so
 7 | 
 8 | # Distribution / packaging
 9 | /build
10 | /dist
11 | /*.egg-info
12 | 
13 | # Editor / devtools
14 | .*.sw[pon]
15 | \#*#
16 | /tags
17 | 
18 | # Generated files
19 | pugixmltodict.cpp
20 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | # coding: utf-8
 2 | import sys
 3 | from distutils.core import setup, Extension
 4 | 
 5 | 
 6 | USE_CYTHON = False
 7 | CYTHON_PARAM = '--cython'
 8 | if CYTHON_PARAM in sys.argv:
 9 |     USE_CYTHON = True
10 |     sys.argv.remove(CYTHON_PARAM)
11 | 
12 | 
13 | SOURCE_EXT = '.pyx' if USE_CYTHON else '.cpp'
14 | EXT_MODULES = [Extension(
15 |     'pugixmltodict',
16 |     sources=[
17 |         'pugixmltodict' + SOURCE_EXT,
18 |         'pugixml/src/pugixml.cpp',
19 |     ],
20 | )]
21 | 
22 | if USE_CYTHON:
23 |     from Cython.Build import cythonize
24 |     EXT_MODULES = cythonize(EXT_MODULES)
25 | 
26 | 
27 | setup(
28 |     name='pugixmltodict',
29 |     version='0.5',
30 |     description='A fast alternative to xmltodict library',
31 |     url='https://github.com/sepeth/pugixmltodict',
32 |     author='Doğan Çeçen',
33 |     author_email='sepeth@gmail.com',
34 | 
35 |     classifiers=[
36 |         'Development Status :: 3 - Alpha',
37 |         'Intended Audience :: Developers',
38 |         'License :: OSI Approved :: MIT License',
39 |         'Programming Language :: Cython',
40 |         'Programming Language :: Python :: 2.7',
41 |         'Programming Language :: Python :: 3',
42 |         'Topic :: Text Processing :: Markup :: XML',
43 |     ],
44 | 
45 |     ext_modules=EXT_MODULES,
46 | )
47 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # pugixmltodict
 2 | 
 3 | ## Install
 4 | 
 5 | ```sh
 6 | pip install pugixmltodict
 7 | ```
 8 | 
 9 | ## Run tests
10 | 
11 | ```sh
12 | python -m unittest tests
13 | ```
14 | 
15 | ## Build from sources
16 | 
17 | Install Cython first:
18 | 
19 | ```sh
20 | pip install cython
21 | ```
22 | 
23 | Build with `--cython` so that the Cython sources get compiled to C++:
24 | 
25 | ```sh
26 | python setup.py build --cython
27 | ```
28 | 
29 | _You can omit the parameter in later builds if you haven't made changes to `pugixmltodict.pyx`._
30 | 
31 | 
32 | ## License
33 | 
34 | This library is available to anybody free of charge, under the terms of the MIT License:
35 | 
36 | Copyright (c) 2015 Doğan Çeçen
37 | 
38 | Permission is hereby granted, free of charge, to any person obtaining a copy
39 | of this software and associated documentation files (the "Software"), to deal
40 | in the Software without restriction, including without limitation the rights
41 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
42 | copies of the Software, and to permit persons to whom the Software is
43 | furnished to do so, subject to the following conditions:
44 | 
45 | The above copyright notice and this permission notice shall be included in
46 | all copies or substantial portions of the Software.
47 | 
48 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
49 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
50 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
51 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
52 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
53 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
54 | THE SOFTWARE.
55 | 


--------------------------------------------------------------------------------
/tests.py:
--------------------------------------------------------------------------------
  1 | import unittest
  2 | from pugixmltodict import parse, unparse
  3 | 
  4 | 
  5 | class XmlToDictTestCase(unittest.TestCase):
  6 |     def test_minimal(self):
  7 |         self.assertEqual(parse('<a/>'), {'a': None})
  8 | 
  9 |     def test_simple(self):
 10 |         self.assertEqual(parse('<a>data</a>'), {'a': 'data'})
 11 | 
 12 |     def test_list(self):
 13 |         self.assertEqual(parse('<a><b>1</b><b>2</b><b>3</b></a>'),
 14 |                          {'a': {'b': ['1', '2', '3']}})
 15 | 
 16 |     def test_attrib(self):
 17 |         self.assertEqual(parse('<a href="xyz"/>'),
 18 |                          {'a': {'@href': 'xyz'}})
 19 | 
 20 |     def test_attrib_and_text(self):
 21 |         self.assertEqual(parse('<a href="xyz">123</a>'),
 22 |                          {'a': {'@href': 'xyz', '#text': '123'}})
 23 | 
 24 |     def test_semi_structured(self):
 25 |         self.assertEqual(parse('<a>abc<b/>def</a>'),
 26 |                          {'a': {'b': None, '#text': 'abcdef'}})
 27 | 
 28 |     def test_nested_semi_structured(self):
 29 |         self.assertEqual(parse('<a>abc<b>123<c/>456</b>def</a>'),
 30 |                          {'a': {'#text': 'abcdef', 'b': {
 31 |                              '#text': '123456', 'c': None}}})
 32 | 
 33 |     def test_skip_whitespace(self):
 34 |         xml = """
 35 |         <root>
 36 |           <emptya>           </emptya>
 37 |           <emptyb attr="attrvalue">
 38 |           </emptyb>
 39 |           <value>hello</value>
 40 |         </root>
 41 |         """
 42 |         self.assertEqual(
 43 |             parse(xml),
 44 |             {'root': {'emptya': None,
 45 |                       'emptyb': {'@attr': 'attrvalue'},
 46 |                       'value': 'hello'}})
 47 | 
 48 |     def test_namespace_ignore(self):
 49 |         xml = """
 50 |         <root xmlns="http://defaultns.com/"
 51 |               xmlns:a="http://a.com/"
 52 |               xmlns:b="http://b.com/">
 53 |           <x>1</x>
 54 |           <a:y>2</a:y>
 55 |           <b:z>3</b:z>
 56 |         </root>
 57 |         """
 58 |         d = {
 59 |             'root': {
 60 |                 '@xmlns': 'http://defaultns.com/',
 61 |                 '@xmlns:a': 'http://a.com/',
 62 |                 '@xmlns:b': 'http://b.com/',
 63 |                 'x': '1',
 64 |                 'a:y': '2',
 65 |                 'b:z': '3',
 66 |             },
 67 |         }
 68 |         self.assertEqual(parse(xml), d)
 69 | 
 70 |     def test_with_broken_attribute(self):
 71 |         with self.assertRaises(ValueError):
 72 |             parse('<root attr>foo</root>')
 73 | 
 74 |     def test_with_mismatched_tag(self):
 75 |         with self.assertRaises(ValueError):
 76 |             parse('<root attr="val">text</wrong>')
 77 | 
 78 | 
 79 | class DictToXmlTestCase(unittest.TestCase):
 80 |     def test_root(self):
 81 |         obj = {'a': None}
 82 |         self.assertEqual(obj, parse(unparse(obj)))
 83 |         self.assertEqual(unparse(obj), unparse(parse(unparse(obj))))
 84 | 
 85 |     def test_simple_text(self):
 86 |         obj = {'a': 'b'}
 87 |         self.assertEqual(obj, parse(unparse(obj)))
 88 |         self.assertEqual(unparse(obj), unparse(parse(unparse(obj))))
 89 | 
 90 |     def test_attrib(self):
 91 |         obj = {'a': {'@href': 'x'}}
 92 |         self.assertEqual(obj, parse(unparse(obj)))
 93 |         self.assertEqual(unparse(obj), unparse(parse(unparse(obj))))
 94 | 
 95 |     def test_attrib_and_text(self):
 96 |         obj = {'a': {'@href': 'x', '#text': 'y'}}
 97 |         self.assertEqual(obj, parse(unparse(obj)))
 98 |         self.assertEqual(unparse(obj), unparse(parse(unparse(obj))))
 99 | 
100 |     def test_list(self):
101 |         obj = {'a': {'b': ['1', '2', '3']}}
102 |         self.assertEqual(obj, parse(unparse(obj)))
103 |         self.assertEqual(unparse(obj), unparse(parse(unparse(obj))))
104 | 


--------------------------------------------------------------------------------
/pugixmltodict.pyx:
--------------------------------------------------------------------------------
  1 | # distutils: language = c++
  2 | 
  3 | from __future__ import unicode_literals
  4 | import sys
  5 | from libc.string cimport const_char
  6 | from libc.stddef cimport ptrdiff_t
  7 | from libcpp.string cimport string
  8 | 
  9 | 
 10 | PY3 = sys.version_info[0] == 3
 11 | 
 12 | 
# Minimal declaration of std::stringstream: only str() is declared, which
# unparse() uses to read back the serialized document.
cdef extern from "<sstream>" namespace "std" nogil:
    cdef cppclass stringstream:
        string str() const
 16 | 
 17 | 
# Declarations for the parts of the pugixml C++ API made visible to this
# module.  Everything is declared ``nogil`` so it may be called inside
# ``with nogil`` sections.
cdef extern from "pugixml/src/pugixml.hpp" namespace "pugi" nogil:
    # Text object of a node, as returned by xml_node.text()
    cdef cppclass xml_text:
        bint empty() const
        const_char* get() const

    # Handle to a single attribute of an element
    cdef cppclass xml_attribute:
        const_char* name() const
        const_char* value() const
        bint empty() const

        # Get next/previous attribute in the attribute list of the parent node
        xml_attribute next_attribute() const
        xml_attribute previous_attribute() const

        # Set attribute name/value (returns false if attribute is empty or there
        # is not enough memory)
        bint set_name(const_char* rhs)
        bint set_value(const_char* rhs)

    cdef enum xml_node_type:
        node_null,         # Empty (null) node handle
        node_document,     # A document tree's absolute root
        node_element,      # Element tag, i.e. '<node/>'
        node_pcdata,       # Plain character data, i.e. 'text'
        node_cdata,        # Character data, i.e. '<![CDATA[text]]>'
        node_comment,      # Comment tag, i.e. '<!-- text -->'
        node_pi,           # Processing instruction, i.e. '<?name?>'
        node_declaration,  # Document declaration, i.e. '<?xml version="1.0"?>'
        node_doctype       # Document type declaration, i.e. '<!DOCTYPE doc>'

    cdef enum xml_encoding:
        encoding_auto,      # Auto-detect
        encoding_utf8,      # UTF8 encoding
        encoding_utf16_le,  # Little-endian UTF16
        encoding_utf16_be,  # Big-endian UTF16
        encoding_utf16,     # UTF16 with native endianness
        encoding_utf32_le,  # Little-endian UTF32
        encoding_utf32_be,  # Big-endian UTF32
        encoding_utf32,     # UTF32 with native endianness
        encoding_wchar,     # The same encoding wchar_t has (either UTF16 or UTF32)
        encoding_latin1

    # Handle to a node of the document tree
    cdef cppclass xml_node:
        # Check if node is empty.
        bint empty() const

        # Get node type
        xml_node_type type() const

        # Get node name, or "" if node is empty or it has no name
        const_char* name() const

        # Get node value, or "" if node is empty or it has no value
        # Note: For <node>text</node> node.value() does not return "text"!
        # Use child_value() or text() methods to access text inside nodes.
        const_char* value() const

        xml_attribute first_attribute() const
        xml_attribute last_attribute() const

        # Get children list
        xml_node first_child() const
        xml_node last_child() const

        # Get next/previous sibling in the children list of the parent node
        xml_node next_sibling() const
        xml_node previous_sibling() const

        # Get parent node
        xml_node parent() const

        # Get root of DOM tree this node belongs to
        xml_node root() const

        # Get text object for the current node
        xml_text text() const

        # Get child, attribute or next/previous sibling with the specified name
        xml_node child(const_char* name) const
        xml_attribute attribute(const_char* name) const
        xml_node next_sibling(const_char* name) const
        xml_node previous_sibling(const_char* name) const

        # Get child value of current node; that is, value of the first child
        # node of type PCDATA/CDATA
        const_char* child_value() const

        # Get child value of child with specified name.
        # Equivalent to child(name).child_value().
        const_char* child_value(const_char* name) const
        bint operator!() const

        # Set node name/value (returns false if node is empty,
        # there is not enough memory, or node can not have name/value)
        bint set_name(const_char* rhs)
        bint set_value(const_char* rhs)

        # Add attribute with specified name. Returns added attribute,
        # or empty attribute on errors.
        xml_attribute append_attribute(const_char* name)

        # Add child node with specified type. Returns added node,
        # or empty node on errors.
        xml_node append_child(const_char* name)
        xml_node append_child(xml_node_type type)

    # Result of xml_document.load_buffer(); parse() tests it for truth and
    # uses description()/offset to build its error message.
    cdef cppclass xml_parse_result:
        ptrdiff_t offset
        bint operator bool() const
        const_char* description() const

    # Declared without members; nothing in this module calls into it.
    cdef cppclass xml_writer:
        pass

    # A whole document; declared as a subclass of xml_node, so node methods
    # such as first_child() and append_child() are available on it.
    cdef cppclass xml_document(xml_node):
        xml_parse_result load_buffer(const char* contents, size_t size)
        void save(stringstream& stream, const_char* indent, unsigned int flags,
                  xml_encoding encoding) const
136 | 
137 | 
cdef unicodify(val):
    """Return ``val`` with every bytes object decoded as UTF-8.

    Lists and dicts are rebuilt recursively (dict keys are decoded as well);
    any other value is returned unchanged.
    """
    if isinstance(val, dict):
        return {
            (key.decode('utf-8') if isinstance(key, bytes) else key):
                unicodify(item)
            for key, item in val.items()
        }
    if isinstance(val, list):
        return [unicodify(item) for item in val]
    if isinstance(val, bytes):
        return val.decode('utf-8')
    return val
151 | 
152 | 
cdef deunicodify(val):
    """Return ``val`` with every unicode string encoded as UTF-8 bytes.

    Lists and dicts are rebuilt recursively (dict keys are encoded as well);
    any other value is returned unchanged.
    """
    if isinstance(val, dict):
        return {
            (key.encode('utf-8') if isinstance(key, unicode) else key):
                deunicodify(item)
            for key, item in val.items()
        }
    if isinstance(val, list):
        return [deunicodify(item) for item in val]
    if isinstance(val, unicode):
        return val.encode('utf-8')
    return val
166 | 
167 | 
cdef walk(xml_node node):
    """Convert ``node`` and its subtree into plain Python objects.

    Returns:
        * ``None`` when the node has no attributes, no child elements and no
          text left after stripping surrounding whitespace;
        * the stripped text (bytes) when the node holds only text;
        * otherwise a dict with ``b'@name'`` keys for attributes, one key per
          child tag (repeated tags are collected into a list) and
          ``b'#text'`` for the node's own text, if any.

    The text of *all* PCDATA/CDATA children is concatenated, so mixed
    content such as ``text<![CDATA[more]]>`` is not truncated to its first
    fragment.  Only attributes and child elements make the result a dict;
    other node kinds (comments, for instance) are ignored.
    """
    cdef xml_attribute attr = node.first_attribute()
    cdef xml_node child = node.first_child()
    cdef xml_node_type child_type
    cdef bytes tag
    cdef bytes text_val = b""
    cdef dict ret = {}

    while not attr.empty():
        ret[b'@' + attr.name()] = attr.value()
        attr = attr.next_attribute()

    while not child.empty():
        child_type = child.type()
        if child_type == node_element:
            tag = child.name()
            if tag in ret:
                # Second or later occurrence of a tag: promote to a list.
                if not isinstance(ret[tag], list):
                    ret[tag] = [ret[tag]]
                ret[tag].append(walk(child))
            else:
                ret[tag] = walk(child)
        elif child_type == node_cdata or child_type == node_pcdata:
            text_val += child.value()
        child = child.next_sibling()

    # Strip before testing so whitespace-only text is treated as absent.
    text_val = text_val.strip()
    if not ret:
        return text_val or None
    if text_val:
        ret[b'#text'] = text_val
    return ret
214 | 
215 | 
def parse(xml_input):
    """Parse an XML document and return it as a nested dictionary.

    ``xml_input`` may be bytes or a unicode string; unicode input is encoded
    as UTF-8 before parsing.  The result maps the name of the document's
    first child node to the value built by ``walk`` for it.  On Python 3 all
    bytes in the result are decoded to ``str``.

    Raises:
        ValueError: if pugixml reports a parse error.  The message contains
            pugixml's description and the offset of the error.
    """
    cdef xml_document doc
    cdef xml_parse_result result
    cdef xml_node root
    cdef const_char* input_str
    cdef size_t input_len
    cdef bytes description

    if isinstance(xml_input, unicode):
        xml_input = xml_input.encode('utf-8')

    # ``xml_input`` keeps the bytes object alive while ``input_str`` points
    # into its buffer.
    input_str = xml_input
    input_len = len(xml_input)

    with nogil:
        result = doc.load_buffer(input_str, input_len)
        root = doc.first_child()
    if result:
        ret = {root.name(): walk(root)}
        if PY3:
            return unicodify(ret)
        return ret
    else:
        # Decode explicitly: formatting bytes with %s on Python 3 would put
        # a "b'...'" repr into the error message.
        description = result.description()
        raise ValueError(
            '%s, at offset %d' % (description.decode('utf-8', 'replace'),
                                  result.offset))
240 | 
241 | 
cdef unwalk_list(xml_node parent, const_char* name, list val):
    """Append one ``name`` element to ``parent`` for every item of ``val``.

    Each item is serialized into its own new element via ``unwalk``.
    """
    for item in val:
        unwalk(parent.append_child(name), item)
246 | 
247 | 
cdef bytes _to_utf8(val):
    """Return ``val`` as UTF-8 encoded bytes suitable for pugixml.

    bytes pass through, ``None`` becomes an empty string, unicode is encoded
    and any other object is converted with ``str()`` first.
    """
    if val is None:
        return b''
    if isinstance(val, bytes):
        return val
    if not isinstance(val, unicode):
        val = str(val)
        if isinstance(val, bytes):
            # Python 2: str() already produced a byte string.
            return val
    return val.encode('utf-8')


cdef unwalk(xml_node parent, val):
    """Serialize ``val`` into the pugixml tree below ``parent``.

    A dict is expanded key by key: keys starting with ``@`` become
    attributes of ``parent``, ``#text`` adds text to ``parent``, a list value
    produces one child element per item and any other value produces a
    single child element.  Every non-dict value is appended to ``parent`` as
    a text node.

    Text and attribute values are normalised to UTF-8 bytes, so values such
    as integers work on Python 3 as well, where ``str()`` yields unicode.
    """
    if not isinstance(val, dict):
        parent.append_child(node_pcdata).set_value(_to_utf8(val))
        return

    for k, v in val.items():
        # Slicing gives a one-byte string on Python 2 and 3 alike and does
        # not fail on an empty key, unlike indexing with k[0].
        if k[:1] == b'@':
            parent.append_attribute(k[1:]).set_value(_to_utf8(v))
        elif k == b'#text':
            unwalk(parent, v)
        elif isinstance(v, list):
            unwalk_list(parent, k, v)
        else:
            unwalk(parent.append_child(<bytes>k), v)
267 | 
268 | 
def unparse(xml_dict):
    """Serialize a dictionary, as produced by ``parse``, into an XML string.

    The output starts with an XML declaration.  On Python 2 the declaration
    carries ``encoding="utf-8"`` and a byte string is returned; on Python 3
    the input is first converted to bytes with ``deunicodify`` and the result
    is returned as ``str``.
    """
    cdef xml_document document
    cdef stringstream buf
    cdef bytes serialized
    cdef xml_node declaration = document.append_child(node_declaration)

    declaration.append_attribute("version").set_value("1.0")
    if PY3:
        xml_dict = deunicodify(xml_dict)
    else:
        declaration.append_attribute("encoding").set_value("utf-8")

    unwalk(document, xml_dict)
    with nogil:
        document.save(buf, "", 0, encoding_utf8)  # no indent

    serialized = buf.str()
    return serialized.decode('utf-8') if PY3 else serialized
286 | 


--------------------------------------------------------------------------------