├── setup.cfg
├── MANIFEST.in
├── .gitmodules
├── .gitignore
├── setup.py
├── README.md
├── tests.py
└── pugixmltodict.pyx
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md
3 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include pugixml/src *.hpp *.cpp
2 | include pugixmltodict.pyx
3 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "pugixml"]
2 | path = pugixml
3 | url = https://github.com/zeux/pugixml
4 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 |
5 | # C extensions
6 | *.so
7 |
8 | # Distribution / packaging
9 | /build
10 | /dist
11 | /*.egg-info
12 |
13 | # Editor / devtools
14 | .*.sw[pon]
15 | \#*#
16 | /tags
17 |
18 | # Generated files
19 | pugixmltodict.cpp
20 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | import sys
3 | from distutils.core import setup, Extension
4 |
5 |
# `--cython` is a private switch understood only by this script: when given,
# the extension is regenerated from the .pyx source instead of being built
# from the already generated C++ file.  It is removed from sys.argv so that
# the command-line parsing done by setup() never sees it.
CYTHON_PARAM = '--cython'
USE_CYTHON = CYTHON_PARAM in sys.argv
if USE_CYTHON:
    sys.argv.remove(CYTHON_PARAM)


SOURCE_EXT = '.pyx' if USE_CYTHON else '.cpp'
EXT_MODULES = [
    Extension(
        'pugixmltodict',
        sources=['pugixmltodict' + SOURCE_EXT, 'pugixml/src/pugixml.cpp'],
    ),
]

if USE_CYTHON:
    # Imported lazily so that Cython is only needed when --cython is passed.
    from Cython.Build import cythonize
    EXT_MODULES = cythonize(EXT_MODULES)


CLASSIFIERS = [
    'Development Status :: 3 - Alpha',
    'Intended Audience :: Developers',
    'License :: OSI Approved :: MIT License',
    'Programming Language :: Cython',
    'Programming Language :: Python :: 2.7',
    'Programming Language :: Python :: 3',
    'Topic :: Text Processing :: Markup :: XML',
]

setup(
    name='pugixmltodict',
    version='0.5',
    description='A fast alternative to xmltodict library',
    url='https://github.com/sepeth/pugixmltodict',
    author='Doğan Çeçen',
    author_email='sepeth@gmail.com',
    classifiers=CLASSIFIERS,
    ext_modules=EXT_MODULES,
)
47 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # pugixmltodict
2 |
3 | ## Install
4 |
5 | ```sh
6 | pip install pugixmltodict
7 | ```
8 |
9 | ## Run tests
10 |
11 | ```sh
12 | python -m unittest tests
13 | ```
14 |
15 | ## Build from sources
16 |
17 | Install Cython first:
18 |
19 | ```sh
20 | pip install cython
21 | ```
22 |
23 | Build with `--cython` so that the Cython sources get compiled to C++:
24 |
25 | ```sh
26 | python setup.py build --cython
27 | ```
28 |
29 | _You can omit the parameter in later builds if you haven't made changes to `pugixmltodict.pyx`._
30 |
31 |
32 | ## License
33 |
34 | This library is available to anybody free of charge, under the terms of MIT License:
35 |
36 | Copyright (c) 2015 Doğan Çeçen
37 |
38 | Permission is hereby granted, free of charge, to any person obtaining a copy
39 | of this software and associated documentation files (the "Software"), to deal
40 | in the Software without restriction, including without limitation the rights
41 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
42 | copies of the Software, and to permit persons to whom the Software is
43 | furnished to do so, subject to the following conditions:
44 |
45 | The above copyright notice and this permission notice shall be included in
46 | all copies or substantial portions of the Software.
47 |
48 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
49 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
50 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
51 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
52 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
53 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
54 | THE SOFTWARE.
55 |
--------------------------------------------------------------------------------
/tests.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from pugixmltodict import parse, unparse
3 |
4 |
class XmlToDictTestCase(unittest.TestCase):
    """Checks that ``parse`` turns XML documents into the expected dicts."""

    def test_minimal(self):
        # An empty element maps to None.
        self.assertEqual(parse('<a/>'), {'a': None})

    def test_simple(self):
        # A text-only element maps to its text.
        self.assertEqual(parse('<a>data</a>'), {'a': 'data'})

    def test_list(self):
        # Repeated sibling tags are collected into a list.
        self.assertEqual(parse('<a><b>1</b><b>2</b><b>3</b></a>'),
                         {'a': {'b': ['1', '2', '3']}})

    def test_attrib(self):
        # Attributes are stored under '@'-prefixed keys.
        self.assertEqual(parse('<a href="xyz"/>'),
                         {'a': {'@href': 'xyz'}})

    def test_attrib_and_text(self):
        # Text next to attributes is stored under '#text'.
        self.assertEqual(parse('<a href="xyz">123</a>'),
                         {'a': {'@href': 'xyz', '#text': '123'}})

    def test_semi_structured(self):
        # Text fragments around child elements are concatenated.
        self.assertEqual(parse('<a>abc<b/>def</a>'),
                         {'a': {'b': None, '#text': 'abcdef'}})

    def test_nested_semi_structured(self):
        self.assertEqual(parse('<a>abc<b>123<c/>456</b>def</a>'),
                         {'a': {'#text': 'abcdef', 'b': {
                             '#text': '123456', 'c': None}}})

    def test_skip_whitespace(self):
        # Whitespace-only content must not show up in the result.
        xml = """
        <root>
          <emptya>           </emptya>
          <emptyb attr="attrvalue">
          </emptyb>
          <value>hello</value>
        </root>
        """
        self.assertEqual(
            parse(xml),
            {'root': {'emptya': None,
                      'emptyb': {'@attr': 'attrvalue'},
                      'value': 'hello'}})

    def test_namespace_ignore(self):
        # Namespace declarations and prefixes are kept verbatim, not resolved.
        xml = """
        <root xmlns="http://defaultns.com/"
              xmlns:a="http://a.com/"
              xmlns:b="http://b.com/">
          <x>1</x>
          <a:y>2</a:y>
          <b:z>3</b:z>
        </root>
        """
        d = {
            'root': {
                '@xmlns': 'http://defaultns.com/',
                '@xmlns:a': 'http://a.com/',
                '@xmlns:b': 'http://b.com/',
                'x': '1',
                'a:y': '2',
                'b:z': '3',
            },
        }
        self.assertEqual(parse(xml), d)

    def test_with_broken_attribute(self):
        # An attribute without a value is a parse error.
        with self.assertRaises(ValueError):
            parse('<a href>foo</a>')

    def test_with_mismatched_tag(self):
        # A closing tag that does not match the opening tag is a parse error.
        with self.assertRaises(ValueError):
            parse('<a>text</b>')
78 |
class DictToXmlTestCase(unittest.TestCase):
    """Round-trip checks for ``unparse``."""

    def _assert_round_trip(self, obj):
        """Assert that ``obj`` survives dict -> XML -> dict -> XML unchanged."""
        xml = unparse(obj)
        self.assertEqual(obj, parse(xml))
        self.assertEqual(xml, unparse(parse(xml)))

    def test_root(self):
        self._assert_round_trip({'a': None})

    def test_simple_text(self):
        self._assert_round_trip({'a': 'b'})

    def test_attrib(self):
        self._assert_round_trip({'a': {'@href': 'x'}})

    def test_attrib_and_text(self):
        self._assert_round_trip({'a': {'@href': 'x', '#text': 'y'}})

    def test_list(self):
        self._assert_round_trip({'a': {'b': ['1', '2', '3']}})
104 |
--------------------------------------------------------------------------------
/pugixmltodict.pyx:
--------------------------------------------------------------------------------
1 | # distutils: language = c++
2 |
3 | from __future__ import unicode_literals
4 | import sys
5 | from libc.string cimport const_char
6 | from libc.stddef cimport ptrdiff_t
7 | from libcpp.string cimport string
8 |
9 |
# True when running under Python 3; selects between text and bytes results.
PY3 = sys.version_info.major == 3
11 |
12 |
# std::stringstream is declared in the standard <sstream> header; only the
# str() accessor is declared here.
cdef extern from "<sstream>" namespace "std" nogil:
    cdef cppclass stringstream:
        string str() const
17 |
cdef extern from "pugixml/src/pugixml.hpp" namespace "pugi" nogil:
    # Declarations of the pugixml classes, enums and methods made available
    # to this module.  Everything is declared `nogil`, so the calls may be
    # made with the GIL released.

    # Text accessor object, as returned by xml_node.text().
    cdef cppclass xml_text:
        bint empty() const
        const_char* get() const

    cdef cppclass xml_attribute:
        const_char* name() const
        const_char* value() const
        bint empty() const

        # Get next/previous attribute in the attribute list of the parent node
        xml_attribute next_attribute() const
        xml_attribute previous_attribute() const

        # Set attribute name/value (returns false if attribute is empty or there
        # is not enough memory)
        bint set_name(const_char* rhs)
        bint set_value(const_char* rhs)

    cdef enum xml_node_type:
        node_null,  # Empty (null) node handle
        node_document,  # A document tree's absolute root
        node_element,  # Element tag, i.e. '<node/>'
        node_pcdata,  # Plain character data, i.e. 'text'
        node_cdata,  # Character data, i.e. '<![CDATA[text]]>'
        node_comment,  # Comment tag, i.e. '<!-- text -->'
        node_pi,  # Processing instruction, i.e. '<?name?>'
        node_declaration,  # Document declaration, i.e. '<?xml version="1.0"?>'
        node_doctype  # Document type declaration, i.e. '<!DOCTYPE doc>'

    cdef enum xml_encoding:
        encoding_auto,  # Auto-detect
        encoding_utf8,  # UTF8 encoding
        encoding_utf16_le,  # Little-endian UTF16
        encoding_utf16_be,  # Big-endian UTF16
        encoding_utf16,  # UTF16 with native endianness
        encoding_utf32_le,  # Little-endian UTF32
        encoding_utf32_be,  # Big-endian UTF32
        encoding_utf32,  # UTF32 with native endianness
        encoding_wchar,  # The same encoding wchar_t has (either UTF16 or UTF32)
        encoding_latin1

    cdef cppclass xml_node:
        # Check if node is empty.
        bint empty() const

        # Get node type
        xml_node_type type() const

        # Get node name, or "" if node is empty or it has no name
        const_char* name() const

        # Get node value, or "" if node is empty or it has no value
        # Note: For <node>text</node> node.value() does not return "text"!
        # Use child_value() or text() methods to access text inside nodes.
        const_char* value() const

        xml_attribute first_attribute() const
        xml_attribute last_attribute() const

        # Get children list
        xml_node first_child() const
        xml_node last_child() const

        # Get next/previous sibling in the children list of the parent node
        xml_node next_sibling() const
        xml_node previous_sibling() const

        # Get parent node
        xml_node parent() const

        # Get root of DOM tree this node belongs to
        xml_node root() const

        # Get text object for the current node
        xml_text text() const

        # Get child, attribute or next/previous sibling with the specified name
        xml_node child(const_char* name) const
        xml_attribute attribute(const_char* name) const
        xml_node next_sibling(const_char* name) const
        xml_node previous_sibling(const_char* name) const

        # Get child value of current node; that is, value of the first child
        # node of type PCDATA/CDATA
        const_char* child_value() const

        # Get child value of child with specified name.
        # Equivalent to child(name).child_value().
        const_char* child_value(const_char* name) const
        bint operator!() const

        # Set node name/value (returns false if node is empty,
        # there is not enough memory, or node can not have name/value)
        bint set_name(const_char* rhs)
        bint set_value(const_char* rhs)

        # Add attribute with specified name. Returns added attribute,
        # or empty attribute on errors.
        xml_attribute append_attribute(const_char* name)

        # Add child node with specified type. Returns added node,
        # or empty node on errors.
        xml_node append_child(const_char* name)
        xml_node append_child(xml_node_type type)

    # Outcome of a load; parse() tests it for truth and uses description()
    # and offset to build its error message.
    cdef cppclass xml_parse_result:
        ptrdiff_t offset
        bint operator bool() const
        const_char* description() const

    # Declared without members; not referenced in the visible part of this
    # module.
    cdef cppclass xml_writer:
        pass

    # Document object: an xml_node that can also load from a memory buffer
    # (used by parse()) and be saved into a stringstream (used by unparse()).
    cdef cppclass xml_document(xml_node):
        xml_parse_result load_buffer(const char* contents, size_t size)
        void save(stringstream& stream, const_char* indent, unsigned int flags,
                  xml_encoding encoding) const
136 |
137 |
cdef unicodify(val):
    """Return a copy of ``val`` with every bytes object decoded as UTF-8.

    Dicts (keys and values) and lists are processed recursively; any other
    object is returned unchanged.
    """
    if isinstance(val, dict):
        decoded = {}
        for key, item in val.items():
            new_key = key.decode('utf-8') if isinstance(key, bytes) else key
            decoded[new_key] = unicodify(item)
        return decoded
    if isinstance(val, list):
        return [unicodify(item) for item in val]
    if isinstance(val, bytes):
        return val.decode('utf-8')
    return val
151 |
152 |
cdef deunicodify(val):
    """Return a copy of ``val`` with every text string encoded as UTF-8.

    Dicts (keys and values) and lists are processed recursively; any other
    object is returned unchanged.
    """
    if isinstance(val, dict):
        encoded = {}
        for key, item in val.items():
            new_key = key.encode('utf-8') if isinstance(key, unicode) else key
            encoded[new_key] = deunicodify(item)
        return encoded
    if isinstance(val, list):
        return [deunicodify(item) for item in val]
    if isinstance(val, unicode):
        return val.encode('utf-8')
    return val
166 |
167 |
cdef walk(xml_node node):
    """Convert the subtree rooted at ``node`` into plain Python objects.

    Returns:
        * ``None`` for a node without attributes, children or text;
        * the stripped text (bytes) for a node that has neither attributes
          nor non-text children;
        * otherwise a dict mapping ``b'@' + name`` to attribute values,
          child tag names to the converted children (a list when the tag is
          repeated) and ``b'#text'`` to the stripped, concatenated text.

    The text of a node is always the concatenation of *all* its PCDATA and
    CDATA children, so a text-only node split into several text nodes (for
    example plain text followed by a CDATA section) is not truncated to its
    first fragment.
    """
    cdef xml_attribute attr = node.first_attribute()
    cdef xml_node child = node.first_child()
    cdef xml_node_type child_type
    cdef bint has_attrs = not attr.empty()
    cdef bint has_children = 0
    cdef bytes tag
    cdef bytes text_val
    cdef list text_parts = []
    cdef dict ret = {}

    while not attr.empty():
        ret[b'@' + attr.name()] = attr.value()
        attr = attr.next_attribute()

    while not child.empty():
        child_type = child.type()
        if child_type == node_cdata or child_type == node_pcdata:
            # Collected in a list and joined once, instead of repeated
            # bytes concatenation.
            text_parts.append(child.value())
        else:
            # Any non-text child makes this a structured node, even when it
            # contributes no key to the result.
            has_children = 1
            if child_type == node_element:
                tag = child.name()
                if tag in ret:
                    existing = ret[tag]
                    if isinstance(existing, list):
                        existing.append(walk(child))
                    else:
                        # Second occurrence of the tag: promote to a list.
                        ret[tag] = [existing, walk(child)]
                else:
                    ret[tag] = walk(child)
        child = child.next_sibling()

    if not has_attrs and not has_children:
        # Leaf node: its value is the text itself, or None without any.
        if not text_parts:
            return None
        return b''.join(text_parts).strip()

    text_val = b''.join(text_parts)
    if text_val:
        ret[b'#text'] = text_val.strip()

    return ret or None
214 |
215 |
def parse(xml_input):
    """Parse an XML document and return it as a nested dict.

    Args:
        xml_input: the document, as bytes or as text (text is encoded as
            UTF-8 before parsing).

    Returns:
        A dict with one entry: the name of the document's first child node
        mapped to the converted content of that node.  Under Python 3 all
        strings in the result are text; otherwise they are byte strings.

    Raises:
        ValueError: if pugixml reports a parse error; the message holds
            pugixml's description and the offset of the error.
    """
    cdef xml_document doc
    cdef xml_parse_result result
    cdef xml_node root
    cdef const_char* input_str
    cdef size_t input_len
    cdef bytes description

    if isinstance(xml_input, unicode):
        xml_input = xml_input.encode('utf-8')

    input_str = xml_input
    input_len = len(xml_input)

    # Parsing touches no Python objects, so the GIL can be released.
    with nogil:
        result = doc.load_buffer(input_str, input_len)
        root = doc.first_child()
    if result:
        ret = {root.name(): walk(root)}
        if PY3:
            return unicodify(ret)
        return ret
    else:
        # description() is a C string; decode it so that the message does
        # not read "b'...'" under Python 3.
        description = result.description()
        raise ValueError(
            '%s, at offset %d' % (description.decode('utf-8'), result.offset))
240 |
241 |
cdef unwalk_list(xml_node parent, const_char* name, list val):
    """Append one child element called ``name`` to ``parent`` per list item.

    Each item of ``val`` is written into its own new element by ``unwalk``.
    """
    for item in val:
        unwalk(parent.append_child(name), item)
246 |
247 |
cdef _to_utf8(val):
    """Return ``val`` as UTF-8 encoded bytes that can be handed to pugixml.

    ``None`` becomes ``b''``, bytes are returned as they are, text is
    encoded, and any other object is converted with ``str()`` first.
    """
    if val is None:
        return b''
    if isinstance(val, bytes):
        return val
    if not isinstance(val, unicode):
        # str() gives text on Python 3 but a byte string on Python 2.
        val = str(val)
        if isinstance(val, bytes):
            return val
    return val.encode('utf-8')


cdef unwalk(xml_node parent, val):
    """Write the Python value ``val`` below the pugixml node ``parent``.

    A dict is expanded key by key: keys starting with ``@`` become
    attributes of ``parent``, the key ``#text`` becomes text of ``parent``,
    a list value becomes one child element per item and any other value
    becomes a single child element.  A non-dict value is appended to
    ``parent`` as a text node (``None`` as empty text, other objects via
    ``str()``).

    Raises:
        TypeError: if a dict key is neither bytes nor text.
        ValueError: if a dict key is empty.
    """
    cdef bytes name
    cdef const_char* c_name

    if not isinstance(val, dict):
        parent.append_child(node_pcdata).set_value(_to_utf8(val))
        return

    for k, v in val.items():
        # Normalise the key to bytes; assigning anything that is not bytes
        # to the typed variable raises TypeError.
        name = k.encode('utf-8') if isinstance(k, unicode) else k
        if not name:
            raise ValueError('element and attribute names must not be empty')
        if name.startswith(b'@'):
            parent.append_attribute(name[1:]).set_value(_to_utf8(v))
        elif name == b'#text':
            unwalk(parent, v)
        else:
            # Go through an explicit C string so that the const char*
            # overload of append_child is selected; `name` keeps the buffer
            # alive for the duration of the calls.
            c_name = name
            if isinstance(v, list):
                unwalk_list(parent, c_name, v)
            else:
                unwalk(parent.append_child(c_name), v)
267 |
268 |
def unparse(xml_dict):
    """Serialize ``xml_dict`` into an XML document and return it.

    The document starts with an XML declaration carrying ``version="1.0"``.
    Under Python 3 the input is first converted to UTF-8 bytes and the
    result is returned as text; otherwise the declaration also gets
    ``encoding="utf-8"`` and the result is returned as a byte string.
    """
    cdef xml_document doc
    cdef stringstream stream
    cdef bytes xml_bytes
    cdef xml_node declaration = doc.append_child(node_declaration)

    declaration.append_attribute("version").set_value("1.0")
    if PY3:
        xml_dict = deunicodify(xml_dict)
    else:
        declaration.append_attribute("encoding").set_value("utf-8")

    unwalk(doc, xml_dict)
    # Saving works on C++ objects only, so the GIL can be released.
    with nogil:
        doc.save(stream, "", 0, encoding_utf8)  # empty indent string
    xml_bytes = stream.str()
    return xml_bytes.decode('utf-8') if PY3 else xml_bytes
286 |
--------------------------------------------------------------------------------