├── setup.cfg ├── MANIFEST.in ├── .gitmodules ├── .gitignore ├── setup.py ├── README.md ├── tests.py └── pugixmltodict.pyx /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include pugixml/src *.hpp *.cpp 2 | include pugixmltodict.pyx 3 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "pugixml"] 2 | path = pugixml 3 | url = https://github.com/zeux/pugixml 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | /build 10 | /dist 11 | /*.egg-info 12 | 13 | # Editor / devtools 14 | .*.sw[pon] 15 | \#*# 16 | /tags 17 | 18 | # Generated files 19 | pugixmltodict.cpp 20 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys 3 | from distutils.core import setup, Extension 4 | 5 | 6 | USE_CYTHON = False 7 | CYTHON_PARAM = '--cython' 8 | if CYTHON_PARAM in sys.argv: 9 | USE_CYTHON = True 10 | sys.argv.remove(CYTHON_PARAM) 11 | 12 | 13 | SOURCE_EXT = '.pyx' if USE_CYTHON else '.cpp' 14 | EXT_MODULES = [Extension( 15 | 'pugixmltodict', 16 | sources=[ 17 | 'pugixmltodict' + SOURCE_EXT, 18 | 'pugixml/src/pugixml.cpp', 19 | ], 20 | )] 21 | 22 | if USE_CYTHON: 23 | from Cython.Build import cythonize 24 | EXT_MODULES = cythonize(EXT_MODULES) 25 
| 26 | 27 | setup( 28 | name='pugixmltodict', 29 | version='0.5', 30 | description='A fast alternative to xmltodict library', 31 | url='https://github.com/sepeth/pugixmltodict', 32 | author='Doğan Çeçen', 33 | author_email='sepeth@gmail.com', 34 | 35 | classifiers=[ 36 | 'Development Status :: 3 - Alpha', 37 | 'Intended Audience :: Developers', 38 | 'License :: OSI Approved :: MIT License', 39 | 'Programming Language :: Cython', 40 | 'Programming Language :: Python :: 2.7', 41 | 'Programming Language :: Python :: 3', 42 | 'Topic :: Text Processing :: Markup :: XML', 43 | ], 44 | 45 | ext_modules=EXT_MODULES, 46 | ) 47 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pugixmltodict 2 | 3 | ## Install 4 | 5 | ```sh 6 | pip install pugixmltodict 7 | ``` 8 | 9 | ## Run tests 10 | 11 | ```sh 12 | python -m unittest tests 13 | ``` 14 | 15 | ## Build from sources 16 | 17 | Install Cython first: 18 | 19 | ```sh 20 | pip install cython 21 | ``` 22 | 23 | Build with `--cython` so does Cython sources get compiled to C++: 24 | 25 | ```sh 26 | python setup.py build --cython 27 | ``` 28 | 29 | _You can omit the parameter at later builds if you haven't made changes to `pugixmltodict.pyx`. 
30 | 31 | 32 | ## License 33 | 34 | This library is available to anybody free of charge, under the terms of MIT License: 35 | 36 | Copyright (c) 2015 Doğan Çeçen 37 | 38 | Permission is hereby granted, free of charge, to any person obtaining a copy 39 | of this software and associated documentation files (the "Software"), to deal 40 | in the Software without restriction, including without limitation the rights 41 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 42 | copies of the Software, and to permit persons to whom the Software is 43 | furnished to do so, subject to the following conditions: 44 | 45 | The above copyright notice and this permission notice shall be included in 46 | all copies or substantial portions of the Software. 47 | 48 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 49 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 50 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 51 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 52 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 53 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 54 | THE SOFTWARE. 

--------------------------------------------------------------------------------
/tests.py:
--------------------------------------------------------------------------------
import unittest
from pugixmltodict import parse, unparse


# NOTE(review): the XML markup inside the string literals below had been
# stripped from this copy of the file; each literal was reconstructed from
# the dict the test expects -- confirm against the upstream repository.
class XmlToDictTestCase(unittest.TestCase):
    """Checks the dict produced by parse() for small XML documents."""

    def test_minimal(self):
        self.assertEqual(parse('<a/>'), {'a': None})

    def test_simple(self):
        self.assertEqual(parse('<a>data</a>'), {'a': 'data'})

    def test_list(self):
        self.assertEqual(parse('<a><b>1</b><b>2</b><b>3</b></a>'),
                         {'a': {'b': ['1', '2', '3']}})

    def test_attrib(self):
        self.assertEqual(parse('<a href="xyz"/>'),
                         {'a': {'@href': 'xyz'}})

    def test_attrib_and_text(self):
        self.assertEqual(parse('<a href="xyz">123</a>'),
                         {'a': {'@href': 'xyz', '#text': '123'}})

    def test_semi_structured(self):
        self.assertEqual(parse('<a>abc<b/>def</a>'),
                         {'a': {'b': None, '#text': 'abcdef'}})

    def test_nested_semi_structured(self):
        self.assertEqual(parse('<a>abc<b>123<c/>456</b>def</a>'),
                         {'a': {'#text': 'abcdef', 'b': {
                             '#text': '123456', 'c': None}}})

    def test_skip_whitespace(self):
        xml = """
        <root>
          <emptya>           </emptya>
          <emptyb attr="attrvalue">
          </emptyb>
          <value>hello</value>
        </root>
        """
        self.assertEqual(
            parse(xml),
            {'root': {'emptya': None,
                      'emptyb': {'@attr': 'attrvalue'},
                      'value': 'hello'}})

    def test_namespace_ignore(self):
        xml = """
        <root xmlns="http://defaultns.com/"
              xmlns:a="http://a.com/"
              xmlns:b="http://b.com/">
          <x>1</x>
          <a:y>2</a:y>
          <b:z>3</b:z>
        </root>
        """
        d = {
            'root': {
                '@xmlns': 'http://defaultns.com/',
                '@xmlns:a': 'http://a.com/',
                '@xmlns:b': 'http://b.com/',
                'x': '1',
                'a:y': '2',
                'b:z': '3',
            },
        }
        self.assertEqual(parse(xml), d)

    def test_with_broken_attribute(self):
        with self.assertRaises(ValueError):
            parse('<a attr>foo</a>')

    def test_with_mismatched_tag(self):
        with self.assertRaises(ValueError):
            parse('<a>text</b>')


class DictToXmlTestCase(unittest.TestCase):
    """Checks that unparse() output survives a parse()/unparse() round trip."""

    def _assert_roundtrip(self, obj):
        # dict -> XML -> dict must give the dict back, and
        # XML -> dict -> XML must give the same XML text back.
        xml = unparse(obj)
        self.assertEqual(obj, parse(xml))
        self.assertEqual(xml, unparse(parse(xml)))

    def test_root(self):
        self._assert_roundtrip({'a': None})

    def test_simple_text(self):
        self._assert_roundtrip({'a': 'b'})

    def test_attrib(self):
        self._assert_roundtrip({'a': {'@href': 'x'}})

    def test_attrib_and_text(self):
        self._assert_roundtrip({'a': {'@href': 'x', '#text': 'y'}})

    def test_list(self):
        self._assert_roundtrip({'a': {'b': ['1', '2', '3']}})

--------------------------------------------------------------------------------
/pugixmltodict.pyx:
--------------------------------------------------------------------------------
# distutils: language = c++

from __future__ import unicode_literals
import sys
from libc.string cimport const_char
from libc.stddef cimport ptrdiff_t
from libcpp.string cimport string


PY3 = sys.version_info[0] == 3


cdef extern from "<sstream>" namespace "std" nogil:
    cdef cppclass stringstream:
        string str() const


cdef extern from "pugixml/src/pugixml.hpp" namespace "pugi" nogil:
    cdef cppclass xml_text:
        bint empty() const
        const_char* get() const

    cdef cppclass xml_attribute:
        const_char* name() const
        const_char* value() const
        bint empty() const

        # Get next/previous attribute in the attribute list of the parent node
        xml_attribute next_attribute() const
        xml_attribute previous_attribute() const

        # Set attribute name/value (returns false if attribute is empty or there
        # is not enough memory)
        bint set_name(const_char* rhs)
        bint set_value(const_char* rhs)

    cdef enum xml_node_type:
        node_null,         # Empty (null) node handle
        node_document,     # A document tree's absolute root
        node_element,      # Element tag, i.e. '<node/>'
        node_pcdata,       # Plain character data, i.e. 'text'
        node_cdata,        # Character data, i.e. '<![CDATA[text]]>'
        node_comment,      # Comment tag, i.e. '<!-- text -->'
        node_pi,           # Processing instruction, i.e. '<?name?>'
        node_declaration,  # Document declaration, i.e. '<?xml version="1.0"?>'
        node_doctype       # Document type declaration, i.e. '<!DOCTYPE doc>'

    cdef enum xml_encoding:
        encoding_auto,      # Auto-detect
        encoding_utf8,      # UTF8 encoding
        encoding_utf16_le,  # Little-endian UTF16
        encoding_utf16_be,  # Big-endian UTF16
        encoding_utf16,     # UTF16 with native endianness
        encoding_utf32_le,  # Little-endian UTF32
        encoding_utf32_be,  # Big-endian UTF32
        encoding_utf32,     # UTF32 with native endianness
        encoding_wchar,     # The same encoding wchar_t has (either UTF16 or UTF32)
        encoding_latin1

    cdef cppclass xml_node:
        # Check if node is empty.
        bint empty() const

        # Get node type
        xml_node_type type() const

        # Get node name, or "" if node is empty or it has no name
        const_char* name() const

        # Get node value, or "" if node is empty or it has no value
        # Note: For <node>text</node> node.value() does not return "text"!
        # Use child_value() or text() methods to access text inside nodes.
        const_char* value() const

        xml_attribute first_attribute() const
        xml_attribute last_attribute() const

        # Get children list
        xml_node first_child() const
        xml_node last_child() const

        # Get next/previous sibling in the children list of the parent node
        xml_node next_sibling() const
        xml_node previous_sibling() const

        # Get parent node
        xml_node parent() const

        # Get root of DOM tree this node belongs to
        xml_node root() const

        # Get text object for the current node
        xml_text text() const

        # Get child, attribute or next/previous sibling with the specified name
        xml_node child(const_char* name) const
        xml_attribute attribute(const_char* name) const
        xml_node next_sibling(const_char* name) const
        xml_node previous_sibling(const_char* name) const

        # Get child value of current node; that is, value of the first child
        # node of type PCDATA/CDATA
        const_char* child_value() const

        # Get child value of child with specified name.
        # Equivalent to child(name).child_value().
        const_char* child_value(const_char* name) const
        bint operator!() const

        # Set node name/value (returns false if node is empty,
        # there is not enough memory, or node can not have name/value)
        bint set_name(const_char* rhs)
        bint set_value(const_char* rhs)

        # Add attribute with specified name. Returns added attribute,
        # or empty attribute on errors.
        xml_attribute append_attribute(const_char* name)

        # Add child node with specified type. Returns added node,
        # or empty node on errors.
        xml_node append_child(const_char* name)
        xml_node append_child(xml_node_type type)

    cdef cppclass xml_parse_result:
        ptrdiff_t offset
        bint operator bool() const
        const_char* description() const

    cdef cppclass xml_writer:
        pass

    cdef cppclass xml_document(xml_node):
        xml_parse_result load_buffer(const char* contents, size_t size)
        void save(stringstream& stream, const_char* indent, unsigned int flags,
                  xml_encoding encoding) const


cdef unicodify(val):
    """Return a copy of *val* with every bytes object decoded as UTF-8.

    Lists and dicts (keys and values) are converted recursively; any other
    object is returned unchanged.
    """
    if isinstance(val, bytes):
        return val.decode('utf-8')
    if isinstance(val, list):
        return [unicodify(item) for item in val]
    if isinstance(val, dict):
        return {
            (key.decode('utf-8') if isinstance(key, bytes) else key):
                unicodify(item)
            for key, item in val.items()
        }
    return val


cdef deunicodify(val):
    """Return a copy of *val* with every unicode string encoded as UTF-8.

    Lists and dicts (keys and values) are converted recursively; any other
    object is returned unchanged.
    """
    if isinstance(val, unicode):
        return val.encode('utf-8')
    if isinstance(val, list):
        return [deunicodify(item) for item in val]
    if isinstance(val, dict):
        return {
            (key.encode('utf-8') if isinstance(key, unicode) else key):
                deunicodify(item)
            for key, item in val.items()
        }
    return val


cdef bytes _to_bytes(val):
    """Return *val* as UTF-8 bytes that can be handed to a pugixml setter.

    None becomes the empty string, bytes pass through, unicode is encoded
    and anything else is converted to text first.
    """
    if val is None:
        return b''
    if isinstance(val, bytes):
        return val
    if isinstance(val, unicode):
        return val.encode('utf-8')
    return unicode(val).encode('utf-8')


cdef walk(xml_node node):
    """Convert *node* and its subtree into plain Python objects.

    Returns None for an element without attributes, children or text, the
    stripped text (bytes) for an element that only holds text, and otherwise
    a dict: attributes are stored under b'@name', child elements under their
    tag name (repeated tags are collected into a list) and the concatenated
    text under b'#text'.
    """
    cdef xml_attribute attr = node.first_attribute()
    cdef xml_node child = node.first_child()
    cdef xml_node_type child_type
    cdef bint has_attrs = not attr.empty()
    cdef bint has_children = 0
    cdef bint has_text = 0
    cdef bytes tag
    cdef bytes text_val = b""
    cdef dict ret = {}

    while not attr.empty():
        ret[b'@' + attr.name()] = attr.value()
        attr = attr.next_attribute()

    while not child.empty():
        child_type = child.type()
        if child_type == node_cdata or child_type == node_pcdata:
            # Text may be split over several PCDATA/CDATA nodes; join them
            # all so text-only elements and mixed content agree.
            has_text = 1
            text_val += child.value()
        else:
            has_children = 1
            if child_type == node_element:
                tag = child.name()
                if tag in ret:
                    # Second occurrence of a tag turns the entry into a list.
                    if not isinstance(ret[tag], list):
                        ret[tag] = [ret[tag]]
                    ret[tag].append(walk(child))
                else:
                    ret[tag] = walk(child)
        child = child.next_sibling()

    if not has_attrs and not has_children:
        if not has_text:
            return None
        return text_val.strip()

    if text_val:
        ret[b'#text'] = text_val.strip()

    return ret or None


def parse(xml_input):
    """Parse an XML document into a dict keyed by the root element name.

    *xml_input* may be bytes or a unicode string (encoded as UTF-8 before
    parsing). On Python 3 all keys and values of the result are str; on
    Python 2 they are byte strings.

    Raises ValueError, carrying pugixml's description and the byte offset,
    when the document cannot be parsed.
    """
    cdef xml_document doc
    cdef xml_parse_result result
    cdef xml_node root
    cdef const_char* input_str
    cdef size_t input_len
    cdef bytes description

    if isinstance(xml_input, unicode):
        xml_input = xml_input.encode('utf-8')

    input_str = xml_input
    input_len = len(xml_input)

    # The buffer is plain C data at this point, so the GIL can be released
    # while pugixml parses it.
    with nogil:
        result = doc.load_buffer(input_str, input_len)

    if result:
        root = doc.first_child()
        ret = {root.name(): walk(root)}
        if PY3:
            return unicodify(ret)
        return ret
    else:
        # Decode first: formatting bytes with %s would render as b'...'
        # on Python 3.
        description = result.description()
        raise ValueError(
            '%s, at offset %d' % (description.decode('utf-8'), result.offset))


cdef unwalk_list(xml_node parent, const_char* name, list val):
    """Append one child element called *name* to *parent* per item of *val*."""
    cdef xml_node node
    for sub in val:
        node = parent.append_child(name)
        unwalk(node, sub)


cdef unwalk(xml_node parent, val):
    """Append the XML representation of *val* below *parent*.

    A dict adds attributes for keys starting with '@', text for the '#text'
    key, repeated elements for list values and a single child element for
    everything else. Any other value (including None, which becomes an empty
    string) is added as a text node.
    """
    cdef bytes key
    cdef bytes attr_name
    cdef bytes data

    if isinstance(val, dict):
        for k, v in val.items():
            key = _to_bytes(k)
            # Slicing instead of indexing gives bytes on both Python 2 and 3
            # and does not fail on an empty key.
            if key[:1] == b'@':
                attr_name = key[1:]
                data = _to_bytes(v)
                parent.append_attribute(attr_name).set_value(data)
            elif key == b'#text':
                unwalk(parent, v)
            elif isinstance(v, list):
                unwalk_list(parent, key, v)
            else:
                unwalk(parent.append_child(key), v)
    else:
        data = _to_bytes(val)
        parent.append_child(node_pcdata).set_value(data)


def unparse(xml_dict):
    """Serialize a dict in the layout produced by parse() to an XML string.

    The output starts with an XML declaration and is written without
    indentation. Python 3 returns str; Python 2 returns UTF-8 bytes and
    names that encoding in the declaration.
    """
    cdef xml_document doc
    cdef stringstream ss
    cdef bytes ret
    cdef xml_node decl = doc.append_child(node_declaration)
    decl.append_attribute(b"version").set_value(b"1.0")
    if not PY3:
        decl.append_attribute(b"encoding").set_value(b"utf-8")
    # pugixml only takes byte strings, so unicode keys and values are
    # encoded up front on every Python version.
    unwalk(doc, deunicodify(xml_dict))
    with nogil:
        doc.save(ss, b"", 0, encoding_utf8)  # no indent
    ret = ss.str()
    if PY3:
        return ret.decode('utf-8')
    return ret

--------------------------------------------------------------------------------