├── .gitignore ├── README.md ├── lua_zxml.c ├── makefile ├── test_zxml.lua ├── zxml.h ├── zxml_parser.c └── zxml_test.c /.gitignore: -------------------------------------------------------------------------------- 1 | *.so 2 | *.dSYM 3 | *.log 4 | *.o 5 | *.xml -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | zxml is a zero-copy, efficient, and simple XML parsing library for C. [zxml.h](https://github.com/lvzixun/zxml/blob/master/zxml.h) is the C API header file. Read [zxml_parser.c](https://github.com/lvzixun/zxml/blob/master/zxml_parser.c) for more details. 3 | 4 | [lua_zxml.c](https://github.com/lvzixun/zxml/blob/master/lua_zxml.c) is a Lua binding for decoding Excel XML 2003 format files. Read [test_zxml.lua](https://github.com/lvzixun/zxml/blob/master/test_zxml.lua) for more details. 5 | 6 | ### benchmark 7 | 8 | | library | parse 200M xml file | 9 | |:-------:|:--------------------:| 10 | | zxml | 0.901s | 11 | | rapidxml | 1.287s | 12 | | tinyxml2 | 3.227s | 13 | | xml.sax of python | 14.996s | 14 | -------------------------------------------------------------------------------- /lua_zxml.c: -------------------------------------------------------------------------------- 1 | #include "zxml.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | static struct { 10 | struct xml_context* context; 11 | size_t memory_size; 12 | }LAST_ZXML_CONTEXT; 13 | 14 | #define get_context() (LAST_ZXML_CONTEXT.context) 15 | 16 | #define do_resolve_children(element, f, idx, params) do{ \ 17 | struct xml_node* c = (element)->children_head; \ 18 | while(c) { \ 19 | f(L, c, (idx), (params)); \ 20 | c = c->next; \ 21 | } \ 22 | }while(0) 23 | 24 | #define lua_pushxmlstr(L, xmlstr) (lua_pushlstring((L), (xmlstr)->str, (xmlstr)->size)) 25 | 26 | struct xml_params { 27 | unsigned int worksheet; 28 | unsigned int table; 29 | 
unsigned int row; 30 | unsigned int cell; 31 | unsigned int children; 32 | unsigned int max_col_of_row1; 33 | luaL_Buffer* data_buffer; 34 | }; 35 | 36 | 37 | static void 38 | _init_context(lua_State* L, size_t new_memory_size) { 39 | if(get_context() == NULL || LAST_ZXML_CONTEXT.memory_size < new_memory_size) { 40 | if(get_context() != NULL) { 41 | xml_destory(get_context()); 42 | } 43 | LAST_ZXML_CONTEXT.context = xml_create(new_memory_size); 44 | if(get_context() == NULL) { 45 | luaL_error(L, "new context error"); 46 | } 47 | LAST_ZXML_CONTEXT.memory_size = new_memory_size; 48 | } else { 49 | xml_reset(get_context()); 50 | } 51 | } 52 | 53 | 54 | inline static bool 55 | _check_name(struct xml_str* str, const char* s) { 56 | size_t l = strlen(s); 57 | return (l == str->size) && (memcmp(str->str, s, l)==0); 58 | } 59 | 60 | 61 | static struct xml_str* 62 | _get_propertyvalue(struct xml_property* attrs, const char* s) { 63 | struct xml_property* p = attrs; 64 | while(p) { 65 | if(_check_name(&p->field_name, s)) { 66 | return &p->field_value; 67 | } 68 | p = p->next; 69 | } 70 | return NULL; 71 | } 72 | 73 | 74 | static struct escape_str { 75 | const char* s; 76 | char c; 77 | } ESCAPE_LIST[] = { 78 | {NULL, '\0'}, 79 | }; 80 | 81 | inline static void 82 | _do_content(lua_State* L, struct xml_str* content, luaL_Buffer* data_buffer) { 83 | int l = content->size; 84 | const char* s = content->str; 85 | const char* sub_str = s; 86 | int sub_sz = 0; 87 | for(int i=0; i=1; k--) { 157 | char kc = *(sub_str+k); 158 | if(kc>='0' && kc<='9') { 159 | int cv = (int)(kc - '0'); 160 | cv *= base; 161 | v += cv; 162 | base *= 10; 163 | }else { 164 | luaL_error(L, "invalid &# escape char"); 165 | } 166 | } 167 | if(v >0 && v<0x7f) { 168 | ret_char = (char)v; 169 | } else { 170 | ret_char = ' '; // convert invalid ascii code to space 171 | } 172 | } else { 173 | goto _DO_OTHER_ESCAPE_CHAR; 174 | } 175 | }; break; 176 | _DO_OTHER_ESCAPE_CHAR: 177 | default: { 178 | struct 
escape_str* p = ESCAPE_LIST; 179 | struct xml_str str; 180 | str.str = sub_str; 181 | str.size = sub_sz; 182 | while(p->s) { 183 | if(_check_name(&str, p->s)) { 184 | ret_char = p->c; 185 | break; 186 | } 187 | p++; 188 | } 189 | if(!p->s) { 190 | luaL_checkstack(L, 1, NULL); 191 | lua_pushxmlstr(L, &str); 192 | const char* error_escape_str = lua_tostring(L, -1); 193 | luaL_error(L, "invalid escape string:%s", error_escape_str); 194 | } 195 | }; break; 196 | } 197 | assert(ret_char); 198 | luaL_addchar(data_buffer, ret_char); 199 | sub_str += sub_sz + 1; 200 | sub_sz = 0; 201 | } else { 202 | sub_sz++; 203 | } 204 | s++; 205 | } 206 | if(sub_sz>0) { 207 | luaL_addlstring(data_buffer, sub_str, sub_sz); 208 | } 209 | } 210 | 211 | 212 | static void 213 | _resolve_content(lua_State* L, struct xml_node* node, int ret_tbl_idx, struct xml_params* params) { 214 | enum e_xml_node_type nt = node->nt; 215 | if(nt == node_element) { 216 | struct xml_element* element = &node->value.element_value; 217 | do_resolve_children(element, _resolve_content, ret_tbl_idx, params); 218 | } else if (nt == node_content) { 219 | struct xml_str* content = &node->value.content_value; 220 | // luaL_addlstring(params->data_buffer, content->str, content->size); 221 | _do_content(L, content, params->data_buffer); 222 | } else { 223 | luaL_error(L, "invalid node_type:%d", nt); 224 | } 225 | } 226 | 227 | 228 | static void 229 | _resolve_data(lua_State* L, struct xml_node* node, int ret_tbl_idx, struct xml_params* params) { 230 | enum e_xml_node_type nt = node->nt; 231 | if(nt == node_element) { 232 | struct xml_element* element = &node->value.element_value; 233 | if(_check_name(&element->tag, "Data") || _check_name(&element->tag, "ss:Data")) { 234 | do_resolve_children(element, _resolve_content, ret_tbl_idx, params); 235 | } 236 | } 237 | } 238 | 239 | 240 | static void 241 | _resolve_cell(lua_State* L, struct xml_node* node, int ret_tbl_idx, struct xml_params* params) { 242 | unsigned int 
max_col_of_row1 = params->max_col_of_row1; 243 | if(max_col_of_row1>0 && params->cell>=max_col_of_row1) { 244 | return; 245 | } 246 | 247 | enum e_xml_node_type nt = node->nt; 248 | if(nt == node_element) { 249 | struct xml_element* element = &node->value.element_value; 250 | if(_check_name(&element->tag, "Cell")) { 251 | params->cell++; 252 | luaL_checkstack(L, 2, NULL); 253 | // set empty cell 254 | struct xml_str* cell_index = _get_propertyvalue(element->attrs, "ss:Index"); 255 | if(cell_index) { 256 | lua_pushxmlstr(L, cell_index); 257 | int isnum; 258 | lua_Integer cidx = lua_tointegerx(L, -1, &isnum); 259 | if(!isnum || cidx < params->cell) { 260 | luaL_error(L, "invalid cell ss:Index"); 261 | } 262 | lua_pop(L, 1); 263 | while(cidx > params->cell) { 264 | lua_pushstring(L, ""); 265 | lua_seti(L, ret_tbl_idx, params->cell); 266 | params->cell++; 267 | if(max_col_of_row1>0 && params->cell>max_col_of_row1) { 268 | return; 269 | } 270 | } 271 | } 272 | luaL_Buffer B; 273 | params->data_buffer = &B; 274 | luaL_buffinit(L, &B); 275 | do_resolve_children(element, _resolve_data, ret_tbl_idx, params); 276 | luaL_pushresult(&B); 277 | lua_seti(L, ret_tbl_idx, params->cell); 278 | params->data_buffer = NULL; 279 | } 280 | } 281 | } 282 | 283 | 284 | static void 285 | _resolve_row(lua_State* L, struct xml_node* node, int ret_tbl_idx, struct xml_params* params) { 286 | enum e_xml_node_type nt = node->nt; 287 | if(nt == node_element) { 288 | struct xml_element* element = &node->value.element_value; 289 | if(_check_name(&element->tag, "Row")) { 290 | params->row++; 291 | luaL_checkstack(L, 4, NULL); 292 | // set empty row 293 | struct xml_str* row_index = _get_propertyvalue(element->attrs, "ss:Index"); 294 | if(row_index) { 295 | lua_pushxmlstr(L, row_index); 296 | int isnum; 297 | lua_Integer cidx = lua_tointegerx(L, -1, &isnum); 298 | if(!isnum || cidx < params->row) { 299 | luaL_error(L, "invalid row ss:Index"); 300 | } 301 | lua_pop(L, 1); 302 | while(cidx > 
params->row) { 303 | lua_newtable(L); 304 | unsigned int i=0; 305 | for(i=1; i<= params->max_col_of_row1; i++) { 306 | lua_pushstring(L, ""); 307 | lua_seti(L, -2, i); 308 | } 309 | lua_seti(L, ret_tbl_idx, params->row); 310 | params->row++; 311 | } 312 | } 313 | lua_newtable(L); 314 | lua_pushvalue(L, -1); 315 | lua_seti(L, ret_tbl_idx, params->row); 316 | int row_idx = lua_gettop(L); 317 | params->cell = 0; 318 | do_resolve_children(element, _resolve_cell, row_idx, params); 319 | // first line 320 | if(params->max_col_of_row1 == 0) { 321 | // adjust first line 322 | unsigned int i=0; 323 | for(i=params->cell; i>=1; i--) { 324 | size_t l=0; 325 | lua_geti(L, row_idx, i); 326 | lua_tolstring(L, -1, &l); 327 | lua_pop(L, 1); 328 | if(l>0) { 329 | break; 330 | } else { 331 | lua_pushnil(L); 332 | lua_seti(L, row_idx, i); 333 | } 334 | } 335 | params->max_col_of_row1 = i; 336 | 337 | } else { 338 | unsigned int max_col = params->max_col_of_row1; 339 | unsigned int i; 340 | for(i=params->cell+1; i<= max_col; i++) { 341 | lua_pushstring(L, ""); 342 | lua_seti(L, row_idx, i); 343 | } 344 | } 345 | params->cell = 0; 346 | lua_pop(L, 1); 347 | } 348 | } 349 | } 350 | 351 | 352 | static void 353 | _resolve_table(lua_State* L, struct xml_node* node, int ret_tbl_idx, struct xml_params* params) { 354 | enum e_xml_node_type nt = node->nt; 355 | if(nt == node_element) { 356 | struct xml_element* element = &node->value.element_value; 357 | if(_check_name(&element->tag, "Table")) { 358 | params->table++; 359 | luaL_checkstack(L, 4, NULL); 360 | struct xml_str* col_count = _get_propertyvalue(element->attrs, "ss:ExpandedColumnCount"); 361 | struct xml_str* row_count = _get_propertyvalue(element->attrs, "ss:ExpandedRowCount"); 362 | if(!col_count || !row_count) { 363 | luaL_error(L, "invalid col and row of table"); 364 | } 365 | int col_isnum; 366 | int row_isnum; 367 | lua_pushxmlstr(L, col_count); 368 | lua_pushxmlstr(L, row_count); 369 | lua_Integer col = lua_tointegerx(L, -2, 
&col_isnum); 370 | lua_Integer row = lua_tointegerx(L, -1, &row_isnum); 371 | if(!col_isnum || !row_isnum) { 372 | luaL_error(L, "invalid col and row number of table"); 373 | } 374 | lua_pop(L, 2); 375 | lua_newtable(L); 376 | lua_pushinteger(L, col); 377 | lua_setfield(L, -2, "col"); 378 | lua_pushinteger(L, row); 379 | lua_setfield(L, -2, "row"); 380 | lua_newtable(L); 381 | lua_pushvalue(L, -1); 382 | lua_insert(L, -3); 383 | lua_setfield(L, -2, "data"); 384 | 385 | lua_seti(L, ret_tbl_idx, params->table); 386 | int table_idx = lua_gettop(L); 387 | params->row = 0; 388 | params->max_col_of_row1 = 0; 389 | do_resolve_children(element, _resolve_row, table_idx, params); 390 | params->row = 0; 391 | params->max_col_of_row1 = 0; 392 | lua_pop(L, 1); 393 | } 394 | } 395 | } 396 | 397 | 398 | static void 399 | _resolve_worksheet(lua_State* L, struct xml_node* node, int ret_tbl_idx, struct xml_params* params) { 400 | enum e_xml_node_type nt = node->nt; 401 | if(nt == node_element) { 402 | struct xml_element* element = &node->value.element_value; 403 | if(_check_name(&element->tag, "Worksheet")) { 404 | params->worksheet++; 405 | struct xml_str* sheetname = _get_propertyvalue(element->attrs, "ss:Name"); 406 | if(!sheetname) { 407 | luaL_error(L, "invalid sheetname of worksheet"); 408 | } 409 | luaL_checkstack(L, 4, NULL); 410 | lua_newtable(L); 411 | lua_pushlstring(L, sheetname->str, sheetname->size); 412 | lua_setfield(L, -2, "name"); 413 | 414 | lua_newtable(L); 415 | lua_pushvalue(L, -1); 416 | lua_insert(L, -3); 417 | lua_setfield(L, -2, "tables"); 418 | 419 | lua_seti(L, ret_tbl_idx, params->worksheet); 420 | int top = lua_gettop(L); 421 | params->table = 0; 422 | do_resolve_children(element, _resolve_table, top, params); 423 | params->table = 0; 424 | lua_pop(L, 1); 425 | } 426 | } 427 | } 428 | 429 | 430 | inline static struct xml_node* 431 | __zxml_parser(lua_State* L) { 432 | size_t source_len = 0; 433 | const char* source = luaL_checklstring(L, 1, 
&source_len); 434 | size_t new_memory_size = source_len*8; 435 | _init_context(L, new_memory_size); 436 | 437 | struct xml_node* root = xml_parser(get_context(), source, source_len); 438 | if(!root) { 439 | luaL_error(L, "zxml parser %s", xml_geterror(get_context())); 440 | } else if(root->nt != node_element) { 441 | luaL_error(L, "zxml error root node type:%d", root->nt); 442 | } 443 | return root; 444 | } 445 | 446 | 447 | static int 448 | _lua_zxml_parser_excel_xml2003(lua_State* L) { 449 | struct xml_node* root = __zxml_parser(L); 450 | lua_newtable(L); // for worksheets 451 | int top = lua_gettop(L); 452 | struct xml_params excel_params = {0}; 453 | excel_params.worksheet = 0; 454 | struct xml_element* element = &root->value.element_value; 455 | do_resolve_children(element, _resolve_worksheet, top, &excel_params); 456 | excel_params.worksheet = 0; 457 | return 1; 458 | } 459 | 460 | 461 | static void 462 | _resolve_node(lua_State* L, struct xml_node* node, int ret_tbl_idx, struct xml_params* params) { 463 | enum e_xml_node_type nt = node->nt; 464 | params->children++; 465 | if(nt == node_element) { 466 | struct xml_element* element = &node->value.element_value; 467 | luaL_checkstack(L, 4, NULL); 468 | lua_newtable(L); 469 | lua_pushxmlstr(L, &element->tag); 470 | lua_setfield(L, -2, "tag"); 471 | lua_newtable(L); 472 | lua_pushvalue(L, -1); 473 | lua_setfield(L, -3, "attrs"); 474 | struct xml_property* p = element->attrs; 475 | while(p) { 476 | lua_pushxmlstr(L, &p->field_name); 477 | lua_pushxmlstr(L, &p->field_value); 478 | lua_settable(L, -3); 479 | p = p->next; 480 | } 481 | lua_pop(L, 1); 482 | lua_newtable(L); 483 | lua_pushvalue(L, -1); 484 | lua_setfield(L, -3, "children"); 485 | int top = lua_gettop(L); 486 | unsigned int bak = params->children; 487 | params->children = 0; 488 | do_resolve_children(element, _resolve_node, top, params); 489 | params->children = bak; 490 | lua_pop(L, 1); 491 | 492 | } else if (nt == node_content) { 493 | struct xml_str* 
content = &node->value.content_value; 494 | luaL_Buffer B; 495 | luaL_buffinit(L, &B); 496 | _do_content(L, content, &B); 497 | luaL_pushresult(&B); 498 | 499 | } else { 500 | luaL_error(L, "invalid xml node type:%d", nt); 501 | } 502 | 503 | if(ret_tbl_idx > 0) { 504 | lua_seti(L, ret_tbl_idx, params->children); 505 | } 506 | } 507 | 508 | 509 | static int 510 | _lua_zxml_parser(lua_State* L) { 511 | struct xml_node* root = __zxml_parser(L); 512 | struct xml_params node_params = {0}; 513 | node_params.children = 0; 514 | _resolve_node(L, root, -1, &node_params); 515 | node_params.children = 0; 516 | return 1; 517 | } 518 | 519 | 520 | int 521 | luaopen_zxml_core(lua_State* L) { 522 | luaL_checkversion(L); 523 | luaL_Reg l[] = { 524 | {"zxml_parser_excel_xml2003", _lua_zxml_parser_excel_xml2003}, 525 | {"zxml_parser", _lua_zxml_parser}, 526 | {NULL, NULL}, 527 | }; 528 | luaL_newlib(L, l); 529 | return 1; 530 | } 531 | 532 | -------------------------------------------------------------------------------- /makefile: -------------------------------------------------------------------------------- 1 | 2 | macosx: 3 | make zxml.so "CC = clang" "DLLFLAGS = -undefined dynamic_lookup --shared" 4 | 5 | linux: 6 | make zxml.so "CC = gcc" "DLLFLAGS = -shared -fPIC" 7 | 8 | win: zxml.dll 9 | 10 | zxml.dll: zxml_parser.c zxml.h lua_zxml.c 11 | gcc -g -Wall -O2 --shared -o $@ zxml_parser.c lua_zxml.c -I/usr/local/include -L/usr/local/bin -llua53 12 | 13 | zxml.so: zxml_parser.c zxml.h lua_zxml.c 14 | $(CC) -g -Wall -O2 $(DLLFLAGS) -o $@ zxml_parser.c lua_zxml.c 15 | 16 | test_zxml: zxml_parser.c zxml.h zxml_test.c 17 | clang -g -Wall -O2 -DZXML_TEST -o test_zxml zxml_parser.c zxml_test.c 18 | 19 | 20 | .PHONY: macosx linux win 21 | 22 | clean: 23 | rm -rf zxml.so test_zxml test_rxml -------------------------------------------------------------------------------- /test_zxml.lua: -------------------------------------------------------------------------------- 1 | local zxml_c = 
require "zxml.core" 2 | local print_r =require "print_r" 3 | local zxml_parser_excel_xml2003 = zxml_c.zxml_parser_excel_xml2003 4 | local zxml_parser = zxml_c.zxml_parser 5 | 6 | local function readfile(path) 7 | local fd = io.open(path, "r") 8 | local s = fd:read("a") 9 | fd:close() 10 | return s 11 | end 12 | 13 | local xml_path = ... 14 | local xml_source = readfile(xml_path) 15 | local t1 = zxml_parser(xml_source) 16 | local t2 = zxml_parser_excel_xml2003(xml_source) 17 | print_r(t1) 18 | print("-----------------") 19 | print_r(t2) -------------------------------------------------------------------------------- /zxml.h: -------------------------------------------------------------------------------- 1 | #ifndef __ZXML_HEAD__ 2 | #define __ZXML_HEAD__ 3 | 4 | #include 5 | 6 | 7 | #define XML_SUCCESS 0 8 | #define XML_PARSER_ERROR 1 9 | #define XML_MALLOC_ERROR 2 10 | 11 | enum e_xml_node_type { 12 | node_element, 13 | node_content, 14 | }; 15 | 16 | struct xml_str { 17 | const char* str; 18 | int size; 19 | }; 20 | 21 | struct xml_property { 22 | struct xml_str field_name; 23 | struct xml_str field_value; 24 | struct xml_property* next; 25 | }; 26 | 27 | struct xml_element { 28 | struct xml_str tag; 29 | struct xml_property* attrs; 30 | struct xml_node* children_head; 31 | struct xml_node* children_tail; 32 | }; 33 | 34 | struct xml_node { 35 | enum e_xml_node_type nt; 36 | union { 37 | struct xml_element element_value; 38 | struct xml_str content_value; 39 | } value; 40 | struct xml_node* next; 41 | }; 42 | 43 | 44 | struct xml_context; 45 | struct xml_context* xml_create(size_t memory_size); 46 | void xml_reset(struct xml_context* context); 47 | void xml_destory(struct xml_context* context); 48 | 49 | struct xml_node* xml_parser(struct xml_context* context, const char* xml_source, size_t sz); 50 | const char* xml_geterror(struct xml_context* context); 51 | 52 | #endif 53 | 54 | -------------------------------------------------------------------------------- 
/zxml_parser.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "zxml.h" 10 | 11 | static char cmask[] = { 12 | 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'S', 'S', 'U', 'U', 'S', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 13 | 'S', '!', '"', '#', '$', '%', '&', '\'', 'U', 'U', '*', '+', ',', '-', '.', '/', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', ':', ';', '<', '=', '>', '?', 14 | '@', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', '[', '\\', ']', 'U', 'A', 15 | 'U', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', '{', '|', '}', '~', 'U', 16 | 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 17 | 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 18 | 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 19 | 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 20 | }; 21 | 22 | static const char* xml_error_str[] = { 23 | "XML_SUCCESS", 24 | "XML_PARSER_ERROR", 25 | "XML_MALLOC_ERROR", 26 | }; 27 | 28 | 29 | struct xml_context { 30 | uint8_t* memory_head; 31 | size_t memory_size; 32 | uint8_t* memory_tail; 33 | 34 | struct { 35 | const char* head; 36 | size_t offset; 37 | size_t size; 38 | } reader; 39 | 40 | char error_info[512]; 41 | jmp_buf buf; 42 | int parser_status; 
43 | 44 | struct xml_node* root; 45 | }; 46 | 47 | #define reader_isend() (context->reader.offset >= context->reader.size) 48 | #define reader_curchar() (assert(!reader_isend()), (context->reader.head[context->reader.offset])) 49 | #define reader_nextchar() (assert(!reader_isend()), (context->reader.head[context->reader.offset++])) 50 | #define reader_curtype() (cmask[(unsigned char)reader_curchar()]) 51 | #define reader_curhead() (context->reader.head + context->reader.offset) 52 | #define reader_set(v) (assert((v) > 0 && (v) <= context->reader.size), (context->reader.offset = (v))) 53 | #define reader_offset() (context->reader.offset) 54 | #define reader_look(o) (context->reader.offset+(o) >= context->reader.size)?('\0'):(context->reader.head[context->reader.offset+(o)]) 55 | 56 | #define _xml_parser_error(context, f, ...) _xml_error((context), XML_PARSER_ERROR, f, __VA_ARGS__) 57 | #define _xml_errcode2str(e) (assert((e)>0 && (e)/") 61 | #define _xml_parser_tag_fieldname(context, out_str) _xml_parser_value(context, out_str, "S=") 62 | #define _xml_parser_tag_fieldvalue(context, out_str) _xml_parser_value(context, out_str, "S>/") 63 | #define _xml_parser_nodevalue(context, out_str) _xml_parser_value(context, out_str, "<") 64 | 65 | 66 | #define _xml_expect(context, out_str, f, t) do { \ 67 | bool b = f(context, out_str); \ 68 | if(!b) { \ 69 | _xml_parser_error(context, "expect %s token at offset: %ld", t, reader_offset()); \ 70 | } \ 71 | }while(0) 72 | #define _xml_expect_tagname(context, out_str) _xml_expect(context, out_str, _xml_parser_tagname, "tagname") 73 | #define _xml_expect_fieldname(context, out_str) _xml_expect(context, out_str, _xml_parser_tag_fieldname, "fieldname") 74 | #define _xml_expect_fieldvalue(context, out_str) _xml_expect(context, out_str, _xml_parser_tag_fieldvalue, "fieldvalue") 75 | #define _xml_expect_nodevalue(context, out_str) _xml_expect(context, out_str, _xml_parser_nodevalue, "nodevalue") 76 | 77 | #define node_get_element(node) 
(assert((node)->nt == node_element), &((node)->value.element_value)) 78 | #define node_get_content(node) (assert((node)->nt == node_content), &((node)->value.content_value)) 79 | 80 | static struct xml_node* _xml_parser_entry(struct xml_context* context); 81 | 82 | 83 | struct xml_context* 84 | xml_create(size_t memory_size) { 85 | struct xml_context* context = (struct xml_context*)malloc(sizeof(struct xml_context) + memory_size); 86 | context->memory_head = (uint8_t*)(context + 1); 87 | context->memory_size = memory_size; 88 | context->memory_tail = context->memory_head; 89 | context->root = NULL; 90 | context->error_info[0] = '\0'; 91 | return context; 92 | } 93 | 94 | 95 | void 96 | xml_reset(struct xml_context* context) { 97 | context->memory_tail = context->memory_head; 98 | context->reader.offset = 0; 99 | context->error_info[0] = '\0'; 100 | context->parser_status = XML_SUCCESS; 101 | context->root = NULL; 102 | } 103 | 104 | 105 | struct xml_node* 106 | xml_parser(struct xml_context* context, const char* xml_source, size_t sz) { 107 | xml_reset(context); 108 | context->reader.head = xml_source; 109 | context->reader.size = sz; 110 | 111 | if(setjmp(context->buf) == 0) { 112 | // do parser 113 | struct xml_node* root = _xml_parser_entry(context); 114 | context->root = root; 115 | return root; 116 | } else { 117 | char* error_p = context->error_info; 118 | int error_sz = sizeof(context->error_info); 119 | char tmp[error_sz]; 120 | strncpy(tmp, error_p, sizeof(tmp)-1); 121 | tmp[error_sz-1]='\0'; 122 | int len = snprintf(error_p, error_sz, "error:[%s] %s\ncurrent_reader:", 123 | _xml_errcode2str(context->parser_status), 124 | tmp); 125 | error_sz -= len; 126 | error_p += len; 127 | if(error_sz > 5) { 128 | size_t cap = context->reader.size - reader_offset(); 129 | if(cap < error_sz-1) { 130 | memcpy(error_p, reader_curhead(), cap); 131 | error_p[cap] = '\0'; 132 | } else { 133 | size_t sz = error_sz - 5; 134 | memcpy(error_p, reader_curhead(), sz); 135 | 
error_sz -= sz; 136 | error_p += sz; 137 | assert(error_sz >=5); 138 | snprintf(error_p, error_sz, "...\n"); 139 | } 140 | } 141 | return NULL; 142 | } 143 | } 144 | 145 | 146 | void 147 | xml_destory(struct xml_context* context) { 148 | free(context); 149 | } 150 | 151 | 152 | const char* 153 | xml_geterror(struct xml_context* context) { 154 | if(context->parser_status != XML_SUCCESS) { 155 | return (const char*)context->error_info; 156 | } else { 157 | return NULL; 158 | } 159 | } 160 | 161 | 162 | static bool 163 | _xml_cmpstr(struct xml_str* s1, struct xml_str* s2) { 164 | return (s1->size == s2->size) && (memcmp(s1->str, s2->str, s1->size)==0); 165 | } 166 | 167 | 168 | static void 169 | _xml_error(struct xml_context* context, int status, const char* f, ...) { 170 | va_list args; 171 | va_start(args, f); 172 | char* error_info = context->error_info; 173 | vsnprintf(error_info, sizeof(context->error_info), f, args); 174 | va_end(args); 175 | context->parser_status = status; 176 | longjmp(context->buf, 1); 177 | } 178 | 179 | 180 | static void* 181 | _xml_malloc(struct xml_context* context, size_t sz) { 182 | size_t cap = context->memory_size - (context->memory_tail - context->memory_head); 183 | if(cap < sz) { 184 | _xml_error(context, XML_MALLOC_ERROR, "xml malloc error"); 185 | return NULL; 186 | } 187 | void* p = (void*)context->memory_tail; 188 | context->memory_tail += sz; 189 | return p; 190 | } 191 | 192 | 193 | static struct xml_node* 194 | xml_get_node(struct xml_context* context, enum e_xml_node_type nt) { 195 | struct xml_node* p = (struct xml_node*)_xml_malloc(context, sizeof(struct xml_node)); 196 | memset(p, 0, sizeof(*p)); 197 | p->nt = nt; 198 | return p; 199 | } 200 | 201 | 202 | static struct xml_property* 203 | xml_get_property(struct xml_context* context) { 204 | struct xml_property* p = (struct xml_property*)_xml_malloc(context, sizeof(struct xml_property)); 205 | p->next = NULL; 206 | return p; 207 | } 208 | 209 | 210 | // match all space 
211 | static bool 212 | _xml_parser_blank(struct xml_context* context) { 213 | bool b = false; 214 | while(!reader_isend()) { 215 | char ct = reader_curtype(); 216 | if(ct == 'S') { 217 | b = true; 218 | }else { 219 | break; 220 | } 221 | reader_nextchar(); 222 | } 223 | return b; 224 | } 225 | 226 | 227 | static bool 228 | _xml_parser_single(struct xml_context* context, char c) { 229 | if(!reader_isend() && reader_curchar() == c) { 230 | reader_nextchar(); 231 | return true; 232 | }else { 233 | return false; 234 | } 235 | } 236 | 237 | 238 | static bool 239 | _xml_parser_value(struct xml_context* context, struct xml_str* out_str, const char* final_ct_set) { 240 | bool b = false; 241 | while(!reader_isend()) { 242 | char ct = reader_curtype(); 243 | bool in_set = false; 244 | const char* p = final_ct_set; 245 | while(*p) { 246 | if(*p == ct) { 247 | in_set = true; 248 | break; 249 | } 250 | p++; 251 | } 252 | if(!in_set) { 253 | if(b == false) { 254 | b = true; 255 | out_str->str = reader_curhead(); 256 | out_str->size = 0; 257 | } 258 | out_str->size++; 259 | }else { 260 | break; 261 | } 262 | reader_nextchar(); 263 | } 264 | return b; 265 | } 266 | 267 | 268 | static bool 269 | _xml_parser_string(struct xml_context* context, struct xml_str* out_str) { 270 | bool b = false; 271 | size_t bak_offset = reader_offset(); 272 | char str_begin = reader_look(0); 273 | if(str_begin != '"' && str_begin != '\'') { 274 | return false; 275 | } 276 | reader_nextchar(); 277 | 278 | out_str->str = reader_curhead(); 279 | out_str->size = 0; 280 | while(!reader_isend()) { 281 | char c = reader_nextchar(); 282 | if(c == str_begin) { 283 | b = true; 284 | break; 285 | }else { 286 | out_str->size++; 287 | } 288 | } 289 | 290 | if(!b) { 291 | reader_set(bak_offset); 292 | } 293 | return b; 294 | } 295 | 296 | 297 | inline static void 298 | _xml_expect_single(struct xml_context* context, char c) { 299 | bool b = _xml_parser_single(context, c); 300 | if(!b) { 301 | char lcs[2] = {0}; 302 
| char* plc = lcs; 303 | lcs[0] = reader_look(0); 304 | if(lcs[0] == 0) { 305 | plc = "EOF"; 306 | } 307 | _xml_parser_error(context, "expecet char `%c` current `%s` at offset: %ld", c, plc, reader_offset()); 308 | } 309 | } 310 | 311 | inline static void 312 | _xml_expect_string(struct xml_context* context, struct xml_str* out_str) { 313 | bool b = _xml_parser_string(context, out_str); 314 | if(!b) { 315 | _xml_parser_error(context, "expect string at offset: %ld", reader_offset()); 316 | } 317 | } 318 | 319 | 320 | static void 321 | _xml_parser_property(struct xml_context* context, struct xml_property* property) { 322 | _xml_expect_fieldname(context, &property->field_name); 323 | _xml_expect_single(context, '='); 324 | if(!reader_isend()) { 325 | char c = reader_curchar(); 326 | if(c == '"' || c == '\'') { 327 | _xml_expect_string(context, &property->field_value); 328 | }else { 329 | _xml_expect_fieldvalue(context, &property->field_value); 330 | } 331 | }else { 332 | _xml_parser_error(context, "not expect EOF when parser property at offset: %ld", reader_offset()); 333 | } 334 | } 335 | 336 | 337 | // match 338 | static void 339 | _xml_parser_tagheader(struct xml_context* context) { 340 | _xml_expect_single(context, '<'); 341 | _xml_expect_single(context, '?'); 342 | struct xml_str str; 343 | struct xml_property p; 344 | _xml_parser_tagname(context, &str); 345 | _xml_parser_blank(context); 346 | while(!reader_isend()) { 347 | char ct = reader_curtype(); 348 | if(ct == '?') { 349 | break; 350 | }else { 351 | _xml_parser_property(context, &p); 352 | _xml_parser_blank(context); 353 | } 354 | } 355 | _xml_expect_single(context, '?'); 356 | _xml_expect_single(context, '>'); 357 | } 358 | 359 | 360 | static struct xml_node* 361 | _xml_parser_tag(struct xml_context* context) { 362 | // match begin tag 363 | _xml_expect_single(context, '<'); 364 | struct xml_node* node = xml_get_node(context, node_element); 365 | struct xml_element* element = node_get_element(node); 366 | 
_xml_expect_tagname(context, &element->tag); 367 | _xml_parser_blank(context); 368 | while(!reader_isend()) { 369 | char ct = reader_curtype(); 370 | if(ct == '/') { 371 | _xml_expect_single(context, '/'); 372 | _xml_expect_single(context, '>'); 373 | return node; 374 | }else if (ct == '>') { 375 | _xml_expect_single(context, '>'); 376 | break; 377 | } else { 378 | struct xml_property* p = xml_get_property(context); 379 | _xml_parser_property(context, p); 380 | p->next = element->attrs; 381 | element->attrs = p; 382 | _xml_parser_blank(context); 383 | } 384 | } 385 | 386 | // match children 387 | _xml_parser_blank(context); 388 | while(!reader_isend()) { 389 | struct xml_node* cnode = NULL; 390 | char c = reader_look(0); 391 | if(c == '<') { 392 | char c0 = reader_curchar(); 393 | char c1 = reader_look(1); 394 | if(c0 == '<' && c1 == '/') { 395 | break; 396 | } 397 | cnode = _xml_parser_tag(context); 398 | _xml_parser_blank(context); 399 | } else { 400 | cnode = xml_get_node(context, node_content); 401 | _xml_expect_nodevalue(context, node_get_content(cnode)); 402 | } 403 | assert(cnode); 404 | if(element->children_tail) { 405 | element->children_tail->next = cnode; 406 | cnode->next = NULL; 407 | } 408 | element->children_tail = cnode; 409 | if(!element->children_head) { 410 | element->children_head = cnode; 411 | } 412 | } 413 | 414 | // match end tag 415 | _xml_expect_single(context, '<'); 416 | _xml_expect_single(context, '/'); 417 | struct xml_str endtag_name; 418 | _xml_expect_tagname(context, &endtag_name); 419 | _xml_expect_single(context, '>'); 420 | 421 | if(!_xml_cmpstr(&element->tag, &endtag_name)) { 422 | char s1[element->tag.size + 1]; 423 | char s2[endtag_name.size + 1]; 424 | memcpy(s1, element->tag.str, element->tag.size); 425 | memcpy(s2, endtag_name.str, endtag_name.size); 426 | s1[element->tag.size] ='\0'; 427 | s2[endtag_name.size] = '\0'; 428 | _xml_parser_error(context, "inconsistent tag name `%s` and `%s` at offset:%ld", s1, s2, 
reader_offset()); 429 | } 430 | return node; 431 | } 432 | 433 | 434 | static struct xml_node* 435 | _xml_parser_entry(struct xml_context* context) { 436 | _xml_parser_blank(context); 437 | while(!reader_isend()) { 438 | char c0 = reader_look(0); 439 | char c1 = reader_look(1); 440 | if(c0 == '<' && c1 == '?') { 441 | _xml_parser_tagheader(context); 442 | } else { 443 | break; 444 | } 445 | _xml_parser_blank(context); 446 | } 447 | 448 | _xml_parser_blank(context); 449 | struct xml_node* root = _xml_parser_tag(context); 450 | return root; 451 | } 452 | -------------------------------------------------------------------------------- /zxml_test.c: -------------------------------------------------------------------------------- 1 | #include "zxml.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | static void 9 | _xml_printtab(int tab) { 10 | int i; 11 | for(i=0; isize > 0) { 19 | _xml_printtab(tab); 20 | char s[str->size+1]; 21 | memcpy(s, str->str, str->size); 22 | s[str->size] = '\0'; 23 | printf("%s", s); 24 | } 25 | } 26 | 27 | 28 | static void 29 | _xml_dump_node(struct xml_node* node, int tab) { 30 | _xml_printtab(tab); 31 | enum e_xml_node_type nt = node->nt; 32 | if(nt == node_element) { 33 | struct xml_element* element = &node->value.element_value; 34 | printf("[element:]\n"); 35 | _xml_printtab(tab); 36 | printf("[tag:]\n"); 37 | _xml_printstr(&element->tag, tab); 38 | printf("\n"); 39 | _xml_printtab(tab); 40 | printf("[attrs:]\n"); 41 | struct xml_property* p = element->attrs; 42 | while(p) { 43 | _xml_printtab(tab); 44 | _xml_printstr(&p->field_name, tab); 45 | printf(" = "); 46 | _xml_printstr(&p->field_value, 0); 47 | printf("\n"); 48 | p = p->next; 49 | } 50 | _xml_printtab(tab); 51 | printf("[children:]\n"); 52 | struct xml_node* c = element->children_head; 53 | while(c) { 54 | _xml_dump_node(c, tab+1); 55 | c = c->next; 56 | } 57 | 58 | } else if(nt == node_content) { 59 | printf("[context:]\n"); 60 | 
_xml_printstr(&node->value.content_value, tab); 61 | printf("\n"); 62 | 63 | } else { 64 | assert(false); 65 | } 66 | } 67 | 68 | 69 | // for test 70 | int 71 | main(int argc, char const *argv[]) { 72 | assert(argc == 2); 73 | const char* xml_path = argv[1]; 74 | printf("xml_path: %s\n", xml_path); 75 | FILE* fp = fopen(xml_path, "r"); 76 | if(!fp) { 77 | printf("open xml_path:%s error\n", xml_path); 78 | return 0; 79 | } 80 | fseek(fp, 0, SEEK_END); 81 | long size = ftell(fp); 82 | fseek(fp, 0, SEEK_SET); 83 | char* source = malloc(size+1); 84 | fread(source, size, 1, fp); 85 | source[size+1] = '\0'; 86 | 87 | struct xml_context* context = xml_create(size*8); 88 | assert(context); 89 | struct xml_node* root = xml_parser(context, source, size); 90 | printf("context:%p root:%p\n", context, root); 91 | printf("\n--------------\n"); 92 | if (root) { 93 | _xml_dump_node(root, 0); 94 | } else { 95 | printf("xml parser:%s\n", xml_geterror(context)); 96 | } 97 | return 0; 98 | } --------------------------------------------------------------------------------