├── README.md ├── UNLICENSE ├── sxml.c ├── sxml.h └── sxml_test.c /README.md: -------------------------------------------------------------------------------- 1 | SXML 2 | ==== 3 | A Small XML parser written in C. 4 | 5 | Inspired by the clean API design used by the [JSON parser JSMN](http://zserge.bitbucket.org/jsmn.html), SXML has the same design goal and features for XML parsing. Go read about JSMN's Philosophy and Features to get an idea of how it differs from other parsers - I'll wait right here. 6 | 7 | Features 8 | -------- 9 | Here is a list of features SXML shares with JSMN. 10 | 11 | * compatible with C89 12 | * no dependencies 13 | * highly portable 14 | * about 420 lines of code 15 | * extremely small code footprint 16 | * API contains only 2 functions 17 | * no dynamic memory allocation 18 | * incremental single-pass parsing 19 | 20 | Usage 21 | ----- 22 | The header file is heavily commented and should be the first place to look to get started. 23 | 24 | Check out the file sxml_test.c for an example of using SXML within a constrained environment with a fixed-size input and output buffer. 25 | 26 | Limitations 27 | ----------- 28 | In order to remain lightweight the parser has the following limitations: 29 | 30 | * Minimal XML syntax check during parsing 31 | * Input text must be ASCII or an [ASCII extension](http://en.wikipedia.org/wiki/Extended_ASCII) (latin-1 and utf-8 are examples of ASCII extensions) 32 | 33 | Do contact me with suggestions if the limitations above are preventing you from using the parser. 34 | 35 | Alternatives 36 | ------------ 37 | List of alternative lightweight XML parsers considered before writing my own. 
38 | 39 | * [ezXML](http://ezxml.sourceforge.net/) 40 | * [FastXML](http://codesuppository.blogspot.com/2009/02/fastxml-extremely-lightweight-stream.html) 41 | * [TinyXml](http://www.grinninglizard.com/tinyxml2/) 42 | * [RapidXML](http://rapidxml.sourceforge.net/) 43 | -------------------------------------------------------------------------------- /UNLICENSE: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 
23 | 24 | For more information, please refer to 25 | -------------------------------------------------------------------------------- /sxml.c: -------------------------------------------------------------------------------- 1 | #include "sxml.h" 2 | 3 | /* The following functions will need to be replaced if you want no dependency to libc: */ 4 | #include /* memchr, memcmp, strlen, memcpy */ 5 | #include /* assert */ 6 | 7 | typedef unsigned UINT; 8 | typedef int BOOL; 9 | #define FALSE 0 10 | #define TRUE (!FALSE) 11 | 12 | /* 13 | MARK: String 14 | String functions work within the memory range specified (excluding end). 15 | Returns 'end' if value not found. 16 | */ 17 | 18 | static const char* str_findchr (const char* start, const char* end, int c) 19 | { 20 | const char* it; 21 | 22 | assert (start <= end); 23 | assert (0 <= c && c <= 127); /* CHAR_MAX - memchr implementation will only work when searching for ascii characters within a utf-8 string */ 24 | 25 | it= (const char*) memchr (start, c, end - start); 26 | return (it != NULL) ? 
it : end; 27 | } 28 | 29 | static const char* str_findstr (const char* start, const char* end, const char* needle) 30 | { 31 | size_t needlelen; 32 | int first; 33 | assert (start <= end); 34 | 35 | needlelen= strlen (needle); 36 | assert (0 < needlelen); 37 | first = (unsigned char) needle[0]; 38 | 39 | while (start + needlelen <= end) 40 | { 41 | const char* it= (const char*) memchr (start, first, (end - start) - (needlelen - 1)); 42 | if (it == NULL) 43 | break; 44 | 45 | if (memcmp (it, needle, needlelen) == 0) 46 | return it; 47 | 48 | start= it + 1; 49 | } 50 | 51 | return end; 52 | } 53 | 54 | static BOOL str_startswith (const char* start, const char* end, const char* prefix) 55 | { 56 | long nbytes; 57 | assert (start <= end); 58 | 59 | nbytes= strlen (prefix); 60 | if (end - start < nbytes) 61 | return FALSE; 62 | 63 | return memcmp (prefix, start, nbytes) == 0; 64 | } 65 | 66 | /* http://www.w3.org/TR/xml11/#sec-common-syn */ 67 | 68 | static BOOL WhiteSpace (int c) 69 | { 70 | switch (c) 71 | { 72 | case ' ': /* 0x20 */ 73 | case '\t': /* 0x9 */ 74 | case '\r': /* 0xD */ 75 | case '\n': /* 0xA */ 76 | return TRUE; 77 | } 78 | 79 | return FALSE; 80 | } 81 | 82 | static BOOL NameStartChar (int c) 83 | { 84 | /* 85 | We don't perform utf-8 decoding - just accept all characters with hight bit set 86 | (0xC0 <= c && c <= 0xD6) || (0xD8 <= c && c <= 0xF6) || (0xF8 <= c && c <= 0x2FF) || 87 | (0x370 <= c && c <= 0x37D) || (0x37F <= c && c <= 0x1FFF) || (0x200C <= c && c <= 0x200D) || 88 | (0x2070 <= c && c <= 0x218F) || (0x2C00 <= c && c <= 0x2FEF) || (0x3001 <= c && c <= 0xD7FF) || 89 | (0xF900 <= c && c <= 0xFDCF) || (0xFDF0 <= c && c <= 0xFFFD) || (0x10000 <= c && c <= 0xEFFFF); 90 | */ 91 | if (0x80 <= c) 92 | return TRUE; 93 | 94 | return c == ':' || ('A' <= c && c <= 'Z') || c == '_' || ('a' <= c && c <= 'z'); 95 | } 96 | 97 | static BOOL NameChar (int c) 98 | { 99 | return NameStartChar (c) || 100 | c == '-' || c == '.' 
|| ('0' <= c && c <= '9') || 101 | c == 0xB7 || (0x0300 <= c && c <= 0x036F) || (0x203F <= c && c <= 0x2040); 102 | } 103 | 104 | #define ISSPACE(c) (WhiteSpace(((unsigned char)(c)))) 105 | #define ISALPHA(c) (NameStartChar(((unsigned char)(c)))) 106 | #define ISALNUM(c) (NameChar(((unsigned char)(c)))) 107 | 108 | /* Left trim whitespace */ 109 | static const char* str_ltrim (const char* start, const char* end) 110 | { 111 | const char* it; 112 | assert (start <= end); 113 | 114 | for (it= start; it != end && ISSPACE (*it); it++) 115 | ; 116 | 117 | return it; 118 | } 119 | 120 | /* Right trim whitespace */ 121 | static const char* str_rtrim (const char* start, const char* end) 122 | { 123 | const char* it, *prev; 124 | assert (start <= end); 125 | 126 | for (it= end; start != it; it= prev) 127 | { 128 | prev= it - 1; 129 | if (!ISSPACE (*prev)) 130 | return it; 131 | } 132 | 133 | return start; 134 | } 135 | 136 | static const char* str_find_notalnum (const char* start, const char* end) 137 | { 138 | const char* it; 139 | assert (start <= end); 140 | 141 | for (it= start; it != end && ISALNUM (*it); it++) 142 | ; 143 | 144 | return it; 145 | } 146 | 147 | /* MARK: State */ 148 | 149 | /* Collect arguments in a structure for convenience */ 150 | typedef struct 151 | { 152 | const char* buffer; 153 | UINT bufferlen; 154 | sxmltok_t* tokens; 155 | UINT num_tokens; 156 | } sxml_args_t; 157 | 158 | #define buffer_fromoffset(args,i) ((args)->buffer + (i)) 159 | #define buffer_tooffset(args,ptr) (unsigned) ((ptr) - (args)->buffer) 160 | #define buffer_getend(args) ((args)->buffer + (args)->bufferlen) 161 | 162 | static BOOL state_pushtoken (sxml_t* state, sxml_args_t* args, sxmltype_t type, const char* start, const char* end) 163 | { 164 | sxmltok_t* token; 165 | UINT i= state->ntokens++; 166 | if (args->num_tokens < state->ntokens) 167 | return FALSE; 168 | 169 | token= &args->tokens[i]; 170 | token->type= type; 171 | token->startpos= buffer_tooffset (args, start); 172 
| token->endpos= buffer_tooffset (args, end); 173 | token->size= 0; 174 | 175 | switch (type) 176 | { 177 | case SXML_STARTTAG: state->taglevel++; break; 178 | 179 | case SXML_ENDTAG: 180 | assert (0 < state->taglevel); 181 | state->taglevel--; 182 | break; 183 | 184 | default: 185 | break; 186 | } 187 | 188 | return TRUE; 189 | } 190 | 191 | static sxmlerr_t state_setpos (sxml_t* state, const sxml_args_t* args, const char* ptr) 192 | { 193 | state->bufferpos= buffer_tooffset (args, ptr); 194 | return (state->ntokens <= args->num_tokens) ? SXML_SUCCESS : SXML_ERROR_TOKENSFULL; 195 | } 196 | 197 | #define state_commit(dest,src) memcpy ((dest), (src), sizeof (sxml_t)) 198 | 199 | /* 200 | MARK: Parse 201 | 202 | SXML does minimal validation of the input data. 203 | SXML_ERROR_XMLSTRICT is returned if some simple XML validation tests fail. 204 | SXML_ERROR_XMLINVALID is instead returned if the invalid XML data is serious enough to prevent the parser from continuing. 205 | We currently make no difference between these two - but they are marked differently in case we wish to do so in the future. 206 | */ 207 | 208 | #define SXML_ERROR_XMLSTRICT SXML_ERROR_XMLINVALID 209 | 210 | #define ENTITY_MAXLEN 8 /* Σ */ 211 | #define MIN(a,b) ((a) < (b) ? (a) : (b)) 212 | 213 | static sxmlerr_t parse_characters (sxml_t* state, sxml_args_t* args, const char* end) 214 | { 215 | const char* start= buffer_fromoffset (args, state->bufferpos); 216 | const char* limit, *colon, *ampr= str_findchr (start, end, '&'); 217 | assert (end <= buffer_getend (args)); 218 | 219 | if (ampr != start) 220 | state_pushtoken (state, args, SXML_CHARACTER, start, ampr); 221 | 222 | if (ampr == end) 223 | return state_setpos (state, args, ampr); 224 | 225 | /* limit entity to search to ENTITY_MAXLEN */ 226 | limit= MIN (ampr + ENTITY_MAXLEN, end); 227 | colon= str_findchr (ampr, limit, ';'); 228 | if (colon == limit) 229 | return (limit == end) ? 
SXML_ERROR_BUFFERDRY : SXML_ERROR_XMLINVALID; 230 | 231 | start= colon + 1; 232 | state_pushtoken (state, args, SXML_CHARACTER, ampr, start); 233 | return state_setpos (state, args, start); 234 | } 235 | 236 | static sxmlerr_t parse_attrvalue (sxml_t* state, sxml_args_t* args, const char* end) 237 | { 238 | while (buffer_fromoffset (args, state->bufferpos) != end) 239 | { 240 | sxmlerr_t err= parse_characters (state, args, end); 241 | if (err != SXML_SUCCESS) 242 | return err; 243 | } 244 | 245 | return SXML_SUCCESS; 246 | } 247 | 248 | static sxmlerr_t parse_attributes (sxml_t* state, sxml_args_t* args) 249 | { 250 | const char* start= buffer_fromoffset (args, state->bufferpos); 251 | const char* end= buffer_getend (args); 252 | const char* name= str_ltrim (start, end); 253 | 254 | UINT ntokens= state->ntokens; 255 | assert (0 < ntokens); 256 | 257 | while (name != end && ISALPHA (*name)) 258 | { 259 | const char* eq, *space, *quot, *value; 260 | sxmlerr_t err; 261 | 262 | /* Attribute name */ 263 | eq= str_findchr (name, end, '='); 264 | if (eq == end) 265 | return SXML_ERROR_BUFFERDRY; 266 | 267 | space= str_rtrim (name, eq); 268 | state_pushtoken (state, args, SXML_CDATA, name, space); 269 | 270 | /* Attribute value */ 271 | quot= str_ltrim (eq + 1, end); 272 | if (quot == end) 273 | return SXML_ERROR_BUFFERDRY; 274 | else if (*quot != '\'' && *quot != '"') 275 | return SXML_ERROR_XMLINVALID; 276 | 277 | value= quot + 1; 278 | quot= str_findchr (value, end, *quot); 279 | if (quot == end) 280 | return SXML_ERROR_BUFFERDRY; 281 | 282 | state_setpos (state, args, value); 283 | err= parse_attrvalue (state, args, quot); 284 | if (err != SXML_SUCCESS) 285 | return err; 286 | 287 | /* --- */ 288 | 289 | name= str_ltrim (quot + 1, end); 290 | } 291 | 292 | { 293 | sxmltok_t* token= args->tokens + (ntokens - 1); 294 | token->size= (unsigned short) (state->ntokens - ntokens); 295 | } 296 | 297 | return state_setpos (state, args, name); 298 | } 299 | 300 | /* --- */ 301 | 
302 | #define TAG_LEN(str) (sizeof (str) - 1) 303 | #define TAG_MINSIZE 3 304 | 305 | static sxmlerr_t parse_comment (sxml_t* state, sxml_args_t* args) 306 | { 307 | static const char STARTTAG[]= ""; 309 | 310 | const char* dash; 311 | const char* start= buffer_fromoffset (args, state->bufferpos); 312 | const char* end= buffer_getend (args); 313 | if (end - start < TAG_LEN (STARTTAG)) 314 | return SXML_ERROR_BUFFERDRY; 315 | 316 | if (!str_startswith (start, end, STARTTAG)) 317 | return SXML_ERROR_XMLINVALID; 318 | 319 | start+= TAG_LEN (STARTTAG); 320 | dash= str_findstr (start, end, ENDTAG); 321 | if (dash == end) 322 | return SXML_ERROR_BUFFERDRY; 323 | 324 | state_pushtoken (state, args, SXML_COMMENT, start, dash); 325 | return state_setpos (state, args, dash + TAG_LEN (ENDTAG)); 326 | } 327 | 328 | static sxmlerr_t parse_instruction (sxml_t* state, sxml_args_t* args) 329 | { 330 | static const char STARTTAG[]= ""; 332 | 333 | sxmlerr_t err; 334 | const char* quest, *space; 335 | const char* start= buffer_fromoffset (args, state->bufferpos); 336 | const char* end= buffer_getend (args); 337 | assert (TAG_MINSIZE <= end - start); 338 | 339 | if (!str_startswith (start, end, STARTTAG)) 340 | return SXML_ERROR_XMLINVALID; 341 | 342 | start+= TAG_LEN (STARTTAG); 343 | space= str_find_notalnum (start, end); 344 | if (space == end) 345 | return SXML_ERROR_BUFFERDRY; 346 | 347 | state_pushtoken (state, args, SXML_INSTRUCTION, start, space); 348 | 349 | state_setpos (state, args, space); 350 | err= parse_attributes (state, args); 351 | if (err != SXML_SUCCESS) 352 | return err; 353 | 354 | quest= buffer_fromoffset (args, state->bufferpos); 355 | if (end - quest < TAG_LEN (ENDTAG)) 356 | return SXML_ERROR_BUFFERDRY; 357 | 358 | if (!str_startswith (quest, end, ENDTAG)) 359 | return SXML_ERROR_XMLINVALID; 360 | 361 | return state_setpos (state, args, quest + TAG_LEN (ENDTAG)); 362 | } 363 | 364 | static sxmlerr_t parse_doctype (sxml_t* state, sxml_args_t* args) 365 | { 
366 | static const char STARTTAG[]= ""; 368 | 369 | const char* bracket; 370 | const char* start= buffer_fromoffset (args, state->bufferpos); 371 | const char* end= buffer_getend (args); 372 | if (end - start < TAG_LEN (STARTTAG)) 373 | return SXML_ERROR_BUFFERDRY; 374 | 375 | if (!str_startswith (start, end, STARTTAG)) 376 | return SXML_ERROR_BUFFERDRY; 377 | 378 | start+= TAG_LEN (STARTTAG); 379 | bracket= str_findstr (start, end, ENDTAG); 380 | if (bracket == end) 381 | return SXML_ERROR_BUFFERDRY; 382 | 383 | state_pushtoken (state, args, SXML_DOCTYPE, start, bracket); 384 | return state_setpos (state, args, bracket + TAG_LEN (ENDTAG)); 385 | } 386 | 387 | static sxmlerr_t parse_start (sxml_t* state, sxml_args_t* args) 388 | { 389 | sxmlerr_t err; 390 | const char* gt, *name, *space; 391 | const char* start= buffer_fromoffset (args, state->bufferpos); 392 | const char* end= buffer_getend (args); 393 | assert (TAG_MINSIZE <= end - start); 394 | 395 | if (!(start[0] == '<' && ISALPHA (start[1]))) 396 | return SXML_ERROR_XMLINVALID; 397 | 398 | /* --- */ 399 | 400 | name= start + 1; 401 | space= str_find_notalnum (name, end); 402 | if (space == end) 403 | return SXML_ERROR_BUFFERDRY; 404 | 405 | state_pushtoken (state, args, SXML_STARTTAG, name, space); 406 | 407 | state_setpos (state, args, space); 408 | err= parse_attributes (state, args); 409 | if (err != SXML_SUCCESS) 410 | return err; 411 | 412 | /* --- */ 413 | 414 | gt= buffer_fromoffset (args, state->bufferpos); 415 | 416 | if (gt != end && *gt == '/') 417 | { 418 | state_pushtoken (state, args, SXML_ENDTAG, name, space); 419 | gt++; 420 | } 421 | 422 | if (gt == end) 423 | return SXML_ERROR_BUFFERDRY; 424 | 425 | if (*gt != '>') 426 | return SXML_ERROR_XMLINVALID; 427 | 428 | return state_setpos (state, args, gt + 1); 429 | } 430 | 431 | static sxmlerr_t parse_end (sxml_t* state, sxml_args_t* args) 432 | { 433 | const char* gt, *space; 434 | const char* start= buffer_fromoffset (args, state->bufferpos); 
435 | const char* end= buffer_getend (args); 436 | assert (TAG_MINSIZE <= end - start); 437 | 438 | if (!(str_startswith (start, end, "'); 443 | if (gt == end) 444 | return SXML_ERROR_BUFFERDRY; 445 | 446 | /* Test for no characters beyond elem name */ 447 | space= str_find_notalnum (start, gt); 448 | if (str_ltrim (space, gt) != gt) 449 | return SXML_ERROR_XMLSTRICT; 450 | 451 | state_pushtoken (state, args, SXML_ENDTAG, start, space); 452 | return state_setpos (state, args, gt + 1); 453 | } 454 | 455 | static sxmlerr_t parse_cdata (sxml_t* state, sxml_args_t* args) 456 | { 457 | static const char STARTTAG[]= ""; 459 | 460 | const char* bracket; 461 | const char* start= buffer_fromoffset (args, state->bufferpos); 462 | const char* end= buffer_getend (args); 463 | if (end - start < TAG_LEN (STARTTAG)) 464 | return SXML_ERROR_BUFFERDRY; 465 | 466 | if (!str_startswith (start, end, STARTTAG)) 467 | return SXML_ERROR_XMLINVALID; 468 | 469 | start+= TAG_LEN (STARTTAG); 470 | bracket= str_findstr (start, end, ENDTAG); 471 | if (bracket == end) 472 | return SXML_ERROR_BUFFERDRY; 473 | 474 | state_pushtoken (state, args, SXML_CDATA, start, bracket); 475 | return state_setpos (state, args, bracket + TAG_LEN (ENDTAG)); 476 | } 477 | 478 | /* 479 | MARK: SXML 480 | Public API inspired by the JSON parser JSMN ( http://zserge.com/jsmn.html ). 
481 | */ 482 | 483 | void sxml_init (sxml_t *state) 484 | { 485 | state->bufferpos= 0; 486 | state->ntokens= 0; 487 | state->taglevel= 0; 488 | } 489 | 490 | #define ROOT_FOUND(state) (0 < (state)->taglevel) 491 | #define ROOT_PARSED(state) ((state)->taglevel == 0) 492 | 493 | sxmlerr_t sxml_parse(sxml_t *state, const char *buffer, UINT bufferlen, sxmltok_t tokens[], UINT num_tokens) 494 | { 495 | sxml_t temp= *state; 496 | const char* end= buffer + bufferlen; 497 | 498 | sxml_args_t args; 499 | args.buffer= buffer; 500 | args.bufferlen= bufferlen; 501 | args.tokens= tokens; 502 | args.num_tokens= num_tokens; 503 | 504 | /* --- */ 505 | 506 | while (!ROOT_FOUND (&temp)) 507 | { 508 | sxmlerr_t err; 509 | const char* start= buffer_fromoffset (&args, temp.bufferpos); 510 | const char* lt= str_ltrim (start, end); 511 | state_setpos (&temp, &args, lt); 512 | state_commit (state, &temp); 513 | 514 | if (end - lt < TAG_MINSIZE) 515 | return SXML_ERROR_BUFFERDRY; 516 | 517 | /* --- */ 518 | 519 | if (*lt != '<') 520 | return SXML_ERROR_XMLINVALID; 521 | 522 | switch (lt[1]) 523 | { 524 | case '?': err= parse_instruction (&temp, &args); break; 525 | case '!': err= parse_doctype (&temp, &args); break; 526 | default: err= parse_start (&temp, &args); break; 527 | } 528 | 529 | if (err != SXML_SUCCESS) 530 | return err; 531 | 532 | state_commit (state, &temp); 533 | } 534 | 535 | /* --- */ 536 | 537 | while (!ROOT_PARSED (&temp)) 538 | { 539 | sxmlerr_t err; 540 | const char* start= buffer_fromoffset (&args, temp.bufferpos); 541 | const char* lt= str_findchr (start, end, '<'); 542 | while (buffer_fromoffset (&args, temp.bufferpos) != lt) 543 | { 544 | sxmlerr_t err= parse_characters (&temp, &args, lt); 545 | if (err != SXML_SUCCESS) 546 | return err; 547 | 548 | state_commit (state, &temp); 549 | } 550 | 551 | /* --- */ 552 | 553 | if (end - lt < TAG_MINSIZE) 554 | return SXML_ERROR_BUFFERDRY; 555 | 556 | switch (lt[1]) 557 | { 558 | case '?': err= parse_instruction (&temp, 
&args); break; 559 | case '/': err= parse_end (&temp, &args); break; 560 | case '!': err= (lt[2] == '-') ? parse_comment (&temp, &args) : parse_cdata (&temp, &args); break; 561 | default: err= parse_start (&temp, &args); break; 562 | } 563 | 564 | if (err != SXML_SUCCESS) 565 | return err; 566 | 567 | state_commit (state, &temp); 568 | } 569 | 570 | return SXML_SUCCESS; 571 | } 572 | -------------------------------------------------------------------------------- /sxml.h: -------------------------------------------------------------------------------- 1 | #ifndef _SXML_H_INCLUDED 2 | #define _SXML_H_INCLUDED 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | /* 9 | --- SXML --- 10 | Short description of how to use SXML for parsing XML text. 11 | 12 | SXML is a lightweight XML parser with no external dependencies. 13 | To parse XML text you only need to call one function: sxml_parse(). 14 | The function has the following return codes: 15 | */ 16 | 17 | typedef enum 18 | { 19 | SXML_ERROR_XMLINVALID= -1, /* Parser found invalid XML data - not much you can do beyond error reporting */ 20 | SXML_SUCCESS= 0, /* Parser has completed successfully - parsing of XML document is complete */ 21 | SXML_ERROR_BUFFERDRY= 1, /* Parser ran out of input data - refill buffer with more XML text to continue parsing */ 22 | SXML_ERROR_TOKENSFULL= 2 /* Parser has filled all the supplied tokens with data - provide more tokens for further output */ 23 | } sxmlerr_t; 24 | 25 | /* 26 | You provide sxml_parse() with a buffer of XML text for parsing. 27 | The parser will handle text data encoded in ascii, latin-1 and utf-8. 28 | It should also work with other encodings that are acsii extensions. 29 | 30 | sxml_parse() is reentrant. 31 | In the case of return code SXML_ERROR_BUFFERDRY or SXML_ERROR_TOKENSFULL, you are expected to call the function again after resolving the problem to continue parsing. 
32 | */ 33 | 34 | typedef struct sxml_t sxml_t; 35 | typedef struct sxmltok_t sxmltok_t; 36 | sxmlerr_t sxml_parse(sxml_t *parser, const char *buffer, unsigned bufferlen, sxmltok_t* tokens, unsigned num_tokens); 37 | 38 | /* 39 | The sxml_t object stores all data required for SXML to continue from where it left of. 40 | 41 | After calling sxml_parse() 'ntokens' tells you how many output tokens have been filled with data. 42 | Depending on how you resolve SXML_ERROR_BUFFERDRY or SXML_ERROR_TOKENSFULL you may need to modifiy 'bufferpos' and 'ntokens' to correctly reflect the new buffer and tokens you provide. 43 | */ 44 | 45 | struct sxml_t 46 | { 47 | unsigned bufferpos; /* Current offset into buffer - all XML data before this position has been successfully parsed */ 48 | unsigned ntokens; /* Number of tokens filled with valid data by the parser */ 49 | unsigned taglevel; /* Used internally - keeps track of number of unclosed XML elements to detect start and end of document */ 50 | }; 51 | 52 | /* 53 | Before you call sxml_parse() for the first time, you have to initialize the parser object. 54 | You may easily do that with the provided function sxml_init(). 55 | */ 56 | 57 | void sxml_init(sxml_t *parser); 58 | 59 | /* 60 | Unlike most XML parsers, SXML does not use SAX callbacks or allocate a DOM tree. 61 | Instead you will have to interpret the XML structure through a table of tokens. 
62 | 63 | A token can describe any of the following types: 64 | */ 65 | 66 | typedef enum 67 | { 68 | SXML_STARTTAG, /* Start tag describes the opening of an XML element */ 69 | SXML_ENDTAG, /* End tag is the closing of an XML element */ 70 | 71 | SXML_CHARACTER, /* Character data may be escaped - check if the first character is an ampersand '&' to identity a XML character reference */ 72 | SXML_CDATA, /* Character data should be read as is - it is not escaped */ 73 | 74 | /* And some other token types you might be interested in: */ 75 | SXML_INSTRUCTION, /* Can be used to identity the text encoding */ 76 | SXML_DOCTYPE, /* If you'd like to interpret DTD data */ 77 | SXML_COMMENT /* Most likely you don't care about comments - but this is where you'll find them */ 78 | } sxmltype_t; 79 | 80 | /* 81 | If you are familiar with the structure of an XML document most of these type names should sound familiar. 82 | 83 | A token has the following data: 84 | */ 85 | 86 | struct sxmltok_t 87 | { 88 | unsigned short type; /* A token is one of the above sxmltype_t */ 89 | unsigned short size; /* The following number of tokens contain additional data related to this token - used for describing attributes */ 90 | 91 | /* 'startpos' and 'endpos' together define a range within the provided text buffer - use these offsets with the buffer to extract the text value of the token */ 92 | unsigned startpos; 93 | unsigned endpos; 94 | }; 95 | 96 | /* 97 | Let's walk through how to correctly interpret a token of type SXML_STARTTAG. 98 | 99 | 100 | 101 | The element name ('example') can be extracted from the text buffer using 'startpos' and 'endpos'. 102 | 103 | The attributes of the XML element are described in the following 'size' tokens. 104 | Each attribute is divided by a token of type SXML_CDATA - this is the attribute key. 105 | There will be zero or more tokens of type SXML_CHARACTER following the key - together they describe one attribute value. 
106 | 107 | In our example you will get the following number of SXML_CHARACTER tokens after the attribute key: 108 | * 'zero' will use no tokens to describe the empty attribute value. 109 | * 'one' will have one token describing the attribute value ('Hello there!'). 110 | * 'three' will have three tokens describing the attribute value ('Me, Myself ')('&')(' I') 111 | 112 | In our example the token of type SXML_STARTTAG will have a 'size' of 7 (3 SXML_CDATA and 4 SXML_CHARACTER). 113 | When processing the tokens do not forget about 'size' - for any token you want to skip, also remember to skip the additional token data! 114 | */ 115 | 116 | #ifdef __cplusplus 117 | } 118 | #endif 119 | 120 | /* 121 | Congratulations on making it this far - now might be a good time to check out sxml_test.c for an example of using SXML. 122 | */ 123 | 124 | #endif /* _SXML_H_INCLUDED */ 125 | -------------------------------------------------------------------------------- /sxml_test.c: -------------------------------------------------------------------------------- 1 | #include "sxml.h" 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | typedef unsigned UINT; 9 | 10 | /* 11 | MARK: Pretty print XML 12 | Example of simple processing of parsed token output. 
13 | */ 14 | static void print_indent (UINT indentlevel) 15 | { 16 | if (0 < indentlevel) 17 | { 18 | char fmt[8]; 19 | sprintf (fmt, "%%%ds", indentlevel * 3); 20 | printf (fmt, " "); 21 | } 22 | } 23 | 24 | static void print_tokenvalue (const char* buffer, const sxmltok_t* token) 25 | { 26 | char fmt[8]; 27 | sprintf (fmt, "%%.%ds", token->endpos - token->startpos); 28 | printf (fmt, buffer + token->startpos); 29 | } 30 | 31 | static UINT print_chartokens (const char* buffer, const sxmltok_t tokens[], UINT num_tokens) 32 | { 33 | UINT i; 34 | 35 | for (i= 0; i < num_tokens; i++) 36 | { 37 | const char* ampr; 38 | 39 | const sxmltok_t* token= tokens + i; 40 | if (token->type != SXML_CHARACTER) 41 | return i; 42 | 43 | ampr= buffer + token->startpos; 44 | assert (0 < token->endpos - token->startpos); 45 | 46 | if (*ampr != '&') 47 | { 48 | print_tokenvalue (buffer, token); 49 | continue; 50 | } 51 | 52 | switch (ampr[1]) 53 | { 54 | case 'a': printf ((ampr[2] == 'm') ? "&" : "'"); break; 55 | case 'g': printf (">"); break; 56 | case 'l': printf ("<"); break; 57 | case 'q': printf ("\""); break; 58 | default: 59 | assert (0); 60 | break; 61 | } 62 | } 63 | 64 | return num_tokens; 65 | } 66 | 67 | static void print_prettyxml (const char* buffer, const sxmltok_t tokens[], UINT num_tokens, UINT* indentlevel) 68 | { 69 | UINT i; 70 | for (i= 0; i < num_tokens; i++) 71 | { 72 | const sxmltok_t* token= tokens + i; 73 | switch (token->type) 74 | { 75 | case SXML_STARTTAG: 76 | { 77 | UINT j; 78 | 79 | print_indent ((*indentlevel)++); 80 | printf ("<"); 81 | print_tokenvalue (buffer, token); 82 | 83 | /* elem attributes are listed in the following tokens */ 84 | for (j= 0; j < token->size; j++) 85 | { 86 | printf (" "); 87 | print_tokenvalue (buffer, &token[j + 1]); 88 | printf ("='"); 89 | j+= print_chartokens (buffer, &token[j + 2], token->size - (j + 1)); 90 | printf ("'"); 91 | } 92 | 93 | puts (">"); 94 | break; 95 | } 96 | 97 | case SXML_ENDTAG: 98 | print_indent 
(--(*indentlevel)); 99 | printf (""); 102 | break; 103 | 104 | 105 | /* Other token types you might be interested in: */ 106 | /* 107 | case SXML_INSTRUCTION 108 | case SXML_DOCTYPE: 109 | case SXML_COMMENT: 110 | case SXML_CDATA: 111 | case SXML_CHARACTER: 112 | */ 113 | 114 | default: 115 | break; 116 | } 117 | 118 | /* 119 | Tokens may contain additional data. Skip 'size' tokens to get the next token to proccess. 120 | (see SXML_STARTTAG case above as an example of how attributes are specified) 121 | */ 122 | i+= token->size; 123 | } 124 | } 125 | 126 | /* 127 | MARK: Utility functions 128 | Useful for error reporting. 129 | */ 130 | static UINT count_lines (const char* buffer, UINT bufferlen) 131 | { 132 | const char* end= buffer + bufferlen; 133 | const char* it= buffer; 134 | UINT i; 135 | 136 | for (i= 0; ; i++) 137 | { 138 | it= (const char*) memchr (it, '\n', end - it); 139 | if (it == NULL) 140 | return i; 141 | 142 | it++; 143 | } 144 | } 145 | 146 | 147 | /* 148 | MARK: main 149 | Minimal example showing how you may use SXML within a constrained environment with a fixed size input and output buffer. 150 | */ 151 | 152 | #define MIN(a,b) (((a) < (b)) ? 
(a) : (b)) 153 | #define COUNT(arr) (sizeof (arr) / sizeof ((arr)[0])) 154 | 155 | #define BUFFER_MAXLEN 1024 156 | 157 | 158 | int main (int argc, const char* argv[]) 159 | { 160 | /* Input XML text */ 161 | char buffer[BUFFER_MAXLEN]; 162 | UINT bufferlen= 0; 163 | 164 | /* Output token table */ 165 | sxmltok_t tokens[128]; 166 | 167 | /* Used in example for pretty printing and error reporting */ 168 | UINT indent= 0, lineno= 1; 169 | 170 | const char* path; 171 | FILE* file; 172 | 173 | /* Parser object stores all data required for SXML to be reentrant */ 174 | sxml_t parser; 175 | sxml_init (&parser); 176 | 177 | /* Usage: sxml_test.exe test.xml */ 178 | assert (argc == 2); 179 | path= argv[1]; 180 | file= fopen (path, "rb"); 181 | assert (file != NULL); 182 | 183 | for (;;) 184 | { 185 | sxmlerr_t err= sxml_parse (&parser, buffer, bufferlen, tokens, COUNT (tokens)); 186 | if (err == SXML_SUCCESS) 187 | break; 188 | 189 | switch (err) 190 | { 191 | case SXML_ERROR_TOKENSFULL: 192 | { 193 | /* 194 | Need to give parser more space for tokens to continue parsing. 195 | We choose here to reuse the existing token table once tokens have been processed. 196 | 197 | Example of some processing of the token data. 198 | Instead you might be interested in creating your own DOM structure 199 | or other processing of XML data useful to your application. 200 | */ 201 | print_prettyxml (buffer, tokens, parser.ntokens, &indent); 202 | 203 | /* Parser can now safely reuse all of the token table */ 204 | parser.ntokens= 0; 205 | break; 206 | } 207 | 208 | case SXML_ERROR_BUFFERDRY: 209 | { 210 | /* 211 | Parser expects more XML data to continue parsing. 212 | We choose here to reuse the existing buffer array. 
213 | */ 214 | size_t len; 215 | 216 | /* Need to processs existing tokens before buffer is overwritten with new data */ 217 | print_prettyxml (buffer, tokens, parser.ntokens, &indent); 218 | parser.ntokens= 0; 219 | 220 | /* For error reporting */ 221 | lineno+= count_lines(buffer, parser.bufferpos); 222 | 223 | /* 224 | Example of how to reuse buffer array. 225 | Move unprocessed buffer content to start of array 226 | */ 227 | bufferlen-= parser.bufferpos; 228 | memmove (buffer, buffer + parser.bufferpos, bufferlen); 229 | 230 | /* 231 | If your buffer is smaller than the size required to complete a token the parser will endlessly call SXML_ERROR_BUFFERDRY. 232 | You will most likely encounter this problem if you have XML comments longer than BUFFER_MAXLEN in size. 233 | SXML_CHARACTER solves this problem by dividing the data over multiple tokens, but other token types remain affected. 234 | */ 235 | assert (bufferlen < BUFFER_MAXLEN); 236 | 237 | /* Fill remaining buffer with new data from file */ 238 | len= fread (buffer + bufferlen, 1, BUFFER_MAXLEN - bufferlen, file); 239 | assert (0 < len); 240 | bufferlen+= len; 241 | 242 | /* Parser will now have to read from beginning of buffer to contiue */ 243 | parser.bufferpos= 0; 244 | break; 245 | } 246 | 247 | case SXML_ERROR_XMLINVALID: 248 | { 249 | char fmt[8]; 250 | 251 | /* Example of some simple error reporting */ 252 | lineno+= count_lines (buffer, parser.bufferpos); 253 | fprintf(stderr, "Error while parsing line %d:\n", lineno); 254 | 255 | /* Print out contents of line containing the error */ 256 | sprintf (fmt, "%%.%ds", MIN (bufferlen - parser.bufferpos, 72)); 257 | fprintf (stderr, fmt, buffer + parser.bufferpos); 258 | 259 | abort(); 260 | break; 261 | } 262 | 263 | default: 264 | assert (0); 265 | break; 266 | } 267 | } 268 | 269 | fclose (file); 270 | 271 | /* Sucessfully parsed XML file - flush remainig token output */ 272 | print_prettyxml (buffer, tokens, parser.ntokens, &indent); 273 | return 0; 
274 | } 275 | --------------------------------------------------------------------------------