├── README.md
├── UNLICENSE
├── sxml.c
├── sxml.h
└── sxml_test.c
/README.md:
--------------------------------------------------------------------------------
1 | SXML
2 | ====
3 | A Small XML parser written in C.
4 |
5 | Inspired by the clean API design used by the [JSON parser JSMN](http://zserge.bitbucket.org/jsmn.html), SXML has the same design goal and features for XML parsing. Go read about JSMN's Philosophy and Features to get an idea of how it differs from other parsers - I'll wait right here.
6 |
7 | Features
8 | --------
9 | Here is a list of features SXML shares with JSMN.
10 |
11 | * compatible with C89
12 | * no dependencies
13 | * highly portable
14 | * about 420 lines of code
15 | * extremely small code footprint
16 | * API contains only 2 functions
17 | * no dynamic memory allocation
18 | * incremental single-pass parsing
19 |
20 | Usage
21 | -----
22 | The header file is heavily commented and should be the first place to look to get started.
23 |
24 | Check out the file sxml_test.c for an example of using SXML within a constrained environment with a fixed sized input and output buffer.
25 |
26 | Limitations
27 | -----------
28 | In order to remain lightweight the parser has the following limitations:
29 |
30 | * Minimal XML syntax check during parsing
31 | * Input text must be ascii or an [ascii extension](http://en.wikipedia.org/wiki/Extended_ASCII) (latin-1 and utf-8 are examples of ascii extensions)
32 |
33 | Do contact me with suggestions if the limitations above are preventing you from using the parser.
34 |
35 | Alternatives
36 | ------------
37 | List of alternative lightweight XML parsers considered before writing my own.
38 |
39 | * [ezXML](http://ezxml.sourceforge.net/)
40 | * [FastXML](http://codesuppository.blogspot.com/2009/02/fastxml-extremely-lightweight-stream.html)
41 | * [TinyXml](http://www.grinninglizard.com/tinyxml2/)
42 | * [RapidXML](http://rapidxml.sourceforge.net/)
43 |
--------------------------------------------------------------------------------
/UNLICENSE:
--------------------------------------------------------------------------------
1 | This is free and unencumbered software released into the public domain.
2 |
3 | Anyone is free to copy, modify, publish, use, compile, sell, or
4 | distribute this software, either in source code form or as a compiled
5 | binary, for any purpose, commercial or non-commercial, and by any
6 | means.
7 |
8 | In jurisdictions that recognize copyright laws, the author or authors
9 | of this software dedicate any and all copyright interest in the
10 | software to the public domain. We make this dedication for the benefit
11 | of the public at large and to the detriment of our heirs and
12 | successors. We intend this dedication to be an overt act of
13 | relinquishment in perpetuity of all present and future rights to this
14 | software under copyright law.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 | OTHER DEALINGS IN THE SOFTWARE.
23 |
24 | For more information, please refer to <http://unlicense.org/>
25 |
--------------------------------------------------------------------------------
/sxml.c:
--------------------------------------------------------------------------------
1 | #include "sxml.h"
2 |
3 | /* The following functions will need to be replaced if you want no dependency to libc: */
4 | #include <string.h> /* memchr, memcmp, strlen, memcpy */
5 | #include <assert.h> /* assert */
6 |
7 | typedef unsigned UINT; /* same type as the 'unsigned' offsets and counters declared in sxml.h */
8 | typedef int BOOL; /* truth value - holds either FALSE or TRUE */
9 | #define FALSE 0
10 | #define TRUE (!FALSE)
11 |
12 | /*
13 | MARK: String
14 | String functions work within the memory range specified (excluding end).
15 | Returns 'end' if value not found.
16 | */
17 |
/*
    Locate the first occurrence of the ascii character 'c' within [start, end).
    Returns a pointer to the match, or 'end' when the character is not present.
*/
static const char* str_findchr (const char* start, const char* end, int c)
{
    const void* hit;

    assert (start <= end);
    /* the byte-wise memchr search is only valid for ascii characters within a utf-8 string */
    assert (0 <= c && c <= 127);

    hit= memchr (start, c, end - start);
    if (hit == NULL)
        return end;

    return (const char*) hit;
}
28 |
/*
    Locate the first occurrence of the nul-terminated string 'needle' within [start, end).
    'needle' must not be empty.
    Returns a pointer to the match, or 'end' when there is none.
*/
static const char* str_findstr (const char* start, const char* end, const char* needle)
{
    size_t needlelen;
    int first;
    assert (start <= end);

    needlelen= strlen (needle);
    assert (0 < needlelen);
    first= (unsigned char) needle[0];

    /* compare lengths rather than computing 'start + needlelen' - that pointer could lie beyond the end of the buffer */
    while (needlelen <= (size_t) (end - start))
    {
        /* only look at positions where a complete match still fits inside the range */
        const char* it= (const char*) memchr (start, first, (size_t) (end - start) - (needlelen - 1));
        if (it == NULL)
            break;

        if (memcmp (it, needle, needlelen) == 0)
            return it;

        start= it + 1;
    }

    return end;
}
53 |
54 | static BOOL str_startswith (const char* start, const char* end, const char* prefix)
55 | {
56 | long nbytes;
57 | assert (start <= end);
58 |
59 | nbytes= strlen (prefix);
60 | if (end - start < nbytes)
61 | return FALSE;
62 |
63 | return memcmp (prefix, start, nbytes) == 0;
64 | }
65 |
66 | /* http://www.w3.org/TR/xml11/#sec-common-syn */
67 |
68 | static BOOL WhiteSpace (int c)
69 | {
70 | switch (c)
71 | {
72 | case ' ': /* 0x20 */
73 | case '\t': /* 0x9 */
74 | case '\r': /* 0xD */
75 | case '\n': /* 0xA */
76 | return TRUE;
77 | }
78 |
79 | return FALSE;
80 | }
81 |
82 | static BOOL NameStartChar (int c)
83 | {
84 | /*
85 | We don't perform utf-8 decoding - just accept all characters with hight bit set
86 | (0xC0 <= c && c <= 0xD6) || (0xD8 <= c && c <= 0xF6) || (0xF8 <= c && c <= 0x2FF) ||
87 | (0x370 <= c && c <= 0x37D) || (0x37F <= c && c <= 0x1FFF) || (0x200C <= c && c <= 0x200D) ||
88 | (0x2070 <= c && c <= 0x218F) || (0x2C00 <= c && c <= 0x2FEF) || (0x3001 <= c && c <= 0xD7FF) ||
89 | (0xF900 <= c && c <= 0xFDCF) || (0xFDF0 <= c && c <= 0xFFFD) || (0x10000 <= c && c <= 0xEFFFF);
90 | */
91 | if (0x80 <= c)
92 | return TRUE;
93 |
94 | return c == ':' || ('A' <= c && c <= 'Z') || c == '_' || ('a' <= c && c <= 'z');
95 | }
96 |
97 | static BOOL NameChar (int c)
98 | {
99 | return NameStartChar (c) ||
100 | c == '-' || c == '.' || ('0' <= c && c <= '9') ||
101 | c == 0xB7 || (0x0300 <= c && c <= 0x036F) || (0x203F <= c && c <= 0x2040);
102 | }
103 |
104 | #define ISSPACE(c) (WhiteSpace(((unsigned char)(c)))) /* the cast keeps bytes with the high bit set from arriving as negative values */
105 | #define ISALPHA(c) (NameStartChar(((unsigned char)(c)))) /* valid first character of an XML name */
106 | #define ISALNUM(c) (NameChar(((unsigned char)(c)))) /* valid character inside an XML name */
107 |
/* Left trim: returns the first position in [start, end) that is not whitespace, or 'end' */
static const char* str_ltrim (const char* start, const char* end)
{
    const char* cursor= start;
    assert (start <= end);

    while (cursor != end && ISSPACE (*cursor))
        cursor++;

    return cursor;
}
119 |
/* Right trim: returns the position just behind the last non-whitespace character of [start, end), or 'start' */
static const char* str_rtrim (const char* start, const char* end)
{
    const char* cursor= end;
    assert (start <= end);

    /* step backwards for as long as the character in front of the cursor is whitespace */
    while (cursor != start && ISSPACE (cursor[-1]))
        cursor--;

    return cursor;
}
135 |
/* Returns the first position in [start, end) that cannot be part of an XML name, or 'end' */
static const char* str_find_notalnum (const char* start, const char* end)
{
    const char* cursor= start;
    assert (start <= end);

    while (cursor != end && ISALNUM (*cursor))
        cursor++;

    return cursor;
}
146 |
147 | /* MARK: State */
148 |
149 | /* Collect the buffer and token table passed to sxml_parse() in a structure for convenience */
150 | typedef struct
151 | {
152 | const char* buffer; /* XML text being parsed */
153 | UINT bufferlen; /* number of bytes available in 'buffer' */
154 | sxmltok_t* tokens; /* output token table */
155 | UINT num_tokens; /* capacity of 'tokens' */
156 | } sxml_args_t;
157 |
158 | #define buffer_fromoffset(args,i) ((args)->buffer + (i)) /* buffer offset -> pointer */
159 | #define buffer_tooffset(args,ptr) (unsigned) ((ptr) - (args)->buffer) /* pointer -> buffer offset */
160 | #define buffer_getend(args) ((args)->buffer + (args)->bufferlen) /* one past the last byte of the buffer */
161 |
162 | static BOOL state_pushtoken (sxml_t* state, sxml_args_t* args, sxmltype_t type, const char* start, const char* end)
163 | {
164 | sxmltok_t* token;
165 | UINT i= state->ntokens++;
166 | if (args->num_tokens < state->ntokens)
167 | return FALSE;
168 |
169 | token= &args->tokens[i];
170 | token->type= type;
171 | token->startpos= buffer_tooffset (args, start);
172 | token->endpos= buffer_tooffset (args, end);
173 | token->size= 0;
174 |
175 | switch (type)
176 | {
177 | case SXML_STARTTAG: state->taglevel++; break;
178 |
179 | case SXML_ENDTAG:
180 | assert (0 < state->taglevel);
181 | state->taglevel--;
182 | break;
183 |
184 | default:
185 | break;
186 | }
187 |
188 | return TRUE;
189 | }
190 |
191 | static sxmlerr_t state_setpos (sxml_t* state, const sxml_args_t* args, const char* ptr)
192 | {
193 | state->bufferpos= buffer_tooffset (args, ptr);
194 | return (state->ntokens <= args->num_tokens) ? SXML_SUCCESS : SXML_ERROR_TOKENSFULL;
195 | }
196 |
197 | #define state_commit(dest,src) memcpy ((dest), (src), sizeof (sxml_t)) /* copy the whole parser state from 'src' to 'dest' */
198 |
199 | /*
200 | MARK: Parse
201 |
202 | SXML does minimal validation of the input data.
203 | SXML_ERROR_XMLSTRICT is returned if some simple XML validation tests fail.
204 | SXML_ERROR_XMLINVALID is instead returned if the invalid XML data is serious enough to prevent the parser from continuing.
205 | We currently make no difference between these two - but they are marked differently in case we wish to do so in the future.
206 | */
207 |
208 | #define SXML_ERROR_XMLSTRICT SXML_ERROR_XMLINVALID
209 |
210 | #define ENTITY_MAXLEN 8 /* longest reference searched for, counted from '&' up to and including ';' - e.g. "&#x03A3;" */
211 | #define MIN(a,b) ((a) < (b) ? (a) : (b))
212 |
213 | static sxmlerr_t parse_characters (sxml_t* state, sxml_args_t* args, const char* end)
214 | {
215 | const char* start= buffer_fromoffset (args, state->bufferpos);
216 | const char* limit, *colon, *ampr= str_findchr (start, end, '&');
217 | assert (end <= buffer_getend (args));
218 |
219 | if (ampr != start)
220 | state_pushtoken (state, args, SXML_CHARACTER, start, ampr);
221 |
222 | if (ampr == end)
223 | return state_setpos (state, args, ampr);
224 |
225 | /* limit entity to search to ENTITY_MAXLEN */
226 | limit= MIN (ampr + ENTITY_MAXLEN, end);
227 | colon= str_findchr (ampr, limit, ';');
228 | if (colon == limit)
229 | return (limit == end) ? SXML_ERROR_BUFFERDRY : SXML_ERROR_XMLINVALID;
230 |
231 | start= colon + 1;
232 | state_pushtoken (state, args, SXML_CHARACTER, ampr, start);
233 | return state_setpos (state, args, start);
234 | }
235 |
236 | static sxmlerr_t parse_attrvalue (sxml_t* state, sxml_args_t* args, const char* end)
237 | {
238 | while (buffer_fromoffset (args, state->bufferpos) != end)
239 | {
240 | sxmlerr_t err= parse_characters (state, args, end);
241 | if (err != SXML_SUCCESS)
242 | return err;
243 | }
244 |
245 | return SXML_SUCCESS;
246 | }
247 |
248 | static sxmlerr_t parse_attributes (sxml_t* state, sxml_args_t* args)
249 | {
250 | const char* start= buffer_fromoffset (args, state->bufferpos);
251 | const char* end= buffer_getend (args);
252 | const char* name= str_ltrim (start, end);
253 |
254 | UINT ntokens= state->ntokens;
255 | assert (0 < ntokens);
256 |
257 | while (name != end && ISALPHA (*name))
258 | {
259 | const char* eq, *space, *quot, *value;
260 | sxmlerr_t err;
261 |
262 | /* Attribute name */
263 | eq= str_findchr (name, end, '=');
264 | if (eq == end)
265 | return SXML_ERROR_BUFFERDRY;
266 |
267 | space= str_rtrim (name, eq);
268 | state_pushtoken (state, args, SXML_CDATA, name, space);
269 |
270 | /* Attribute value */
271 | quot= str_ltrim (eq + 1, end);
272 | if (quot == end)
273 | return SXML_ERROR_BUFFERDRY;
274 | else if (*quot != '\'' && *quot != '"')
275 | return SXML_ERROR_XMLINVALID;
276 |
277 | value= quot + 1;
278 | quot= str_findchr (value, end, *quot);
279 | if (quot == end)
280 | return SXML_ERROR_BUFFERDRY;
281 |
282 | state_setpos (state, args, value);
283 | err= parse_attrvalue (state, args, quot);
284 | if (err != SXML_SUCCESS)
285 | return err;
286 |
287 | /* --- */
288 |
289 | name= str_ltrim (quot + 1, end);
290 | }
291 |
292 | {
293 | sxmltok_t* token= args->tokens + (ntokens - 1);
294 | token->size= (unsigned short) (state->ntokens - ntokens);
295 | }
296 |
297 | return state_setpos (state, args, name);
298 | }
299 |
300 | /* --- */
301 |
302 | #define TAG_LEN(str) (sizeof (str) - 1) /* length of a char array holding a string, not counting the terminating nul */
303 | #define TAG_MINSIZE 3 /* sxml_parse() wants this many bytes available at a tag before it inspects the characters after '<' */
304 |
305 | static sxmlerr_t parse_comment (sxml_t* state, sxml_args_t* args)
306 | {
307 | static const char STARTTAG[]= "";
309 |
310 | const char* dash;
311 | const char* start= buffer_fromoffset (args, state->bufferpos);
312 | const char* end= buffer_getend (args);
313 | if (end - start < TAG_LEN (STARTTAG))
314 | return SXML_ERROR_BUFFERDRY;
315 |
316 | if (!str_startswith (start, end, STARTTAG))
317 | return SXML_ERROR_XMLINVALID;
318 |
319 | start+= TAG_LEN (STARTTAG);
320 | dash= str_findstr (start, end, ENDTAG);
321 | if (dash == end)
322 | return SXML_ERROR_BUFFERDRY;
323 |
324 | state_pushtoken (state, args, SXML_COMMENT, start, dash);
325 | return state_setpos (state, args, dash + TAG_LEN (ENDTAG));
326 | }
327 |
328 | static sxmlerr_t parse_instruction (sxml_t* state, sxml_args_t* args)
329 | {
330 | static const char STARTTAG[]= "";
331 | static const char ENDTAG[]= "?>";
332 |
333 | sxmlerr_t err;
334 | const char* quest, *space;
335 | const char* start= buffer_fromoffset (args, state->bufferpos);
336 | const char* end= buffer_getend (args);
337 | assert (TAG_MINSIZE <= end - start);
338 |
339 | if (!str_startswith (start, end, STARTTAG))
340 | return SXML_ERROR_XMLINVALID;
341 |
342 | start+= TAG_LEN (STARTTAG);
343 | space= str_find_notalnum (start, end);
344 | if (space == end)
345 | return SXML_ERROR_BUFFERDRY;
346 |
347 | state_pushtoken (state, args, SXML_INSTRUCTION, start, space);
348 |
349 | state_setpos (state, args, space);
350 | err= parse_attributes (state, args);
351 | if (err != SXML_SUCCESS)
352 | return err;
353 |
354 | quest= buffer_fromoffset (args, state->bufferpos);
355 | if (end - quest < TAG_LEN (ENDTAG))
356 | return SXML_ERROR_BUFFERDRY;
357 |
358 | if (!str_startswith (quest, end, ENDTAG))
359 | return SXML_ERROR_XMLINVALID;
360 |
361 | return state_setpos (state, args, quest + TAG_LEN (ENDTAG));
362 | }
363 |
364 | static sxmlerr_t parse_doctype (sxml_t* state, sxml_args_t* args)
365 | {
366 | static const char STARTTAG[]= "";
368 |
369 | const char* bracket;
370 | const char* start= buffer_fromoffset (args, state->bufferpos);
371 | const char* end= buffer_getend (args);
372 | if (end - start < TAG_LEN (STARTTAG))
373 | return SXML_ERROR_BUFFERDRY;
374 |
375 | if (!str_startswith (start, end, STARTTAG))
376 | return SXML_ERROR_BUFFERDRY;
377 |
378 | start+= TAG_LEN (STARTTAG);
379 | bracket= str_findstr (start, end, ENDTAG);
380 | if (bracket == end)
381 | return SXML_ERROR_BUFFERDRY;
382 |
383 | state_pushtoken (state, args, SXML_DOCTYPE, start, bracket);
384 | return state_setpos (state, args, bracket + TAG_LEN (ENDTAG));
385 | }
386 |
387 | static sxmlerr_t parse_start (sxml_t* state, sxml_args_t* args)
388 | {
389 | sxmlerr_t err;
390 | const char* gt, *name, *space;
391 | const char* start= buffer_fromoffset (args, state->bufferpos);
392 | const char* end= buffer_getend (args);
393 | assert (TAG_MINSIZE <= end - start);
394 |
395 | if (!(start[0] == '<' && ISALPHA (start[1])))
396 | return SXML_ERROR_XMLINVALID;
397 |
398 | /* --- */
399 |
400 | name= start + 1;
401 | space= str_find_notalnum (name, end);
402 | if (space == end)
403 | return SXML_ERROR_BUFFERDRY;
404 |
405 | state_pushtoken (state, args, SXML_STARTTAG, name, space);
406 |
407 | state_setpos (state, args, space);
408 | err= parse_attributes (state, args);
409 | if (err != SXML_SUCCESS)
410 | return err;
411 |
412 | /* --- */
413 |
414 | gt= buffer_fromoffset (args, state->bufferpos);
415 |
416 | if (gt != end && *gt == '/')
417 | {
418 | state_pushtoken (state, args, SXML_ENDTAG, name, space);
419 | gt++;
420 | }
421 |
422 | if (gt == end)
423 | return SXML_ERROR_BUFFERDRY;
424 |
425 | if (*gt != '>')
426 | return SXML_ERROR_XMLINVALID;
427 |
428 | return state_setpos (state, args, gt + 1);
429 | }
430 |
431 | static sxmlerr_t parse_end (sxml_t* state, sxml_args_t* args)
432 | {
433 | const char* gt, *space;
434 | const char* start= buffer_fromoffset (args, state->bufferpos);
435 | const char* end= buffer_getend (args);
436 | assert (TAG_MINSIZE <= end - start);
437 |
438 | if (!(str_startswith (start, end, "") && ISALPHA (start[2])))
439 | return SXML_ERROR_XMLINVALID;
440 |
441 | start+= 2;
442 | gt= str_findchr (start, end, '>');
443 | if (gt == end)
444 | return SXML_ERROR_BUFFERDRY;
445 |
446 | /* Test for no characters beyond elem name */
447 | space= str_find_notalnum (start, gt);
448 | if (str_ltrim (space, gt) != gt)
449 | return SXML_ERROR_XMLSTRICT;
450 |
451 | state_pushtoken (state, args, SXML_ENDTAG, start, space);
452 | return state_setpos (state, args, gt + 1);
453 | }
454 |
455 | static sxmlerr_t parse_cdata (sxml_t* state, sxml_args_t* args)
456 | {
457 | static const char STARTTAG[]= "";
459 |
460 | const char* bracket;
461 | const char* start= buffer_fromoffset (args, state->bufferpos);
462 | const char* end= buffer_getend (args);
463 | if (end - start < TAG_LEN (STARTTAG))
464 | return SXML_ERROR_BUFFERDRY;
465 |
466 | if (!str_startswith (start, end, STARTTAG))
467 | return SXML_ERROR_XMLINVALID;
468 |
469 | start+= TAG_LEN (STARTTAG);
470 | bracket= str_findstr (start, end, ENDTAG);
471 | if (bracket == end)
472 | return SXML_ERROR_BUFFERDRY;
473 |
474 | state_pushtoken (state, args, SXML_CDATA, start, bracket);
475 | return state_setpos (state, args, bracket + TAG_LEN (ENDTAG));
476 | }
477 |
478 | /*
479 | MARK: SXML
480 | Public API inspired by the JSON parser JSMN ( http://zserge.com/jsmn.html ).
481 | */
482 |
483 | void sxml_init (sxml_t *state)
484 | {
485 | state->bufferpos= 0;
486 | state->ntokens= 0;
487 | state->taglevel= 0;
488 | }
489 |
490 | #define ROOT_FOUND(state) (0 < (state)->taglevel) /* at least one element is currently open */
491 | #define ROOT_PARSED(state) ((state)->taglevel == 0) /* no element is open (any more) */
492 |
493 | sxmlerr_t sxml_parse(sxml_t *state, const char *buffer, UINT bufferlen, sxmltok_t tokens[], UINT num_tokens)
494 | {
495 | sxml_t temp= *state;
496 | const char* end= buffer + bufferlen;
497 |
498 | sxml_args_t args;
499 | args.buffer= buffer;
500 | args.bufferlen= bufferlen;
501 | args.tokens= tokens;
502 | args.num_tokens= num_tokens;
503 |
504 | /* --- */
505 |
506 | while (!ROOT_FOUND (&temp))
507 | {
508 | sxmlerr_t err;
509 | const char* start= buffer_fromoffset (&args, temp.bufferpos);
510 | const char* lt= str_ltrim (start, end);
511 | state_setpos (&temp, &args, lt);
512 | state_commit (state, &temp);
513 |
514 | if (end - lt < TAG_MINSIZE)
515 | return SXML_ERROR_BUFFERDRY;
516 |
517 | /* --- */
518 |
519 | if (*lt != '<')
520 | return SXML_ERROR_XMLINVALID;
521 |
522 | switch (lt[1])
523 | {
524 | case '?': err= parse_instruction (&temp, &args); break;
525 | case '!': err= parse_doctype (&temp, &args); break;
526 | default: err= parse_start (&temp, &args); break;
527 | }
528 |
529 | if (err != SXML_SUCCESS)
530 | return err;
531 |
532 | state_commit (state, &temp);
533 | }
534 |
535 | /* --- */
536 |
537 | while (!ROOT_PARSED (&temp))
538 | {
539 | sxmlerr_t err;
540 | const char* start= buffer_fromoffset (&args, temp.bufferpos);
541 | const char* lt= str_findchr (start, end, '<');
542 | while (buffer_fromoffset (&args, temp.bufferpos) != lt)
543 | {
544 | sxmlerr_t err= parse_characters (&temp, &args, lt);
545 | if (err != SXML_SUCCESS)
546 | return err;
547 |
548 | state_commit (state, &temp);
549 | }
550 |
551 | /* --- */
552 |
553 | if (end - lt < TAG_MINSIZE)
554 | return SXML_ERROR_BUFFERDRY;
555 |
556 | switch (lt[1])
557 | {
558 | case '?': err= parse_instruction (&temp, &args); break;
559 | case '/': err= parse_end (&temp, &args); break;
560 | case '!': err= (lt[2] == '-') ? parse_comment (&temp, &args) : parse_cdata (&temp, &args); break;
561 | default: err= parse_start (&temp, &args); break;
562 | }
563 |
564 | if (err != SXML_SUCCESS)
565 | return err;
566 |
567 | state_commit (state, &temp);
568 | }
569 |
570 | return SXML_SUCCESS;
571 | }
572 |
--------------------------------------------------------------------------------
/sxml.h:
--------------------------------------------------------------------------------
1 | #ifndef _SXML_H_INCLUDED
2 | #define _SXML_H_INCLUDED
3 |
4 | #ifdef __cplusplus
5 | extern "C" {
6 | #endif
7 |
8 | /*
9 | --- SXML ---
10 | Short description of how to use SXML for parsing XML text.
11 |
12 | SXML is a lightweight XML parser with no external dependencies.
13 | To parse XML text you only need to call one function: sxml_parse().
14 | The function has the following return codes:
15 | */
16 |
17 | typedef enum
18 | {
19 | SXML_ERROR_XMLINVALID= -1, /* Parser found invalid XML data - not much you can do beyond error reporting */
20 | SXML_SUCCESS= 0, /* Parser has completed successfully - parsing of the XML document is complete */
21 | SXML_ERROR_BUFFERDRY= 1, /* Parser ran out of input data - refill the buffer with more XML text and call sxml_parse() again */
22 | SXML_ERROR_TOKENSFULL= 2 /* Parser has filled all the supplied tokens with data - provide more tokens and call sxml_parse() again */
23 | } sxmlerr_t;
24 |
25 | /*
26 | You provide sxml_parse() with a buffer of XML text for parsing.
27 | The parser will handle text data encoded in ascii, latin-1 and utf-8.
28 | It should also work with other encodings that are ascii extensions.
29 |
30 | sxml_parse() is reentrant.
31 | In the case of return code SXML_ERROR_BUFFERDRY or SXML_ERROR_TOKENSFULL, you are expected to call the function again after resolving the problem to continue parsing.
32 | */
33 |
34 | typedef struct sxml_t sxml_t; /* parser object */
35 | typedef struct sxmltok_t sxmltok_t; /* one entry of the output token table */
36 | sxmlerr_t sxml_parse(sxml_t *parser, const char *buffer, unsigned bufferlen, sxmltok_t* tokens, unsigned num_tokens); /* 'bufferlen' is the number of bytes in 'buffer', 'num_tokens' the capacity of 'tokens' */
37 |
38 | /*
39 | The sxml_t object stores all data required for SXML to continue from where it left off.
40 |
41 | After calling sxml_parse() 'ntokens' tells you how many output tokens have been filled with data.
42 | Depending on how you resolve SXML_ERROR_BUFFERDRY or SXML_ERROR_TOKENSFULL you may need to modify 'bufferpos' and 'ntokens' to correctly reflect the new buffer and tokens you provide.
43 | */
44 |
45 | struct sxml_t
46 | {
47 | unsigned bufferpos; /* Current offset into buffer - all XML data before this position has been successfully parsed */
48 | unsigned ntokens; /* Number of tokens filled with valid data by the parser */
49 | unsigned taglevel; /* Used internally - number of XML elements opened but not yet closed, used to detect start and end of the document */
50 | };
51 |
52 | /*
53 | Before you call sxml_parse() for the first time, you have to initialize the parser object.
54 | You may easily do that with the provided function sxml_init().
55 | */
56 |
57 | void sxml_init(sxml_t *parser); /* sets 'bufferpos', 'ntokens' and 'taglevel' to zero */
58 |
59 | /*
60 | Unlike most XML parsers, SXML does not use SAX callbacks or allocate a DOM tree.
61 | Instead you will have to interpret the XML structure through a table of tokens.
62 |
63 | A token can describe any of the following types:
64 | */
65 |
66 | typedef enum
67 | {
68 | SXML_STARTTAG, /* Start tag describes the opening of an XML element */
69 | SXML_ENDTAG, /* End tag is the closing of an XML element */
70 |
71 | SXML_CHARACTER, /* Character data may be escaped - check if the first character is an ampersand '&' to identify an XML character reference */
72 | SXML_CDATA, /* Character data should be read as is - it is not escaped (also used for attribute names) */
73 |
74 | /* And some other token types you might be interested in: */
75 | SXML_INSTRUCTION, /* Can be used to identify the text encoding */
76 | SXML_DOCTYPE, /* If you'd like to interpret DTD data */
77 | SXML_COMMENT /* Most likely you don't care about comments - but this is where you'll find them */
78 | } sxmltype_t;
79 |
80 | /*
81 | If you are familiar with the structure of an XML document most of these type names should sound familiar.
82 |
83 | A token has the following data:
84 | */
85 |
86 | struct sxmltok_t
87 | {
88 | unsigned short type; /* A token is one of the above sxmltype_t */
89 | unsigned short size; /* Number of tokens following this one that contain additional data related to it - used for describing attributes */
90 |
91 | /* 'startpos' and 'endpos' together define the range [startpos, endpos) within the provided text buffer - use these offsets with the buffer to extract the text value of the token */
92 | unsigned startpos;
93 | unsigned endpos;
94 | };
95 |
96 | /*
97 | Let's walk through how to correctly interpret a token of type SXML_STARTTAG.
98 |
99 | <example zero='' one='Hello there!' three='Me, Myself &amp; I' />
100 |
101 | The element name ('example') can be extracted from the text buffer using 'startpos' and 'endpos'.
102 |
103 | The attributes of the XML element are described in the following 'size' tokens.
104 | Each attribute is divided by a token of type SXML_CDATA - this is the attribute key.
105 | There will be zero or more tokens of type SXML_CHARACTER following the key - together they describe one attribute value.
106 |
107 | In our example you will get the following number of SXML_CHARACTER tokens after the attribute key:
108 | * 'zero' will use no tokens to describe the empty attribute value.
109 | * 'one' will have one token describing the attribute value ('Hello there!').
110 | * 'three' will have three tokens describing the attribute value ('Me, Myself ')('&amp;')(' I')
111 |
112 | In our example the token of type SXML_STARTTAG will have a 'size' of 7 (3 SXML_CDATA and 4 SXML_CHARACTER).
113 | When processing the tokens do not forget about 'size' - for any token you want to skip, also remember to skip the additional token data!
114 | */
115 |
116 | #ifdef __cplusplus
117 | }
118 | #endif
119 |
120 | /*
121 | Congratulations on making it this far - now might be a good time to check out sxml_test.c for an example of using SXML.
122 | */
123 |
124 | #endif /* _SXML_H_INCLUDED */
125 |
--------------------------------------------------------------------------------
/sxml_test.c:
--------------------------------------------------------------------------------
1 | #include "sxml.h"
2 |
3 | #include <stdio.h>
4 | #include <string.h>
5 | #include <stdlib.h>
6 | #include <assert.h>
7 |
8 | typedef unsigned UINT; /* same type as the 'unsigned' offsets and counters declared in sxml.h */
9 |
10 | /*
11 | MARK: Pretty print XML
12 | Example of simple processing of parsed token output.
13 | */
14 | static void print_indent (UINT indentlevel)
15 | {
16 | if (0 < indentlevel)
17 | {
18 | char fmt[8];
19 | sprintf (fmt, "%%%ds", indentlevel * 3);
20 | printf (fmt, " ");
21 | }
22 | }
23 |
24 | static void print_tokenvalue (const char* buffer, const sxmltok_t* token)
25 | {
26 | char fmt[8];
27 | sprintf (fmt, "%%.%ds", token->endpos - token->startpos);
28 | printf (fmt, buffer + token->startpos);
29 | }
30 |
31 | static UINT print_chartokens (const char* buffer, const sxmltok_t tokens[], UINT num_tokens)
32 | {
33 | UINT i;
34 |
35 | for (i= 0; i < num_tokens; i++)
36 | {
37 | const char* ampr;
38 |
39 | const sxmltok_t* token= tokens + i;
40 | if (token->type != SXML_CHARACTER)
41 | return i;
42 |
43 | ampr= buffer + token->startpos;
44 | assert (0 < token->endpos - token->startpos);
45 |
46 | if (*ampr != '&')
47 | {
48 | print_tokenvalue (buffer, token);
49 | continue;
50 | }
51 |
52 | switch (ampr[1])
53 | {
54 | case 'a': printf ((ampr[2] == 'm') ? "&" : "'"); break;
55 | case 'g': printf (">"); break;
56 | case 'l': printf ("<"); break;
57 | case 'q': printf ("\""); break;
58 | default:
59 | assert (0);
60 | break;
61 | }
62 | }
63 |
64 | return num_tokens;
65 | }
66 |
67 | static void print_prettyxml (const char* buffer, const sxmltok_t tokens[], UINT num_tokens, UINT* indentlevel)
68 | {
69 | UINT i;
70 | for (i= 0; i < num_tokens; i++)
71 | {
72 | const sxmltok_t* token= tokens + i;
73 | switch (token->type)
74 | {
75 | case SXML_STARTTAG:
76 | {
77 | UINT j;
78 |
79 | print_indent ((*indentlevel)++);
80 | printf ("<");
81 | print_tokenvalue (buffer, token);
82 |
83 | /* elem attributes are listed in the following tokens */
84 | for (j= 0; j < token->size; j++)
85 | {
86 | printf (" ");
87 | print_tokenvalue (buffer, &token[j + 1]);
88 | printf ("='");
89 | j+= print_chartokens (buffer, &token[j + 2], token->size - (j + 1));
90 | printf ("'");
91 | }
92 |
93 | puts (">");
94 | break;
95 | }
96 |
97 | case SXML_ENDTAG:
98 | print_indent (--(*indentlevel));
99 | printf ("");
100 | print_tokenvalue (buffer, token);
101 | puts (">");
102 | break;
103 |
104 |
105 | /* Other token types you might be interested in: */
106 | /*
107 | case SXML_INSTRUCTION
108 | case SXML_DOCTYPE:
109 | case SXML_COMMENT:
110 | case SXML_CDATA:
111 | case SXML_CHARACTER:
112 | */
113 |
114 | default:
115 | break;
116 | }
117 |
118 | /*
119 | Tokens may contain additional data. Skip 'size' tokens to get the next token to proccess.
120 | (see SXML_STARTTAG case above as an example of how attributes are specified)
121 | */
122 | i+= token->size;
123 | }
124 | }
125 |
126 | /*
127 | MARK: Utility functions
128 | Useful for error reporting.
129 | */
130 | static UINT count_lines (const char* buffer, UINT bufferlen)
131 | {
132 | const char* end= buffer + bufferlen;
133 | const char* it= buffer;
134 | UINT i;
135 |
136 | for (i= 0; ; i++)
137 | {
138 | it= (const char*) memchr (it, '\n', end - it);
139 | if (it == NULL)
140 | return i;
141 |
142 | it++;
143 | }
144 | }
145 |
146 |
147 | /*
148 | MARK: main
149 | Minimal example showing how you may use SXML within a constrained environment with a fixed size input and output buffer.
150 | */
151 |
152 | #define MIN(a,b) (((a) < (b)) ? (a) : (b))
153 | #define COUNT(arr) (sizeof (arr) / sizeof ((arr)[0])) /* number of elements - only valid for real arrays, not pointers */
154 |
155 | #define BUFFER_MAXLEN 1024 /* size in bytes of the input buffer used by main() */
156 |
157 |
158 | int main (int argc, const char* argv[])
159 | {
160 | /* Input XML text */
161 | char buffer[BUFFER_MAXLEN];
162 | UINT bufferlen= 0;
163 |
164 | /* Output token table */
165 | sxmltok_t tokens[128];
166 |
167 | /* Used in example for pretty printing and error reporting */
168 | UINT indent= 0, lineno= 1;
169 |
170 | const char* path;
171 | FILE* file;
172 |
173 | /* Parser object stores all data required for SXML to be reentrant */
174 | sxml_t parser;
175 | sxml_init (&parser);
176 |
177 | /* Usage: sxml_test.exe test.xml */
178 | assert (argc == 2);
179 | path= argv[1];
180 | file= fopen (path, "rb");
181 | assert (file != NULL);
182 |
183 | for (;;)
184 | {
185 | sxmlerr_t err= sxml_parse (&parser, buffer, bufferlen, tokens, COUNT (tokens));
186 | if (err == SXML_SUCCESS)
187 | break;
188 |
189 | switch (err)
190 | {
191 | case SXML_ERROR_TOKENSFULL:
192 | {
193 | /*
194 | Need to give parser more space for tokens to continue parsing.
195 | We choose here to reuse the existing token table once tokens have been processed.
196 |
197 | Example of some processing of the token data.
198 | Instead you might be interested in creating your own DOM structure
199 | or other processing of XML data useful to your application.
200 | */
201 | print_prettyxml (buffer, tokens, parser.ntokens, &indent);
202 |
203 | /* Parser can now safely reuse all of the token table */
204 | parser.ntokens= 0;
205 | break;
206 | }
207 |
208 | case SXML_ERROR_BUFFERDRY:
209 | {
210 | /*
211 | Parser expects more XML data to continue parsing.
212 | We choose here to reuse the existing buffer array.
213 | */
214 | size_t len;
215 |
216 | /* Need to processs existing tokens before buffer is overwritten with new data */
217 | print_prettyxml (buffer, tokens, parser.ntokens, &indent);
218 | parser.ntokens= 0;
219 |
220 | /* For error reporting */
221 | lineno+= count_lines(buffer, parser.bufferpos);
222 |
223 | /*
224 | Example of how to reuse buffer array.
225 | Move unprocessed buffer content to start of array
226 | */
227 | bufferlen-= parser.bufferpos;
228 | memmove (buffer, buffer + parser.bufferpos, bufferlen);
229 |
230 | /*
231 | If your buffer is smaller than the size required to complete a token the parser will endlessly call SXML_ERROR_BUFFERDRY.
232 | You will most likely encounter this problem if you have XML comments longer than BUFFER_MAXLEN in size.
233 | SXML_CHARACTER solves this problem by dividing the data over multiple tokens, but other token types remain affected.
234 | */
235 | assert (bufferlen < BUFFER_MAXLEN);
236 |
237 | /* Fill remaining buffer with new data from file */
238 | len= fread (buffer + bufferlen, 1, BUFFER_MAXLEN - bufferlen, file);
239 | assert (0 < len);
240 | bufferlen+= len;
241 |
242 | /* Parser will now have to read from beginning of buffer to contiue */
243 | parser.bufferpos= 0;
244 | break;
245 | }
246 |
247 | case SXML_ERROR_XMLINVALID:
248 | {
249 | char fmt[8];
250 |
251 | /* Example of some simple error reporting */
252 | lineno+= count_lines (buffer, parser.bufferpos);
253 | fprintf(stderr, "Error while parsing line %d:\n", lineno);
254 |
255 | /* Print out contents of line containing the error */
256 | sprintf (fmt, "%%.%ds", MIN (bufferlen - parser.bufferpos, 72));
257 | fprintf (stderr, fmt, buffer + parser.bufferpos);
258 |
259 | abort();
260 | break;
261 | }
262 |
263 | default:
264 | assert (0);
265 | break;
266 | }
267 | }
268 |
269 | fclose (file);
270 |
271 | /* Sucessfully parsed XML file - flush remainig token output */
272 | print_prettyxml (buffer, tokens, parser.ntokens, &indent);
273 | return 0;
274 | }
275 |
--------------------------------------------------------------------------------