├── README.md
├── UNLICENSE
├── sxml.c
├── sxml.h
└── sxml_test.c


/README.md:
--------------------------------------------------------------------------------
 1 | SXML
 2 | ====
 3 | A Small XML parser written in C.
 4 | 
 5 | Inspired by the clean API design used by the [JSON parser JSMN](http://zserge.bitbucket.org/jsmn.html), SXML shares the same design goals and features, applied to XML parsing. Go read about JSMN's Philosophy and Features to get an idea of how it differs from other parsers - I'll wait right here.
 6 | 
 7 | Features
 8 | --------
 9 | Here is a list of features SXML shares with JSMN.
10 | 
11 | * compatible with C89
12 | * no dependencies
13 | * highly portable
14 | * about 420 lines of code
15 | * extremely small code footprint
16 | * API contains only 2 functions
17 | * no dynamic memory allocation
18 | * incremental single-pass parsing
19 | 
20 | Usage
21 | -----
22 | The header file is heavily commented and should be the first place to look to get started.
23 | 
24 | Check out the file sxml_test.c for an example of using SXML within a constrained environment with a fixed sized input and output buffer.
25 | 
26 | Limitations
27 | -----------
28 | In order to remain lightweight the parser has the following limitations:
29 | 
30 | * Minimal XML syntax check during parsing
31 | * Input text must be ascii or an [ascii extension](http://en.wikipedia.org/wiki/Extended_ASCII) (latin-1 and utf-8 are examples of ascii extensions)
32 | 
33 | Do contact me with suggestions if the limitations above are preventing you from using the parser.
34 | 
35 | Alternatives
36 | ------------
37 | List of alternative lightweight XML parsers considered before writing my own.
38 | 
39 | * [ezXML](http://ezxml.sourceforge.net/)
40 | * [FastXML](http://codesuppository.blogspot.com/2009/02/fastxml-extremely-lightweight-stream.html)
41 | * [TinyXml](http://www.grinninglizard.com/tinyxml2/)
42 | * [RapidXML](http://rapidxml.sourceforge.net/)
43 | 


--------------------------------------------------------------------------------
/UNLICENSE:
--------------------------------------------------------------------------------
 1 | This is free and unencumbered software released into the public domain.
 2 | 
 3 | Anyone is free to copy, modify, publish, use, compile, sell, or
 4 | distribute this software, either in source code form or as a compiled
 5 | binary, for any purpose, commercial or non-commercial, and by any
 6 | means.
 7 | 
 8 | In jurisdictions that recognize copyright laws, the author or authors
 9 | of this software dedicate any and all copyright interest in the
10 | software to the public domain. We make this dedication for the benefit
11 | of the public at large and to the detriment of our heirs and
12 | successors. We intend this dedication to be an overt act of
13 | relinquishment in perpetuity of all present and future rights to this
14 | software under copyright law.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 | OTHER DEALINGS IN THE SOFTWARE.
23 | 
24 | For more information, please refer to <http://unlicense.org/>
25 | 


--------------------------------------------------------------------------------
/sxml.c:
--------------------------------------------------------------------------------
  1 | #include "sxml.h"
  2 | 
  3 | /* The following functions will need to be replaced if you want no dependency to libc: */
  4 | #include <string.h>	/* memchr, memcmp, strlen, memcpy */
  5 | #include <assert.h>	/* assert */
  6 | 
  7 | typedef unsigned UINT;
  8 | typedef int BOOL;
  9 | #define FALSE	0
 10 | #define TRUE	(!FALSE)
 11 | 
 12 | /*
 13 |  MARK: String
 14 |  String functions work within the memory range specified (excluding end).
 15 |  Returns 'end' if value not found.
 16 | */
 17 | 
 18 | static const char* str_findchr (const char* start, const char* end, int c)
 19 | {
 20 | 	const char* it;
 21 | 
 22 | 	assert (start <= end);
 23 | 	assert (0 <= c && c <= 127);	/* CHAR_MAX - memchr implementation will only work when searching for ascii characters within a utf-8 string */
 24 | 	
 25 | 	it= (const char*) memchr (start, c, end - start);
 26 | 	return (it != NULL) ? it : end;
 27 | }
 28 | 
 29 | static const char* str_findstr (const char* start, const char* end, const char* needle)
 30 | {
 31 | 	size_t needlelen;
 32 | 	int first;
 33 | 	assert  (start <= end);
 34 | 	
 35 | 	needlelen= strlen (needle);
 36 | 	assert (0 < needlelen);
 37 | 	first = (unsigned char) needle[0];
 38 | 
 39 | 	while (start + needlelen <= end)
 40 | 	{
 41 | 		const char* it= (const char*) memchr (start, first, (end - start) - (needlelen - 1));
 42 | 		if (it == NULL)
 43 | 			break;
 44 | 
 45 | 		if (memcmp (it, needle, needlelen) == 0)
 46 | 			return it;
 47 | 
 48 | 		start= it + 1;
 49 | 	}
 50 | 
 51 | 	return end;
 52 | }
 53 | 
 54 | static BOOL str_startswith (const char* start, const char* end, const char* prefix)
 55 | {
 56 | 	long nbytes;
 57 | 	assert (start <= end);
 58 | 	
 59 | 	nbytes= strlen (prefix);
 60 | 	if (end - start < nbytes)
 61 | 		return FALSE;
 62 | 	
 63 | 	return memcmp (prefix, start, nbytes) == 0;
 64 | }
 65 | 
 66 | /* http://www.w3.org/TR/xml11/#sec-common-syn */
 67 | 
 68 | static BOOL WhiteSpace (int c)
 69 | {
 70 | 	switch (c)
 71 | 	{
 72 | 		case ' ':	/* 0x20 */
 73 | 		case '\t':	/* 0x9 */
 74 | 		case '\r':	/* 0xD */
 75 | 		case '\n':	/* 0xA */
 76 | 			return TRUE;
 77 | 	}
 78 | 
 79 | 	return FALSE;
 80 | }
 81 | 
 82 | static BOOL NameStartChar (int c)
 83 | {
 84 | 	/*
 85 | 	 We don't perform utf-8 decoding - just accept all characters with hight bit set
 86 | 	 (0xC0 <= c && c <= 0xD6) || (0xD8 <= c && c <= 0xF6) || (0xF8 <= c && c <= 0x2FF) ||
 87 | 	 (0x370 <= c && c <= 0x37D) || (0x37F <= c && c <= 0x1FFF) || (0x200C <= c && c <= 0x200D) ||
 88 | 	 (0x2070 <= c && c <= 0x218F) || (0x2C00 <= c && c <= 0x2FEF) || (0x3001 <= c && c <= 0xD7FF) ||
 89 | 	 (0xF900 <= c && c <= 0xFDCF) || (0xFDF0 <= c && c <= 0xFFFD) || (0x10000 <= c && c <= 0xEFFFF);
 90 | 	 */
 91 | 	if (0x80 <= c)
 92 | 		return TRUE;
 93 | 
 94 | 	return c == ':' || ('A' <= c && c <= 'Z') || c == '_' || ('a' <= c && c <= 'z');
 95 | }
 96 | 
 97 | static BOOL NameChar (int c)
 98 | {
 99 | 	return NameStartChar (c) ||
100 | 		c == '-' || c == '.' || ('0'  <= c && c <= '9') ||
101 | 		c == 0xB7 || (0x0300 <= c && c <= 0x036F) || (0x203F <= c && c <= 0x2040);
102 | }
103 | 
/* Character classification for buffer bytes - the byte is converted to unsigned char so it is always passed on as a value in the range 0..255 */
#define ISSPACE(c)	WhiteSpace ((unsigned char) (c))
#define ISALPHA(c)	NameStartChar ((unsigned char) (c))
#define ISALNUM(c)	NameChar ((unsigned char) (c))
107 | 
108 | /* Left trim whitespace */
/* Returns the first position within [start, end) that is not white space, or 'end' if there is none */
static const char* str_ltrim (const char* start, const char* end)
{
	assert (start <= end);

	while (start != end && ISSPACE (*start))
		start++;

	return start;
}
119 | 
120 | /* Right trim whitespace */
/* Returns the position just past the last character of [start, end) that is not white space, or 'start' if there is none */
static const char* str_rtrim (const char* start, const char* end)
{
	assert (start <= end);

	while (end != start && ISSPACE (end[-1]))
		end--;

	return end;
}
135 | 
/* Returns the first position within [start, end) that is not a name character, or 'end' if there is none */
static const char* str_find_notalnum (const char* start, const char* end)
{
	assert (start <= end);

	while (start != end && ISALNUM (*start))
		start++;

	return start;
}
146 | 
147 | /* MARK: State */
148 | 
149 | /* Collect arguments in a structure for convenience */
150 | typedef struct
151 | {
152 | 	const char* buffer;	/* XML text handed to sxml_parse() */
153 | 	UINT bufferlen;	/* Number of bytes in 'buffer' */
154 | 	sxmltok_t* tokens;	/* Token table handed to sxml_parse() */
155 | 	UINT num_tokens;	/* Number of entries in 'tokens' */
156 | } sxml_args_t;
157 | 
/*
 Convert between offsets (as stored in tokens and in the parser state) and pointers into the buffer.
 Every expansion is fully parenthesized so the macros behave like a single operand within any expression.
*/
#define buffer_fromoffset(args,i)	((args)->buffer + (i))
#define buffer_tooffset(args,ptr)	((unsigned) ((ptr) - (args)->buffer))
#define buffer_getend(args)	((args)->buffer + (args)->bufferlen)
161 | 
162 | static BOOL state_pushtoken (sxml_t* state, sxml_args_t* args, sxmltype_t type, const char* start, const char* end)
163 | {
164 | 	sxmltok_t* token;
165 | 	UINT i= state->ntokens++;
166 | 	if (args->num_tokens < state->ntokens)
167 | 		return FALSE;
168 | 	
169 | 	token= &args->tokens[i];
170 | 	token->type= type;
171 | 	token->startpos= buffer_tooffset (args, start);
172 | 	token->endpos= buffer_tooffset (args, end);
173 | 	token->size= 0;
174 | 
175 | 	switch (type)
176 | 	{
177 | 		case SXML_STARTTAG:	state->taglevel++;	break;
178 | 
179 | 		case SXML_ENDTAG:
180 | 			assert (0 < state->taglevel);
181 | 			state->taglevel--;
182 | 			break;
183 | 
184 | 		default:
185 | 			break;
186 | 	}
187 | 
188 | 	return TRUE;
189 | }
190 | 
191 | static sxmlerr_t state_setpos (sxml_t* state, const sxml_args_t* args, const char* ptr)
192 | {
193 | 	state->bufferpos= buffer_tooffset (args, ptr);
194 | 	return (state->ntokens <= args->num_tokens) ? SXML_SUCCESS : SXML_ERROR_TOKENSFULL;
195 | }
196 | 
197 | #define state_commit(dest,src) memcpy ((dest), (src), sizeof (sxml_t))	/* Copy a whole sxml_t from 'src' to 'dest' - sxml_parse() uses it to publish its working copy of the state */
198 | 
199 | /*
200 |  MARK: Parse
201 |  
202 |  SXML does minimal validation of the input data.
203 |  SXML_ERROR_XMLSTRICT is returned if some simple XML validation tests fail.
204 |  SXML_ERROR_XMLINVALID is instead returned if the invalid XML data is serious enough to prevent the parser from continuing.
205 |  We currently make no difference between these two - but they are marked differently in case we wish to do so in the future.
206 | */
207 | 
/* Failed strict validation is currently reported the same way as invalid XML */
#define SXML_ERROR_XMLSTRICT	SXML_ERROR_XMLINVALID

/* Longest reference searched for, counted from '&' up to and including ';' - e.g. &#x03A3; */
#define ENTITY_MAXLEN	8
#define MIN(a,b)	(((a) < (b)) ? (a) : (b))
212 | 
213 | static sxmlerr_t parse_characters (sxml_t* state, sxml_args_t* args, const char* end)
214 | {
215 | 	const char* start= buffer_fromoffset (args, state->bufferpos);
216 | 	const char* limit, *colon, *ampr= str_findchr (start, end, '&');
217 | 	assert (end <= buffer_getend (args));
218 | 
219 | 	if (ampr != start)
220 | 		state_pushtoken (state, args, SXML_CHARACTER, start, ampr);
221 | 
222 | 	if (ampr == end)
223 | 		return state_setpos (state, args, ampr);
224 | 
225 | 	/* limit entity to search to ENTITY_MAXLEN */
226 | 	limit= MIN (ampr + ENTITY_MAXLEN, end);
227 | 	colon= str_findchr (ampr, limit, ';');
228 | 	if (colon == limit)
229 | 		return (limit == end) ? SXML_ERROR_BUFFERDRY : SXML_ERROR_XMLINVALID;
230 | 		
231 | 	start= colon + 1;
232 | 	state_pushtoken (state, args, SXML_CHARACTER, ampr, start);
233 | 	return state_setpos (state, args, start);
234 | }
235 | 
236 | static sxmlerr_t parse_attrvalue (sxml_t* state, sxml_args_t* args, const char* end)
237 | {
238 | 	while (buffer_fromoffset (args, state->bufferpos) != end)
239 | 	{
240 | 		sxmlerr_t err= parse_characters (state, args, end);
241 | 		if (err != SXML_SUCCESS)
242 | 			return err;
243 | 	}
244 | 	
245 | 	return SXML_SUCCESS;
246 | }
247 | 
248 | static sxmlerr_t parse_attributes (sxml_t* state, sxml_args_t* args)
249 | {
250 | 	const char* start= buffer_fromoffset (args, state->bufferpos);
251 | 	const char* end= buffer_getend (args);
252 | 	const char* name= str_ltrim (start, end);
253 | 	
254 | 	UINT ntokens= state->ntokens;
255 | 	assert (0 < ntokens);
256 | 
257 | 	while (name != end && ISALPHA (*name))
258 | 	{
259 | 		const char* eq, *space, *quot, *value;
260 | 		sxmlerr_t err;
261 | 
262 | 		/* Attribute name */		
263 | 		eq= str_findchr (name, end, '=');
264 | 		if (eq == end)
265 | 			return SXML_ERROR_BUFFERDRY;
266 | 
267 | 		space= str_rtrim (name, eq);
268 | 		state_pushtoken (state, args, SXML_CDATA, name, space);
269 | 
270 | 		/* Attribute value */
271 | 		quot= str_ltrim (eq + 1, end);
272 | 		if (quot == end)
273 | 			return SXML_ERROR_BUFFERDRY;
274 | 		else if (*quot != '\'' && *quot != '"')
275 | 			return SXML_ERROR_XMLINVALID;
276 | 
277 | 		value= quot + 1;
278 | 		quot= str_findchr (value, end, *quot);
279 | 		if (quot == end)
280 | 			return SXML_ERROR_BUFFERDRY;
281 | 
282 | 		state_setpos (state, args, value);
283 | 		err= parse_attrvalue (state, args, quot);
284 | 		if (err != SXML_SUCCESS)
285 | 			return err;
286 | 
287 | 		/* --- */
288 | 		
289 | 		name= str_ltrim (quot + 1, end);
290 | 	}
291 | 
292 | 	{
293 | 		sxmltok_t* token= args->tokens + (ntokens - 1);
294 | 		token->size= (unsigned short) (state->ntokens - ntokens);
295 | 	}
296 | 	
297 | 	return state_setpos (state, args, name);
298 | }
299 | 
300 | /* --- */
301 | 
/*
 Length of a character array holding a string, excluding the terminating nul.
 The result is converted to int so that comparing it against a pointer difference is a signed comparison.
*/
#define TAG_LEN(str)	((int) (sizeof (str) - 1))

/* Number of bytes sxml_parse() requires before dispatching on a tag - it inspects up to the third byte */
#define TAG_MINSIZE	3
304 | 
305 | static sxmlerr_t parse_comment (sxml_t* state, sxml_args_t* args)
306 | {
307 | 	static const char STARTTAG[]= "<!--";
308 | 	static const char ENDTAG[]= "-->";
309 | 
310 | 	const char* dash;
311 | 	const char* start= buffer_fromoffset (args, state->bufferpos);
312 | 	const char* end= buffer_getend (args);
313 | 	if (end - start < TAG_LEN (STARTTAG))
314 | 		return SXML_ERROR_BUFFERDRY;
315 | 
316 | 	if (!str_startswith (start, end, STARTTAG))
317 | 		return SXML_ERROR_XMLINVALID;
318 | 
319 | 	start+= TAG_LEN (STARTTAG);
320 | 	dash= str_findstr (start, end, ENDTAG);
321 | 	if (dash == end)
322 | 		return SXML_ERROR_BUFFERDRY;
323 | 
324 | 	state_pushtoken (state, args, SXML_COMMENT, start, dash);
325 | 	return state_setpos (state, args, dash + TAG_LEN (ENDTAG));
326 | }
327 | 
328 | static sxmlerr_t parse_instruction (sxml_t* state, sxml_args_t* args)
329 | {
330 | 	static const char STARTTAG[]= "<?";
331 | 	static const char ENDTAG[]= "?>";
332 | 
333 | 	sxmlerr_t err;
334 | 	const char* quest, *space;
335 | 	const char* start= buffer_fromoffset (args, state->bufferpos);
336 | 	const char* end= buffer_getend (args);
337 | 	assert (TAG_MINSIZE <= end - start);
338 | 
339 | 	if (!str_startswith (start, end, STARTTAG))
340 | 		return SXML_ERROR_XMLINVALID;
341 | 
342 | 	start+= TAG_LEN (STARTTAG);
343 | 	space= str_find_notalnum (start, end);
344 | 	if (space == end)
345 | 		return SXML_ERROR_BUFFERDRY;
346 | 
347 | 	state_pushtoken (state, args, SXML_INSTRUCTION, start, space);
348 | 
349 | 	state_setpos (state, args, space);
350 | 	err= parse_attributes (state, args);
351 | 	if (err != SXML_SUCCESS)
352 | 		return err;
353 | 
354 | 	quest= buffer_fromoffset (args, state->bufferpos);
355 | 	if (end - quest < TAG_LEN (ENDTAG))
356 | 		return SXML_ERROR_BUFFERDRY;
357 | 
358 | 	if (!str_startswith (quest, end, ENDTAG))
359 | 		return SXML_ERROR_XMLINVALID;
360 | 
361 | 	return state_setpos (state, args, quest + TAG_LEN (ENDTAG));
362 | }
363 | 
364 | static sxmlerr_t parse_doctype (sxml_t* state, sxml_args_t* args)
365 | {
366 | 	static const char STARTTAG[]= "<!DOCTYPE";
367 | 	static const char ENDTAG[]= "]>";
368 | 
369 | 	const char* bracket;
370 | 	const char* start= buffer_fromoffset (args, state->bufferpos);
371 | 	const char* end= buffer_getend (args);
372 | 	if (end - start < TAG_LEN (STARTTAG))
373 | 		return SXML_ERROR_BUFFERDRY;
374 | 
375 | 	if (!str_startswith (start, end, STARTTAG))
376 | 		return SXML_ERROR_BUFFERDRY;
377 | 
378 | 	start+= TAG_LEN (STARTTAG);
379 | 	bracket= str_findstr (start, end, ENDTAG);
380 | 	if (bracket == end)
381 | 		return SXML_ERROR_BUFFERDRY;
382 | 
383 | 	state_pushtoken (state, args, SXML_DOCTYPE, start, bracket);
384 | 	return state_setpos (state, args, bracket + TAG_LEN (ENDTAG));
385 | }
386 | 
387 | static sxmlerr_t parse_start (sxml_t* state, sxml_args_t* args)
388 | {	
389 | 	sxmlerr_t err;
390 | 	const char* gt, *name, *space;
391 | 	const char* start= buffer_fromoffset (args, state->bufferpos);
392 | 	const char* end= buffer_getend (args);
393 | 	assert (TAG_MINSIZE <= end - start);
394 | 
395 | 	if (!(start[0] == '<' && ISALPHA (start[1])))
396 | 		return SXML_ERROR_XMLINVALID;
397 | 
398 | 	/* --- */
399 | 
400 | 	name= start + 1;
401 | 	space= str_find_notalnum (name, end);
402 | 	if (space == end)
403 | 		return SXML_ERROR_BUFFERDRY;
404 | 
405 | 	state_pushtoken (state, args, SXML_STARTTAG, name, space);
406 | 
407 | 	state_setpos (state, args, space);
408 | 	err= parse_attributes (state, args);
409 | 	if (err != SXML_SUCCESS)
410 | 		return err;
411 | 
412 | 	/* --- */
413 | 
414 | 	gt= buffer_fromoffset (args, state->bufferpos);
415 | 	
416 | 	if (gt != end && *gt == '/')
417 | 	{
418 | 		state_pushtoken (state, args, SXML_ENDTAG, name, space);
419 | 		gt++;
420 | 	}
421 | 
422 | 	if (gt == end)
423 | 		return SXML_ERROR_BUFFERDRY;
424 | 
425 | 	if (*gt != '>')
426 | 		return SXML_ERROR_XMLINVALID;
427 | 
428 | 	return state_setpos (state, args, gt + 1);
429 | }
430 | 
431 | static sxmlerr_t parse_end (sxml_t* state, sxml_args_t* args)
432 | {
433 | 	const char* gt, *space;
434 | 	const char* start= buffer_fromoffset (args, state->bufferpos);
435 | 	const char* end= buffer_getend (args);
436 | 	assert (TAG_MINSIZE <= end - start);
437 | 
438 | 	if (!(str_startswith (start, end, "</") && ISALPHA (start[2])))
439 | 		return SXML_ERROR_XMLINVALID;
440 | 
441 | 	start+= 2;	
442 | 	gt= str_findchr (start, end, '>');
443 | 	if (gt == end)
444 | 		return SXML_ERROR_BUFFERDRY;
445 | 
446 | 	/* Test for no characters beyond elem name */
447 | 	space= str_find_notalnum (start, gt);
448 | 	if (str_ltrim (space, gt) != gt)
449 | 		return SXML_ERROR_XMLSTRICT;
450 | 
451 | 	state_pushtoken (state, args, SXML_ENDTAG, start, space);
452 | 	return state_setpos (state, args, gt + 1);
453 | }
454 | 
455 | static sxmlerr_t parse_cdata (sxml_t* state, sxml_args_t* args)
456 | {
457 | 	static const char STARTTAG[]= "<![CDATA[";
458 | 	static const char ENDTAG[]= "]]>";
459 | 
460 | 	const char* bracket;
461 | 	const char* start= buffer_fromoffset (args, state->bufferpos);
462 | 	const char* end= buffer_getend (args);
463 | 	if (end - start < TAG_LEN (STARTTAG))
464 | 		return SXML_ERROR_BUFFERDRY;
465 | 
466 | 	if (!str_startswith (start, end, STARTTAG))
467 | 		return SXML_ERROR_XMLINVALID;
468 | 
469 | 	start+= TAG_LEN (STARTTAG);
470 | 	bracket= str_findstr (start, end, ENDTAG);
471 | 	if (bracket == end)
472 | 		return SXML_ERROR_BUFFERDRY;
473 | 
474 | 	state_pushtoken (state, args, SXML_CDATA, start, bracket);
475 | 	return state_setpos (state, args, bracket + TAG_LEN (ENDTAG));
476 | }
477 | 
478 | /*
479 |  MARK: SXML
480 |  Public API inspired by the JSON parser JSMN ( http://zserge.com/jsmn.html ).
481 | */
482 | 
483 | void sxml_init (sxml_t *state)
484 | {
485 |     state->bufferpos= 0;
486 |     state->ntokens= 0;
487 | 	state->taglevel= 0;
488 | }
489 | 
/* The root element is open while at least one element remains unclosed */
#define ROOT_FOUND(state)	((state)->taglevel != 0)
#define ROOT_PARSED(state)	(0 == (state)->taglevel)
492 | 
493 | sxmlerr_t sxml_parse(sxml_t *state, const char *buffer, UINT bufferlen, sxmltok_t tokens[], UINT num_tokens)
494 | {
495 | 	sxml_t temp= *state;
496 | 	const char* end= buffer + bufferlen;
497 | 	
498 | 	sxml_args_t args;
499 | 	args.buffer= buffer;
500 | 	args.bufferlen= bufferlen;
501 | 	args.tokens= tokens;
502 | 	args.num_tokens= num_tokens;
503 | 
504 | 	/* --- */
505 | 
506 | 	while (!ROOT_FOUND (&temp))
507 | 	{
508 | 		sxmlerr_t err;
509 | 		const char* start= buffer_fromoffset (&args, temp.bufferpos);
510 | 		const char* lt= str_ltrim (start, end);
511 | 		state_setpos (&temp, &args, lt);
512 | 		state_commit (state, &temp);
513 | 
514 | 		if (end - lt < TAG_MINSIZE)
515 | 			return SXML_ERROR_BUFFERDRY;
516 | 
517 | 		/* --- */
518 | 
519 | 		if (*lt != '<')
520 | 			return SXML_ERROR_XMLINVALID;
521 | 
522 | 		switch (lt[1])
523 | 		{
524 | 		case '?':	err= parse_instruction (&temp, &args);	break;
525 | 		case '!':	err= parse_doctype (&temp, &args);	break;
526 | 		default:	err= parse_start (&temp, &args);	break;
527 | 		}
528 | 
529 | 		if (err != SXML_SUCCESS)
530 | 			return err;
531 | 
532 | 		state_commit (state, &temp);
533 | 	}
534 | 
535 | 	/* --- */
536 | 
537 | 	while (!ROOT_PARSED (&temp))
538 | 	{
539 | 		sxmlerr_t err;
540 | 		const char* start= buffer_fromoffset (&args, temp.bufferpos);
541 | 		const char* lt= str_findchr (start, end, '<');
542 | 		while (buffer_fromoffset (&args, temp.bufferpos) != lt)
543 | 		{
544 | 			sxmlerr_t err= parse_characters (&temp, &args, lt);
545 | 			if (err != SXML_SUCCESS)
546 | 				return err;
547 | 
548 | 			state_commit (state, &temp);
549 | 		}
550 | 
551 | 		/* --- */
552 | 
553 | 		if (end - lt < TAG_MINSIZE)
554 | 			return SXML_ERROR_BUFFERDRY;
555 | 
556 | 		switch (lt[1])
557 | 		{
558 | 		case '?':	err= parse_instruction (&temp, &args);		break;
559 | 		case '/':	err= parse_end (&temp, &args);	break;
560 | 		case '!':	err= (lt[2] == '-') ? parse_comment (&temp, &args) : parse_cdata (&temp, &args);	break;
561 | 		default:	err= parse_start (&temp, &args);	break;
562 | 		}
563 | 
564 | 		if (err != SXML_SUCCESS)
565 | 			return err;
566 | 
567 | 		state_commit (state, &temp);
568 | 	}
569 | 
570 | 	return SXML_SUCCESS;
571 | }
572 | 


--------------------------------------------------------------------------------
/sxml.h:
--------------------------------------------------------------------------------
  1 | #ifndef _SXML_H_INCLUDED
  2 | #define _SXML_H_INCLUDED
  3 | 
  4 | #ifdef __cplusplus
  5 | extern "C" {
  6 | #endif
  7 | 
  8 | /*
  9 |  --- SXML ---
 10 |  Short description of how to use SXML for parsing XML text.
 11 | 
 12 |  SXML is a lightweight XML parser with no external dependencies.
 13 |  To parse XML text you only need to call one function: sxml_parse().
 14 |  The function has the following return codes:
 15 | */
 16 | 
 17 | typedef enum
 18 | {
 19 | 	SXML_ERROR_XMLINVALID= -1,	/* Parser found invalid XML data - not much you can do beyond error reporting */
 20 | 	SXML_SUCCESS= 0,			/* Parser has completed successfully - parsing of XML document is complete */
 21 | 	SXML_ERROR_BUFFERDRY= 1,	/* Parser ran out of input data - refill buffer with more XML text and call sxml_parse() again to continue parsing */
 22 | 	SXML_ERROR_TOKENSFULL= 2	/* Parser has filled all the supplied tokens with data - provide more tokens and call sxml_parse() again for further output */
 23 | } sxmlerr_t;
 24 | 
 25 | /*
 26 |  You provide sxml_parse() with a buffer of XML text for parsing.
 27 |  The parser will handle text data encoded in ascii, latin-1 and utf-8.
 28 |  It should also work with other encodings that are ASCII extensions.
 29 | 
 30 |  sxml_parse() is reentrant.
 31 |  In the case of return code SXML_ERROR_BUFFERDRY or SXML_ERROR_TOKENSFULL, you are expected to call the function again after resolving the problem to continue parsing.
 32 |  */
 33 | 
 34 | typedef	struct sxml_t sxml_t;	/* Parser state - defined below */
 35 | typedef	struct sxmltok_t sxmltok_t;	/* Output token - defined below */
 36 | sxmlerr_t sxml_parse(sxml_t *parser, const char *buffer, unsigned bufferlen, sxmltok_t* tokens, unsigned num_tokens);	/* 'bufferlen' is the number of bytes in 'buffer', 'num_tokens' the number of entries in 'tokens' */
 37 | 
 38 | /*
 39 |  The sxml_t object stores all data required for SXML to continue from where it left off.
 40 | 
 41 |  After calling sxml_parse() 'ntokens' tells you how many output tokens have been filled with data.
 42 |  Depending on how you resolve SXML_ERROR_BUFFERDRY or SXML_ERROR_TOKENSFULL you may need to modify 'bufferpos' and 'ntokens' to correctly reflect the new buffer and tokens you provide.
 43 | */
 44 | 
 45 | struct sxml_t
 46 | {
 47 | 	unsigned bufferpos;	/* Current offset into buffer - all XML data before this position has been successfully parsed */
 48 | 	unsigned ntokens;	/* Number of tokens filled with valid data by the parser - also the index of the next token to be filled */
 49 | 	unsigned taglevel;	/* Used internally - keeps track of number of unclosed XML elements to detect start and end of document */
 50 | };
 51 | 
 52 | /*
 53 |  Before you call sxml_parse() for the first time, you have to initialize the parser object.
 54 |  You may easily do that with the provided function sxml_init().
 55 | */
 56 | 
 57 | void sxml_init(sxml_t *parser);	/* Sets 'bufferpos', 'ntokens' and 'taglevel' to zero */
 58 | 
 59 | /*
 60 |  Unlike most XML parsers, SXML does not use SAX callbacks or allocate a DOM tree.
 61 |  Instead you will have to interpret the XML structure through a table of tokens.
 62 | 
 63 |  A token can describe any of the following types:
 64 | */
 65 | 
 66 | typedef enum
 67 | {
 68 | 	SXML_STARTTAG,	/* Start tag describes the opening of an XML element */
 69 | 	SXML_ENDTAG,	/* End tag is the closing of an XML element */
 70 | 
 71 | 	SXML_CHARACTER,		/* Character data may be escaped - check if the first character is an ampersand '&' to identify an XML character reference */
 72 | 	SXML_CDATA,			/* Character data should be read as is - it is not escaped */
 73 | 
 74 | 	/* And some other token types you might be interested in: */
 75 | 	SXML_INSTRUCTION,	/* Can be used to identify the text encoding */
 76 | 	SXML_DOCTYPE,		/* If you'd like to interpret DTD data */
 77 | 	SXML_COMMENT		/* Most likely you don't care about comments - but this is where you'll find them */
 78 | } sxmltype_t;
 79 | 
 80 | /*
 81 |  If you are familiar with the structure of an XML document most of these type names should sound familiar.
 82 |  
 83 |  A token has the following data:
 84 | */
 85 | 
 86 | struct sxmltok_t
 87 | {
 88 | 	unsigned short type;	/* A token is one of the above sxmltype_t */
 89 | 	unsigned short size;	/* The following number of tokens contain additional data related to this token - used for describing attributes */
 90 | 
 91 | 	/* 'startpos' and 'endpos' together define a range within the provided text buffer - use these offsets with the buffer to extract the text value of the token */
 92 | 	unsigned startpos;	/* Offset of the first character of the token text */
 93 | 	unsigned endpos;	/* Offset just past the last character of the token text */
 94 | };
 95 | 
 96 | /*
 97 |  Let's walk through how to correctly interpret a token of type SXML_STARTTAG.
 98 |  
 99 |  <example zero='' one='Hello there!' three='Me, Myself &amp; I' />
100 | 
101 |  The element name ('example') can be extracted from the text buffer using 'startpos' and 'endpos'.
102 | 
103 |  The attributes of the XML element are described in the following 'size' tokens.
104 |  Each attribute is divided by a token of type SXML_CDATA - this is the attribute key.
105 |  There will be zero or more tokens of type SXML_CHARACTER following the key - together they describe one attribute value.
106 |  
107 |  In our example you will get the following number of SXML_CHARACTER tokens after the attribute key:
108 |  * 'zero' will use no tokens to describe the empty attribute value.
109 |  * 'one' will have one token describing the attribute value ('Hello there!').
110 |  * 'three' will have three tokens describing the attribute value ('Me, Myself ')('&amp;')(' I')
111 | 
112 |  In our example the token of type SXML_STARTTAG will have a 'size' of 7 (3 SXML_CDATA and 4 SXML_CHARACTER).
113 |  When processing the tokens do not forget about 'size' - for any token you want to skip, also remember to skip the additional token data!
114 | */
115 | 
116 | #ifdef __cplusplus
117 | }
118 | #endif
119 | 
120 | /*
121 |  Congratulations on making it this far - now might be a good time to check out sxml_test.c for an example of using SXML.
122 | */
123 | 
124 | #endif /* _SXML_H_INCLUDED */
125 | 


--------------------------------------------------------------------------------
/sxml_test.c:
--------------------------------------------------------------------------------
  1 | #include "sxml.h"
  2 | 
  3 | #include <string.h>
  4 | #include <stdio.h>
  5 | #include <stdlib.h>
  6 | #include <assert.h>
  7 | 
  8 | typedef unsigned UINT;
  9 | 
 10 | /*
 11 |  MARK: Pretty print XML
 12 |  Example of simple processing of parsed token output.
 13 | */
 14 | static void print_indent (UINT indentlevel)
 15 | {
 16 | 	if (0 < indentlevel)
 17 | 	{
 18 | 		char fmt[8];
 19 | 		sprintf (fmt, "%%%ds", indentlevel * 3);
 20 | 		printf (fmt, " ");
 21 | 	}
 22 | }
 23 | 
 24 | static void print_tokenvalue (const char* buffer, const sxmltok_t* token)
 25 | {
 26 | 	char fmt[8];
 27 | 	sprintf (fmt, "%%.%ds", token->endpos - token->startpos);
 28 | 	printf (fmt, buffer + token->startpos);
 29 | }
 30 | 
 31 | static UINT print_chartokens (const char* buffer, const sxmltok_t tokens[], UINT num_tokens)
 32 | {
 33 | 	UINT i;
 34 | 
 35 | 	for (i= 0; i < num_tokens; i++)
 36 | 	{
 37 | 		const char* ampr;
 38 | 
 39 | 		const sxmltok_t* token= tokens + i;
 40 | 		if (token->type != SXML_CHARACTER)
 41 | 			return i;
 42 | 
 43 | 		ampr= buffer + token->startpos;
 44 | 		assert (0 < token->endpos - token->startpos);
 45 | 
 46 | 		if (*ampr != '&')
 47 | 		{
 48 | 			print_tokenvalue (buffer, token);
 49 | 			continue;
 50 | 		}
 51 | 
 52 | 		switch (ampr[1])
 53 | 		{
 54 | 		case 'a':	printf ((ampr[2] == 'm') ? "&" : "'");	break;
 55 | 		case 'g':	printf (">");	break;
 56 | 		case 'l':	printf ("<");	break;
 57 | 		case 'q':	printf ("\"");	break;
 58 | 		default:
 59 | 			assert (0);
 60 | 			break;
 61 | 		}
 62 | 	}
 63 | 
 64 | 	return num_tokens;
 65 | }
 66 | 
 67 | static void print_prettyxml (const char* buffer, const sxmltok_t tokens[], UINT num_tokens, UINT* indentlevel)
 68 | {
 69 | 	UINT i;
 70 | 	for (i= 0; i < num_tokens; i++)
 71 | 	{
 72 | 		const sxmltok_t* token= tokens + i;
 73 | 		switch (token->type)
 74 | 		{
 75 | 			case SXML_STARTTAG:
 76 | 			{
 77 | 				UINT j;
 78 | 
 79 | 				print_indent ((*indentlevel)++);
 80 | 				printf ("<");
 81 | 				print_tokenvalue (buffer, token);
 82 | 
 83 | 				/* elem attributes are listed in the following tokens */
 84 | 				for (j= 0; j < token->size; j++)
 85 | 				{
 86 | 					printf (" ");
 87 | 					print_tokenvalue (buffer, &token[j + 1]);
 88 | 					printf ("='");
 89 | 					j+= print_chartokens (buffer, &token[j + 2], token->size - (j + 1));
 90 | 					printf ("'");
 91 | 				}
 92 | 
 93 | 				puts (">");
 94 | 				break;
 95 | 			}
 96 | 
 97 | 			case SXML_ENDTAG:
 98 | 				print_indent (--(*indentlevel));
 99 | 				printf ("</");
100 | 				print_tokenvalue (buffer, token);
101 | 				puts (">");
102 | 				break;
103 | 
104 | 
105 | 			/* Other token types you might be interested in: */
106 | 			/*
107 | 			case SXML_INSTRUCTION
108 | 			case SXML_DOCTYPE:
109 | 			case SXML_COMMENT:
110 | 			case SXML_CDATA:
111 | 			case SXML_CHARACTER:
112 | 			*/
113 | 
114 | 			default:
115 | 				break;
116 | 		}
117 | 
118 | 		/*
119 | 		 Tokens may contain additional data. Skip 'size' tokens to get the next token to proccess.
120 | 		 (see SXML_STARTTAG case above as an example of how attributes are specified)
121 | 		*/
122 | 		i+= token->size;
123 | 	}
124 | }
125 | 
126 | /*
127 |  MARK: Utility functions
128 |  Useful for error reporting.
129 | */
130 | static UINT count_lines (const char* buffer, UINT bufferlen)
131 | {
132 | 	const char* end= buffer + bufferlen;
133 | 	const char* it= buffer;
134 | 	UINT i;
135 | 
136 | 	for (i= 0; ; i++)
137 | 	{
138 | 		it= (const char*) memchr (it, '\n', end - it);
139 | 		if (it == NULL)
140 | 			return i;
141 | 
142 | 		it++;
143 | 	}
144 | }
145 | 
146 | 
147 | /*
148 |  MARK: main
149 |  Minimal example showing how you may use SXML within a constrained environment with a fixed size input and output buffer.
150 | */
151 | 
/* Smaller of two values - the arguments are evaluated more than once */
#define MIN(a,b)	((a) < (b) ? (a) : (b))
/* Number of elements of an array - only valid for actual arrays, not for pointers */
#define COUNT(arr)	(sizeof (arr) / sizeof (*(arr)))

/* Size in bytes of the input buffer used by main() */
#define BUFFER_MAXLEN	1024
156 | 
157 | 
158 | int main (int argc, const char* argv[])
159 | {
160 | 	/* Input XML text */
161 | 	char buffer[BUFFER_MAXLEN];
162 | 	UINT bufferlen= 0;
163 | 
164 | 	/* Output token table */
165 | 	sxmltok_t tokens[128];
166 | 
167 | 	/* Used in example for pretty printing and error reporting */
168 | 	UINT indent= 0, lineno= 1;
169 | 
170 | 	const char* path;
171 | 	FILE* file;
172 | 
173 | 	/* Parser object stores all data required for SXML to be reentrant */
174 | 	sxml_t parser;
175 | 	sxml_init (&parser);
176 | 
177 | 	/* Usage: sxml_test.exe test.xml */
178 | 	assert (argc == 2);
179 | 	path= argv[1];
180 | 	file= fopen (path, "rb");
181 | 	assert (file != NULL);
182 | 	
183 | 	for (;;)
184 | 	{
185 | 		sxmlerr_t err= sxml_parse (&parser, buffer, bufferlen, tokens, COUNT (tokens));
186 | 		if (err == SXML_SUCCESS)
187 | 			break;
188 | 
189 | 		switch (err)
190 | 		{
191 | 			case SXML_ERROR_TOKENSFULL:
192 | 			{
193 | 				/*
194 | 				 Need to give parser more space for tokens to continue parsing.
195 | 				 We choose here to reuse the existing token table once tokens have been processed.
196 | 
197 | 				 Example of some processing of the token data.
198 | 				 Instead you might be interested in creating your own DOM structure
199 | 				 or other processing of XML data useful to your application.
200 | 				*/
201 | 				print_prettyxml (buffer, tokens, parser.ntokens, &indent);
202 | 
203 | 				/* Parser can now safely reuse all of the token table */
204 | 				parser.ntokens= 0;
205 | 				break;
206 | 			}
207 | 
208 | 			case SXML_ERROR_BUFFERDRY:
209 | 			{
210 | 				/* 
211 | 				 Parser expects more XML data to continue parsing.
212 | 				 We choose here to reuse the existing buffer array.
213 | 				*/
214 | 				size_t len;
215 | 
216 | 				/* Need to processs existing tokens before buffer is overwritten with new data */
217 | 				print_prettyxml (buffer, tokens, parser.ntokens, &indent);
218 | 				parser.ntokens= 0;
219 | 
220 | 				/* For error reporting */
221 | 				lineno+= count_lines(buffer, parser.bufferpos);
222 | 
223 | 				/*
224 | 				 Example of how to reuse buffer array.
225 | 				 Move unprocessed buffer content to start of array
226 | 				*/
227 | 				bufferlen-= parser.bufferpos;
228 | 				memmove (buffer, buffer + parser.bufferpos, bufferlen);
229 | 
230 | 				/* 
231 | 				 If your buffer is smaller than the size required to complete a token the parser will endlessly call SXML_ERROR_BUFFERDRY.
232 | 				 You will most likely encounter this problem if you have XML comments longer than BUFFER_MAXLEN in size.
233 | 				 SXML_CHARACTER solves this problem by dividing the data over multiple tokens, but other token types remain affected.
234 | 				*/
235 | 				assert (bufferlen < BUFFER_MAXLEN);
236 | 
237 | 				/* Fill remaining buffer with new data from file */
238 | 				len= fread (buffer + bufferlen, 1, BUFFER_MAXLEN - bufferlen, file);
239 | 				assert (0 < len);
240 | 				bufferlen+= len;
241 | 
242 | 				/* Parser will now have to read from beginning of buffer to contiue */
243 | 				parser.bufferpos= 0;
244 | 				break;
245 | 			}
246 | 
247 | 			case SXML_ERROR_XMLINVALID:
248 | 			{
249 | 				char fmt[8];
250 | 
251 | 				/* Example of some simple error reporting */
252 | 				lineno+= count_lines (buffer, parser.bufferpos);
253 | 				fprintf(stderr, "Error while parsing line %d:\n", lineno);
254 | 
255 | 				/* Print out contents of line containing the error */
256 | 				sprintf (fmt, "%%.%ds", MIN (bufferlen - parser.bufferpos, 72));
257 | 				fprintf (stderr, fmt, buffer + parser.bufferpos);
258 | 
259 | 				abort();
260 | 				break;
261 | 			}
262 | 
263 | 		default:
264 | 			assert (0);
265 | 			break;
266 | 		}
267 | 	}
268 | 
269 | 	fclose (file);
270 | 
271 | 	/* Sucessfully parsed XML file - flush remainig token output */
272 | 	print_prettyxml (buffer, tokens, parser.ntokens, &indent);
273 | 	return 0;
274 | }
275 | 


--------------------------------------------------------------------------------