├── .gitignore ├── COPYING ├── Makefile ├── README.md ├── cppmain.c ├── preproc.c ├── preproc.h ├── tokenizer.c └── tokenizer.h /.gitignore: -------------------------------------------------------------------------------- 1 | *.i 2 | *.o 3 | 4 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | tinycpp is licensed under the following standard MIT license: 2 | 3 | ---------------------------------------------------------------------- 4 | Copyright © 2019 rofl0r. 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining 7 | a copy of this software and associated documentation files (the 8 | "Software"), to deal in the Software without restriction, including 9 | without limitation the rights to use, copy, modify, merge, publish, 10 | distribute, sublicense, and/or sell copies of the Software, and to 11 | permit persons to whom the Software is furnished to do so, subject to 12 | the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be 15 | included in all copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 19 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 20 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 21 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 22 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 23 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
24 | ---------------------------------------------------------------------- 25 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | #Makefile autogenerated by RcB2 2 | prefix = /usr/local 3 | bindir = $(prefix)/bin 4 | 5 | PROG = cppmain 6 | SRCS = cppmain.c \ 7 | tokenizer.c \ 8 | preproc.c 9 | 10 | LIBULZ_BASE?=../cdev/cdev/lib/ 11 | 12 | LIBS = 13 | 14 | CFLAGS_N = 15 | CPPFLAGS_N = -I $(LIBULZ_BASE)/include 16 | LDFLAGS_N = 17 | 18 | OBJS = $(SRCS:.c=.o) 19 | 20 | MAKEFILE := $(firstword $(MAKEFILE_LIST)) 21 | 22 | -include config.mak 23 | 24 | all: $(PROG) 25 | 26 | clean: 27 | rm -f $(PROG) 28 | rm -f $(OBJS) 29 | 30 | rebuild: 31 | $(MAKE) -f $(MAKEFILE) clean && $(MAKE) -f $(MAKEFILE) all 32 | 33 | ddebug: 34 | $(MAKE) -f $(MAKEFILE) clean && $(MAKE) -f $(MAKEFILE) CFLAGS="-O0 -g3 -DDEBUG" all 35 | 36 | debug: 37 | $(MAKE) -f $(MAKEFILE) clean && $(MAKE) -f $(MAKEFILE) CFLAGS="-O0 -g3" all 38 | 39 | install: $(PROG) 40 | install -d $(DESTDIR)/$(bindir) 41 | install -D -m 755 $(PROG) $(DESTDIR)/$(bindir)/ 42 | 43 | src: $(SRCS) 44 | $(CC) $(CPPFLAGS_N) $(CPPFLAGS) $(CFLAGS_N) $(CFLAGS) -o $(PROG) $^ $(LDFLAGS_N) $(LDFLAGS) $(LIBS) 45 | 46 | %.o: %.c 47 | $(CC) $(CPPFLAGS_N) $(CPPFLAGS) $(CFLAGS_N) $(CFLAGS) -c -o $@ $< 48 | 49 | $(PROG): $(OBJS) 50 | $(CC) $(CFLAGS_N) $(CFLAGS) $(LDFLAGS_N) $(LDFLAGS) $(OBJS) $(LIBS) -o $@ 51 | 52 | .PHONY: all clean rebuild install src 53 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | tinycpp - a small, embeddable C-style preprocessor 2 | ================================================== 3 | 4 | tinycpp was created with the intention of having a C-style preprocessor 5 | for use in an assembler i'm working on. 
6 | the particular issue i faced with standard C preprocessors is that 7 | multiline-macros are expanded into a single line. this basically 8 | requires to add something like ';' to the assembler language, to support 9 | several expressions in a single line. 10 | 11 | one of the design goals from the start was to read the input token by 12 | token, instead of slurping the entire file into memory. 13 | this, unfortunately, required some trickery to get the right behaviour 14 | in some cases, but should save a lot of memory on big files 15 | (theoretically, it should be able to process gigabyte-big files, while 16 | only consuming a few MBs ram (depending on the amount of macros that 17 | need to be stored)). 18 | 19 | apart from that, tinycpp pretty much behaves like your standard cpp. 20 | 21 | it's self-hosting: it can preprocess its own source, and the result 22 | compiles fine, so it's quite complete (tested with musl libc headers). 23 | 24 | size 25 | ---- 26 | the 2 TUs used by the preprocessor library are less than 2 KLOC combined. 27 | additionally about 500 LOC of list and hash header implementations from 28 | libulz are used. this is still a lot less than ucpp's 8 KLOC-ish 29 | implementation. not as tiny as i'd like, but a C preprocessor is a 30 | surprisingly complex beast. 31 | 32 | speed 33 | ----- 34 | speed is slightly slower than GNU cpp, and slightly faster than mcpp on 35 | a 12MB testfile which defines, undefs and uses thousands of macros. 36 | 37 | differences to standard C preprocessors 38 | --------------------------------------- 39 | 40 | - "if" evaluation treats all numeric literals as integers, even if they 41 | have L/U/LL/LLU suffixes. this is probably the biggest blocker from 42 | becoming a fully compliant C preprocessor. 43 | shouldn't be hard to support though. 44 | - widechar literals in conditionals are treated as if they were a single 45 | non-wide character. 
- multiline macros keep newline characters, which doesn't cause any
  issues, apart from making it harder to diff against other CPPs output.
  (`__LINE__` macro behaves as expected, though, in that it shows the same
  line number for all expanded lines).
- no predefined macros such as `__STDC__`. you can set them yourself, if
  you like.
- a few test cases of mcpp fail. these are cornercases that are usually
  not encountered in the wild.
  e.g. https://github.com/ned14/mcpp/blob/master/test-c/n_5.c
- lines starting w/ comments like `/**/` followed by preprocessor directives
  are currently not detected as such. this is because comments are removed
  on the fly, not in a previous pass. it shouldn't be very hard to support
  it, though.
- no digraphs and trigraphs supported.
- multiple sequential whitespace characters are preserved.
- max token length is 4095, though this can easily be changed.
  many CPPs happily process much longer tokens, even though the standard
  doesn't require it.
- some built-ins like `__TIME__` and `__DATE__` are missing, but you can
  define them yourself if needed. `__LINE__` and `__FILE__` were added,
  as they're used by musl's headers.
- the printed diagnostics are sometimes not very helpful.

anything else not mentioned here is supported (including varargs, pasting,
stringification, ...)

differences to other C preprocessor libraries
---------------------------------------------

the preprocessor interface takes a `FILE*` as input and one as output.
it doesn't try to provide a C token stream.
77 | in order not to write to disk, you can use memory streams 78 | (open_memstream() to create a writable stream, followed by fflush() to 79 | make its contents available) 80 | 81 | how to build 82 | ------------ 83 | clone the libulz library https://github.com/rofl0r/libulz, and point the 84 | Makefile to the directory, or copy the 3 headers needed into the source 85 | tree, then run `make`. 86 | 87 | how to use 88 | ---------- 89 | look at `preproc.h` and `cppmain.c`, which implements the demo preprocessor 90 | program. 91 | 92 | acknowledgements 93 | ---------------- 94 | thanks go to mcpp's author, whose testsuite i extensively used. 95 | 96 | -------------------------------------------------------------------------------- /cppmain.c: -------------------------------------------------------------------------------- 1 | #include "preproc.h" 2 | #include 3 | #include 4 | 5 | static int usage(char *a0) { 6 | fprintf(stderr, 7 | "example preprocessor\n" 8 | "usage: %s [-I includedir...] [-D define] file\n" 9 | "if no filename or '-' is passed, stdin is used.\n" 10 | , a0); 11 | return 1; 12 | } 13 | 14 | int main(int argc, char** argv) { 15 | int c; char* tmp; 16 | struct cpp* cpp = cpp_new(); 17 | while ((c = getopt(argc, argv, "D:I:")) != EOF) switch(c) { 18 | case 'I': cpp_add_includedir(cpp, optarg); break; 19 | case 'D': 20 | if((tmp = strchr(optarg, '='))) *tmp = ' '; 21 | cpp_add_define(cpp, optarg); 22 | break; 23 | default: return usage(argv[0]); 24 | } 25 | char *fn = "stdin"; 26 | FILE *in = stdin; 27 | if(argv[optind] && strcmp(argv[optind], "-")) { 28 | fn = argv[optind]; 29 | in = fopen(fn, "r"); 30 | if(!in) { 31 | perror("fopen"); 32 | return 1; 33 | } 34 | } 35 | int ret = cpp_run(cpp, in, stdout, fn); 36 | cpp_free(cpp); 37 | if(in != stdin) fclose(in); 38 | return !ret; 39 | } 40 | 41 | -------------------------------------------------------------------------------- /preproc.c: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "preproc.h" 5 | #include "tokenizer.h" 6 | #include "tglist.h" 7 | #include "hbmap.h" 8 | 9 | #define MACRO_FLAG_OBJECTLIKE (1U<<31) 10 | #define MACRO_FLAG_VARIADIC (1U<<30) 11 | #define MACRO_ARGCOUNT_MASK (~(0|MACRO_FLAG_OBJECTLIKE|MACRO_FLAG_VARIADIC)) 12 | 13 | #define OBJECTLIKE(M) (M->num_args & MACRO_FLAG_OBJECTLIKE) 14 | #define FUNCTIONLIKE(M) (!(OBJECTLIKE(M))) 15 | #define MACRO_ARGCOUNT(M) (M->num_args & MACRO_ARGCOUNT_MASK) 16 | #define MACRO_VARIADIC(M) (M->num_args & MACRO_FLAG_VARIADIC) 17 | 18 | #define MAX_RECURSION 32 19 | 20 | static unsigned string_hash(const char* s) { 21 | uint_fast32_t h = 0; 22 | while (*s) { 23 | h = 16*h + *s++; 24 | h ^= h>>24 & 0xf0; 25 | } 26 | return h & 0xfffffff; 27 | } 28 | 29 | struct macro { 30 | unsigned num_args; 31 | FILE* str_contents; 32 | char *str_contents_buf; 33 | tglist(char*) argnames; 34 | }; 35 | 36 | struct cpp { 37 | tglist(char*) includedirs; 38 | hbmap(char*, struct macro, 128) *macros; 39 | const char *last_file; 40 | int last_line; 41 | struct tokenizer *tchain[MAX_RECURSION]; 42 | }; 43 | 44 | static int token_needs_string(struct token *tok) { 45 | switch(tok->type) { 46 | case TT_IDENTIFIER: 47 | case TT_WIDECHAR_LIT: 48 | case TT_WIDESTRING_LIT: 49 | case TT_SQSTRING_LIT: 50 | case TT_DQSTRING_LIT: 51 | case TT_ELLIPSIS: 52 | case TT_HEX_INT_LIT: 53 | case TT_OCT_INT_LIT: 54 | case TT_DEC_INT_LIT: 55 | case TT_FLOAT_LIT: 56 | case TT_UNKNOWN: 57 | return 1; 58 | default: 59 | return 0; 60 | } 61 | } 62 | 63 | static void tokenizer_from_file(struct tokenizer *t, FILE* f) { 64 | tokenizer_init(t, f, TF_PARSE_STRINGS); 65 | tokenizer_set_filename(t, ""); 66 | tokenizer_rewind(t); 67 | } 68 | 69 | static int strptrcmp(const void *a, const void *b) { 70 | const char * const *x = a; 71 | const char * const *y = b; 72 | return strcmp(*x, *y); 73 | } 74 | 75 | static 
struct macro* get_macro(struct cpp *cpp, const char *name) { 76 | return hbmap_get(cpp->macros, name); 77 | } 78 | 79 | static void add_macro(struct cpp *cpp, const char *name, struct macro*m) { 80 | hbmap_insert(cpp->macros, name, *m); 81 | } 82 | 83 | static int undef_macro(struct cpp *cpp, const char *name) { 84 | hbmap_iter k = hbmap_find(cpp->macros, name); 85 | if(k == (hbmap_iter) -1) return 0; 86 | struct macro *m = &hbmap_getval(cpp->macros, k); 87 | free(hbmap_getkey(cpp->macros, k)); 88 | if(m->str_contents) fclose(m->str_contents); 89 | free(m->str_contents_buf); 90 | tglist_free_values(&m->argnames); 91 | tglist_free_items(&m->argnames); 92 | hbmap_delete(cpp->macros, k); 93 | return 1; 94 | } 95 | 96 | static void free_macros(struct cpp *cpp) { 97 | hbmap_iter i; 98 | hbmap_foreach(cpp->macros, i) { 99 | while(hbmap_iter_index_valid(cpp->macros, i)) 100 | undef_macro(cpp, hbmap_getkey(cpp->macros, i)); 101 | } 102 | hbmap_fini(cpp->macros, 1); 103 | free(cpp->macros); 104 | } 105 | 106 | static void error_or_warning(const char *err, const char* type, struct tokenizer *t, struct token *curr) { 107 | unsigned column = curr ? curr->column : t->column; 108 | unsigned line = curr ? 
curr->line : t->line; 109 | dprintf(2, "<%s> %u:%u %s: '%s'\n", t->filename, line, column, type, err); 110 | dprintf(2, "%s\n", t->buf); 111 | for(int i = 0; i < strlen(t->buf); i++) 112 | dprintf(2, "^"); 113 | dprintf(2, "\n"); 114 | } 115 | static void error(const char *err, struct tokenizer *t, struct token *curr) { 116 | error_or_warning(err, "error", t, curr); 117 | } 118 | static void warning(const char *err, struct tokenizer *t, struct token *curr) { 119 | error_or_warning(err, "warning", t, curr); 120 | } 121 | 122 | static void emit(FILE *out, const char *s) { 123 | fprintf(out, "%s", s); 124 | } 125 | 126 | static int x_tokenizer_next_of(struct tokenizer *t, struct token *tok, int fail_unk) { 127 | int ret = tokenizer_next(t, tok); 128 | if(tok->type == TT_OVERFLOW) { 129 | error("max token length of 4095 exceeded!", t, tok); 130 | return 0; 131 | } else if (fail_unk && ret == 0) { 132 | error("tokenizer encountered unknown token", t, tok); 133 | return 0; 134 | } 135 | return 1; 136 | } 137 | 138 | #define tokenizer_next(T, TOK) x_tokenizer_next_of(T, TOK, 0) 139 | #define x_tokenizer_next(T, TOK) x_tokenizer_next_of(T, TOK, 1) 140 | 141 | static int is_whitespace_token(struct token *token) 142 | { 143 | return token->type == TT_SEP && 144 | (token->value == ' ' || token->value == '\t'); 145 | } 146 | 147 | /* return index of matching item in values array, or -1 on error */ 148 | static int expect(struct tokenizer *t, enum tokentype tt, const char* values[], struct token *token) 149 | { 150 | int ret; 151 | do { 152 | ret = tokenizer_next(t, token); 153 | if(ret == 0 || token->type == TT_EOF) goto err; 154 | } while(is_whitespace_token(token)); 155 | 156 | if(token->type != tt) { 157 | err: 158 | error("unexpected token", t, token); 159 | return -1; 160 | } 161 | int i = 0; 162 | while(values[i]) { 163 | if(!strcmp(values[i], t->buf)) 164 | return i; 165 | ++i; 166 | } 167 | return -1; 168 | } 169 | 170 | static int is_char(struct token *tok, int ch) { 
171 | return tok->type == TT_SEP && tok->value == ch; 172 | } 173 | 174 | static void flush_whitespace(FILE *out, int *ws_count) { 175 | while(*ws_count > 0) { 176 | emit(out, " "); 177 | --(*ws_count); 178 | } 179 | } 180 | 181 | /* skips until the next non-whitespace token (if the current one is one too)*/ 182 | static int eat_whitespace(struct tokenizer *t, struct token *token, int *count) { 183 | *count = 0; 184 | int ret = 1; 185 | while (is_whitespace_token(token)) { 186 | ++(*count); 187 | ret = x_tokenizer_next(t, token); 188 | if(!ret) break; 189 | } 190 | return ret; 191 | } 192 | /* fetches the next token until it is non-whitespace */ 193 | static int skip_next_and_ws(struct tokenizer *t, struct token *tok) { 194 | int ret = tokenizer_next(t, tok); 195 | if(!ret) return ret; 196 | int ws_count; 197 | ret = eat_whitespace(t, tok, &ws_count); 198 | return ret; 199 | } 200 | 201 | static void emit_token(FILE* out, struct token *tok, const char* strbuf) { 202 | if(tok->type == TT_SEP) { 203 | fprintf(out, "%c", tok->value); 204 | } else if(strbuf && token_needs_string(tok)) { 205 | fprintf(out, "%s", strbuf); 206 | } else { 207 | dprintf(2, "oops, dunno how to handle tt %d (%s)\n", (int) tok->type, strbuf); 208 | } 209 | } 210 | 211 | int parse_file(struct cpp* cpp, FILE *f, const char*, FILE *out); 212 | static int include_file(struct cpp* cpp, struct tokenizer *t, FILE* out) { 213 | static const char* inc_chars[] = { "\"", "<", 0}; 214 | static const char* inc_chars_end[] = { "\"", ">", 0}; 215 | struct token tok; 216 | tokenizer_set_flags(t, 0); // disable string tokenization 217 | 218 | int inc1sep = expect(t, TT_SEP, inc_chars, &tok); 219 | if(inc1sep == -1) { 220 | error("expected one of [\"<]", t, &tok); 221 | return 0; 222 | } 223 | int ret = tokenizer_read_until(t, inc_chars_end[inc1sep], 1); 224 | if(!ret) { 225 | error("error parsing filename", t, &tok); 226 | return 0; 227 | } 228 | // TODO: different path lookup depending on whether " or < 229 | 
size_t i; 230 | FILE *f = 0; 231 | tglist_foreach(&cpp->includedirs, i) { 232 | char buf[512]; 233 | snprintf(buf, sizeof buf, "%s/%s", tglist_get(&cpp->includedirs, i), t->buf); 234 | f = fopen(buf, "r"); 235 | if(f) break; 236 | } 237 | if(!f) { 238 | dprintf(2, "%s: ", t->buf); 239 | perror("fopen"); 240 | return 0; 241 | } 242 | const char *fn = strdup(t->buf); 243 | assert(tokenizer_next(t, &tok) && is_char(&tok, inc_chars_end[inc1sep][0])); 244 | 245 | tokenizer_set_flags(t, TF_PARSE_STRINGS); 246 | return parse_file(cpp, f, fn, out); 247 | } 248 | 249 | static int emit_error_or_warning(struct tokenizer *t, int is_error) { 250 | int ws_count; 251 | int ret = tokenizer_skip_chars(t, " \t", &ws_count); 252 | if(!ret) return ret; 253 | struct token tmp = {.column = t->column, .line = t->line}; 254 | ret = tokenizer_read_until(t, "\n", 1); 255 | if(is_error) { 256 | error(t->buf, t, &tmp); 257 | return 0; 258 | } 259 | warning(t->buf, t, &tmp); 260 | return 1; 261 | } 262 | 263 | static FILE *freopen_r(FILE *f, char **buf, size_t *size) { 264 | fflush(f); 265 | fclose(f); 266 | return fmemopen(*buf, *size, "r"); 267 | } 268 | 269 | static int consume_nl_and_ws(struct tokenizer *t, struct token *tok, int expected) { 270 | if(!x_tokenizer_next(t, tok)) { 271 | err: 272 | error("unexpected", t, tok); 273 | return 0; 274 | } 275 | if(expected) { 276 | if(tok->type != TT_SEP || tok->value != expected) goto err; 277 | switch(expected) { 278 | case '\\' : expected = '\n'; break; 279 | case '\n' : expected = 0; break; 280 | } 281 | } else { 282 | if(is_whitespace_token(tok)) ; 283 | else if(is_char(tok, '\\')) expected = '\n'; 284 | else return 1; 285 | } 286 | return consume_nl_and_ws(t, tok, expected); 287 | } 288 | 289 | static int expand_macro(struct cpp *cpp, struct tokenizer *t, FILE* out, const char* name, unsigned rec_level, char *visited[]); 290 | 291 | static int parse_macro(struct cpp *cpp, struct tokenizer *t) { 292 | int ws_count; 293 | int ret = 
tokenizer_skip_chars(t, " \t", &ws_count); 294 | if(!ret) return ret; 295 | struct token curr; //tmp = {.column = t->column, .line = t->line}; 296 | ret = tokenizer_next(t, &curr) && curr.type != TT_EOF; 297 | if(!ret) { 298 | error("parsing macro name", t, &curr); 299 | return ret; 300 | } 301 | if(curr.type != TT_IDENTIFIER) { 302 | error("expected identifier", t, &curr); 303 | return 0; 304 | } 305 | const char* macroname = strdup(t->buf); 306 | #ifdef DEBUG 307 | dprintf(2, "parsing macro %s\n", macroname); 308 | #endif 309 | int redefined = 0; 310 | if(get_macro(cpp, macroname)) { 311 | if(!strcmp(macroname, "defined")) { 312 | error("\"defined\" cannot be used as a macro name", t, &curr); 313 | return 0; 314 | } 315 | redefined = 1; 316 | } 317 | 318 | struct macro new = { 0 }; 319 | unsigned macro_flags = MACRO_FLAG_OBJECTLIKE; 320 | tglist_init(&new.argnames); 321 | 322 | ret = x_tokenizer_next(t, &curr) && curr.type != TT_EOF; 323 | if(!ret) return ret; 324 | 325 | if (is_char(&curr, '(')) { 326 | macro_flags = 0; 327 | unsigned expected = 0; 328 | while(1) { 329 | /* process next function argument identifier */ 330 | ret = consume_nl_and_ws(t, &curr, expected); 331 | if(!ret) { 332 | error("unexpected", t, &curr); 333 | return ret; 334 | } 335 | expected = 0; 336 | if(curr.type == TT_SEP) { 337 | switch(curr.value) { 338 | case '\\': 339 | expected = '\n'; 340 | continue; 341 | case ',': 342 | continue; 343 | case ')': 344 | ret = tokenizer_skip_chars(t, " \t", &ws_count); 345 | if(!ret) return ret; 346 | goto break_loop1; 347 | default: 348 | error("unexpected character", t, &curr); 349 | return 0; 350 | } 351 | } else if(!(curr.type == TT_IDENTIFIER || curr.type == TT_ELLIPSIS)) { 352 | error("expected identifier for macro arg", t, &curr); 353 | return 0; 354 | } 355 | { 356 | if(curr.type == TT_ELLIPSIS) { 357 | if(macro_flags & MACRO_FLAG_VARIADIC) { 358 | error("\"...\" isn't the last parameter", t, &curr); 359 | return 0; 360 | } 361 | macro_flags 
|= MACRO_FLAG_VARIADIC; 362 | } 363 | char *tmps = strdup(t->buf); 364 | tglist_add(&new.argnames, tmps); 365 | } 366 | ++new.num_args; 367 | } 368 | break_loop1:; 369 | } else if(is_whitespace_token(&curr)) { 370 | ret = tokenizer_skip_chars(t, " \t", &ws_count); 371 | if(!ret) return ret; 372 | } else if(is_char(&curr, '\n')) { 373 | /* content-less macro */ 374 | goto done; 375 | } 376 | 377 | struct FILE_container { 378 | FILE *f; 379 | char *buf; 380 | size_t len; 381 | } contents; 382 | contents.f = open_memstream(&contents.buf, &contents.len); 383 | 384 | int backslash_seen = 0; 385 | while(1) { 386 | /* ignore unknown tokens in macro body */ 387 | ret = tokenizer_next(t, &curr); 388 | if(!ret) return 0; 389 | if(curr.type == TT_EOF) break; 390 | if (curr.type == TT_SEP) { 391 | if(curr.value == '\\') 392 | backslash_seen = 1; 393 | else { 394 | if(curr.value == '\n' && !backslash_seen) break; 395 | emit_token(contents.f, &curr, t->buf); 396 | backslash_seen = 0; 397 | } 398 | } else { 399 | emit_token(contents.f, &curr, t->buf); 400 | } 401 | } 402 | new.str_contents = freopen_r(contents.f, &contents.buf, &contents.len); 403 | new.str_contents_buf = contents.buf; 404 | done: 405 | if(redefined) { 406 | struct macro *old = get_macro(cpp, macroname); 407 | char *s_old = old->str_contents_buf ? old->str_contents_buf : ""; 408 | char *s_new = new.str_contents_buf ? 
new.str_contents_buf : ""; 409 | if(strcmp(s_old, s_new)) { 410 | char buf[128]; 411 | sprintf(buf, "redefinition of macro %s", macroname); 412 | warning(buf, t, 0); 413 | } 414 | } 415 | new.num_args |= macro_flags; 416 | add_macro(cpp, macroname, &new); 417 | return 1; 418 | } 419 | 420 | static size_t macro_arglist_pos(struct macro *m, const char* iden) { 421 | size_t i; 422 | for(i = 0; i < tglist_getsize(&m->argnames); i++) { 423 | char *item = tglist_get(&m->argnames, i); 424 | if(!strcmp(item, iden)) return i; 425 | } 426 | return (size_t) -1; 427 | } 428 | 429 | 430 | struct macro_info { 431 | const char *name; 432 | unsigned nest; 433 | unsigned first; 434 | unsigned last; 435 | }; 436 | 437 | static int was_visited(const char *name, char*visited[], unsigned rec_level) { 438 | int x; 439 | for(x = rec_level; x >= 0; --x) { 440 | if(!strcmp(visited[x], name)) return 1; 441 | } 442 | return 0; 443 | } 444 | 445 | unsigned get_macro_info(struct cpp* cpp, 446 | struct tokenizer *t, 447 | struct macro_info *mi_list, size_t *mi_cnt, 448 | unsigned nest, unsigned tpos, const char *name, 449 | char* visited[], unsigned rec_level 450 | ) { 451 | int brace_lvl = 0; 452 | while(1) { 453 | struct token tok; 454 | int ret = tokenizer_next(t, &tok); 455 | if(!ret || tok.type == TT_EOF) break; 456 | #ifdef DEBUG 457 | dprintf(2, "(%s) nest %d, brace %u t: %s\n", name, nest, brace_lvl, t->buf); 458 | #endif 459 | struct macro* m = 0; 460 | if(tok.type == TT_IDENTIFIER && (m = get_macro(cpp, t->buf)) && !was_visited(t->buf, visited, rec_level)) { 461 | const char* newname = strdup(t->buf); 462 | if(FUNCTIONLIKE(m)) { 463 | if(tokenizer_peek(t) == '(') { 464 | unsigned tpos_save = tpos; 465 | tpos = get_macro_info(cpp, t, mi_list, mi_cnt, nest+1, tpos+1, newname, visited, rec_level); 466 | mi_list[*mi_cnt] = (struct macro_info) { 467 | .name = newname, 468 | .nest=nest+1, 469 | .first = tpos_save, 470 | .last = tpos + 1}; 471 | ++(*mi_cnt); 472 | } else { 473 | /* suppress 
expansion */ 474 | } 475 | } else { 476 | mi_list[*mi_cnt] = (struct macro_info) { 477 | .name = newname, 478 | .nest=nest+1, 479 | .first = tpos, 480 | .last = tpos + 1}; 481 | ++(*mi_cnt); 482 | } 483 | } else if(is_char(&tok, '(')) { 484 | ++brace_lvl; 485 | } else if(is_char(&tok, ')')) { 486 | --brace_lvl; 487 | if(brace_lvl == 0 && nest != 0) break; 488 | } 489 | ++tpos; 490 | } 491 | return tpos; 492 | } 493 | 494 | struct FILE_container { 495 | FILE *f; 496 | char *buf; 497 | size_t len; 498 | struct tokenizer t; 499 | }; 500 | 501 | static void free_file_container(struct FILE_container *fc) { 502 | fclose(fc->f); 503 | free(fc->buf); 504 | } 505 | 506 | static int mem_tokenizers_join( 507 | struct FILE_container* org, struct FILE_container *inj, 508 | struct FILE_container* result, 509 | int first, off_t lastpos) { 510 | result->f = open_memstream(&result->buf, &result->len); 511 | size_t i; 512 | struct token tok; 513 | int ret; 514 | tokenizer_rewind(&org->t); 515 | for(i=0; it, &tok); 517 | assert(ret && tok.type != TT_EOF); 518 | emit_token(result->f, &tok, org->t.buf); 519 | } 520 | int cnt = 0, last = first; 521 | while(1) { 522 | ret = tokenizer_next(&inj->t, &tok); 523 | if(!ret || tok.type == TT_EOF) break; 524 | emit_token(result->f, &tok, inj->t.buf); 525 | ++cnt; 526 | } 527 | while(tokenizer_ftello(&org->t) < lastpos) { 528 | ret = tokenizer_next(&org->t, &tok); 529 | last++; 530 | } 531 | 532 | int diff = cnt - ((int) last - (int) first); 533 | 534 | while(1) { 535 | ret = tokenizer_next(&org->t, &tok); 536 | if(!ret || tok.type == TT_EOF) break; 537 | emit_token(result->f, &tok, org->t.buf); 538 | } 539 | 540 | result->f = freopen_r(result->f, &result->buf, &result->len); 541 | tokenizer_from_file(&result->t, result->f); 542 | return diff; 543 | } 544 | 545 | static int tchain_parens_follows(struct cpp *cpp, int rec_level) { 546 | int i, c = 0; 547 | for(i=rec_level;i>=0;--i) { 548 | c = tokenizer_peek(cpp->tchain[i]); 549 | if(c == EOF) 
continue; 550 | if(c == '(') return i; 551 | else break; 552 | } 553 | return -1; 554 | } 555 | 556 | static int stringify(struct cpp *ccp, struct tokenizer *t, FILE* output) { 557 | int ret = 1; 558 | struct token tok; 559 | emit(output, "\""); 560 | while(1) { 561 | ret = tokenizer_next(t, &tok); 562 | if(!ret) return ret; 563 | if(tok.type == TT_EOF) break; 564 | if(is_char(&tok, '\n')) continue; 565 | if(is_char(&tok, '\\') && tokenizer_peek(t) == '\n') continue; 566 | if(tok.type == TT_DQSTRING_LIT) { 567 | char *s = t->buf; 568 | char buf[2] = {0}; 569 | while(*s) { 570 | if(*s == '\"') { 571 | emit(output, "\\\""); 572 | } else if (*s == '\\') { 573 | emit(output, "\\\\"); 574 | } else { 575 | buf[0] = *s; 576 | emit(output, buf); 577 | } 578 | ++s; 579 | } 580 | } else 581 | emit_token(output, &tok, t->buf); 582 | } 583 | emit(output, "\""); 584 | return ret; 585 | } 586 | 587 | /* rec_level -1 serves as a magic value to signal we're using 588 | expand_macro from the if-evaluator code, which means activating 589 | the "define" macro */ 590 | static int expand_macro(struct cpp* cpp, struct tokenizer *t, FILE* out, const char* name, unsigned rec_level, char* visited[]) { 591 | int is_define = !strcmp(name, "defined"); 592 | 593 | struct macro *m; 594 | if(is_define && rec_level != -1) 595 | m = NULL; 596 | else m = get_macro(cpp, name); 597 | if(!m) { 598 | emit(out, name); 599 | return 1; 600 | } 601 | if(rec_level == -1) rec_level = 0; 602 | if(rec_level >= MAX_RECURSION) { 603 | error("max recursion level reached", t, 0); 604 | return 0; 605 | } 606 | #ifdef DEBUG 607 | dprintf(2, "lvl %u: expanding macro %s (%s)\n", rec_level, name, m->str_contents_buf); 608 | #endif 609 | 610 | if(rec_level == 0 && strcmp(t->filename, "")) { 611 | cpp->last_file = t->filename; 612 | cpp->last_line = t->line; 613 | } 614 | if(!strcmp(name, "__FILE__")) { 615 | emit(out, "\""); 616 | emit(out, cpp->last_file); 617 | emit(out, "\""); 618 | return 1; 619 | } else 
if(!strcmp(name, "__LINE__")) { 620 | char buf[64]; 621 | sprintf(buf, "%d", cpp->last_line); 622 | emit(out, buf); 623 | return 1; 624 | } 625 | 626 | if(visited[rec_level]) free(visited[rec_level]); 627 | visited[rec_level] = strdup(name); 628 | cpp->tchain[rec_level] = t; 629 | 630 | size_t i; 631 | struct token tok; 632 | unsigned num_args = MACRO_ARGCOUNT(m); 633 | struct FILE_container *argvalues = calloc(MACRO_VARIADIC(m) ? num_args + 1 : num_args, sizeof(struct FILE_container)); 634 | 635 | for(i=0; i < num_args; i++) 636 | argvalues[i].f = open_memstream(&argvalues[i].buf, &argvalues[i].len); 637 | 638 | /* replace named arguments in the contents of the macro call */ 639 | if(FUNCTIONLIKE(m)) { 640 | int ret; 641 | if((ret = tokenizer_peek(t)) != '(') { 642 | /* function-like macro shall not be expanded if not followed by '(' */ 643 | if(ret == EOF && rec_level > 0 && (ret = tchain_parens_follows(cpp, rec_level-1)) != -1) { 644 | // warning("Replacement text involved subsequent text", t, 0); 645 | t = cpp->tchain[ret]; 646 | } else { 647 | emit(out, name); 648 | goto cleanup; 649 | } 650 | } 651 | ret = x_tokenizer_next(t, &tok); 652 | assert(ret && is_char(&tok, '(')); 653 | 654 | unsigned curr_arg = 0, need_arg = 1, parens = 0; 655 | int ws_count; 656 | if(!tokenizer_skip_chars(t, " \t", &ws_count)) return 0; 657 | 658 | int varargs = 0; 659 | if(num_args == 1 && MACRO_VARIADIC(m)) varargs = 1; 660 | while(1) { 661 | int ret = tokenizer_next(t, &tok); 662 | if(!ret) return 0; 663 | if( tok.type == TT_EOF) { 664 | dprintf(2, "warning EOF\n"); 665 | break; 666 | } 667 | if(!parens && is_char(&tok, ',') && !varargs) { 668 | if(need_arg && !ws_count) { 669 | /* empty argument is OK */ 670 | } 671 | need_arg = 1; 672 | if(!varargs) curr_arg++; 673 | if(curr_arg + 1 == num_args && MACRO_VARIADIC(m)) { 674 | varargs = 1; 675 | } else if(curr_arg >= num_args) { 676 | error("too many arguments for function macro", t, &tok); 677 | return 0; 678 | } 679 | ret = 
tokenizer_skip_chars(t, " \t", &ws_count); 680 | if(!ret) return ret; 681 | continue; 682 | } else if(is_char(&tok, '(')) { 683 | ++parens; 684 | } else if(is_char(&tok, ')')) { 685 | if(!parens) { 686 | if(curr_arg + num_args && curr_arg < num_args-1) { 687 | error("too few args for function macro", t, &tok); 688 | return 0; 689 | } 690 | break; 691 | } 692 | --parens; 693 | } else if(is_char(&tok, '\\')) { 694 | if(tokenizer_peek(t) == '\n') continue; 695 | } 696 | need_arg = 0; 697 | emit_token(argvalues[curr_arg].f, &tok, t->buf); 698 | } 699 | } 700 | 701 | for(i=0; i < num_args; i++) { 702 | argvalues[i].f = freopen_r(argvalues[i].f, &argvalues[i].buf, &argvalues[i].len); 703 | tokenizer_from_file(&argvalues[i].t, argvalues[i].f); 704 | #ifdef DEBUG 705 | dprintf(2, "macro argument %i: %s\n", (int) i, argvalues[i].buf); 706 | #endif 707 | } 708 | 709 | if(is_define) { 710 | if(get_macro(cpp, argvalues[0].buf)) 711 | emit(out, "1"); 712 | else 713 | emit(out, "0"); 714 | } 715 | 716 | if(!m->str_contents) goto cleanup; 717 | 718 | struct FILE_container cwae = {0}; /* contents_with_args_expanded */ 719 | cwae.f = open_memstream(&cwae.buf, &cwae.len); 720 | FILE* output = cwae.f; 721 | 722 | struct tokenizer t2; 723 | tokenizer_from_file(&t2, m->str_contents); 724 | int hash_count = 0; 725 | int ws_count = 0; 726 | while(1) { 727 | int ret; 728 | ret = tokenizer_next(&t2, &tok); 729 | if(!ret) return 0; 730 | if(tok.type == TT_EOF) break; 731 | if(tok.type == TT_IDENTIFIER) { 732 | flush_whitespace(output, &ws_count); 733 | char *id = t2.buf; 734 | if(MACRO_VARIADIC(m) && !strcmp(t2.buf, "__VA_ARGS__")) { 735 | id = "..."; 736 | } 737 | size_t arg_nr = macro_arglist_pos(m, id); 738 | if(arg_nr != (size_t) -1) { 739 | tokenizer_rewind(&argvalues[arg_nr].t); 740 | if(hash_count == 1) ret = stringify(cpp, &argvalues[arg_nr].t, output); 741 | else while(1) { 742 | ret = tokenizer_next(&argvalues[arg_nr].t, &tok); 743 | if(!ret) return ret; 744 | if(tok.type == 
TT_EOF) break;
				/* splice the pre-expanded argument tokens in place of the parameter */
				emit_token(output, &tok, argvalues[arg_nr].t.buf);
			}
			hash_count = 0;
		} else {
			if(hash_count == 1) {
hash_err:
				error("'#' is not followed by macro parameter", &t2, &tok);
				return 0;
			}
			emit_token(output, &tok, t2.buf);
		}
	} else if(is_char(&tok, '#')) {
		/* count consecutive '#' chars: 1 = stringify, 2 = token paste */
		if(hash_count) {
			goto hash_err;
		}
		while(1) {
			++hash_count;
			/* in a real cpp we'd need to look for '\\' first */
			while(tokenizer_peek(&t2) == '\n') {
				x_tokenizer_next(&t2, &tok);
			}
			if(tokenizer_peek(&t2) == '#') x_tokenizer_next(&t2, &tok);
			else break;
		}
		if(hash_count == 1) flush_whitespace(output, &ws_count);
		else if(hash_count > 2) {
			error("only two '#' characters allowed for macro expansion", &t2, &tok);
			return 0;
		}
		/* '##' also swallows newlines so the paste operands end up adjacent */
		if(hash_count == 2)
			ret = tokenizer_skip_chars(&t2, " \t\n", &ws_count);
		else
			ret = tokenizer_skip_chars(&t2, " \t", &ws_count);

		if(!ret) return ret;
		ws_count = 0;

	} else if(is_whitespace_token(&tok)) {
		ws_count++;
	} else {
		if(hash_count == 1) goto hash_err;
		flush_whitespace(output, &ws_count);
		emit_token(output, &tok, t2.buf);
	}
	}
	flush_whitespace(output, &ws_count);

	/* we need to expand macros after the macro arguments have been inserted */
	if(1) {
		/* reopen the arg-substituted body as a readable memstream */
		cwae.f = freopen_r(cwae.f, &cwae.buf, &cwae.len);
#ifdef DEBUG
		dprintf(2, "contents with args expanded: %s\n", cwae.buf);
#endif
		tokenizer_from_file(&cwae.t, cwae.f);
		/* first pass: count identifiers that name known macros */
		size_t mac_cnt = 0;
		while(1) {
			int ret = tokenizer_next(&cwae.t, &tok);
			if(!ret) return ret;
			if(tok.type == TT_EOF) break;
			if(tok.type == TT_IDENTIFIER && get_macro(cpp, cwae.t.buf))
				++mac_cnt;
		}

		tokenizer_rewind(&cwae.t);
		struct macro_info *mcs = calloc(mac_cnt, sizeof(struct macro_info));
		{
			/* second pass: record token positions and nesting of each macro use */
			size_t mac_iter = 0;
			get_macro_info(cpp, &cwae.t, mcs, &mac_iter, 0, 0, "null", visited, rec_level);
			/* some of the macros might not expand at this stage (without braces)*/
			while(mac_cnt && mcs[mac_cnt-1].name == 0)
				--mac_cnt;
		}
		size_t i; int depth = 0;
		for(i = 0; i < mac_cnt; ++i) {
			if(mcs[i].nest > depth) depth = mcs[i].nest;
		}
		/* expand innermost (deepest-nested) macro uses first */
		while(depth > -1) {
			for(i = 0; i < mac_cnt; ++i) if(mcs[i].nest == depth) {
				struct macro_info *mi = &mcs[i];
				tokenizer_rewind(&cwae.t);
				size_t j;
				struct token utok;
				/* skip to the token position of this macro use */
				for(j = 0; j < mi->first+1; ++j)
					tokenizer_next(&cwae.t, &utok);
				struct FILE_container t2 = {0}, tmp = {0};
				t2.f = open_memstream(&t2.buf, &t2.len);
				if(!expand_macro(cpp, &cwae.t, t2.f, mi->name, rec_level+1, visited))
					return 0;
				t2.f = freopen_r(t2.f, &t2.buf, &t2.len);
				tokenizer_from_file(&t2.t, t2.f);
				/* manipulating the stream in case more stuff has been consumed */
				off_t cwae_pos = tokenizer_ftello(&cwae.t);
				tokenizer_rewind(&cwae.t);
#ifdef DEBUG
				dprintf(2, "merging %s with %s\n", cwae.buf, t2.buf);
#endif
				/* splice the expansion into the stream; diff = change in token count */
				int diff = mem_tokenizers_join(&cwae, &t2, &tmp, mi->first, cwae_pos);
				free_file_container(&cwae);
				free_file_container(&t2);
				cwae = tmp;
#ifdef DEBUG
				dprintf(2, "result: %s\n", cwae.buf);
#endif
				if(diff == 0) continue;
				/* shift the recorded positions of the other macro uses */
				for(j = 0; j < mac_cnt; ++j) {
					if(j == i) continue;
					struct macro_info *mi2 = &mcs[j];
					/* modified element mi can be either inside, after or before
					   another macro. the after case doesn't affect us. */
					if(mi->first >= mi2->first && mi->last <= mi2->last) {
						/* inside m2 */
						mi2->last += diff;
					} else if (mi->first < mi2->first) {
						/* before m2 */
						mi2->first += diff;
						mi2->last += diff;
					}
				}
			}
			--depth;
		}
		tokenizer_rewind(&cwae.t);
		/* final pass: emit result; a trailing function-like macro name may
		   still expand if its '(' follows in the enclosing token chain */
		while(1) {
			struct macro *ma;
			tokenizer_next(&cwae.t, &tok);
			if(tok.type == TT_EOF) break;
			if(tok.type == TT_IDENTIFIER && tokenizer_peek(&cwae.t) == EOF &&
			   (ma = get_macro(cpp, cwae.t.buf)) && FUNCTIONLIKE(ma) && tchain_parens_follows(cpp, rec_level) != -1
			) {
				int ret = expand_macro(cpp, &cwae.t, out, cwae.t.buf, rec_level+1, visited);
				if(!ret) return ret;
			} else
				emit_token(out, &tok, cwae.t.buf);
		}
		free(mcs);
	}

	free_file_container(&cwae);

cleanup:
	for(i=0; i < num_args; i++) {
		fclose(argvalues[i].f);
		free(argvalues[i].buf);
	}
	free(argvalues);
	return 1;
}

/* token types for the #if/#elif expression evaluator, layered on top of
   the tokenizer's custom-token mechanism. */
#define TT_LAND TT_CUSTOM+0
#define TT_LOR TT_CUSTOM+1
#define TT_LTE TT_CUSTOM+2
#define TT_GTE TT_CUSTOM+3
#define TT_SHL TT_CUSTOM+4
#define TT_SHR TT_CUSTOM+5
#define TT_EQ TT_CUSTOM+6
#define TT_NEQ TT_CUSTOM+7
#define TT_LT TT_CUSTOM+8
#define TT_GT TT_CUSTOM+9
#define TT_BAND TT_CUSTOM+10
#define TT_BOR TT_CUSTOM+11
#define TT_XOR TT_CUSTOM+12
#define TT_NEG TT_CUSTOM+13
#define TT_PLUS TT_CUSTOM+14
#define TT_MINUS TT_CUSTOM+15
#define TT_MUL TT_CUSTOM+16
#define TT_DIV TT_CUSTOM+17
#define TT_MOD TT_CUSTOM+18
#define TT_LPAREN TT_CUSTOM+19
#define TT_RPAREN TT_CUSTOM+20
#define TT_LNOT TT_CUSTOM+21

#define TTINT(X) X-TT_CUSTOM
#define TTENT(X, Y) [TTINT(X)] = Y

/* binding power of an operator token for the Pratt parser below;
   higher value = tighter binding. */
static int bp(int tokentype) {
	static const int bplist[] = {
		TTENT(TT_LOR, 1 << 4),
		TTENT(TT_LAND, 1 << 5),
TTENT(TT_BOR, 1 << 6), 924 | TTENT(TT_XOR, 1 << 7), 925 | TTENT(TT_BAND, 1 << 8), 926 | TTENT(TT_EQ, 1 << 9), 927 | TTENT(TT_NEQ, 1 << 9), 928 | TTENT(TT_LTE, 1 << 10), 929 | TTENT(TT_GTE, 1 << 10), 930 | TTENT(TT_LT, 1 << 10), 931 | TTENT(TT_GT, 1 << 10), 932 | TTENT(TT_SHL, 1 << 11), 933 | TTENT(TT_SHR, 1 << 11), 934 | TTENT(TT_PLUS, 1 << 12), 935 | TTENT(TT_MINUS, 1 << 12), 936 | TTENT(TT_MUL, 1 << 13), 937 | TTENT(TT_DIV, 1 << 13), 938 | TTENT(TT_MOD, 1 << 13), 939 | TTENT(TT_NEG, 1 << 14), 940 | TTENT(TT_LNOT, 1 << 14), 941 | TTENT(TT_LPAREN, 1 << 15), 942 | // TTENT(TT_RPAREN, 1 << 15), 943 | // TTENT(TT_LPAREN, 0), 944 | TTENT(TT_RPAREN, 0), 945 | }; 946 | if(TTINT(tokentype) < sizeof(bplist)/sizeof(bplist[0])) return bplist[TTINT(tokentype)]; 947 | return 0; 948 | } 949 | 950 | static int expr(struct tokenizer *t, int rbp, int *err); 951 | 952 | static int charlit_to_int(const char *lit) { 953 | if(lit[1] == '\\') switch(lit[2]) { 954 | case '0': return 0; 955 | case 'n': return 10; 956 | case 't': return 9; 957 | case 'r': return 13; 958 | case 'x': return strtol(lit+3, NULL, 16); 959 | default: return lit[2]; 960 | } 961 | return lit[1]; 962 | } 963 | 964 | static int nud(struct tokenizer *t, struct token *tok, int *err) { 965 | switch((unsigned) tok->type) { 966 | case TT_IDENTIFIER: return 0; 967 | case TT_WIDECHAR_LIT: 968 | case TT_SQSTRING_LIT: return charlit_to_int(t->buf); 969 | case TT_HEX_INT_LIT: 970 | case TT_OCT_INT_LIT: 971 | case TT_DEC_INT_LIT: 972 | return strtol(t->buf, NULL, 0); 973 | case TT_NEG: return ~ expr(t, bp(tok->type), err); 974 | case TT_PLUS: return expr(t, bp(tok->type), err); 975 | case TT_MINUS: return - expr(t, bp(tok->type), err); 976 | case TT_LNOT: return !expr(t, bp(tok->type), err); 977 | case TT_LPAREN: { 978 | int inner = expr(t, 0, err); 979 | if(0!=expect(t, TT_RPAREN, (const char*[]){")", 0}, tok)) { 980 | error("missing ')'", t, tok); 981 | return 0; 982 | } 983 | return inner; 984 | } 985 | case TT_FLOAT_LIT: 
986 | error("floating constant in preprocessor expression", t, tok); 987 | *err = 1; 988 | return 0; 989 | case TT_RPAREN: 990 | default: 991 | error("unexpected token", t, tok); 992 | *err = 1; 993 | return 0; 994 | } 995 | } 996 | 997 | static int led(struct tokenizer *t, int left, struct token *tok, int *err) { 998 | int right; 999 | switch((unsigned) tok->type) { 1000 | case TT_LAND: 1001 | case TT_LOR: 1002 | right = expr(t, bp(tok->type), err); 1003 | if(tok->type == TT_LAND) return left && right; 1004 | return left || right; 1005 | case TT_LTE: return left <= expr(t, bp(tok->type), err); 1006 | case TT_GTE: return left >= expr(t, bp(tok->type), err); 1007 | case TT_SHL: return left << expr(t, bp(tok->type), err); 1008 | case TT_SHR: return left >> expr(t, bp(tok->type), err); 1009 | case TT_EQ: return left == expr(t, bp(tok->type), err); 1010 | case TT_NEQ: return left != expr(t, bp(tok->type), err); 1011 | case TT_LT: return left < expr(t, bp(tok->type), err); 1012 | case TT_GT: return left > expr(t, bp(tok->type), err); 1013 | case TT_BAND: return left & expr(t, bp(tok->type), err); 1014 | case TT_BOR: return left | expr(t, bp(tok->type), err); 1015 | case TT_XOR: return left ^ expr(t, bp(tok->type), err); 1016 | case TT_PLUS: return left + expr(t, bp(tok->type), err); 1017 | case TT_MINUS:return left - expr(t, bp(tok->type), err); 1018 | case TT_MUL: return left * expr(t, bp(tok->type), err); 1019 | case TT_DIV: 1020 | case TT_MOD: 1021 | right = expr(t, bp(tok->type), err); 1022 | if(right == 0) { 1023 | error("eval: div by zero", t, tok); 1024 | *err = 1; 1025 | } 1026 | else if(tok->type == TT_DIV) return left / right; 1027 | else if(tok->type == TT_MOD) return left % right; 1028 | return 0; 1029 | default: 1030 | error("eval: unexpect token", t, tok); 1031 | *err = 1; 1032 | return 0; 1033 | } 1034 | } 1035 | 1036 | 1037 | static int tokenizer_peek_next_non_ws(struct tokenizer *t, struct token *tok) 1038 | { 1039 | int ret; 1040 | while(1) { 1041 | 
ret = tokenizer_peek_token(t, tok); 1042 | if(is_whitespace_token(tok)) 1043 | x_tokenizer_next(t, tok); 1044 | else break; 1045 | } 1046 | return ret; 1047 | } 1048 | 1049 | static int expr(struct tokenizer *t, int rbp, int*err) { 1050 | struct token tok; 1051 | int ret = skip_next_and_ws(t, &tok); 1052 | if(tok.type == TT_EOF) return 0; 1053 | int left = nud(t, &tok, err); 1054 | while(1) { 1055 | ret = tokenizer_peek_next_non_ws(t, &tok); 1056 | if(bp(tok.type) <= rbp) break; 1057 | ret = tokenizer_next(t, &tok); 1058 | if(tok.type == TT_EOF) break; 1059 | left = led(t, left, &tok, err); 1060 | } 1061 | (void) ret; 1062 | return left; 1063 | } 1064 | 1065 | static int do_eval(struct tokenizer *t, int *result) { 1066 | tokenizer_register_custom_token(t, TT_LAND, "&&"); 1067 | tokenizer_register_custom_token(t, TT_LOR, "||"); 1068 | tokenizer_register_custom_token(t, TT_LTE, "<="); 1069 | tokenizer_register_custom_token(t, TT_GTE, ">="); 1070 | tokenizer_register_custom_token(t, TT_SHL, "<<"); 1071 | tokenizer_register_custom_token(t, TT_SHR, ">>"); 1072 | tokenizer_register_custom_token(t, TT_EQ, "=="); 1073 | tokenizer_register_custom_token(t, TT_NEQ, "!="); 1074 | 1075 | tokenizer_register_custom_token(t, TT_LT, "<"); 1076 | tokenizer_register_custom_token(t, TT_GT, ">"); 1077 | 1078 | tokenizer_register_custom_token(t, TT_BAND, "&"); 1079 | tokenizer_register_custom_token(t, TT_BOR, "|"); 1080 | tokenizer_register_custom_token(t, TT_XOR, "^"); 1081 | tokenizer_register_custom_token(t, TT_NEG, "~"); 1082 | 1083 | tokenizer_register_custom_token(t, TT_PLUS, "+"); 1084 | tokenizer_register_custom_token(t, TT_MINUS, "-"); 1085 | tokenizer_register_custom_token(t, TT_MUL, "*"); 1086 | tokenizer_register_custom_token(t, TT_DIV, "/"); 1087 | tokenizer_register_custom_token(t, TT_MOD, "%"); 1088 | 1089 | tokenizer_register_custom_token(t, TT_LPAREN, "("); 1090 | tokenizer_register_custom_token(t, TT_RPAREN, ")"); 1091 | tokenizer_register_custom_token(t, TT_LNOT, 
"!"); 1092 | 1093 | int err = 0; 1094 | *result = expr(t, 0, &err); 1095 | #ifdef DEBUG 1096 | dprintf(2, "eval result: %d\n", *result); 1097 | #endif 1098 | return !err; 1099 | } 1100 | 1101 | static int evaluate_condition(struct cpp *cpp, struct tokenizer *t, int *result, char *visited[]) { 1102 | int ret, backslash_seen = 0; 1103 | struct token curr; 1104 | char *bufp; 1105 | size_t size; 1106 | int tflags = tokenizer_get_flags(t); 1107 | tokenizer_set_flags(t, tflags | TF_PARSE_WIDE_STRINGS); 1108 | ret = tokenizer_next(t, &curr); 1109 | if(!ret) return ret; 1110 | if(!is_whitespace_token(&curr)) { 1111 | error("expected whitespace after if/elif", t, &curr); 1112 | return 0; 1113 | } 1114 | FILE *f = open_memstream(&bufp, &size); 1115 | while(1) { 1116 | ret = tokenizer_next(t, &curr); 1117 | if(!ret) return ret; 1118 | if(curr.type == TT_IDENTIFIER) { 1119 | if(!expand_macro(cpp, t, f, t->buf, -1, visited)) return 0; 1120 | } else if(curr.type == TT_SEP) { 1121 | if(curr.value == '\\') 1122 | backslash_seen = 1; 1123 | else { 1124 | if(curr.value == '\n') { 1125 | if(!backslash_seen) break; 1126 | } else { 1127 | emit_token(f, &curr, t->buf); 1128 | } 1129 | backslash_seen = 0; 1130 | } 1131 | } else { 1132 | emit_token(f, &curr, t->buf); 1133 | } 1134 | } 1135 | f = freopen_r(f, &bufp, &size); 1136 | if(!f || size == 0) { 1137 | error("#(el)if with no expression", t, &curr); 1138 | return 0; 1139 | } 1140 | #ifdef DEBUG 1141 | dprintf(2, "evaluating condition %s\n", bufp); 1142 | #endif 1143 | struct tokenizer t2; 1144 | tokenizer_from_file(&t2, f); 1145 | ret = do_eval(&t2, result); 1146 | fclose(f); 1147 | free(bufp); 1148 | tokenizer_set_flags(t, tflags); 1149 | return ret; 1150 | } 1151 | 1152 | static void free_visited(char *visited[]) { 1153 | size_t i; 1154 | for(i=0; i< MAX_RECURSION; i++) 1155 | if(visited[i]) free(visited[i]); 1156 | 1157 | } 1158 | 1159 | int parse_file(struct cpp *cpp, FILE *f, const char *fn, FILE *out) { 1160 | struct tokenizer 
t; 1161 | struct token curr; 1162 | tokenizer_init(&t, f, TF_PARSE_STRINGS); 1163 | tokenizer_set_filename(&t, fn); 1164 | tokenizer_register_marker(&t, MT_MULTILINE_COMMENT_START, "/*"); /**/ 1165 | tokenizer_register_marker(&t, MT_MULTILINE_COMMENT_END, "*/"); 1166 | tokenizer_register_marker(&t, MT_SINGLELINE_COMMENT_START, "//"); 1167 | int ret, newline=1, ws_count = 0; 1168 | 1169 | int if_level = 0, if_level_active = 0, if_level_satisfied = 0; 1170 | 1171 | #define all_levels_active() (if_level_active == if_level) 1172 | #define prev_level_active() (if_level_active == if_level-1) 1173 | #define set_level(X, V) do { \ 1174 | if(if_level_active > X) if_level_active = X; \ 1175 | if(if_level_satisfied > X) if_level_satisfied = X; \ 1176 | if(V != -1) { \ 1177 | if(V) if_level_active = X; \ 1178 | else if(if_level_active == X) if_level_active = X-1; \ 1179 | if(V && if_level_active == X) if_level_satisfied = X; \ 1180 | } \ 1181 | if_level = X; \ 1182 | } while(0) 1183 | #define skip_conditional_block (if_level > if_level_active) 1184 | 1185 | static const char* directives[] = {"include", "error", "warning", "define", "undef", "if", "elif", "else", "ifdef", "ifndef", "endif", "line", "pragma", 0}; 1186 | while((ret = tokenizer_next(&t, &curr)) && curr.type != TT_EOF) { 1187 | newline = curr.column == 0; 1188 | if(newline) { 1189 | ret = eat_whitespace(&t, &curr, &ws_count); 1190 | if(!ret) return ret; 1191 | } 1192 | if(curr.type == TT_EOF) break; 1193 | if(skip_conditional_block && !(newline && is_char(&curr, '#'))) continue; 1194 | if(is_char(&curr, '#')) { 1195 | if(!newline) { 1196 | error("stray #", &t, &curr); 1197 | return 0; 1198 | } 1199 | int index = expect(&t, TT_IDENTIFIER, directives, &curr); 1200 | if(index == -1) { 1201 | if(skip_conditional_block) continue; 1202 | error("invalid preprocessing directive", &t, &curr); 1203 | return 0; 1204 | } 1205 | if(skip_conditional_block) switch(index) { 1206 | case 0: case 1: case 2: case 3: case 4: 1207 | 
case 11: case 12: 1208 | continue; 1209 | default: break; 1210 | } 1211 | switch(index) { 1212 | case 0: 1213 | ret = include_file(cpp, &t, out); 1214 | if(!ret) return ret; 1215 | break; 1216 | case 1: 1217 | ret = emit_error_or_warning(&t, 1); 1218 | if(!ret) return ret; 1219 | break; 1220 | case 2: 1221 | ret = emit_error_or_warning(&t, 0); 1222 | if(!ret) return ret; 1223 | break; 1224 | case 3: 1225 | ret = parse_macro(cpp, &t); 1226 | if(!ret) return ret; 1227 | break; 1228 | case 4: 1229 | if(!skip_next_and_ws(&t, &curr)) return 0; 1230 | if(curr.type != TT_IDENTIFIER) { 1231 | error("expected identifier", &t, &curr); 1232 | return 0; 1233 | } 1234 | undef_macro(cpp, t.buf); 1235 | break; 1236 | case 5: // if 1237 | if(all_levels_active()) { 1238 | char* visited[MAX_RECURSION] = {0}; 1239 | if(!evaluate_condition(cpp, &t, &ret, visited)) return 0; 1240 | free_visited(visited); 1241 | set_level(if_level + 1, ret); 1242 | } else { 1243 | set_level(if_level + 1, 0); 1244 | } 1245 | break; 1246 | case 6: // elif 1247 | if(prev_level_active() && if_level_satisfied < if_level) { 1248 | char* visited[MAX_RECURSION] = {0}; 1249 | if(!evaluate_condition(cpp, &t, &ret, visited)) return 0; 1250 | free_visited(visited); 1251 | if(ret) { 1252 | if_level_active = if_level; 1253 | if_level_satisfied = if_level; 1254 | } 1255 | } else if(if_level_active == if_level) { 1256 | --if_level_active; 1257 | } 1258 | break; 1259 | case 7: // else 1260 | if(prev_level_active() && if_level_satisfied < if_level) { 1261 | if(1) { 1262 | if_level_active = if_level; 1263 | if_level_satisfied = if_level; 1264 | } 1265 | } else if(if_level_active == if_level) { 1266 | --if_level_active; 1267 | } 1268 | break; 1269 | case 8: // ifdef 1270 | case 9: // ifndef 1271 | if(!skip_next_and_ws(&t, &curr) || curr.type == TT_EOF) return 0; 1272 | ret = !!get_macro(cpp, t.buf); 1273 | if(index == 9) ret = !ret; 1274 | 1275 | if(all_levels_active()) { 1276 | set_level(if_level + 1, ret); 1277 | } else 
{ 1278 | set_level(if_level + 1, 0); 1279 | } 1280 | break; 1281 | case 10: // endif 1282 | set_level(if_level-1, -1); 1283 | break; 1284 | case 11: // line 1285 | ret = tokenizer_read_until(&t, "\n", 1); 1286 | if(!ret) { 1287 | error("unknown", &t, &curr); 1288 | return 0; 1289 | } 1290 | break; 1291 | case 12: // pragma 1292 | emit(out, "#pragma"); 1293 | while((ret = x_tokenizer_next(&t, &curr)) && curr.type != TT_EOF) { 1294 | emit_token(out, &curr, t.buf); 1295 | if(is_char(&curr, '\n')) break; 1296 | } 1297 | if(!ret) return ret; 1298 | break; 1299 | default: 1300 | break; 1301 | } 1302 | continue; 1303 | } else { 1304 | while(ws_count) { 1305 | emit(out, " "); 1306 | --ws_count; 1307 | } 1308 | } 1309 | #if DEBUG 1310 | dprintf(2, "(stdin:%u,%u) ", curr.line, curr.column); 1311 | if(curr.type == TT_SEP) 1312 | dprintf(2, "separator: %c\n", curr.value == '\n'? ' ' : curr.value); 1313 | else 1314 | dprintf(2, "%s: %s\n", tokentype_to_str(curr.type), t.buf); 1315 | #endif 1316 | if(curr.type == TT_IDENTIFIER) { 1317 | char* visited[MAX_RECURSION] = {0}; 1318 | if(!expand_macro(cpp, &t, out, t.buf, 0, visited)) 1319 | return 0; 1320 | free_visited(visited); 1321 | } else { 1322 | emit_token(out, &curr, t.buf); 1323 | } 1324 | } 1325 | if(if_level) { 1326 | error("unterminated #if", &t, &curr); 1327 | return 0; 1328 | } 1329 | return 1; 1330 | } 1331 | 1332 | struct cpp * cpp_new(void) { 1333 | struct cpp* ret = calloc(1, sizeof(struct cpp)); 1334 | if(!ret) return ret; 1335 | tglist_init(&ret->includedirs); 1336 | cpp_add_includedir(ret, "."); 1337 | ret->macros = hbmap_new(strptrcmp, string_hash, 128); 1338 | struct macro m = {.num_args = 1}; 1339 | add_macro(ret, strdup("defined"), &m); 1340 | m.num_args = MACRO_FLAG_OBJECTLIKE; 1341 | add_macro(ret, strdup("__FILE__"), &m); 1342 | add_macro(ret, strdup("__LINE__"), &m); 1343 | return ret; 1344 | } 1345 | 1346 | void cpp_free(struct cpp*cpp) { 1347 | free_macros(cpp); 1348 | 
tglist_free_values(&cpp->includedirs); 1349 | tglist_free_items(&cpp->includedirs); 1350 | } 1351 | 1352 | void cpp_add_includedir(struct cpp *cpp, const char* includedir) { 1353 | tglist_add(&cpp->includedirs, strdup(includedir)); 1354 | } 1355 | 1356 | int cpp_add_define(struct cpp *cpp, const char *mdecl) { 1357 | struct FILE_container tmp = {0}; 1358 | tmp.f = open_memstream(&tmp.buf, &tmp.len); 1359 | fprintf(tmp.f, "%s\n", mdecl); 1360 | tmp.f = freopen_r(tmp.f, &tmp.buf, &tmp.len); 1361 | tokenizer_from_file(&tmp.t, tmp.f); 1362 | int ret = parse_macro(cpp, &tmp.t); 1363 | free_file_container(&tmp); 1364 | return ret; 1365 | } 1366 | 1367 | int cpp_run(struct cpp *cpp, FILE* in, FILE* out, const char* inname) { 1368 | return parse_file(cpp, in, inname, out); 1369 | } 1370 | -------------------------------------------------------------------------------- /preproc.h: -------------------------------------------------------------------------------- 1 | #ifndef PREPROC_H 2 | #define PREPROC_H 3 | 4 | #include 5 | 6 | struct cpp; 7 | 8 | struct cpp *cpp_new(void); 9 | void cpp_free(struct cpp*); 10 | void cpp_add_includedir(struct cpp *cpp, const char* includedir); 11 | int cpp_add_define(struct cpp *cpp, const char *mdecl); 12 | int cpp_run(struct cpp *cpp, FILE* in, FILE* out, const char* inname); 13 | 14 | #ifdef __GNUC__ 15 | #pragma GCC diagnostic ignored "-Wunknown-pragmas" 16 | #endif 17 | #pragma RcB2 DEP "preproc.c" 18 | 19 | #endif 20 | 21 | -------------------------------------------------------------------------------- /tokenizer.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "tokenizer.h" 8 | 9 | void tokenizer_set_filename(struct tokenizer *t, const char* fn) { 10 | t->filename = fn; 11 | } 12 | 13 | #define ARRAY_SIZE(X) (sizeof(X)/sizeof(X[0])) 14 | 15 | off_t tokenizer_ftello(struct tokenizer *t) { 16 | return 
ftello(t->input)-t->getc_buf.buffered; 17 | } 18 | 19 | static int tokenizer_ungetc(struct tokenizer *t, int c) 20 | { 21 | ++t->getc_buf.buffered; 22 | assert(t->getc_buf.bufferedgetc_buf.buf)); 23 | assert(t->getc_buf.cnt > 0); 24 | --t->getc_buf.cnt; 25 | assert(t->getc_buf.buf[t->getc_buf.cnt % ARRAY_SIZE(t->getc_buf.buf)] == c); 26 | return c; 27 | } 28 | static int tokenizer_getc(struct tokenizer *t) 29 | { 30 | int c; 31 | if(t->getc_buf.buffered) { 32 | t->getc_buf.buffered--; 33 | c = t->getc_buf.buf[(t->getc_buf.cnt) % ARRAY_SIZE(t->getc_buf.buf)]; 34 | } else { 35 | c = getc(t->input); 36 | t->getc_buf.buf[t->getc_buf.cnt % ARRAY_SIZE(t->getc_buf.buf)] = c; 37 | } 38 | ++t->getc_buf.cnt; 39 | return c; 40 | } 41 | 42 | int tokenizer_peek(struct tokenizer *t) { 43 | if(t->peeking) return t->peek_token.value; 44 | int ret = tokenizer_getc(t); 45 | if(ret != EOF) tokenizer_ungetc(t, ret); 46 | return ret; 47 | } 48 | 49 | int tokenizer_peek_token(struct tokenizer *t, struct token *tok) { 50 | int ret = tokenizer_next(t, tok); 51 | t->peek_token = *tok; 52 | t->peeking = 1; 53 | return ret; 54 | } 55 | 56 | void tokenizer_register_custom_token(struct tokenizer*t, int tokentype, const char* str) { 57 | assert(tokentype >= TT_CUSTOM && tokentype < TT_CUSTOM + MAX_CUSTOM_TOKENS); 58 | int pos = tokentype - TT_CUSTOM; 59 | t->custom_tokens[pos] = str; 60 | if(pos+1 > t->custom_count) t->custom_count = pos+1; 61 | } 62 | 63 | const char* tokentype_to_str(enum tokentype tt) { 64 | switch((unsigned) tt) { 65 | case TT_IDENTIFIER: return "iden"; 66 | case TT_WIDECHAR_LIT: return "widechar"; 67 | case TT_WIDESTRING_LIT: return "widestring"; 68 | case TT_SQSTRING_LIT: return "single-quoted string"; 69 | case TT_DQSTRING_LIT: return "double-quoted string"; 70 | case TT_ELLIPSIS: return "ellipsis"; 71 | case TT_HEX_INT_LIT: return "hexint"; 72 | case TT_OCT_INT_LIT: return "octint"; 73 | case TT_DEC_INT_LIT: return "decint"; 74 | case TT_FLOAT_LIT: return "float"; 75 | 
case TT_SEP: return "separator"; 76 | case TT_UNKNOWN: return "unknown"; 77 | case TT_OVERFLOW: return "overflow"; 78 | case TT_EOF: return "eof"; 79 | } 80 | return "????"; 81 | } 82 | 83 | static int has_ul_tail(const char *p) { 84 | char tail[4]; 85 | int tc = 0, c; 86 | while(tc < 4 ) { 87 | if(!*p) break; 88 | c = tolower(*p); 89 | if(c == 'u' || c == 'l') { 90 | tail[tc++] = c; 91 | } else { 92 | return 0; 93 | } 94 | p++; 95 | } 96 | if(tc == 1) return 1; 97 | if(tc == 2) { 98 | if(!memcmp(tail, "lu", 2)) return 1; 99 | if(!memcmp(tail, "ul", 2)) return 1; 100 | if(!memcmp(tail, "ll", 2)) return 1; 101 | } 102 | if(tc == 3) { 103 | if(!memcmp(tail, "llu", 3)) return 1; 104 | if(!memcmp(tail, "ull", 3)) return 1; 105 | } 106 | return 0; 107 | } 108 | 109 | static int is_hex_int_literal(const char *s) { 110 | if(s[0] == '-') s++; 111 | if(s[0] == '0' && (s[1] == 'x' || s[1] == 'X')) { 112 | const char* p = s+2; 113 | while(*p) { 114 | if(!strchr("0123456789abcdef", tolower(*p))) { 115 | if(p == s+2) return 0; 116 | return has_ul_tail(p); 117 | } 118 | p++; 119 | } 120 | return 1; 121 | } 122 | return 0; 123 | } 124 | 125 | static int is_plus_or_minus(int c) { 126 | return c == '-' || c == '+'; 127 | } 128 | 129 | static int is_dec_int_literal(const char *str) { 130 | const char *s = str; 131 | if(is_plus_or_minus(s[0])) s++; 132 | if(s[0] == '0') { 133 | if(s[1] == 0) return 1; 134 | if(isdigit(s[1])) return 0; 135 | } 136 | while(*s) { 137 | if(!isdigit(*s)) { 138 | if(s > str && (is_plus_or_minus(str[0]) ? 
/* NOTE(review): tail end of a numeric-literal checker whose head lies in an
   earlier chunk of the file; left byte-identical. */
	s > str+1 : 1)) return has_ul_tail(s);
			else return 0;
		}
		s++;
	}
	return 1;
}

/* Returns 1 if str is a float literal: optional sign, decimal digits with at
 * most one '.', an optional exponent ('e'/'E' with optional sign and at least
 * one digit), and an optional trailing 'f'/'F'.  A plain digit run without a
 * dot or exponent is rejected (that's an integer literal, not a float). */
static int is_float_literal(const char *str) {
	const char *s = str;
	if(is_plus_or_minus(s[0])) s++;
	int got_dot = 0, got_e = 0, got_digits = 0;
	while(*s) {
		int l = tolower(*s);
		if(*s == '.') {
			if(got_dot) return 0; /* at most one dot */
			got_dot = 1;
		} else if(l == 'f') {
			/* 'f' suffix only valid as the last char of a real float */
			if(s[1] == 0 && (got_dot || got_e) && got_digits) return 1;
			return 0;
		} else if (isdigit(*s)) {
			got_digits = 1;
		} else if(l == 'e') {
			if(!got_digits) return 0; /* exponent needs a mantissa */
			s++;
			if(is_plus_or_minus(*s)) s++;
			if(!isdigit(*s)) return 0; /* exponent needs digits */
			got_e = 1;
		} else return 0;
		s++;
	}
	if(got_digits && (got_e || got_dot)) return 1;
	return 0;
}

/* Checks whether [s, until) consists only of digits and at most one '.'.
 * Returns 0 if invalid, otherwise a bitmask: bit 0 = digits seen,
 * bit 1 = dot seen (so a return of exactly 1 means "digits, no dot yet"). */
static int is_valid_float_until(const char*s, const char* until) {
	int got_digits = 0, got_dot = 0;
	while(s < until) {
		if(isdigit(*s)) got_digits = 1;
		else if(*s == '.') {
			if(got_dot) return 0;
			got_dot = 1;
		} else return 0;
		++s;
	}
	return got_digits | (got_dot << 1);
}

/* Returns 1 if s is an octal int literal: optional '-', then a leading '0'
 * followed only by octal digits (the leading '0' itself also passes the
 * strchr check, so "0" alone is accepted). */
static int is_oct_int_literal(const char *s) {
	if(s[0] == '-') s++;
	if(s[0] != '0') return 0;
	while(*s) {
		if(!strchr("01234567", *s)) return 0;
		s++;
	}
	return 1;
}

/* Returns 1 if s is a C-style identifier.  Lookup table: 1 = letter or
 * underscore (valid anywhere), 2 = digit (valid except as first char).
 * Any byte with the high bit set (non-ASCII) is rejected outright. */
static int is_identifier(const char *s) {
	static const char ascmap[128] = {
		['0'] = 2, ['1'] = 2, ['2'] = 2, ['3'] = 2,
		['4'] = 2, ['5'] = 2, ['6'] = 2, ['7'] = 2,
		['8'] = 2, ['9'] = 2, ['A'] = 1, ['B'] = 1,
		['C'] = 1, ['D'] = 1, ['E'] = 1, ['F'] = 1,
		['G'] = 1, ['H'] = 1, ['I'] = 1, ['J'] = 1,
		['K'] = 1, ['L'] = 1, ['M'] = 1, ['N'] = 1,
		['O'] = 1, ['P'] = 1, ['Q'] = 1, ['R'] = 1,
		['S'] = 1, ['T'] = 1, ['U'] = 1, ['V'] = 1,
		['W'] = 1, ['X'] = 1, ['Y'] = 1, ['Z'] = 1,
		['_'] = 1, ['a'] = 1, ['b'] = 1, ['c'] = 1,
		['d'] = 1, ['e'] = 1, ['f'] = 1, ['g'] = 1,
		['h'] = 1, ['i'] = 1, ['j'] = 1, ['k'] = 1,
		['l'] = 1, ['m'] = 1, ['n'] = 1, ['o'] = 1,
		['p'] = 1, ['q'] = 1, ['r'] = 1, ['s'] = 1,
		['t'] = 1, ['u'] = 1, ['v'] = 1, ['w'] = 1,
		['x'] = 1, ['y'] = 1, ['z'] = 1,
	};
	if((*s) & 128) return 0;
	if(ascmap[(unsigned) *s] != 1) return 0; /* first char: no digits */
	++s;
	while(*s) {
		if((*s) & 128) return 0;
		if(!ascmap[(unsigned) *s])
			return 0;
		s++;
	}
	return 1;
}

/* Classifies a completed token buffer.  Order matters: hex before dec
 * before oct before float, so e.g. "0x10" is not misread, and identifier
 * only if no literal form matched. */
static enum tokentype categorize(const char *s) {
	if(is_hex_int_literal(s)) return TT_HEX_INT_LIT;
	if(is_dec_int_literal(s)) return TT_DEC_INT_LIT;
	if(is_oct_int_literal(s)) return TT_OCT_INT_LIT;
	if(is_float_literal(s)) return TT_FLOAT_LIT;
	if(is_identifier(s)) return TT_IDENTIFIER;
	return TT_UNKNOWN;
}


/* Returns 1 if c is a single-character separator/punctuation terminal
 * (whitespace, operators, brackets, quotes, ...).  Non-ASCII bytes are
 * never separators. */
static int is_sep(int c) {
	static const char ascmap[128] = {
		['\t'] = 1, ['\n'] = 1, [' '] = 1, ['!'] = 1,
		['\"'] = 1, ['#'] = 1, ['%'] = 1, ['&'] = 1,
		['\''] = 1, ['('] = 1, [')'] = 1, ['*'] = 1,
		['+'] = 1, [','] = 1, ['-'] = 1, ['.'] = 1,
		['/'] = 1, [':'] = 1, [';'] = 1, ['<'] = 1,
		['='] = 1, ['>'] = 1, ['?'] = 1, ['['] = 1,
		['\\'] = 1, [']'] = 1, ['{'] = 1, ['|'] = 1,
		['}'] = 1, ['~'] = 1, ['^'] = 1,
	};
	return !(c&128) && ascmap[c];
}

/* Stamps line/column onto out and returns retval.  The token's start column
 * is reconstructed by subtracting the token length (end - t->buf) from the
 * current column.  If the token filled the whole buffer, the result is
 * flagged as TT_OVERFLOW and 0 is returned instead. */
static int apply_coords(struct tokenizer *t, struct token* out, char *end, int retval) {
	out->line = t->line;
	uintptr_t len = end - t->buf;
	out->column = t->column - len;
	if(len + 1 >= t->bufsize) {
		out->type = TT_OVERFLOW;
		return 0;
	}
	return retval;
}

/* Stores c at s, advances the write cursor, and counts one column. */
static inline char *assign_bufchar(struct tokenizer *t, char *s, int c) {
	t->column++;
	*s = c;
	return s + 1;
}

/* Reads a string/char literal body.  Called from tokenizer_next with the
 * opening quote already written to t->buf[0], hence writing starts at
 * t->buf+1.  Handles backslash escapes, backslash-newline line continuation
 * (the continued line is dropped), unescaped newline -> TT_UNKNOWN with the
 * newline pushed back, EOF -> TT_EOF, and buffer exhaustion -> TT_OVERFLOW. */
static int get_string(struct tokenizer *t, char quote_char, struct token* out, int wide) {
	char *s = t->buf+1;
	int escaped = 0;
	char *end = t->buf + t->bufsize - 2; /* room for closing quote + NUL */
	while(s < end) {
		int c = tokenizer_getc(t);
		if(c == EOF) {
			out->type = TT_EOF;
			*s = 0;
			return apply_coords(t, out, s, 0);
		}
		if(c == '\\') {
			/* backslash-newline: splice lines, keep reading */
			c = tokenizer_getc(t);
			if(c == '\n') continue;
			tokenizer_ungetc(t, c);
			c = '\\';
		}
		if(c == '\n') {
			if(escaped) {
				escaped = 0;
				continue;
			}
			/* raw newline inside a literal: malformed token */
			tokenizer_ungetc(t, c);
			out->type = TT_UNKNOWN;
			s = assign_bufchar(t, s, 0);
			return apply_coords(t, out, s, 0);
		}
		if(!escaped) {
			if(c == quote_char) {
				s = assign_bufchar(t, s, c);
				*s = 0;
				//s = assign_bufchar(t, s, 0);
				if(!wide)
					out->type = (quote_char == '"'? TT_DQSTRING_LIT : TT_SQSTRING_LIT);
				else
					out->type = (quote_char == '"'? TT_WIDESTRING_LIT : TT_WIDECHAR_LIT);
				return apply_coords(t, out, s, 1);
			}
			if(c == '\\') escaped = 1;
		} else {
			escaped = 0;
		}
		s = assign_bufchar(t, s, c);
	}
	/* NOTE(review): uses MAX_TOK_LEN rather than t->bufsize; equivalent only
	   because tokenizer_init always sets bufsize = MAX_TOK_LEN. */
	t->buf[MAX_TOK_LEN-1] = 0;
	out->type = TT_OVERFLOW;
	return apply_coords(t, out, s, 0);
}

/* if sequence found, next tokenizer call will point after the sequence */
/* Tests whether c followed by the upcoming input spells `which`.  On a full
 * match the sequence is consumed and 1 is returned; on a partial match every
 * read character is pushed back (in reverse) and 0 is returned. */
static int sequence_follows(struct tokenizer *t, int c, const char *which)
{
	if(!which || !which[0]) return 0;
	size_t i = 0;
	while(c == which[i]) {
		if(!which[++i]) break;
		c = tokenizer_getc(t);
	}
	if(!which[i]) return 1;
	while(i > 0) {
		tokenizer_ungetc(t, c);
		c = which[--i];
	}
	return 0;
}

/* Consumes consecutive characters that appear in `chars`, counting them in
 * *count.  Returns 0 on EOF; returns 1 after pushing back the first
 * non-matching character.  Must not be called with a pending peek token. */
int tokenizer_skip_chars(struct tokenizer *t, const char *chars, int *count) {
	assert(!t->peeking);
	int c;
	*count = 0;
	while(1) {
		c = tokenizer_getc(t);
		if(c == EOF) return 0;
		const char *s = chars;
		int match = 0;
		while(*s) {
			if(c==*s) {
				++(*count);
				match = 1;
				break;
			}
			++s;
		}
		if(!match) {
			tokenizer_ungetc(t, c);
			return 1;
		}
	}

}

/* Copies raw input into t->buf until `marker` is seen (marker excluded and
 * pushed back so the next read starts on it), or until a newline if
 * stop_at_nl.  Returns 1 if the marker was found (or marker is "\n" and a
 * newline stopped us), 0 on EOF or premature newline stop. */
int tokenizer_read_until(struct tokenizer *t, const char* marker, int stop_at_nl)
{
	int c, marker_is_nl = !strcmp(marker, "\n");
	char *s = t->buf;
	while(1) {
		c = tokenizer_getc(t);
		if(c == EOF) {
			*s = 0;
			return 0;
		}
		if(c == '\n') {
			t->line++;
			t->column = 0;
			if(stop_at_nl) {
				*s = 0;
				if(marker_is_nl) return 1;
				return 0;
			}
		}
		if(!sequence_follows(t, c, marker))
			s = assign_bufchar(t, s, c);
		else
			break;
	}
	*s = 0;
	size_t i;
	/* push the marker back so callers can still consume it */
	for(i=strlen(marker); i > 0; )
		tokenizer_ungetc(t, marker[--i]);
	return 1;
}
/* Discards input through the end of `marker`, keeping line/column counters
 * up to date.  col_advance accounts for marker characters already consumed
 * by the caller (e.g. a comment-start sequence).  Returns 0 on EOF. */
static int ignore_until(struct tokenizer *t, const char* marker, int col_advance)
{
	t->column += col_advance;
	int c;
	do {
		c = tokenizer_getc(t);
		if(c == EOF) return 0;
		if(c == '\n') {
			t->line++;
			t->column = 0;
		} else t->column++;
	} while(!sequence_follows(t, c, marker));
	t->column += strlen(marker)-1;
	return 1;
}

/* Public wrapper: skip input until (and including) marker. */
void tokenizer_skip_until(struct tokenizer *t, const char *marker)
{
	ignore_until(t, marker, 0);
}

/* Produces the next token into *out.  Returns 1 on success (including
 * TT_EOF), 0 on error conditions (TT_UNKNOWN, TT_OVERFLOW, unterminated
 * string).  A previously peeked token is returned first.  Comment markers
 * registered via tokenizer_register_marker are skipped transparently. */
int tokenizer_next(struct tokenizer *t, struct token* out) {
	char *s = t->buf;
	out->value = 0;
	int c = 0;
	if(t->peeking) {
		*out = t->peek_token;
		t->peeking = 0;
		return 1;
	}
	while(1) {
		c = tokenizer_getc(t);
		if(c == EOF) break;

		/* components of multi-line comment marker might be terminals themselves */
		if(sequence_follows(t, c, t->marker[MT_MULTILINE_COMMENT_START])) {
			ignore_until(t, t->marker[MT_MULTILINE_COMMENT_END], strlen(t->marker[MT_MULTILINE_COMMENT_START]));
			continue;
		}
		if(sequence_follows(t, c, t->marker[MT_SINGLELINE_COMMENT_START])) {
			ignore_until(t, "\n", strlen(t->marker[MT_SINGLELINE_COMMENT_START]));
			continue;
		}
		if(is_sep(c)) {
			/* backslash-newline mid-token: line continuation, keep going */
			if(s != t->buf && c == '\\' && !isspace(s[-1])) {
				c = tokenizer_getc(t);
				if(c == '\n') continue;
				tokenizer_ungetc(t, c);
				c = '\\';
			} else if(is_plus_or_minus(c) && s > t->buf+1 &&
				(s[-1] == 'E' || s[-1] == 'e') && is_valid_float_until(t->buf, s-1)) {
				/* sign inside an exponent, e.g. "1e+5": part of the float */
				goto process_char;
			} else if(c == '.' && s != t->buf && is_valid_float_until(t->buf, s) == 1) {
				/* first dot after digits, e.g. "3." of "3.14" */
				goto process_char;
			} else if(c == '.' && s == t->buf) {
				/* leading dot: float like ".5" only if a digit follows */
				int jump = 0;
				c = tokenizer_getc(t);
				if(isdigit(c)) jump = 1;
				tokenizer_ungetc(t, c);
				c = '.';
				if(jump) goto process_char;
			}
			tokenizer_ungetc(t, c);
			break;
		}
		if((t->flags & TF_PARSE_WIDE_STRINGS) && s == t->buf && c == 'L') {
			/* possible wide literal prefix: push back and let the
			   s == t->buf path below re-read and classify it */
			c = tokenizer_getc(t);
			tokenizer_ungetc(t, c);
			tokenizer_ungetc(t, 'L');
			if(c == '\'' || c == '\"') break;
		}

		process_char:;
		s = assign_bufchar(t, s, c);
		/* NOTE(review): uses column as a proxy for bytes written — assumes
		   no newline occurs inside a token; TODO confirm. */
		if(t->column + 1 >= MAX_TOK_LEN) {
			out->type = TT_OVERFLOW;
			return apply_coords(t, out, s, 0);
		}
	}
	if(s == t->buf) {
		/* nothing accumulated: we stopped on EOF or on a separator */
		if(c == EOF) {
			out->type = TT_EOF;
			return apply_coords(t, out, s, 1);
		}

		int wide = 0;
		c = tokenizer_getc(t);
		if((t->flags & TF_PARSE_WIDE_STRINGS) && c == 'L') {
			c = tokenizer_getc(t);
			assert(c == '\'' || c == '\"');
			wide = 1;
			goto string_handling;
		} else if (c == '.' && sequence_follows(t, c, "...")) {
			strcpy(t->buf, "...");
			out->type = TT_ELLIPSIS;
			return apply_coords(t, out, s+3, 1);
		}

		{
			/* user-registered multi-char terminals, e.g. "<<" */
			int i;
			for(i = 0; i < t->custom_count; i++)
				if(sequence_follows(t, c, t->custom_tokens[i])) {
					const char *p = t->custom_tokens[i];
					while(*p) {
						s = assign_bufchar(t, s, *p);
						p++;
					}
					*s = 0;
					out->type = TT_CUSTOM + i;
					return apply_coords(t, out, s, 1);
				}
		}

		string_handling:
		s = assign_bufchar(t, s, c);
		*s = 0;
		//s = assign_bufchar(t, s, 0);
		if(c == '"' || c == '\'')
			if(t->flags & TF_PARSE_STRINGS) return get_string(t, c, out, wide);
		out->type = TT_SEP;
		out->value = c;
		if(c == '\n') {
			/* stamp coords before bumping the line counter */
			apply_coords(t, out, s, 1);
			t->line++;
			t->column=0;
			return 1;
		}
		return apply_coords(t, out, s, 1);
	}
	//s = assign_bufchar(t, s, 0);
	*s = 0;
	out->type = categorize(t->buf);
	return apply_coords(t, out, s, out->type != TT_UNKNOWN);
}

/* Replaces the tokenizer's flag set (see enum tokenizer_flags). */
void tokenizer_set_flags(struct tokenizer *t, int flags) {
	t->flags = flags;
}

/* Returns the current flag set. */
int tokenizer_get_flags(struct tokenizer *t) {
	return t->flags;
}

/* Resets *t to a clean state reading from `in`, line counter at 1 and
 * bufsize fixed at MAX_TOK_LEN.  All markers/custom tokens are cleared. */
void tokenizer_init(struct tokenizer *t, FILE* in, int flags) {
	*t = (struct tokenizer){ .input = in, .line = 1, .flags = flags, .bufsize = MAX_TOK_LEN};
}

/* Registers (or replaces) a comment marker string for the given slot. */
void tokenizer_register_marker(struct tokenizer *t, enum markertype mt, const char* marker)
{
	t->marker[mt] = marker;
}

/* Re-initializes the tokenizer and seeks its stream back to the start.
 * Preserves flags and filename; registered markers/custom tokens are lost
 * (tokenizer_init clears them).  Returns 1 on successful fseek, 0 on error. */
int tokenizer_rewind(struct tokenizer *t) {
	FILE *f = t->input;
	int flags = t->flags;
	const char* fn = t->filename;
	tokenizer_init(t, f, flags);
	tokenizer_set_filename(t, fn);
	return fseek(f, 0, SEEK_SET) == 0;
}
#ifndef TOKENIZER_H
#define TOKENIZER_H

/* tokenizer.h - public interface of the tinycpp tokenizer. */

#define MAX_TOK_LEN 4096  /* size of the per-tokenizer token buffer */
#define MAX_UNGETC 8      /* depth of the pushback buffer */

/* NOTE(review): the original #include targets were lost (angle brackets eaten
   by the rendering). Restored to the headers the declarations below require:
   FILE -> <stdio.h>, uint32_t -> <stdint.h>, off_t -> <sys/types.h>. */
#include <stdio.h>
#include <stdint.h>
#include <sys/types.h>

/* Pushback ring for tokenizer_ungetc: cnt = logical read position,
   buffered = number of characters currently pushed back. */
struct tokenizer_getc_buf {
	int buf[MAX_UNGETC];
	size_t cnt, buffered;
};

/* Slots for user-registered comment markers. */
enum markertype {
	MT_SINGLELINE_COMMENT_START = 0,
	MT_MULTILINE_COMMENT_START = 1,
	MT_MULTILINE_COMMENT_END = 2,
	MT_MAX = MT_MULTILINE_COMMENT_END
};

#define MAX_CUSTOM_TOKENS 32

enum tokentype {
	TT_IDENTIFIER = 1,
	TT_SQSTRING_LIT,
	TT_DQSTRING_LIT,
	TT_ELLIPSIS,
	TT_HEX_INT_LIT,
	TT_OCT_INT_LIT,
	TT_DEC_INT_LIT,
	TT_FLOAT_LIT,
	TT_SEP,
	/* errors and similar */
	TT_UNKNOWN,
	TT_OVERFLOW,
	TT_WIDECHAR_LIT,
	TT_WIDESTRING_LIT,
	TT_EOF,
	TT_CUSTOM = 1000 /* start user defined tokentype values */
};

/* Human-readable name of a token type (for diagnostics). */
const char* tokentype_to_str(enum tokentype tt);

/* One token: its classification, 1-based source coordinates, and, for
   TT_SEP, the separator character in `value`. */
struct token {
	enum tokentype type;
	uint32_t line;
	uint32_t column;
	int value;
};

enum tokenizer_flags {
	TF_PARSE_STRINGS = 1 << 0,      /* tokenize quoted literals as strings */
	TF_PARSE_WIDE_STRINGS = 1 << 1, /* recognize L"..." / L'...' prefixes */
};

struct tokenizer {
	FILE *input;
	uint32_t line;
	uint32_t column;
	int flags;
	int custom_count;
	int peeking;
	const char *custom_tokens[MAX_CUSTOM_TOKENS];
	char buf[MAX_TOK_LEN];   /* current token text, NUL-terminated */
	size_t bufsize;          /* always MAX_TOK_LEN after tokenizer_init */
	struct tokenizer_getc_buf getc_buf;
	const char* marker[MT_MAX+1];
	const char* filename;
	struct token peek_token; /* stashed token when peeking != 0 */
};

/* Reset *t to read from `in`; clears markers and custom tokens. */
void tokenizer_init(struct tokenizer *t, FILE* in, int flags);
void tokenizer_set_filename(struct tokenizer *t, const char*);
void tokenizer_set_flags(struct tokenizer *t, int flags);
int tokenizer_get_flags(struct tokenizer *t);
off_t tokenizer_ftello(struct tokenizer *t);
/* Register a comment marker (see enum markertype). */
void tokenizer_register_marker(struct tokenizer*, enum markertype, const char*);
/* Register a multi-character terminal; yields TT_CUSTOM + index. */
void tokenizer_register_custom_token(struct tokenizer*, int tokentype, const char*);
/* Produce the next token; returns 1 on success (incl. TT_EOF), 0 on error. */
int tokenizer_next(struct tokenizer *t, struct token* out);
int tokenizer_peek_token(struct tokenizer *t, struct token* out);
int tokenizer_peek(struct tokenizer *t);
/* Discard input through the end of `marker`. */
void tokenizer_skip_until(struct tokenizer *t, const char *marker);
/* Skip characters in set `chars`, counting them; 0 on EOF. */
int tokenizer_skip_chars(struct tokenizer *t, const char *chars, int *count);
/* Copy raw input into t->buf until `marker` (pushed back) or newline. */
int tokenizer_read_until(struct tokenizer *t, const char* marker, int stop_at_nl);
/* Re-init and seek the stream to offset 0; 1 on success. */
int tokenizer_rewind(struct tokenizer *t);

#ifdef __GNUC__
#pragma GCC diagnostic ignored "-Wunknown-pragmas"
#endif
#pragma RcB2 DEP "tokenizer.c"

#endif