├── .gitignore ├── COPYING ├── Makefile ├── README.md ├── cppmain.c ├── preproc.c ├── preproc.h ├── tokenizer.c └── tokenizer.h /.gitignore: -------------------------------------------------------------------------------- 1 | *.i 2 | *.o 3 | 4 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | tinycpp is licensed under the following standard MIT license: 2 | 3 | ---------------------------------------------------------------------- 4 | Copyright © 2019 rofl0r. 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining 7 | a copy of this software and associated documentation files (the 8 | "Software"), to deal in the Software without restriction, including 9 | without limitation the rights to use, copy, modify, merge, publish, 10 | distribute, sublicense, and/or sell copies of the Software, and to 11 | permit persons to whom the Software is furnished to do so, subject to 12 | the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be 15 | included in all copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 19 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 20 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 21 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 22 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 23 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
24 | ---------------------------------------------------------------------- 25 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | #Makefile autogenerated by RcB2 2 | prefix = /usr/local 3 | bindir = $(prefix)/bin 4 | 5 | PROG = cppmain 6 | SRCS = cppmain.c \ 7 | tokenizer.c \ 8 | preproc.c 9 | 10 | LIBULZ_BASE?=../cdev/cdev/lib/ 11 | 12 | LIBS = 13 | 14 | CFLAGS_N = 15 | CPPFLAGS_N = -I $(LIBULZ_BASE)/include 16 | LDFLAGS_N = 17 | 18 | OBJS = $(SRCS:.c=.o) 19 | 20 | MAKEFILE := $(firstword $(MAKEFILE_LIST)) 21 | 22 | -include config.mak 23 | 24 | all: $(PROG) 25 | 26 | clean: 27 | rm -f $(PROG) 28 | rm -f $(OBJS) 29 | 30 | rebuild: 31 | $(MAKE) -f $(MAKEFILE) clean && $(MAKE) -f $(MAKEFILE) all 32 | 33 | ddebug: 34 | $(MAKE) -f $(MAKEFILE) clean && $(MAKE) -f $(MAKEFILE) CFLAGS="-O0 -g3 -DDEBUG" all 35 | 36 | debug: 37 | $(MAKE) -f $(MAKEFILE) clean && $(MAKE) -f $(MAKEFILE) CFLAGS="-O0 -g3" all 38 | 39 | install: $(PROG) 40 | install -d $(DESTDIR)/$(bindir) 41 | install -D -m 755 $(PROG) $(DESTDIR)/$(bindir)/ 42 | 43 | src: $(SRCS) 44 | $(CC) $(CPPFLAGS_N) $(CPPFLAGS) $(CFLAGS_N) $(CFLAGS) -o $(PROG) $^ $(LDFLAGS_N) $(LDFLAGS) $(LIBS) 45 | 46 | %.o: %.c 47 | $(CC) $(CPPFLAGS_N) $(CPPFLAGS) $(CFLAGS_N) $(CFLAGS) -c -o $@ $< 48 | 49 | $(PROG): $(OBJS) 50 | $(CC) $(CFLAGS_N) $(CFLAGS) $(LDFLAGS_N) $(LDFLAGS) $(OBJS) $(LIBS) -o $@ 51 | 52 | .PHONY: all clean rebuild install src 53 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | tinycpp - a small, embeddable C-style preprocessor 2 | ================================================== 3 | 4 | tinycpp was created with the intention of having a C-style preprocessor 5 | for use in an assembler i'm working on. 
6 | the particular issue i faced with standard C preprocessors is that 7 | multiline-macros are expanded into a single line. this basically 8 | requires to add something like ';' to the assembler language, to support 9 | several expressions in a single line. 10 | 11 | one of the design goals from the start was to read the input token by 12 | token, instead of slurping the entire file into memory. 13 | this, unfortunately, required some trickery to get the right behaviour 14 | in some cases, but should save a lot of memory on big files 15 | (theoretically, it should be able to process gigabyte-big files, while 16 | only consuming a few MBs ram (depending on the amount of macros that 17 | need to be stored)). 18 | 19 | apart from that, tinycpp pretty much behaves like your standard cpp. 20 | 21 | it's self-hosting: it can preprocess its own source, and the result 22 | compiles fine, so it's quite complete (tested with musl libc headers). 23 | 24 | size 25 | ---- 26 | the 2 TUs used by the preprocessor library are less than 2 KLOC combined. 27 | additionally about 500 LOC of list and hash header implementations from 28 | libulz are used. this is still a lot less than ucpp's 8 KLOC-ish 29 | implementation. not as tiny as i'd like, but a C preprocessor is a 30 | surprisingly complex beast. 31 | 32 | speed 33 | ----- 34 | speed is slightly slower than GNU cpp, and slightly faster than mcpp on 35 | a 12MB testfile which defines, undefs and uses thousands of macros. 36 | 37 | differences to standard C preprocessors 38 | --------------------------------------- 39 | 40 | - "if" evaluation treats all numeric literals as integers, even if they 41 | have L/U/LL/LLU suffixes. this is probably the biggest blocker from 42 | becoming a fully compliant C preprocessor. 43 | shouldn't be hard to support though. 44 | - widechar literals in conditionals are treated as if they were a single 45 | non-wide character. 
- multiline macros keep newline characters, which doesn't cause any
  issues, apart from making it harder to diff against other CPPs output.
  (`__LINE__` macro behaves as expected, though, in that it shows the same
  line number for all expanded lines).
- no predefined macros such as `__STDC__`. you can set them yourself, if
  you like.
- a few test cases of mcpp fail. these are cornercases that are usually
  not encountered in the wild.
  e.g. https://github.com/ned14/mcpp/blob/master/test-c/n_5.c
- lines starting w/ comments like `/**/` followed by preprocessor directives
  are currently not detected as such. this is because comments are removed
  on the fly, not in a previous pass. it shouldn't be very hard to support
  it, though.
- no digraphs and trigraphs supported.
- multiple sequential whitespace characters are preserved.
- max token length is 4095, though this can easily be changed.
  many CPPs happily process much longer tokens, even though the standard
  doesn't require it.
- some built-ins like `__TIME__` and `__DATE__` are missing, but you can
  define them yourself if needed. `__LINE__` and `__FILE__` were added,
  as they're used by musl's headers.
- the printed diagnostics are sometimes not very helpful.

anything else not mentioned here is supported (including varargs, pasting,
stringification, ...)

differences to other C preprocessor libraries
---------------------------------------------

the preprocessor interface takes a `FILE*` as input and one as output.
it doesn't try to provide a C token stream.
77 | in order not to write to disk, you can use memory streams 78 | (open_memstream() to create a writable stream, followed by fflush() to 79 | make its contents available) 80 | 81 | how to build 82 | ------------ 83 | clone the libulz library https://github.com/rofl0r/libulz, and point the 84 | Makefile to the directory, or copy the 3 headers needed into the source 85 | tree, then run `make`. 86 | 87 | how to use 88 | ---------- 89 | look at `preproc.h` and `cppmain.c`, which implements the demo preprocessor 90 | program. 91 | 92 | acknowledgements 93 | ---------------- 94 | thanks go to mcpp's author, whose testsuite i extensively used. 95 | 96 | -------------------------------------------------------------------------------- /cppmain.c: -------------------------------------------------------------------------------- 1 | #include "preproc.h" 2 | #include 3 | #include 4 | 5 | static int usage(char *a0) { 6 | fprintf(stderr, 7 | "example preprocessor\n" 8 | "usage: %s [-I includedir...] [-D define] file\n" 9 | "if no filename or '-' is passed, stdin is used.\n" 10 | , a0); 11 | return 1; 12 | } 13 | 14 | int main(int argc, char** argv) { 15 | int c; char* tmp; 16 | struct cpp* cpp = cpp_new(); 17 | while ((c = getopt(argc, argv, "D:I:")) != EOF) switch(c) { 18 | case 'I': cpp_add_includedir(cpp, optarg); break; 19 | case 'D': 20 | if((tmp = strchr(optarg, '='))) *tmp = ' '; 21 | cpp_add_define(cpp, optarg); 22 | break; 23 | default: return usage(argv[0]); 24 | } 25 | char *fn = "stdin"; 26 | FILE *in = stdin; 27 | if(argv[optind] && strcmp(argv[optind], "-")) { 28 | fn = argv[optind]; 29 | in = fopen(fn, "r"); 30 | if(!in) { 31 | perror("fopen"); 32 | return 1; 33 | } 34 | } 35 | int ret = cpp_run(cpp, in, stdout, fn); 36 | cpp_free(cpp); 37 | if(in != stdin) fclose(in); 38 | return !ret; 39 | } 40 | 41 | -------------------------------------------------------------------------------- /preproc.c: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "preproc.h" 5 | #include "tokenizer.h" 6 | #include "tglist.h" 7 | #include "hbmap.h" 8 | 9 | #define MACRO_FLAG_OBJECTLIKE (1U<<31) 10 | #define MACRO_FLAG_VARIADIC (1U<<30) 11 | #define MACRO_ARGCOUNT_MASK (~(0|MACRO_FLAG_OBJECTLIKE|MACRO_FLAG_VARIADIC)) 12 | 13 | #define OBJECTLIKE(M) (M->num_args & MACRO_FLAG_OBJECTLIKE) 14 | #define FUNCTIONLIKE(M) (!(OBJECTLIKE(M))) 15 | #define MACRO_ARGCOUNT(M) (M->num_args & MACRO_ARGCOUNT_MASK) 16 | #define MACRO_VARIADIC(M) (M->num_args & MACRO_FLAG_VARIADIC) 17 | 18 | #define MAX_RECURSION 32 19 | 20 | static unsigned string_hash(const char* s) { 21 | uint_fast32_t h = 0; 22 | while (*s) { 23 | h = 16*h + *s++; 24 | h ^= h>>24 & 0xf0; 25 | } 26 | return h & 0xfffffff; 27 | } 28 | 29 | struct macro { 30 | unsigned num_args; 31 | FILE* str_contents; 32 | char *str_contents_buf; 33 | tglist(char*) argnames; 34 | }; 35 | 36 | struct cpp { 37 | tglist(char*) includedirs; 38 | hbmap(char*, struct macro, 128) *macros; 39 | const char *last_file; 40 | int last_line; 41 | struct tokenizer *tchain[MAX_RECURSION]; 42 | }; 43 | 44 | static int token_needs_string(struct token *tok) { 45 | switch(tok->type) { 46 | case TT_IDENTIFIER: 47 | case TT_WIDECHAR_LIT: 48 | case TT_WIDESTRING_LIT: 49 | case TT_SQSTRING_LIT: 50 | case TT_DQSTRING_LIT: 51 | case TT_ELLIPSIS: 52 | case TT_HEX_INT_LIT: 53 | case TT_OCT_INT_LIT: 54 | case TT_DEC_INT_LIT: 55 | case TT_FLOAT_LIT: 56 | case TT_UNKNOWN: 57 | return 1; 58 | default: 59 | return 0; 60 | } 61 | } 62 | 63 | static void tokenizer_from_file(struct tokenizer *t, FILE* f) { 64 | tokenizer_init(t, f, TF_PARSE_STRINGS); 65 | tokenizer_set_filename(t, ""); 66 | tokenizer_rewind(t); 67 | } 68 | 69 | static int strptrcmp(const void *a, const void *b) { 70 | const char * const *x = a; 71 | const char * const *y = b; 72 | return strcmp(*x, *y); 73 | } 74 | 75 | static 
struct macro* get_macro(struct cpp *cpp, const char *name) { 76 | return hbmap_get(cpp->macros, name); 77 | } 78 | 79 | static void add_macro(struct cpp *cpp, const char *name, struct macro*m) { 80 | hbmap_insert(cpp->macros, name, *m); 81 | } 82 | 83 | static int undef_macro(struct cpp *cpp, const char *name) { 84 | hbmap_iter k = hbmap_find(cpp->macros, name); 85 | if(k == (hbmap_iter) -1) return 0; 86 | struct macro *m = &hbmap_getval(cpp->macros, k); 87 | free(hbmap_getkey(cpp->macros, k)); 88 | if(m->str_contents) fclose(m->str_contents); 89 | free(m->str_contents_buf); 90 | tglist_free_values(&m->argnames); 91 | tglist_free_items(&m->argnames); 92 | hbmap_delete(cpp->macros, k); 93 | return 1; 94 | } 95 | 96 | static void free_macros(struct cpp *cpp) { 97 | hbmap_iter i; 98 | hbmap_foreach(cpp->macros, i) { 99 | while(hbmap_iter_index_valid(cpp->macros, i)) 100 | undef_macro(cpp, hbmap_getkey(cpp->macros, i)); 101 | } 102 | hbmap_fini(cpp->macros, 1); 103 | free(cpp->macros); 104 | } 105 | 106 | static void error_or_warning(const char *err, const char* type, struct tokenizer *t, struct token *curr) { 107 | unsigned column = curr ? curr->column : t->column; 108 | unsigned line = curr ? 
curr->line : t->line; 109 | dprintf(2, "<%s> %u:%u %s: '%s'\n", t->filename, line, column, type, err); 110 | dprintf(2, "%s\n", t->buf); 111 | for(int i = 0; i < strlen(t->buf); i++) 112 | dprintf(2, "^"); 113 | dprintf(2, "\n"); 114 | } 115 | static void error(const char *err, struct tokenizer *t, struct token *curr) { 116 | error_or_warning(err, "error", t, curr); 117 | } 118 | static void warning(const char *err, struct tokenizer *t, struct token *curr) { 119 | error_or_warning(err, "warning", t, curr); 120 | } 121 | 122 | static void emit(FILE *out, const char *s) { 123 | fprintf(out, "%s", s); 124 | } 125 | 126 | static int x_tokenizer_next_of(struct tokenizer *t, struct token *tok, int fail_unk) { 127 | int ret = tokenizer_next(t, tok); 128 | if(tok->type == TT_OVERFLOW) { 129 | error("max token length of 4095 exceeded!", t, tok); 130 | return 0; 131 | } else if (fail_unk && ret == 0) { 132 | error("tokenizer encountered unknown token", t, tok); 133 | return 0; 134 | } 135 | return 1; 136 | } 137 | 138 | #define tokenizer_next(T, TOK) x_tokenizer_next_of(T, TOK, 0) 139 | #define x_tokenizer_next(T, TOK) x_tokenizer_next_of(T, TOK, 1) 140 | 141 | static int is_whitespace_token(struct token *token) 142 | { 143 | return token->type == TT_SEP && 144 | (token->value == ' ' || token->value == '\t'); 145 | } 146 | 147 | /* return index of matching item in values array, or -1 on error */ 148 | static int expect(struct tokenizer *t, enum tokentype tt, const char* values[], struct token *token) 149 | { 150 | int ret; 151 | do { 152 | ret = tokenizer_next(t, token); 153 | if(ret == 0 || token->type == TT_EOF) goto err; 154 | } while(is_whitespace_token(token)); 155 | 156 | if(token->type != tt) { 157 | err: 158 | error("unexpected token", t, token); 159 | return -1; 160 | } 161 | int i = 0; 162 | while(values[i]) { 163 | if(!strcmp(values[i], t->buf)) 164 | return i; 165 | ++i; 166 | } 167 | return -1; 168 | } 169 | 170 | static int is_char(struct token *tok, int ch) { 
171 | return tok->type == TT_SEP && tok->value == ch; 172 | } 173 | 174 | static void flush_whitespace(FILE *out, int *ws_count) { 175 | while(*ws_count > 0) { 176 | emit(out, " "); 177 | --(*ws_count); 178 | } 179 | } 180 | 181 | /* skips until the next non-whitespace token (if the current one is one too)*/ 182 | static int eat_whitespace(struct tokenizer *t, struct token *token, int *count) { 183 | *count = 0; 184 | int ret = 1; 185 | while (is_whitespace_token(token)) { 186 | ++(*count); 187 | ret = x_tokenizer_next(t, token); 188 | if(!ret) break; 189 | } 190 | return ret; 191 | } 192 | /* fetches the next token until it is non-whitespace */ 193 | static int skip_next_and_ws(struct tokenizer *t, struct token *tok) { 194 | int ret = tokenizer_next(t, tok); 195 | if(!ret) return ret; 196 | int ws_count; 197 | ret = eat_whitespace(t, tok, &ws_count); 198 | return ret; 199 | } 200 | 201 | static void emit_token(FILE* out, struct token *tok, const char* strbuf) { 202 | if(tok->type == TT_SEP) { 203 | fprintf(out, "%c", tok->value); 204 | } else if(strbuf && token_needs_string(tok)) { 205 | fprintf(out, "%s", strbuf); 206 | } else { 207 | dprintf(2, "oops, dunno how to handle tt %d (%s)\n", (int) tok->type, strbuf); 208 | } 209 | } 210 | 211 | int parse_file(struct cpp* cpp, FILE *f, const char*, FILE *out); 212 | static int include_file(struct cpp* cpp, struct tokenizer *t, FILE* out) { 213 | static const char* inc_chars[] = { "\"", "<", 0}; 214 | static const char* inc_chars_end[] = { "\"", ">", 0}; 215 | struct token tok; 216 | tokenizer_set_flags(t, 0); // disable string tokenization 217 | 218 | int inc1sep = expect(t, TT_SEP, inc_chars, &tok); 219 | if(inc1sep == -1) { 220 | error("expected one of [\"<]", t, &tok); 221 | return 0; 222 | } 223 | int ret = tokenizer_read_until(t, inc_chars_end[inc1sep], 1); 224 | if(!ret) { 225 | error("error parsing filename", t, &tok); 226 | return 0; 227 | } 228 | // TODO: different path lookup depending on whether " or < 229 | 
size_t i; 230 | FILE *f = 0; 231 | tglist_foreach(&cpp->includedirs, i) { 232 | char buf[512]; 233 | snprintf(buf, sizeof buf, "%s/%s", tglist_get(&cpp->includedirs, i), t->buf); 234 | f = fopen(buf, "r"); 235 | if(f) break; 236 | } 237 | if(!f) { 238 | dprintf(2, "%s: ", t->buf); 239 | perror("fopen"); 240 | return 0; 241 | } 242 | const char *fn = strdup(t->buf); 243 | assert(tokenizer_next(t, &tok) && is_char(&tok, inc_chars_end[inc1sep][0])); 244 | 245 | tokenizer_set_flags(t, TF_PARSE_STRINGS); 246 | return parse_file(cpp, f, fn, out); 247 | } 248 | 249 | static int emit_error_or_warning(struct tokenizer *t, int is_error) { 250 | int ws_count; 251 | int ret = tokenizer_skip_chars(t, " \t", &ws_count); 252 | if(!ret) return ret; 253 | struct token tmp = {.column = t->column, .line = t->line}; 254 | ret = tokenizer_read_until(t, "\n", 1); 255 | if(is_error) { 256 | error(t->buf, t, &tmp); 257 | return 0; 258 | } 259 | warning(t->buf, t, &tmp); 260 | return 1; 261 | } 262 | 263 | static FILE *freopen_r(FILE *f, char **buf, size_t *size) { 264 | fflush(f); 265 | fclose(f); 266 | return fmemopen(*buf, *size, "r"); 267 | } 268 | 269 | static int consume_nl_and_ws(struct tokenizer *t, struct token *tok, int expected) { 270 | if(!x_tokenizer_next(t, tok)) { 271 | err: 272 | error("unexpected", t, tok); 273 | return 0; 274 | } 275 | if(expected) { 276 | if(tok->type != TT_SEP || tok->value != expected) goto err; 277 | switch(expected) { 278 | case '\\' : expected = '\n'; break; 279 | case '\n' : expected = 0; break; 280 | } 281 | } else { 282 | if(is_whitespace_token(tok)) ; 283 | else if(is_char(tok, '\\')) expected = '\n'; 284 | else return 1; 285 | } 286 | return consume_nl_and_ws(t, tok, expected); 287 | } 288 | 289 | static int expand_macro(struct cpp *cpp, struct tokenizer *t, FILE* out, const char* name, unsigned rec_level, char *visited[]); 290 | 291 | static int parse_macro(struct cpp *cpp, struct tokenizer *t) { 292 | int ws_count; 293 | int ret = 
tokenizer_skip_chars(t, " \t", &ws_count); 294 | if(!ret) return ret; 295 | struct token curr; //tmp = {.column = t->column, .line = t->line}; 296 | ret = tokenizer_next(t, &curr) && curr.type != TT_EOF; 297 | if(!ret) { 298 | error("parsing macro name", t, &curr); 299 | return ret; 300 | } 301 | if(curr.type != TT_IDENTIFIER) { 302 | error("expected identifier", t, &curr); 303 | return 0; 304 | } 305 | const char* macroname = strdup(t->buf); 306 | #ifdef DEBUG 307 | dprintf(2, "parsing macro %s\n", macroname); 308 | #endif 309 | int redefined = 0; 310 | if(get_macro(cpp, macroname)) { 311 | if(!strcmp(macroname, "defined")) { 312 | error("\"defined\" cannot be used as a macro name", t, &curr); 313 | return 0; 314 | } 315 | redefined = 1; 316 | } 317 | 318 | struct macro new = { 0 }; 319 | unsigned macro_flags = MACRO_FLAG_OBJECTLIKE; 320 | tglist_init(&new.argnames); 321 | 322 | ret = x_tokenizer_next(t, &curr) && curr.type != TT_EOF; 323 | if(!ret) return ret; 324 | 325 | if (is_char(&curr, '(')) { 326 | macro_flags = 0; 327 | unsigned expected = 0; 328 | while(1) { 329 | /* process next function argument identifier */ 330 | ret = consume_nl_and_ws(t, &curr, expected); 331 | if(!ret) { 332 | error("unexpected", t, &curr); 333 | return ret; 334 | } 335 | expected = 0; 336 | if(curr.type == TT_SEP) { 337 | switch(curr.value) { 338 | case '\\': 339 | expected = '\n'; 340 | continue; 341 | case ',': 342 | continue; 343 | case ')': 344 | ret = tokenizer_skip_chars(t, " \t", &ws_count); 345 | if(!ret) return ret; 346 | goto break_loop1; 347 | default: 348 | error("unexpected character", t, &curr); 349 | return 0; 350 | } 351 | } else if(!(curr.type == TT_IDENTIFIER || curr.type == TT_ELLIPSIS)) { 352 | error("expected identifier for macro arg", t, &curr); 353 | return 0; 354 | } 355 | { 356 | if(curr.type == TT_ELLIPSIS) { 357 | if(macro_flags & MACRO_FLAG_VARIADIC) { 358 | error("\"...\" isn't the last parameter", t, &curr); 359 | return 0; 360 | } 361 | macro_flags 
|= MACRO_FLAG_VARIADIC; 362 | } 363 | char *tmps = strdup(t->buf); 364 | tglist_add(&new.argnames, tmps); 365 | } 366 | ++new.num_args; 367 | } 368 | break_loop1:; 369 | } else if(is_whitespace_token(&curr)) { 370 | ret = tokenizer_skip_chars(t, " \t", &ws_count); 371 | if(!ret) return ret; 372 | } else if(is_char(&curr, '\n')) { 373 | /* content-less macro */ 374 | goto done; 375 | } 376 | 377 | struct FILE_container { 378 | FILE *f; 379 | char *buf; 380 | size_t len; 381 | } contents; 382 | contents.f = open_memstream(&contents.buf, &contents.len); 383 | 384 | int backslash_seen = 0; 385 | while(1) { 386 | /* ignore unknown tokens in macro body */ 387 | ret = tokenizer_next(t, &curr); 388 | if(!ret) return 0; 389 | if(curr.type == TT_EOF) break; 390 | if (curr.type == TT_SEP) { 391 | if(curr.value == '\\') 392 | backslash_seen = 1; 393 | else { 394 | if(curr.value == '\n' && !backslash_seen) break; 395 | emit_token(contents.f, &curr, t->buf); 396 | backslash_seen = 0; 397 | } 398 | } else { 399 | emit_token(contents.f, &curr, t->buf); 400 | } 401 | } 402 | new.str_contents = freopen_r(contents.f, &contents.buf, &contents.len); 403 | new.str_contents_buf = contents.buf; 404 | done: 405 | if(redefined) { 406 | struct macro *old = get_macro(cpp, macroname); 407 | char *s_old = old->str_contents_buf ? old->str_contents_buf : ""; 408 | char *s_new = new.str_contents_buf ? 
new.str_contents_buf : ""; 409 | if(strcmp(s_old, s_new)) { 410 | char buf[128]; 411 | sprintf(buf, "redefinition of macro %s", macroname); 412 | warning(buf, t, 0); 413 | } 414 | } 415 | new.num_args |= macro_flags; 416 | add_macro(cpp, macroname, &new); 417 | return 1; 418 | } 419 | 420 | static size_t macro_arglist_pos(struct macro *m, const char* iden) { 421 | size_t i; 422 | for(i = 0; i < tglist_getsize(&m->argnames); i++) { 423 | char *item = tglist_get(&m->argnames, i); 424 | if(!strcmp(item, iden)) return i; 425 | } 426 | return (size_t) -1; 427 | } 428 | 429 | 430 | struct macro_info { 431 | const char *name; 432 | unsigned nest; 433 | unsigned first; 434 | unsigned last; 435 | }; 436 | 437 | static int was_visited(const char *name, char*visited[], unsigned rec_level) { 438 | int x; 439 | for(x = rec_level; x >= 0; --x) { 440 | if(!strcmp(visited[x], name)) return 1; 441 | } 442 | return 0; 443 | } 444 | 445 | unsigned get_macro_info(struct cpp* cpp, 446 | struct tokenizer *t, 447 | struct macro_info *mi_list, size_t *mi_cnt, 448 | unsigned nest, unsigned tpos, const char *name, 449 | char* visited[], unsigned rec_level 450 | ) { 451 | int brace_lvl = 0; 452 | while(1) { 453 | struct token tok; 454 | int ret = tokenizer_next(t, &tok); 455 | if(!ret || tok.type == TT_EOF) break; 456 | #ifdef DEBUG 457 | dprintf(2, "(%s) nest %d, brace %u t: %s\n", name, nest, brace_lvl, t->buf); 458 | #endif 459 | struct macro* m = 0; 460 | if(tok.type == TT_IDENTIFIER && (m = get_macro(cpp, t->buf)) && !was_visited(t->buf, visited, rec_level)) { 461 | const char* newname = strdup(t->buf); 462 | if(FUNCTIONLIKE(m)) { 463 | if(tokenizer_peek(t) == '(') { 464 | unsigned tpos_save = tpos; 465 | tpos = get_macro_info(cpp, t, mi_list, mi_cnt, nest+1, tpos+1, newname, visited, rec_level); 466 | mi_list[*mi_cnt] = (struct macro_info) { 467 | .name = newname, 468 | .nest=nest+1, 469 | .first = tpos_save, 470 | .last = tpos + 1}; 471 | ++(*mi_cnt); 472 | } else { 473 | /* suppress 
expansion */ 474 | } 475 | } else { 476 | mi_list[*mi_cnt] = (struct macro_info) { 477 | .name = newname, 478 | .nest=nest+1, 479 | .first = tpos, 480 | .last = tpos + 1}; 481 | ++(*mi_cnt); 482 | } 483 | } else if(is_char(&tok, '(')) { 484 | ++brace_lvl; 485 | } else if(is_char(&tok, ')')) { 486 | --brace_lvl; 487 | if(brace_lvl == 0 && nest != 0) break; 488 | } 489 | ++tpos; 490 | } 491 | return tpos; 492 | } 493 | 494 | struct FILE_container { 495 | FILE *f; 496 | char *buf; 497 | size_t len; 498 | struct tokenizer t; 499 | }; 500 | 501 | static void free_file_container(struct FILE_container *fc) { 502 | fclose(fc->f); 503 | free(fc->buf); 504 | } 505 | 506 | static int mem_tokenizers_join( 507 | struct FILE_container* org, struct FILE_container *inj, 508 | struct FILE_container* result, 509 | int first, off_t lastpos) { 510 | result->f = open_memstream(&result->buf, &result->len); 511 | size_t i; 512 | struct token tok; 513 | int ret; 514 | tokenizer_rewind(&org->t); 515 | for(i=0; it, &tok); 517 | assert(ret && tok.type != TT_EOF); 518 | emit_token(result->f, &tok, org->t.buf); 519 | } 520 | int cnt = 0, last = first; 521 | while(1) { 522 | ret = tokenizer_next(&inj->t, &tok); 523 | if(!ret || tok.type == TT_EOF) break; 524 | emit_token(result->f, &tok, inj->t.buf); 525 | ++cnt; 526 | } 527 | while(tokenizer_ftello(&org->t) < lastpos) { 528 | ret = tokenizer_next(&org->t, &tok); 529 | last++; 530 | } 531 | 532 | int diff = cnt - ((int) last - (int) first); 533 | 534 | while(1) { 535 | ret = tokenizer_next(&org->t, &tok); 536 | if(!ret || tok.type == TT_EOF) break; 537 | emit_token(result->f, &tok, org->t.buf); 538 | } 539 | 540 | result->f = freopen_r(result->f, &result->buf, &result->len); 541 | tokenizer_from_file(&result->t, result->f); 542 | return diff; 543 | } 544 | 545 | static int tchain_parens_follows(struct cpp *cpp, int rec_level) { 546 | int i, c = 0; 547 | for(i=rec_level;i>=0;--i) { 548 | c = tokenizer_peek(cpp->tchain[i]); 549 | if(c == EOF) 
continue; 550 | if(c == '(') return i; 551 | else break; 552 | } 553 | return -1; 554 | } 555 | 556 | static int stringify(struct cpp *ccp, struct tokenizer *t, FILE* output) { 557 | int ret = 1; 558 | struct token tok; 559 | emit(output, "\""); 560 | while(1) { 561 | ret = tokenizer_next(t, &tok); 562 | if(!ret) return ret; 563 | if(tok.type == TT_EOF) break; 564 | if(is_char(&tok, '\n')) continue; 565 | if(is_char(&tok, '\\') && tokenizer_peek(t) == '\n') continue; 566 | if(tok.type == TT_DQSTRING_LIT) { 567 | char *s = t->buf; 568 | char buf[2] = {0}; 569 | while(*s) { 570 | if(*s == '\"') { 571 | emit(output, "\\\""); 572 | } else if (*s == '\\') { 573 | emit(output, "\\\\"); 574 | } else { 575 | buf[0] = *s; 576 | emit(output, buf); 577 | } 578 | ++s; 579 | } 580 | } else 581 | emit_token(output, &tok, t->buf); 582 | } 583 | emit(output, "\""); 584 | return ret; 585 | } 586 | 587 | /* rec_level -1 serves as a magic value to signal we're using 588 | expand_macro from the if-evaluator code, which means activating 589 | the "define" macro */ 590 | static int expand_macro(struct cpp* cpp, struct tokenizer *t, FILE* out, const char* name, unsigned rec_level, char* visited[]) { 591 | int is_define = !strcmp(name, "defined"); 592 | 593 | struct macro *m; 594 | if(is_define && rec_level != -1) 595 | m = NULL; 596 | else m = get_macro(cpp, name); 597 | if(!m) { 598 | emit(out, name); 599 | return 1; 600 | } 601 | if(rec_level == -1) rec_level = 0; 602 | if(rec_level >= MAX_RECURSION) { 603 | error("max recursion level reached", t, 0); 604 | return 0; 605 | } 606 | #ifdef DEBUG 607 | dprintf(2, "lvl %u: expanding macro %s (%s)\n", rec_level, name, m->str_contents_buf); 608 | #endif 609 | 610 | if(rec_level == 0 && strcmp(t->filename, "")) { 611 | cpp->last_file = t->filename; 612 | cpp->last_line = t->line; 613 | } 614 | if(!strcmp(name, "__FILE__")) { 615 | emit(out, "\""); 616 | emit(out, cpp->last_file); 617 | emit(out, "\""); 618 | return 1; 619 | } else 
if(!strcmp(name, "__LINE__")) { 620 | char buf[64]; 621 | sprintf(buf, "%d", cpp->last_line); 622 | emit(out, buf); 623 | return 1; 624 | } 625 | 626 | if(visited[rec_level]) free(visited[rec_level]); 627 | visited[rec_level] = strdup(name); 628 | cpp->tchain[rec_level] = t; 629 | 630 | size_t i; 631 | struct token tok; 632 | unsigned num_args = MACRO_ARGCOUNT(m); 633 | struct FILE_container *argvalues = calloc(MACRO_VARIADIC(m) ? num_args + 1 : num_args, sizeof(struct FILE_container)); 634 | 635 | for(i=0; i < num_args; i++) 636 | argvalues[i].f = open_memstream(&argvalues[i].buf, &argvalues[i].len); 637 | 638 | /* replace named arguments in the contents of the macro call */ 639 | if(FUNCTIONLIKE(m)) { 640 | int ret; 641 | if((ret = tokenizer_peek(t)) != '(') { 642 | /* function-like macro shall not be expanded if not followed by '(' */ 643 | if(ret == EOF && rec_level > 0 && (ret = tchain_parens_follows(cpp, rec_level-1)) != -1) { 644 | // warning("Replacement text involved subsequent text", t, 0); 645 | t = cpp->tchain[ret]; 646 | } else { 647 | emit(out, name); 648 | goto cleanup; 649 | } 650 | } 651 | ret = x_tokenizer_next(t, &tok); 652 | assert(ret && is_char(&tok, '(')); 653 | 654 | unsigned curr_arg = 0, need_arg = 1, parens = 0; 655 | int ws_count; 656 | if(!tokenizer_skip_chars(t, " \t", &ws_count)) return 0; 657 | 658 | int varargs = 0; 659 | if(num_args == 1 && MACRO_VARIADIC(m)) varargs = 1; 660 | while(1) { 661 | int ret = tokenizer_next(t, &tok); 662 | if(!ret) return 0; 663 | if( tok.type == TT_EOF) { 664 | dprintf(2, "warning EOF\n"); 665 | break; 666 | } 667 | if(!parens && is_char(&tok, ',') && !varargs) { 668 | if(need_arg && !ws_count) { 669 | /* empty argument is OK */ 670 | } 671 | need_arg = 1; 672 | if(!varargs) curr_arg++; 673 | if(curr_arg + 1 == num_args && MACRO_VARIADIC(m)) { 674 | varargs = 1; 675 | } else if(curr_arg >= num_args) { 676 | error("too many arguments for function macro", t, &tok); 677 | return 0; 678 | } 679 | ret = 
tokenizer_skip_chars(t, " \t", &ws_count); 680 | if(!ret) return ret; 681 | continue; 682 | } else if(is_char(&tok, '(')) { 683 | ++parens; 684 | } else if(is_char(&tok, ')')) { 685 | if(!parens) { 686 | if(curr_arg + num_args && curr_arg < num_args-1) { 687 | error("too few args for function macro", t, &tok); 688 | return 0; 689 | } 690 | break; 691 | } 692 | --parens; 693 | } else if(is_char(&tok, '\\')) { 694 | if(tokenizer_peek(t) == '\n') continue; 695 | } 696 | need_arg = 0; 697 | emit_token(argvalues[curr_arg].f, &tok, t->buf); 698 | } 699 | } 700 | 701 | for(i=0; i < num_args; i++) { 702 | argvalues[i].f = freopen_r(argvalues[i].f, &argvalues[i].buf, &argvalues[i].len); 703 | tokenizer_from_file(&argvalues[i].t, argvalues[i].f); 704 | #ifdef DEBUG 705 | dprintf(2, "macro argument %i: %s\n", (int) i, argvalues[i].buf); 706 | #endif 707 | } 708 | 709 | if(is_define) { 710 | if(get_macro(cpp, argvalues[0].buf)) 711 | emit(out, "1"); 712 | else 713 | emit(out, "0"); 714 | } 715 | 716 | if(!m->str_contents) goto cleanup; 717 | 718 | struct FILE_container cwae = {0}; /* contents_with_args_expanded */ 719 | cwae.f = open_memstream(&cwae.buf, &cwae.len); 720 | FILE* output = cwae.f; 721 | 722 | struct tokenizer t2; 723 | tokenizer_from_file(&t2, m->str_contents); 724 | int hash_count = 0; 725 | int ws_count = 0; 726 | while(1) { 727 | int ret; 728 | ret = tokenizer_next(&t2, &tok); 729 | if(!ret) return 0; 730 | if(tok.type == TT_EOF) break; 731 | if(tok.type == TT_IDENTIFIER) { 732 | flush_whitespace(output, &ws_count); 733 | char *id = t2.buf; 734 | if(MACRO_VARIADIC(m) && !strcmp(t2.buf, "__VA_ARGS__")) { 735 | id = "..."; 736 | } 737 | size_t arg_nr = macro_arglist_pos(m, id); 738 | if(arg_nr != (size_t) -1) { 739 | tokenizer_rewind(&argvalues[arg_nr].t); 740 | if(hash_count == 1) ret = stringify(cpp, &argvalues[arg_nr].t, output); 741 | else while(1) { 742 | ret = tokenizer_next(&argvalues[arg_nr].t, &tok); 743 | if(!ret) return ret; 744 | if(tok.type == 
TT_EOF) break;
				/* splice the pre-expanded argument tokens in place of the parameter */
				emit_token(output, &tok, argvalues[arg_nr].t.buf);
			}
			hash_count = 0;
		} else {
			if(hash_count == 1) {
hash_err:
				error("'#' is not followed by macro parameter", &t2, &tok);
				return 0;
			}
			emit_token(output, &tok, t2.buf);
		}
	} else if(is_char(&tok, '#')) {
		/* count consecutive '#' chars: 1 = stringify, 2 = token paste */
		if(hash_count) {
			goto hash_err;
		}
		while(1) {
			++hash_count;
			/* in a real cpp we'd need to look for '\\' first */
			while(tokenizer_peek(&t2) == '\n') {
				x_tokenizer_next(&t2, &tok);
			}
			if(tokenizer_peek(&t2) == '#') x_tokenizer_next(&t2, &tok);
			else break;
		}
		if(hash_count == 1) flush_whitespace(output, &ws_count);
		else if(hash_count > 2) {
			error("only two '#' characters allowed for macro expansion", &t2, &tok);
			return 0;
		}
		/* '##' also swallows newlines so the paste operands end up adjacent */
		if(hash_count == 2)
			ret = tokenizer_skip_chars(&t2, " \t\n", &ws_count);
		else
			ret = tokenizer_skip_chars(&t2, " \t", &ws_count);

		if(!ret) return ret;
		ws_count = 0;

	} else if(is_whitespace_token(&tok)) {
		ws_count++;
	} else {
		if(hash_count == 1) goto hash_err;
		flush_whitespace(output, &ws_count);
		emit_token(output, &tok, t2.buf);
	}
	}
	flush_whitespace(output, &ws_count);

	/* we need to expand macros after the macro arguments have been inserted */
	if(1) {
		/* reopen the arg-substituted body as a readable memstream */
		cwae.f = freopen_r(cwae.f, &cwae.buf, &cwae.len);
#ifdef DEBUG
		dprintf(2, "contents with args expanded: %s\n", cwae.buf);
#endif
		tokenizer_from_file(&cwae.t, cwae.f);
		/* first pass: count identifiers that name known macros */
		size_t mac_cnt = 0;
		while(1) {
			int ret = tokenizer_next(&cwae.t, &tok);
			if(!ret) return ret;
			if(tok.type == TT_EOF) break;
			if(tok.type == TT_IDENTIFIER && get_macro(cpp, cwae.t.buf))
				++mac_cnt;
		}

		tokenizer_rewind(&cwae.t);
		struct macro_info *mcs = calloc(mac_cnt, sizeof(struct macro_info));
		{
			/* second pass: record token positions and nesting of each macro use */
			size_t mac_iter = 0;
			get_macro_info(cpp, &cwae.t, mcs, &mac_iter, 0, 0, "null", visited, rec_level);
			/* some of the macros might not expand at this stage (without braces)*/
			while(mac_cnt && mcs[mac_cnt-1].name == 0)
				--mac_cnt;
		}
		size_t i; int depth = 0;
		for(i = 0; i < mac_cnt; ++i) {
			if(mcs[i].nest > depth) depth = mcs[i].nest;
		}
		/* expand innermost (deepest-nested) macro uses first */
		while(depth > -1) {
			for(i = 0; i < mac_cnt; ++i) if(mcs[i].nest == depth) {
				struct macro_info *mi = &mcs[i];
				tokenizer_rewind(&cwae.t);
				size_t j;
				struct token utok;
				/* skip to the token position of this macro use */
				for(j = 0; j < mi->first+1; ++j)
					tokenizer_next(&cwae.t, &utok);
				struct FILE_container t2 = {0}, tmp = {0};
				t2.f = open_memstream(&t2.buf, &t2.len);
				if(!expand_macro(cpp, &cwae.t, t2.f, mi->name, rec_level+1, visited))
					return 0;
				t2.f = freopen_r(t2.f, &t2.buf, &t2.len);
				tokenizer_from_file(&t2.t, t2.f);
				/* manipulating the stream in case more stuff has been consumed */
				off_t cwae_pos = tokenizer_ftello(&cwae.t);
				tokenizer_rewind(&cwae.t);
#ifdef DEBUG
				dprintf(2, "merging %s with %s\n", cwae.buf, t2.buf);
#endif
				/* splice the expansion into the stream; diff = change in token count */
				int diff = mem_tokenizers_join(&cwae, &t2, &tmp, mi->first, cwae_pos);
				free_file_container(&cwae);
				free_file_container(&t2);
				cwae = tmp;
#ifdef DEBUG
				dprintf(2, "result: %s\n", cwae.buf);
#endif
				if(diff == 0) continue;
				/* shift the recorded positions of the other macro uses */
				for(j = 0; j < mac_cnt; ++j) {
					if(j == i) continue;
					struct macro_info *mi2 = &mcs[j];
					/* modified element mi can be either inside, after or before
					   another macro. the after case doesn't affect us. */
					if(mi->first >= mi2->first && mi->last <= mi2->last) {
						/* inside m2 */
						mi2->last += diff;
					} else if (mi->first < mi2->first) {
						/* before m2 */
						mi2->first += diff;
						mi2->last += diff;
					}
				}
			}
			--depth;
		}
		tokenizer_rewind(&cwae.t);
		/* final pass: emit result; a trailing function-like macro name may
		   still expand if its '(' follows in the enclosing token chain */
		while(1) {
			struct macro *ma;
			tokenizer_next(&cwae.t, &tok);
			if(tok.type == TT_EOF) break;
			if(tok.type == TT_IDENTIFIER && tokenizer_peek(&cwae.t) == EOF &&
			   (ma = get_macro(cpp, cwae.t.buf)) && FUNCTIONLIKE(ma) && tchain_parens_follows(cpp, rec_level) != -1
			) {
				int ret = expand_macro(cpp, &cwae.t, out, cwae.t.buf, rec_level+1, visited);
				if(!ret) return ret;
			} else
				emit_token(out, &tok, cwae.t.buf);
		}
		free(mcs);
	}

	free_file_container(&cwae);

cleanup:
	for(i=0; i < num_args; i++) {
		fclose(argvalues[i].f);
		free(argvalues[i].buf);
	}
	free(argvalues);
	return 1;
}

/* token types for the #if/#elif expression evaluator, layered on top of
   the tokenizer's custom-token mechanism. */
#define TT_LAND TT_CUSTOM+0
#define TT_LOR TT_CUSTOM+1
#define TT_LTE TT_CUSTOM+2
#define TT_GTE TT_CUSTOM+3
#define TT_SHL TT_CUSTOM+4
#define TT_SHR TT_CUSTOM+5
#define TT_EQ TT_CUSTOM+6
#define TT_NEQ TT_CUSTOM+7
#define TT_LT TT_CUSTOM+8
#define TT_GT TT_CUSTOM+9
#define TT_BAND TT_CUSTOM+10
#define TT_BOR TT_CUSTOM+11
#define TT_XOR TT_CUSTOM+12
#define TT_NEG TT_CUSTOM+13
#define TT_PLUS TT_CUSTOM+14
#define TT_MINUS TT_CUSTOM+15
#define TT_MUL TT_CUSTOM+16
#define TT_DIV TT_CUSTOM+17
#define TT_MOD TT_CUSTOM+18
#define TT_LPAREN TT_CUSTOM+19
#define TT_RPAREN TT_CUSTOM+20
#define TT_LNOT TT_CUSTOM+21

#define TTINT(X) X-TT_CUSTOM
#define TTENT(X, Y) [TTINT(X)] = Y

/* binding power of an operator token for the Pratt parser below;
   higher value = tighter binding. */
static int bp(int tokentype) {
	static const int bplist[] = {
		TTENT(TT_LOR, 1 << 4),
		TTENT(TT_LAND, 1 << 5),
TTENT(TT_BOR, 1 << 6), 924 | TTENT(TT_XOR, 1 << 7), 925 | TTENT(TT_BAND, 1 << 8), 926 | TTENT(TT_EQ, 1 << 9), 927 | TTENT(TT_NEQ, 1 << 9), 928 | TTENT(TT_LTE, 1 << 10), 929 | TTENT(TT_GTE, 1 << 10), 930 | TTENT(TT_LT, 1 << 10), 931 | TTENT(TT_GT, 1 << 10), 932 | TTENT(TT_SHL, 1 << 11), 933 | TTENT(TT_SHR, 1 << 11), 934 | TTENT(TT_PLUS, 1 << 12), 935 | TTENT(TT_MINUS, 1 << 12), 936 | TTENT(TT_MUL, 1 << 13), 937 | TTENT(TT_DIV, 1 << 13), 938 | TTENT(TT_MOD, 1 << 13), 939 | TTENT(TT_NEG, 1 << 14), 940 | TTENT(TT_LNOT, 1 << 14), 941 | TTENT(TT_LPAREN, 1 << 15), 942 | // TTENT(TT_RPAREN, 1 << 15), 943 | // TTENT(TT_LPAREN, 0), 944 | TTENT(TT_RPAREN, 0), 945 | }; 946 | if(TTINT(tokentype) < sizeof(bplist)/sizeof(bplist[0])) return bplist[TTINT(tokentype)]; 947 | return 0; 948 | } 949 | 950 | static int expr(struct tokenizer *t, int rbp, int *err); 951 | 952 | static int charlit_to_int(const char *lit) { 953 | if(lit[1] == '\\') switch(lit[2]) { 954 | case '0': return 0; 955 | case 'n': return 10; 956 | case 't': return 9; 957 | case 'r': return 13; 958 | case 'x': return strtol(lit+3, NULL, 16); 959 | default: return lit[2]; 960 | } 961 | return lit[1]; 962 | } 963 | 964 | static int nud(struct tokenizer *t, struct token *tok, int *err) { 965 | switch((unsigned) tok->type) { 966 | case TT_IDENTIFIER: return 0; 967 | case TT_WIDECHAR_LIT: 968 | case TT_SQSTRING_LIT: return charlit_to_int(t->buf); 969 | case TT_HEX_INT_LIT: 970 | case TT_OCT_INT_LIT: 971 | case TT_DEC_INT_LIT: 972 | return strtol(t->buf, NULL, 0); 973 | case TT_NEG: return ~ expr(t, bp(tok->type), err); 974 | case TT_PLUS: return expr(t, bp(tok->type), err); 975 | case TT_MINUS: return - expr(t, bp(tok->type), err); 976 | case TT_LNOT: return !expr(t, bp(tok->type), err); 977 | case TT_LPAREN: { 978 | int inner = expr(t, 0, err); 979 | if(0!=expect(t, TT_RPAREN, (const char*[]){")", 0}, tok)) { 980 | error("missing ')'", t, tok); 981 | return 0; 982 | } 983 | return inner; 984 | } 985 | case TT_FLOAT_LIT: 
986 | error("floating constant in preprocessor expression", t, tok); 987 | *err = 1; 988 | return 0; 989 | case TT_RPAREN: 990 | default: 991 | error("unexpected token", t, tok); 992 | *err = 1; 993 | return 0; 994 | } 995 | } 996 | 997 | static int led(struct tokenizer *t, int left, struct token *tok, int *err) { 998 | int right; 999 | switch((unsigned) tok->type) { 1000 | case TT_LAND: 1001 | case TT_LOR: 1002 | right = expr(t, bp(tok->type), err); 1003 | if(tok->type == TT_LAND) return left && right; 1004 | return left || right; 1005 | case TT_LTE: return left <= expr(t, bp(tok->type), err); 1006 | case TT_GTE: return left >= expr(t, bp(tok->type), err); 1007 | case TT_SHL: return left << expr(t, bp(tok->type), err); 1008 | case TT_SHR: return left >> expr(t, bp(tok->type), err); 1009 | case TT_EQ: return left == expr(t, bp(tok->type), err); 1010 | case TT_NEQ: return left != expr(t, bp(tok->type), err); 1011 | case TT_LT: return left < expr(t, bp(tok->type), err); 1012 | case TT_GT: return left > expr(t, bp(tok->type), err); 1013 | case TT_BAND: return left & expr(t, bp(tok->type), err); 1014 | case TT_BOR: return left | expr(t, bp(tok->type), err); 1015 | case TT_XOR: return left ^ expr(t, bp(tok->type), err); 1016 | case TT_PLUS: return left + expr(t, bp(tok->type), err); 1017 | case TT_MINUS:return left - expr(t, bp(tok->type), err); 1018 | case TT_MUL: return left * expr(t, bp(tok->type), err); 1019 | case TT_DIV: 1020 | case TT_MOD: 1021 | right = expr(t, bp(tok->type), err); 1022 | if(right == 0) { 1023 | error("eval: div by zero", t, tok); 1024 | *err = 1; 1025 | } 1026 | else if(tok->type == TT_DIV) return left / right; 1027 | else if(tok->type == TT_MOD) return left % right; 1028 | return 0; 1029 | default: 1030 | error("eval: unexpect token", t, tok); 1031 | *err = 1; 1032 | return 0; 1033 | } 1034 | } 1035 | 1036 | 1037 | static int tokenizer_peek_next_non_ws(struct tokenizer *t, struct token *tok) 1038 | { 1039 | int ret; 1040 | while(1) { 1041 | 
ret = tokenizer_peek_token(t, tok); 1042 | if(is_whitespace_token(tok)) 1043 | x_tokenizer_next(t, tok); 1044 | else break; 1045 | } 1046 | return ret; 1047 | } 1048 | 1049 | static int expr(struct tokenizer *t, int rbp, int*err) { 1050 | struct token tok; 1051 | int ret = skip_next_and_ws(t, &tok); 1052 | if(tok.type == TT_EOF) return 0; 1053 | int left = nud(t, &tok, err); 1054 | while(1) { 1055 | ret = tokenizer_peek_next_non_ws(t, &tok); 1056 | if(bp(tok.type) <= rbp) break; 1057 | ret = tokenizer_next(t, &tok); 1058 | if(tok.type == TT_EOF) break; 1059 | left = led(t, left, &tok, err); 1060 | } 1061 | (void) ret; 1062 | return left; 1063 | } 1064 | 1065 | static int do_eval(struct tokenizer *t, int *result) { 1066 | tokenizer_register_custom_token(t, TT_LAND, "&&"); 1067 | tokenizer_register_custom_token(t, TT_LOR, "||"); 1068 | tokenizer_register_custom_token(t, TT_LTE, "<="); 1069 | tokenizer_register_custom_token(t, TT_GTE, ">="); 1070 | tokenizer_register_custom_token(t, TT_SHL, "<<"); 1071 | tokenizer_register_custom_token(t, TT_SHR, ">>"); 1072 | tokenizer_register_custom_token(t, TT_EQ, "=="); 1073 | tokenizer_register_custom_token(t, TT_NEQ, "!="); 1074 | 1075 | tokenizer_register_custom_token(t, TT_LT, "<"); 1076 | tokenizer_register_custom_token(t, TT_GT, ">"); 1077 | 1078 | tokenizer_register_custom_token(t, TT_BAND, "&"); 1079 | tokenizer_register_custom_token(t, TT_BOR, "|"); 1080 | tokenizer_register_custom_token(t, TT_XOR, "^"); 1081 | tokenizer_register_custom_token(t, TT_NEG, "~"); 1082 | 1083 | tokenizer_register_custom_token(t, TT_PLUS, "+"); 1084 | tokenizer_register_custom_token(t, TT_MINUS, "-"); 1085 | tokenizer_register_custom_token(t, TT_MUL, "*"); 1086 | tokenizer_register_custom_token(t, TT_DIV, "/"); 1087 | tokenizer_register_custom_token(t, TT_MOD, "%"); 1088 | 1089 | tokenizer_register_custom_token(t, TT_LPAREN, "("); 1090 | tokenizer_register_custom_token(t, TT_RPAREN, ")"); 1091 | tokenizer_register_custom_token(t, TT_LNOT, 
"!"); 1092 | 1093 | int err = 0; 1094 | *result = expr(t, 0, &err); 1095 | #ifdef DEBUG 1096 | dprintf(2, "eval result: %d\n", *result); 1097 | #endif 1098 | return !err; 1099 | } 1100 | 1101 | static int evaluate_condition(struct cpp *cpp, struct tokenizer *t, int *result, char *visited[]) { 1102 | int ret, backslash_seen = 0; 1103 | struct token curr; 1104 | char *bufp; 1105 | size_t size; 1106 | int tflags = tokenizer_get_flags(t); 1107 | tokenizer_set_flags(t, tflags | TF_PARSE_WIDE_STRINGS); 1108 | ret = tokenizer_next(t, &curr); 1109 | if(!ret) return ret; 1110 | if(!is_whitespace_token(&curr)) { 1111 | error("expected whitespace after if/elif", t, &curr); 1112 | return 0; 1113 | } 1114 | FILE *f = open_memstream(&bufp, &size); 1115 | while(1) { 1116 | ret = tokenizer_next(t, &curr); 1117 | if(!ret) return ret; 1118 | if(curr.type == TT_IDENTIFIER) { 1119 | if(!expand_macro(cpp, t, f, t->buf, -1, visited)) return 0; 1120 | } else if(curr.type == TT_SEP) { 1121 | if(curr.value == '\\') 1122 | backslash_seen = 1; 1123 | else { 1124 | if(curr.value == '\n') { 1125 | if(!backslash_seen) break; 1126 | } else { 1127 | emit_token(f, &curr, t->buf); 1128 | } 1129 | backslash_seen = 0; 1130 | } 1131 | } else { 1132 | emit_token(f, &curr, t->buf); 1133 | } 1134 | } 1135 | f = freopen_r(f, &bufp, &size); 1136 | if(!f || size == 0) { 1137 | error("#(el)if with no expression", t, &curr); 1138 | return 0; 1139 | } 1140 | #ifdef DEBUG 1141 | dprintf(2, "evaluating condition %s\n", bufp); 1142 | #endif 1143 | struct tokenizer t2; 1144 | tokenizer_from_file(&t2, f); 1145 | ret = do_eval(&t2, result); 1146 | fclose(f); 1147 | free(bufp); 1148 | tokenizer_set_flags(t, tflags); 1149 | return ret; 1150 | } 1151 | 1152 | static void free_visited(char *visited[]) { 1153 | size_t i; 1154 | for(i=0; i< MAX_RECURSION; i++) 1155 | if(visited[i]) free(visited[i]); 1156 | 1157 | } 1158 | 1159 | int parse_file(struct cpp *cpp, FILE *f, const char *fn, FILE *out) { 1160 | struct tokenizer 
t; 1161 | struct token curr; 1162 | tokenizer_init(&t, f, TF_PARSE_STRINGS); 1163 | tokenizer_set_filename(&t, fn); 1164 | tokenizer_register_marker(&t, MT_MULTILINE_COMMENT_START, "/*"); /**/ 1165 | tokenizer_register_marker(&t, MT_MULTILINE_COMMENT_END, "*/"); 1166 | tokenizer_register_marker(&t, MT_SINGLELINE_COMMENT_START, "//"); 1167 | int ret, newline=1, ws_count = 0; 1168 | 1169 | int if_level = 0, if_level_active = 0, if_level_satisfied = 0; 1170 | 1171 | #define all_levels_active() (if_level_active == if_level) 1172 | #define prev_level_active() (if_level_active == if_level-1) 1173 | #define set_level(X, V) do { \ 1174 | if(if_level_active > X) if_level_active = X; \ 1175 | if(if_level_satisfied > X) if_level_satisfied = X; \ 1176 | if(V != -1) { \ 1177 | if(V) if_level_active = X; \ 1178 | else if(if_level_active == X) if_level_active = X-1; \ 1179 | if(V && if_level_active == X) if_level_satisfied = X; \ 1180 | } \ 1181 | if_level = X; \ 1182 | } while(0) 1183 | #define skip_conditional_block (if_level > if_level_active) 1184 | 1185 | static const char* directives[] = {"include", "error", "warning", "define", "undef", "if", "elif", "else", "ifdef", "ifndef", "endif", "line", "pragma", 0}; 1186 | while((ret = tokenizer_next(&t, &curr)) && curr.type != TT_EOF) { 1187 | newline = curr.column == 0; 1188 | if(newline) { 1189 | ret = eat_whitespace(&t, &curr, &ws_count); 1190 | if(!ret) return ret; 1191 | } 1192 | if(curr.type == TT_EOF) break; 1193 | if(skip_conditional_block && !(newline && is_char(&curr, '#'))) continue; 1194 | if(is_char(&curr, '#')) { 1195 | if(!newline) { 1196 | error("stray #", &t, &curr); 1197 | return 0; 1198 | } 1199 | int index = expect(&t, TT_IDENTIFIER, directives, &curr); 1200 | if(index == -1) { 1201 | if(skip_conditional_block) continue; 1202 | error("invalid preprocessing directive", &t, &curr); 1203 | return 0; 1204 | } 1205 | if(skip_conditional_block) switch(index) { 1206 | case 0: case 1: case 2: case 3: case 4: 1207 | 
case 11: case 12: 1208 | continue; 1209 | default: break; 1210 | } 1211 | switch(index) { 1212 | case 0: 1213 | ret = include_file(cpp, &t, out); 1214 | if(!ret) return ret; 1215 | break; 1216 | case 1: 1217 | ret = emit_error_or_warning(&t, 1); 1218 | if(!ret) return ret; 1219 | break; 1220 | case 2: 1221 | ret = emit_error_or_warning(&t, 0); 1222 | if(!ret) return ret; 1223 | break; 1224 | case 3: 1225 | ret = parse_macro(cpp, &t); 1226 | if(!ret) return ret; 1227 | break; 1228 | case 4: 1229 | if(!skip_next_and_ws(&t, &curr)) return 0; 1230 | if(curr.type != TT_IDENTIFIER) { 1231 | error("expected identifier", &t, &curr); 1232 | return 0; 1233 | } 1234 | undef_macro(cpp, t.buf); 1235 | break; 1236 | case 5: // if 1237 | if(all_levels_active()) { 1238 | char* visited[MAX_RECURSION] = {0}; 1239 | if(!evaluate_condition(cpp, &t, &ret, visited)) return 0; 1240 | free_visited(visited); 1241 | set_level(if_level + 1, ret); 1242 | } else { 1243 | set_level(if_level + 1, 0); 1244 | } 1245 | break; 1246 | case 6: // elif 1247 | if(prev_level_active() && if_level_satisfied < if_level) { 1248 | char* visited[MAX_RECURSION] = {0}; 1249 | if(!evaluate_condition(cpp, &t, &ret, visited)) return 0; 1250 | free_visited(visited); 1251 | if(ret) { 1252 | if_level_active = if_level; 1253 | if_level_satisfied = if_level; 1254 | } 1255 | } else if(if_level_active == if_level) { 1256 | --if_level_active; 1257 | } 1258 | break; 1259 | case 7: // else 1260 | if(prev_level_active() && if_level_satisfied < if_level) { 1261 | if(1) { 1262 | if_level_active = if_level; 1263 | if_level_satisfied = if_level; 1264 | } 1265 | } else if(if_level_active == if_level) { 1266 | --if_level_active; 1267 | } 1268 | break; 1269 | case 8: // ifdef 1270 | case 9: // ifndef 1271 | if(!skip_next_and_ws(&t, &curr) || curr.type == TT_EOF) return 0; 1272 | ret = !!get_macro(cpp, t.buf); 1273 | if(index == 9) ret = !ret; 1274 | 1275 | if(all_levels_active()) { 1276 | set_level(if_level + 1, ret); 1277 | } else 
{ 1278 | set_level(if_level + 1, 0); 1279 | } 1280 | break; 1281 | case 10: // endif 1282 | set_level(if_level-1, -1); 1283 | break; 1284 | case 11: // line 1285 | ret = tokenizer_read_until(&t, "\n", 1); 1286 | if(!ret) { 1287 | error("unknown", &t, &curr); 1288 | return 0; 1289 | } 1290 | break; 1291 | case 12: // pragma 1292 | emit(out, "#pragma"); 1293 | while((ret = x_tokenizer_next(&t, &curr)) && curr.type != TT_EOF) { 1294 | emit_token(out, &curr, t.buf); 1295 | if(is_char(&curr, '\n')) break; 1296 | } 1297 | if(!ret) return ret; 1298 | break; 1299 | default: 1300 | break; 1301 | } 1302 | continue; 1303 | } else { 1304 | while(ws_count) { 1305 | emit(out, " "); 1306 | --ws_count; 1307 | } 1308 | } 1309 | #if DEBUG 1310 | dprintf(2, "(stdin:%u,%u) ", curr.line, curr.column); 1311 | if(curr.type == TT_SEP) 1312 | dprintf(2, "separator: %c\n", curr.value == '\n'? ' ' : curr.value); 1313 | else 1314 | dprintf(2, "%s: %s\n", tokentype_to_str(curr.type), t.buf); 1315 | #endif 1316 | if(curr.type == TT_IDENTIFIER) { 1317 | char* visited[MAX_RECURSION] = {0}; 1318 | if(!expand_macro(cpp, &t, out, t.buf, 0, visited)) 1319 | return 0; 1320 | free_visited(visited); 1321 | } else { 1322 | emit_token(out, &curr, t.buf); 1323 | } 1324 | } 1325 | if(if_level) { 1326 | error("unterminated #if", &t, &curr); 1327 | return 0; 1328 | } 1329 | return 1; 1330 | } 1331 | 1332 | struct cpp * cpp_new(void) { 1333 | struct cpp* ret = calloc(1, sizeof(struct cpp)); 1334 | if(!ret) return ret; 1335 | tglist_init(&ret->includedirs); 1336 | cpp_add_includedir(ret, "."); 1337 | ret->macros = hbmap_new(strptrcmp, string_hash, 128); 1338 | struct macro m = {.num_args = 1}; 1339 | add_macro(ret, strdup("defined"), &m); 1340 | m.num_args = MACRO_FLAG_OBJECTLIKE; 1341 | add_macro(ret, strdup("__FILE__"), &m); 1342 | add_macro(ret, strdup("__LINE__"), &m); 1343 | return ret; 1344 | } 1345 | 1346 | void cpp_free(struct cpp*cpp) { 1347 | free_macros(cpp); 1348 | 
tglist_free_values(&cpp->includedirs); 1349 | tglist_free_items(&cpp->includedirs); 1350 | } 1351 | 1352 | void cpp_add_includedir(struct cpp *cpp, const char* includedir) { 1353 | tglist_add(&cpp->includedirs, strdup(includedir)); 1354 | } 1355 | 1356 | int cpp_add_define(struct cpp *cpp, const char *mdecl) { 1357 | struct FILE_container tmp = {0}; 1358 | tmp.f = open_memstream(&tmp.buf, &tmp.len); 1359 | fprintf(tmp.f, "%s\n", mdecl); 1360 | tmp.f = freopen_r(tmp.f, &tmp.buf, &tmp.len); 1361 | tokenizer_from_file(&tmp.t, tmp.f); 1362 | int ret = parse_macro(cpp, &tmp.t); 1363 | free_file_container(&tmp); 1364 | return ret; 1365 | } 1366 | 1367 | int cpp_run(struct cpp *cpp, FILE* in, FILE* out, const char* inname) { 1368 | return parse_file(cpp, in, inname, out); 1369 | } 1370 | -------------------------------------------------------------------------------- /preproc.h: -------------------------------------------------------------------------------- 1 | #ifndef PREPROC_H 2 | #define PREPROC_H 3 | 4 | #include 5 | 6 | struct cpp; 7 | 8 | struct cpp *cpp_new(void); 9 | void cpp_free(struct cpp*); 10 | void cpp_add_includedir(struct cpp *cpp, const char* includedir); 11 | int cpp_add_define(struct cpp *cpp, const char *mdecl); 12 | int cpp_run(struct cpp *cpp, FILE* in, FILE* out, const char* inname); 13 | 14 | #ifdef __GNUC__ 15 | #pragma GCC diagnostic ignored "-Wunknown-pragmas" 16 | #endif 17 | #pragma RcB2 DEP "preproc.c" 18 | 19 | #endif 20 | 21 | -------------------------------------------------------------------------------- /tokenizer.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "tokenizer.h" 8 | 9 | void tokenizer_set_filename(struct tokenizer *t, const char* fn) { 10 | t->filename = fn; 11 | } 12 | 13 | #define ARRAY_SIZE(X) (sizeof(X)/sizeof(X[0])) 14 | 15 | off_t tokenizer_ftello(struct tokenizer *t) { 16 | return 
ftello(t->input)-t->getc_buf.buffered; 17 | } 18 | 19 | static int tokenizer_ungetc(struct tokenizer *t, int c) 20 | { 21 | ++t->getc_buf.buffered; 22 | assert(t->getc_buf.bufferedgetc_buf.buf)); 23 | assert(t->getc_buf.cnt > 0); 24 | --t->getc_buf.cnt; 25 | assert(t->getc_buf.buf[t->getc_buf.cnt % ARRAY_SIZE(t->getc_buf.buf)] == c); 26 | return c; 27 | } 28 | static int tokenizer_getc(struct tokenizer *t) 29 | { 30 | int c; 31 | if(t->getc_buf.buffered) { 32 | t->getc_buf.buffered--; 33 | c = t->getc_buf.buf[(t->getc_buf.cnt) % ARRAY_SIZE(t->getc_buf.buf)]; 34 | } else { 35 | c = getc(t->input); 36 | t->getc_buf.buf[t->getc_buf.cnt % ARRAY_SIZE(t->getc_buf.buf)] = c; 37 | } 38 | ++t->getc_buf.cnt; 39 | return c; 40 | } 41 | 42 | int tokenizer_peek(struct tokenizer *t) { 43 | if(t->peeking) return t->peek_token.value; 44 | int ret = tokenizer_getc(t); 45 | if(ret != EOF) tokenizer_ungetc(t, ret); 46 | return ret; 47 | } 48 | 49 | int tokenizer_peek_token(struct tokenizer *t, struct token *tok) { 50 | int ret = tokenizer_next(t, tok); 51 | t->peek_token = *tok; 52 | t->peeking = 1; 53 | return ret; 54 | } 55 | 56 | void tokenizer_register_custom_token(struct tokenizer*t, int tokentype, const char* str) { 57 | assert(tokentype >= TT_CUSTOM && tokentype < TT_CUSTOM + MAX_CUSTOM_TOKENS); 58 | int pos = tokentype - TT_CUSTOM; 59 | t->custom_tokens[pos] = str; 60 | if(pos+1 > t->custom_count) t->custom_count = pos+1; 61 | } 62 | 63 | const char* tokentype_to_str(enum tokentype tt) { 64 | switch((unsigned) tt) { 65 | case TT_IDENTIFIER: return "iden"; 66 | case TT_WIDECHAR_LIT: return "widechar"; 67 | case TT_WIDESTRING_LIT: return "widestring"; 68 | case TT_SQSTRING_LIT: return "single-quoted string"; 69 | case TT_DQSTRING_LIT: return "double-quoted string"; 70 | case TT_ELLIPSIS: return "ellipsis"; 71 | case TT_HEX_INT_LIT: return "hexint"; 72 | case TT_OCT_INT_LIT: return "octint"; 73 | case TT_DEC_INT_LIT: return "decint"; 74 | case TT_FLOAT_LIT: return "float"; 75 | 
case TT_SEP: return "separator"; 76 | case TT_UNKNOWN: return "unknown"; 77 | case TT_OVERFLOW: return "overflow"; 78 | case TT_EOF: return "eof"; 79 | } 80 | return "????"; 81 | } 82 | 83 | static int has_ul_tail(const char *p) { 84 | char tail[4]; 85 | int tc = 0, c; 86 | while(tc < 4 ) { 87 | if(!*p) break; 88 | c = tolower(*p); 89 | if(c == 'u' || c == 'l') { 90 | tail[tc++] = c; 91 | } else { 92 | return 0; 93 | } 94 | p++; 95 | } 96 | if(tc == 1) return 1; 97 | if(tc == 2) { 98 | if(!memcmp(tail, "lu", 2)) return 1; 99 | if(!memcmp(tail, "ul", 2)) return 1; 100 | if(!memcmp(tail, "ll", 2)) return 1; 101 | } 102 | if(tc == 3) { 103 | if(!memcmp(tail, "llu", 3)) return 1; 104 | if(!memcmp(tail, "ull", 3)) return 1; 105 | } 106 | return 0; 107 | } 108 | 109 | static int is_hex_int_literal(const char *s) { 110 | if(s[0] == '-') s++; 111 | if(s[0] == '0' && (s[1] == 'x' || s[1] == 'X')) { 112 | const char* p = s+2; 113 | while(*p) { 114 | if(!strchr("0123456789abcdef", tolower(*p))) { 115 | if(p == s+2) return 0; 116 | return has_ul_tail(p); 117 | } 118 | p++; 119 | } 120 | return 1; 121 | } 122 | return 0; 123 | } 124 | 125 | static int is_plus_or_minus(int c) { 126 | return c == '-' || c == '+'; 127 | } 128 | 129 | static int is_dec_int_literal(const char *str) { 130 | const char *s = str; 131 | if(is_plus_or_minus(s[0])) s++; 132 | if(s[0] == '0') { 133 | if(s[1] == 0) return 1; 134 | if(isdigit(s[1])) return 0; 135 | } 136 | while(*s) { 137 | if(!isdigit(*s)) { 138 | if(s > str && (is_plus_or_minus(str[0]) ? 
/* NOTE(review): tail end of a numeric-literal checker whose head lies in an
   earlier chunk of the file; left byte-identical. */
	s > str+1 : 1)) return has_ul_tail(s);
			else return 0;
		}
		s++;
	}
	return 1;
}

/* Returns 1 if str is a float literal: optional sign, decimal digits with at
 * most one '.', an optional exponent ('e'/'E' with optional sign and at least
 * one digit), and an optional trailing 'f'/'F'.  A plain digit run without a
 * dot or exponent is rejected (that's an integer literal, not a float). */
static int is_float_literal(const char *str) {
	const char *s = str;
	if(is_plus_or_minus(s[0])) s++;
	int got_dot = 0, got_e = 0, got_digits = 0;
	while(*s) {
		int l = tolower(*s);
		if(*s == '.') {
			if(got_dot) return 0; /* at most one dot */
			got_dot = 1;
		} else if(l == 'f') {
			/* 'f' suffix only valid as the last char of a real float */
			if(s[1] == 0 && (got_dot || got_e) && got_digits) return 1;
			return 0;
		} else if (isdigit(*s)) {
			got_digits = 1;
		} else if(l == 'e') {
			if(!got_digits) return 0; /* exponent needs a mantissa */
			s++;
			if(is_plus_or_minus(*s)) s++;
			if(!isdigit(*s)) return 0; /* exponent needs digits */
			got_e = 1;
		} else return 0;
		s++;
	}
	if(got_digits && (got_e || got_dot)) return 1;
	return 0;
}

/* Checks whether [s, until) consists only of digits and at most one '.'.
 * Returns 0 if invalid, otherwise a bitmask: bit 0 = digits seen,
 * bit 1 = dot seen (so a return of exactly 1 means "digits, no dot yet"). */
static int is_valid_float_until(const char*s, const char* until) {
	int got_digits = 0, got_dot = 0;
	while(s < until) {
		if(isdigit(*s)) got_digits = 1;
		else if(*s == '.') {
			if(got_dot) return 0;
			got_dot = 1;
		} else return 0;
		++s;
	}
	return got_digits | (got_dot << 1);
}

/* Returns 1 if s is an octal int literal: optional '-', then a leading '0'
 * followed only by octal digits (the leading '0' itself also passes the
 * strchr check, so "0" alone is accepted). */
static int is_oct_int_literal(const char *s) {
	if(s[0] == '-') s++;
	if(s[0] != '0') return 0;
	while(*s) {
		if(!strchr("01234567", *s)) return 0;
		s++;
	}
	return 1;
}

/* Returns 1 if s is a C-style identifier.  Lookup table: 1 = letter or
 * underscore (valid anywhere), 2 = digit (valid except as first char).
 * Any byte with the high bit set (non-ASCII) is rejected outright. */
static int is_identifier(const char *s) {
	static const char ascmap[128] = {
		['0'] = 2, ['1'] = 2, ['2'] = 2, ['3'] = 2,
		['4'] = 2, ['5'] = 2, ['6'] = 2, ['7'] = 2,
		['8'] = 2, ['9'] = 2, ['A'] = 1, ['B'] = 1,
		['C'] = 1, ['D'] = 1, ['E'] = 1, ['F'] = 1,
		['G'] = 1, ['H'] = 1, ['I'] = 1, ['J'] = 1,
		['K'] = 1, ['L'] = 1, ['M'] = 1, ['N'] = 1,
		['O'] = 1, ['P'] = 1, ['Q'] = 1, ['R'] = 1,
		['S'] = 1, ['T'] = 1, ['U'] = 1, ['V'] = 1,
		['W'] = 1, ['X'] = 1, ['Y'] = 1, ['Z'] = 1,
		['_'] = 1, ['a'] = 1, ['b'] = 1, ['c'] = 1,
		['d'] = 1, ['e'] = 1, ['f'] = 1, ['g'] = 1,
		['h'] = 1, ['i'] = 1, ['j'] = 1, ['k'] = 1,
		['l'] = 1, ['m'] = 1, ['n'] = 1, ['o'] = 1,
		['p'] = 1, ['q'] = 1, ['r'] = 1, ['s'] = 1,
		['t'] = 1, ['u'] = 1, ['v'] = 1, ['w'] = 1,
		['x'] = 1, ['y'] = 1, ['z'] = 1,
	};
	if((*s) & 128) return 0;
	if(ascmap[(unsigned) *s] != 1) return 0; /* first char: no digits */
	++s;
	while(*s) {
		if((*s) & 128) return 0;
		if(!ascmap[(unsigned) *s])
			return 0;
		s++;
	}
	return 1;
}

/* Classifies a completed token buffer.  Order matters: hex before dec
 * before oct before float, so e.g. "0x10" is not misread, and identifier
 * only if no literal form matched. */
static enum tokentype categorize(const char *s) {
	if(is_hex_int_literal(s)) return TT_HEX_INT_LIT;
	if(is_dec_int_literal(s)) return TT_DEC_INT_LIT;
	if(is_oct_int_literal(s)) return TT_OCT_INT_LIT;
	if(is_float_literal(s)) return TT_FLOAT_LIT;
	if(is_identifier(s)) return TT_IDENTIFIER;
	return TT_UNKNOWN;
}


/* Returns 1 if c is a single-character separator/punctuation terminal
 * (whitespace, operators, brackets, quotes, ...).  Non-ASCII bytes are
 * never separators. */
static int is_sep(int c) {
	static const char ascmap[128] = {
		['\t'] = 1, ['\n'] = 1, [' '] = 1, ['!'] = 1,
		['\"'] = 1, ['#'] = 1, ['%'] = 1, ['&'] = 1,
		['\''] = 1, ['('] = 1, [')'] = 1, ['*'] = 1,
		['+'] = 1, [','] = 1, ['-'] = 1, ['.'] = 1,
		['/'] = 1, [':'] = 1, [';'] = 1, ['<'] = 1,
		['='] = 1, ['>'] = 1, ['?'] = 1, ['['] = 1,
		['\\'] = 1, [']'] = 1, ['{'] = 1, ['|'] = 1,
		['}'] = 1, ['~'] = 1, ['^'] = 1,
	};
	return !(c&128) && ascmap[c];
}

/* Stamps line/column onto out and returns retval.  The token's start column
 * is reconstructed by subtracting the token length (end - t->buf) from the
 * current column.  If the token filled the whole buffer, the result is
 * flagged as TT_OVERFLOW and 0 is returned instead. */
static int apply_coords(struct tokenizer *t, struct token* out, char *end, int retval) {
	out->line = t->line;
	uintptr_t len = end - t->buf;
	out->column = t->column - len;
	if(len + 1 >= t->bufsize) {
		out->type = TT_OVERFLOW;
		return 0;
	}
	return retval;
}

/* Stores c at s, advances the write cursor, and counts one column. */
static inline char *assign_bufchar(struct tokenizer *t, char *s, int c) {
	t->column++;
	*s = c;
	return s + 1;
}

/* Reads a string/char literal body.  Called from tokenizer_next with the
 * opening quote already written to t->buf[0], hence writing starts at
 * t->buf+1.  Handles backslash escapes, backslash-newline line continuation
 * (the continued line is dropped), unescaped newline -> TT_UNKNOWN with the
 * newline pushed back, EOF -> TT_EOF, and buffer exhaustion -> TT_OVERFLOW. */
static int get_string(struct tokenizer *t, char quote_char, struct token* out, int wide) {
	char *s = t->buf+1;
	int escaped = 0;
	char *end = t->buf + t->bufsize - 2; /* room for closing quote + NUL */
	while(s < end) {
		int c = tokenizer_getc(t);
		if(c == EOF) {
			out->type = TT_EOF;
			*s = 0;
			return apply_coords(t, out, s, 0);
		}
		if(c == '\\') {
			/* backslash-newline: splice lines, keep reading */
			c = tokenizer_getc(t);
			if(c == '\n') continue;
			tokenizer_ungetc(t, c);
			c = '\\';
		}
		if(c == '\n') {
			if(escaped) {
				escaped = 0;
				continue;
			}
			/* raw newline inside a literal: malformed token */
			tokenizer_ungetc(t, c);
			out->type = TT_UNKNOWN;
			s = assign_bufchar(t, s, 0);
			return apply_coords(t, out, s, 0);
		}
		if(!escaped) {
			if(c == quote_char) {
				s = assign_bufchar(t, s, c);
				*s = 0;
				//s = assign_bufchar(t, s, 0);
				if(!wide)
					out->type = (quote_char == '"'? TT_DQSTRING_LIT : TT_SQSTRING_LIT);
				else
					out->type = (quote_char == '"'? TT_WIDESTRING_LIT : TT_WIDECHAR_LIT);
				return apply_coords(t, out, s, 1);
			}
			if(c == '\\') escaped = 1;
		} else {
			escaped = 0;
		}
		s = assign_bufchar(t, s, c);
	}
	/* NOTE(review): uses MAX_TOK_LEN rather than t->bufsize; equivalent only
	   because tokenizer_init always sets bufsize = MAX_TOK_LEN. */
	t->buf[MAX_TOK_LEN-1] = 0;
	out->type = TT_OVERFLOW;
	return apply_coords(t, out, s, 0);
}

/* if sequence found, next tokenizer call will point after the sequence */
/* Tests whether c followed by the upcoming input spells `which`.  On a full
 * match the sequence is consumed and 1 is returned; on a partial match every
 * read character is pushed back (in reverse) and 0 is returned. */
static int sequence_follows(struct tokenizer *t, int c, const char *which)
{
	if(!which || !which[0]) return 0;
	size_t i = 0;
	while(c == which[i]) {
		if(!which[++i]) break;
		c = tokenizer_getc(t);
	}
	if(!which[i]) return 1;
	while(i > 0) {
		tokenizer_ungetc(t, c);
		c = which[--i];
	}
	return 0;
}

/* Consumes consecutive characters that appear in `chars`, counting them in
 * *count.  Returns 0 on EOF; returns 1 after pushing back the first
 * non-matching character.  Must not be called with a pending peek token. */
int tokenizer_skip_chars(struct tokenizer *t, const char *chars, int *count) {
	assert(!t->peeking);
	int c;
	*count = 0;
	while(1) {
		c = tokenizer_getc(t);
		if(c == EOF) return 0;
		const char *s = chars;
		int match = 0;
		while(*s) {
			if(c==*s) {
				++(*count);
				match = 1;
				break;
			}
			++s;
		}
		if(!match) {
			tokenizer_ungetc(t, c);
			return 1;
		}
	}

}

/* Copies raw input into t->buf until `marker` is seen (marker excluded and
 * pushed back so the next read starts on it), or until a newline if
 * stop_at_nl.  Returns 1 if the marker was found (or marker is "\n" and a
 * newline stopped us), 0 on EOF or premature newline stop. */
int tokenizer_read_until(struct tokenizer *t, const char* marker, int stop_at_nl)
{
	int c, marker_is_nl = !strcmp(marker, "\n");
	char *s = t->buf;
	while(1) {
		c = tokenizer_getc(t);
		if(c == EOF) {
			*s = 0;
			return 0;
		}
		if(c == '\n') {
			t->line++;
			t->column = 0;
			if(stop_at_nl) {
				*s = 0;
				if(marker_is_nl) return 1;
				return 0;
			}
		}
		if(!sequence_follows(t, c, marker))
			s = assign_bufchar(t, s, c);
		else
			break;
	}
	*s = 0;
	size_t i;
	/* push the marker back so callers can still consume it */
	for(i=strlen(marker); i > 0; )
		tokenizer_ungetc(t, marker[--i]);
	return 1;
}
/* Discards input through the end of `marker`, keeping line/column counters
 * up to date.  col_advance accounts for marker characters already consumed
 * by the caller (e.g. a comment-start sequence).  Returns 0 on EOF. */
static int ignore_until(struct tokenizer *t, const char* marker, int col_advance)
{
	t->column += col_advance;
	int c;
	do {
		c = tokenizer_getc(t);
		if(c == EOF) return 0;
		if(c == '\n') {
			t->line++;
			t->column = 0;
		} else t->column++;
	} while(!sequence_follows(t, c, marker));
	t->column += strlen(marker)-1;
	return 1;
}

/* Public wrapper: skip input until (and including) marker. */
void tokenizer_skip_until(struct tokenizer *t, const char *marker)
{
	ignore_until(t, marker, 0);
}

/* Produces the next token into *out.  Returns 1 on success (including
 * TT_EOF), 0 on error conditions (TT_UNKNOWN, TT_OVERFLOW, unterminated
 * string).  A previously peeked token is returned first.  Comment markers
 * registered via tokenizer_register_marker are skipped transparently. */
int tokenizer_next(struct tokenizer *t, struct token* out) {
	char *s = t->buf;
	out->value = 0;
	int c = 0;
	if(t->peeking) {
		*out = t->peek_token;
		t->peeking = 0;
		return 1;
	}
	while(1) {
		c = tokenizer_getc(t);
		if(c == EOF) break;

		/* components of multi-line comment marker might be terminals themselves */
		if(sequence_follows(t, c, t->marker[MT_MULTILINE_COMMENT_START])) {
			ignore_until(t, t->marker[MT_MULTILINE_COMMENT_END], strlen(t->marker[MT_MULTILINE_COMMENT_START]));
			continue;
		}
		if(sequence_follows(t, c, t->marker[MT_SINGLELINE_COMMENT_START])) {
			ignore_until(t, "\n", strlen(t->marker[MT_SINGLELINE_COMMENT_START]));
			continue;
		}
		if(is_sep(c)) {
			/* backslash-newline mid-token: line continuation, keep going */
			if(s != t->buf && c == '\\' && !isspace(s[-1])) {
				c = tokenizer_getc(t);
				if(c == '\n') continue;
				tokenizer_ungetc(t, c);
				c = '\\';
			} else if(is_plus_or_minus(c) && s > t->buf+1 &&
				(s[-1] == 'E' || s[-1] == 'e') && is_valid_float_until(t->buf, s-1)) {
				/* sign inside an exponent, e.g. "1e+5": part of the float */
				goto process_char;
			} else if(c == '.' && s != t->buf && is_valid_float_until(t->buf, s) == 1) {
				/* first dot after digits, e.g. "3." of "3.14" */
				goto process_char;
			} else if(c == '.' && s == t->buf) {
				/* leading dot: float like ".5" only if a digit follows */
				int jump = 0;
				c = tokenizer_getc(t);
				if(isdigit(c)) jump = 1;
				tokenizer_ungetc(t, c);
				c = '.';
				if(jump) goto process_char;
			}
			tokenizer_ungetc(t, c);
			break;
		}
		if((t->flags & TF_PARSE_WIDE_STRINGS) && s == t->buf && c == 'L') {
			/* possible wide literal prefix: push back and let the
			   s == t->buf path below re-read and classify it */
			c = tokenizer_getc(t);
			tokenizer_ungetc(t, c);
			tokenizer_ungetc(t, 'L');
			if(c == '\'' || c == '\"') break;
		}

		process_char:;
		s = assign_bufchar(t, s, c);
		/* NOTE(review): uses column as a proxy for bytes written — assumes
		   no newline occurs inside a token; TODO confirm. */
		if(t->column + 1 >= MAX_TOK_LEN) {
			out->type = TT_OVERFLOW;
			return apply_coords(t, out, s, 0);
		}
	}
	if(s == t->buf) {
		/* nothing accumulated: we stopped on EOF or on a separator */
		if(c == EOF) {
			out->type = TT_EOF;
			return apply_coords(t, out, s, 1);
		}

		int wide = 0;
		c = tokenizer_getc(t);
		if((t->flags & TF_PARSE_WIDE_STRINGS) && c == 'L') {
			c = tokenizer_getc(t);
			assert(c == '\'' || c == '\"');
			wide = 1;
			goto string_handling;
		} else if (c == '.' && sequence_follows(t, c, "...")) {
			strcpy(t->buf, "...");
			out->type = TT_ELLIPSIS;
			return apply_coords(t, out, s+3, 1);
		}

		{
			/* user-registered multi-char terminals, e.g. "<<" */
			int i;
			for(i = 0; i < t->custom_count; i++)
				if(sequence_follows(t, c, t->custom_tokens[i])) {
					const char *p = t->custom_tokens[i];
					while(*p) {
						s = assign_bufchar(t, s, *p);
						p++;
					}
					*s = 0;
					out->type = TT_CUSTOM + i;
					return apply_coords(t, out, s, 1);
				}
		}

		string_handling:
		s = assign_bufchar(t, s, c);
		*s = 0;
		//s = assign_bufchar(t, s, 0);
		if(c == '"' || c == '\'')
			if(t->flags & TF_PARSE_STRINGS) return get_string(t, c, out, wide);
		out->type = TT_SEP;
		out->value = c;
		if(c == '\n') {
			/* stamp coords before bumping the line counter */
			apply_coords(t, out, s, 1);
			t->line++;
			t->column=0;
			return 1;
		}
		return apply_coords(t, out, s, 1);
	}
	//s = assign_bufchar(t, s, 0);
	*s = 0;
	out->type = categorize(t->buf);
	return apply_coords(t, out, s, out->type != TT_UNKNOWN);
}

/* Replaces the tokenizer's flag set (see enum tokenizer_flags). */
void tokenizer_set_flags(struct tokenizer *t, int flags) {
	t->flags = flags;
}

/* Returns the current flag set. */
int tokenizer_get_flags(struct tokenizer *t) {
	return t->flags;
}

/* Resets *t to a clean state reading from `in`, line counter at 1 and
 * bufsize fixed at MAX_TOK_LEN.  All markers/custom tokens are cleared. */
void tokenizer_init(struct tokenizer *t, FILE* in, int flags) {
	*t = (struct tokenizer){ .input = in, .line = 1, .flags = flags, .bufsize = MAX_TOK_LEN};
}

/* Registers (or replaces) a comment marker string for the given slot. */
void tokenizer_register_marker(struct tokenizer *t, enum markertype mt, const char* marker)
{
	t->marker[mt] = marker;
}

/* Re-initializes the tokenizer and seeks its stream back to the start.
 * Preserves flags and filename; registered markers/custom tokens are lost
 * (tokenizer_init clears them).  Returns 1 on successful fseek, 0 on error. */
int tokenizer_rewind(struct tokenizer *t) {
	FILE *f = t->input;
	int flags = t->flags;
	const char* fn = t->filename;
	tokenizer_init(t, f, flags);
	tokenizer_set_filename(t, fn);
	return fseek(f, 0, SEEK_SET) == 0;
}
#ifndef TOKENIZER_H
#define TOKENIZER_H

/* tokenizer.h - public interface of the tinycpp tokenizer. */

#define MAX_TOK_LEN 4096  /* size of the per-tokenizer token buffer */
#define MAX_UNGETC 8      /* depth of the pushback buffer */

/* NOTE(review): the original #include targets were lost (angle brackets eaten
   by the rendering). Restored to the headers the declarations below require:
   FILE -> <stdio.h>, uint32_t -> <stdint.h>, off_t -> <sys/types.h>. */
#include <stdio.h>
#include <stdint.h>
#include <sys/types.h>

/* Pushback ring for tokenizer_ungetc: cnt = logical read position,
   buffered = number of characters currently pushed back. */
struct tokenizer_getc_buf {
	int buf[MAX_UNGETC];
	size_t cnt, buffered;
};

/* Slots for user-registered comment markers. */
enum markertype {
	MT_SINGLELINE_COMMENT_START = 0,
	MT_MULTILINE_COMMENT_START = 1,
	MT_MULTILINE_COMMENT_END = 2,
	MT_MAX = MT_MULTILINE_COMMENT_END
};

#define MAX_CUSTOM_TOKENS 32

enum tokentype {
	TT_IDENTIFIER = 1,
	TT_SQSTRING_LIT,
	TT_DQSTRING_LIT,
	TT_ELLIPSIS,
	TT_HEX_INT_LIT,
	TT_OCT_INT_LIT,
	TT_DEC_INT_LIT,
	TT_FLOAT_LIT,
	TT_SEP,
	/* errors and similar */
	TT_UNKNOWN,
	TT_OVERFLOW,
	TT_WIDECHAR_LIT,
	TT_WIDESTRING_LIT,
	TT_EOF,
	TT_CUSTOM = 1000 /* start user defined tokentype values */
};

/* Human-readable name of a token type (for diagnostics). */
const char* tokentype_to_str(enum tokentype tt);

/* One token: its classification, 1-based source coordinates, and, for
   TT_SEP, the separator character in `value`. */
struct token {
	enum tokentype type;
	uint32_t line;
	uint32_t column;
	int value;
};

enum tokenizer_flags {
	TF_PARSE_STRINGS = 1 << 0,      /* tokenize quoted literals as strings */
	TF_PARSE_WIDE_STRINGS = 1 << 1, /* recognize L"..." / L'...' prefixes */
};

struct tokenizer {
	FILE *input;
	uint32_t line;
	uint32_t column;
	int flags;
	int custom_count;
	int peeking;
	const char *custom_tokens[MAX_CUSTOM_TOKENS];
	char buf[MAX_TOK_LEN];   /* current token text, NUL-terminated */
	size_t bufsize;          /* always MAX_TOK_LEN after tokenizer_init */
	struct tokenizer_getc_buf getc_buf;
	const char* marker[MT_MAX+1];
	const char* filename;
	struct token peek_token; /* stashed token when peeking != 0 */
};

/* Reset *t to read from `in`; clears markers and custom tokens. */
void tokenizer_init(struct tokenizer *t, FILE* in, int flags);
void tokenizer_set_filename(struct tokenizer *t, const char*);
void tokenizer_set_flags(struct tokenizer *t, int flags);
int tokenizer_get_flags(struct tokenizer *t);
off_t tokenizer_ftello(struct tokenizer *t);
/* Register a comment marker (see enum markertype). */
void tokenizer_register_marker(struct tokenizer*, enum markertype, const char*);
/* Register a multi-character terminal; yields TT_CUSTOM + index. */
void tokenizer_register_custom_token(struct tokenizer*, int tokentype, const char*);
/* Produce the next token; returns 1 on success (incl. TT_EOF), 0 on error. */
int tokenizer_next(struct tokenizer *t, struct token* out);
int tokenizer_peek_token(struct tokenizer *t, struct token* out);
int tokenizer_peek(struct tokenizer *t);
/* Discard input through the end of `marker`. */
void tokenizer_skip_until(struct tokenizer *t, const char *marker);
/* Skip characters in set `chars`, counting them; 0 on EOF. */
int tokenizer_skip_chars(struct tokenizer *t, const char *chars, int *count);
/* Copy raw input into t->buf until `marker` (pushed back) or newline. */
int tokenizer_read_until(struct tokenizer *t, const char* marker, int stop_at_nl);
/* Re-init and seek the stream to offset 0; 1 on success. */
int tokenizer_rewind(struct tokenizer *t);

#ifdef __GNUC__
#pragma GCC diagnostic ignored "-Wunknown-pragmas"
#endif
#pragma RcB2 DEP "tokenizer.c"

#endif