├── README.md ├── makefile └── mini.c /README.md: -------------------------------------------------------------------------------- 1 | # Implementing a Language in C 2 | 3 | In this post we're implementing a small programming language in C. The 4 | language, which I extended from 5 | [**tinyc**](http://www.iro.umontreal.ca/~felipe/IFT2030-Automne2002/Complements/tinyc.c) 6 | by **Marc Feeley**, is a statement language. I added a print statement to the 7 | language (which means extending the tokenizer and the parser, and adding a new 8 | machine instruction and modifying the virtual machine interpreter accordingly). 9 | I also removed the restriction that only one-symbol pre-defined identifiers are 10 | allowed; this again leads to changes in the tokenizer, the parser, the 11 | compiler, and the virtual machine interpreter. Moreover, as will be seen, I 12 | added an abstract syntax tree printer and an abstract syntax interpreter to the 13 | program. 14 | 15 | In this language a program is simply a statement, where a statement can be a 16 | condition statement, a loop statement, a print statement, an empty statement, 17 | or zero or more statements enclosed in brackets. Each statement is followed by 18 | a semi-colon. To be precise, the definition is shown below in the BNF grammar 19 | of the language. The non-terminals are enclosed in `<>` while the terminals are 20 | shown between double quotes. Note that "zero or more" repetitions of an item are written by enclosing it inside a 21 | pair of curly brackets `{ }` (instead of the usual `*`). 22 | 23 | ``` 24 | <program> := <statement> 25 | <statement> := "if" <paren_expr> <statement> 26 | <statement> := "if" <paren_expr> <statement> "else" <statement> 27 | <statement> := "while" <paren_expr> <statement> 28 | <statement> := "do" <statement> "while" <paren_expr> ";" 29 | <statement> := "print" <paren_expr> ";" 30 | <statement> := "{" { <statement> } "}" 31 | <statement> := <expr> ";" 32 | <statement> := ";" 33 | 34 | <paren_expr> := "(" <expr> ")" 35 | <expr> := <test> 36 | <expr> := <id> "=" <expr> 37 | <test> := <sum> 38 | <test> := <sum> "<" <sum> 39 | <sum> := <term> 40 | <sum> := <sum> "+" <term> 41 | <sum> := <sum> "-" <term> 42 | <term> := <id> 43 | <term> := <num> 44 | <term> := <paren_expr> 45 | <id> := <an_identifier> 46 | <num> := <an_unsigned_decimal_integer> 47 | ``` 48 | 49 | #### The Tokenizer 50 | 51 | As is known, the tokenizer makes available the current token and makes possible 52 | moving to the next token.
In general, a token has a type and a value. In our 53 | tokenizer, we have the following token types 54 | 55 | ```c 56 | enum 57 | { 58 | DO_SYM, 59 | ELSE_SYM, 60 | IF_SYM, 61 | WHILE_SYM, 62 | PRINT_SYM, 63 | LBRA_SYM, 64 | RBRA_SYM, 65 | LPAR_SYM, 66 | RPAR_SYM, 67 | PLUS_SYM, 68 | MINUS_SYM, 69 | LESS_SYM, 70 | SEMI_SYM, 71 | EQUAL_SYM, 72 | NUM_SYM, 73 | ID_SYM, 74 | EOI_SYM 75 | }; 76 | ``` 77 | 78 | Except for `EOI_SYM`, which is there to help with parsing later, the other token types are all 79 | obtained from the grammar. Every terminal has a token type, and moreover the 80 | non-terminals that directly generate terminals (`<num>` and `<id>`) also have 81 | token types of their own. The latter two have associated values which we'll keep in 82 | an integer and a string: 83 | 84 | ```c 85 | int num_val; 86 | char id_name[100]; 87 | ``` 88 | 89 | On the other hand, the current token (more precisely the current token type) 90 | and getting the next token type are realized by a variable and a function: 91 | 92 | ```c 93 | int sym; 94 | void next_sym() 95 | { 96 | // To be filled in 97 | } 98 | ``` 99 | 100 | In order to signal a syntax error we'll have the function 101 | 102 | ```c 103 | void syntax_error(char *msg) 104 | { 105 | fprintf(stderr, "syntax error - %s\n", msg); 106 | exit(1); 107 | } 108 | ``` 109 | 110 | Now let's write the tokenizer, i.e. implement `next_sym`. The basic idea is 111 | to examine the current character and update the token type kept in `sym`, and 112 | then move on to the next character. In the case of numbers and identifiers the 113 | characters are accumulated and put in `num_val` and `id_name` respectively.
So 114 | we'll have first of all 115 | 116 | ```c 117 | int ch = ' '; 118 | void next_ch() { ch = getchar(); } 119 | ``` 120 | 121 | Space characters are simply ignored, and we'll have 122 | 123 | ```c 124 | void next_sym() 125 | { 126 | again: 127 | switch (ch) 128 | { 129 | case ' ': 130 | case '\n': 131 | next_ch(); 132 | goto again; 133 | // To be continued 134 | } 135 | } 136 | ``` 137 | 138 | To continue, when `EOF` is reached, `sym` is updated to `EOI_SYM`. 139 | 140 | ```c 141 | void next_sym() 142 | { 143 | again: 144 | switch (ch) 145 | { 146 | // ... 147 | case EOF: 148 | sym = EOI_SYM; 149 | break; 150 | // To be continued 151 | } 152 | } 153 | ``` 154 | 155 | In the case of `+`, `-`, `=`, and so on, `sym` is updated accordingly, and the 156 | next character is made available by a call to `next_ch()`. 157 | 158 | ```c 159 | void next_sym() 160 | { 161 | again: 162 | switch (ch) 163 | { 164 | // ... 165 | case '{': 166 | next_ch(); 167 | sym = LBRA_SYM; 168 | break; 169 | case '}': 170 | next_ch(); 171 | sym = RBRA_SYM; 172 | break; 173 | case '(': 174 | next_ch(); 175 | sym = LPAR_SYM; 176 | break; 177 | case ')': 178 | next_ch(); 179 | sym = RPAR_SYM; 180 | break; 181 | case '+': 182 | next_ch(); 183 | sym = PLUS_SYM; 184 | break; 185 | case '-': 186 | next_ch(); 187 | sym = MINUS_SYM; 188 | break; 189 | case '<': 190 | next_ch(); 191 | sym = LESS_SYM; 192 | break; 193 | case ';': 194 | next_ch(); 195 | sym = SEMI_SYM; 196 | break; 197 | case '=': 198 | next_ch(); 199 | sym = EQUAL_SYM; 200 | break; 201 | // To be continued 202 | } 203 | } 204 | ``` 205 | 206 | The remaining case is either a number, an identifier, or a syntax error. If the 207 | current character is a digit, we simply accumulate all the following digits, 208 | convert the token to a number and save it in `num_val`, and update `sym` to 209 | `NUM_SYM`: 210 | 211 | ```c 212 | void next_sym() 213 | { 214 | again: 215 | switch (ch) 216 | { 217 | // ... 
218 | default: 219 | if (ch >= '0' && ch <= '9') 220 | { 221 | num_val = 0; 222 | while (ch >= '0' && ch <= '9') 223 | { 224 | num_val = num_val * 10 + (ch - '0'); 225 | next_ch(); 226 | } 227 | sym = NUM_SYM; 228 | } 229 | // To be continued 230 | } 231 | } 232 | ``` 233 | 234 | If on the other hand the current character is a lowercase letter, we're dealing with 235 | an identifier. Here, there is a subtlety: The identifier could be one of the 236 | words in our language, such as `do`, `else`, and so on. Thus we'll have 237 | 238 | ```c 239 | char *words[] = {"do", "else", "if", "while", "print", NULL}; 240 | ``` 241 | 242 | Note the order in which we have put the words: the position of `do` matches that of 243 | `DO_SYM`, `else` that of `ELSE_SYM`, and so on. This is so that, after having 244 | accumulated the characters of an identifier to `id_name` (ended properly with 245 | `\0`), we can reset `sym` to `0` which corresponds to the index of `do` in 246 | `words` (and the index of `DO_SYM`). Then we can simply increment `sym` to 247 | check all the words which at the same time makes sure `sym` has the correct 248 | value if a word has indeed been seen. Otherwise we have seen an identifier and 249 | we simply update `sym` to `ID_SYM`. Thus 250 | 251 | ```c 252 | void next_sym() 253 | { 254 | again: 255 | switch (ch) 256 | { 257 | // ... 258 | default: 259 | // ... 260 | else if (ch >= 'a' && ch <= 'z') 261 | { 262 | int i = 0; 263 | while ((ch >= 'a' && ch <= 'z') || ch == '_' || (ch >= '0' && ch <= '9')) 264 | { 265 | id_name[i++] = ch; 266 | next_ch(); 267 | } 268 | id_name[i] = '\0'; 269 | sym = 0; 270 | while (words[sym] != NULL && strcmp(words[sym], id_name) != 0) 271 | sym++; 272 | if (words[sym] == NULL) 273 | sym = ID_SYM; 274 | } 275 | // .. To be continued 276 | } 277 | } 278 | ``` 279 | 280 | Anything else is a syntax error in the language, and we have 281 | 282 | ```c 283 | void next_sym() 284 | { 285 | again: 286 | switch (ch) 287 | { 288 | // ...
289 | default: 290 | // ... 291 | // ... 292 | else 293 | syntax_error("unknown symbol"); 294 | } 295 | } 296 | ``` 297 | 298 | The tokenizer is done. The following function that prints all the tokens should 299 | be written as we were extending the tokenizer: 300 | 301 | ```c 302 | void print_tokens() 303 | { 304 | again: 305 | next_sym(); 306 | switch (sym) 307 | { 308 | case DO_SYM: 309 | printf("DO_SYM \"%s\"\n", id_name); 310 | goto again; 311 | case ELSE_SYM: 312 | printf("ELSE_SYM \"%s\"\n", id_name); 313 | goto again; 314 | case IF_SYM: 315 | printf("IF_SYM \"%s\"\n", id_name); 316 | goto again; 317 | case WHILE_SYM: 318 | printf("WHILE_SYM \"%s\"\n", id_name); 319 | goto again; 320 | case PRINT_SYM: 321 | printf("PRINT_SYM \"%s\"\n", id_name); 322 | goto again; 323 | case LBRA_SYM: 324 | printf("LBRA_SYM\n"); 325 | goto again; 326 | case RBRA_SYM: 327 | printf("RBRA_SYM\n"); 328 | goto again; 329 | case LPAR_SYM: 330 | printf("LPAR_SYM\n"); 331 | goto again; 332 | case RPAR_SYM: 333 | printf("RPAR_SYM\n"); 334 | goto again; 335 | case PLUS_SYM: 336 | printf("PLUS_SYM\n"); 337 | goto again; 338 | case MINUS_SYM: 339 | printf("MINUS_SYM\n"); 340 | goto again; 341 | case LESS_SYM: 342 | printf("LESS_SYM\n"); 343 | goto again; 344 | case SEMI_SYM: 345 | printf("SEMI_SYM\n"); 346 | goto again; 347 | case EQUAL_SYM: 348 | printf("EQUAL_SYM\n"); 349 | goto again; 350 | case NUM_SYM: 351 | printf("NUM_SYM \"%d\"\n", num_val); 352 | goto again; 353 | case ID_SYM: 354 | printf("ID_SYM \"%s\"\n", id_name); 355 | goto again; 356 | case EOI_SYM: 357 | printf("EOI_SYM\n"); 358 | break; 359 | } 360 | } 361 | ``` 362 | 363 | Now a simple demonstration of `{ i=1; while (i<100) i=i+i; }`: 364 | 365 | ``` 366 | LBRA_SYM 367 | ID_SYM "i" 368 | EQUAL_SYM 369 | NUM_SYM "1" 370 | SEMI_SYM 371 | WHILE_SYM "while" 372 | LPAR_SYM 373 | ID_SYM "i" 374 | LESS_SYM 375 | NUM_SYM "100" 376 | RPAR_SYM 377 | ID_SYM "i" 378 | EQUAL_SYM 379 | ID_SYM "i" 380 | PLUS_SYM 381 | ID_SYM "i" 382 | 
SEMI_SYM 383 | RBRA_SYM 384 | EOI_SYM 385 | ``` 386 | 387 | #### The Parser 388 | 389 | In order to write the parser we'll first need to give names to the semantic 390 | constructs in the language, which is done by going through the grammar and 391 | labelling the productions. The following list is complete: 392 | 393 | ```c 394 | enum 395 | { 396 | VAR, 397 | CST, 398 | ADD, 399 | SUB, 400 | LT, 401 | SET, 402 | IF, 403 | IFELSE, 404 | WHILE, 405 | DO, 406 | PRINT, 407 | EMPTY, 408 | SEQ, 409 | EXPR, 410 | PROG 411 | }; 412 | ``` 413 | 414 | Next, we'll need a data structure to hold not only the type of a construct but 415 | also its components. We'll need at maximum three pieces of data (the `IFELSE` 416 | construct) and the following data structure is adequate for all the constructs 417 | in the language: 418 | 419 | ```c 420 | typedef struct node 421 | { 422 | int kind; 423 | struct node *o1, *o2, *o3; 424 | 425 | union { 426 | int val; 427 | char id[100]; 428 | }; 429 | 430 | } node; 431 | ``` 432 | 433 | A new `node` is created by calling the following function: 434 | 435 | ```c 436 | node *new_node(int k) 437 | { 438 | node *x = malloc(sizeof(node)); 439 | x->kind = k; 440 | return x; 441 | } 442 | ``` 443 | 444 | At certain points during the parsing we'll simply need to consume an expected 445 | token type and the following function is useful: 446 | 447 | ```c 448 | void consume(int expected) 449 | { 450 | if (sym == expected) 451 | next_sym(); 452 | else 453 | syntax_error("unknown expected"); 454 | } 455 | ``` 456 | 457 | The parser itself is made up of a set of possibly recursive functions calling each 458 | other according to the grammar. 
To start, `id` simply creates a new node of 459 | type `VAR` and copies over the content of `id_name`; it also makes the next 460 | token available by calling `next_sym`: 461 | 462 | ```c 463 | node *id() 464 | { 465 | node *x = new_node(VAR); 466 | strcpy(x->id, id_name); 467 | next_sym(); 468 | return x; 469 | } 470 | ``` 471 | 472 | Next, `num` is likewise: 473 | 474 | ```c 475 | node *num() 476 | { 477 | node *x = new_node(CST); 478 | x->val = num_val; 479 | next_sym(); 480 | return x; 481 | } 482 | ``` 483 | 484 | Next, according to its grammar, `term` decides which non-terminal to follow 485 | according to the current token type. Note that we need a forward declaration 486 | of `paren_expr()`. 487 | 488 | ```c 489 | node *paren_expr(); 490 | 491 | node *term() 492 | { 493 | if (sym == ID_SYM) 494 | return id(); 495 | else if (sym == NUM_SYM) 496 | return num(); 497 | else 498 | return paren_expr(); 499 | } 500 | ``` 501 | 502 | Next in line is `sum`. According to its grammar, there're three productions 503 | none of which begins with a terminal. However, we can expand an addition or a 504 | subtraction and see that a term always needs to be parsed first. Subsequently 505 | it's zero or more additions or subtractions which translates to a loop 506 | structure. Thus 507 | 508 | ```c 509 | node *sum() 510 | { 511 | node *x = term(); 512 | while (sym == PLUS_SYM || sym == MINUS_SYM) 513 | { 514 | node *t = x; 515 | x = new_node(sym == PLUS_SYM ? ADD : SUB); 516 | next_sym(); 517 | x->o1 = t; 518 | x->o2 = term(); 519 | } 520 | return x; 521 | } 522 | ``` 523 | 524 | Let's pause a bit here and try to visualize the work of the parser. 
Given an 525 | expression such as `1 + 2 - 3` we'll get the parsed tree below; because the loop keeps wrapping what has been parsed so far, the operators associate to the left, i.e. `(1 + 2) - 3` 526 | 527 | ``` 528 | - 529 | / \ 530 | + 3 531 | / \ 532 | 1 2 533 | ``` 534 | 535 | The next one is `test`: 536 | 537 | ```c 538 | node *test() 539 | { 540 | node *x = sum(); 541 | if (sym == LESS_SYM) 542 | { 543 | node *t = x; 544 | x = new_node(LT); 545 | next_sym(); 546 | x->o1 = t; 547 | x->o2 = sum(); 548 | } 549 | return x; 550 | } 551 | ``` 552 | 553 | Now, `expr`, which is a bit tricky. Even though an identifier seems to point to 554 | the second production, if we expand the first production, `<test>`, we see that 555 | it may start with an identifier as well. In the following, we will simply check 556 | if the current token is not an identifier, in which case we parse the first 557 | production by returning what will be parsed by `test`. Otherwise, we call 558 | `test` and check if we have an identifier followed by the equal sign, in which 559 | case we are parsing the `SET` clause, and if not, it's also what has been 560 | parsed by `test`. 561 | 562 | ```c 563 | node *expr() 564 | { 565 | if (sym != ID_SYM) 566 | return test(); 567 | 568 | node *x = test(); 569 | if (x->kind == VAR && sym == EQUAL_SYM) 570 | { 571 | node *t = x; 572 | x = new_node(SET); 573 | next_sym(); 574 | x->o1 = t; 575 | x->o2 = expr(); 576 | } 577 | return x; 578 | } 579 | ``` 580 | 581 | Next, `paren_expr`, it's probably the simplest one: 582 | 583 | ```c 584 | node *paren_expr() 585 | { 586 | consume(LPAR_SYM); 587 | node *x = expr(); 588 | consume(RPAR_SYM); 589 | 590 | return x; 591 | } 592 | ``` 593 | 594 | Next is `statement`, which has eight productions and is therefore the longest 595 | function. Seven of them start with a distinct terminal indicating which 596 | production is to be parsed (even though there're two conditional statements, 597 | one can be distinguished from the other by the `else` terminal). Most of them 598 | are straight-forward, except the sequence production.
It translates into a loop 599 | structure that keeps parsing and attaching what has been parsed as a component 600 | to what will be parsed. We may roughly see how it works by looking at an 601 | example: The sequence `{ i=1; while (i<100) i=i+i; }` will be parsed by the 602 | parser into the tree below, the lower tree being parsed first and glued to the 603 | upper tree as a branch. 604 | 605 | ``` 606 | SEQ 607 | / \ 608 | SEQ WHILE 609 | / \ 610 | EMPTY EXPR 611 | ``` 612 | 613 | ```c 614 | node *statement() 615 | { 616 | node *x; 617 | if (sym == IF_SYM) 618 | { 619 | next_sym(); 620 | x = new_node(IF); 621 | x->o1 = paren_expr(); 622 | x->o2 = statement(); 623 | if (sym == ELSE_SYM) 624 | { 625 | x->kind = IFELSE; 626 | next_sym(); 627 | x->o3 = statement(); 628 | } 629 | } 630 | else if (sym == WHILE_SYM) 631 | { 632 | x = new_node(WHILE); 633 | next_sym(); 634 | x->o1 = paren_expr(); 635 | x->o2 = statement(); 636 | } 637 | else if (sym == DO_SYM) 638 | { 639 | x = new_node(DO); 640 | next_sym(); 641 | x->o1 = statement(); 642 | consume(WHILE_SYM); 643 | x->o2 = paren_expr(); 644 | consume(SEMI_SYM); 645 | } 646 | else if (sym == PRINT_SYM) 647 | { 648 | x = new_node(PRINT); 649 | next_sym(); 650 | x->o1 = paren_expr(); 651 | consume(SEMI_SYM); 652 | } 653 | else if (sym == LBRA_SYM) 654 | { 655 | x = new_node(EMPTY); 656 | next_sym(); 657 | while (sym != RBRA_SYM) 658 | { 659 | node *t = x; 660 | x = new_node(SEQ); 661 | x->o1 = t; 662 | x->o2 = statement(); 663 | } 664 | next_sym(); 665 | } 666 | else if (sym == SEMI_SYM) 667 | { 668 | x = new_node(EMPTY); 669 | next_sym(); 670 | } 671 | else 672 | { 673 | x = new_node(EXPR); 674 | x->o1 = expr(); 675 | consume(SEMI_SYM); 676 | } 677 | 678 | return x; 679 | } 680 | ``` 681 | 682 | Finally, `program`. We need to remember to consume `EOI_SYM`.
683 | 684 | ```c 685 | node *program() 686 | { 687 | node *x = new_node(PROG); 688 | x->o1 = statement(); 689 | consume(EOI_SYM); 690 | return x; 691 | } 692 | ``` 693 | 694 | And a function that initiates the parsing: 695 | 696 | ```c 697 | node *parse() 698 | { 699 | next_sym(); 700 | node *x = program(); 701 | return x; 702 | } 703 | ``` 704 | 705 | The following function prints the abstract syntax tree to give us roughly an 706 | idea of what it looks like. 707 | 708 | ```c 709 | void print_ast(node *x) 710 | { 711 | switch (x->kind) 712 | { 713 | case VAR: 714 | printf("VAR \"%s\" ", x->id); 715 | break; 716 | case CST: 717 | printf("CST \"%d\" ", x->val); 718 | break; 719 | case ADD: 720 | print_ast(x->o1); 721 | printf("ADD "); 722 | print_ast(x->o2); 723 | break; 724 | case SUB: 725 | print_ast(x->o1); 726 | printf("SUB "); 727 | print_ast(x->o2); 728 | break; 729 | case LT: 730 | print_ast(x->o1); 731 | printf("LT "); 732 | print_ast(x->o2); 733 | break; 734 | case SET: 735 | printf("SET "); 736 | print_ast(x->o1); 737 | print_ast(x->o2); 738 | break; 739 | case IF: 740 | printf("IF "); 741 | print_ast(x->o1); 742 | print_ast(x->o2); 743 | break; 744 | case IFELSE: 745 | printf("IF "); 746 | print_ast(x->o1); 747 | print_ast(x->o2); 748 | printf("ELSE "); 749 | print_ast(x->o3); 750 | break; 751 | case EXPR: 752 | printf("EXPR "); 753 | print_ast(x->o1); 754 | break; 755 | case SEQ: 756 | printf("SEQ "); 757 | print_ast(x->o1); 758 | print_ast(x->o2); 759 | break; 760 | case PRINT: 761 | printf("PRINT "); 762 | print_ast(x->o1); 763 | break; 764 | case WHILE: 765 | printf("WHILE "); 766 | print_ast(x->o1); 767 | print_ast(x->o2); 768 | break; 769 | case DO: 770 | printf("DO "); 771 | print_ast(x->o1); 772 | printf("WHILE "); 773 | print_ast(x->o2); 774 | break; 775 | case PROG: 776 | printf("PROG "); 777 | print_ast(x->o1); 778 | break; 779 | case EMPTY: 780 | printf("EMPTY "); 781 | break; 782 | default: 783 | syntax_error("unknown node"); 784 | break;
785 | } 786 | } 787 | ``` 788 | 789 | As an example, we may see the output of `{ i=1; while (i<100) i=i+i; }`: 790 | 791 | ``` 792 | PROG SEQ SEQ EMPTY EXPR SET VAR "i" CST "1" WHILE VAR "i" LT CST "100" 793 | EXPR SET VAR "i" VAR "i" ADD VAR "i" 794 | ``` 795 | 796 | Here we see the main node `PROG` that contains a sequence. As we discussed when looking at how 797 | a sequence is parsed, `{ i=1; while (i<100) i=i+i; }` will roughly be parsed 798 | into 799 | 800 | ``` 801 | SEQ 802 | / \ 803 | SEQ WHILE 804 | / \ 805 | EMPTY EXPR 806 | ``` 807 | 808 | which is reflected in the output. 809 | 810 | #### The Interpreter 811 | 812 | This language is a very small subset of the C language and the semantics of its 813 | statements are clear. The only point that should be discussed is scoping. 814 | Usually brackets open a new scope and interpretation should take scoping into 815 | account properly. In the current language, however, we'll simply treat brackets 816 | as a way to group statements, and accordingly there's only one global scope, 817 | instead of different local scopes for different pairs of matching brackets. As 818 | an example, in our language the statement `{a = 3; {a = a + a;}; a = a + a; print(a);}` will print `12`, instead of `6`; the inner pair of brackets does 819 | update the variable `a`. 820 | 821 | Thus, instead of creating new local environments when entering brackets we will 822 | have a global environment that keeps identifiers and their values.
The 823 | environment will be a list 824 | 825 | ```c 826 | typedef struct list 827 | { 828 | char *id; 829 | int value; 830 | struct list *next; 831 | } list; 832 | 833 | list *env; 834 | ``` 835 | 836 | We want to be able to get an identifier, which is an element in the list, and 837 | also be able to look up the value of a given identifier: 838 | 839 | ```c 840 | list *get_id(char *id) 841 | { 842 | for (list *lst = env; lst; lst = lst->next) 843 | if (strcmp(lst->id, id) == 0) 844 | return lst; 845 | 846 | return (list *)NULL; 847 | } 848 | 849 | void lookup_error(char *id) 850 | { 851 | fprintf(stderr, "error looking up %s\n", id); 852 | exit(1); 853 | } 854 | 855 | int lookup_value(char *id) 856 | { 857 | list *pid = get_id(id); 858 | if (pid) 859 | return pid->value; 860 | 861 | lookup_error(id); 862 | return -1; 863 | } 864 | ``` 865 | 866 | Finally, we want to be able to add an identifier and its value to the global 867 | environment. Because of our scoping rule, if the identifier already exists its 868 | value will simply be replaced by the new value; otherwise the new name-value 869 | pair will be added to the beginning of the list. 870 | 871 | ```c 872 | void add_id(char *id, int value) 873 | { 874 | list *pid = get_id(id); 875 | if (pid) 876 | { 877 | pid->value = value; 878 | return; 879 | } 880 | 881 | list *lst = malloc(sizeof(list)); 882 | lst->id = id; 883 | lst->value = value; 884 | lst->next = env; 885 | env = lst; 886 | } 887 | ``` 888 | 889 | Now the interpreter. According to our grammar there should be three functions, 890 | `eval_program`, `eval_statement`, and `eval_expr`. 
First, `eval_expr`: 891 | 892 | ```c 893 | int eval_expr(node *x) 894 | { 895 | node *var; 896 | int val; 897 | 898 | switch (x->kind) 899 | { 900 | case VAR: 901 | return lookup_value(x->id); 902 | case CST: 903 | return x->val; 904 | case ADD: 905 | return eval_expr(x->o1) + eval_expr(x->o2); 906 | case SUB: 907 | return eval_expr(x->o1) - eval_expr(x->o2); 908 | case LT: 909 | return eval_expr(x->o1) < eval_expr(x->o2); 910 | case SET: 911 | var = x->o1; 912 | val = eval_expr(x->o2); 913 | add_id(var->id, val); 914 | return val; 915 | default: 916 | eval_error(); 917 | return -1; 918 | } 919 | } 920 | ``` 921 | 922 | Next, `eval_statement`: 923 | 924 | ```c 925 | void eval_statement(node *x) 926 | { 927 | switch (x->kind) 928 | { 929 | case PRINT: 930 | printf("%d\n", eval_expr(x->o1)); 931 | break; 932 | case IF: 933 | if (eval_expr(x->o1)) 934 | eval_statement(x->o2); 935 | break; 936 | case IFELSE: 937 | if (eval_expr(x->o1)) 938 | eval_statement(x->o2); 939 | else 940 | eval_statement(x->o3); 941 | break; 942 | case WHILE: 943 | while (eval_expr(x->o1)) 944 | eval_statement(x->o2); 945 | break; 946 | case DO: 947 | do 948 | eval_statement(x->o1); 949 | while (eval_expr(x->o2)); 950 | break; 951 | case SEQ: 952 | eval_statement(x->o1); 953 | eval_statement(x->o2); 954 | break; 955 | case EXPR: 956 | eval_expr(x->o1); 957 | break; 958 | case EMPTY: 959 | break; 960 | default: 961 | eval_error(); 962 | } 963 | } 964 | ``` 965 | 966 | Finally, `eval_program`: 967 | 968 | ```c 969 | void eval_program(node *x) 970 | { 971 | switch (x->kind) 972 | { 973 | case PROG: 974 | eval_statement(x->o1); 975 | break; 976 | default: 977 | eval_error(); 978 | } 979 | } 980 | ``` 981 | 982 | Let's run the interpreter on the program `{ i=1; j = 10; while (i<100) print(i=i+j); }` 983 | 984 | ``` 985 | 11 986 | 21 987 | 31 988 | 41 989 | 51 990 | 61 991 | 71 992 | 81 993 | 91 994 | 101 995 | ``` 996 | 997 | #### The Compiler 998 | 999 | A compiler translates a program in our
language into a set of bytecode 1000 | instructions that will subsequently be interpreted. Each instruction is 1001 | associated with a number, and the set of instructions that will be used is 1002 | 1003 | ```c 1004 | enum 1005 | { 1006 | IFETCH, 1007 | ISTORE, 1008 | IPUSH, 1009 | IPOP, 1010 | IADD, 1011 | ISUB, 1012 | ILT, 1013 | IJZ, 1014 | IJNZ, 1015 | IJMP, 1016 | IPRINT, 1017 | IHALT 1018 | }; 1019 | ``` 1020 | 1021 | The interpretation of the bytecode will be carried out by a stack virtual 1022 | machine, starting from the first instruction, with a stack holding computation 1023 | values. In addition, there will be two arrays to keep variables and 1024 | their associated values. Each variable will be put into a specific location on 1025 | one array, and the value associated to the variable will be put into the 1026 | corresponding location on the other array. The location number will be used as 1027 | an operand in the bytecode. 1028 | 1029 | Please note that we are operating at a very low level. 1030 | 1031 | To start, the instructions are put into an array called `object`. Since we'll 1032 | keep pushing the instructions on top of the array, we need a pointer, `here`. 1033 | The function `g` simply puts a given bytecode on top of the code array and 1034 | moves up one element. 1035 | 1036 | ```c 1037 | typedef char code; 1038 | code object[1000], *here = object; 1039 | 1040 | void g(code c) { *here++ = c; } 1041 | ``` 1042 | 1043 | As we have said, there are two arrays holding the variables and their 1044 | associated values. We restrict ourselves to at most a hundred names, and 1045 | therefore a hundred values. 1046 | 1047 | ```c 1048 | char names[100][100], (*namespt)[100] = names; 1049 | 1050 | int globals[100]; 1051 | ``` 1052 | 1053 | An important operation on `names` is getting the index of a variable.
If the 1054 | variable is already in the array, its index will be returned; otherwise the 1055 | variable is put on top of the array and the index of the top element is 1056 | returned: 1057 | 1058 | ```c 1059 | int get_index(char *name) 1060 | { 1061 | int i; 1062 | for (char(*npt)[100] = names; npt < namespt; npt++) 1063 | { 1064 | i = npt - names; 1065 | if (strcmp(name, names[i]) == 0) 1066 | return i; 1067 | } 1068 | i = namespt++ - names; 1069 | strcpy(names[i], name); 1070 | return i; 1071 | } 1072 | ``` 1073 | 1074 | Let's move on to generating the code. We'll write a function called `c` that 1075 | takes an abstract syntax tree and generates the corresponding code. 1076 | 1077 | ```c 1078 | void c(node *x) 1079 | { 1080 | 1081 | switch (x->kind) 1082 | { 1083 | // To be continued 1084 | } 1085 | } 1086 | ``` 1087 | 1088 | Since `PROG` is the containing node, let's work on that first. We need to 1089 | generate the bytecode and put `IHALT` at the end. Thus, 1090 | 1091 | ```c 1092 | void c(node *x) 1093 | { 1094 | 1095 | switch (x->kind) 1096 | { 1097 | case PROG: 1098 | c(x->o1); 1099 | g(IHALT); 1100 | break; 1101 | // To be continued 1102 | } 1103 | } 1104 | ``` 1105 | 1106 | Now, according to the grammar of the language, there're eight different kinds 1107 | of nodes that could be contained in a `PROG` node. The easiest one is `EMPTY`: 1108 | 1109 | ```c 1110 | void c(node *x) 1111 | { 1112 | 1113 | switch (x->kind) 1114 | { 1115 | // To be continued 1116 | case EMPTY: 1117 | break; 1118 | // ... 1119 | } 1120 | } 1121 | ``` 1122 | 1123 | Let's pick `EXPR` as the next one. In this case a value is expected to be 1124 | computed and put on the virtual machine stack, before being used. Thus we'll 1125 | generate the instructions followed by the `IPOP` instruction. 1126 | 1127 | ```c 1128 | void c(node *x) 1129 | { 1130 | 1131 | switch (x->kind) 1132 | { 1133 | // To be continued 1134 | case EXPR: 1135 | c(x->o1); 1136 | g(IPOP); 1137 | break; 1138 | // ... 
1139 | } 1140 | } 1141 | ``` 1142 | 1143 | There're several possible expressions. Let's start with `VAR`. Given a 1144 | variable, as there is already a value associated with it, we simply generate 1145 | `IFETCH` and the index of the location of the variable: 1146 | 1147 | ```c 1148 | void c(node *x) 1149 | { 1150 | 1151 | switch (x->kind) 1152 | { 1153 | case VAR: 1154 | g(IFETCH); 1155 | g(get_index(x->id)); 1156 | break; 1157 | // ... 1158 | } 1159 | } 1160 | ``` 1161 | 1162 | Next, `CST`. In this case we generate `IPUSH` and the value to be put to the 1163 | virtual machine stack: 1164 | 1165 | ```c 1166 | void c(node *x) 1167 | { 1168 | 1169 | switch (x->kind) 1170 | { 1171 | // ... 1172 | case CST: 1173 | g(IPUSH); 1174 | g(x->val); 1175 | break; 1176 | // To be continued 1177 | // ... 1178 | } 1179 | } 1180 | ``` 1181 | 1182 | Next are `ADD` and `SUB`: 1183 | 1184 | ```c 1185 | void c(node *x) 1186 | { 1187 | 1188 | switch (x->kind) 1189 | { 1190 | // ... 1191 | case ADD: 1192 | c(x->o1); 1193 | c(x->o2); 1194 | g(IADD); 1195 | break; 1196 | case SUB: 1197 | c(x->o1); 1198 | c(x->o2); 1199 | g(ISUB); 1200 | break; 1201 | // To be continued 1202 | // ... 1203 | } 1204 | } 1205 | ``` 1206 | 1207 | `LT` is similar: 1208 | 1209 | ```c 1210 | void c(node *x) 1211 | { 1212 | 1213 | switch (x->kind) 1214 | { 1215 | // ... 1216 | case LT: 1217 | c(x->o1); 1218 | c(x->o2); 1219 | g(ILT); 1220 | break; 1221 | // To be continued 1222 | // ... 1223 | } 1224 | } 1225 | ``` 1226 | 1227 | Finally, `SET`. Here we compute the value to be set first, then store it, then 1228 | generate the index of the variable: 1229 | 1230 | ```c 1231 | void c(node *x) 1232 | { 1233 | 1234 | switch (x->kind) 1235 | { 1236 | // ... 1237 | case SET: 1238 | c(x->o2); 1239 | g(ISTORE); 1240 | g(get_index(x->o1->id)); 1241 | break; 1242 | // To be continued 1243 | // ... 1244 | } 1245 | } 1246 | ``` 1247 | 1248 | Now let's get back to the nodes that could be contained in the `PROG` node. 
We 1249 | have implemented `EMPTY` and `EXPR`. The next easy one is `PRINT`: 1250 | 1251 | ```c 1252 | void c(node *x) 1253 | { 1254 | 1255 | switch (x->kind) 1256 | { 1257 | // ... 1258 | case PRINT: 1259 | c(x->o1); 1260 | g(IPRINT); 1261 | break; 1262 | // To be continued 1263 | // ... 1264 | } 1265 | } 1266 | ``` 1267 | 1268 | `SEQ` is also straight-forward: 1269 | 1270 | ```c 1271 | void c(node *x) 1272 | { 1273 | 1274 | switch (x->kind) 1275 | { 1276 | // ... 1277 | case SEQ: 1278 | c(x->o1); 1279 | c(x->o2); 1280 | break; 1281 | // To be continued 1282 | // ... 1283 | } 1284 | } 1285 | ``` 1286 | 1287 | Next, let's move on to `IF`. The key to translating this node is to use the 1288 | instruction `IJZ`, meaning jump if zero. Before writing the code, we will look 1289 | at an example, `if (1 < 10) print(10);`. Here the condition `1 < 10` will be 1290 | translated into a series of instructions, which we'll mark `X` below: 1291 | 1292 | ``` 1293 | ... |X| | |... 1294 | ``` 1295 | 1296 | `print(10)` must be translated too, but whether or not it should be executed 1297 | depends on whether the condition is true (`1`), in other words the sequence of instructions should be skipped if the condition is false (`0`). To achieve 1298 | this effect, we put `IJZ` followed by a hole, and generate the instructions 1299 | for `print(10)`. The hole is to hold the number of instructions to be skipped. 1300 | We can keep the following illustration in mind: 1301 | 1302 | ``` 1303 | ... |X|IJZ| |X|... 1304 | ``` 1305 | 1306 | To create a hole, we have the function 1307 | 1308 | ```c 1309 | code *hole() { return here++; } 1310 | ``` 1311 | 1312 | And to compute the number of steps: 1313 | 1314 | ```c 1315 | void fix(code *src, code *dst) { *src = dst - src; } 1316 | ``` 1317 | 1318 | The code for translating `IF` is: 1319 | 1320 | ```c 1321 | void c(node *x) 1322 | { 1323 | 1324 | switch (x->kind) 1325 | { 1326 | // ... 
1327 | case IF: 1328 | c(x->o1); 1329 | g(IJZ); 1330 | p1 = hole(); 1331 | c(x->o2); 1332 | fix(p1, here); 1333 | break; 1334 | // To be continued 1335 | // ... 1336 | } 1337 | } 1338 | ``` 1339 | 1340 | Next we will translate `IFELSE`. We'll use a concrete example, `if (1<10) print(1); else print(10);`, to follow along. The code for `1<10`, `print(1)`, 1341 | and `print(10)` will need to be translated. In a general statement, one of the 1342 | two branches will be skipped depending on the condition. Thus we will place 1343 | a jump instruction and a hole in front of each. But the second jump is a simple 1344 | jump. Thus we use the `IJMP` instruction. 1345 | 1346 | ``` 1347 | ...|X|IJZ| |X|IJMP| |X| |... 1348 | ``` 1349 | 1350 | The offsets to be kept in the holes need to be calculated at the right places: 1351 | 1352 | ```c 1353 | void c(node *x) 1354 | { 1355 | 1356 | switch (x->kind) 1357 | { 1358 | // ... 1359 | case IFELSE: 1360 | c(x->o1); 1361 | g(IJZ); 1362 | p1 = hole(); 1363 | c(x->o2); 1364 | g(IJMP); 1365 | p2 = hole(); 1366 | fix(p1, here); 1367 | c(x->o3); 1368 | fix(p2, here); 1369 | break; 1370 | // To be continued 1371 | // ... 1372 | } 1373 | } 1374 | ``` 1375 | 1376 | Now translating `WHILE`. This works almost like a condition. We'll have 1377 | 1378 | ``` 1379 | |X|IJZ| |X|IJMP| |... 1380 | ``` 1381 | 1382 | We need to be careful with the calculation of the jumping addresses: 1383 | 1384 | ```c 1385 | void c(node *x) 1386 | { 1387 | 1388 | switch (x->kind) 1389 | { 1390 | // ... 1391 | case WHILE: 1392 | p1 = here; 1393 | c(x->o1); 1394 | g(IJZ); 1395 | p2 = hole(); 1396 | c(x->o2); 1397 | g(IJMP); 1398 | fix(hole(), p1); 1399 | fix(p2, here); 1400 | break; 1401 | // To be continued 1402 | // ... 1403 | } 1404 | } 1405 | ``` 1406 | 1407 | Finally, `DO`. Here the `IJNZ` instruction, or jump if not zero, is used. 1408 | 1409 | ```c 1410 | void c(node *x) 1411 | { 1412 | 1413 | switch (x->kind) 1414 | { 1415 | // ...
1416 | case DO: 1417 | p1 = here; 1418 | c(x->o1); 1419 | c(x->o2); 1420 | g(IJNZ); 1421 | fix(hole(), p1); 1422 | break; 1423 | // ... 1424 | } 1425 | } 1426 | ``` 1427 | 1428 | #### The Virtual Machine Interpreter 1429 | 1430 | The bytecode instructions generated by the compiler written above are 1431 | interpreted by a stack virtual machine interpreter. There is a stack to hold 1432 | computation values, and the instructions are interpreted one after another, starting at the beginning 1433 | of `object`. 1434 | 1435 | ```c 1436 | void run() 1437 | { 1438 | int stack[1000], *sp = stack; 1439 | code *pc = object; 1440 | again: 1441 | switch (*pc++) 1442 | { 1443 | // To be continued ... 1444 | } 1445 | } 1446 | ``` 1447 | 1448 | The following instructions manipulate the stack or the `globals` array directly: 1449 | 1450 | ```c 1451 | void run() 1452 | { 1453 | int stack[1000], *sp = stack; 1454 | code *pc = object; 1455 | again: 1456 | switch (*pc++) 1457 | { 1458 | case IFETCH: 1459 | *sp++ = globals[*pc++]; 1460 | goto again; 1461 | case ISTORE: 1462 | globals[*pc++] = sp[-1]; 1463 | goto again; 1464 | case IPUSH: 1465 | *sp++ = *pc++; 1466 | goto again; 1467 | case IPOP: 1468 | --sp; 1469 | goto again; 1470 | case IADD: 1471 | sp[-2] = sp[-2] + sp[-1]; 1472 | --sp; 1473 | goto again; 1474 | case ISUB: 1475 | sp[-2] = sp[-2] - sp[-1]; 1476 | --sp; 1477 | goto again; 1478 | case ILT: 1479 | sp[-2] = sp[-2] < sp[-1]; 1480 | --sp; 1481 | goto again; 1482 | case IPRINT: 1483 | printf("%d\n", *--sp); 1484 | goto again; 1485 | // To be continued ... 1486 | } 1487 | } 1488 | ``` 1489 | 1490 | The remaining instructions are the jumps. Each one is followed by a cell holding the distance to jump, which is added to `pc` when 1491 | the jump is taken: 1492 | 1493 | ```c 1494 | void run() 1495 | { 1496 | int stack[1000], *sp = stack; 1497 | code *pc = object; 1498 | again: 1499 | switch (*pc++) 1500 | { 1501 | // ... 
1502 | case IJZ: 1503 | if (*--sp == 0) 1504 | pc += *pc; 1505 | else 1506 | pc++; 1507 | goto again; 1508 | case IJMP: 1509 | pc += *pc; 1510 | goto again; 1511 | case IJNZ: 1512 | if (*--sp != 0) 1513 | pc += *pc; 1514 | else 1515 | pc++; 1516 | goto again; 1517 | } 1518 | } 1519 | ``` 1520 | 1521 | The virtual machine is now complete. Given a program such as `{ i = 0; do { i = i + 10; print(i);} while (i < 50);}`, it produces the output 1522 | 1523 | ``` 1524 | 10 1525 | 20 1526 | 30 1527 | 40 1528 | 50 1529 | ``` 1530 | -------------------------------------------------------------------------------- /makefile: -------------------------------------------------------------------------------- 1 | CC=gcc 2 | CFLAGS=-std=c99 3 | 4 | all: mini 5 | 6 | mini: mini.o 7 | $(CC) $(CFLAGS) -o $@ $^ 8 | 9 | mini.o: mini.c 10 | 11 | clean: 12 | $(RM) mini *.o 13 | -------------------------------------------------------------------------------- /mini.c: -------------------------------------------------------------------------------- 1 | /* 2 | * := 3 | * := "if" 4 | * := "if" "else" 5 | * := "while" 6 | * := "do" "while" ";" 7 | * := "print" ";" 8 | * := "{" { } "}" 9 | * := ";" 10 | * := ";" 11 | * 12 | * := "(" ")" 13 | * := 14 | * := "=" 15 | * := 16 | * := "<" 17 | * := 18 | * := "+" 19 | * := "-" 20 | * := 21 | * := 22 | * := 23 | * := 24 | * := 25 | */ 26 | 27 | #include 28 | #include 29 | #include 30 | 31 | void syntax_error(char *msg) 32 | { 33 | fprintf(stderr, "syntax error - %s\n", msg); 34 | exit(1); 35 | } 36 | 37 | /*----------------------------------------------------------------------------*/ 38 | /* Tokenizer */ 39 | 40 | enum 41 | { 42 | DO_SYM, 43 | ELSE_SYM, 44 | IF_SYM, 45 | WHILE_SYM, 46 | PRINT_SYM, 47 | LBRA_SYM, 48 | RBRA_SYM, 49 | LPAR_SYM, 50 | RPAR_SYM, 51 | PLUS_SYM, 52 | MINUS_SYM, 53 | LESS_SYM, 54 | SEMI_SYM, 55 | EQUAL_SYM, 56 | NUM_SYM, 57 | ID_SYM, 58 | EOI_SYM 59 | }; 60 | 61 | char *words[] = {"do", "else", "if", "while", "print", 
NULL}; 62 | 63 | int ch = ' '; 64 | void next_ch() { ch = getchar(); } 65 | 66 | int num_val; 67 | char id_name[100]; 68 | 69 | int sym; 70 | void next_sym() 71 | { 72 | again: 73 | switch (ch) 74 | { 75 | case ' ': 76 | case '\n': 77 | next_ch(); 78 | goto again; 79 | case EOF: 80 | sym = EOI_SYM; 81 | break; 82 | case '{': 83 | next_ch(); 84 | sym = LBRA_SYM; 85 | break; 86 | case '}': 87 | next_ch(); 88 | sym = RBRA_SYM; 89 | break; 90 | case '(': 91 | next_ch(); 92 | sym = LPAR_SYM; 93 | break; 94 | case ')': 95 | next_ch(); 96 | sym = RPAR_SYM; 97 | break; 98 | case '+': 99 | next_ch(); 100 | sym = PLUS_SYM; 101 | break; 102 | case '-': 103 | next_ch(); 104 | sym = MINUS_SYM; 105 | break; 106 | case '<': 107 | next_ch(); 108 | sym = LESS_SYM; 109 | break; 110 | case ';': 111 | next_ch(); 112 | sym = SEMI_SYM; 113 | break; 114 | case '=': 115 | next_ch(); 116 | sym = EQUAL_SYM; 117 | break; 118 | default: 119 | if (ch >= '0' && ch <= '9') 120 | { 121 | num_val = 0; 122 | while (ch >= '0' && ch <= '9') 123 | { 124 | num_val = num_val * 10 + (ch - '0'); 125 | next_ch(); 126 | } 127 | sym = NUM_SYM; 128 | } 129 | else if (ch >= 'a' && ch <= 'z') 130 | { 131 | int i = 0; 132 | while ((ch >= 'a' && ch <= 'z') || ch == '_' || (ch >= '0' && ch <= '9')) 133 | { 134 | id_name[i++] = ch; 135 | next_ch(); 136 | } 137 | id_name[i] = '\0'; 138 | sym = 0; 139 | while (words[sym] != NULL && strcmp(words[sym], id_name) != 0) 140 | sym++; 141 | if (words[sym] == NULL) 142 | sym = ID_SYM; 143 | } 144 | else 145 | syntax_error("unknown symbol"); 146 | } 147 | } 148 | 149 | void print_tokens() 150 | { 151 | again: 152 | next_sym(); 153 | switch (sym) 154 | { 155 | case DO_SYM: 156 | printf("DO_SYM \"%s\"\n", id_name); 157 | goto again; 158 | case ELSE_SYM: 159 | printf("ELSE_SYM \"%s\"\n", id_name); 160 | goto again; 161 | case IF_SYM: 162 | printf("IF_SYM \"%s\"\n", id_name); 163 | goto again; 164 | case WHILE_SYM: 165 | printf("WHILE_SYM \"%s\"\n", id_name); 166 | goto again; 
167 | case PRINT_SYM: 168 | printf("PRINT_SYM \"%s\"\n", id_name); 169 | goto again; 170 | case LBRA_SYM: 171 | printf("LBRA_SYM\n"); 172 | goto again; 173 | case RBRA_SYM: 174 | printf("RBRA_SYM\n"); 175 | goto again; 176 | case LPAR_SYM: 177 | printf("LPAR_SYM\n"); 178 | goto again; 179 | case RPAR_SYM: 180 | printf("RPAR_SYM\n"); 181 | goto again; 182 | case PLUS_SYM: 183 | printf("PLUS_SYM\n"); 184 | goto again; 185 | case MINUS_SYM: 186 | printf("MINUS_SYM\n"); 187 | goto again; 188 | case LESS_SYM: 189 | printf("LESS_SYM\n"); 190 | goto again; 191 | case SEMI_SYM: 192 | printf("SEMI_SYM\n"); 193 | goto again; 194 | case EQUAL_SYM: 195 | printf("EQUAL_SYM\n"); 196 | goto again; 197 | case NUM_SYM: 198 | printf("NUM_SYM \"%d\"\n", num_val); 199 | goto again; 200 | case ID_SYM: 201 | printf("ID_SYM \"%s\"\n", id_name); 202 | goto again; 203 | case EOI_SYM: 204 | printf("EOI_SYM\n"); 205 | break; 206 | } 207 | } 208 | 209 | /*----------------------------------------------------------------------------*/ 210 | /* Parser */ 211 | 212 | enum 213 | { 214 | VAR, 215 | CST, 216 | ADD, 217 | SUB, 218 | LT, 219 | SET, 220 | IF, 221 | IFELSE, 222 | WHILE, 223 | DO, 224 | PRINT, 225 | EMPTY, 226 | SEQ, 227 | EXPR, 228 | PROG 229 | }; 230 | 231 | typedef struct node 232 | { 233 | int kind; 234 | struct node *o1, *o2, *o3; 235 | 236 | union { 237 | int val; 238 | char id[100]; 239 | }; 240 | 241 | } node; 242 | 243 | void consume(int expected) 244 | { 245 | if (sym == expected) 246 | next_sym(); 247 | else 248 | syntax_error("unknown expected"); 249 | } 250 | 251 | node *new_node(int k) 252 | { 253 | node *x = malloc(sizeof(node)); 254 | x->kind = k; 255 | return x; 256 | } 257 | 258 | node *id() 259 | { 260 | node *x = new_node(VAR); 261 | strcpy(x->id, id_name); 262 | next_sym(); 263 | return x; 264 | } 265 | 266 | node *num() 267 | { 268 | node *x = new_node(CST); 269 | x->val = num_val; 270 | next_sym(); 271 | return x; 272 | } 273 | 274 | node *paren_expr(); 275 | 276 | 
node *term() 277 | { 278 | if (sym == ID_SYM) 279 | return id(); 280 | else if (sym == NUM_SYM) 281 | return num(); 282 | else 283 | return paren_expr(); 284 | } 285 | 286 | node *sum() 287 | { 288 | node *x = term(); 289 | while (sym == PLUS_SYM || sym == MINUS_SYM) 290 | { 291 | node *t = x; 292 | x = new_node(sym == PLUS_SYM ? ADD : SUB); 293 | next_sym(); 294 | x->o1 = t; 295 | x->o2 = term(); 296 | } 297 | return x; 298 | } 299 | 300 | node *test() 301 | { 302 | node *x = sum(); 303 | if (sym == LESS_SYM) 304 | { 305 | node *t = x; 306 | x = new_node(LT); 307 | next_sym(); 308 | x->o1 = t; 309 | x->o2 = sum(); 310 | } 311 | return x; 312 | } 313 | 314 | node *expr() 315 | { 316 | if (sym != ID_SYM) 317 | return test(); 318 | 319 | node *x = test(); 320 | if (x->kind == VAR && sym == EQUAL_SYM) 321 | { 322 | node *t = x; 323 | x = new_node(SET); 324 | next_sym(); 325 | x->o1 = t; 326 | x->o2 = expr(); 327 | } 328 | return x; 329 | } 330 | 331 | node *paren_expr() 332 | { 333 | consume(LPAR_SYM); 334 | node *x = expr(); 335 | consume(RPAR_SYM); 336 | 337 | return x; 338 | } 339 | 340 | node *statement() 341 | { 342 | node *x; 343 | if (sym == IF_SYM) 344 | { 345 | next_sym(); 346 | x = new_node(IF); 347 | x->o1 = paren_expr(); 348 | x->o2 = statement(); 349 | if (sym == ELSE_SYM) 350 | { 351 | x->kind = IFELSE; 352 | next_sym(); 353 | x->o3 = statement(); 354 | } 355 | } 356 | else if (sym == WHILE_SYM) 357 | { 358 | x = new_node(WHILE); 359 | next_sym(); 360 | x->o1 = paren_expr(); 361 | x->o2 = statement(); 362 | } 363 | else if (sym == DO_SYM) 364 | { 365 | x = new_node(DO); 366 | next_sym(); 367 | x->o1 = statement(); 368 | consume(WHILE_SYM); 369 | x->o2 = paren_expr(); 370 | consume(SEMI_SYM); 371 | } 372 | else if (sym == PRINT_SYM) 373 | { 374 | x = new_node(PRINT); 375 | next_sym(); 376 | x->o1 = paren_expr(); 377 | consume(SEMI_SYM); 378 | } 379 | else if (sym == LBRA_SYM) 380 | { 381 | x = new_node(EMPTY); 382 | next_sym(); 383 | while (sym != 
RBRA_SYM) 384 | { 385 | node *t = x; 386 | x = new_node(SEQ); 387 | x->o1 = t; 388 | x->o2 = statement(); 389 | } 390 | next_sym(); 391 | } 392 | else if (sym == SEMI_SYM) 393 | { 394 | x = new_node(EMPTY); 395 | next_sym(); 396 | } 397 | else 398 | { 399 | x = new_node(EXPR); 400 | x->o1 = expr(); 401 | consume(SEMI_SYM); 402 | } 403 | 404 | return x; 405 | } 406 | 407 | node *program() 408 | { 409 | node *x = new_node(PROG); 410 | x->o1 = statement(); 411 | consume(EOI_SYM); 412 | return x; 413 | } 414 | 415 | void print_ast(node *x) 416 | { 417 | switch (x->kind) 418 | { 419 | case VAR: 420 | printf("VAR \"%s\" ", x->id); 421 | break; 422 | case CST: 423 | printf("CST \"%d\" ", x->val); 424 | break; 425 | case ADD: 426 | print_ast(x->o1); 427 | printf("ADD "); 428 | print_ast(x->o2); 429 | break; 430 | case SUB: 431 | print_ast(x->o1); 432 | printf("SUB "); 433 | print_ast(x->o2); 434 | break; 435 | case LT: 436 | print_ast(x->o1); 437 | printf("LT "); 438 | print_ast(x->o2); 439 | break; 440 | case SET: 441 | printf("SET "); 442 | print_ast(x->o1); 443 | print_ast(x->o2); 444 | break; 445 | case IF: 446 | printf("IF "); 447 | print_ast(x->o1); 448 | print_ast(x->o2); 449 | break; 450 | case IFELSE: 451 | printf("IF "); 452 | print_ast(x->o1); 453 | print_ast(x->o2); 454 | printf("ELSE "); 455 | print_ast(x->o3); 456 | break; 457 | case EXPR: 458 | printf("EXPR "); 459 | print_ast(x->o1); 460 | break; 461 | case SEQ: 462 | printf("SEQ "); 463 | print_ast(x->o1); 464 | print_ast(x->o2); 465 | break; 466 | case PRINT: 467 | printf("PRINT "); 468 | print_ast(x->o1); 469 | break; 470 | case WHILE: 471 | printf("WHILE "); 472 | print_ast(x->o1); 473 | print_ast(x->o2); 474 | break; 475 | case DO: 476 | printf("DO "); 477 | print_ast(x->o1); 478 | printf("WHILE "); 479 | print_ast(x->o2); 480 | break; 481 | case PROG: 482 | printf("PROG "); 483 | print_ast(x->o1); 484 | break; 485 | case EMPTY: 486 | printf("EMPTY "); 487 | break; 488 | default: 489 | 
syntax_error("unknown node"); 490 | break; 491 | } 492 | } 493 | 494 | node *parse() 495 | { 496 | next_sym(); 497 | node *x = program(); 498 | return x; 499 | } 500 | 501 | /*----------------------------------------------------------------------------*/ 502 | /* Interpreter */ 503 | 504 | typedef struct list 505 | { 506 | char *id; 507 | int value; 508 | struct list *next; 509 | } list; 510 | 511 | list *env; 512 | 513 | list *get_id(char *id) 514 | { 515 | for (list *lst = env; lst; lst = lst->next) 516 | if (strcmp(lst->id, id) == 0) 517 | return lst; 518 | 519 | return (list *)NULL; 520 | } 521 | 522 | void lookup_error(char *id) 523 | { 524 | fprintf(stderr, "error looking up %s\n", id); 525 | exit(1); 526 | } 527 | 528 | int lookup_value(char *id) 529 | { 530 | list *pid = get_id(id); 531 | if (pid) 532 | return pid->value; 533 | 534 | lookup_error(id); 535 | return -1; 536 | } 537 | 538 | void add_id(char *id, int value) 539 | { 540 | list *pid = get_id(id); 541 | if (pid) 542 | { 543 | pid->value = value; 544 | return; 545 | } 546 | 547 | list *lst = malloc(sizeof(list)); 548 | lst->id = id; 549 | lst->value = value; 550 | lst->next = env; 551 | env = lst; 552 | } 553 | 554 | void eval_error() 555 | { 556 | fprintf(stderr, "semantics error"); 557 | exit(1); 558 | } 559 | 560 | int eval_expr(node *x) 561 | { 562 | 563 | switch (x->kind) 564 | { 565 | case VAR: 566 | return lookup_value(x->id); 567 | case CST: 568 | return x->val; 569 | case ADD: 570 | return eval_expr(x->o1) + eval_expr(x->o2); 571 | case SUB: 572 | return eval_expr(x->o1) - eval_expr(x->o2); 573 | case LT: 574 | return eval_expr(x->o1) < eval_expr(x->o2); 575 | case SET: 576 | { 577 | node *var = x->o1; 578 | int val = eval_expr(x->o2); 579 | add_id(var->id, val); 580 | return val; 581 | } 582 | default: 583 | eval_error(); 584 | return -1; 585 | } 586 | } 587 | 588 | void eval_statement(node *x) 589 | { 590 | switch (x->kind) 591 | { 592 | case PRINT: 593 | printf("%d\n", 
eval_expr(x->o1)); 594 | break; 595 | case IF: 596 | if (eval_expr(x->o1)) 597 | eval_statement(x->o2); 598 | break; 599 | case IFELSE: 600 | if (eval_expr(x->o1)) 601 | eval_statement(x->o2); 602 | else 603 | eval_statement(x->o3); 604 | break; 605 | case WHILE: 606 | while (eval_expr(x->o1)) 607 | eval_statement(x->o2); 608 | break; 609 | case DO: 610 | do 611 | eval_statement(x->o1); 612 | while (eval_expr(x->o2)); 613 | break; 614 | case SEQ: 615 | eval_statement(x->o1); 616 | eval_statement(x->o2); 617 | break; 618 | case EXPR: 619 | eval_expr(x->o1); 620 | break; 621 | case EMPTY: 622 | break; 623 | default: 624 | eval_error(); 625 | } 626 | } 627 | 628 | void eval_program(node *x) 629 | { 630 | switch (x->kind) 631 | { 632 | case PROG: 633 | eval_statement(x->o1); 634 | break; 635 | default: 636 | eval_error(); 637 | } 638 | } 639 | 640 | /*----------------------------------------------------------------------------*/ 641 | /* Compiler */ 642 | 643 | enum 644 | { 645 | IFETCH, 646 | ISTORE, 647 | IPUSH, 648 | IPOP, 649 | IADD, 650 | ISUB, 651 | ILT, 652 | IJZ, 653 | IJNZ, 654 | IJMP, 655 | IPRINT, 656 | IHALT 657 | }; 658 | 659 | typedef char code; 660 | code object[1000], *here = object; 661 | 662 | void g(code c) { *here++ = c; } 663 | 664 | code *hole() { return here++; } 665 | 666 | void fix(code *src, code *dst) { *src = dst - src; } 667 | 668 | char names[100][100], (*namespt)[100] = names; 669 | 670 | int globals[100]; 671 | void initialize_globals() 672 | { 673 | for (int i = 0; i < 100; i++) 674 | globals[i] = -1; 675 | } 676 | 677 | int get_index(char *name) 678 | { 679 | int i; 680 | for (char(*npt)[100] = names; npt < namespt; npt++) 681 | { 682 | i = npt - names; 683 | if (strcmp(name, names[i]) == 0) 684 | return i; 685 | } 686 | i = namespt++ - names; 687 | strcpy(names[i], name); 688 | return i; 689 | } 690 | 691 | void c(node *x) 692 | { 693 | code *p1, *p2; 694 | switch (x->kind) 695 | { 696 | case VAR: 697 | g(IFETCH); 698 | 
g(get_index(x->id)); 699 | break; 700 | case CST: 701 | g(IPUSH); 702 | g(x->val); 703 | break; 704 | case ADD: 705 | c(x->o1); 706 | c(x->o2); 707 | g(IADD); 708 | break; 709 | case SUB: 710 | c(x->o1); 711 | c(x->o2); 712 | g(ISUB); 713 | break; 714 | case LT: 715 | c(x->o1); 716 | c(x->o2); 717 | g(ILT); 718 | break; 719 | case SET: 720 | c(x->o2); 721 | g(ISTORE); 722 | g(get_index(x->o1->id)); 723 | break; 724 | case IF: 725 | c(x->o1); 726 | g(IJZ); 727 | p1 = hole(); 728 | c(x->o2); 729 | fix(p1, here); 730 | break; 731 | case IFELSE: 732 | c(x->o1); 733 | g(IJZ); 734 | p1 = hole(); 735 | c(x->o2); 736 | g(IJMP); 737 | p2 = hole(); 738 | fix(p1, here); 739 | c(x->o3); 740 | fix(p2, here); 741 | break; 742 | case WHILE: 743 | p1 = here; 744 | c(x->o1); 745 | g(IJZ); 746 | p2 = hole(); 747 | c(x->o2); 748 | g(IJMP); 749 | fix(hole(), p1); 750 | fix(p2, here); 751 | break; 752 | case DO: 753 | p1 = here; 754 | c(x->o1); 755 | c(x->o2); 756 | g(IJNZ); 757 | fix(hole(), p1); 758 | break; 759 | case PRINT: 760 | c(x->o1); 761 | g(IPRINT); 762 | break; 763 | case EMPTY: 764 | break; 765 | case SEQ: 766 | c(x->o1); 767 | c(x->o2); 768 | break; 769 | case EXPR: 770 | c(x->o1); 771 | g(IPOP); 772 | break; 773 | case PROG: 774 | c(x->o1); 775 | g(IHALT); 776 | break; 777 | } 778 | } 779 | 780 | void run() 781 | { 782 | int stack[1000], *sp = stack; 783 | code *pc = object; 784 | again: 785 | switch (*pc++) 786 | { 787 | case IFETCH: 788 | *sp++ = globals[*pc++]; 789 | goto again; 790 | case ISTORE: 791 | globals[*pc++] = sp[-1]; 792 | goto again; 793 | case IPUSH: 794 | *sp++ = *pc++; 795 | goto again; 796 | case IPOP: 797 | --sp; 798 | goto again; 799 | case IADD: 800 | sp[-2] = sp[-2] + sp[-1]; 801 | --sp; 802 | goto again; 803 | case ISUB: 804 | sp[-2] = sp[-2] - sp[-1]; 805 | --sp; 806 | goto again; 807 | case ILT: 808 | sp[-2] = sp[-2] < sp[-1]; 809 | --sp; 810 | goto again; 811 | case IJZ: 812 | if (*--sp == 0) 813 | pc += *pc; 814 | else 815 | pc++; 816 | goto 
again; 817 | case IJMP: 818 | pc += *pc; 819 | goto again; 820 | case IJNZ: 821 | if (*--sp != 0) 822 | pc += *pc; 823 | else 824 | pc++; 825 | goto again; 826 | case IPRINT: 827 | printf("%d\n", *--sp); 828 | goto again; 829 | } 830 | } 831 | 832 | /*----------------------------------------------------------------------------*/ 833 | 834 | int main() 835 | { 836 | node *prog = parse(); 837 | 838 | printf("\n--- The abstract syntax code: \n"); 839 | print_ast(prog); 840 | 841 | printf("\n--- The interpreter: \n"); 842 | eval_program(prog); 843 | 844 | printf("\n--- The virtual machine interpreting the bytecodes: \n"); 845 | c(prog); 846 | run(); 847 | 848 | return 0; 849 | } 850 | --------------------------------------------------------------------------------