├── .gitignore ├── Makefile ├── README.md ├── hello.mini ├── minilang.c └── minilang.mini /.gitignore: -------------------------------------------------------------------------------- 1 | minilang 2 | minilang_jr 3 | minilang.s 4 | minilang_jr.s 5 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | all: 3 | c99 minilang.c -o minilang -g -Wall 4 | 5 | 6 | bootstrap: minilang 7 | ./minilang minilang.mini minilang.s 8 | gcc minilang.s -o minilang_jr 9 | ./minilang_jr minilang.mini minilang_jr.s 10 | diff minilang.s minilang_jr.s 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # minilang 2 | 3 | Inspired by [BASICO](http://www.andreadrian.de/tbng/index.html), minilang is a 4 | little programming language. My intention was to keep the implementation brief, 5 | yet readable. It is bootstrapped, thus proving to be reasonably expressive. The 6 | compiler outputs x86-64 ASM code. Similar to Python, code blocks are formed 7 | via indentation. There is no type checking whatsoever, so be careful. Please, 8 | look into `minilang.mini` and you will quickly get the idea. 
Obligatorily, 9 | here is your first programme: 10 | 11 | main():puts("Hello, world!") 12 | 13 | Compile the throwaway compiler: 14 | 15 | $ c99 minilang.c -o minilang 16 | 17 | Bootstrap thus: 18 | 19 | $ ./minilang minilang.mini minilang.s # compile with throwaway compiler 20 | $ gcc minilang.s -o minilang_jr # GCC assembles the executable 21 | $ ./minilang_jr minilang.mini minilang_jr.s # compile with bootstrapped compiler 22 | $ diff minilang.s minilang_jr.s # compare output -> equal 23 | 24 | -------------------------------------------------------------------------------- /hello.mini: -------------------------------------------------------------------------------- 1 | main():puts("Hello, world!") 2 | -------------------------------------------------------------------------------- /minilang.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | 9 | enum { 10 | LEX_EOF = EOF, 11 | LEX_ASM = 0, 12 | LEX_IF, 13 | LEX_ELSE, 14 | LEX_ELIF, 15 | LEX_WHILE, 16 | LEX_BREAK, 17 | LEX_CONTINUE, 18 | LEX_RETURN, 19 | LEX_VAR, 20 | LEX_KEYWORD_COUNT, 21 | LEX_BLOCK_END, 22 | LEX_CHAR, 23 | LEX_STRING, 24 | LEX_NUMBER, 25 | LEX_IDENT, 26 | LEX_ASM_LINE, 27 | LEX_LE, 28 | LEX_GE, 29 | LEX_EQ, 30 | LEX_NE, 31 | LEX_SIZE 32 | }; 33 | 34 | const char* keywords[] = { 35 | "asm", 36 | "if", 37 | "else", 38 | "elif", 39 | "while", 40 | "break", 41 | "continue", 42 | "return", 43 | "var", 44 | NULL, 45 | "block end", 46 | "character", 47 | "string", 48 | "number", 49 | "identifier", 50 | }; 51 | 52 | 53 | // scanner 54 | int character; 55 | int lexeme; 56 | char token[1024]; 57 | long long number; 58 | int neg_number; 59 | int line_number = 0; 60 | int cursor_pos = 0; 61 | 62 | int brackets = 0; 63 | int block = 0; 64 | int indent = 0; 65 | int newline = 1; 66 | int asm_active = 0; 67 | 68 | FILE* src_file; 69 | FILE* dst_file; 70 | 71 | 72 | void error(char* msg, ...) 
{ 73 | fprintf(stderr, "%d:%d: error: ", line_number, cursor_pos); 74 | va_list args; 75 | va_start(args, msg); 76 | vfprintf(stderr, msg, args); 77 | va_end(args); 78 | fprintf(stderr, "\n"); 79 | exit(1); 80 | } 81 | 82 | 83 | void output(char* msg, ...) { 84 | va_list args; 85 | va_start(args, msg); 86 | vfprintf(dst_file, msg, args); 87 | va_end(args); 88 | } 89 | 90 | 91 | int read_char() { 92 | int c = character; 93 | character = fgetc(src_file); 94 | cursor_pos++; 95 | if(character == '\n') { 96 | line_number++; 97 | cursor_pos = 0; 98 | } 99 | return c; 100 | } 101 | 102 | 103 | int scan() { 104 | space: 105 | while(isspace(character)) { 106 | if(newline) { 107 | if(character == ' ') indent++; 108 | if(character == '\t') indent = (indent & ~3) + 4; 109 | } 110 | if(character == '\n') { 111 | indent = 0; 112 | int n = newline; 113 | newline = 1; 114 | if(n == 0 && brackets == 0) return ';'; 115 | } 116 | read_char(); 117 | } 118 | 119 | // ignore comment 120 | if(character == '#') { 121 | while(character != '\n') read_char(); 122 | goto space; 123 | } 124 | 125 | // indent 126 | if(!brackets) { 127 | if(indent > block) error("invalid indentation"); 128 | if(indent < block) { 129 | asm_active = 0; 130 | block -= 4; 131 | return LEX_BLOCK_END; 132 | } 133 | } 134 | 135 | // asm line 136 | if(asm_active) { 137 | int i = 0; 138 | while(character != '\n') { 139 | token[i] = read_char(); 140 | i++; 141 | } 142 | token[i] = '\0'; 143 | return LEX_ASM_LINE; 144 | } 145 | 146 | newline = 0; 147 | // one character token 148 | if(strchr("-+*/%&|~!=<>;:()[],@{}", character)) { 149 | int c = read_char(); 150 | if(c == ':') { // new block 151 | block += 4; 152 | indent += 4; 153 | } 154 | else if(strchr("<>!=", c) && character == '=') { 155 | read_char(); 156 | switch(c) { 157 | case '<': return LEX_LE; 158 | case '>': return LEX_GE; 159 | case '=': return LEX_EQ; 160 | case '!': return LEX_NE; 161 | } 162 | } 163 | else if(c == '(' || c == '[') brackets++; 164 | else 
if(c == ')' || c == ']') brackets--; 165 | if(isdigit(character)) neg_number = (c == '-'); 166 | return c; 167 | } 168 | 169 | // char 170 | if(character == '\'') { 171 | read_char(); 172 | token[0] = '\''; 173 | int i = 1; 174 | if(character == '\\') token[i++] = read_char(); 175 | token[i++] = read_char(); 176 | token[i++] = '\''; 177 | token[i] = '\0'; 178 | if(read_char() != '\'') error("bad character literal"); 179 | return LEX_CHAR; 180 | } 181 | 182 | // string 183 | if(character == '"') { 184 | int i = 0; 185 | do { 186 | if(character == '\\') token[i++] = read_char(); 187 | token[i++] = read_char(); 188 | if(i > 1020) error("string too long"); 189 | } while(character != '"'); 190 | token[i++] = read_char(); 191 | token[i] = '\0'; 192 | return LEX_STRING; 193 | } 194 | 195 | // number 196 | if(isdigit(character)) { 197 | int i = 0; 198 | do { 199 | token[i++] = read_char(); 200 | if(i > 20) error("number too long"); 201 | } while(isdigit(character)); 202 | token[i] = '\0'; 203 | number = atoll(token); 204 | return LEX_NUMBER; 205 | } 206 | 207 | // identifier and keywords 208 | if(isalpha(character) || character == '_') { 209 | int i = 0; 210 | do { 211 | token[i++] = read_char(); 212 | if(i > 62) error("identifier too long"); 213 | } while(isalnum(character) || character == '_'); 214 | token[i] = '\0'; 215 | 216 | // check for keywords 217 | for(i = 0; i < LEX_KEYWORD_COUNT; i++) { 218 | if(strcmp(token, keywords[i]) == 0) return i; 219 | } 220 | return LEX_IDENT; 221 | } 222 | 223 | if(character != EOF) error("unknown character"); 224 | if(block > 0) { 225 | block -= 4; 226 | return LEX_BLOCK_END; 227 | } 228 | return LEX_EOF; 229 | } 230 | 231 | 232 | void read_lexeme() { lexeme = scan(); } 233 | 234 | 235 | void expect(int l) { 236 | if(lexeme != l) { 237 | if(l < LEX_SIZE) error("%s expected", keywords[l]); 238 | else error("<%c> expected", l); 239 | } 240 | read_lexeme(); 241 | } 242 | 243 | 244 | // symbol table 245 | typedef struct { 246 | char 
name[64]; 247 | int offset; 248 | } Variable; 249 | 250 | Variable locals[1024]; 251 | int local_count; 252 | 253 | 254 | // code generation 255 | const char* call_regs[] = { "rdi", "rsi", "rdx", "rcx", "r8", "r9" }; 256 | const char* regs[] = { "r8", "r9", "r11", "rax" }; 257 | enum { 258 | cache_size = sizeof(regs) / sizeof(char*) 259 | }; 260 | int cache[cache_size]; 261 | int stack_size; 262 | int label = 0; 263 | int while_labels[256]; 264 | int while_level = -1; 265 | 266 | 267 | const char* regname(int i) { return regs[cache[i]]; } 268 | 269 | 270 | void init_cache() { 271 | for(int i = 0; i < cache_size; i++) cache[i] = i; 272 | stack_size = 0; 273 | } 274 | 275 | 276 | Variable* lookup_local() { 277 | for(int i = 0; i < local_count; i++) { 278 | if(strcmp(token, locals[i].name) == 0) return &locals[i]; 279 | } 280 | return NULL; 281 | } 282 | 283 | 284 | void add_local(int offset) { 285 | for(int i = 0; i < 1024; i++) { 286 | if(i == local_count) { 287 | strcpy(locals[i].name, token); 288 | locals[i].offset = offset; 289 | local_count++; 290 | return; 291 | } 292 | if(strcmp(token, locals[i].name) == 0) error("multiple declarations"); 293 | } 294 | error("too many variables"); 295 | } 296 | 297 | 298 | void push() { 299 | int i = cache_size - 1; 300 | int tmp = cache[i]; 301 | if(stack_size >= cache_size) output("\tpush %s\n", regs[tmp]); 302 | while(i > 0) { 303 | cache[i] = cache[i - 1]; 304 | i--; 305 | } 306 | cache[0] = tmp; 307 | stack_size++; 308 | } 309 | 310 | 311 | void pop() { 312 | stack_size--; 313 | if(stack_size == 0) init_cache(); 314 | else { 315 | int i = 0; 316 | int tmp = cache[0]; 317 | while(i < cache_size - 1) { 318 | cache[i] = cache[i + 1]; 319 | i++; 320 | } 321 | cache[i] = tmp; 322 | if(stack_size >= cache_size) output("\tpop %s\n", regs[i]); 323 | } 324 | } 325 | 326 | 327 | int is_expr_beginning() { 328 | static const int lexemes[] = { 329 | '-', '!', '(', LEX_NUMBER, LEX_CHAR, LEX_STRING, LEX_IDENT 330 | }; 331 | for(int i = 
0; i < sizeof(lexemes) / sizeof(int); i++) 332 | if(lexeme == lexemes[i]) return 1; 333 | return 0; 334 | } 335 | 336 | 337 | int is_stmt_beginning() { 338 | static const int lexemes[] = { 339 | LEX_ASM, LEX_IF, LEX_WHILE, LEX_BREAK, LEX_CONTINUE, LEX_RETURN, ';' 340 | }; 341 | for(int i = 0; i < sizeof(lexemes) / sizeof(int); i++) 342 | if(lexeme == lexemes[i]) return 1; 343 | return is_expr_beginning(); 344 | } 345 | 346 | 347 | void expression(); 348 | void expr_level_zero() { 349 | 350 | if(lexeme == '!') { 351 | read_lexeme(); 352 | expr_level_zero(); 353 | output("\ttest %s, %s\n", regname(0), regname(0)); 354 | output("\tsetz cl\n"); 355 | output("\tmovzx %s, cl\n", regname(0)); 356 | return; 357 | } 358 | if(lexeme == '-') { 359 | if(!neg_number) { 360 | read_lexeme(); 361 | expr_level_zero(); 362 | output("\tneg %s\n", regname(0)); 363 | return; 364 | } 365 | read_lexeme(); 366 | push(); 367 | output("\tmov %s, %lld\n", regname(0), -number); // number is long long, so the conversion must be %lld 368 | read_lexeme(); 369 | } 370 | else if(lexeme == LEX_NUMBER) { 371 | push(); 372 | output("\tmov %s, %lld\n", regname(0), number); // number is long long, so the conversion must be %lld 373 | read_lexeme(); 374 | } 375 | else if(lexeme == LEX_CHAR) { 376 | push(); 377 | output("\tmov %s, %s\n", regname(0), token); 378 | read_lexeme(); 379 | } 380 | else if(lexeme == '(') { 381 | read_lexeme(); 382 | expression(); 383 | expect(')'); 384 | } 385 | else if(lexeme == LEX_IDENT) { 386 | char name[64]; 387 | strcpy(name, token); 388 | Variable* v = lookup_local(); 389 | 390 | read_lexeme(); 391 | if(lexeme == '(') { // function call 392 | // save used regs on stack 393 | int i = stack_size; 394 | if(i > cache_size) i = cache_size; 395 | while(i-- > 0) output("\tpush %s\n", regname(i)); 396 | 397 | int old_size = stack_size; 398 | stack_size = 0; 399 | 400 | // expr list 401 | int args = 0; 402 | read_lexeme(); 403 | if(is_expr_beginning()) { 404 | args++; 405 | expression(); 406 | output("\tpush %s\n", regname(0)); 407 | pop(); 408 | while(lexeme == ',') { 409 | read_lexeme(); 
410 | args++; 411 | if(args > 6) error("too many arguments"); 412 | expression(); 413 | output("\tpush %s\n", regname(0)); 414 | pop(); 415 | } 416 | } 417 | expect(')'); 418 | 419 | // set-up registers 420 | for(int i = args - 1; i >= 0; i--) { 421 | output("\tpop %s\n", call_regs[i]); 422 | } 423 | 424 | // call 425 | output("\txor rax, rax\n"); 426 | output("\tcall %s@PLT\n", name); 427 | 428 | init_cache(); 429 | push(); 430 | stack_size = old_size + 1; 431 | int m = stack_size; 432 | if(m > cache_size) m = cache_size; 433 | for(i = 1; i < m; i++) { 434 | output("\tpop %s\n", regname(i)); 435 | } 436 | } 437 | else if(lexeme == '=') { 438 | read_lexeme(); 439 | expression(); 440 | if(!v) 441 | output("\tmov %s[rip], %s\n", name, regname(0)); 442 | else 443 | output("\tmov QWORD PTR [rbp - %d], %s\n", v->offset, regname(0)); 444 | } 445 | else { 446 | push(); 447 | if(!v) 448 | output("\tmov %s, %s[rip]\n", regname(0), name); 449 | else 450 | output("\tmov %s, QWORD PTR [rbp - %d]\n", regname(0), v->offset); 451 | } 452 | } 453 | else if(lexeme == '@') { 454 | // dereference 455 | error("not implementet yet"); 456 | } 457 | else if(lexeme == LEX_STRING) { 458 | push(); 459 | output("\t.section .rodata\n"); 460 | output("LC%d:\n", label); 461 | output("\t.string %s\n", token); 462 | output("\t.text\n"); 463 | output("\tlea %s, LC%d[rip]\n", regname(0), label); 464 | label++; 465 | read_lexeme(); 466 | } 467 | else error("bad expression"); 468 | 469 | while(lexeme == '[') { 470 | read_lexeme(); 471 | expression(); 472 | expect(']'); 473 | if(lexeme == '=') { 474 | read_lexeme(); 475 | expression(); 476 | output("\tmov QWORD PTR [%s + %s * 8], %s\n", regname(2), regname(1), regname(0)); 477 | int tmp = cache[2]; 478 | cache[2] = cache[0]; 479 | cache[0] = tmp; 480 | pop(); 481 | pop(); 482 | return; 483 | } 484 | output("\tmov %s, QWORD PTR [%s + %s * 8]\n", regname(1), regname(1), regname(0)); 485 | pop(); 486 | } 487 | if(lexeme == '{') { 488 | read_lexeme(); 489 
| expression(); 490 | expect('}'); 491 | if(lexeme == '=') { 492 | read_lexeme(); 493 | expression(); 494 | output("\tmov rcx, %s\n", regname(0)); 495 | output("\tmov BYTE PTR [%s + %s], cl\n", regname(2), regname(1)); 496 | int tmp = cache[2]; 497 | cache[2] = cache[0]; 498 | cache[0] = tmp; 499 | pop(); 500 | pop(); 501 | return; 502 | } 503 | output("\tmov cl, BYTE PTR [%s + %s]\n", regname(1), regname(0)); 504 | output("\tmovzx %s, cl\n", regname(1)); 505 | pop(); 506 | } 507 | 508 | } 509 | 510 | 511 | void expr_level_one() { 512 | expr_level_zero(); 513 | while(strchr("*%/", lexeme)) { 514 | if(lexeme == '*') { 515 | read_lexeme(); 516 | expr_level_zero(); 517 | output("\timul %s, %s\n", regname(1), regname(0)); 518 | pop(); 519 | } 520 | else if(lexeme == '%') { 521 | error("TODO"); 522 | } 523 | else if(lexeme == '/') { 524 | error("TODO"); 525 | } 526 | } 527 | } 528 | 529 | 530 | void expr_level_two() { 531 | expr_level_one(); 532 | while(strchr("+-", lexeme)) { 533 | if(lexeme == '+') { 534 | read_lexeme(); 535 | expr_level_one(); 536 | output("\tadd %s, %s\n", regname(1), regname(0)); 537 | pop(); 538 | } 539 | else if(lexeme == '-') { 540 | read_lexeme(); 541 | expr_level_one(); 542 | output("\tsub %s, %s\n", regname(1), regname(0)); 543 | pop(); 544 | } 545 | } 546 | } 547 | 548 | 549 | void expr_level_three() { 550 | expr_level_two(); 551 | char* comp; 552 | switch(lexeme) { 553 | case '<': comp = "l"; break; 554 | case '>': comp = "g"; break; 555 | case LEX_LE: comp = "le"; break; 556 | case LEX_GE: comp = "ge"; break; 557 | case LEX_EQ: comp = "e"; break; 558 | case LEX_NE: comp = "ne"; break; 559 | default: return; 560 | } 561 | read_lexeme(); 562 | expr_level_two(); 563 | output("\tcmp %s, %s\n", regname(1), regname(0)); 564 | output("\tset%s cl\n", comp); 565 | output("\tmovzx %s, cl\n", regname(1)); 566 | pop(); 567 | } 568 | 569 | 570 | void expr_level_four() { 571 | expr_level_three(); 572 | while(lexeme == '&') { 573 | read_lexeme(); 574 | 
expr_level_three(); 575 | output("\tand %s, %s\n", regname(1), regname(0)); 576 | pop(); 577 | } 578 | } 579 | 580 | 581 | void expression() { 582 | expr_level_four(); 583 | while(lexeme == '|') { 584 | read_lexeme(); 585 | expr_level_four(); 586 | output("\tor %s, %s\n", regname(1), regname(0)); 587 | pop(); 588 | } 589 | } 590 | 591 | 592 | void statement(); 593 | void statement_list() { 594 | while(is_stmt_beginning()) statement(); 595 | } 596 | 597 | 598 | void statement() { 599 | if(lexeme == LEX_ASM) { 600 | read_lexeme(); 601 | asm_active = 1; 602 | newline = 1; 603 | expect(':'); 604 | while(lexeme == LEX_ASM_LINE) { 605 | if(lexeme == LEX_ASM_LINE) output("\t%s\n", token); 606 | read_lexeme(); 607 | } 608 | expect(LEX_BLOCK_END); 609 | } 610 | else if(lexeme == LEX_IF) { 611 | read_lexeme(); 612 | expression(); 613 | expect(':'); 614 | int l_end = label++; 615 | int l_next = label++; 616 | int end = 0; 617 | output("\ttest %s, %s\n", regname(0), regname(0)); 618 | output("\tjz .L%d\n", l_next); 619 | init_cache(); 620 | statement_list(); 621 | expect(LEX_BLOCK_END); 622 | if(lexeme == LEX_ELIF || lexeme == LEX_ELSE) { 623 | output("\tjmp .L%d\n", l_end); 624 | end = 1; 625 | } 626 | output(".L%d:\n", l_next); 627 | while(lexeme == LEX_ELIF) { 628 | read_lexeme(); 629 | expression(); 630 | expect(':'); 631 | l_next = label++; 632 | output("\ttest %s, %s\n", regname(0), regname(0)); 633 | output("\tjz .L%d\n", l_next); 634 | init_cache(); 635 | statement_list(); 636 | expect(LEX_BLOCK_END); 637 | if(lexeme == LEX_ELIF || lexeme == LEX_ELSE) 638 | output("\tjmp .L%d\n", l_end); 639 | output(".L%d:\n", l_next); 640 | } 641 | if(lexeme == LEX_ELSE) { 642 | read_lexeme(); 643 | expect(':'); 644 | init_cache(); 645 | statement_list(); 646 | expect(LEX_BLOCK_END); 647 | } 648 | if(end) output(".L%d:\n", l_end); 649 | } 650 | else if(lexeme == LEX_WHILE) { 651 | read_lexeme(); 652 | while_level++; 653 | if(while_level == 256) error("while nesting limit exceeded"); 
654 | while_labels[while_level] = label; 655 | label += 2; 656 | output(".L%d:\n", while_labels[while_level]); 657 | expression(); 658 | expect(':'); 659 | output("\ttest %s, %s\n", regname(0), regname(0)); 660 | output("\tjz .L%d\n", while_labels[while_level] + 1); 661 | init_cache(); 662 | statement_list(); 663 | expect(LEX_BLOCK_END); 664 | output("\tjmp .L%d\n", while_labels[while_level]); 665 | output(".L%d:\n", while_labels[while_level] + 1); 666 | while_level--; 667 | } 668 | else if(lexeme == LEX_BREAK) { 669 | read_lexeme(); 670 | if(while_level < 0) error("break without while"); 671 | output("\tjmp .L%d\n", while_labels[while_level] + 1); 672 | } 673 | else if(lexeme == LEX_CONTINUE) { 674 | read_lexeme(); 675 | if(while_level < 0) error("continue without while"); 676 | output("\tjmp .L%d\n", while_labels[while_level]); 677 | } 678 | else if(lexeme == LEX_RETURN) { 679 | read_lexeme(); 680 | if(is_expr_beginning()) { 681 | expression(); 682 | if(strcmp(regname(0), "rax") != 0) 683 | output("\tmov rax, %s\n", regname(0)); 684 | pop(); 685 | } 686 | output("\tleave\n"); 687 | output("\tret\n"); 688 | } 689 | else if(is_expr_beginning()) { 690 | expression(); 691 | pop(); 692 | } 693 | else expect(';'); 694 | } 695 | 696 | 697 | void minilang() { 698 | line_number = 1; 699 | read_char(); 700 | read_lexeme(); 701 | 702 | output("\t.intel_syntax noprefix\n"); 703 | output("\t.text\n"); 704 | 705 | while(lexeme != LEX_EOF) { 706 | 707 | // global variables 708 | while(lexeme == LEX_VAR) { 709 | read_lexeme(); 710 | expect(LEX_IDENT); 711 | output("\t.comm %s, 8, 8\n", token); 712 | while(lexeme == ',') { 713 | read_lexeme(); 714 | expect(LEX_IDENT); 715 | output("\t.comm %s, 8, 8\n", token); 716 | } 717 | while(lexeme == ';') read_lexeme(); 718 | } 719 | 720 | expect(LEX_IDENT); 721 | output("\t.globl %s\n", token); 722 | output("%s:\n", token); 723 | output("\tpush rbp\n"); 724 | output("\tmov rbp, rsp\n"); 725 | 726 | int frame = 0; 727 | local_count = 0; 728 
| 729 | // parameter list 730 | int params = 0; 731 | expect('('); 732 | if(lexeme == LEX_IDENT) { 733 | params++; 734 | expect(LEX_IDENT); 735 | frame += 8; 736 | add_local(frame); 737 | while(lexeme == ',') { 738 | read_lexeme(); 739 | params++; 740 | if(params > 6) error("too many arguments"); 741 | expect(LEX_IDENT); 742 | frame += 8; 743 | add_local(frame); 744 | } 745 | } 746 | expect(')'); 747 | expect(':'); 748 | 749 | // local variables 750 | while(lexeme == ';') read_lexeme(); 751 | while(lexeme == LEX_VAR) { 752 | read_lexeme(); 753 | expect(LEX_IDENT); 754 | frame += 8; 755 | add_local(frame); 756 | while(lexeme == ',') { 757 | read_lexeme(); 758 | expect(LEX_IDENT); 759 | frame += 8; 760 | add_local(frame); 761 | } 762 | while(lexeme == ';') read_lexeme(); 763 | } 764 | 765 | if(frame > 0) output("\tsub rsp, %d\n", frame); 766 | for(int i = 0; i < params; i++) { 767 | output("\tmov QWORD PTR [rbp - %d], %s\n", i * 8 + 8, call_regs[i]); 768 | } 769 | 770 | init_cache(); 771 | statement_list(); 772 | output("\tleave\n"); 773 | output("\tret\n"); 774 | expect(LEX_BLOCK_END); 775 | 776 | } 777 | } 778 | 779 | 780 | void cleanup() { 781 | if(src_file) fclose(src_file); 782 | if(dst_file && dst_file != stdin) fclose(dst_file); 783 | } 784 | 785 | 786 | int main(int argc, char** argv) { 787 | 788 | if(argc < 2 || argc > 3) { 789 | printf("usuage: %s [output]\n", argv[0]); 790 | exit(0); 791 | } 792 | 793 | src_file = fopen(argv[1], "r"); 794 | if(!src_file) error("opening source file failed"); 795 | 796 | if(argc == 3) { 797 | dst_file = fopen(argv[2], "w"); 798 | if(!dst_file) error("opening output file failed"); 799 | } 800 | else dst_file = stdout; 801 | atexit(cleanup); 802 | minilang(); 803 | return 0; 804 | } 805 | 806 | -------------------------------------------------------------------------------- /minilang.mini: -------------------------------------------------------------------------------- 1 | # minilang compiler in minilang 2 | 3 | 4 | var 
src_file, dst_file 5 | 6 | # scanner 7 | var line_number, cursor_pos 8 | var character, lexeme 9 | var token 10 | var number, is_negative 11 | var brackets, block, indent, newline, asm_active 12 | var keywords 13 | 14 | 15 | error(msg, a1, a2, a3, a4): 16 | fprintf(stderr, "%d:%d: error: ", line_number, cursor_pos) 17 | fprintf(stderr, msg, a1, a2, a3, a4) 18 | fprintf(stderr, "\n") 19 | exit(1) 20 | 21 | 22 | output(msg, a1, a2, a3, a4): fprintf(dst_file, msg, a1, a2, a3, a4) 23 | 24 | 25 | read_char(): 26 | var c 27 | c = character 28 | character = fgetc(src_file) 29 | cursor_pos = cursor_pos + 1 30 | if character == '\n': 31 | line_number = line_number + 1 32 | cursor_pos = 0 33 | return c 34 | 35 | 36 | scan(): 37 | var i 38 | 39 | while 1: 40 | # whitespace 41 | while isspace(character): 42 | if newline: 43 | if character == ' ': indent = indent + 1 44 | elif character == '\t': indent = (indent & -4) + 4 45 | if character == '\n': 46 | indent = 0 47 | i = newline 48 | newline = 1 49 | if i == 0 & brackets == 0: return ';' 50 | read_char() 51 | 52 | # comment 53 | if character == '#': 54 | while character != '\n': read_char() 55 | else: break 56 | 57 | # indent 58 | if !brackets: 59 | if indent > block: error("invalid indentation") 60 | if indent < block: 61 | asm_active = 0 62 | block = block - 4 63 | return 'E' 64 | 65 | # asm line 66 | if asm_active: 67 | i = 0 68 | while character != '\n': 69 | token{i} = read_char() 70 | i = i + 1 71 | token{i} = 0 72 | return 'A' 73 | 74 | newline = 0 75 | 76 | if character == ':': 77 | read_char() 78 | block = block + 4 79 | indent = indent + 4 80 | return ':' 81 | 82 | if strchr("-+*&|!=<>;()[]{},", character): 83 | i = read_char() 84 | if strchr("<>=!", i) != 0 & character == '=': 85 | read_char() 86 | if i == '<': return 'L' 87 | if i == '>': return 'M' 88 | if i == '=': return 'Q' 89 | if i == '!': return 'T' 90 | if i == '[' | i == '(': brackets = brackets + 1 91 | elif i == ']' | i == ')': brackets = brackets - 1 
92 | is_negative = isdigit(character) != 0 & i == '-' # recomputed for every token so a stale flag cannot leak into a later unary minus 93 | return i 94 | 95 | # char 96 | if character == '\'': 97 | token{0} = read_char() 98 | i = 1 99 | if character == '\\': token{1} = read_char(); i = 2 100 | token{i} = read_char() 101 | token{i + 1} = '\'' 102 | token{i + 2} = 0 103 | if read_char() != '\'': error("bad character literal") 104 | return 'C' 105 | 106 | # string 107 | if character == '"': 108 | i = 0 109 | while 1: 110 | if character == '\\': token{i} = read_char(); i = i + 1 111 | token{i} = read_char(); i = i + 1 112 | if i > 1020: error("string too long") 113 | if character == '"': break 114 | token{i} = read_char() 115 | token{i + 1} = 0 116 | return 'S' 117 | 118 | # number 119 | if isdigit(character): 120 | i = 0 121 | while isdigit(character): 122 | token{i} = read_char() 123 | i = i + 1 124 | if i > 20: error("number too long") 125 | token{i} = 0 126 | number = atoll(token) 127 | return 'N' 128 | 129 | # identifier/keyword 130 | if isalpha(character) | character == '_': 131 | token{0} = read_char() 132 | i = 1 133 | while isalnum(character) | character == '_': 134 | token{i} = read_char() 135 | i = i + 1 136 | if i > 62: error("identifier too long") 137 | token{i} = 0 138 | 139 | # check for keyword 140 | i = 0 141 | while i < 9: 142 | if strcmp(token, keywords[i]) == 0: return "ailfwbcrv"{i} 143 | i = i + 1 144 | return 'I' 145 | 146 | if character < 0: error("unknown character") 147 | if block > 0: block = block - 4; return 'E' 148 | return '0' 149 | 150 | 151 | read_lexeme(): lexeme = scan() 152 | 153 | 154 | expect(l): 155 | if lexeme != l: error("read <%c>, but <%c> expected", lexeme, l) 156 | read_lexeme() 157 | 158 | 159 | # parser 160 | var local_count, local_names, local_offsets 161 | var cache, stack_size, frame 162 | var label, while_labels, while_level 163 | var regs, callregs 164 | 165 | 166 | regname(i): return regs[cache[i]] 167 | 168 | 169 | init_cache(): 170 | var i; i = 0 171 | while i < 4: cache[i] = i; i = i + 1 172 | 
stack_size = 0 173 | 174 | 175 | lookup_local(): 176 | var i; i = 0 177 | while i < local_count: 178 | if strcmp(token, local_names[i]) == 0: return i 179 | i = i + 1 180 | return -1 181 | 182 | 183 | add_local(): 184 | var i 185 | frame = frame + 8 186 | i = 0 187 | while i < 256: 188 | if i == local_count: 189 | strcpy(local_names[i], token) 190 | local_offsets[i] = frame 191 | local_count = local_count + 1 192 | return 193 | if strcmp(token, local_names[i]) == 0: error("multiple declarations") 194 | i = i + 1 195 | error("too many variables") 196 | 197 | 198 | push(): 199 | var i, tmp 200 | i = 3 201 | tmp = cache[i] 202 | if stack_size >= 4: output("\tpush %s\n", regs[tmp]) 203 | while i > 0: 204 | cache[i] = cache[i - 1] 205 | i = i - 1 206 | cache[0] = tmp 207 | stack_size = stack_size + 1 208 | 209 | 210 | pop(): 211 | var i, tmp 212 | stack_size = stack_size - 1 213 | if stack_size == 0: init_cache() 214 | else: 215 | i = 0 216 | tmp = cache[0] 217 | while i < 3: 218 | cache[i] = cache[i + 1] 219 | i = i + 1 220 | cache[i] = tmp 221 | if stack_size >= 4: output("\tpop %s\n", regs[tmp]) # refill the register that just rotated into the bottom slot; regs[i] was always rax 222 | 223 | 224 | is_expr_beginning(): return strchr("-!(NCSI", lexeme) != 0 225 | is_stmt_beginning(): return is_expr_beginning() | strchr("aiwbcr;", lexeme) != 0 226 | 227 | 228 | expr_0(): 229 | var i, v, s, name 230 | if lexeme == '!': 231 | read_lexeme() 232 | expr_0() 233 | output("\ttest %s, %s\n", regname(0), regname(0)) 234 | output("\tsetz cl\n") 235 | output("\tmovzx %s, cl\n", regname(0)) 236 | return 237 | if lexeme == '-': 238 | if !is_negative: 239 | read_lexeme() 240 | expr_0() 241 | output("\tneg %s\n", regname(0)) 242 | return 243 | read_lexeme() 244 | push() 245 | output("\tmov %s, %ld\n", regname(0), -number) 246 | read_lexeme() 247 | elif lexeme == 'N': 248 | push() 249 | output("\tmov %s, %ld\n", regname(0), number) 250 | read_lexeme() 251 | elif lexeme == 'C': 252 | push() 253 | output("\tmov %s, %s\n", regname(0), token) 254 | read_lexeme() 255 | elif 
lexeme == '(': 256 | read_lexeme() 257 | expression() 258 | expect(')') 259 | elif lexeme == 'I': 260 | name = malloc(64) 261 | strcpy(name, token) 262 | v = lookup_local() 263 | read_lexeme() 264 | 265 | if lexeme == '(': # function call 266 | 267 | # save regs 268 | i = stack_size 269 | if i > 4: i = 4 270 | while i > 0: 271 | i = i - 1 272 | output("\tpush %s\n", regname(i)) 273 | 274 | s = stack_size 275 | stack_size = 0 276 | 277 | read_lexeme() 278 | i = 0 279 | if is_expr_beginning(): 280 | i = i + 1 281 | expression() 282 | output("\tpush %s\n", regname(0)) 283 | pop() 284 | while lexeme == ',': 285 | read_lexeme() 286 | i = i + 1 287 | if i > 6: error("too many arguments") 288 | expression() 289 | output("\tpush %s\n", regname(0)) 290 | pop() 291 | expect(')') 292 | 293 | # set-up regs 294 | while i > 0: 295 | i = i - 1 296 | output("\tpop %s\n", callregs[i]) 297 | output("\txor rax, rax\n") 298 | output("\tcall %s@PLT\n", name) 299 | 300 | # return value in rax 301 | init_cache() 302 | push() 303 | stack_size = s + 1 304 | 305 | if s > 3: s = 3 306 | i = 1 307 | while i <= s: 308 | output("\tpop %s\n", regname(i)) 309 | i = i + 1 310 | 311 | elif lexeme == '=': # assignment 312 | read_lexeme() 313 | expression() 314 | if v < 0: output("\tmov %s[rip], %s\n", name, regname(0)) 315 | else: output("\tmov QWORD PTR [rbp - %d], %s\n", 316 | local_offsets[v], regname(0)) 317 | else: 318 | push() 319 | if v < 0: output("\tmov %s, %s[rip]\n", regname(0), name) 320 | else: output("\tmov %s, QWORD PTR [rbp - %d]\n", 321 | regname(0), local_offsets[v]) 322 | free(name) 323 | 324 | elif lexeme == 'S': # string literal 325 | push() 326 | output("\t.section .rodata\n") 327 | output("LC%d:\n", label) 328 | output("\t.string %s\n", token) 329 | output("\t.text\n") 330 | output("\tlea %s, LC%d[rip]\n", regname(0), label) 331 | label = label + 1 332 | read_lexeme() 333 | else: error("bad expression") 334 | 335 | while lexeme == '[': 336 | read_lexeme() 337 | expression() 
338 | expect(']') 339 | if lexeme == '=': 340 | read_lexeme() 341 | expression() 342 | output("\tmov QWORD PTR [%s + %s * 8], %s\n", 343 | regname(2), regname(1), regname(0)) 344 | i = cache[2] 345 | cache[2] = cache[0] 346 | cache[0] = i 347 | pop() 348 | pop() 349 | return 350 | output("\tmov %s, QWORD PTR [%s + %s * 8]\n", 351 | regname(1), regname(1), regname(0)) 352 | pop() 353 | 354 | if lexeme == '{': 355 | read_lexeme() 356 | expression() 357 | expect('}') 358 | if lexeme == '=': 359 | read_lexeme() 360 | expression() 361 | output("\tmov rcx, %s\n", regname(0)) 362 | output("\tmov BYTE PTR [%s + %s], cl\n", regname(2), regname(1)) 363 | i = cache[2] 364 | cache[2] = cache[0] 365 | cache[0] = i 366 | pop() 367 | pop() 368 | return 369 | 370 | output("\tmov cl, BYTE PTR [%s + %s]\n", regname(1), regname(0)) 371 | output("\tmovzx %s, cl\n", regname(1)) 372 | pop() 373 | 374 | 375 | expr_1(): 376 | expr_0() 377 | while lexeme == '*': 378 | read_lexeme() 379 | expr_0() 380 | output("\timul %s, %s\n", regname(1), regname(0)) 381 | pop() 382 | 383 | 384 | expr_2(): 385 | expr_1() 386 | while strchr("+-", lexeme): 387 | if lexeme == '+': 388 | read_lexeme() 389 | expr_1() 390 | output("\tadd %s, %s\n", regname(1), regname(0)) 391 | pop() 392 | elif lexeme == '-': 393 | read_lexeme() 394 | expr_1() 395 | output("\tsub %s, %s\n", regname(1), regname(0)) 396 | pop() 397 | 398 | 399 | expr_3(): 400 | var c 401 | expr_2() 402 | if lexeme == '<': c = "l" 403 | elif lexeme == '>': c = "g" 404 | elif lexeme == 'L': c = "le" 405 | elif lexeme == 'M': c = "ge" 406 | elif lexeme == 'Q': c = "e" 407 | elif lexeme == 'T': c = "ne" 408 | else: return 409 | read_lexeme() 410 | expr_2() 411 | output("\tcmp %s, %s\n", regname(1), regname(0)) 412 | output("\tset%s cl\n", c) 413 | output("\tmovzx %s, cl\n", regname(1)) 414 | pop() 415 | 416 | 417 | expr_4(): 418 | expr_3() 419 | while lexeme == '&': 420 | read_lexeme() 421 | expr_3() 422 | output("\tand %s, %s\n", regname(1), 
regname(0)) 423 | pop() 424 | 425 | 426 | expression(): 427 | expr_4() 428 | while lexeme == '|': 429 | read_lexeme() 430 | expr_4() 431 | output("\tor %s, %s\n", regname(1), regname(0)) 432 | pop() 433 | 434 | 435 | statement_list(): while is_stmt_beginning(): statement() 436 | 437 | 438 | statement(): 439 | var l_end, l_next, end 440 | if lexeme == 'a': 441 | read_lexeme() 442 | asm_active = 1 443 | newline = 1 444 | expect(':') 445 | while lexeme == 'A': 446 | output("\t%s\n", token) 447 | read_lexeme() 448 | expect('E') 449 | elif lexeme == 'i': 450 | read_lexeme() 451 | expression() 452 | expect(':') 453 | l_end = label 454 | l_next = label + 1 455 | label = label + 2 456 | end = 0 457 | output("\ttest %s, %s\n", regname(0), regname(0)) 458 | output("\tjz .L%d\n", l_next) 459 | init_cache() 460 | statement_list() 461 | expect('E') 462 | if strchr("lf", lexeme): 463 | output("\tjmp .L%d\n", l_end) 464 | end = 1 465 | output(".L%d:\n", l_next) 466 | while lexeme == 'f': 467 | read_lexeme() 468 | expression() 469 | expect(':') 470 | l_next = label 471 | label = label + 1 472 | output("\ttest %s, %s\n", regname(0), regname(0)) 473 | output("\tjz .L%d\n", l_next) 474 | init_cache() 475 | statement_list() 476 | expect('E') 477 | if strchr("lf", lexeme): output("\tjmp .L%d\n", l_end) 478 | output(".L%d:\n", l_next) 479 | if lexeme == 'l': 480 | read_lexeme() 481 | expect(':') 482 | init_cache() 483 | statement_list() 484 | expect('E') 485 | if end: output(".L%d:\n", l_end) 486 | elif lexeme == 'w': 487 | read_lexeme() 488 | while_level = while_level + 1 489 | if while_level == 256: error("while nesting limit exceeded") 490 | while_labels[while_level] = label 491 | label = label + 2 492 | output(".L%d:\n", while_labels[while_level]) 493 | expression() 494 | expect(':') 495 | output("\ttest %s, %s\n", regname(0), regname(0)) 496 | output("\tjz .L%d\n", while_labels[while_level] + 1) 497 | init_cache() 498 | statement_list() 499 | expect('E') 500 | output("\tjmp 
.L%d\n", while_labels[while_level]) 501 | output(".L%d:\n", while_labels[while_level] + 1) 502 | while_level = while_level - 1 503 | elif lexeme == 'b': 504 | read_lexeme() 505 | if while_level < 0: error("break without while") 506 | output("\tjmp .L%d\n", while_labels[while_level] + 1) 507 | elif lexeme == 'c': 508 | read_lexeme() 509 | if while_level < 0: error("continue without while") 510 | output("\tjmp .L%d\n", while_labels[while_level]) 511 | elif lexeme == 'r': 512 | read_lexeme() 513 | if is_expr_beginning(): 514 | expression() 515 | if strcmp(regname(0), "rax") != 0: 516 | output("\tmov rax, %s\n", regname(0)) 517 | pop() 518 | output("\tleave\n") 519 | output("\tret\n") 520 | elif is_expr_beginning(): 521 | expression() 522 | pop() 523 | else: expect(';') 524 | 525 | 526 | minilang(): 527 | var i, params 528 | 529 | # init scanner 530 | token = malloc(1024) 531 | keywords = malloc(8 * 9) 532 | keywords[0] = "asm" 533 | keywords[1] = "if" 534 | keywords[2] = "else" 535 | keywords[3] = "elif" 536 | keywords[4] = "while" 537 | keywords[5] = "break" 538 | keywords[6] = "continue" 539 | keywords[7] = "return" 540 | keywords[8] = "var" 541 | line_number = 1 542 | newline = 1 543 | read_char() 544 | read_lexeme() 545 | 546 | # init parser 547 | while_labels = malloc(8 * 256) 548 | cache = malloc(8 * 4) 549 | local_names = malloc(8 * 256) 550 | local_offsets = malloc(8 * 256) 551 | i = 0 552 | while i < 256: 553 | local_names[i] = malloc(64) 554 | i = i + 1 555 | regs = malloc(8 * 4) 556 | regs[0] = "r8" 557 | regs[1] = "r9" 558 | regs[2] = "r11" 559 | regs[3] = "rax" 560 | callregs = malloc(8 * 6) 561 | callregs[0] = "rdi" 562 | callregs[1] = "rsi" 563 | callregs[2] = "rdx" 564 | callregs[3] = "rcx" 565 | callregs[4] = "r8" 566 | callregs[5] = "r9" 567 | while_level = -1 568 | 569 | output("\t.intel_syntax noprefix\n") 570 | output("\t.text\n") 571 | 572 | while lexeme != '0': 573 | # global variables 574 | while lexeme == 'v': 575 | read_lexeme() 576 | 
expect('I') 577 | output("\t.comm %s, 8, 8\n", token) 578 | while lexeme == ',': 579 | read_lexeme() 580 | expect('I') 581 | output("\t.comm %s, 8, 8\n", token) 582 | while lexeme == ';': read_lexeme() 583 | 584 | 585 | # function definition 586 | expect('I') 587 | output("\t.globl %s\n", token) 588 | output("%s:\n", token) 589 | output("\tpush rbp\n") 590 | output("\tmov rbp, rsp\n") 591 | 592 | frame = 0 593 | local_count = 0 594 | 595 | # parameter list 596 | params = 0 597 | expect('(') 598 | if lexeme == 'I': 599 | params = params + 1 600 | expect('I') 601 | add_local() 602 | while lexeme == ',': 603 | read_lexeme() 604 | params = params + 1 605 | if params > 6: error("too many arguments") 606 | expect('I') 607 | add_local() 608 | expect(')') 609 | expect(':') 610 | 611 | # local variables 612 | while lexeme == ';': read_lexeme() 613 | while lexeme == 'v': 614 | read_lexeme() 615 | expect('I') 616 | add_local() 617 | while lexeme == ',': 618 | read_lexeme() 619 | expect('I') 620 | add_local() 621 | while lexeme == ';': read_lexeme() 622 | 623 | if frame > 0: output("\tsub rsp, %d\n", frame) 624 | i = 0 625 | while i < params: 626 | output("\tmov QWORD PTR [rbp - %d], %s\n", i * 8 + 8, callregs[i]) 627 | i = i + 1 628 | 629 | init_cache() 630 | statement_list() 631 | output("\tleave\n") 632 | output("\tret\n") 633 | expect('E') 634 | 635 | # entry point: compile the source file argv[1] into argv[2], or to stdout when no output path is given 636 | main(argc, argv): 637 | var s 638 | if argc < 2 | argc > 3: 639 | printf("usage: %s source [output]\n", argv[0]) 640 | exit(0) 641 | src_file = s = fopen(argv[1], "r") 642 | if !src_file: error("opening source file failed") 643 | if argc == 3: dst_file = fopen(argv[2], "w") 644 | else: dst_file = stdout 645 | minilang() 646 | fclose(src_file) 647 | if dst_file != stdout: fclose(dst_file) 648 | return 0 649 | 650 | --------------------------------------------------------------------------------