├── assets
│   └── tweet.png
├── README.md
└── tiny.js

/assets/tweet.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mgechev/tiny-compiler/HEAD/assets/tweet.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Tiny Interpreter and Compiler

A tiny interpreter and compiler which shows the basics of compiler development.

For more details, see [the source](./tiny.js) or my blog post "[Implementing a Simple Compiler on 25 Lines of JavaScript](http://blog.mgechev.com/2017/09/16/developing-simple-interpreter-transpiler-compiler-tutorial/)".

Alongside the implementation you can find sample (and simple) implementations of:

- A lexer which produces a list of tokens (module for lexical analysis).
- A parser which produces an Abstract Syntax Tree (AST) (module for syntax analysis).
- An interpreter which traverses and evaluates the AST.
- An EBNF grammar.
- Recursive descent parsing.

## You want an even smaller compiler?

Here it is!

[![Compiler in a Tweet](/assets/tweet.png)](https://twitter.com/mgechev/status/955211214719602688)

## License

MIT

--------------------------------------------------------------------------------
/tiny.js:
--------------------------------------------------------------------------------
/*
# Lexer

The lexer is responsible for turning the input string into
a list of tokens. Usually a token looks the following way:

```javascript
{
  "type": Symbol("Operator"),
  "value": "-"
}
```

In our case we keep everything simplified and store
only the token's value. We can infer the type based on the
regular expression used by the parser below.

In short, `lex` will turn the following expression:

```
mul 3 sub 2 sum 1 3 4
```

into the following array:

```
["mul", "3", "sub", "2", "sum", "1", "3", "4"]
```
*/
const lex = str => str.split(' ').map(s => s.trim()).filter(s => s.length);

/*
# Parser

The parser is responsible for turning the list of tokens
into an AST, or Abstract Syntax Tree. In the example below
we use recursive descent parsing to produce the AST
from the input token array.

Visually, the parsing is a process which turns the array:

```javascript
const tokens = ["sub", "2", "sum", "1", "3", "4"];
```

into the following tree:

```
  sub
  / \
 2   sum
     /|\
    1 3 4
```

The parser uses the following grammar to parse the input token array:

```
num := 0-9+
op := sum | sub | div | mul
expr := num | op expr+
```

Translated to plain English, this means:

- `num` can be any sequence of digits between 0 and 9.
- `op` can be any of `sum`, `sub`, `div`, `mul`.
- `expr` can be either a number (i.e. `num`) or an operation followed by one or more `expr`s.

Notice that `expr` has a recursive definition.
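
For instance, applying `parse` to the token array above produces a tree of plain
objects along these lines (using the `Op` and `Num` node-type symbols defined
right after this comment; an operation node keeps its sub-expressions in `expr`):

```javascript
{
  type: Op,
  val: 'sub',
  expr: [
    { type: Num, val: 2 },
    {
      type: Op,
      val: 'sum',
      expr: [
        { type: Num, val: 1 },
        { type: Num, val: 3 },
        { type: Num, val: 4 }
      ]
    }
  ]
}
```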
*/

const Op = Symbol('op');
const Num = Symbol('num');

const parse = tokens => {

  let c = 0;

  const peek = () => tokens[c];
  const consume = () => tokens[c++];

  const parseNum = () => ({ val: parseInt(consume()), type: Num });

  const parseOp = () => {
    const node = { val: consume(), type: Op, expr: [] };
    while (peek()) node.expr.push(parseExpr());
    return node;
  };

  const parseExpr = () => /\d/.test(peek()) ? parseNum() : parseOp();

  return parseExpr();
};

/*
# Evaluator

Finally, this is our evaluator. In it we simply visit each node
of the tree with a post-order traversal and either:

- Return the corresponding value, in case the node is of type number.
- Perform the corresponding arithmetic operation, in case of an operation node.
*/
const evaluate = ast => {
  const opAcMap = {
    sum: args => args.reduce((a, b) => a + b, 0),
    sub: args => args.reduce((a, b) => a - b),
    div: args => args.reduce((a, b) => a / b),
    mul: args => args.reduce((a, b) => a * b, 1)
  };

  if (ast.type === Num) return ast.val;
  return opAcMap[ast.val](ast.expr.map(evaluate));
};

/*
# Code generator

Alternatively, instead of interpreting the AST, we can translate
it to another language. Here's how we can do that for JavaScript.
*/
const compile = ast => {
  const opMap = { sum: '+', mul: '*', sub: '-', div: '/' };
  const compileNum = ast => ast.val;
  const compileOp = ast => `(${ast.expr.map(compile).join(' ' + opMap[ast.val] + ' ')})`;
  // Note: this inner `compile` shadows the outer one and drives the recursion.
  const compile = ast => ast.type === Num ? compileNum(ast) : compileOp(ast);
  return compile(ast);
};

const program = 'mul 3 sub 2 sum 1 3 4';

/*
# Interpreter

In order to interpret the input program we feed the parser with the output
of the lexer and the evaluator with the output of the parser.
*/
console.log(evaluate(parse(lex(program))));

/*
# Compiler

In order to compile the expression to JavaScript, the only change we need to make
is to replace the outermost `evaluate` invocation with `compile`.
*/
console.log(compile(parse(lex(program))));
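
/*
# Sanity check

One possible way to double-check the code generator: the string it produces,
when evaluated as JavaScript, should yield the same result as the interpreter.
(The `interpreted`/`generated` names below are just for illustration, and
`eval` is used purely for demonstration.)
*/
const interpreted = evaluate(parse(lex(program))); // -18
const generated = compile(parse(lex(program)));    // '(3 * (2 - (1 + 3 + 4)))'
console.log(eval(generated) === interpreted);      // logs `true`

--------------------------------------------------------------------------------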