├── .babelrc ├── index.js ├── lib ├── generation.js ├── parse.js ├── tokenize.js └── transform.js ├── test └── index.js ├── gulpfile.js ├── package.json ├── demo ├── tokenizer.html ├── parser.html └── transformer.html └── README.md /.babelrc: -------------------------------------------------------------------------------- 1 | { 2 | "presets": [ 3 | ["latest", { 4 | "es2015": { 5 | "modules": false 6 | } 7 | }] 8 | ], 9 | "plugins": ["external-helpers"] 10 | } -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | const {tokenize} = require('../lib/tokenize'); 2 | const {parse} = require('../lib/parse'); 3 | const {transform} = require('../lib/transform'); 4 | const {generation} = require('../lib/generation'); 5 | 6 | module.exports = { 7 | tokenize, 8 | parse, 9 | transform, 10 | generation 11 | } -------------------------------------------------------------------------------- /lib/generation.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jan 3 | * @description 代码生成器 4 | */ 5 | function generation(node) { 6 | switch(node.type) { 7 | case 'program': 8 | return node.body.map(generation).join('\n'); 9 | case 'arithExpression': 10 | return '(' + node.arguments.map(generation).join(` ${node.name} `) + ')'; 11 | case 'number': 12 | return node.value; 13 | default: 14 | throw new Error('节点类型无法被识别'); 15 | } 16 | } 17 | 18 | exports.generation = generation; -------------------------------------------------------------------------------- /test/index.js: -------------------------------------------------------------------------------- 1 | const {tokenize} = require('../lib/tokenize'); 2 | const {parse} = require('../lib/parse'); 3 | const {transform} = require('../lib/transform'); 4 | const {generation} = require('../lib/generation'); 5 | 6 | const tokens = tokenize('(/ (- 72 10) (+ 4.5 2))'); 7 | 
8 | console.log(tokens); 9 | 10 | const ast = parse(tokens, null, 2); 11 | 12 | console.log(JSON.stringify(ast, null, 2)); 13 | 14 | const newAst = transform(ast); 15 | 16 | console.log(JSON.stringify(newAst, null, 2)); 17 | 18 | console.log(generation(newAst)); 19 | -------------------------------------------------------------------------------- /gulpfile.js: -------------------------------------------------------------------------------- 1 | const gulp = require('gulp'); 2 | const rollup = require('rollup'); 3 | const commonjs = require('rollup-plugin-commonjs'); 4 | const resolve = require('rollup-plugin-node-resolve'); 5 | const babel = require('rollup-plugin-babel'); 6 | 7 | gulp.task('build', async function() { 8 | const index = await rollup.rollup({ 9 | entry: `./index.js`, 10 | plugins: [ 11 | commonjs(), 12 | resolve(), 13 | babel({ 14 | exclude: 'node_modules/**' // 只编译我们的源代码 15 | }) 16 | ] 17 | }) 18 | 19 | await index.write({ 20 | dest: `./dist/tiny-compiler.js`, 21 | format: 'umd', 22 | moduleName: 'compiler', 23 | }) 24 | }); -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "tiny-compiler", 3 | "version": "1.0.0", 4 | "description": "a tiny compiler", 5 | "main": "index.js", 6 | "scripts": { 7 | "test": "echo \"Error: no test specified\" && exit 1" 8 | }, 9 | "repository": { 10 | "type": "git", 11 | "url": "git+https://github.com/jan-wong/tiny-compiler.git" 12 | }, 13 | "keywords": [ 14 | "compiler", 15 | "javascript" 16 | ], 17 | "author": "wolf", 18 | "license": "ISC", 19 | "bugs": { 20 | "url": "https://github.com/jan-wong/tiny-compiler/issues" 21 | }, 22 | "homepage": "https://github.com/jan-wong/tiny-compiler#readme", 23 | "devDependencies": { 24 | "esprima": "^4.0.0" 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /demo/tokenizer.html: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | tokenizer 6 | 7 | 12 | 13 | 18 | 19 | 20 | 21 | 22 | 23 | 24 |

(add 2 (subtract 4 2))

=>
25 | 26 | 32 | 33 | -------------------------------------------------------------------------------- /demo/parser.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | parser 6 | 7 | 12 | 13 | 18 | 19 | 20 | 21 | 22 | 23 | 24 |
=>
25 | 26 | 27 | 36 | 37 | -------------------------------------------------------------------------------- /lib/parse.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jan 3 | * @description 语法分析器 4 | */ 5 | function parse(tokens) { 6 | let i = 0; // tokens的下标 7 | 8 | function walk() { 9 | let token = tokens[i]; 10 | 11 | // 数值类型 12 | if (token.type === 'number') { 13 | return { 14 | type: 'number', 15 | value: token.value 16 | } 17 | } 18 | 19 | // 处理'(' 入口 20 | if (token.type === 'parren' && token.value === '(') { 21 | token = tokens[++i]; 22 | 23 | // 在LISP语言中 运算操作符实际上是一个函数,数值为其中的参数 24 | let node = { 25 | type: 'arithCall', 26 | name: token.value, 27 | params: [] 28 | } 29 | 30 | token = tokens[++i]; 31 | while(token && token.value !== ')') { 32 | node.params.push(walk()); 33 | token = tokens[++i]; 34 | } 35 | 36 | // 出口 37 | if (token.type === 'parren' && token.value === ')') return node; 38 | } 39 | } 40 | 41 | const ast = { 42 | type: 'program', 43 | body: [] 44 | } 45 | 46 | ast.body.push(walk()); 47 | 48 | return ast; 49 | } 50 | 51 | exports.parse = parse; -------------------------------------------------------------------------------- /demo/transformer.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | parser 6 | 7 | 12 | 13 | 18 | 19 | 20 | 21 | 22 | 23 | 24 |
=>
25 | 26 | 27 | 28 | 29 | 39 | 40 | -------------------------------------------------------------------------------- /lib/tokenize.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jan 3 | * @description 词法分析器 4 | * @param {String} input 5 | */ 6 | function tokenize(input) { 7 | const tokens = []; //包含tokens的数组 8 | let i = 0; //input字符串序列的下标 9 | 10 | while(i < input.length) { 11 | let char = input[i]; 12 | 13 | // 处理空白符 14 | const whiteReg = /\s/; 15 | if (whiteReg.test(char)) { 16 | i++; 17 | continue; 18 | } 19 | 20 | // 处理小括号 21 | if (char === '(' || char === ')') { 22 | tokens.push({type: 'parren', value: input[i]}); 23 | i++; 24 | continue; 25 | } 26 | 27 | // 处理算数运算符 28 | const arithReg = /[\+\-\*\/%]/; 29 | if (arithReg.test(char)) { 30 | tokens.push({type: 'arith', value: char}); 31 | i++; 32 | continue; 33 | } 34 | 35 | // 处理number 36 | const numberReg = /[0-9\.]/; 37 | if (numberReg.test(char)) { 38 | let numberical = ''; 39 | 40 | while(numberReg.test(char)) { 41 | numberical += char; 42 | i++; 43 | char = input[i]; 44 | } 45 | 46 | tokens.push({type: 'number', value: numberical}); 47 | continue; 48 | } 49 | 50 | throw new Error('不能被识别的token类型'); 51 | } 52 | 53 | return tokens; 54 | } 55 | 56 | exports.tokenize = tokenize; -------------------------------------------------------------------------------- /lib/transform.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jan 3 | * @description 转换器 4 | */ 5 | 6 | // 访问者,包含处理对应ast节点类型的函数的对象 7 | const visitor = { 8 | number(node, parent) { 9 | const childNode = { 10 | type: 'number', 11 | value: node.value 12 | } 13 | parent.push(childNode); 14 | }, 15 | arithCall(node, parent) { 16 | const expression = { 17 | type: 'arithExpression', 18 | name: node.name, 19 | arguments: [] 20 | } 21 | parent.push(expression); 22 | 23 | return expression.arguments; 24 | } 25 | } 26 | 27 | // 深度优先遍历ast,并构建newAst 28 | 
function DFSwalk(ast, newAst, visitor) { 29 | 30 | function walkNodeArray(nodeArr, newNodeArr) { 31 | nodeArr.forEach(childNode => { 32 | walkNode(childNode, newNodeArr); 33 | }); 34 | } 35 | 36 | function walkNode(node, newParentArr) { 37 | const type = node.type; 38 | 39 | if (visitor[type]) { 40 | newParentArr = visitor[type](node, newParentArr) || newParentArr; 41 | } 42 | 43 | switch(type) { 44 | case 'program': 45 | walkNodeArray(node.body, newParentArr); 46 | break; 47 | case 'number': 48 | break; 49 | case 'arithCall': 50 | walkNodeArray(node.params, newParentArr); 51 | break; 52 | default: 53 | throw new Error('不能被识别的类型'); 54 | } 55 | } 56 | 57 | walkNode(ast, newAst.body); 58 | } 59 | 60 | // 转换器 61 | function transform(ast) { 62 | const newAst = { 63 | type: 'program', 64 | body: [] 65 | } 66 | 67 | DFSwalk(ast, newAst, visitor); 68 | 69 | return newAst; 70 | } 71 | 72 | exports.transform = transform; -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tiny-compiler (一个简单的编译器) 2 | 3 | ## 前言 4 | 5 | 数值求值的栗子: 6 | ``` 7 | LISP Javascript 8 | 9 | (+ 3 2) (3 + 2) 10 | (- 7 1) (7 - 1) 11 | (/ (- 72 10) (+ 4.5 2)) ((72 - 10) / (4.5 + 2)) 12 | ``` 13 | 14 | 如果某一天我们脑洞大开,想使用LISP的语法形式去编写Javascript,或许我们会使用到编译原理,将LISP语法形式的代码编译成Javascript。就像我们编写ES6标准的代码,通过Babel编译成浏览器都支持的ES5标准一样,这非常有趣。那么编译原理对于Javascript开发者有用吗?其实编译原理在很多开源项目上都进行了使用,比如Babel、TypeScript、CoffeeScript、Flow等等,可见掌握一些编译原理对于理解它们是很有用的一件事情。如上是一个数值求值的例子,接下来将演示将如上LISP语法编译为Javascript语法的整个编译过程。源码见[https://github.com/jan-wong/tiny-compiler](https://github.com/jan-wong/tiny-compiler) 15 | 16 | ## 整体实现 17 | 18 | 一般而言,编译原理会被划分为三个阶段:解析(解析又分为词法分析(tokenize)、语法分析(parse))、转换(transform)、代码生成(code generation) 19 | 20 | ### 词法分析器(tokenizer) 21 | 22 | 
词法分析器的作用是将输入(input)的字符串序列划分成一个个标记对象(token)组成的数组,以便进行语法分析,标记对象包含标记类型和标记的字面值。标记是什么?标记是源代码的最小单位,一般用空格分开。编程语言的标记种类是有限的,比如有数据类型(字符串、数值、数组等)、操作符(算数操作符、比较操作符、逻辑操作符等)、分隔符(逗号、分号、括号等)、保留字、标识符等等。对'(/ (- 72 10) (+ 4.5 2))'进行词法分析可以得到如下, 源码见tokenize.js: 23 | 24 | ```javascript 25 | [ { type: 'parren', value: '(' }, 26 | { type: 'arith', value: '/' }, 27 | { type: 'parren', value: '(' }, 28 | { type: 'arith', value: '-' }, 29 | { type: 'number', value: '72' }, 30 | 词法分析(tokenize) { type: 'number', value: '10' }, 31 | '(/ (- 72 10) (+ 4.5 2))' =================>>> { type: 'parren', value: ')' }, 32 | { type: 'parren', value: '(' }, 33 | { type: 'arith', value: '+' }, 34 | { type: 'number', value: '4.5' }, 35 | { type: 'number', value: '2' }, 36 | { type: 'parren', value: ')' }, 37 | { type: 'parren', value: ')' } ] 38 | ``` 39 | 40 | ### 语法分析器(parser) 41 | 42 | 语法分析器的作用是将输入标记数组(tokens)重新格式化,让标记与标记之间形成关联,最后形成程序、语句或者表达式。我们会用一棵树来描述这种形成相互关系的程序,这棵树唤做抽象语法树(AST)。继续进行语法分析,过程如下,源码在parse.js: 43 | 44 | ```javascript 45 | {"type": "program", 46 | "body": [ 47 | { 48 | "type": "arithCall", 49 | "name": "/", 50 | [ { type: 'parren', value: '(' }, "params": [ 51 | { type: 'arith', value: '/' }, { 52 | { type: 'parren', value: '(' }, "type": "arithCall", 53 | { type: 'arith', value: '-' }, "name": "-", 54 | { type: 'number', value: '72' }, "params": [ 55 | { type: 'number', value: '10' }, 语法分析(parse) { "type": "number", "value": "72" }, 56 | { type: 'parren', value: ')' }, ==================>>> { "type": "number", "value": "10" } 57 | { type: 'parren', value: '(' }, ] 58 | { type: 'arith', value: '+' }, }, 59 | { type: 'number', value: '4.5' }, { 60 | { type: 'number', value: '2' }, "type": "arithCall", 61 | { type: 'parren', value: ')' }, "name": "+", 62 | { type: 'parren', value: ')' } ] "params": [ 63 | { "type": "number", "value": "4.5" }, 64 | { "type": "number", "value": "2" } 65 | ] 66 | } 67 | ] 68 | } 69 | ]} 70 | ``` 71 | 72 | ### 转换器(transform) 73 | 
转换器的作用是将符合LISP语法的语法树(AST)转换为符合Javascript语法的语法树。转换后如下: 74 | 75 | ```javascript 76 | { 77 | "type": "program", 78 | "body": [ 79 | { 80 | "type": "arithExpression", 81 | "name": "/", 82 | "arguments": [ 83 | { 84 | "type": "arithExpression", 85 | "name": "-", 86 | "arguments": [ 87 | { "type": "number", "value": "72" }, 88 | { "type": "number", "value": "10" } 89 | ] 90 | }, 91 | { 92 | "type": "arithExpression", 93 | "name": "+", 94 | "arguments": [ 95 | { "type": "number", "value": "4.5" }, 96 | { "type": "number", "value": "2" } 97 | ] 98 | } 99 | ] 100 | } 101 | ] 102 | } 103 | ``` 104 | ### 代码生成器(generator) 105 | 106 | 代码生成器的作用是将newAst生成javascript语法形式的代码。具体实现参考generation.js,结果如下: 107 | 108 | ```javascript 109 | ((72 - 10) / (4.5 + 2)) 110 | ``` 111 | 112 | ### 参考 113 | 114 | [https://github.com/thejameskyle/the-super-tiny-compiler](https://github.com/thejameskyle/the-super-tiny-compiler) 115 | 116 | 117 | 118 | 119 | 120 | 121 | --------------------------------------------------------------------------------