├── .babelrc
├── index.js
├── lib
├── generation.js
├── parse.js
├── tokenize.js
└── transform.js
├── test
└── index.js
├── gulpfile.js
├── package.json
├── demo
├── tokenizer.html
├── parser.html
└── transformer.html
└── README.md
/.babelrc:
--------------------------------------------------------------------------------
1 | {
2 | "presets": [
3 | ["latest", {
4 | "es2015": {
5 | "modules": false
6 | }
7 | }]
8 | ],
9 | "plugins": ["external-helpers"]
10 | }
--------------------------------------------------------------------------------
/index.js:
--------------------------------------------------------------------------------
1 | const {tokenize} = require('../lib/tokenize');
2 | const {parse} = require('../lib/parse');
3 | const {transform} = require('../lib/transform');
4 | const {generation} = require('../lib/generation');
5 |
6 | module.exports = {
7 | tokenize,
8 | parse,
9 | transform,
10 | generation
11 | }
--------------------------------------------------------------------------------
/lib/generation.js:
--------------------------------------------------------------------------------
/**
 * Code generator: renders a transformed AST node as JavaScript source text.
 *
 * @author Jan
 * @param {Object} node - AST node whose `type` is 'program', 'arithExpression' or 'number'.
 * @returns {*} For 'program', the generated code of every body node joined by newlines;
 *   for 'arithExpression', the operands joined by the operator and wrapped in parentheses;
 *   for 'number', `node.value` returned unchanged.
 * @throws {Error} When `node.type` is none of the handled types.
 */
function generation(node) {
  const kind = node.type;

  if (kind === 'program') {
    const statements = node.body.map(generation);
    return statements.join('\n');
  }

  if (kind === 'arithExpression') {
    const operands = node.arguments.map(generation);
    // The operator is emitted infix, e.g. name '+' with operands 1 and 2 gives "(1 + 2)".
    return `(${operands.join(` ${node.name} `)})`;
  }

  if (kind === 'number') {
    return node.value;
  }

  throw new Error('节点类型无法被识别');
}
17 |
18 | exports.generation = generation;
--------------------------------------------------------------------------------
/test/index.js:
--------------------------------------------------------------------------------
1 | const {tokenize} = require('../lib/tokenize');
2 | const {parse} = require('../lib/parse');
3 | const {transform} = require('../lib/transform');
4 | const {generation} = require('../lib/generation');
5 |
// Smoke run: push one sample LISP expression through every stage and print each intermediate result.
const tokens = tokenize('(/ (- 72 10) (+ 4.5 2))');

console.log(tokens);

// parse() takes the token array as its only argument.
const ast = parse(tokens);

console.log(JSON.stringify(ast, null, 2));

const newAst = transform(ast);

console.log(JSON.stringify(newAst, null, 2));

console.log(generation(newAst));
19 |
--------------------------------------------------------------------------------
/gulpfile.js:
--------------------------------------------------------------------------------
1 | const gulp = require('gulp');
2 | const rollup = require('rollup');
3 | const commonjs = require('rollup-plugin-commonjs');
4 | const resolve = require('rollup-plugin-node-resolve');
5 | const babel = require('rollup-plugin-babel');
6 |
// 'build' task: bundle ./index.js with rollup and write the result to ./dist/tiny-compiler.js.
gulp.task('build', async function() {
  // Bundle input: entry file plus the plugin chain, in the same order as listed.
  const inputOptions = {
    entry: `./index.js`,
    plugins: [
      commonjs(),
      resolve(),
      babel({
        exclude: 'node_modules/**' // only compile our own source code
      })
    ]
  };

  // Bundle output: UMD format, written under the module name 'compiler'.
  const outputOptions = {
    dest: `./dist/tiny-compiler.js`,
    format: 'umd',
    moduleName: 'compiler',
  };

  const bundle = await rollup.rollup(inputOptions);
  await bundle.write(outputOptions);
});
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "tiny-compiler",
3 | "version": "1.0.0",
4 | "description": "a tiny compiler",
5 | "main": "index.js",
6 | "scripts": {
7 | "test": "echo \"Error: no test specified\" && exit 1"
8 | },
9 | "repository": {
10 | "type": "git",
11 | "url": "git+https://github.com/jan-wong/tiny-compiler.git"
12 | },
13 | "keywords": [
14 | "compiler",
15 | "javascript"
16 | ],
17 | "author": "wolf",
18 | "license": "ISC",
19 | "bugs": {
20 | "url": "https://github.com/jan-wong/tiny-compiler/issues"
21 | },
22 | "homepage": "https://github.com/jan-wong/tiny-compiler#readme",
23 | "devDependencies": {
24 | "esprima": "^4.0.0"
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/demo/tokenizer.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | tokenizer
6 |
7 |
12 |
13 |
18 |
19 |
20 | (add 2 (subtract 4 2)) |
21 | => |
22 | |
23 |
24 |
25 |
26 |
32 |
33 |
--------------------------------------------------------------------------------
/demo/parser.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | parser
6 |
7 |
12 |
13 |
18 |
19 |
20 | |
21 | => |
22 | |
23 |
24 |
25 |
26 |
27 |
36 |
37 |
--------------------------------------------------------------------------------
/lib/parse.js:
--------------------------------------------------------------------------------
/**
 * Syntax analyzer: builds an AST from an array of tokens.
 *
 * Every top-level expression in `tokens` becomes one entry of the program body.
 * An expression is either a number token or a parenthesized call of the form
 * `( <arith operator> <operand>* )`, where each operand is itself an expression.
 *
 * @author Jan
 * @param {Array<{type: string, value: string}>} tokens - Tokens with `type`
 *   'parren', 'arith' or 'number'.
 * @returns {{type: string, body: Array<Object>}} A 'program' node whose body holds
 *   'number' nodes (`{type, value}`) and 'arithCall' nodes (`{type, name, params}`).
 * @throws {Error} When the input ends inside an expression, when '(' is not followed
 *   by an arithmetic operator, or when a token cannot start an expression.
 */
function parse(tokens) {
  let i = 0; // index of the next token to consume

  // Parses exactly one expression starting at tokens[i] and leaves i just past it.
  function walk() {
    const token = tokens[i];

    if (token === undefined) {
      throw new Error('表达式不完整:输入意外结束');
    }

    // Number literal
    if (token.type === 'number') {
      i++;
      return {
        type: 'number',
        value: token.value
      };
    }

    // '(' opens a call
    if (token.type === 'parren' && token.value === '(') {
      // In LISP an arithmetic operator is really a function and the numbers are its arguments.
      const operator = tokens[++i];
      if (operator === undefined || operator.type !== 'arith') {
        throw new Error("'(' 之后必须是算数运算符");
      }

      const node = {
        type: 'arithCall',
        name: operator.value,
        params: []
      };
      i++; // step past the operator

      // Collect operands until the matching ')'; nested calls consume their own ')'.
      while (
        i < tokens.length &&
        !(tokens[i].type === 'parren' && tokens[i].value === ')')
      ) {
        node.params.push(walk());
      }

      if (i >= tokens.length) {
        throw new Error("缺少与 '(' 匹配的 ')'");
      }

      i++; // consume the closing ')'
      return node;
    }

    // Anything else (a stray ')' or a bare operator) cannot start an expression.
    throw new Error(`无法解析的token: ${token.value}`);
  }

  const ast = {
    type: 'program',
    body: []
  };

  // Keep parsing until every token has been consumed so no trailing input is dropped.
  while (i < tokens.length) {
    ast.body.push(walk());
  }

  return ast;
}
50 |
51 | exports.parse = parse;
--------------------------------------------------------------------------------
/demo/transformer.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | transformer
6 |
7 |
12 |
13 |
18 |
19 |
20 | |
21 | => |
22 | |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
39 |
40 |
--------------------------------------------------------------------------------
/lib/tokenize.js:
--------------------------------------------------------------------------------
/**
 * Lexical analyzer: splits the input text into an array of tokens.
 *
 * Whitespace is skipped. '(' and ')' become 'parren' tokens, each of
 * + - * / % becomes an 'arith' token, and a run of digits and dots becomes a
 * 'number' token whose value is the matched text.
 *
 * @author Jan
 * @param {String} input - Source text to tokenize.
 * @returns {Array<{type: string, value: string}>} Tokens in source order.
 * @throws {Error} When a character matches no token type, or when a run of digits
 *   and dots is not a well-formed numeral (for example "1.2.3" or ".").
 */
function tokenize(input) {
  // The patterns are loop-invariant, so they are created once per call.
  const whiteReg = /\s/;
  const arithReg = /[+\-*\/%]/;
  const numberCharReg = /[0-9.]/;
  // A numeral has digits with at most one dot: "12", "4.5", "5." or ".5".
  const numeralReg = /^(?:\d+\.?\d*|\.\d+)$/;

  const tokens = []; // collected tokens
  let i = 0; // index into the input character sequence

  while (i < input.length) {
    const char = input[i];

    // Whitespace
    if (whiteReg.test(char)) {
      i++;
      continue;
    }

    // Parentheses
    if (char === '(' || char === ')') {
      tokens.push({type: 'parren', value: char});
      i++;
      continue;
    }

    // Arithmetic operators
    if (arithReg.test(char)) {
      tokens.push({type: 'arith', value: char});
      i++;
      continue;
    }

    // Numbers
    if (numberCharReg.test(char)) {
      let numberical = '';

      // The explicit bounds check stops the scan at the end of the input.
      while (i < input.length && numberCharReg.test(input[i])) {
        numberical += input[i];
        i++;
      }

      // Reject runs such as "1.2.3" or "." that would generate invalid JavaScript.
      if (!numeralReg.test(numberical)) {
        throw new Error(`非法的数值: ${numberical}`);
      }

      tokens.push({type: 'number', value: numberical});
      continue;
    }

    throw new Error('不能被识别的token类型');
  }

  return tokens;
}
55 |
56 | exports.tokenize = tokenize;
--------------------------------------------------------------------------------
/lib/transform.js:
--------------------------------------------------------------------------------
1 | /**
2 | * @author Jan
3 | * @description 转换器
4 | */
5 |
// Visitor: maps an AST node type to the handler that emits the matching node of the new AST.
// Each handler receives the source node and the array that collects the new nodes.
const visitor = {
  // Copies a number node into the target array; returns nothing.
  number: (node, parent) => {
    parent.push({
      type: 'number',
      value: node.value
    });
  },

  // Emits an 'arithExpression' for an 'arithCall' and returns its `arguments`
  // array, which is the same array stored on the pushed node.
  arithCall: (node, parent) => {
    const args = [];
    parent.push({
      type: 'arithExpression',
      name: node.name,
      arguments: args
    });

    return args;
  }
};
26 |
/**
 * Walks `ast` depth-first and builds the new tree inside `newAst.body`.
 *
 * For each node, the handler `visitor[node.type]` (if any) is called with the node
 * and the array currently collecting new nodes. When a handler returns a truthy
 * value, that value becomes the collecting array for the node's children.
 *
 * @param {Object} ast - Root node of the tree to traverse.
 * @param {{body: Array}} newAst - Tree under construction; its `body` array receives the top-level nodes.
 * @param {Object} visitor - Handlers keyed by node type.
 * @throws {Error} When a node type is not 'program', 'number' or 'arithCall'.
 */
function DFSwalk(ast, newAst, visitor) {
  const visit = (node, targetList) => {
    const { type } = node;

    // The handler runs before the type is validated and is called as a method of `visitor`.
    const childTarget = visitor[type]
      ? visitor[type](node, targetList) || targetList
      : targetList;

    // Leaf node: nothing to descend into.
    if (type === 'number') {
      return;
    }

    if (type !== 'program' && type !== 'arithCall') {
      throw new Error('不能被识别的类型');
    }

    // A program keeps its children in `body`, a call keeps them in `params`.
    const children = type === 'program' ? node.body : node.params;
    children.forEach((child) => {
      visit(child, childTarget);
    });
  };

  visit(ast, newAst.body);
}
59 |
/**
 * Transformer: builds a new AST from `ast` by running DFSwalk with the module's visitor.
 *
 * @param {Object} ast - Root node of the tree to convert.
 * @returns {{type: string, body: Array}} A fresh 'program' node filled in by the traversal.
 */
function transform(ast) {
  const result = { type: 'program', body: [] };

  // DFSwalk fills result.body in place.
  DFSwalk(ast, result, visitor);

  return result;
}
71 |
72 | exports.transform = transform;
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # tiny-compiler (一个简单的编译器)
2 |
3 | ## 前言
4 |
5 | 数值求值的栗子:
6 | ```
7 | LISP Javascript
8 |
9 | (+ 3 2) (3 + 2)
10 | (- 7 1) (7 - 1)
11 | (/ (- 72 10) (+ 4.5 2)) ((72 - 10) / (4.5 + 2))
12 | ```
13 |
14 | 如果某一天我们脑洞大开,想使用LISP的语法形式去编写Javascript,或许我们会使用到编译原理,将LISP语法形式的代码编译成Javascript。就像我们编写ES6标准的代码,通过Babel编译成浏览器都支持的ES5标准一样,这非常有趣。那么编译原理对于Javascript开发者有用吗?其实编译原理在很多开源项目上都进行了使用,比如Babel、TypeScript、CoffeeScript、Flow等等,可见掌握一些编译原理对于理解它们是很有用的一件事情。如上是一个数值求值的例子,接下来将演示将如上LISP语法编译为Javascript语法的整个编译过程。源码见[https://github.com/jan-wong/tiny-compiler](https://github.com/jan-wong/tiny-compiler)
15 |
16 | ## 整体实现
17 |
18 | 一般而言,编译过程会被划分为三个阶段:解析(解析又分为词法分析(tokenize)、语法分析(parse))、转换(transform)、代码生成(code generation)
19 |
20 | ### 词法分析器(tokenizer)
21 |
22 | 词法分析器的作用是将输入(input)的字符串序列划分成一个个标记对象(token)组成的数组,以便进行语法分析,标记对象包含标记类型和标记的字面值。标记是什么?标记是源代码的最小单位,一般用空格分开。编程语言的标记种类是有限的,比如有数据类型(字符串、数值、数组等)、操作符(算数操作符、比较操作符、逻辑操作符等)、分隔符(逗号、分号、括号等)、保留字、标识符等等。对'(/ (- 72 10) (+ 4.5 2))'进行词法分析可以得到如下, 源码见tokenize.js:
23 |
24 | ```javascript
25 | [ { type: 'parren', value: '(' },
26 | { type: 'arith', value: '/' },
27 | { type: 'parren', value: '(' },
28 | { type: 'arith', value: '-' },
29 | { type: 'number', value: '72' },
30 | 词法分析(tokenize) { type: 'number', value: '10' },
31 | '(/ (- 72 10) (+ 4.5 2))' =================>>> { type: 'parren', value: ')' },
32 | { type: 'parren', value: '(' },
33 | { type: 'arith', value: '+' },
34 | { type: 'number', value: '4.5' },
35 | { type: 'number', value: '2' },
36 | { type: 'parren', value: ')' },
37 | { type: 'parren', value: ')' } ]
38 | ```
39 |
40 | ### 语法分析器(parser)
41 |
42 | 语法分析器的作用是将输入标记数组(tokens)重新格式化,让标记与标记之间形成关联,最后形成程序、语句或者表达式。我们会用一棵树来描述这种形成相互关系的程序,这棵树唤做抽象语法树(AST)。继续进行语法分析,过程如下,源码在parse.js:
43 |
44 | ```javascript
45 | {"type": "program",
46 | "body": [
47 | {
48 | "type": "arithCall",
49 | "name": "/",
50 | [ { type: 'parren', value: '(' }, "params": [
51 | { type: 'arith', value: '/' }, {
52 | { type: 'parren', value: '(' }, "type": "arithCall",
53 | { type: 'arith', value: '-' }, "name": "-",
54 | { type: 'number', value: '72' }, "params": [
55 | { type: 'number', value: '10' }, 语法分析(parse) { "type": "number", "value": "72" },
56 | { type: 'parren', value: ')' }, ==================>>> { "type": "number", "value": "10" }
57 | { type: 'parren', value: '(' }, ]
58 | { type: 'arith', value: '+' }, },
59 | { type: 'number', value: '4.5' }, {
60 | { type: 'number', value: '2' }, "type": "arithCall",
61 | { type: 'parren', value: ')' }, "name": "+",
62 | { type: 'parren', value: ')' } ] "params": [
63 | { "type": "number", "value": "4.5" },
64 | { "type": "number", "value": "2" }
65 | ]
66 | }
67 | ]
68 | }
69 | ]}
70 | ```
71 |
72 | ### 转换器(transform)
73 | 转换器的作用是将符合LISP语法的语法树(AST)转换为符合Javascript语法的语法树。转换后如下:
74 |
75 | ```javascript
76 | {
77 | "type": "program",
78 | "body": [
79 | {
80 | "type": "arithExpression",
81 | "name": "/",
82 | "arguments": [
83 | {
84 | "type": "arithExpression",
85 | "name": "-",
86 | "arguments": [
87 | { "type": "number", "value": "72" },
88 | { "type": "number", "value": "10" }
89 | ]
90 | },
91 | {
92 | "type": "arithExpression",
93 | "name": "+",
94 | "arguments": [
95 | { "type": "number", "value": "4.5" },
96 | { "type": "number", "value": "2" }
97 | ]
98 | }
99 | ]
100 | }
101 | ]
102 | }
103 | ```
104 | ### 代码生成器(generator)
105 |
106 | 代码生成器的作用是将newAst生成javascript语法形式的代码。具体实现参考generation.js,结果如下:
107 |
108 | ```javascript
109 | ((72 - 10) / (4.5 + 2))
110 | ```
111 |
112 | ### 参考
113 |
114 | [https://github.com/thejameskyle/the-super-tiny-compiler](https://github.com/thejameskyle/the-super-tiny-compiler)
115 |
116 |
117 |
118 |
119 |
120 |
121 |
--------------------------------------------------------------------------------