├── LICENSE ├── README.md ├── super-tiny-compiler-chinese.js ├── super-tiny-compiler.js └── test.js /LICENSE: -------------------------------------------------------------------------------- 1 | Creative Commons Attribution 4.0 International 2 | 3 | ======================================================================= 4 | 5 | Creative Commons Corporation ("Creative Commons") is not a law firm and 6 | does not provide legal services or legal advice. Distribution of 7 | Creative Commons public licenses does not create a lawyer-client or 8 | other relationship. Creative Commons makes its licenses and related 9 | information available on an "as-is" basis. Creative Commons gives no 10 | warranties regarding its licenses, any material licensed under their 11 | terms and conditions, or any related information. Creative Commons 12 | disclaims all liability for damages resulting from their use to the 13 | fullest extent possible. 14 | 15 | Using Creative Commons Public Licenses 16 | 17 | Creative Commons public licenses provide a standard set of terms and 18 | conditions that creators and other rights holders may use to share 19 | original works of authorship and other material subject to copyright 20 | and certain other rights specified in the public license below. The 21 | following considerations are for informational purposes only, are not 22 | exhaustive, and do not form part of our licenses. 23 | 24 | Considerations for licensors: Our public licenses are 25 | intended for use by those authorized to give the public 26 | permission to use material in ways otherwise restricted by 27 | copyright and certain other rights. Our licenses are 28 | irrevocable. Licensors should read and understand the terms 29 | and conditions of the license they choose before applying it. 30 | Licensors should also secure all rights necessary before 31 | applying our licenses so that the public can reuse the 32 | material as expected. Licensors should clearly mark any 33 | material not subject to the license. This includes other CC- 34 | licensed material, or material used under an exception or 35 | limitation to copyright. More considerations for licensors: 36 | wiki.creativecommons.org/Considerations_for_licensors 37 | 38 | Considerations for the public: By using one of our public 39 | licenses, a licensor grants the public permission to use the 40 | licensed material under specified terms and conditions. If 41 | the licensor's permission is not necessary for any reason--for 42 | example, because of any applicable exception or limitation to 43 | copyright--then that use is not regulated by the license. Our 44 | licenses grant only permissions under copyright and certain 45 | other rights that a licensor has authority to grant. Use of 46 | the licensed material may still be restricted for other 47 | reasons, including because others have copyright or other 48 | rights in the material. A licensor may make special requests, 49 | such as asking that all changes be marked or described. 50 | Although not required by our licenses, you are encouraged to 51 | respect those requests where reasonable. More_considerations 52 | for the public: 53 | wiki.creativecommons.org/Considerations_for_licensees 54 | 55 | ======================================================================= 56 | 57 | Creative Commons Attribution 4.0 International Public License 58 | 59 | By exercising the Licensed Rights (defined below), You accept and agree 60 | to be bound by the terms and conditions of this Creative Commons 61 | Attribution 4.0 International Public License ("Public License"). To the 62 | extent this Public License may be interpreted as a contract, You are 63 | granted the Licensed Rights in consideration of Your acceptance of 64 | these terms and conditions, and the Licensor grants You such rights in 65 | consideration of benefits the Licensor receives from making the 66 | Licensed Material available under these terms and conditions. 67 | 68 | 69 | Section 1 -- Definitions. 70 | 71 | a. Adapted Material means material subject to Copyright and Similar 72 | Rights that is derived from or based upon the Licensed Material 73 | and in which the Licensed Material is translated, altered, 74 | arranged, transformed, or otherwise modified in a manner requiring 75 | permission under the Copyright and Similar Rights held by the 76 | Licensor. For purposes of this Public License, where the Licensed 77 | Material is a musical work, performance, or sound recording, 78 | Adapted Material is always produced where the Licensed Material is 79 | synched in timed relation with a moving image. 80 | 81 | b. Adapter's License means the license You apply to Your Copyright 82 | and Similar Rights in Your contributions to Adapted Material in 83 | accordance with the terms and conditions of this Public License. 84 | 85 | c. Copyright and Similar Rights means copyright and/or similar rights 86 | closely related to copyright including, without limitation, 87 | performance, broadcast, sound recording, and Sui Generis Database 88 | Rights, without regard to how the rights are labeled or 89 | categorized. For purposes of this Public License, the rights 90 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 91 | Rights. 92 | 93 | d. Effective Technological Measures means those measures that, in the 94 | absence of proper authority, may not be circumvented under laws 95 | fulfilling obligations under Article 11 of the WIPO Copyright 96 | Treaty adopted on December 20, 1996, and/or similar international 97 | agreements. 98 | 99 | e. Exceptions and Limitations means fair use, fair dealing, and/or 100 | any other exception or limitation to Copyright and Similar Rights 101 | that applies to Your use of the Licensed Material. 102 | 103 | f. Licensed Material means the artistic or literary work, database, 104 | or other material to which the Licensor applied this Public 105 | License. 106 | 107 | g. Licensed Rights means the rights granted to You subject to the 108 | terms and conditions of this Public License, which are limited to 109 | all Copyright and Similar Rights that apply to Your use of the 110 | Licensed Material and that the Licensor has authority to license. 111 | 112 | h. Licensor means the individual(s) or entity(ies) granting rights 113 | under this Public License. 114 | 115 | i. Share means to provide material to the public by any means or 116 | process that requires permission under the Licensed Rights, such 117 | as reproduction, public display, public performance, distribution, 118 | dissemination, communication, or importation, and to make material 119 | available to the public including in ways that members of the 120 | public may access the material from a place and at a time 121 | individually chosen by them. 122 | 123 | j. Sui Generis Database Rights means rights other than copyright 124 | resulting from Directive 96/9/EC of the European Parliament and of 125 | the Council of 11 March 1996 on the legal protection of databases, 126 | as amended and/or succeeded, as well as other essentially 127 | equivalent rights anywhere in the world. 128 | 129 | k. You means the individual or entity exercising the Licensed Rights 130 | under this Public License. Your has a corresponding meaning. 131 | 132 | 133 | Section 2 -- Scope. 134 | 135 | a. License grant. 136 | 137 | 1. Subject to the terms and conditions of this Public License, 138 | the Licensor hereby grants You a worldwide, royalty-free, 139 | non-sublicensable, non-exclusive, irrevocable license to 140 | exercise the Licensed Rights in the Licensed Material to: 141 | 142 | a. reproduce and Share the Licensed Material, in whole or 143 | in part; and 144 | 145 | b. produce, reproduce, and Share Adapted Material. 146 | 147 | 2. Exceptions and Limitations. For the avoidance of doubt, where 148 | Exceptions and Limitations apply to Your use, this Public 149 | License does not apply, and You do not need to comply with 150 | its terms and conditions. 151 | 152 | 3. Term. The term of this Public License is specified in Section 153 | 6(a). 154 | 155 | 4. Media and formats; technical modifications allowed. The 156 | Licensor authorizes You to exercise the Licensed Rights in 157 | all media and formats whether now known or hereafter created, 158 | and to make technical modifications necessary to do so. The 159 | Licensor waives and/or agrees not to assert any right or 160 | authority to forbid You from making technical modifications 161 | necessary to exercise the Licensed Rights, including 162 | technical modifications necessary to circumvent Effective 163 | Technological Measures. For purposes of this Public License, 164 | simply making modifications authorized by this Section 2(a) 165 | (4) never produces Adapted Material. 166 | 167 | 5. Downstream recipients. 168 | 169 | a. Offer from the Licensor -- Licensed Material. Every 170 | recipient of the Licensed Material automatically 171 | receives an offer from the Licensor to exercise the 172 | Licensed Rights under the terms and conditions of this 173 | Public License. 174 | 175 | b. No downstream restrictions. You may not offer or impose 176 | any additional or different terms or conditions on, or 177 | apply any Effective Technological Measures to, the 178 | Licensed Material if doing so restricts exercise of the 179 | Licensed Rights by any recipient of the Licensed 180 | Material. 181 | 182 | 6. No endorsement. Nothing in this Public License constitutes or 183 | may be construed as permission to assert or imply that You 184 | are, or that Your use of the Licensed Material is, connected 185 | with, or sponsored, endorsed, or granted official status by, 186 | the Licensor or others designated to receive attribution as 187 | provided in Section 3(a)(1)(A)(i). 188 | 189 | b. Other rights. 190 | 191 | 1. Moral rights, such as the right of integrity, are not 192 | licensed under this Public License, nor are publicity, 193 | privacy, and/or other similar personality rights; however, to 194 | the extent possible, the Licensor waives and/or agrees not to 195 | assert any such rights held by the Licensor to the limited 196 | extent necessary to allow You to exercise the Licensed 197 | Rights, but not otherwise. 198 | 199 | 2. Patent and trademark rights are not licensed under this 200 | Public License. 201 | 202 | 3. To the extent possible, the Licensor waives any right to 203 | collect royalties from You for the exercise of the Licensed 204 | Rights, whether directly or through a collecting society 205 | under any voluntary or waivable statutory or compulsory 206 | licensing scheme. In all other cases the Licensor expressly 207 | reserves any right to collect such royalties. 208 | 209 | 210 | Section 3 -- License Conditions. 211 | 212 | Your exercise of the Licensed Rights is expressly made subject to the 213 | following conditions. 214 | 215 | a. Attribution. 216 | 217 | 1. If You Share the Licensed Material (including in modified 218 | form), You must: 219 | 220 | a. retain the following if it is supplied by the Licensor 221 | with the Licensed Material: 222 | 223 | i. identification of the creator(s) of the Licensed 224 | Material and any others designated to receive 225 | attribution, in any reasonable manner requested by 226 | the Licensor (including by pseudonym if 227 | designated); 228 | 229 | ii. a copyright notice; 230 | 231 | iii. a notice that refers to this Public License; 232 | 233 | iv. a notice that refers to the disclaimer of 234 | warranties; 235 | 236 | v. a URI or hyperlink to the Licensed Material to the 237 | extent reasonably practicable; 238 | 239 | b. indicate if You modified the Licensed Material and 240 | retain an indication of any previous modifications; and 241 | 242 | c. indicate the Licensed Material is licensed under this 243 | Public License, and include the text of, or the URI or 244 | hyperlink to, this Public License. 245 | 246 | 2. You may satisfy the conditions in Section 3(a)(1) in any 247 | reasonable manner based on the medium, means, and context in 248 | which You Share the Licensed Material. For example, it may be 249 | reasonable to satisfy the conditions by providing a URI or 250 | hyperlink to a resource that includes the required 251 | information. 252 | 253 | 3. If requested by the Licensor, You must remove any of the 254 | information required by Section 3(a)(1)(A) to the extent 255 | reasonably practicable. 256 | 257 | 4. If You Share Adapted Material You produce, the Adapter's 258 | License You apply must not prevent recipients of the Adapted 259 | Material from complying with this Public License. 260 | 261 | 262 | Section 4 -- Sui Generis Database Rights. 263 | 264 | Where the Licensed Rights include Sui Generis Database Rights that 265 | apply to Your use of the Licensed Material: 266 | 267 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right 268 | to extract, reuse, reproduce, and Share all or a substantial 269 | portion of the contents of the database; 270 | 271 | b. if You include all or a substantial portion of the database 272 | contents in a database in which You have Sui Generis Database 273 | Rights, then the database in which You have Sui Generis Database 274 | Rights (but not its individual contents) is Adapted Material; and 275 | 276 | c. You must comply with the conditions in Section 3(a) if You Share 277 | all or a substantial portion of the contents of the database. 278 | 279 | For the avoidance of doubt, this Section 4 supplements and does not 280 | replace Your obligations under this Public License where the Licensed 281 | Rights include other Copyright and Similar Rights. 282 | 283 | 284 | Section 5 -- Disclaimer of Warranties and Limitation of Liability. 285 | 286 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE 287 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS 288 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF 289 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, 290 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, 291 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR 292 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, 293 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 294 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT 295 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. 296 | 297 | b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE 298 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, 299 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, 300 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, 301 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR 302 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN 303 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR 304 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR 305 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. 306 | 307 | c. The disclaimer of warranties and limitation of liability provided 308 | above shall be interpreted in a manner that, to the extent 309 | possible, most closely approximates an absolute disclaimer and 310 | waiver of all liability. 311 | 312 | 313 | Section 6 -- Term and Termination. 314 | 315 | a. This Public License applies for the term of the Copyright and 316 | Similar Rights licensed here. However, if You fail to comply with 317 | this Public License, then Your rights under this Public License 318 | terminate automatically. 319 | 320 | b. Where Your right to use the Licensed Material has terminated under 321 | Section 6(a), it reinstates: 322 | 323 | 1. automatically as of the date the violation is cured, provided 324 | it is cured within 30 days of Your discovery of the 325 | violation; or 326 | 327 | 2. upon express reinstatement by the Licensor. 328 | 329 | For the avoidance of doubt, this Section 6(b) does not affect any 330 | right the Licensor may have to seek remedies for Your violations 331 | of this Public License. 332 | 333 | c. For the avoidance of doubt, the Licensor may also offer the 334 | Licensed Material under separate terms or conditions or stop 335 | distributing the Licensed Material at any time; however, doing so 336 | will not terminate this Public License. 337 | 338 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public 339 | License. 340 | 341 | 342 | Section 7 -- Other Terms and Conditions. 343 | 344 | a. The Licensor shall not be bound by any additional or different 345 | terms or conditions communicated by You unless expressly agreed. 346 | 347 | b. Any arrangements, understandings, or agreements regarding the 348 | Licensed Material not stated herein are separate from and 349 | independent of the terms and conditions of this Public License. 350 | 351 | 352 | Section 8 -- Interpretation. 353 | 354 | a. For the avoidance of doubt, this Public License does not, and 355 | shall not be interpreted to, reduce, limit, restrict, or impose 356 | conditions on any use of the Licensed Material that could lawfully 357 | be made without permission under this Public License. 358 | 359 | b. To the extent possible, if any provision of this Public License is 360 | deemed unenforceable, it shall be automatically reformed to the 361 | minimum extent necessary to make it enforceable. If the provision 362 | cannot be reformed, it shall be severed from this Public License 363 | without affecting the enforceability of the remaining terms and 364 | conditions. 365 | 366 | c. No term or condition of this Public License will be waived and no 367 | failure to comply consented to unless expressly agreed to by the 368 | Licensor. 369 | 370 | d. Nothing in this Public License constitutes or may be interpreted 371 | as a limitation upon, or waiver of, any privileges and immunities 372 | that apply to the Licensor or You, including from the legal 373 | processes of any jurisdiction or authority. 374 | 375 | 376 | ======================================================================= 377 | 378 | Creative Commons is not a party to its public licenses. 379 | Notwithstanding, Creative Commons may elect to apply one of its public 380 | licenses to material it publishes and in those instances will be 381 | considered the "Licensor." Except for the limited purpose of indicating 382 | that material is shared under a Creative Commons public license or as 383 | otherwise permitted by the Creative Commons policies published at 384 | creativecommons.org/policies, Creative Commons does not authorize the 385 | use of the trademark "Creative Commons" or any other trademark or logo 386 | of Creative Commons without its prior written consent including, 387 | without limitation, in connection with any unauthorized modifications 388 | to any of its public licenses or any other arrangements, 389 | understandings, or agreements concerning use of licensed material. For 390 | the avoidance of doubt, this paragraph does not form part of the public 391 | licenses. 392 | 393 | Creative Commons may be contacted at creativecommons.org. 394 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | THE SUPER TINY COMPILER 2 | 3 | ***Welcome to The Super Tiny Compiler!*** 4 | 5 | 这是一个超级简单的编译器的例子,包含了现代编译器的几个主要部分,用简单易读的 JavaScript 编写。 6 | 7 | 把这个读完将会有助于你了解*大多数*编译器从前端到后端是如何工作的。 8 | 9 | ### [想直接看代码?点这里](super-tiny-compiler-chinese.js) 10 | 11 | ### 或者... [看看演讲](https://www.youtube.com/watch?v=Tar4WgAfMr4) 12 | 13 | --- 14 | 15 | ### 为啥我要关心这个? 16 | 17 | 确实,大多数人在日常工作中没有必要了解编译器都是如何工作的。但是,编译器无处不在,你使用的很多 18 | 工具的底层原理都是从编译器那儿来的。 19 | 20 | ### 但是编译器太高大上了! 21 | 22 | 额,确实。但这是我们(写编译器的人)的过错,我们把一些本应该很通俗易懂的事情弄得太可怕了, 23 | 让很多人都认为编译器这种东西是可望而不可即的,甚至只有最书呆子的书呆子才能理解。 24 | 25 | ### 好吧,所以我该从哪儿开始? 26 | 27 | 太棒了!直接去看 [super-tiny-compiler-chinese.js](super-tiny-compiler-chinese.js) 这个文件吧! 28 | 29 | ### Tests 30 | 31 | 直接运行 `node test.js` 32 | 33 | --- 34 | 35 | [![cc-by-4.0](https://licensebuttons.net/l/by/4.0/80x15.png)](http://creativecommons.org/licenses/by/4.0/) 36 | -------------------------------------------------------------------------------- /super-tiny-compiler-chinese.js: -------------------------------------------------------------------------------- 1 | /** 2 | * TTTTTTTTTTTTTTTTTTTTTTTHHHHHHHHH HHHHHHHHHEEEEEEEEEEEEEEEEEEEEEE 3 | * T:::::::::::::::::::::TH:::::::H H:::::::HE::::::::::::::::::::E 4 | * T:::::::::::::::::::::TH:::::::H H:::::::HE::::::::::::::::::::E 5 | * T:::::TT:::::::TT:::::THH::::::H H::::::HHEE::::::EEEEEEEEE::::E 6 | * TTTTTT T:::::T TTTTTT H:::::H H:::::H E:::::E EEEEEE 7 | * T:::::T H:::::H H:::::H E:::::E 8 | * T:::::T H::::::HHHHH::::::H E::::::EEEEEEEEEE 9 | * T:::::T H:::::::::::::::::H E:::::::::::::::E 10 | * T:::::T H:::::::::::::::::H E:::::::::::::::E 11 | * T:::::T H::::::HHHHH::::::H E::::::EEEEEEEEEE 12 | * T:::::T H:::::H H:::::H E:::::E 13 | * T:::::T H:::::H H:::::H E:::::E EEEEEE 14 | * TT:::::::TT HH::::::H H::::::HHEE::::::EEEEEEEE:::::E 15 | * T:::::::::T H:::::::H H:::::::HE::::::::::::::::::::E 16 | * T:::::::::T H:::::::H H:::::::HE::::::::::::::::::::E 17 | * TTTTTTTTTTT HHHHHHHHH HHHHHHHHHEEEEEEEEEEEEEEEEEEEEEE 18 | * 19 | * SSSSSSSSSSSSSSS UUUUUUUU UUUUUUUUPPPPPPPPPPPPPPPPP EEEEEEEEEEEEEEEEEEEEEERRRRRRRRRRRRRRRRR 20 | * SS:::::::::::::::SU::::::U U::::::UP::::::::::::::::P E::::::::::::::::::::ER::::::::::::::::R 21 | * S:::::SSSSSS::::::SU::::::U U::::::UP::::::PPPPPP:::::P E::::::::::::::::::::ER::::::RRRRRR:::::R 22 | * S:::::S SSSSSSSUU:::::U U:::::UUPP:::::P P:::::PEE::::::EEEEEEEEE::::ERR:::::R R:::::R 23 | * S:::::S U:::::U U:::::U P::::P P:::::P E:::::E EEEEEE R::::R R:::::R 24 | * S:::::S U:::::U U:::::U P::::P P:::::P E:::::E R::::R R:::::R 25 | * S::::SSSS U:::::U U:::::U P::::PPPPPP:::::P E::::::EEEEEEEEEE R::::RRRRRR:::::R 26 | * SS::::::SSSSS U:::::U U:::::U P:::::::::::::PP E:::::::::::::::E R:::::::::::::RR 27 | * SSS::::::::SS U:::::U U:::::U P::::PPPPPPPPP E:::::::::::::::E R::::RRRRRR:::::R 28 | * SSSSSS::::S U:::::U U:::::U P::::P E::::::EEEEEEEEEE R::::R R:::::R 29 | * S:::::S U:::::U U:::::U P::::P E:::::E R::::R R:::::R 30 | * S:::::S U::::::U U::::::U P::::P E:::::E EEEEEE R::::R R:::::R 31 | * SSSSSSS S:::::S U:::::::UUU:::::::U PP::::::PP EE::::::EEEEEEEE:::::ERR:::::R R:::::R 32 | * S::::::SSSSSS:::::S UU:::::::::::::UU P::::::::P E::::::::::::::::::::ER::::::R R:::::R 33 | * S:::::::::::::::SS UU:::::::::UU P::::::::P E::::::::::::::::::::ER::::::R R:::::R 34 | * SSSSSSSSSSSSSSS UUUUUUUUU PPPPPPPPPP EEEEEEEEEEEEEEEEEEEEEERRRRRRRR RRRRRRR 35 | * 36 | * TTTTTTTTTTTTTTTTTTTTTTTIIIIIIIIIINNNNNNNN NNNNNNNNYYYYYYY YYYYYYY 37 | * T:::::::::::::::::::::TI::::::::IN:::::::N N::::::NY:::::Y Y:::::Y 38 | * T:::::::::::::::::::::TI::::::::IN::::::::N N::::::NY:::::Y Y:::::Y 39 | * T:::::TT:::::::TT:::::TII::::::IIN:::::::::N N::::::NY::::::Y Y::::::Y 40 | * TTTTTT T:::::T TTTTTT I::::I N::::::::::N N::::::NYYY:::::Y Y:::::YYY 41 | * T:::::T I::::I N:::::::::::N N::::::N Y:::::Y Y:::::Y 42 | * T:::::T I::::I N:::::::N::::N N::::::N Y:::::Y:::::Y 43 | * T:::::T I::::I N::::::N N::::N N::::::N Y:::::::::Y 44 | * T:::::T I::::I N::::::N N::::N:::::::N Y:::::::Y 45 | * T:::::T I::::I N::::::N N:::::::::::N Y:::::Y 46 | * T:::::T I::::I N::::::N N::::::::::N Y:::::Y 47 | * T:::::T I::::I N::::::N N:::::::::N Y:::::Y 48 | * TT:::::::TT II::::::IIN::::::N N::::::::N Y:::::Y 49 | * T:::::::::T I::::::::IN::::::N N:::::::N YYYY:::::YYYY 50 | * T:::::::::T I::::::::IN::::::N N::::::N Y:::::::::::Y 51 | * TTTTTTTTTTT IIIIIIIIIINNNNNNNN NNNNNNN YYYYYYYYYYYYY 52 | * 53 | * CCCCCCCCCCCCC OOOOOOOOO MMMMMMMM MMMMMMMMPPPPPPPPPPPPPPPPP IIIIIIIIIILLLLLLLLLLL EEEEEEEEEEEEEEEEEEEEEERRRRRRRRRRRRRRRRR 54 | * CCC::::::::::::C OO:::::::::OO M:::::::M M:::::::MP::::::::::::::::P I::::::::IL:::::::::L E::::::::::::::::::::ER::::::::::::::::R 55 | * CC:::::::::::::::C OO:::::::::::::OO M::::::::M M::::::::MP::::::PPPPPP:::::P I::::::::IL:::::::::L E::::::::::::::::::::ER::::::RRRRRR:::::R 56 | * C:::::CCCCCCCC::::CO:::::::OOO:::::::OM:::::::::M M:::::::::MPP:::::P P:::::PII::::::IILL:::::::LL EE::::::EEEEEEEEE::::ERR:::::R R:::::R 57 | * C:::::C CCCCCCO::::::O O::::::OM::::::::::M M::::::::::M P::::P P:::::P I::::I L:::::L E:::::E EEEEEE R::::R R:::::R 58 | * C:::::C O:::::O O:::::OM:::::::::::M M:::::::::::M P::::P P:::::P I::::I L:::::L E:::::E R::::R R:::::R 59 | * C:::::C O:::::O O:::::OM:::::::M::::M M::::M:::::::M P::::PPPPPP:::::P I::::I L:::::L E::::::EEEEEEEEEE R::::RRRRRR:::::R 60 | * C:::::C O:::::O O:::::OM::::::M M::::M M::::M M::::::M P:::::::::::::PP I::::I L:::::L E:::::::::::::::E R:::::::::::::RR 61 | * C:::::C O:::::O O:::::OM::::::M M::::M::::M M::::::M P::::PPPPPPPPP I::::I L:::::L E:::::::::::::::E R::::RRRRRR:::::R 62 | * C:::::C O:::::O O:::::OM::::::M M:::::::M M::::::M P::::P I::::I L:::::L E::::::EEEEEEEEEE R::::R R:::::R 63 | * C:::::C O:::::O O:::::OM::::::M M:::::M M::::::M P::::P I::::I L:::::L E:::::E R::::R R:::::R 64 | * C:::::C CCCCCCO::::::O O::::::OM::::::M MMMMM M::::::M P::::P I::::I L:::::L LLLLLL E:::::E EEEEEE R::::R R:::::R 65 | * C:::::CCCCCCCC::::CO:::::::OOO:::::::OM::::::M M::::::MPP::::::PP II::::::IILL:::::::LLLLLLLLL:::::LEE::::::EEEEEEEE:::::ERR:::::R R:::::R 66 | * CC:::::::::::::::C OO:::::::::::::OO M::::::M M::::::MP::::::::P I::::::::IL::::::::::::::::::::::LE::::::::::::::::::::ER::::::R R:::::R 67 | * CCC::::::::::::C OO:::::::::OO M::::::M M::::::MP::::::::P I::::::::IL::::::::::::::::::::::LE::::::::::::::::::::ER::::::R R:::::R 68 | * CCCCCCCCCCCCC OOOOOOOOO MMMMMMMM MMMMMMMMPPPPPPPPPP IIIIIIIIIILLLLLLLLLLLLLLLLLLLLLLLLEEEEEEEEEEEEEEEEEEEEEERRRRRRRR RRRRRRR 69 | * 70 | * ======================================================================================================================================================================= 71 | * ======================================================================================================================================================================= 72 | * ======================================================================================================================================================================= 73 | * ======================================================================================================================================================================= 74 | */ 75 | 76 | /** 77 | * 今天让我们来写一个编译器,一个超级无敌小的编译器!它小到如果把所有注释删去的话,大概只剩 78 | * 200行左右的代码。 79 | * 80 | * 我们将会用它将 lisp 风格的函数调用转换为 C 风格。 81 | * 82 | * 如果你对这两种风格不是很熟悉,下面是一个简单的介绍。 83 | * 84 | * 假设我们有两个函数,`add` 和 `subtract`,那么它们的写法将会是下面这样: 85 | * 86 | * LISP C 87 | * 88 | * 2 + 2 (add 2 2) add(2, 2) 89 | * 4 - 2 (subtract 4 2) subtract(4, 2) 90 | * 2 + (4 - 2) (add 2 (subtract 4 2)) add(2, subtract(4, 2)) 91 | * 92 | * 很简单对吧? 93 | * 94 | * 这个转换就是我们将要做的事情。虽然这并不包含 LISP 或者 C 的全部语法,但它足以向我们 95 | * 展示现代编译器很多要点。 96 | * 97 | */ 98 | 99 | /** 100 | * 大多数编译器可以分成三个阶段:解析(Parsing),转换(Transformation)以及代码 101 | * 生成(Code Generation) 102 | * 103 | * 1. *解析*是将最初原始的代码转换为一种更加抽象的表示(译者注:即AST)。* 104 | * 105 | * 2. *转换*将对这个抽象的表示做一些处理,让它能做到编译器期望 106 | * 它做到的事情。 107 | * 108 | * 3. *代码生成*接收处理之后的代码表示,然后把它转换成新的代码。 109 | */ 110 | 111 | /** 112 | * 解析(Parsing) 113 | * ------- 114 | * 115 | * 解析一般来说会分成两个阶段:词法分析(Lexical Analysis)和语法分析(Syntactic Analysis)。 116 | * 117 | * 1. *词法分析*接收原始代码,然后把它分割成一些被称为 Token 的东西,这个过程是在词法分析 118 | * 器(Tokenizer或者Lexer)中完成的。 119 | * 120 | * Token 是一个数组,由一些代码语句的碎片组成。它们可以是数字、标签、标点符号、运算符, 121 | * 或者其它任何东西。 122 | * 123 | * 2. *语法分析* 接收之前生成的 Token,把它们转换成一种抽象的表示,这种抽象的表示描述了代 124 | * 码语句中的每一个片段以及它们之间的关系。这被称为中间表示(intermediate representation) 125 | * 或抽象语法树(Abstract Syntax Tree, 缩写为AST) 126 | * 127 | * 抽象语法树是一个嵌套程度很深的对象,用一种更容易处理的方式代表了代码本身,也能给我们 128 | * 更多信息。 129 | * 130 | * 比如说对于下面这一行代码语句: 131 | * 132 | * (add 2 (subtract 4 2)) 133 | * 134 | * 它产生的 Token 看起来或许是这样的: 135 | * 136 | * [ 137 | * { type: 'paren', value: '(' }, 138 | * { type: 'name', value: 'add' }, 139 | * { type: 'number', value: '2' }, 140 | * { type: 'paren', value: '(' }, 141 | * { type: 'name', value: 'subtract' }, 142 | * { type: 'number', value: '4' }, 143 | * { type: 'number', value: '2' }, 144 | * { type: 'paren', value: ')' }, 145 | * { type: 'paren', value: ')' } 146 | * ] 147 | * 148 | * 它的抽象语法树(AST)看起来或许是这样的: 149 | * 150 | * { 151 | * type: 'Program', 152 | * body: [{ 153 | * type: 'CallExpression', 154 | * name: 'add', 155 | * params: [{ 156 | * type: 'NumberLiteral', 157 | * value: '2' 158 | * }, { 159 | * type: 'CallExpression', 160 | * name: 'subtract', 161 | * params: [{ 162 | * type: 'NumberLiteral', 163 | * value: '4' 164 | * }, { 165 | * type: 'NumberLiteral', 166 | * value: '2' 167 | * }] 168 | * }] 169 | * }] 170 | * } 171 | */ 172 | 173 | /** 174 | * 转换(Transformation) 175 | * -------------- 176 | * 177 | * 编译器的下一步就是转换。它只是把 AST 拿过来然后对它做一些修改。它可以在同种语言下操 178 | * 作 AST,也可以把 AST 翻译成全新的语言。 179 | * 180 | * 下面我们来看看该如何转换 AST。 181 | * 182 | * 你或许注意到了我们的 AST 中有很多相似的元素,这些元素都有 type 属性,它们被称为 AST 183 | * 结点。这些结点含有若干属性,可以用于描述 AST 的部分信息。 184 | * 185 | * 比如下面是一个“NumberLiteral”结点: 186 | * 187 | * { 188 | * type: 'NumberLiteral', 189 | * value: '2' 190 | * } 191 | * 192 | * 又比如下面是一个“CallExpression”结点: 193 | * 194 | * { 195 | * type: 'CallExpression', 196 | * name: 'subtract', 197 | * params: [...nested nodes go here...] 198 | * } 199 | * 200 | * 当转换 AST 的时候我们可以添加、移动、替代这些结点,也可以根据现有的 AST 生成一个全新 201 | * 的 AST 202 | * 203 | * 既然我们编译器的目标是把输入的代码转换为一种新的语言,所以我们将会着重于产生一个针对 204 | * 新语言的全新的 AST。 205 | * 206 | * 207 | * 遍历(Traversal) 208 | * --------- 209 | * 210 | * 为了能处理所有的结点,我们需要遍历它们,使用的是深度优先遍历。 211 | * 212 | * { 213 | * type: 'Program', 214 | * body: [{ 215 | * type: 'CallExpression', 216 | * name: 'add', 217 | * params: [{ 218 | * type: 'NumberLiteral', 219 | * value: '2' 220 | * }, { 221 | * type: 'CallExpression', 222 | * name: 'subtract', 223 | * params: [{ 224 | * type: 'NumberLiteral', 225 | * value: '4' 226 | * }, { 227 | * type: 'NumberLiteral', 228 | * value: '2' 229 | * }] 230 | * }] 231 | * }] 232 | * } 233 | * 234 | * 对于上面的 AST 的遍历流程是这样的: 235 | * 236 | * 1. Program - 从 AST 的顶部结点开始 237 | * 2. CallExpression (add) - Program 的第一个子元素 238 | * 3. NumberLiteral (2) - CallExpression (add) 的第一个子元素 239 | * 4. CallExpression (subtract) - CallExpression (add) 的第二个子元素 240 | * 5. NumberLiteral (4) - CallExpression (subtract) 的第一个子元素 241 | * 6. NumberLiteral (2) - CallExpression (subtract) 的第二个子元素 242 | * 243 | * 如果我们直接在 AST 内部操作,而不是产生一个新的 AST,那么就要在这里介绍所有种类的抽象, 244 | * 但是目前访问(visiting)所有结点的方法已经足够了。 245 | * 246 | * 使用“访问(visiting)”这个词的是因为这是一种模式,代表在对象结构内对元素进行操作。 247 | * 248 | * 访问者(Visitors) 249 | * -------- 250 | * 251 | * 我们最基础的想法是创建一个“访问者(visitor)”对象,这个对象中包含一些方法,可以接收不 252 | * 同的结点。 253 | * 254 | * var visitor = { 255 | * NumberLiteral() {}, 256 | * CallExpression() {} 257 | * }; 258 | * 259 | * 当我们遍历 AST 的时候,如果遇到了匹配 type 的结点,我们可以调用 visitor 中的方法。 260 | * 261 | * 一般情况下为了让这些方法可用性更好,我们会把父结点也作为参数传入。 262 | */ 263 | 264 | /** 265 | * 代码生成(Code Generation) 266 | * --------------- 267 | * 268 | * 编译器的最后一个阶段是代码生成,这个阶段做的事情有时候会和转换(transformation)重叠, 269 | * 但是代码生成最主要的部分还是根据 AST 来输出代码。 270 | * 271 | * 代码生成有几种不同的工作方式,有些编译器将会重用之前生成的 token,有些会创建独立的代码 272 | * 表示,以便于线性地输出代码。但是接下来我们还是着重于使用之前生成好的 AST。 273 | * 274 | * 我们的代码生成器需要知道如何“打印”AST 中所有类型的结点,然后它会递归地调用自身,直到所 275 | * 有代码都被打印到一个很长的字符串中。 276 | * 277 | */ 278 | 279 | /** 280 | * 好了!这就是编译器中所有的部分了。 281 | * 282 | * 当然不是说所有的编译器都像我说的这样。不同的编译器有不同的目的,所以也可能需要不同的步骤。 283 | * 284 | * 但你现在应该对编译器到底是个什么东西有个大概的认识了。 285 | * 286 | * 既然我全都解释一遍了,你应该能写一个属于自己的编译器了吧? 287 | * 288 | * 哈哈开个玩笑,接下来才是重点 :P 289 | * 290 | * 所以我们开始吧... 291 | */ 292 | 293 | /** 294 | * ============================================================================ 295 | * (/^▽^)/ 296 | * 词法分析器(Tokenizer)! 297 | * ============================================================================ 298 | */ 299 | 300 | /** 301 | * 我们从第一个阶段开始,即词法分析,使用的是词法分析器(Tokenizer)。 302 | * 303 | * 我们只是接收代码组成的字符串,然后把它们分割成 token 组成的数组。 304 | * 305 | * (add 2 (subtract 4 2)) => [{ type: 'paren', value: '(' }, ...] 306 | */ 307 | 308 | // 我们从接收一个字符串开始,首先设置两个变量。 309 | function tokenizer(input) { 310 | 311 | // `current`变量类似指针,用于记录我们在代码字符串中的位置。 312 | var current = 0; 313 | 314 | // `tokens`数组是我们放置 token 的地方 315 | var tokens = []; 316 | 317 | // 首先我们创建一个 `while` 循环, `current` 变量会在循环中自增。 318 | // 319 | // 我们这么做的原因是,由于 token 数组的长度是任意的,所以可能要在单个循环中多次 320 | // 增加 `current` 321 | while (current < input.length) { 322 | 323 | // 我们在这里储存了 `input` 中的当前字符 324 | var char = input[current]; 325 | 326 | // 要做的第一件事情就是检查是不是右圆括号。这在之后将会用在 `CallExpressions` 中, 327 | // 但是现在我们关心的只是字符本身。 328 | // 329 | // 检查一下是不是一个左圆括号。 330 | if (char === '(') { 331 | 332 | // 如果是,那么我们 push 一个 type 为 `paren`,value 为左圆括号的对象。 333 | tokens.push({ 334 | type: 'paren', 335 | value: '(' 336 | }); 337 | 338 | // 自增 `current` 339 | current++; 340 | 341 | // 结束本次循环,进入下一次循环 342 | continue; 343 | } 344 | 345 | // 然后我们检查是不是一个右圆括号。这里做的时候和之前一样:检查右圆括号、加入新的 token、 346 | // 自增 `current`,然后进入下一次循环。 347 | if (char === ')') { 348 | tokens.push({ 349 | type: 'paren', 350 | value: ')' 351 | }); 352 | current++; 353 | continue; 354 | } 355 | 356 | // 继续,我们现在检查是不是空格。有趣的是,我们想要空格的本意是分隔字符,但这现在 357 | // 对于我们储存 token 来说不那么重要。我们暂且搁置它。 358 | // 359 | // 所以我们只是简单地检查是不是空格,如果是,那么我们直接进入下一个循环。 360 | var WHITESPACE = /\s/; 361 | if (WHITESPACE.test(char)) { 362 | current++; 363 | continue; 364 | } 365 | 366 | // 下一个 token 的类型是数字。它和之前的 token 不同,因为数字可以由多个数字字符组成, 367 | // 但是我们只能把它们识别为一个 token。 368 | // 369 | // (add 123 456) 370 | // ^^^ ^^^ 371 | // Only two separate tokens 372 | // 这里只有两个 token 373 | // 374 | // 当我们遇到一个数字字符时,将会从这里开始。 375 | var NUMBERS = /[0-9]/; 376 | if (NUMBERS.test(char)) { 377 | 378 | // 创建一个 `value` 字符串,用于 push 字符。 379 | var value = ''; 380 | 381 | // 然后我们循环遍历接下来的字符,直到我们遇到的字符不再是数字字符为止,把遇到的每 382 | // 一个数字字符 push 进 `value` 中,然后自增 `current`。 383 | while (NUMBERS.test(char)) { 384 | value += char; 385 | char = input[++current]; 386 | } 387 | 388 | // 然后我们把类型为 `number` 的 token 放入 `tokens` 数组中。 389 | tokens.push({ 390 | type: 'number', 391 | value: value 392 | }); 393 | 394 | // 进入下一次循环。 395 | continue; 396 | } 397 | 398 | // 最后一种类型的 token 是 `name`。它由一系列的字母组成,这在我们的 lisp 语法中 399 | // 代表了函数。 400 | // 401 | // (add 2 4) 402 | // ^^^ 403 | // Name token 404 | // 405 | var LETTERS = /[a-z]/i; 406 | if (LETTERS.test(char)) { 407 | var value = ''; 408 | 409 | // 同样,我们用一个循环遍历所有的字母,把它们存入 value 中。 410 | while (LETTERS.test(char)) { 411 | value += char; 412 | char = input[++current]; 413 | } 414 | 415 | // 然后添加一个类型为 `name` 的 token,然后进入下一次循环。 416 | tokens.push({ 417 | type: 'name', 418 | value: value 419 | }); 420 | 421 | continue; 422 | } 423 | 424 | // 最后如果我们没有匹配上任何类型的 token,那么我们抛出一个错误。 425 | throw new TypeError('I dont know what this character is: ' + char); 426 | } 427 | 428 | // 词法分析器的最后我们返回 tokens 数组。 429 | return tokens; 430 | } 431 | 432 | /** 433 | * ============================================================================ 434 | * ヽ/❀o ل͜ o\ノ 435 | * 语法分析器(Parser)!!! 436 | * ============================================================================ 437 | */ 438 | 439 | /** 440 | * 语法分析器接受 token 数组,然后把它转化为 AST 441 | * 442 | * [{ type: 'paren', value: '(' }, ...] => { type: 'Program', body: [...] } 443 | */ 444 | 445 | // 现在我们定义 parser 函数,接受 `tokens` 数组 446 | function parser(tokens) { 447 | 448 | // 我们再次声明一个 `current` 变量作为指针。 449 | var current = 0; 450 | 451 | // 但是这次我们使用递归而不是 `while` 循环,所以我们定义一个 `walk` 函数。 452 | function walk() { 453 | 454 | // walk函数里,我们从当前token开始 455 | var token = tokens[current]; 456 | 457 | // 对于不同类型的结点,对应的处理方法也不同,我们从 `number` 类型的 token 开始。 458 | // 检查是不是 `number` 类型 459 | if (token.type === 'number') { 460 | 461 | // 如果是,`current` 自增。 462 | current++; 463 | 464 | // 然后我们会返回一个新的 AST 结点 `NumberLiteral`,并且把它的值设为 token 的值。 465 | return { 466 | type: 'NumberLiteral', 467 | value: token.value 468 | }; 469 | } 470 | 471 | // 接下来我们检查是不是 CallExpressions 类型,我们从左圆括号开始。 472 | if ( 473 | token.type === 'paren' && 474 | token.value === '(' 475 | ) { 476 | 477 | // 我们会自增 `current` 来跳过这个括号,因为括号在 AST 中是不重要的。 478 | token = tokens[++current]; 479 | 480 | // 我们创建一个类型为 `CallExpression` 的根节点,然后把它的 name 属性设置为当前 481 | // token 的值,因为紧跟在左圆括号后面的 token 一定是调用的函数的名字。 482 | var node = { 483 | type: 'CallExpression', 484 | name: token.value, 485 | params: [] 486 | }; 487 | 488 | // 我们再次自增 `current` 变量,跳过当前的 token 489 | token = tokens[++current]; 490 | 491 | // 现在我们循环遍历接下来的每一个 token,直到我们遇到右圆括号,这些 token 将会 492 | // 是 `CallExpression` 的 `params`(参数) 493 | // 494 | // 这也是递归开始的地方,我们采用递归的方式来解决问题,而不是去尝试解析一个可能有无限 495 | // 层嵌套的结点。 496 | // 497 | // 为了更好地解释,我们来看看我们的 Lisp 代码。你会注意到 `add` 函数的参数有两个, 498 | // 一个是数字,另一个是一个嵌套的 `CallExpression`,这个 `CallExpression` 中 499 | // 包含了它自己的参数(两个数字) 500 | // 501 | // (add 2 (subtract 4 2)) 502 | // 503 | // 你也会注意到我们的 token 数组中有多个右圆括号。 504 | // 505 | // [ 506 | // { type: 'paren', value: '(' }, 507 | // { type: 'name', value: 'add' }, 508 | // { type: 'number', value: '2' }, 509 | // { type: 'paren', value: '(' }, 510 | // { type: 'name', value: 'subtract' }, 511 | // { type: 'number', value: '4' }, 512 | // { type: 'number', value: '2' }, 513 | // { type: 'paren', value: ')' }, <<< 右圆括号 514 | // { type: 'paren', value: ')' } <<< 右圆括号 515 | // ] 516 | // 517 | // 遇到嵌套的 `CallExpressions` 时,我们将会依赖嵌套的 `walk` 函数来 518 | // 增加 `current` 变量 519 | // 520 | // 所以我们创建一个 `while` 循环,直到遇到类型为 `'paren'`,值为右圆括号的 token。 521 | while ( 522 | (token.type !== 'paren') || 523 | (token.type === 'paren' && token.value !== ')') 524 | ) { 525 | // 我们调用 `walk` 函数,它将会返回一个结点,然后我们把这个节点 526 | // 放入 `node.params` 中。 527 | node.params.push(walk()); 528 | token = tokens[current]; 529 | } 530 | 531 | // 我们最后一次增加 `current`,跳过右圆括号。 532 | current++; 533 | 534 | // 返回结点。 535 | return node; 536 | } 537 | 538 | // 同样,如果我们遇到了一个类型未知的结点,就抛出一个错误。 539 | throw new TypeError(token.type); 540 | } 541 | 542 | // 现在,我们创建 AST,根结点是一个类型为 `Program` 的结点。 543 | var ast = { 544 | type: 'Program', 545 | body: [] 546 | }; 547 | 548 | // 现在我们开始 `walk` 函数,把结点放入 `ast.body` 中。 549 | // 550 | // 之所以在一个循环中处理,是因为我们的程序可能在 `CallExpressions` 后面包含连续的两个 551 | // 参数,而不是嵌套的。 552 | // 553 | // (add 2 2) 554 | // (subtract 4 2) 555 | // 556 | while (current < tokens.length) { 557 | ast.body.push(walk()); 558 | } 559 | 560 | // 最后我们的语法分析器返回 AST 561 | return ast; 562 | } 563 | 564 | /** 565 | * ============================================================================ 566 | * ⌒(❀>◞౪◟<❀)⌒ 567 | * 遍历器!!! 568 | * ============================================================================ 569 | */ 570 | 571 | /** 572 | * 现在我们有了 AST,我们需要一个 visitor 去遍历所有的结点。当遇到某个类型的结点时,我们 573 | * 需要调用 visitor 中对应类型的处理函数。 574 | * 575 | * traverse(ast, { 576 | * Program(node, parent) { 577 | * // ... 578 | * }, 579 | * 580 | * CallExpression(node, parent) { 581 | * // ... 582 | * }, 583 | * 584 | * NumberLiteral(node, parent) { 585 | * // ... 586 | * } 587 | * }); 588 | */ 589 | 590 | // 所以我们定义一个遍历器,它有两个参数,AST 和 vistor。在它的里面我们又定义了两个函数... 591 | function traverser(ast, visitor) { 592 | 593 | // `traverseArray` 函数允许我们对数组中的每一个元素调用 `traverseNode` 函数。 594 | function traverseArray(array, parent) { 595 | array.forEach(function(child) { 596 | traverseNode(child, parent); 597 | }); 598 | } 599 | 600 | // `traverseNode` 函数接受一个 `node` 和它的父结点 `parent` 作为参数,这个结点会被 601 | // 传入到 visitor 中相应的处理函数那里。 602 | function traverseNode(node, parent) { 603 | 604 | // 首先我们看看 visitor 中有没有对应 `type` 的处理函数。 605 | var method = visitor[node.type]; 606 | 607 | // 如果有,那么我们把 `node` 和 `parent` 都传入其中。 608 | if (method) { 609 | method(node, parent); 610 | } 611 | 612 | // 下面我们对每一个不同类型的结点分开处理。 613 | switch (node.type) { 614 | 615 | // 我们从顶层的 `Program` 开始,Program 结点中有一个 body 属性,它是一个由若干 616 | // 个结点组成的数组,所以我们对这个数组调用 `traverseArray`。 617 | // 618 | // (记住 `traverseArray` 会调用 `traverseNode`,所以我们会递归地遍历这棵树。) 619 | case 'Program': 620 | traverseArray(node.body, node); 621 | break; 622 | 623 | // 下面我们对 `CallExpressions` 做同样的事情,遍历它的 `params`。 624 | case 'CallExpression': 625 | traverseArray(node.params, node); 626 | break; 627 | 628 | // 如果是 `NumberLiterals`,那么就没有任何子结点了,所以我们直接 break 629 | case 'NumberLiteral': 630 | break; 631 | 632 | // 同样,如果我们不能识别当前的结点,那么就抛出一个错误。 633 | default: 634 | throw new TypeError(node.type); 635 | } 636 | } 637 | 638 | // 最后我们对 AST 调用 `traverseNode`,开始遍历。注意 AST 并没有父结点。 639 | traverseNode(ast, null); 640 | } 641 | 642 | /** 643 | * ============================================================================ 644 | * ⁽(◍˃̵͈̑ᴗ˂̵͈̑)⁽ 645 | * 转换器!!! 646 | * ============================================================================ 647 | */ 648 | 649 | /** 650 | * 下面是转换器。转换器接收我们在之前构建好的 AST,然后把它和 visitor 传递进入我们的遍历 651 | * 器中 ,最后得到一个新的 AST。 652 | * 653 | * ---------------------------------------------------------------------------- 654 | * 原始的 AST | 转换后的 AST 655 | * ---------------------------------------------------------------------------- 656 | * { | { 657 | * type: 'Program', | type: 'Program', 658 | * body: [{ | body: [{ 659 | * type: 'CallExpression', | type: 'ExpressionStatement', 660 | * name: 'add', | expression: { 661 | * params: [{ | type: 'CallExpression', 662 | * type: 'NumberLiteral', | callee: { 663 | * value: '2' | type: 'Identifier', 664 | * }, { | name: 'add' 665 | * type: 'CallExpression', | }, 666 | * name: 'subtract', | arguments: [{ 667 | * params: [{ | type: 'NumberLiteral', 668 | * type: 'NumberLiteral', | value: '2' 669 | * value: '4' | }, { 670 | * }, { | type: 'CallExpression', 671 | * type: 'NumberLiteral', | callee: { 672 | * value: '2' | type: 'Identifier', 673 | * }] | name: 'subtract' 674 | * }] | }, 675 | * }] | arguments: [{ 676 | * } | type: 'NumberLiteral', 677 | * | value: '4' 678 | * ---------------------------------- | }, { 679 | * | type: 'NumberLiteral', 680 | * | value: '2' 681 | * | }] 682 | * (那一边比较长/w\) | }] 683 | * | } 684 | * | }] 685 | * | } 686 | * ---------------------------------------------------------------------------- 687 | */ 688 | 689 | // 定义我们的转换器函数,接收 AST 作为参数 690 | function transformer(ast) { 691 | 692 | // 创建 `newAST`,它与我们之前的 AST 类似,有一个类型为 Program 的根节点。 693 | var newAst = { 694 | type: 'Program', 695 | body: [] 696 | }; 697 | 698 | // 下面的代码会有些奇技淫巧,我们在父结点上使用一个属性 `context`(上下文),这样我们就 699 | // 可以把结点放入他们父结点的 context 中。当然可能会有更好的做法,但是为了简单我们姑且 700 | // 这么做吧。 701 | // 702 | // 注意 context 是一个*引用*,从旧的 AST 到新的 AST。 703 | ast._context = newAst.body; 704 | 705 | // 我们把 AST 和 visitor 函数传入遍历器 706 | traverser(ast, { 707 | 708 | // 第一个 visitor 方法接收 `NumberLiterals`。 709 | NumberLiteral: function(node, parent) { 710 | 711 | // 我们创建一个新结点,名字叫 `NumberLiteral`,并把它放入父结点的 context 中。 712 | parent._context.push({ 713 | type: 'NumberLiteral', 714 | value: node.value 715 | }); 716 | }, 717 | 718 | // 下一个,`CallExpressions`。 719 | CallExpression: function(node, parent) { 720 | 721 | // 我们创建一个 `CallExpression` 结点,里面有一个嵌套的 `Identifier`。 722 | var expression = { 723 | type: 'CallExpression', 724 | callee: { 725 | type: 'Identifier', 726 | name: node.name 727 | }, 728 | arguments: [] 729 | }; 730 | 731 | // 下面我们在原来的 `CallExpression` 结点上定义一个新的 context,它是 expression 732 | // 中 arguments 这个数组的引用,我们可以向其中放入参数。 733 | node._context = expression.arguments; 734 | 735 | // 然后来看看父结点是不是一个 `CallExpression`,如果不是... 736 | if (parent.type !== 'CallExpression') { 737 | 738 | // 我们把 `CallExpression` 结点包在一个 `ExpressionStatement` 中,这么做是因为 739 | // 单独存在(原文为top level)的 `CallExpressions` 在 JavaScript 中也可以被当做 740 | // 是声明语句。 741 | // 742 | // 译者注:比如 `var a = foo()` 与 `foo()`,后者既可以当作表达式给某个变量赋值,也 743 | // 可以作为一个独立的语句存在。 744 | expression = { 745 | type: 'ExpressionStatement', 746 | expression: expression 747 | }; 748 | } 749 | 750 | // 最后我们把 `CallExpression`(可能是被包起来的) 放入父结点的 context 中。 751 | parent._context.push(expression); 752 | } 753 | }); 754 | 755 | // 最后返回创建好的新 AST。 756 | return newAst; 757 | } 758 | 759 | /** 760 | * ============================================================================ 761 | * ヾ(〃^∇^)ノ♪ 762 | * 代码生成器!!!! 763 | * ============================================================================ 764 | */ 765 | 766 | /** 767 | * 现在只剩最后一步啦:代码生成器。 768 | * 769 | * 我们的代码生成器会递归地调用它自己,把 AST 中的每个结点打印到一个很大的字符串中。 770 | */ 771 | 772 | function codeGenerator(node) { 773 | 774 | // 对于不同 `type` 的结点分开处理。 775 | switch (node.type) { 776 | 777 | // 如果是 `Program` 结点,那么我们会遍历它的 `body` 属性中的每一个结点,并且递归地 778 | // 对这些结点再次调用 codeGenerator,再把结果打印进入新的一行中。 779 | case 'Program': 780 | return node.body.map(codeGenerator) 781 | .join('\n'); 782 | 783 | // 对于 `ExpressionStatements`,我们对它的 expression 属性递归调用,同时加入一个 784 | // 分号。 785 | case 'ExpressionStatement': 786 | return ( 787 | codeGenerator(node.expression) + 788 | ';' // << (...因为我们喜欢用*正确*的方式写代码) 789 | ); 790 | 791 | // 对于 `CallExpressions`,我们会打印出 `callee`,接着是一个左圆括号,然后对 792 | // arguments 递归调用 codeGenerator,并且在它们之间加一个逗号,最后加上右圆括号。 793 | case 'CallExpression': 794 | return ( 795 | codeGenerator(node.callee) + 796 | '(' + 797 | node.arguments.map(codeGenerator) 798 | .join(', ') + 799 | ')' 800 | ); 801 | 802 | // 对于 `Identifiers` 我们只是返回 `node` 的 name。 803 | case 'Identifier': 804 | return node.name; 805 | 806 | // 对于 `NumberLiterals` 我们只是返回 `node` 的 value 807 | case 'NumberLiteral': 808 | return node.value; 809 | 810 | // 如果我们不能识别这个结点,那么抛出一个错误。 811 | default: 812 | throw new TypeError(node.type); 813 | } 814 | } 815 | 816 | /** 817 | * ============================================================================ 818 | * (۶* ‘ヮ’)۶” 819 | * !!!!!!!!!!!!编译器!!!!!!!!!!! 820 | * ============================================================================ 821 | */ 822 | 823 | /** 824 | * 最后!我们创建 `compiler` 函数,它只是把上面说到的那些函数连接到一起。 825 | * 826 | * 1. input => tokenizer => tokens 827 | * 2. tokens => parser => ast 828 | * 3. ast => transformer => newAst 829 | * 4. newAst => generator => output 830 | */ 831 | 832 | function compiler(input) { 833 | var tokens = tokenizer(input); 834 | var ast = parser(tokens); 835 | var newAst = transformer(ast); 836 | var output = codeGenerator(newAst); 837 | 838 | // 然后返回输出! 839 | return output; 840 | } 841 | 842 | /** 843 | * ============================================================================ 844 | * (๑˃̵ᴗ˂̵)و 845 | * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!你做到了!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 846 | * ============================================================================ 847 | */ 848 | 849 | // 现在导出所有接口... 850 | module.exports = { 851 | tokenizer: tokenizer, 852 | parser: parser, 853 | transformer: transformer, 854 | codeGenerator: codeGenerator, 855 | compiler: compiler 856 | }; 857 | -------------------------------------------------------------------------------- /super-tiny-compiler.js: -------------------------------------------------------------------------------- 1 | /** 2 | * TTTTTTTTTTTTTTTTTTTTTTTHHHHHHHHH HHHHHHHHHEEEEEEEEEEEEEEEEEEEEEE 3 | * T:::::::::::::::::::::TH:::::::H H:::::::HE::::::::::::::::::::E 4 | * T:::::::::::::::::::::TH:::::::H H:::::::HE::::::::::::::::::::E 5 | * T:::::TT:::::::TT:::::THH::::::H H::::::HHEE::::::EEEEEEEEE::::E 6 | * TTTTTT T:::::T TTTTTT H:::::H H:::::H E:::::E EEEEEE 7 | * T:::::T H:::::H H:::::H E:::::E 8 | * T:::::T H::::::HHHHH::::::H E::::::EEEEEEEEEE 9 | * T:::::T H:::::::::::::::::H E:::::::::::::::E 10 | * T:::::T H:::::::::::::::::H E:::::::::::::::E 11 | * T:::::T H::::::HHHHH::::::H E::::::EEEEEEEEEE 12 | * T:::::T H:::::H H:::::H E:::::E 13 | * T:::::T H:::::H H:::::H E:::::E EEEEEE 14 | * TT:::::::TT HH::::::H H::::::HHEE::::::EEEEEEEE:::::E 15 | * T:::::::::T H:::::::H H:::::::HE::::::::::::::::::::E 16 | * T:::::::::T H:::::::H H:::::::HE::::::::::::::::::::E 17 | * TTTTTTTTTTT HHHHHHHHH HHHHHHHHHEEEEEEEEEEEEEEEEEEEEEE 18 | * 19 | * SSSSSSSSSSSSSSS UUUUUUUU UUUUUUUUPPPPPPPPPPPPPPPPP EEEEEEEEEEEEEEEEEEEEEERRRRRRRRRRRRRRRRR 20 | * SS:::::::::::::::SU::::::U U::::::UP::::::::::::::::P E::::::::::::::::::::ER::::::::::::::::R 21 | * S:::::SSSSSS::::::SU::::::U U::::::UP::::::PPPPPP:::::P E::::::::::::::::::::ER::::::RRRRRR:::::R 22 | * S:::::S SSSSSSSUU:::::U U:::::UUPP:::::P P:::::PEE::::::EEEEEEEEE::::ERR:::::R R:::::R 23 | * S:::::S U:::::U U:::::U P::::P P:::::P E:::::E EEEEEE R::::R R:::::R 24 | * S:::::S U:::::U U:::::U P::::P P:::::P E:::::E R::::R R:::::R 25 | * S::::SSSS U:::::U U:::::U P::::PPPPPP:::::P E::::::EEEEEEEEEE R::::RRRRRR:::::R 26 | * SS::::::SSSSS U:::::U U:::::U P:::::::::::::PP E:::::::::::::::E R:::::::::::::RR 27 | * SSS::::::::SS U:::::U U:::::U P::::PPPPPPPPP E:::::::::::::::E R::::RRRRRR:::::R 28 | * SSSSSS::::S U:::::U U:::::U P::::P E::::::EEEEEEEEEE R::::R R:::::R 29 | * S:::::S U:::::U U:::::U P::::P E:::::E R::::R R:::::R 30 | * S:::::S U::::::U U::::::U P::::P E:::::E EEEEEE R::::R R:::::R 31 | * SSSSSSS S:::::S U:::::::UUU:::::::U PP::::::PP EE::::::EEEEEEEE:::::ERR:::::R R:::::R 32 | * S::::::SSSSSS:::::S UU:::::::::::::UU P::::::::P E::::::::::::::::::::ER::::::R R:::::R 33 | * S:::::::::::::::SS UU:::::::::UU P::::::::P E::::::::::::::::::::ER::::::R R:::::R 34 | * SSSSSSSSSSSSSSS UUUUUUUUU PPPPPPPPPP EEEEEEEEEEEEEEEEEEEEEERRRRRRRR RRRRRRR 35 | * 36 | * TTTTTTTTTTTTTTTTTTTTTTTIIIIIIIIIINNNNNNNN NNNNNNNNYYYYYYY YYYYYYY 37 | * T:::::::::::::::::::::TI::::::::IN:::::::N N::::::NY:::::Y Y:::::Y 38 | * T:::::::::::::::::::::TI::::::::IN::::::::N N::::::NY:::::Y Y:::::Y 39 | * T:::::TT:::::::TT:::::TII::::::IIN:::::::::N N::::::NY::::::Y Y::::::Y 40 | * TTTTTT T:::::T TTTTTT I::::I N::::::::::N N::::::NYYY:::::Y Y:::::YYY 41 | * T:::::T I::::I N:::::::::::N N::::::N Y:::::Y Y:::::Y 42 | * T:::::T I::::I N:::::::N::::N N::::::N Y:::::Y:::::Y 43 | * T:::::T I::::I N::::::N N::::N N::::::N Y:::::::::Y 44 | * T:::::T I::::I N::::::N N::::N:::::::N Y:::::::Y 45 | * T:::::T I::::I N::::::N N:::::::::::N Y:::::Y 46 | * T:::::T I::::I N::::::N N::::::::::N Y:::::Y 47 | * T:::::T I::::I N::::::N N:::::::::N Y:::::Y 48 | * TT:::::::TT II::::::IIN::::::N N::::::::N Y:::::Y 49 | * T:::::::::T I::::::::IN::::::N N:::::::N YYYY:::::YYYY 50 | * T:::::::::T I::::::::IN::::::N N::::::N Y:::::::::::Y 51 | * TTTTTTTTTTT IIIIIIIIIINNNNNNNN NNNNNNN YYYYYYYYYYYYY 52 | * 53 | * CCCCCCCCCCCCC OOOOOOOOO MMMMMMMM MMMMMMMMPPPPPPPPPPPPPPPPP IIIIIIIIIILLLLLLLLLLL EEEEEEEEEEEEEEEEEEEEEERRRRRRRRRRRRRRRRR 54 | * CCC::::::::::::C OO:::::::::OO M:::::::M M:::::::MP::::::::::::::::P I::::::::IL:::::::::L E::::::::::::::::::::ER::::::::::::::::R 55 | * CC:::::::::::::::C OO:::::::::::::OO M::::::::M M::::::::MP::::::PPPPPP:::::P I::::::::IL:::::::::L E::::::::::::::::::::ER::::::RRRRRR:::::R 56 | * C:::::CCCCCCCC::::CO:::::::OOO:::::::OM:::::::::M M:::::::::MPP:::::P P:::::PII::::::IILL:::::::LL EE::::::EEEEEEEEE::::ERR:::::R R:::::R 57 | * C:::::C CCCCCCO::::::O O::::::OM::::::::::M M::::::::::M P::::P P:::::P I::::I L:::::L E:::::E EEEEEE R::::R R:::::R 58 | * C:::::C O:::::O O:::::OM:::::::::::M M:::::::::::M P::::P P:::::P I::::I L:::::L E:::::E R::::R R:::::R 59 | * C:::::C O:::::O O:::::OM:::::::M::::M M::::M:::::::M P::::PPPPPP:::::P I::::I L:::::L E::::::EEEEEEEEEE R::::RRRRRR:::::R 60 | * C:::::C O:::::O O:::::OM::::::M M::::M M::::M M::::::M P:::::::::::::PP I::::I L:::::L E:::::::::::::::E R:::::::::::::RR 61 | * C:::::C O:::::O O:::::OM::::::M M::::M::::M M::::::M P::::PPPPPPPPP I::::I L:::::L E:::::::::::::::E R::::RRRRRR:::::R 62 | * C:::::C O:::::O O:::::OM::::::M M:::::::M M::::::M P::::P I::::I L:::::L E::::::EEEEEEEEEE R::::R R:::::R 63 | * C:::::C O:::::O O:::::OM::::::M M:::::M M::::::M P::::P I::::I L:::::L E:::::E R::::R R:::::R 64 | * C:::::C CCCCCCO::::::O O::::::OM::::::M MMMMM M::::::M P::::P I::::I L:::::L LLLLLL E:::::E EEEEEE R::::R R:::::R 65 | * C:::::CCCCCCCC::::CO:::::::OOO:::::::OM::::::M M::::::MPP::::::PP II::::::IILL:::::::LLLLLLLLL:::::LEE::::::EEEEEEEE:::::ERR:::::R R:::::R 66 | * CC:::::::::::::::C OO:::::::::::::OO M::::::M M::::::MP::::::::P I::::::::IL::::::::::::::::::::::LE::::::::::::::::::::ER::::::R R:::::R 67 | * CCC::::::::::::C OO:::::::::OO M::::::M M::::::MP::::::::P I::::::::IL::::::::::::::::::::::LE::::::::::::::::::::ER::::::R R:::::R 68 | * CCCCCCCCCCCCC OOOOOOOOO MMMMMMMM MMMMMMMMPPPPPPPPPP IIIIIIIIIILLLLLLLLLLLLLLLLLLLLLLLLEEEEEEEEEEEEEEEEEEEEEERRRRRRRR RRRRRRR 69 | * 70 | * ======================================================================================================================================================================= 71 | * ======================================================================================================================================================================= 72 | * ======================================================================================================================================================================= 73 | * ======================================================================================================================================================================= 74 | */ 75 | 76 | /** 77 | * Today we're going to write a compiler together. But not just any compiler... A 78 | * super duper teeny tiny compiler! A compiler that is so small that if you 79 | * remove all the comments this file would only be ~200 lines of actual code. 80 | * 81 | * We're going to compile some lisp-like function calls into some C-like 82 | * function calls. 83 | * 84 | * If you are not familiar with one or the other. I'll just give you a quick intro. 85 | * 86 | * If we had two functions `add` and `subtract` they would be written like this: 87 | * 88 | * LISP C 89 | * 90 | * 2 + 2 (add 2 2) add(2, 2) 91 | * 4 - 2 (subtract 4 2) subtract(4, 2) 92 | * 2 + (4 - 2) (add 2 (subtract 4 2)) add(2, subtract(4, 2)) 93 | * 94 | * Easy peezy right? 95 | * 96 | * Well good, because this is exactly what we are going to compile. While this 97 | * is neither a complete LISP or C syntax, it will be enough of the syntax to 98 | * demonstrate many of the major pieces of a modern compiler. 99 | */ 100 | 101 | /** 102 | * Most compilers break down into three primary stages: Parsing, Transformation, 103 | * and Code Generation 104 | * 105 | * 1. *Parsing* is taking raw code and turning it into a more abstract 106 | * representation of the code. 107 | * 108 | * 2. *Transformation* takes this abstract representation and manipulates to do 109 | * whatever the compiler wants it to. 110 | * 111 | * 3. *Code Generation* takes the transformed representation of the code and 112 | * turns it into new code. 113 | */ 114 | 115 | /** 116 | * Parsing 117 | * ------- 118 | * 119 | * Parsing typically gets broken down into two phases: Lexical Analysis and 120 | * Syntactic Analysis. 121 | * 122 | * 1. *Lexical Analysis* takes the raw code and splits it apart into these things 123 | * called tokens by a thing called a tokenizer (or lexer). 124 | * 125 | * Tokens are an array of tiny little objects that describe an isolated piece 126 | * of the syntax. They could be numbers, labels, punctuation, operators, 127 | * whatever. 128 | * 129 | * 2. *Syntactic Analysis* takes the tokens and reformats them into a 130 | * representation that describes each part of the syntax and their relation 131 | * to one another. This is known as an intermediate representation or 132 | * Abstract Syntax Tree. 133 | * 134 | * An Abstract Syntax Tree, or AST for short, is a deeply nested object that 135 | * represents code in a way that is both easy to work with and tells us a lot 136 | * of information. 137 | * 138 | * For the following syntax: 139 | * 140 | * (add 2 (subtract 4 2)) 141 | * 142 | * Tokens might look something like this: 143 | * 144 | * [ 145 | * { type: 'paren', value: '(' }, 146 | * { type: 'name', value: 'add' }, 147 | * { type: 'number', value: '2' }, 148 | * { type: 'paren', value: '(' }, 149 | * { type: 'name', value: 'subtract' }, 150 | * { type: 'number', value: '4' }, 151 | * { type: 'number', value: '2' }, 152 | * { type: 'paren', value: ')' }, 153 | * { type: 'paren', value: ')' } 154 | * ] 155 | * 156 | * And an Abstract Syntax Tree (AST) might look like this: 157 | * 158 | * { 159 | * type: 'Program', 160 | * body: [{ 161 | * type: 'CallExpression', 162 | * name: 'add', 163 | * params: [{ 164 | * type: 'NumberLiteral', 165 | * value: '2' 166 | * }, { 167 | * type: 'CallExpression', 168 | * name: 'subtract', 169 | * params: [{ 170 | * type: 'NumberLiteral', 171 | * value: '4' 172 | * }, { 173 | * type: 'NumberLiteral', 174 | * value: '2' 175 | * }] 176 | * }] 177 | * }] 178 | * } 179 | */ 180 | 181 | /** 182 | * Transformation 183 | * -------------- 184 | * 185 | * The next type of stage for a compiler is transformation. Again, this just 186 | * takes the AST from the last step and makes changes to it. It can manipulate 187 | * the AST in the same language or it can translate it into an entirely new 188 | * language. 189 | * 190 | * Let’s look at how we would transform an AST. 191 | * 192 | * You might notice that our AST has elements within it that look very similar. 193 | * There are these objects with a type property. Each of these are known as an 194 | * AST Node. These nodes have defined properties on them that describe one 195 | * isolated part of the tree. 196 | * 197 | * We can have a node for a "NumberLiteral": 198 | * 199 | * { 200 | * type: 'NumberLiteral', 201 | * value: '2' 202 | * } 203 | * 204 | * Or maybe a node for a "CallExpression": 205 | * 206 | * { 207 | * type: 'CallExpression', 208 | * name: 'subtract', 209 | * params: [...nested nodes go here...] 210 | * } 211 | * 212 | * When transforming the AST we can manipulate nodes by 213 | * adding/removing/replacing properties, we can add new nodes, remove nodes, or 214 | * we could leave the existing AST alone and create an entirely new one based 215 | * on it. 216 | * 217 | * Since we’re targeting a new language, we’re going to focus on creating an 218 | * entirely new AST that is specific to the target language. 219 | * 220 | * Traversal 221 | * --------- 222 | * 223 | * In order to navigate through all of these nodes, we need to be able to 224 | * traverse through them. This traversal process goes to each node in the AST 225 | * depth-first. 226 | * 227 | * { 228 | * type: 'Program', 229 | * body: [{ 230 | * type: 'CallExpression', 231 | * name: 'add', 232 | * params: [{ 233 | * type: 'NumberLiteral', 234 | * value: '2' 235 | * }, { 236 | * type: 'CallExpression', 237 | * name: 'subtract', 238 | * params: [{ 239 | * type: 'NumberLiteral', 240 | * value: '4' 241 | * }, { 242 | * type: 'NumberLiteral', 243 | * value: '2' 244 | * }] 245 | * }] 246 | * }] 247 | * } 248 | * 249 | * So for the above AST we would go: 250 | * 251 | * 1. Program - Starting at the top level of the AST 252 | * 2. CallExpression (add) - Moving to the first element of the Program's body 253 | * 3. NumberLiteral (2) - Moving to the first element of CallExpression's params 254 | * 4. CallExpression (subtract) - Moving to the second element of CallExpression's params 255 | * 5. NumberLiteral (4) - Moving to the first element of CallExpression's params 256 | * 6. NumberLiteral (2) - Moving to the second element of CallExpression's params 257 | * 258 | * If we were manipulating this AST directly, instead of creating a separate AST, 259 | * we would likely introduce all sorts of abstractions here. But just visiting 260 | * each node in the tree is enough. 261 | * 262 | * The reason I use the word “visiting” is because there is this pattern of how 263 | * to represent operations on elements of an object structure. 264 | * 265 | * Visitors 266 | * -------- 267 | * 268 | * The basic idea here is that we are going to create a “visitor” object that 269 | * has methods that will accept different node types. 270 | * 271 | * var visitor = { 272 | * NumberLiteral() {}, 273 | * CallExpression() {} 274 | * }; 275 | * 276 | * When we traverse our AST we will call the methods on this visitor whenever we 277 | * encounter a node of a matching type. 278 | * 279 | * In order to make this useful we will also pass the node and a reference to 280 | * the parent node. 281 | * 282 | * var visitor = { 283 | * NumberLiteral(node, parent) {}, 284 | * CallExpression(node, parent) {} 285 | * }; 286 | */ 287 | 288 | /** 289 | * Code Generation 290 | * --------------- 291 | * 292 | * The final phase of a compiler is code generation. Sometimes compilers will do 293 | * things that overlap with transformation, but for the most part code 294 | * generation just means take our AST and string-ify code back out. 295 | * 296 | * Code generators work several different ways, some compilers will reuse the 297 | * tokens from earlier, others will have created a separate representation of 298 | * the code so that they can print node linearly, but from what I can tell most 299 | * will use the same AST we just created, which is what we’re going to focus on. 300 | * 301 | * Effectively our code generator will know how to “print” all of the different 302 | * node types of the AST, and it will recursively call itself to print nested 303 | * nodes until everything is printed into one long string of code. 304 | */ 305 | 306 | /** 307 | * And that's it! That's all the different pieces of a compiler. 308 | * 309 | * Now that isn’t to say every compiler looks exactly like I described here. 310 | * Compilers serve many different purposes, and they might need more steps than 311 | * I have detailed. 312 | * 313 | * But now you should have a general high-level idea of what most compilers look 314 | * like. 315 | * 316 | * Now that I’ve explained all of this, you’re all good to go write your own 317 | * compilers right? 318 | * 319 | * Just kidding, that's what I'm here to help with :P 320 | * 321 | * So let's begin... 322 | */ 323 | 324 | /** 325 | * ============================================================================ 326 | * (/^▽^)/ 327 | * THE TOKENIZER! 328 | * ============================================================================ 329 | */ 330 | 331 | /** 332 | * We're gonna start off with our first phase of parsing, lexical analysis, with 333 | * the tokenizer. 334 | * 335 | * We're just going to take our string of code and break it down into an array 336 | * of tokens. 337 | * 338 | * (add 2 (subtract 4 2)) => [{ type: 'paren', value: '(' }, ...] 339 | */ 340 | 341 | // We start by accepting an input string of code, and we're gonna set up two 342 | // things... 343 | function tokenizer(input) { 344 | 345 | // A `current` variable for tracking our position in the code like a cursor. 346 | var current = 0; 347 | 348 | // And a `tokens` array for pushing our tokens to. 349 | var tokens = []; 350 | 351 | // We start by creating a `while` loop where we are setting up our `current` 352 | // variable to be incremented as much as we want `inside` the loop. 353 | // 354 | // We do this because we may want to increment `current` many times within a 355 | // single loop because our tokens can be any length. 356 | while (current < input.length) { 357 | 358 | // We're also going to store the `current` character in the `input`. 359 | var char = input[current]; 360 | 361 | // The first thing we want to check for is an open parenthesis. This will 362 | // later be used for `CallExpressions` but for now we only care about the 363 | // character. 364 | // 365 | // We check to see if we have an open parenthesis: 366 | if (char === '(') { 367 | 368 | // If we do, we push a new token with the type `paren` and set the value 369 | // to an open parenthesis. 370 | tokens.push({ 371 | type: 'paren', 372 | value: '(' 373 | }); 374 | 375 | // Then we increment `current` 376 | current++; 377 | 378 | // And we `continue` onto the next cycle of the loop. 379 | continue; 380 | } 381 | 382 | // Next we're going to check for a closing parenthesis. We do the same exact 383 | // thing as before: Check for a closing parenthesis, add a new token, 384 | // increment `current`, and `continue`. 385 | if (char === ')') { 386 | tokens.push({ 387 | type: 'paren', 388 | value: ')' 389 | }); 390 | current++; 391 | continue; 392 | } 393 | 394 | // Moving on, we're now going to check for whitespace. This is interesting 395 | // because we care that whitespace exists to separate characters, but it 396 | // isn't actually important for us to store as a token. We would only throw 397 | // it out later. 398 | // 399 | // So here we're just going to test for existence and if it does exist we're 400 | // going to just `continue` on. 401 | var WHITESPACE = /\s/; 402 | if (WHITESPACE.test(char)) { 403 | current++; 404 | continue; 405 | } 406 | 407 | // The next type of token is a number. This is different than what we have 408 | // seen before because a number could be any number of characters and we 409 | // want to capture the entire sequence of characters as one token. 410 | // 411 | // (add 123 456) 412 | // ^^^ ^^^ 413 | // Only two separate tokens 414 | // 415 | // So we start this off when we encounter the first number in a sequence. 416 | var NUMBERS = /[0-9]/; 417 | if (NUMBERS.test(char)) { 418 | 419 | // We're going to create a `value` string that we are going to push 420 | // characters to. 421 | var value = ''; 422 | 423 | // Then we're going to loop through each character in the sequence until 424 | // we encounter a character that is not a number, pushing each character 425 | // that is a number to our `value` and incrementing `current` as we go. 426 | while (NUMBERS.test(char)) { 427 | value += char; 428 | char = input[++current]; 429 | } 430 | 431 | // After that we push our `number` token to the `tokens` array. 432 | tokens.push({ 433 | type: 'number', 434 | value: value 435 | }); 436 | 437 | // And we continue on. 438 | continue; 439 | } 440 | 441 | // The last type of token will be a `name` token. This is a sequence of 442 | // letters instead of numbers, that are the names of functions in our lisp 443 | // syntax. 444 | // 445 | // (add 2 4) 446 | // ^^^ 447 | // Name token 448 | // 449 | var LETTERS = /[a-z]/i; 450 | if (LETTERS.test(char)) { 451 | var value = ''; 452 | 453 | // Again we're just going to loop through all the letters pushing them to 454 | // a value. 455 | while (LETTERS.test(char)) { 456 | value += char; 457 | char = input[++current]; 458 | } 459 | 460 | // And pushing that value as a token with the type `name` and continuing. 461 | tokens.push({ 462 | type: 'name', 463 | value: value 464 | }); 465 | 466 | continue; 467 | } 468 | 469 | // Finally if we have not matched a character by now, we're going to throw 470 | // an error and completely exit. 471 | throw new TypeError('I dont know what this character is: ' + char); 472 | } 473 | 474 | // Then at the end of our `tokenizer` we simply return the tokens array. 475 | return tokens; 476 | } 477 | 478 | /** 479 | * ============================================================================ 480 | * ヽ/❀o ل͜ o\ノ 481 | * THE PARSER!!! 482 | * ============================================================================ 483 | */ 484 | 485 | /** 486 | * For our parser we're going to take our array of tokens and turn it into an 487 | * AST. 488 | * 489 | * [{ type: 'paren', value: '(' }, ...] => { type: 'Program', body: [...] } 490 | */ 491 | 492 | // Okay, so we define a `parser` function that accepts our array of `tokens`. 493 | function parser(tokens) { 494 | 495 | // Again we keep a `current` variable that we will use as a cursor. 496 | var current = 0; 497 | 498 | // But this time we're going to use recursion instead of a `while` loop. So we 499 | // define a `walk` function. 500 | function walk() { 501 | 502 | // Inside the walk function we start by grabbing the `current` token. 503 | var token = tokens[current]; 504 | 505 | // We're going to split each type of token off into a different code path, 506 | // starting off with `number` tokens. 507 | // 508 | // We test to see if we have a `number` token. 509 | if (token.type === 'number') { 510 | 511 | // If we have one, we'll increment `current`. 512 | current++; 513 | 514 | // And we'll return a new AST node called `NumberLiteral` and setting its 515 | // value to the value of our token. 516 | return { 517 | type: 'NumberLiteral', 518 | value: token.value 519 | }; 520 | } 521 | 522 | // Next we're going to look for CallExpressions. We start this off when we 523 | // encounter an open parenthesis. 524 | if ( 525 | token.type === 'paren' && 526 | token.value === '(' 527 | ) { 528 | 529 | // We'll increment `current` to skip the parenthesis since we don't care 530 | // about it in our AST. 531 | token = tokens[++current]; 532 | 533 | // We create a base node with the type `CallExpression`, and we're going 534 | // to set the name as the current token's value since the next token after 535 | // the open parenthesis is the name of the function. 536 | var node = { 537 | type: 'CallExpression', 538 | name: token.value, 539 | params: [] 540 | }; 541 | 542 | // We increment `current` *again* to skip the name token. 543 | token = tokens[++current]; 544 | 545 | // And now we want to loop through each token that will be the `params` of 546 | // our `CallExpression` until we encounter a closing parenthesis. 547 | // 548 | // Now this is where recursion comes in. Instead of trying to parse a 549 | // potentially infinitely nested set of nodes we're going to rely on 550 | // recursion to resolve things. 551 | // 552 | // To explain this, let's take our Lisp code. You can see that the 553 | // parameters of the `add` are a number and a nested `CallExpression` that 554 | // includes its own numbers. 555 | // 556 | // (add 2 (subtract 4 2)) 557 | // 558 | // You'll also notice that in our tokens array we have multiple closing 559 | // parentheses. 560 | // 561 | // [ 562 | // { type: 'paren', value: '(' }, 563 | // { type: 'name', value: 'add' }, 564 | // { type: 'number', value: '2' }, 565 | // { type: 'paren', value: '(' }, 566 | // { type: 'name', value: 'subtract' }, 567 | // { type: 'number', value: '4' }, 568 | // { type: 'number', value: '2' }, 569 | // { type: 'paren', value: ')' }, <<< Closing parenthesis 570 | // { type: 'paren', value: ')' } <<< Closing parenthesis 571 | // ] 572 | // 573 | // We're going to rely on the nested `walk` function to increment our 574 | // `current` variable past any nested `CallExpressions`. 575 | 576 | // So we create a `while` loop that will continue until it encounters a 577 | // token with a `type` of `'paren'` and a `value` of a closing 578 | // parenthesis. 579 | while ( 580 | (token.type !== 'paren') || 581 | (token.type === 'paren' && token.value !== ')') 582 | ) { 583 | // we'll call the `walk` function which will return a `node` and we'll 584 | // push it into our `node.params`. 585 | node.params.push(walk()); 586 | token = tokens[current]; 587 | } 588 | 589 | // Finally we will increment `current` one last time to skip the closing 590 | // parenthesis. 591 | current++; 592 | 593 | // And return the node. 594 | return node; 595 | } 596 | 597 | // Again, if we haven't recognized the token type by now we're going to 598 | // throw an error. 599 | throw new TypeError(token.type); 600 | } 601 | 602 | // Now, we're going to create our AST which will have a root which is a 603 | // `Program` node. 604 | var ast = { 605 | type: 'Program', 606 | body: [] 607 | }; 608 | 609 | // And we're going to kickstart our `walk` function, pushing nodes to our 610 | // `ast.body` array. 611 | // 612 | // The reason we are doing this inside a loop is because our program can have 613 | // `CallExpressions` after one another instead of being nested. 614 | // 615 | // (add 2 2) 616 | // (subtract 4 2) 617 | // 618 | while (current < tokens.length) { 619 | ast.body.push(walk()); 620 | } 621 | 622 | // At the end of our parser we'll return the AST. 623 | return ast; 624 | } 625 | 626 | /** 627 | * ============================================================================ 628 | * ⌒(❀>◞౪◟<❀)⌒ 629 | * THE TRAVERSER!!! 630 | * ============================================================================ 631 | */ 632 | 633 | /** 634 | * So now we have our AST, and we want to be able to visit different nodes with 635 | * a visitor. We need to be able to call the methods on the visitor whenever we 636 | * encounter a node with a matching type. 637 | * 638 | * traverse(ast, { 639 | * Program(node, parent) { 640 | * // ... 641 | * }, 642 | * 643 | * CallExpression(node, parent) { 644 | * // ... 645 | * }, 646 | * 647 | * NumberLiteral(node, parent) { 648 | * // ... 649 | * } 650 | * }); 651 | */ 652 | 653 | // So we define a traverser function which accepts an AST and a 654 | // visitor. Inside we're going to define two functions... 655 | function traverser(ast, visitor) { 656 | 657 | // A `traverseArray` function that will allow us to iterate over an array and 658 | // call the next function that we will define: `traverseNode`. 659 | function traverseArray(array, parent) { 660 | array.forEach(function(child) { 661 | traverseNode(child, parent); 662 | }); 663 | } 664 | 665 | // `traverseNode` will accept a `node` and its `parent` node. So that it can 666 | // pass both to our visitor methods. 667 | function traverseNode(node, parent) { 668 | 669 | // We start by testing for the existence of a method on the visitor with a 670 | // matching `type`. 671 | var method = visitor[node.type]; 672 | 673 | // If it exists we'll call it with the `node` and its `parent`. 674 | if (method) { 675 | method(node, parent); 676 | } 677 | 678 | // Next we are going to split things up by the current node type. 679 | switch (node.type) { 680 | 681 | // We'll start with our top level `Program`. Since Program nodes have a 682 | // property named body that has an array of nodes, we will call 683 | // `traverseArray` to traverse down into them. 684 | // 685 | // (Remember that `traverseArray` will in turn call `traverseNode` so we 686 | // are causing the tree to be traversed recursively) 687 | case 'Program': 688 | traverseArray(node.body, node); 689 | break; 690 | 691 | // Next we do the same with `CallExpressions` and traverse their `params`. 692 | case 'CallExpression': 693 | traverseArray(node.params, node); 694 | break; 695 | 696 | // In the case of `NumberLiterals` we don't have any child nodes to visit, 697 | // so we'll just break. 698 | case 'NumberLiteral': 699 | break; 700 | 701 | // And again, if we haven't recognized the node type then we'll throw an 702 | // error. 703 | default: 704 | throw new TypeError(node.type); 705 | } 706 | } 707 | 708 | // Finally we kickstart the traverser by calling `traverseNode` with our ast 709 | // with no `parent` because the top level of the AST doesn't have a parent. 710 | traverseNode(ast, null); 711 | } 712 | 713 | /** 714 | * ============================================================================ 715 | * ⁽(◍˃̵͈̑ᴗ˂̵͈̑)⁽ 716 | * THE TRANSFORMER!!! 717 | * ============================================================================ 718 | */ 719 | 720 | /** 721 | * Next up, the transformer. Our transformer is going to take the AST that we 722 | * have built and pass it to our traverser function with a visitor and will 723 | * create a new ast. 724 | * 725 | * ---------------------------------------------------------------------------- 726 | * Original AST | Transformed AST 727 | * ---------------------------------------------------------------------------- 728 | * { | { 729 | * type: 'Program', | type: 'Program', 730 | * body: [{ | body: [{ 731 | * type: 'CallExpression', | type: 'ExpressionStatement', 732 | * name: 'add', | expression: { 733 | * params: [{ | type: 'CallExpression', 734 | * type: 'NumberLiteral', | callee: { 735 | * value: '2' | type: 'Identifier', 736 | * }, { | name: 'add' 737 | * type: 'CallExpression', | }, 738 | * name: 'subtract', | arguments: [{ 739 | * params: [{ | type: 'NumberLiteral', 740 | * type: 'NumberLiteral', | value: '2' 741 | * value: '4' | }, { 742 | * }, { | type: 'CallExpression', 743 | * type: 'NumberLiteral', | callee: { 744 | * value: '2' | type: 'Identifier', 745 | * }] | name: 'subtract' 746 | * }] | }, 747 | * }] | arguments: [{ 748 | * } | type: 'NumberLiteral', 749 | * | value: '4' 750 | * ---------------------------------- | }, { 751 | * | type: 'NumberLiteral', 752 | * | value: '2' 753 | * | }] 754 | * (sorry the other one is longer.) | }] 755 | * | } 756 | * | }] 757 | * | } 758 | * ---------------------------------------------------------------------------- 759 | */ 760 | 761 | // So we have our transformer function which will accept the lisp ast. 762 | function transformer(ast) { 763 | 764 | // We'll create a `newAst` which like our previous AST will have a program 765 | // node. 766 | var newAst = { 767 | type: 'Program', 768 | body: [] 769 | }; 770 | 771 | // Next I'm going to cheat a little and create a bit of a hack. We're going to 772 | // use a property named `context` on our parent nodes that we're going to use 773 | // to push nodes to their parents' `context`'s. Normally you would have a 774 | // better abstraction than this, but for our purposes this keeps things 775 | // simple. 776 | // 777 | // Just take note that the context is a reference *from* the old ast *to* the 778 | // new ast. 779 | ast._context = newAst.body; 780 | 781 | // We'll start by calling the traverser function with our ast and a visitor. 782 | traverser(ast, { 783 | 784 | // The first visitor method accepts `NumberLiterals` 785 | NumberLiteral: function(node, parent) { 786 | // We'll create a new node also named `NumberLiteral` that we will push to 787 | // the parent context. 788 | parent._context.push({ 789 | type: 'NumberLiteral', 790 | value: node.value 791 | }); 792 | }, 793 | 794 | // Next up, `CallExpressions`. 795 | CallExpression: function(node, parent) { 796 | 797 | // We start creating a new node `CallExpression` with a nested 798 | // `Identifier`. 799 | var expression = { 800 | type: 'CallExpression', 801 | callee: { 802 | type: 'Identifier', 803 | name: node.name 804 | }, 805 | arguments: [] 806 | }; 807 | 808 | // Next we're going to define a new context on the original 809 | // `CallExpression` node that will reference the `expression`'s arguments 810 | // so that we can push arguments. 811 | node._context = expression.arguments; 812 | 813 | // Then we're going to check if the parent node is a `CallExpression`. 814 | // If it is not... 815 | if (parent.type !== 'CallExpression') { 816 | 817 | // We're going to wrap our `CallExpression` node with an 818 | // `ExpressionStatement`. We do this because the top level 819 | // `CallExpressions` in JavaScript are actually statements. 820 | expression = { 821 | type: 'ExpressionStatement', 822 | expression: expression 823 | }; 824 | } 825 | 826 | // Last, we push our (possibly wrapped) `CallExpression` to the `parent`'s 827 | // `context`. 828 | parent._context.push(expression); 829 | } 830 | }); 831 | 832 | // At the end of our transformer function we'll return the new ast that we 833 | // just created. 834 | return newAst; 835 | } 836 | 837 | /** 838 | * ============================================================================ 839 | * ヾ(〃^∇^)ノ♪ 840 | * THE CODE GENERATOR!!!! 841 | * ============================================================================ 842 | */ 843 | 844 | /** 845 | * Now let's move on to our last phase: The Code Generator. 846 | * 847 | * Our code generator is going to recursively call itself to print each node in 848 | * the tree into one giant string. 849 | */ 850 | 851 | function codeGenerator(node) { 852 | 853 | // We'll break things down by the `type` of the `node`. 854 | switch (node.type) { 855 | 856 | // If we have a `Program` node. We will map through each node in the `body` 857 | // and run them through the code generator and join them with a newline. 858 | case 'Program': 859 | return node.body.map(codeGenerator) 860 | .join('\n'); 861 | 862 | // For `ExpressionStatements` we'll call the code generator on the nested 863 | // expression and we'll add a semicolon... 864 | case 'ExpressionStatement': 865 | return ( 866 | codeGenerator(node.expression) + 867 | ';' // << (...because we like to code the *correct* way) 868 | ); 869 | 870 | // For `CallExpressions` we will print the `callee`, add an open 871 | // parenthesis, we'll map through each node in the `arguments` array and run 872 | // them through the code generator, joining them with a comma, and then 873 | // we'll add a closing parenthesis. 874 | case 'CallExpression': 875 | return ( 876 | codeGenerator(node.callee) + 877 | '(' + 878 | node.arguments.map(codeGenerator) 879 | .join(', ') + 880 | ')' 881 | ); 882 | 883 | // For `Identifiers` we'll just return the `node`'s name. 884 | case 'Identifier': 885 | return node.name; 886 | 887 | // For `NumberLiterals` we'll just return the `node`'s value. 888 | case 'NumberLiteral': 889 | return node.value; 890 | 891 | // And if we haven't recognized the node, we'll throw an error. 892 | default: 893 | throw new TypeError(node.type); 894 | } 895 | } 896 | 897 | /** 898 | * ============================================================================ 899 | * (۶* ‘ヮ’)۶” 900 | * !!!!!!!!THE COMPILER!!!!!!!! 901 | * ============================================================================ 902 | */ 903 | 904 | /** 905 | * FINALLY! We'll create our `compiler` function. Here we will link together 906 | * every part of the pipeline. 907 | * 908 | * 1. input => tokenizer => tokens 909 | * 2. tokens => parser => ast 910 | * 3. ast => transformer => newAst 911 | * 4. newAst => generator => output 912 | */ 913 | 914 | function compiler(input) { 915 | var tokens = tokenizer(input); 916 | var ast = parser(tokens); 917 | var newAst = transformer(ast); 918 | var output = codeGenerator(newAst); 919 | 920 | // and simply return the output! 921 | return output; 922 | } 923 | 924 | /** 925 | * ============================================================================ 926 | * (๑˃̵ᴗ˂̵)و 927 | * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!YOU MADE IT!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 928 | * ============================================================================ 929 | */ 930 | 931 | // Now I'm just exporting everything... 932 | module.exports = { 933 | tokenizer: tokenizer, 934 | parser: parser, 935 | transformer: transformer, 936 | codeGenerator: codeGenerator, 937 | compiler: compiler 938 | }; 939 | -------------------------------------------------------------------------------- /test.js: -------------------------------------------------------------------------------- 1 | var superTinyCompiler = require('./super-tiny-compiler'); 2 | var assert = require('assert'); 3 | 4 | var tokenizer = superTinyCompiler.tokenizer; 5 | var parser = superTinyCompiler.parser; 6 | var transformer = superTinyCompiler.transformer; 7 | var codeGenerator = superTinyCompiler.codeGenerator; 8 | var compiler = superTinyCompiler.compiler; 9 | 10 | var input = '(add 2 (subtract 4 2))'; 11 | var output = 'add(2, subtract(4, 2));'; 12 | 13 | var tokens = [ 14 | { type: 'paren', value: '(' }, 15 | { type: 'name', value: 'add' }, 16 | { type: 'number', value: '2' }, 17 | { type: 'paren', value: '(' }, 18 | { type: 'name', value: 'subtract' }, 19 | { type: 'number', value: '4' }, 20 | { type: 'number', value: '2' }, 21 | { type: 'paren', value: ')' }, 22 | { type: 'paren', value: ')' } 23 | ]; 24 | 25 | var ast = { 26 | type: 'Program', 27 | body: [{ 28 | type: 'CallExpression', 29 | name: 'add', 30 | params: [{ 31 | type: 'NumberLiteral', 32 | value: '2' 33 | }, { 34 | type: 'CallExpression', 35 | name: 'subtract', 36 | params: [{ 37 | type: 'NumberLiteral', 38 | value: '4' 39 | }, { 40 | type: 'NumberLiteral', 41 | value: '2' 42 | }] 43 | }] 44 | }] 45 | }; 46 | 47 | var newAst = { 48 | type: 'Program', 49 | body: [{ 50 | type: 'ExpressionStatement', 51 | expression: { 52 | type: 'CallExpression', 53 | callee: { 54 | type: 'Identifier', 55 | name: 'add' 56 | }, 57 | arguments: [{ 58 | type: 'NumberLiteral', 59 | value: '2' 60 | }, { 61 | type: 'CallExpression', 62 | callee: { 63 | type: 'Identifier', 64 | name: 'subtract' 65 | }, 66 | arguments: [{ 67 | type: 'NumberLiteral', 68 | value: '4' 69 | }, { 70 | type: 'NumberLiteral', 71 | value: '2' 72 | }] 73 | }] 74 | } 75 | }] 76 | }; 77 | 78 | assert.deepStrictEqual(tokenizer(input), tokens, 'Tokenizer should turn `input` string into `tokens` array'); 79 | assert.deepStrictEqual(parser(tokens), ast, 'Parser should turn `tokens` array into `ast`'); 80 | assert.deepStrictEqual(transformer(ast), newAst, 'Transformer should turn `ast` into a `newAst`'); 81 | assert.deepStrictEqual(codeGenerator(newAst), output, 'Code Generator should turn `newAst` into `output` string'); 82 | assert.deepStrictEqual(compiler(input), output, 'Compiler should turn `input` into `output`'); 83 | 84 | console.log('All Passed!'); 85 | --------------------------------------------------------------------------------