├── .gitignore
├── LICENSE
├── README.md
├── h.c
├── hello.c
├── makefile
├── xc-tutor.c
└── xc.c
/.gitignore:
--------------------------------------------------------------------------------
1 | # Object files
2 | *.o
3 | *.ko
4 | *.obj
5 | *.elf
6 |
7 | # Precompiled Headers
8 | *.gch
9 | *.pch
10 |
11 | # Libraries
12 | *.lib
13 | *.a
14 | *.la
15 | *.lo
16 |
17 | # Shared objects (inc. Windows DLLs)
18 | *.dll
19 | *.so
20 | *.so.*
21 | *.dylib
22 |
23 | # Executables
24 | *.exe
25 | *.out
26 | *.app
27 | *.i*86
28 | *.x86_64
29 | *.hex
30 |
31 | # Debug files
32 | *.dSYM/
33 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU GENERAL PUBLIC LICENSE
2 | Version 2, June 1991
3 |
4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
6 | Everyone is permitted to copy and distribute verbatim copies
7 | of this license document, but changing it is not allowed.
8 |
9 | Preamble
10 |
11 | The licenses for most software are designed to take away your
12 | freedom to share and change it. By contrast, the GNU General Public
13 | License is intended to guarantee your freedom to share and change free
14 | software--to make sure the software is free for all its users. This
15 | General Public License applies to most of the Free Software
16 | Foundation's software and to any other program whose authors commit to
17 | using it. (Some other Free Software Foundation software is covered by
18 | the GNU Lesser General Public License instead.) You can apply it to
19 | your programs, too.
20 |
21 | When we speak of free software, we are referring to freedom, not
22 | price. Our General Public Licenses are designed to make sure that you
23 | have the freedom to distribute copies of free software (and charge for
24 | this service if you wish), that you receive source code or can get it
25 | if you want it, that you can change the software or use pieces of it
26 | in new free programs; and that you know you can do these things.
27 |
28 | To protect your rights, we need to make restrictions that forbid
29 | anyone to deny you these rights or to ask you to surrender the rights.
30 | These restrictions translate to certain responsibilities for you if you
31 | distribute copies of the software, or if you modify it.
32 |
33 | For example, if you distribute copies of such a program, whether
34 | gratis or for a fee, you must give the recipients all the rights that
35 | you have. You must make sure that they, too, receive or can get the
36 | source code. And you must show them these terms so they know their
37 | rights.
38 |
39 | We protect your rights with two steps: (1) copyright the software, and
40 | (2) offer you this license which gives you legal permission to copy,
41 | distribute and/or modify the software.
42 |
43 | Also, for each author's protection and ours, we want to make certain
44 | that everyone understands that there is no warranty for this free
45 | software. If the software is modified by someone else and passed on, we
46 | want its recipients to know that what they have is not the original, so
47 | that any problems introduced by others will not reflect on the original
48 | authors' reputations.
49 |
50 | Finally, any free program is threatened constantly by software
51 | patents. We wish to avoid the danger that redistributors of a free
52 | program will individually obtain patent licenses, in effect making the
53 | program proprietary. To prevent this, we have made it clear that any
54 | patent must be licensed for everyone's free use or not licensed at all.
55 |
56 | The precise terms and conditions for copying, distribution and
57 | modification follow.
58 |
59 | GNU GENERAL PUBLIC LICENSE
60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
61 |
62 | 0. This License applies to any program or other work which contains
63 | a notice placed by the copyright holder saying it may be distributed
64 | under the terms of this General Public License. The "Program", below,
65 | refers to any such program or work, and a "work based on the Program"
66 | means either the Program or any derivative work under copyright law:
67 | that is to say, a work containing the Program or a portion of it,
68 | either verbatim or with modifications and/or translated into another
69 | language. (Hereinafter, translation is included without limitation in
70 | the term "modification".) Each licensee is addressed as "you".
71 |
72 | Activities other than copying, distribution and modification are not
73 | covered by this License; they are outside its scope. The act of
74 | running the Program is not restricted, and the output from the Program
75 | is covered only if its contents constitute a work based on the
76 | Program (independent of having been made by running the Program).
77 | Whether that is true depends on what the Program does.
78 |
79 | 1. You may copy and distribute verbatim copies of the Program's
80 | source code as you receive it, in any medium, provided that you
81 | conspicuously and appropriately publish on each copy an appropriate
82 | copyright notice and disclaimer of warranty; keep intact all the
83 | notices that refer to this License and to the absence of any warranty;
84 | and give any other recipients of the Program a copy of this License
85 | along with the Program.
86 |
87 | You may charge a fee for the physical act of transferring a copy, and
88 | you may at your option offer warranty protection in exchange for a fee.
89 |
90 | 2. You may modify your copy or copies of the Program or any portion
91 | of it, thus forming a work based on the Program, and copy and
92 | distribute such modifications or work under the terms of Section 1
93 | above, provided that you also meet all of these conditions:
94 |
95 | a) You must cause the modified files to carry prominent notices
96 | stating that you changed the files and the date of any change.
97 |
98 | b) You must cause any work that you distribute or publish, that in
99 | whole or in part contains or is derived from the Program or any
100 | part thereof, to be licensed as a whole at no charge to all third
101 | parties under the terms of this License.
102 |
103 | c) If the modified program normally reads commands interactively
104 | when run, you must cause it, when started running for such
105 | interactive use in the most ordinary way, to print or display an
106 | announcement including an appropriate copyright notice and a
107 | notice that there is no warranty (or else, saying that you provide
108 | a warranty) and that users may redistribute the program under
109 | these conditions, and telling the user how to view a copy of this
110 | License. (Exception: if the Program itself is interactive but
111 | does not normally print such an announcement, your work based on
112 | the Program is not required to print an announcement.)
113 |
114 | These requirements apply to the modified work as a whole. If
115 | identifiable sections of that work are not derived from the Program,
116 | and can be reasonably considered independent and separate works in
117 | themselves, then this License, and its terms, do not apply to those
118 | sections when you distribute them as separate works. But when you
119 | distribute the same sections as part of a whole which is a work based
120 | on the Program, the distribution of the whole must be on the terms of
121 | this License, whose permissions for other licensees extend to the
122 | entire whole, and thus to each and every part regardless of who wrote it.
123 |
124 | Thus, it is not the intent of this section to claim rights or contest
125 | your rights to work written entirely by you; rather, the intent is to
126 | exercise the right to control the distribution of derivative or
127 | collective works based on the Program.
128 |
129 | In addition, mere aggregation of another work not based on the Program
130 | with the Program (or with a work based on the Program) on a volume of
131 | a storage or distribution medium does not bring the other work under
132 | the scope of this License.
133 |
134 | 3. You may copy and distribute the Program (or a work based on it,
135 | under Section 2) in object code or executable form under the terms of
136 | Sections 1 and 2 above provided that you also do one of the following:
137 |
138 | a) Accompany it with the complete corresponding machine-readable
139 | source code, which must be distributed under the terms of Sections
140 | 1 and 2 above on a medium customarily used for software interchange; or,
141 |
142 | b) Accompany it with a written offer, valid for at least three
143 | years, to give any third party, for a charge no more than your
144 | cost of physically performing source distribution, a complete
145 | machine-readable copy of the corresponding source code, to be
146 | distributed under the terms of Sections 1 and 2 above on a medium
147 | customarily used for software interchange; or,
148 |
149 | c) Accompany it with the information you received as to the offer
150 | to distribute corresponding source code. (This alternative is
151 | allowed only for noncommercial distribution and only if you
152 | received the program in object code or executable form with such
153 | an offer, in accord with Subsection b above.)
154 |
155 | The source code for a work means the preferred form of the work for
156 | making modifications to it. For an executable work, complete source
157 | code means all the source code for all modules it contains, plus any
158 | associated interface definition files, plus the scripts used to
159 | control compilation and installation of the executable. However, as a
160 | special exception, the source code distributed need not include
161 | anything that is normally distributed (in either source or binary
162 | form) with the major components (compiler, kernel, and so on) of the
163 | operating system on which the executable runs, unless that component
164 | itself accompanies the executable.
165 |
166 | If distribution of executable or object code is made by offering
167 | access to copy from a designated place, then offering equivalent
168 | access to copy the source code from the same place counts as
169 | distribution of the source code, even though third parties are not
170 | compelled to copy the source along with the object code.
171 |
172 | 4. You may not copy, modify, sublicense, or distribute the Program
173 | except as expressly provided under this License. Any attempt
174 | otherwise to copy, modify, sublicense or distribute the Program is
175 | void, and will automatically terminate your rights under this License.
176 | However, parties who have received copies, or rights, from you under
177 | this License will not have their licenses terminated so long as such
178 | parties remain in full compliance.
179 |
180 | 5. You are not required to accept this License, since you have not
181 | signed it. However, nothing else grants you permission to modify or
182 | distribute the Program or its derivative works. These actions are
183 | prohibited by law if you do not accept this License. Therefore, by
184 | modifying or distributing the Program (or any work based on the
185 | Program), you indicate your acceptance of this License to do so, and
186 | all its terms and conditions for copying, distributing or modifying
187 | the Program or works based on it.
188 |
189 | 6. Each time you redistribute the Program (or any work based on the
190 | Program), the recipient automatically receives a license from the
191 | original licensor to copy, distribute or modify the Program subject to
192 | these terms and conditions. You may not impose any further
193 | restrictions on the recipients' exercise of the rights granted herein.
194 | You are not responsible for enforcing compliance by third parties to
195 | this License.
196 |
197 | 7. If, as a consequence of a court judgment or allegation of patent
198 | infringement or for any other reason (not limited to patent issues),
199 | conditions are imposed on you (whether by court order, agreement or
200 | otherwise) that contradict the conditions of this License, they do not
201 | excuse you from the conditions of this License. If you cannot
202 | distribute so as to satisfy simultaneously your obligations under this
203 | License and any other pertinent obligations, then as a consequence you
204 | may not distribute the Program at all. For example, if a patent
205 | license would not permit royalty-free redistribution of the Program by
206 | all those who receive copies directly or indirectly through you, then
207 | the only way you could satisfy both it and this License would be to
208 | refrain entirely from distribution of the Program.
209 |
210 | If any portion of this section is held invalid or unenforceable under
211 | any particular circumstance, the balance of the section is intended to
212 | apply and the section as a whole is intended to apply in other
213 | circumstances.
214 |
215 | It is not the purpose of this section to induce you to infringe any
216 | patents or other property right claims or to contest validity of any
217 | such claims; this section has the sole purpose of protecting the
218 | integrity of the free software distribution system, which is
219 | implemented by public license practices. Many people have made
220 | generous contributions to the wide range of software distributed
221 | through that system in reliance on consistent application of that
222 | system; it is up to the author/donor to decide if he or she is willing
223 | to distribute software through any other system and a licensee cannot
224 | impose that choice.
225 |
226 | This section is intended to make thoroughly clear what is believed to
227 | be a consequence of the rest of this License.
228 |
229 | 8. If the distribution and/or use of the Program is restricted in
230 | certain countries either by patents or by copyrighted interfaces, the
231 | original copyright holder who places the Program under this License
232 | may add an explicit geographical distribution limitation excluding
233 | those countries, so that distribution is permitted only in or among
234 | countries not thus excluded. In such case, this License incorporates
235 | the limitation as if written in the body of this License.
236 |
237 | 9. The Free Software Foundation may publish revised and/or new versions
238 | of the General Public License from time to time. Such new versions will
239 | be similar in spirit to the present version, but may differ in detail to
240 | address new problems or concerns.
241 |
242 | Each version is given a distinguishing version number. If the Program
243 | specifies a version number of this License which applies to it and "any
244 | later version", you have the option of following the terms and conditions
245 | either of that version or of any later version published by the Free
246 | Software Foundation. If the Program does not specify a version number of
247 | this License, you may choose any version ever published by the Free Software
248 | Foundation.
249 |
250 | 10. If you wish to incorporate parts of the Program into other free
251 | programs whose distribution conditions are different, write to the author
252 | to ask for permission. For software which is copyrighted by the Free
253 | Software Foundation, write to the Free Software Foundation; we sometimes
254 | make exceptions for this. Our decision will be guided by the two goals
255 | of preserving the free status of all derivatives of our free software and
256 | of promoting the sharing and reuse of software generally.
257 |
258 | NO WARRANTY
259 |
260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
268 | REPAIR OR CORRECTION.
269 |
270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
278 | POSSIBILITY OF SUCH DAMAGES.
279 |
280 | END OF TERMS AND CONDITIONS
281 |
282 | How to Apply These Terms to Your New Programs
283 |
284 | If you develop a new program, and you want it to be of the greatest
285 | possible use to the public, the best way to achieve this is to make it
286 | free software which everyone can redistribute and change under these terms.
287 |
288 | To do so, attach the following notices to the program. It is safest
289 | to attach them to the start of each source file to most effectively
290 | convey the exclusion of warranty; and each file should have at least
291 | the "copyright" line and a pointer to where the full notice is found.
292 |
293 | {description}
294 | Copyright (C) {year} {fullname}
295 |
296 | This program is free software; you can redistribute it and/or modify
297 | it under the terms of the GNU General Public License as published by
298 | the Free Software Foundation; either version 2 of the License, or
299 | (at your option) any later version.
300 |
301 | This program is distributed in the hope that it will be useful,
302 | but WITHOUT ANY WARRANTY; without even the implied warranty of
303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
304 | GNU General Public License for more details.
305 |
306 | You should have received a copy of the GNU General Public License along
307 | with this program; if not, write to the Free Software Foundation, Inc.,
308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
309 |
310 | Also add information on how to contact you by electronic and paper mail.
311 |
312 | If the program is interactive, make it output a short notice like this
313 | when it starts in an interactive mode:
314 |
315 | Gnomovision version 69, Copyright (C) year name of author
316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
317 | This is free software, and you are welcome to redistribute it
318 | under certain conditions; type `show c' for details.
319 |
320 | The hypothetical commands `show w' and `show c' should show the appropriate
321 | parts of the General Public License. Of course, the commands you use may
322 | be called something other than `show w' and `show c'; they could even be
323 | mouse-clicks or menu items--whatever suits your program.
324 |
325 | You should also get your employer (if you work as a programmer) or your
326 | school, if any, to sign a "copyright disclaimer" for the program, if
327 | necessary. Here is a sample; alter the names:
328 |
329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program
330 | `Gnomovision' (which makes passes at compilers) written by James Hacker.
331 |
332 | {signature of Ty Coon}, 1 April 1989
333 | Ty Coon, President of Vice
334 |
335 | This General Public License does not permit incorporating your program into
336 | proprietary programs. If your program is a subroutine library, you may
337 | consider it more useful to permit linking proprietary applications with the
338 | library. If this is what you want to do, use the GNU Lesser General
339 | Public License instead of this License.
340 |
341 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | A C interpreter that interprets itself.
2 |
3 | # How to Run the Code
4 |
5 | File `xc.c` is the original implementation, and `xc-tutor.c` is the version
6 | that I built step by step for the tutorial.
7 |
8 | ```
9 | gcc -o xc xc.c (you may need the -m32 option on 64bit machines)
10 | ./xc hello.c
11 | ./xc -s hello.c
12 |
13 | ./xc xc.c hello.c
14 | ./xc xc.c xc.c hello.c
15 | ```
16 |
17 | # About
18 |
19 | This project is inspired by [c4](https://github.com/rswier/c4) and is largely
20 | based on it.
21 |
22 | However, I rewrote all of it to make it more understandable and to help myself
23 | understand it.
24 |
25 | Despite the complexity we see in books about compiler design, writing one is
26 | not that hard. You don't need that much theory, though it does help you
27 | better understand the logic behind the code.
28 |
29 | I also wrote a series of articles about how this compiler is built (in Chinese, though):
30 |
31 | 1. [手把手教你构建 C 语言编译器(0)——前言](http://lotabout.me/2015/write-a-C-interpreter-0/)
32 | 2. [手把手教你构建 C 语言编译器(1)——设计](http://lotabout.me/2015/write-a-C-interpreter-1/)
33 | 3. [手把手教你构建 C 语言编译器(2)——虚拟机](http://lotabout.me/2015/write-a-C-interpreter-2/)
34 | 4. [手把手教你构建 C 语言编译器(3)——词法分析器](http://lotabout.me/2015/write-a-C-interpreter-3/)
35 | 5. [手把手教你构建 C 语言编译器(4)——递归下降](http://lotabout.me/2016/write-a-C-interpreter-4/)
36 | 6. [手把手教你构建 C 语言编译器(5)——变量定义](http://lotabout.me/2016/write-a-C-interpreter-5/)
37 | 7. [手把手教你构建 C 语言编译器(6)——函数定义](http://lotabout.me/2016/write-a-C-interpreter-6/)
38 | 8. [手把手教你构建 C 语言编译器(7)——语句](http://lotabout.me/2016/write-a-C-interpreter-7/)
39 | 9. [手把手教你构建 C 语言编译器(8)——表达式](http://lotabout.me/2016/write-a-C-interpreter-8/)
40 | 10. [手把手教你构建 C 语言编译器(9)——总结](http://lotabout.me/2016/write-a-C-interpreter-9/)
41 |
42 | # Resources
43 |
44 | Further Reading:
45 |
46 | - [Let's Build a Compiler](http://compilers.iecc.com/crenshaw/): An excellent
47 | starting point for building a compiler.
48 |
49 |
50 | Forks:
51 |
52 | - [A fork that implements a debugger for xc.c](https://github.com/descent/write-a-C-interpreter)
53 |
54 |
55 | # Licence
56 |
57 | The original code is licenced with GPL2, so this code will use the same
58 | licence.
59 |
--------------------------------------------------------------------------------
/h.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 |
// Minimal sample program: computes 1 + 2, prints it, and exits with status 2.
//
// argc/argv are accepted but not used. The non-zero return value is kept
// from the original; presumably it lets a caller check that the exit status
// is propagated -- TODO confirm.
//
// The declaration and the assignment are kept as separate statements and
// only `//` comments are used, which are the forms the lexer in xc-tutor.c
// is seen to handle.
int main(int argc, char **argv)
{
    int j;

    j = 1 + 2;
    printf("hello %d\n", j);
    return 2;
}
12 |
--------------------------------------------------------------------------------
/hello.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 |
// Naive recursive Fibonacci with fibonacci(0) == fibonacci(1) == 1.
// Every argument <= 1 (including negative values) yields 1.
int fibonacci(int i) {
    if (i > 1) {
        // recursive case: sum of the two preceding terms
        return fibonacci(i-1) + fibonacci(i-2);
    }
    // base case
    return 1;
}
9 |
// Prints fibonacci(0) through fibonacci(10), one per line, then returns 0.
int main()
{
    int n;

    n = 0;
    while (n < 11) {
        printf("fibonacci(%2d) = %d\n", n, fibonacci(n));
        n = n + 1;
    }
    return 0;
}
20 |
--------------------------------------------------------------------------------
/makefile:
--------------------------------------------------------------------------------
# Build the tutorial interpreter (xc-tutor) from xc-tutor.c.
#
# -m32: xc-tutor.c stores pointers in plain `int` values, so it is built as a
#       32-bit program.
# -g:   keep debug information.
CC = gcc
CFLAGS = -m32 -g

# Link step; $@ is the target and $< the first prerequisite.
xc-tutor: xc-tutor.o
	$(CC) $(CFLAGS) -o $@ $<

# Compile step.
xc-tutor.o: xc-tutor.c
	$(CC) $(CFLAGS) -c $<

# `clean` names an action, not a file: declare it phony so that a stray file
# called `clean` can never make this rule look up to date.
.PHONY: clean
clean:
	rm -rf *.o xc-tutor
9 |
--------------------------------------------------------------------------------
/xc-tutor.c:
--------------------------------------------------------------------------------
1 | // this file is used for tutorial to build the compiler step by step
2 |
3 | #include
4 | #include
5 | #include
6 | #include
7 |
8 |
9 | #include
10 | #include
11 | #include
12 | #include
13 |
// Compile-time switch: when defined, the debugging declarations and helper
// tables guarded by `#ifdef SUPPORT_DEBUG` are built in.
#define SUPPORT_DEBUG

#ifdef SUPPORT_DEBUG
// Capacity of the breakpoint table.
#define MAX_BREAK_POINT 10

// Breakpoint table with room for MAX_BREAK_POINT entries.
// NOTE(review): presumably each entry holds a text-segment address -- confirm.
unsigned int break_points[MAX_BREAK_POINT];

// NOTE(review): presumably tracks the most recently used breakpoint slot -- confirm.
int last_bp;

// Forward declarations of the debugging helpers; their definitions are not
// part of this section of the file.
int meet_break_point(int *pc);
int print_spec_text(int *cur_text);
void print_text();
void print_data();
void print_symbol_table();
void print_source_code();
void debug_usage();
#endif
28 |
// ---- lexer state ----
int token;                // current token
int token_val;            // value of the current token (mainly for numbers)
char *src_begin;          // NOTE(review): presumably the start of the loaded source -- confirm
char *src;                // read position in the source code string
char *old_src;            // NOTE(review): presumably a saved copy of `src` -- confirm
int line;                 // current line number

// ---- memory segments ----
int poolsize;             // default size of the text/data/stack areas
int *begin_text;          // NOTE(review): presumably the bases of the three
int *begin_stack;         //   segments below, kept for the debug dumps
signed char *begin_data;  //   -- confirm
int *text;                // text segment
int *old_text;            // used when dumping the text segment
int *stack;               // stack
signed char *data;        // data segment

// ---- virtual machine registers ----
int *pc;
int *bp;
int *sp;
int ax;
int cycle;

// ---- symbol table ----
int *current_id;          // identifier entry currently being parsed
int *symbols;             // symbol table
int *idmain;              // the `main` function
46 |
47 | // instructions
// Virtual machine opcodes. The values are the implicit enum ordinals
// (LEA == 0 ... EXIT == 37) and the order is significant: inst_str is indexed
// by opcode and inst_has_argu() tests the range LEA..ADJ.
enum {
    // opcodes that inst_has_argu() reports as carrying an operand
    LEA, IMM, JMP, CALL, JZ, JNZ, ENT, ADJ,
    // opcodes without an operand
    LEV, LI, LC, SI, SC, PUSH,
    OR, XOR, AND, EQ, NE, LT, GT, LE, GE, SHL, SHR, ADD, SUB, MUL, DIV, MOD,
    OPEN, READ, CLOS, PRTF, MALC, MSET, MCMP, EXIT
};
51 |
52 | #ifdef SUPPORT_DEBUG
// Printable opcode names, indexed by opcode value (same order as the
// instruction enum).
// NOTE(review): " OR " and " OPEN" carry a leading space unlike every other
// entry; they are reproduced unchanged here -- confirm whether that is intended.
const char *inst_str[] =
{
    "LEA", "IMM", "JMP", "CALL", "JZ ", "JNZ", "ENT", "ADJ",
    "LEV", "LI ", "LC ", "SI ", "SC ", "PUSH",
    " OR ", "XOR", "AND", "EQ ", "NE ", "LT ", "GT ", "LE ", "GE ",
    "SHL", "SHR", "ADD", "SUB", "MUL", "DIV", "MOD",
    " OPEN", "READ", "CLOS", "PRTF", "MALC", "MSET", "MCMP", "EXIT"
};

// Number of entries in inst_str.
#define INST_LEN (sizeof(inst_str)/sizeof(inst_str[0]))
59 |
60 | int inst_has_argu(int inst)
61 | {
62 | if ( (LEA <= inst) && (inst <= ADJ))
63 | return 1;
64 | else
65 | return 0;
66 | }
67 |
68 | const char* inst_2_str(int inst)
69 | {
70 | if ( (0 <= inst) && (inst < INST_LEN))
71 | {
72 | return inst_str[inst];
73 | }
74 | else
75 | return 0;
76 | }
77 | #endif
78 |
79 | // tokens and classes (operators last and in precedence order)
// Token kinds and identifier classes. Numbering starts at 128, which keeps
// these values apart from the plain characters (';', '{', ...) that next()
// returns directly as tokens. Operators come last, in precedence order.
enum {
    // literal and identifier classes
    Num = 128, Fun, Sys, Glo, Loc, Id,
    // keywords
    Char, Else, Enum, If, Int, Return, Sizeof, While,
    // operators, in precedence order
    Assign, Cond, Lor, Lan,
    Or, Xor, And,
    Eq, Ne, Lt, Gt, Le, Ge,
    Shl, Shr,
    Add, Sub,
    Mul, Div, Mod,
    Inc, Dec, Brak
};
85 |
86 | #ifdef SUPPORT_DEBUG
// Printable names for the token/class enum, in enum order: entry k names the
// enum value Num + k.
const char *class_str_array[] =
{
    // literal and identifier classes
    "Num", "Fun", "Sys", "Glo", "Loc", "Id",
    // keywords
    "Char", "Else", "Enum", "If", "Int", "Return", "Sizeof", "While",
    // operators
    "Assign", "Cond", "Lor", "Lan",
    "Or", "Xor", "And",
    "Eq", "Ne", "Lt", "Gt", "Le", "Ge",
    "Shl", "Shr",
    "Add", "Sub",
    "Mul", "Div", "Mod",
    "Inc", "Dec", "Brak"
};
91 |
92 | const char *class_str(int cls)
93 | {
94 | int class_index = cls-Num;
95 |
96 | if (0 <= class_index && class_index <= Brak)
97 | return class_str_array[class_index];
98 | else
99 | return "no such class";
100 | }
101 | #endif
102 |
103 |
104 | // fields of identifier
// Layout of one symbol-table entry: each identifier occupies IdSize
// consecutive ints, and these constants are the offsets of its fields.
enum {
    Token,   // token kind stored for this identifier
    Hash,    // hash of the identifier's name
    Name,    // address of the name inside the source text
    Type,
    Class,
    Value,
    BType,   // NOTE(review): the B* fields presumably hold saved copies of
    BClass,  //   Type/Class/Value -- confirm
    BValue,
    IdSize   // number of ints per entry
};
106 |
107 | // types of variable/function
// Types of a variable/function. The order matters: type_str_array is indexed
// by these values.
enum {
    CHAR,
    INT,
    PTR
};
109 |
110 | #ifdef SUPPORT_DEBUG
111 | const char *type_str_array[] = {"CHAR", "INT", "PTR"};
112 | const char * type_str(int type)
113 | {
114 | if (0 <= type && type <= PTR)
115 | return type_str_array[type];
116 | else
117 | return "no such type";
118 | }
119 | #endif
120 |
// Type of the declaration being parsed; global for convenience.
int basetype;

// Type of an expression.
int expr_type;

// Layout of a function frame on the stack (slot index: contents):
//
//   0: arg 1
//   1: arg 2
//   2: arg 3
//   3: return address
//   4: old bp pointer   <- index_of_bp
//   5: local var 1
//   6: local var 2
//
// index_of_bp is the index of the saved bp pointer within that frame.
int index_of_bp;
134 |
135 | void next() {
136 | char *last_pos;
137 | int hash;
138 |
139 | while (token = *src) {
140 | ++src;
141 |
142 | // parse token here
143 | if (token == '\n') {
144 | ++line;
145 | }
146 | else if (token == '#') {
147 | // skip macro, because we will not support it
148 | while (*src != 0 && *src != '\n') {
149 | src++;
150 | }
151 | }
152 | else if ((token >= 'a' && token <= 'z') || (token >= 'A' && token <= 'Z') || (token == '_')) {
153 |
154 | // parse identifier
155 | last_pos = src - 1;
156 | hash = token;
157 |
158 | while ((*src >= 'a' && *src <= 'z') || (*src >= 'A' && *src <= 'Z') || (*src >= '0' && *src <= '9') || (*src == '_')) {
159 | hash = hash * 147 + *src;
160 | src++;
161 | }
162 |
163 | // look for existing identifier, linear search
164 | current_id = symbols;
165 | while (current_id[Token]) {
166 | if (current_id[Hash] == hash && !memcmp((char *)current_id[Name], last_pos, src - last_pos)) {
167 | //found one, return
168 | token = current_id[Token];
169 | return;
170 | }
171 | current_id = current_id + IdSize;
172 | }
173 |
174 |
175 | // store new ID
176 | current_id[Name] = (int)last_pos;
177 | current_id[Hash] = hash;
178 | token = current_id[Token] = Id;
179 | return;
180 | }
181 | else if (token >= '0' && token <= '9') {
182 | // parse number, three kinds: dec(123) hex(0x123) oct(017)
183 | token_val = token - '0';
184 | if (token_val > 0) {
185 | // dec, starts with [1-9]
186 | while (*src >= '0' && *src <= '9') {
187 | token_val = token_val*10 + *src++ - '0';
188 | }
189 | } else {
190 | // starts with 0
191 | if (*src == 'x' || *src == 'X') {
192 | //hex
193 | token = *++src;
194 | while ((token >= '0' && token <= '9') || (token >= 'a' && token <= 'f') || (token >= 'A' && token <= 'F')) {
195 | token_val = token_val * 16 + (token & 15) + (token >= 'A' ? 9 : 0);
196 | token = *++src;
197 | }
198 | } else {
199 | // oct
200 | while (*src >= '0' && *src <= '7') {
201 | token_val = token_val*8 + *src++ - '0';
202 | }
203 | }
204 | }
205 |
206 | token = Num;
207 | return;
208 | }
209 | else if (token == '"' || token == '\'') {
210 | // parse string literal, currently, the only supported escape
211 | // character is '\n', store the string literal into data.
212 | last_pos = data;
213 | while (*src != 0 && *src != token) {
214 | token_val = *src++;
215 | if (token_val == '\\') {
216 | // escape character
217 | token_val = *src++;
218 | if (token_val == 'n') {
219 | token_val = '\n';
220 | }
221 | }
222 |
223 | if (token == '"') {
224 | *data++ = token_val;
225 | }
226 | }
227 |
228 | src++;
229 | // if it is a single character, return Num token
230 | if (token == '"') {
231 | token_val = (int)last_pos;
232 | } else {
233 | token = Num;
234 | }
235 |
236 | return;
237 | }
238 | else if (token == '/') {
239 | if (*src == '/') {
240 | // skip comments
241 | while (*src != 0 && *src != '\n') {
242 | ++src;
243 | }
244 | } else {
245 | // divide operator
246 | token = Div;
247 | return;
248 | }
249 | }
250 | else if (token == '=') {
251 | // parse '==' and '='
252 | if (*src == '=') {
253 | src ++;
254 | token = Eq;
255 | } else {
256 | token = Assign;
257 | }
258 | return;
259 | }
260 | else if (token == '+') {
261 | // parse '+' and '++'
262 | if (*src == '+') {
263 | src ++;
264 | token = Inc;
265 | } else {
266 | token = Add;
267 | }
268 | return;
269 | }
270 | else if (token == '-') {
271 | // parse '-' and '--'
272 | if (*src == '-') {
273 | src ++;
274 | token = Dec;
275 | } else {
276 | token = Sub;
277 | }
278 | return;
279 | }
280 | else if (token == '!') {
281 | // parse '!='
282 | if (*src == '=') {
283 | src++;
284 | token = Ne;
285 | }
286 | return;
287 | }
288 | else if (token == '<') {
289 | // parse '<=', '<<' or '<'
290 | if (*src == '=') {
291 | src ++;
292 | token = Le;
293 | } else if (*src == '<') {
294 | src ++;
295 | token = Shl;
296 | } else {
297 | token = Lt;
298 | }
299 | return;
300 | }
301 | else if (token == '>') {
302 | // parse '>=', '>>' or '>'
303 | if (*src == '=') {
304 | src ++;
305 | token = Ge;
306 | } else if (*src == '>') {
307 | src ++;
308 | token = Shr;
309 | } else {
310 | token = Gt;
311 | }
312 | return;
313 | }
314 | else if (token == '|') {
315 | // parse '|' or '||'
316 | if (*src == '|') {
317 | src ++;
318 | token = Lor;
319 | } else {
320 | token = Or;
321 | }
322 | return;
323 | }
324 | else if (token == '&') {
325 | // parse '&' and '&&'
326 | if (*src == '&') {
327 | src ++;
328 | token = Lan;
329 | } else {
330 | token = And;
331 | }
332 | return;
333 | }
334 | else if (token == '^') {
335 | token = Xor;
336 | return;
337 | }
338 | else if (token == '%') {
339 | token = Mod;
340 | return;
341 | }
342 | else if (token == '*') {
343 | token = Mul;
344 | return;
345 | }
346 | else if (token == '[') {
347 | token = Brak;
348 | return;
349 | }
350 | else if (token == '?') {
351 | token = Cond;
352 | return;
353 | }
354 | else if (token == '~' || token == ';' || token == '{' || token == '}' || token == '(' || token == ')' || token == ']' || token == ',' || token == ':') {
355 | // directly return the character as token;
356 | return;
357 | }
358 | }
359 | return;
360 | }
361 |
362 | void match(int tk) {
363 | if (token == tk) {
364 | next();
365 | } else {
366 | printf("%d: expected token: %d\n", line, tk);
367 | exit(-1);
368 | }
369 | }
370 |
371 | void expression(int level) {
372 | // expressions have various format.
373 | // but majorly can be divided into two parts: unit and operator
374 | // for example `(char) *a[10] = (int *) func(b > 0 ? 10 : 20);
375 | // `a[10]` is an unit while `*` is an operator.
376 | // `func(...)` in total is an unit.
377 | // so we should first parse those unit and unary operators
378 | // and then the binary ones
379 | //
380 | // also the expression can be in the following types:
381 | //
382 | // 1. unit_unary ::= unit | unit unary_op | unary_op unit
383 | // 2. expr ::= unit_unary (bin_op unit_unary ...)
384 |
385 | // unit_unary()
386 | int *id;
387 | int tmp;
388 | int *addr;
389 | {
390 | if (!token) {
391 | printf("%d: unexpected token EOF of expression\n", line);
392 | exit(-1);
393 | }
394 | if (token == Num) {
395 | match(Num);
396 |
397 | // emit code
398 | *++text = IMM;
399 | *++text = token_val;
400 | expr_type = INT;
401 | }
402 | else if (token == '"') {
403 | // continous string "abc" "abc"
404 |
405 |
406 | // emit code
407 | *++text = IMM;
408 | *++text = token_val;
409 |
410 | match('"');
411 | // store the rest strings
412 | while (token == '"') {
413 | match('"');
414 | }
415 |
416 | // append the end of string character '\0', all the data are default
417 | // to 0, so just move data one position forward.
418 | data = (char *)(((int)data + sizeof(int)) & (-sizeof(int)));
419 | // *data = 0;
420 | expr_type = PTR;
421 | }
422 | else if (token == Sizeof) {
423 | // sizeof is actually an unary operator
424 | // now only `sizeof(int)`, `sizeof(char)` and `sizeof(*...)` are
425 | // supported.
426 | match(Sizeof);
427 | match('(');
428 | expr_type = INT;
429 |
430 | if (token == Int) {
431 | match(Int);
432 | } else if (token == Char) {
433 | match(Char);
434 | expr_type = CHAR;
435 | }
436 |
437 | while (token == Mul) {
438 | match(Mul);
439 | expr_type = expr_type + PTR;
440 | }
441 |
442 | match(')');
443 |
444 | // emit code
445 | *++text = IMM;
446 | *++text = (expr_type == CHAR) ? sizeof(char) : sizeof(int);
447 |
448 | expr_type = INT;
449 | }
450 | else if (token == Id) {
451 | // there are several type when occurs to Id
452 | // but this is unit, so it can only be
453 | // 1. function call
454 | // 2. Enum variable
455 | // 3. global/local variable
456 | match(Id);
457 |
458 | id = current_id;
459 |
460 | if (token == '(') {
461 | // function call
462 | match('(');
463 |
464 | // pass in arguments
465 | tmp = 0; // number of arguments
466 | while (token != ')') {
467 | expression(Assign);
468 | *++text = PUSH;
469 | tmp ++;
470 |
471 | if (token == ',') {
472 | match(',');
473 | }
474 |
475 | }
476 | match(')');
477 |
478 | // emit code
479 | if (id[Class] == Sys) {
480 | // system functions
481 | *++text = id[Value];
482 | }
483 | else if (id[Class] == Fun) {
484 | // function call
485 | *++text = CALL;
486 | *++text = id[Value];
487 | }
488 | else {
489 | printf("%d: bad function call\n", line);
490 | exit(-1);
491 | }
492 |
493 | // clean the stack for arguments
494 | if (tmp > 0) {
495 | *++text = ADJ;
496 | *++text = tmp;
497 | }
498 | expr_type = id[Type];
499 | }
500 | else if (id[Class] == Num) {
501 | // enum variable
502 | *++text = IMM;
503 | *++text = id[Value];
504 | expr_type = INT;
505 | }
506 | else {
507 | // variable
508 | if (id[Class] == Loc) {
509 | *++text = LEA;
510 | *++text = index_of_bp - id[Value];
511 | }
512 | else if (id[Class] == Glo) {
513 | *++text = IMM;
514 | *++text = id[Value];
515 | }
516 | else {
517 | printf("%d: undefined variable\n", line);
518 | exit(-1);
519 | }
520 |
521 | // emit code, default behaviour is to load the value of the
522 | // address which is stored in `ax`
523 | expr_type = id[Type];
524 | *++text = (expr_type == Char) ? LC : LI;
525 | }
526 | }
527 | else if (token == '(') {
528 | // cast or parenthesis
529 | match('(');
530 | if (token == Int || token == Char) {
531 | tmp = (token == Char) ? CHAR : INT; // cast type
532 | match(token);
533 | while (token == Mul) {
534 | match(Mul);
535 | tmp = tmp + PTR;
536 | }
537 |
538 | match(')');
539 |
540 | expression(Inc); // cast has precedence as Inc(++)
541 |
542 | expr_type = tmp;
543 | } else {
544 | // normal parenthesis
545 | expression(Assign);
546 | match(')');
547 | }
548 | }
549 | else if (token == Mul) {
550 | // dereference *
551 | match(Mul);
552 | expression(Inc); // dereference has the same precedence as Inc(++)
553 |
554 | if (expr_type >= PTR) {
555 | expr_type = expr_type - PTR;
556 | } else {
557 | printf("%d: bad dereference\n", line);
558 | exit(-1);
559 | }
560 |
561 | *++text = (expr_type == CHAR) ? LC : LI;
562 | }
563 | else if (token == And) {
564 | // get the address of
565 | match(And);
566 | expression(Inc); // get the address of
567 | if (*text == LC || *text == LI) {
568 | text --;
569 | } else {
570 | printf("%d: bad address of\n", line);
571 | exit(-1);
572 | }
573 |
574 | expr_type = expr_type + PTR;
575 | }
576 | else if (token == '!') {
577 | // not
578 | match('!');
579 | expression(Inc);
580 |
581 | // emit code, use == 0
582 | *++text = PUSH;
583 | *++text = IMM;
584 | *++text = 0;
585 | *++text = EQ;
586 |
587 | expr_type = INT;
588 | }
589 | else if (token == '~') {
590 | // bitwise not
591 | match('~');
592 | expression(Inc);
593 |
594 | // emit code, use XOR -1
595 | *++text = PUSH;
596 | *++text = IMM;
597 | *++text = -1;
598 | *++text = XOR;
599 |
600 | expr_type = INT;
601 | }
602 | else if (token == Add) {
603 | // +var, do nothing
604 | match(Add);
605 | expression(Inc);
606 |
607 | expr_type = INT;
608 | }
609 | else if (token == Sub) {
610 | // -var
611 | match(Sub);
612 |
613 | if (token == Num) {
614 | *++text = IMM;
615 | *++text = -token_val;
616 | match(Num);
617 | } else {
618 |
619 | *++text = IMM;
620 | *++text = -1;
621 | *++text = PUSH;
622 | expression(Inc);
623 | *++text = MUL;
624 | }
625 |
626 | expr_type = INT;
627 | }
628 | else if (token == Inc || token == Dec) {
629 | tmp = token;
630 | match(token);
631 | expression(Inc);
632 | if (*text == LC) {
633 | *text = PUSH; // to duplicate the address
634 | *++text = LC;
635 | } else if (*text == LI) {
636 | *text = PUSH;
637 | *++text = LI;
638 | } else {
639 | printf("%d: bad lvalue of pre-increment\n", line);
640 | exit(-1);
641 | }
642 | *++text = PUSH;
643 | *++text = IMM;
644 | *++text = (expr_type > PTR) ? sizeof(int) : sizeof(char);
645 | *++text = (tmp == Inc) ? ADD : SUB;
646 | *++text = (expr_type == CHAR) ? SC : SI;
647 | }
648 | else {
649 | printf("%d: bad expression\n", line);
650 | exit(-1);
651 | }
652 | }
653 |
654 | // binary operator and postfix operators.
655 | {
656 | while (token >= level) {
657 | // handle according to current operator's precedence
658 | tmp = expr_type;
659 | if (token == Assign) {
660 | // var = expr;
661 | match(Assign);
662 | if (*text == LC || *text == LI) {
663 | *text = PUSH; // save the lvalue's pointer
664 | } else {
665 | printf("%d: bad lvalue in assignment\n", line);
666 | exit(-1);
667 | }
668 | expression(Assign);
669 |
670 | expr_type = tmp;
671 | *++text = (expr_type == CHAR) ? SC : SI;
672 | }
673 | else if (token == Cond) {
674 | // expr ? a : b;
675 | match(Cond);
676 | *++text = JZ;
677 | addr = ++text;
678 | expression(Assign);
679 | if (token == ':') {
680 | match(':');
681 | } else {
682 | printf("%d: missing colon in conditional\n", line);
683 | exit(-1);
684 | }
685 | *addr = (int)(text + 3);
686 | *++text = JMP;
687 | addr = ++text;
688 | expression(Cond);
689 | *addr = (int)(text + 1);
690 | }
691 | else if (token == Lor) {
692 | // logic or
693 | match(Lor);
694 | *++text = JNZ;
695 | addr = ++text;
696 | expression(Lan);
697 | *addr = (int)(text + 1);
698 | expr_type = INT;
699 | }
700 | else if (token == Lan) {
701 | // logic and
702 | match(Lan);
703 | *++text = JZ;
704 | addr = ++text;
705 | expression(Or);
706 | *addr = (int)(text + 1);
707 | expr_type = INT;
708 | }
709 | else if (token == Or) {
710 | // bitwise or
711 | match(Or);
712 | *++text = PUSH;
713 | expression(Xor);
714 | *++text = OR;
715 | expr_type = INT;
716 | }
717 | else if (token == Xor) {
718 | // bitwise xor
719 | match(Xor);
720 | *++text = PUSH;
721 | expression(And);
722 | *++text = XOR;
723 | expr_type = INT;
724 | }
725 | else if (token == And) {
726 | // bitwise and
727 | match(And);
728 | *++text = PUSH;
729 | expression(Eq);
730 | *++text = AND;
731 | expr_type = INT;
732 | }
733 | else if (token == Eq) {
734 | // equal ==
735 | match(Eq);
736 | *++text = PUSH;
737 | expression(Ne);
738 | *++text = EQ;
739 | expr_type = INT;
740 | }
741 | else if (token == Ne) {
742 | // not equal !=
743 | match(Ne);
744 | *++text = PUSH;
745 | expression(Lt);
746 | *++text = NE;
747 | expr_type = INT;
748 | }
749 | else if (token == Lt) {
750 | // less than
751 | match(Lt);
752 | *++text = PUSH;
753 | expression(Shl);
754 | *++text = LT;
755 | expr_type = INT;
756 | }
757 | else if (token == Gt) {
758 | // greater than
759 | match(Gt);
760 | *++text = PUSH;
761 | expression(Shl);
762 | *++text = GT;
763 | expr_type = INT;
764 | }
765 | else if (token == Le) {
766 | // less than or equal to
767 | match(Le);
768 | *++text = PUSH;
769 | expression(Shl);
770 | *++text = LE;
771 | expr_type = INT;
772 | }
773 | else if (token == Ge) {
774 | // greater than or equal to
775 | match(Ge);
776 | *++text = PUSH;
777 | expression(Shl);
778 | *++text = GE;
779 | expr_type = INT;
780 | }
781 | else if (token == Shl) {
782 | // shift left
783 | match(Shl);
784 | *++text = PUSH;
785 | expression(Add);
786 | *++text = SHL;
787 | expr_type = INT;
788 | }
789 | else if (token == Shr) {
790 | // shift right
791 | match(Shr);
792 | *++text = PUSH;
793 | expression(Add);
794 | *++text = SHR;
795 | expr_type = INT;
796 | }
797 | else if (token == Add) {
798 | // add
799 | match(Add);
800 | *++text = PUSH;
801 | expression(Mul);
802 |
803 | expr_type = tmp;
804 | if (expr_type > PTR) {
805 | // pointer type, and not `char *`
806 | *++text = PUSH;
807 | *++text = IMM;
808 | *++text = sizeof(int);
809 | *++text = MUL;
810 | }
811 | *++text = ADD;
812 | }
813 | else if (token == Sub) {
814 | // sub
815 | match(Sub);
816 | *++text = PUSH;
817 | expression(Mul);
818 | if (tmp > PTR && tmp == expr_type) {
819 | // pointer subtraction
820 | *++text = SUB;
821 | *++text = PUSH;
822 | *++text = IMM;
823 | *++text = sizeof(int);
824 | *++text = DIV;
825 | expr_type = INT;
826 | } else if (tmp > PTR) {
827 | // pointer movement
828 | *++text = PUSH;
829 | *++text = IMM;
830 | *++text = sizeof(int);
831 | *++text = MUL;
832 | *++text = SUB;
833 | expr_type = tmp;
834 | } else {
835 | // numeral subtraction
836 | *++text = SUB;
837 | expr_type = tmp;
838 | }
839 | }
840 | else if (token == Mul) {
841 | // multiply
842 | match(Mul);
843 | *++text = PUSH;
844 | expression(Inc);
845 | *++text = MUL;
846 | expr_type = tmp;
847 | }
848 | else if (token == Div) {
849 | // divide
850 | match(Div);
851 | *++text = PUSH;
852 | expression(Inc);
853 | *++text = DIV;
854 | expr_type = tmp;
855 | }
856 | else if (token == Mod) {
857 | // Modulo
858 | match(Mod);
859 | *++text = PUSH;
860 | expression(Inc);
861 | *++text = MOD;
862 | expr_type = tmp;
863 | }
864 | else if (token == Inc || token == Dec) {
865 | // postfix inc(++) and dec(--)
866 | // we will increase the value to the variable and decrease it
867 | // on `ax` to get its original value.
868 | if (*text == LI) {
869 | *text = PUSH;
870 | *++text = LI;
871 | }
872 | else if (*text == LC) {
873 | *text = PUSH;
874 | *++text = LC;
875 | }
876 | else {
877 | printf("%d: bad value in increment\n", line);
878 | exit(-1);
879 | }
880 |
881 | *++text = PUSH;
882 | *++text = IMM;
883 | *++text = (expr_type > PTR) ? sizeof(int) : sizeof(char);
884 | *++text = (token == Inc) ? ADD : SUB;
885 | *++text = (expr_type == CHAR) ? SC : SI;
886 | *++text = PUSH;
887 | *++text = IMM;
888 | *++text = (expr_type > PTR) ? sizeof(int) : sizeof(char);
889 | *++text = (token == Inc) ? SUB : ADD;
890 | match(token);
891 | }
892 | else if (token == Brak) {
893 | // array access var[xx]
894 | match(Brak);
895 | *++text = PUSH;
896 | expression(Assign);
897 | match(']');
898 |
899 | if (tmp > PTR) {
900 | // pointer, `not char *`
901 | *++text = PUSH;
902 | *++text = IMM;
903 | *++text = sizeof(int);
904 | *++text = MUL;
905 | }
906 | else if (tmp < PTR) {
907 | printf("%d: pointer type expected\n", line);
908 | exit(-1);
909 | }
910 | expr_type = tmp - PTR;
911 | *++text = ADD;
912 | *++text = (expr_type == CHAR) ? LC : LI;
913 | }
914 | else {
915 | printf("%d: compiler error, token = %d\n", line, token);
916 | exit(-1);
917 | }
918 | }
919 | }
920 | }
921 |
922 | void statement() {
923 | // there are 6 kinds of statements here:
924 | // 1. if (...) [else ]
925 | // 2. while (...)
926 | // 3. { }
927 | // 4. return xxx;
928 | // 5. ;
929 | // 6. expression; (expression end with semicolon)
930 |
931 | int *a, *b; // bess for branch control
932 |
933 | if (token == If) {
934 | // if (...) [else ]
935 | //
936 | // if (...)
937 | // JZ a
938 | //
939 | // else: JMP b
940 | // a: a:
941 | //
942 | // b: b:
943 | //
944 | //
945 | match(If);
946 | match('(');
947 | expression(Assign); // parse condition
948 | match(')');
949 |
950 | // emit code for if
951 | *++text = JZ;
952 | b = ++text;
953 |
954 | statement(); // parse statement
955 | if (token == Else) { // parse else
956 | match(Else);
957 |
958 | // emit code for JMP B
959 | *b = (int)(text + 3);
960 | *++text = JMP;
961 | b = ++text;
962 |
963 | statement();
964 | }
965 |
966 | *b = (int)(text + 1);
967 | }
968 | else if (token == While) {
969 | //
970 | // a: a:
971 | // while ()
972 | // JZ b
973 | //
974 | // JMP a
975 | // b: b:
976 | match(While);
977 |
978 | a = text + 1;
979 |
980 | match('(');
981 | expression(Assign);
982 | match(')');
983 |
984 | *++text = JZ;
985 | b = ++text;
986 |
987 | statement();
988 |
989 | *++text = JMP;
990 | *++text = (int)a;
991 | *b = (int)(text + 1);
992 | }
993 | else if (token == '{') {
994 | // { ... }
995 | match('{');
996 |
997 | while (token != '}') {
998 | statement();
999 | }
1000 |
1001 | match('}');
1002 | }
1003 | else if (token == Return) {
1004 | // return [expression];
1005 | match(Return);
1006 |
1007 | if (token != ';') {
1008 | expression(Assign);
1009 | }
1010 |
1011 | match(';');
1012 |
1013 | // emit code for return
1014 | *++text = LEV;
1015 | }
1016 | else if (token == ';') {
1017 | // empty statement
1018 | match(';');
1019 | }
1020 | else {
1021 | // a = b; or function_call();
1022 | expression(Assign);
1023 | match(';');
1024 | }
1025 | }
1026 |
1027 | void function_parameter() {
1028 | int type;
1029 | int params;
1030 | params = 0;
1031 | while (token != ')') {
1032 | // int name, ...
1033 | type = INT;
1034 | if (token == Int) {
1035 | match(Int);
1036 | } else if (token == Char) {
1037 | type = CHAR;
1038 | match(Char);
1039 | }
1040 |
1041 | // pointer type
1042 | while (token == Mul) {
1043 | match(Mul);
1044 | type = type + PTR;
1045 | }
1046 |
1047 | // parameter name
1048 | if (token != Id) {
1049 | printf("%d: bad parameter declaration\n", line);
1050 | exit(-1);
1051 | }
1052 | if (current_id[Class] == Loc) {
1053 | printf("%d: duplicate parameter declaration\n", line);
1054 | exit(-1);
1055 | }
1056 |
1057 | match(Id);
1058 | // store the local variable
1059 | current_id[BClass] = current_id[Class]; current_id[Class] = Loc;
1060 | current_id[BType] = current_id[Type]; current_id[Type] = type;
1061 | current_id[BValue] = current_id[Value]; current_id[Value] = params++; // index of current parameter
1062 |
1063 | if (token == ',') {
1064 | match(',');
1065 | }
1066 | }
1067 | index_of_bp = params+1;
1068 | }
1069 |
1070 | void function_body() {
1071 | // type func_name (...) {...}
1072 | // -->| |<--
1073 |
1074 | // ... {
1075 | // 1. local declarations
1076 | // 2. statements
1077 | // }
1078 |
1079 | int pos_local; // position of local variables on the stack.
1080 | int type;
1081 | pos_local = index_of_bp;
1082 |
1083 | while (token == Int || token == Char) {
1084 | // local variable declaration, just like global ones.
1085 | basetype = (token == Int) ? INT : CHAR;
1086 | match(token);
1087 |
1088 | while (token != ';') {
1089 | type = basetype;
1090 | while (token == Mul) {
1091 | match(Mul);
1092 | type = type + PTR;
1093 | }
1094 |
1095 | if (token != Id) {
1096 | // invalid declaration
1097 | printf("%d: bad local declaration\n", line);
1098 | exit(-1);
1099 | }
1100 | if (current_id[Class] == Loc) {
1101 | // identifier exists
1102 | printf("%d: duplicate local declaration\n", line);
1103 | exit(-1);
1104 | }
1105 | match(Id);
1106 |
1107 | // store the local variable
1108 | current_id[BClass] = current_id[Class]; current_id[Class] = Loc;
1109 | current_id[BType] = current_id[Type]; current_id[Type] = type;
1110 | current_id[BValue] = current_id[Value]; current_id[Value] = ++pos_local; // index of current parameter
1111 |
1112 | if (token == ',') {
1113 | match(',');
1114 | }
1115 | }
1116 | match(';');
1117 | }
1118 |
1119 | // save the stack size for local variables
1120 | *++text = ENT;
1121 | *++text = pos_local - index_of_bp;
1122 |
1123 | // statements
1124 | while (token != '}') {
1125 | statement();
1126 | }
1127 |
1128 | // emit code for leaving the sub function
1129 | *++text = LEV;
1130 | }
1131 |
1132 | void function_declaration() {
1133 | // type func_name (...) {...}
1134 | // | this part
1135 |
1136 | match('(');
1137 | function_parameter();
1138 | match(')');
1139 | match('{');
1140 | function_body();
1141 | //match('}');
1142 |
1143 | // unwind local variable declarations for all local variables.
1144 | current_id = symbols;
1145 | while (current_id[Token]) {
1146 | if (current_id[Class] == Loc) {
1147 | current_id[Class] = current_id[BClass];
1148 | current_id[Type] = current_id[BType];
1149 | current_id[Value] = current_id[BValue];
1150 | }
1151 | current_id = current_id + IdSize;
1152 | }
1153 | }
1154 |
1155 | void enum_declaration() {
1156 | // parse enum [id] { a = 1, b = 3, ...}
1157 | int i;
1158 | i = 0;
1159 | while (token != '}') {
1160 | if (token != Id) {
1161 | printf("%d: bad enum identifier %d\n", line, token);
1162 | exit(-1);
1163 | }
1164 | next();
1165 | if (token == Assign) {
1166 | // like {a=10}
1167 | next();
1168 | if (token != Num) {
1169 | printf("%d: bad enum initializer\n", line);
1170 | exit(-1);
1171 | }
1172 | i = token_val;
1173 | next();
1174 | }
1175 |
1176 | current_id[Class] = Num;
1177 | current_id[Type] = INT;
1178 | current_id[Value] = i++;
1179 |
1180 | if (token == ',') {
1181 | next();
1182 | }
1183 | }
1184 | }
1185 |
1186 | void global_declaration() {
1187 | // global_declaration ::= enum_decl | variable_decl | function_decl
1188 | //
1189 | // enum_decl ::= 'enum' [id] '{' id ['=' 'num'] {',' id ['=' 'num'} '}'
1190 | //
1191 | // variable_decl ::= type {'*'} id { ',' {'*'} id } ';'
1192 | //
1193 | // function_decl ::= type {'*'} id '(' parameter_decl ')' '{' body_decl '}'
1194 |
1195 |
1196 | int type; // tmp, actual type for variable
1197 | int i; // tmp
1198 |
1199 | basetype = INT;
1200 |
1201 | // parse enum, this should be treated alone.
1202 | if (token == Enum) {
1203 | // enum [id] { a = 10, b = 20, ... }
1204 | match(Enum);
1205 | if (token != '{') {
1206 | match(Id); // skip the [id] part
1207 | }
1208 | if (token == '{') {
1209 | // parse the assign part
1210 | match('{');
1211 | enum_declaration();
1212 | match('}');
1213 | }
1214 |
1215 | match(';');
1216 | return;
1217 | }
1218 |
1219 | // parse type information
1220 | if (token == Int) {
1221 | match(Int);
1222 | }
1223 | else if (token == Char) {
1224 | match(Char);
1225 | basetype = CHAR;
1226 | }
1227 |
1228 | // parse the comma seperated variable declaration.
1229 | while (token != ';' && token != '}') {
1230 | type = basetype;
1231 | // parse pointer type, note that there may exist `int ****x;`
1232 | while (token == Mul) {
1233 | match(Mul);
1234 | type = type + PTR;
1235 | }
1236 |
1237 | if (token != Id) {
1238 | // invalid declaration
1239 | printf("%d: bad global declaration\n", line);
1240 | exit(-1);
1241 | }
1242 | if (current_id[Class]) {
1243 | // identifier exists
1244 | printf("%d: duplicate global declaration\n", line);
1245 | exit(-1);
1246 | }
1247 | match(Id);
1248 | current_id[Type] = type;
1249 |
1250 | if (token == '(') {
1251 | current_id[Class] = Fun;
1252 | current_id[Value] = (int)(text + 1); // the memory address of function
1253 | function_declaration();
1254 | } else {
1255 | // variable declaration
1256 | current_id[Class] = Glo; // global variable
1257 | current_id[Value] = (int)data; // assign memory address
1258 | data = data + sizeof(int);
1259 | // *data = 0;
1260 | }
1261 |
1262 | if (token == ',') {
1263 | match(',');
1264 | }
1265 | }
1266 | next();
1267 | }
1268 |
1269 | void program() {
1270 | // get next token
1271 | next();
1272 | while (token > 0) {
1273 | global_declaration();
1274 | }
1275 | }
1276 |
1277 | void show_regs()
1278 | {
1279 | printf("ax: %#x(%d)\n", ax, ax);
1280 | printf("sp: %p\n", sp);
1281 | printf("bp: %p\n", bp);
1282 | printf("pc: %p\n", pc);
1283 | }
1284 |
1285 | int is_data(unsigned int addr)
1286 | {
1287 | if (((unsigned int)begin_data <= addr) && (addr < (unsigned int)data))
1288 | return 1;
1289 | else
1290 | return 0;
1291 | }
1292 |
1293 | int is_text(unsigned int addr)
1294 | {
1295 | if (((unsigned int)begin_text <= addr) && (addr < (unsigned int)text))
1296 | return 1;
1297 | else
1298 | return 0;
1299 | }
1300 |
1301 | int is_stack(unsigned int addr)
1302 | {
1303 | if (((unsigned int)begin_stack <= addr) && (addr < (unsigned int)begin_stack + poolsize))
1304 | return 1;
1305 | else
1306 | return 0;
1307 | }
1308 |
1309 | int run_debug_func(char *cmd_line)
1310 | {
1311 | switch (cmd_line[0])
1312 | {
1313 | case 'q':
1314 | {
1315 | exit(1);
1316 | }
1317 | case 'd':
1318 | {
1319 | print_data();
1320 | break;
1321 | }
1322 | case 't':
1323 | {
1324 | print_symbol_table();
1325 | break;
1326 | }
1327 | case 'h':
1328 | case '?':
1329 | {
1330 | debug_usage();
1331 | break;
1332 | }
1333 | case 'l':
1334 | {
1335 | print_source_code();
1336 | break;
1337 | }
1338 | case 'e':
1339 | {
1340 | print_text();
1341 | break;
1342 | }
1343 | case 'b': // break pointer, ex: b 0xf756301c
1344 | {
1345 | unsigned int addr;
1346 | char cmd;
1347 |
1348 | sscanf(cmd_line, "%c %x\n", &cmd, &addr);
1349 | if (last_bp < MAX_BREAK_POINT)
1350 | {
1351 | break_points[last_bp] = addr;
1352 | printf("set break_points[%d]: %x\n", last_bp, addr);
1353 | ++last_bp;
1354 | }
1355 | else
1356 | {
1357 | printf("exceed %d break points\n", MAX_BREAK_POINT);
1358 | }
1359 | break;
1360 | }
1361 | case 'x': // show data segment content, ex: x 0xf756301c
1362 | {
1363 | char cmd_str[5];
1364 |
1365 | switch (cmd_line[1])
1366 | {
1367 | case 's':
1368 | {
1369 | break;
1370 | }
1371 | case 'c':
1372 | {
1373 | break;
1374 | }
1375 | case 'i':
1376 | {
1377 | break;
1378 | }
1379 | case 'x':
1380 | {
1381 | break;
1382 | }
1383 | default:
1384 | {
1385 | printf("xxx\n");
1386 | return -1;
1387 | }
1388 |
1389 | }
1390 | unsigned int addr;
1391 |
1392 | sscanf(cmd_line, "%s %x\n", cmd_str, &addr);
1393 | printf("cmd: %s, addr: %#x\n", cmd_str, addr);
1394 | #if 1
1395 | is_text(addr);
1396 | is_stack(addr);
1397 | //if (((unsigned int)begin_data <= addr) && (addr < (unsigned int)data))
1398 | if (is_data(addr) )
1399 | {
1400 | if (cmd_line[1] == 's')
1401 | printf("data seg: %s\n", (char *)addr);
1402 | if (cmd_line[1] == 'i')
1403 | printf("data seg: %#d\n", *(int *)addr);
1404 | if (cmd_line[1] == 'x')
1405 | printf("data seg: %#x\n", *(int *)addr);
1406 | }
1407 | else if (is_text(addr) )
1408 | printf("text seg: %#x(%d)\n", *((int *)addr), *((int *)addr));
1409 | else if (is_stack(addr) )
1410 | printf("stack area: %#x(%d)\n", *((int *)addr), *((int *)addr));
1411 | else
1412 | {
1413 | printf("%x is not in \ntext segment (%p ~ %p)\ndata segment (%p ~ %p)\nstack range (%p ~ %p)\n", addr, begin_text, text, begin_data, data, begin_stack, begin_stack + poolsize);
1414 |
1415 | }
1416 |
1417 | //printf("%#x(%d)\n", *((int *)addr));
1418 | #endif
1419 | break;
1420 | }
1421 | case 'r': // show registers
1422 | {
1423 | show_regs();
1424 | break;
1425 | }
1426 | default:
1427 | {
1428 | break;
1429 | }
1430 |
1431 | } // end switch (input_str[0])
1432 | return 0;
1433 | }
1434 |
1435 | void debug_usage()
1436 | {
1437 | printf("command\n");
1438 | printf("s: step\n");
1439 | printf("q: quit\n");
1440 | printf("c: continue\n");
1441 | printf("r: print all register content\n");
1442 | printf("d: print data\n");
1443 | printf("e: print text\n");
1444 | printf("l: print source code\n");
1445 | printf("t: print symbol table\n");
1446 | printf("xs address: print text/data segment stack area content as string\n");
1447 | printf("xc address: not yet complete. print text/data segment stack area content as char\n");
1448 | printf("xi address: print text/data segment stack area content as int\n");
1449 | printf("xx address: print text/data segment stack area content as hex\n");
1450 | printf("b address: set breakpoint, max breakpoint is %d\n", MAX_BREAK_POINT);
1451 | }
1452 |
1453 | #define INPUT_SIZE 20
1454 | int eval() {
1455 | int line=0, ch;
1456 | unsigned int break_point = 0;
1457 | int continue_run = 0;
1458 | int meet_bp = 0;
1459 | char input_str[INPUT_SIZE+1];
1460 | int op, *tmp;
1461 | while (1)
1462 | {
1463 | #ifdef SUPPORT_DEBUG
1464 | print_spec_text(pc);
1465 | //if (break_point == (unsigned int)pc || continue_run == 0)
1466 |
1467 | debug_input:
1468 | while(continue_run == 0)
1469 | {
1470 | printf("%d ## debug> ", line++);
1471 | fgets(input_str, INPUT_SIZE, stdin);
1472 |
1473 | if (input_str[0] == 's' || input_str[0] == '\n')
1474 | {
1475 | continue_run = 0;
1476 | break;
1477 | }
1478 |
1479 | if (input_str[0] == 'c')
1480 | {
1481 | continue_run = 1;
1482 | break;
1483 | }
1484 | else
1485 | {
1486 | run_debug_func(input_str);
1487 | continue_run = 0;
1488 | }
1489 | }
1490 |
1491 | if (meet_bp == 0 && meet_break_point(pc))
1492 | {
1493 | meet_bp = 1;
1494 | // printf("meet break pointer: %x\n", (unsigned int)pc);
1495 | continue_run = 0;
1496 | goto debug_input;
1497 | }
1498 | else
1499 | {
1500 | meet_bp = 0;
1501 | }
1502 |
1503 | #endif
1504 | op = *pc++; // get next operation code
1505 |
1506 | if (op == IMM) {ax = *pc++;} // load immediate value to ax
1507 | else if (op == LC) {ax = *(char *)ax;} // load character to ax, address in ax
1508 | else if (op == LI) {ax = *(int *)ax;} // load integer to ax, address in ax
1509 | else if (op == SC) {ax = *(char *)*sp++ = ax;} // save character to address, value in ax, address on stack
1510 | else if (op == SI) {*(int *)*sp++ = ax;} // save integer to address, value in ax, address on stack
1511 | else if (op == PUSH) {*--sp = ax;} // push the value of ax onto the stack
1512 | else if (op == JMP) {pc = (int *)*pc;} // jump to the address
1513 | else if (op == JZ) {pc = ax ? pc + 1 : (int *)*pc;} // jump if ax is zero
1514 | else if (op == JNZ) {pc = ax ? (int *)*pc : pc + 1;} // jump if ax is zero
1515 | else if (op == CALL) {*--sp = (int)(pc+1); pc = (int *)*pc;} // call subroutine
1516 | //else if (op == RET) {pc = (int *)*sp++;} // return from subroutine;
1517 | else if (op == ENT) {*--sp = (int)bp; bp = sp; sp = sp - *pc++;} // make new stack frame
1518 | else if (op == ADJ) {sp = sp + *pc++;} // add esp,
1519 | else if (op == LEV) {sp = bp; bp = (int *)*sp++; pc = (int *)*sp++;} // restore call frame and PC
1520 | else if (op == ENT) {*--sp = (int)bp; bp = sp; sp = sp - *pc++;} // make new stack frame
1521 | else if (op == ADJ) {sp = sp + *pc++;} // add esp,
1522 | else if (op == LEV) {sp = bp; bp = (int *)*sp++; pc = (int *)*sp++;} // restore call frame and PC
1523 | else if (op == LEA) {ax = (int)(bp + *pc++);} // load address for arguments.
1524 |
1525 | else if (op == OR) ax = *sp++ | ax;
1526 | else if (op == XOR) ax = *sp++ ^ ax;
1527 | else if (op == AND) ax = *sp++ & ax;
1528 | else if (op == EQ) ax = *sp++ == ax;
1529 | else if (op == NE) ax = *sp++ != ax;
1530 | else if (op == LT) ax = *sp++ < ax;
1531 | else if (op == LE) ax = *sp++ <= ax;
1532 | else if (op == GT) ax = *sp++ > ax;
1533 | else if (op == GE) ax = *sp++ >= ax;
1534 | else if (op == SHL) ax = *sp++ << ax;
1535 | else if (op == SHR) ax = *sp++ >> ax;
1536 | else if (op == ADD) ax = *sp++ + ax;
1537 | else if (op == SUB) ax = *sp++ - ax;
1538 | else if (op == MUL) ax = *sp++ * ax;
1539 | else if (op == DIV) ax = *sp++ / ax;
1540 | else if (op == MOD) ax = *sp++ % ax;
1541 |
1542 |
1543 | else if (op == EXIT) { printf("exit(%d)", *sp); return *sp;}
1544 | else if (op == OPEN) { ax = open((char *)sp[1], sp[0]); }
1545 | else if (op == CLOS) { ax = close(*sp);}
1546 | else if (op == READ) { ax = read(sp[2], (char *)sp[1], *sp); }
1547 | else if (op == PRTF) { tmp = sp + pc[1]; ax = printf((char *)tmp[-1], tmp[-2], tmp[-3], tmp[-4], tmp[-5], tmp[-6]); }
1548 | else if (op == MALC) { ax = (int)malloc(*sp);}
1549 | else if (op == MSET) { ax = (int)memset((char *)sp[2], sp[1], *sp);}
1550 | else if (op == MCMP) { ax = memcmp((char *)sp[2], (char *)sp[1], *sp);}
1551 | else {
1552 | printf("unknown instruction:%d\n", op);
1553 | return -1;
1554 | }
1555 | }
1556 | return 0;
1557 | }
1558 |
1559 | #ifdef SUPPORT_DEBUG
1560 | int meet_break_point(int *pc)
1561 | {
1562 | int i;
1563 | unsigned int cur_addr = (unsigned int)pc;
1564 | //printf("cur_addr: %x\n", cur_addr);
1565 |
1566 | for (i = 0 ; i < last_bp ; ++i)
1567 | {
1568 | //printf("cur break_points[%d]: %x\n", i, break_points[i]);
1569 | if (cur_addr == break_points[i])
1570 | {
1571 | printf("break_points[%d]: %x\n", i, break_points[i]);
1572 | return 1;
1573 | }
1574 | }
1575 | return 0;
1576 | }
1577 |
1578 | #ifdef SUPPORT_DEBUG
1579 | void print_symbol_table()
1580 | {
1581 | int *cur_id;
1582 | cur_id = symbols;
1583 |
1584 | printf("symbol table:\n");
1585 |
1586 | while(cur_id[Token])
1587 | {
1588 | printf("cur_id[Name]: %s\n", cur_id[Name]);
1589 | printf("cur_id[Hash]: %#x\n", cur_id[Hash]);
1590 | printf("cur_id[Type]: %s (%d)\n", type_str(cur_id[Type]), cur_id[Type]);
1591 | printf("cur_id[Class]: %s (%d)\n", class_str(cur_id[Class]), cur_id[Class]);
1592 | printf("cur_id[Value]: %#x (%d)\n", cur_id[Value], cur_id[Value]);
1593 | cur_id = cur_id + IdSize;
1594 | }
1595 | }
1596 | #endif
1597 |
// Print the instruction stored at `cur_text`: its address, its mnemonic (or
// the raw word in hex when inst_2_str has no name for it) and, when
// inst_has_argu reports one, the operand word that follows.
//
// return 1: has argument
// return 0: has no argument
int print_spec_text(int *cur_text)
{
    int has_argu=0;

    const char* inst_str = inst_2_str(*cur_text);
    has_argu = inst_has_argu(*cur_text);

    // %p expects a `void *` and %x an unsigned value, hence the casts.
    if (inst_str)
        printf("addr %p ## %s", (void *)cur_text, inst_str);
    else
        printf("addr %p ## %x", (void *)cur_text, (unsigned int)*cur_text);

    if (has_argu)
    {
        ++cur_text;
        printf(" %#x(%d)", (unsigned int)*cur_text, *cur_text);
    }
    printf("\n");
    return has_argu;
}
1620 |
1621 | void print_source_code()
1622 | {
1623 | printf("%s\n", src_begin);
1624 | }
1625 |
1626 | void print_text()
1627 | {
1628 | printf("text segment:\n");
1629 |
1630 | int *cur_text = begin_text+1;
1631 | int i;
1632 | int has_argu=0;
1633 | while (cur_text != text)
1634 | {
1635 | has_argu = print_spec_text(cur_text);
1636 | if (has_argu)
1637 | cur_text += 2;
1638 | else
1639 | ++cur_text;
1640 | #if 0
1641 | const char* inst_str = inst_2_str(*cur_text);
1642 | has_argu = inst_has_argu(*cur_text);
1643 | if (*cur_text == -1)
1644 | {
1645 | ++cur_text;
1646 | continue;
1647 | }
1648 | if (inst_str)
1649 | printf("addr %p ## %s\n", cur_text, inst_str);
1650 | else
1651 | printf("addr %p ## %x\n", cur_text, *cur_text);
1652 |
1653 | if (has_argu)
1654 | {
1655 | ++cur_text;
1656 | printf("addr %p ## %#x (%d)\n", cur_text, *cur_text, *cur_text);
1657 | has_argu = 0;
1658 | }
1659 | #endif
1660 | }
1661 |
1662 | }
1663 |
1664 | void print_data()
1665 | {
1666 | signed char *cur_data = begin_data;
1667 | int print_addr = 1;
1668 |
1669 | printf("data segment:\n");
1670 |
1671 | while(cur_data != data)
1672 | {
1673 | if (*cur_data != 0)
1674 | {
1675 | if (print_addr)
1676 | {
1677 | printf("%p: ", cur_data);
1678 | print_addr = 0;
1679 | }
1680 | printf("%c", *cur_data);
1681 | }
1682 | else
1683 | {
1684 | printf("\n");
1685 | print_addr = 1;
1686 | }
1687 | ++cur_data;
1688 | }
1689 | }
1690 | #endif
1691 |
1692 | int main(int argc, char **argv)
1693 | {
1694 |
1695 | int i, fd;
1696 | int *tmp;
1697 |
1698 | argc--;
1699 | argv++;
1700 |
1701 | poolsize = 256 * 1024; // arbitrary size
1702 | line = 1;
1703 |
1704 | if ((fd = open(*argv, 0)) < 0) {
1705 | printf("could not open(%s)\n", *argv);
1706 | return -1;
1707 | }
1708 |
1709 | // allocate memory for virtual machine
1710 | if (!(text = old_text = malloc(poolsize))) {
1711 | printf("could not malloc(%d) for text area\n", poolsize);
1712 | return -1;
1713 | }
1714 | begin_text = text;
1715 |
1716 | if (!(data = malloc(poolsize))) {
1717 | printf("could not malloc(%d) for data area\n", poolsize);
1718 | return -1;
1719 | }
1720 |
1721 | begin_data = data;
1722 |
1723 | if (!(stack = malloc(poolsize))) {
1724 | printf("could not malloc(%d) for stack area\n", poolsize);
1725 | return -1;
1726 | }
1727 |
1728 | begin_stack = stack;
1729 |
1730 | if (!(symbols = malloc(poolsize))) {
1731 | printf("could not malloc(%d) for symbol table\n", poolsize);
1732 | return -1;
1733 | }
1734 |
1735 | #ifdef SUPPORT_DEBUG
1736 | printf("text: %p\n", text);
1737 | printf("data: %p\n", data);
1738 | printf("stack: %p\n", stack);
1739 | #endif
1740 |
1741 | memset(text, 0, poolsize);
1742 | memset(data, 0, poolsize);
1743 | memset(stack, 0, poolsize);
1744 | memset(symbols, 0, poolsize);
1745 | bp = sp = (int *)((int)stack + poolsize);
1746 | ax = 0;
1747 |
1748 | src = "char else enum if int return sizeof while "
1749 | "open read close printf malloc memset memcmp exit void main";
1750 |
1751 | // add keywords to symbol table
1752 | i = Char;
1753 | while (i <= While) {
1754 | next();
1755 | current_id[Token] = i++;
1756 | }
1757 |
1758 | // add library to symbol table
1759 | i = OPEN;
1760 | while (i <= EXIT) {
1761 | next();
1762 | current_id[Class] = Sys;
1763 | current_id[Type] = INT;
1764 | current_id[Value] = i++;
1765 | }
1766 |
1767 | next(); current_id[Token] = Char; // handle void type
1768 | next(); idmain = current_id; // keep track of main
1769 |
1770 |
1771 | // read the source file
1772 | if ((fd = open(*argv, 0)) < 0) {
1773 | printf("could not open(%s)\n", *argv);
1774 | return -1;
1775 | }
1776 |
1777 | if (!(src = old_src = malloc(poolsize))) {
1778 | printf("could not malloc(%d) for source area\n", poolsize);
1779 | return -1;
1780 | }
1781 | // read the source file
1782 | if ((i = read(fd, src, poolsize-1)) <= 0) {
1783 | printf("read() returned %d\n", i);
1784 | return -1;
1785 | }
1786 | src[i] = 0; // add EOF character
1787 | src_begin = src;
1788 | close(fd);
1789 |
1790 | program();
1791 |
1792 | if (!(pc = (int *)idmain[Value])) {
1793 | printf("main() not defined\n");
1794 | return -1;
1795 | }
1796 |
1797 | // setup stack
1798 | sp = (int *)((int)stack + poolsize);
1799 |
1800 | #ifdef SUPPORT_DEBUG
1801 | printf("sp: %x\n", sp);
1802 | #endif
1803 |
1804 | *--sp = EXIT; // call exit if main returns
1805 | *--sp = PUSH; tmp = sp;
1806 | *--sp = argc;
1807 | *--sp = (int)argv;
1808 | *--sp = (int)tmp;
1809 |
1810 | #ifdef SUPPORT_DEBUG
1811 | //print_text();
1812 | //print_data();
1813 | // print_symbol_table();
1814 | #endif
1815 | return eval();
1816 | }
1817 |
--------------------------------------------------------------------------------
/xc.c:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 |
int debug;    // when non-zero, eval() prints every executed instruction
int assembly; // when non-zero, next() prints each source line with the code emitted for it

int token; // current token, set by next()

// Virtual machine instructions.
// The order matters: opcodes up to and including ADJ are followed by one
// operand word (the disassembly code tests `op <= ADJ`), and the numeric value
// of an opcode is used as the index into the mnemonic string tables.
enum { LEA ,IMM ,JMP ,CALL,JZ ,JNZ ,ENT ,ADJ ,LEV ,LI ,LC ,SI ,SC ,PUSH,
OR ,XOR ,AND ,EQ ,NE ,LT ,GT ,LE ,GE ,SHL ,SHR ,ADD ,SUB ,MUL ,DIV ,MOD ,
OPEN,READ,CLOS,PRTF,MALC,MSET,MCMP,EXIT };

// tokens and classes (operators last and in precedence order)
// copied from c4
// Values start at 128 so that single characters can be used directly as
// tokens. expression() compares the operator tokens numerically against its
// `level` argument, so their relative order defines operator precedence.
enum {
Num = 128, Fun, Sys, Glo, Loc, Id,
Char, Else, Enum, If, Int, Return, Sizeof, While,
Assign, Cond, Lor, Lan, Or, Xor, And, Eq, Ne, Lt, Gt, Le, Ge, Shl, Shr, Add, Sub, Mul, Div, Mod, Inc, Dec, Brak
};

// Fields of one symbol table entry; every entry occupies IdSize ints.
// BType/BClass/BValue hold the saved Type/Class/Value of an identifier while
// it is shadowed by a parameter or local variable.
enum {Token, Hash, Name, Type, Class, Value, BType, BClass, BValue, IdSize};


// types of variable/function
// PTR is added once per level of indirection, so `char *` is CHAR + PTR and
// `int **` is INT + PTR + PTR.
enum { CHAR, INT, PTR };

// type of declaration.
enum {Global, Local};

int *text, // text segment; points at the most recently emitted word
*stack;// stack
int * old_text; // for dump text segment
char *data; // data segment; points at the next free byte
int *idmain; // symbol table entry of `main`

char *src, *old_src; // pointer to source code string;

int poolsize; // default size of text/data/stack
int *pc, *bp, *sp, ax, cycle; // virtual machine registers

int *current_id, // current parsed ID
*symbols, // symbol table
line, // line number of source code
token_val; // value of current token (mainly for number)

int basetype; // the type of a declaration, make it global for convenience
int expr_type; // the type of an expression

// function frame
//
// 0: arg 1
// 1: arg 2
// 2: arg 3
// 3: return address
// 4: old bp pointer <- index_of_bp
// 5: local var 1
// 6: local var 2
//
// Parameters and locals are addressed relative to bp with
// `LEA index_of_bp - <index in the list above>`.
int index_of_bp; // index of bp pointer on stack
63 |
64 | void next() {
65 | char *last_pos;
66 | int hash;
67 |
68 | while (token = *src) {
69 | ++src;
70 |
71 | if (token == '\n') {
72 | if (assembly) {
73 | // print compile info
74 | printf("%d: %.*s", line, src-old_src, old_src);
75 | old_src = src;
76 |
77 | while (old_text < text) {
78 | printf("%8.4s", & "LEA ,IMM ,JMP ,CALL,JZ ,JNZ ,ENT ,ADJ ,LEV ,LI ,LC ,SI ,SC ,PUSH,"
79 | "OR ,XOR ,AND ,EQ ,NE ,LT ,GT ,LE ,GE ,SHL ,SHR ,ADD ,SUB ,MUL ,DIV ,MOD ,"
80 | "OPEN,READ,CLOS,PRTF,MALC,MSET,MCMP,EXIT"[*++old_text * 5]);
81 |
82 | if (*old_text <= ADJ)
83 | printf(" %d\n", *++old_text);
84 | else
85 | printf("\n");
86 | }
87 | }
88 | ++line;
89 | }
90 | else if (token == '#') {
91 | // skip macro, because we will not support it
92 | while (*src != 0 && *src != '\n') {
93 | src++;
94 | }
95 | }
96 | else if ((token >= 'a' && token <= 'z') || (token >= 'A' && token <= 'Z') || (token == '_')) {
97 |
98 | // parse identifier
99 | last_pos = src - 1;
100 | hash = token;
101 |
102 | while ((*src >= 'a' && *src <= 'z') || (*src >= 'A' && *src <= 'Z') || (*src >= '0' && *src <= '9') || (*src == '_')) {
103 | hash = hash * 147 + *src;
104 | src++;
105 | }
106 |
107 | // look for existing identifier, linear search
108 | current_id = symbols;
109 | while (current_id[Token]) {
110 | if (current_id[Hash] == hash && !memcmp((char *)current_id[Name], last_pos, src - last_pos)) {
111 | //found one, return
112 | token = current_id[Token];
113 | return;
114 | }
115 | current_id = current_id + IdSize;
116 | }
117 |
118 |
119 | // store new ID
120 | current_id[Name] = (int)last_pos;
121 | current_id[Hash] = hash;
122 | token = current_id[Token] = Id;
123 | return;
124 | }
125 | else if (token >= '0' && token <= '9') {
126 | // parse number, three kinds: dec(123) hex(0x123) oct(017)
127 | token_val = token - '0';
128 | if (token_val > 0) {
129 | // dec, starts with [1-9]
130 | while (*src >= '0' && *src <= '9') {
131 | token_val = token_val*10 + *src++ - '0';
132 | }
133 | } else {
134 | // starts with number 0
135 | if (*src == 'x' || *src == 'X') {
136 | //hex
137 | token = *++src;
138 | while ((token >= '0' && token <= '9') || (token >= 'a' && token <= 'f') || (token >= 'A' && token <= 'F')) {
139 | token_val = token_val * 16 + (token & 15) + (token >= 'A' ? 9 : 0);
140 | token = *++src;
141 | }
142 | } else {
143 | // oct
144 | while (*src >= '0' && *src <= '7') {
145 | token_val = token_val*8 + *src++ - '0';
146 | }
147 | }
148 | }
149 |
150 | token = Num;
151 | return;
152 | }
153 | else if (token == '/') {
154 | if (*src == '/') {
155 | // skip comments
156 | while (*src != 0 && *src != '\n') {
157 | ++src;
158 | }
159 | } else {
160 | // divide operator
161 | token = Div;
162 | return;
163 | }
164 | }
165 | else if (token == '"' || token == '\'') {
166 | // parse string literal, currently, the only supported escape
167 | // character is '\n', store the string literal into data.
168 | last_pos = data;
169 | while (*src != 0 && *src != token) {
170 | token_val = *src++;
171 | if (token_val == '\\') {
172 | // escape character
173 | token_val = *src++;
174 | if (token_val == 'n') {
175 | token_val = '\n';
176 | }
177 | }
178 |
179 | if (token == '"') {
180 | *data++ = token_val;
181 | }
182 | }
183 |
184 | src++;
185 | // if it is a single character, return Num token
186 | if (token == '"') {
187 | token_val = (int)last_pos;
188 | } else {
189 | token = Num;
190 | }
191 |
192 | return;
193 | }
194 | else if (token == '=') {
195 | // parse '==' and '='
196 | if (*src == '=') {
197 | src ++;
198 | token = Eq;
199 | } else {
200 | token = Assign;
201 | }
202 | return;
203 | }
204 | else if (token == '+') {
205 | // parse '+' and '++'
206 | if (*src == '+') {
207 | src ++;
208 | token = Inc;
209 | } else {
210 | token = Add;
211 | }
212 | return;
213 | }
214 | else if (token == '-') {
215 | // parse '-' and '--'
216 | if (*src == '-') {
217 | src ++;
218 | token = Dec;
219 | } else {
220 | token = Sub;
221 | }
222 | return;
223 | }
224 | else if (token == '!') {
225 | // parse '!='
226 | if (*src == '=') {
227 | src++;
228 | token = Ne;
229 | }
230 | return;
231 | }
232 | else if (token == '<') {
233 | // parse '<=', '<<' or '<'
234 | if (*src == '=') {
235 | src ++;
236 | token = Le;
237 | } else if (*src == '<') {
238 | src ++;
239 | token = Shl;
240 | } else {
241 | token = Lt;
242 | }
243 | return;
244 | }
245 | else if (token == '>') {
246 | // parse '>=', '>>' or '>'
247 | if (*src == '=') {
248 | src ++;
249 | token = Ge;
250 | } else if (*src == '>') {
251 | src ++;
252 | token = Shr;
253 | } else {
254 | token = Gt;
255 | }
256 | return;
257 | }
258 | else if (token == '|') {
259 | // parse '|' or '||'
260 | if (*src == '|') {
261 | src ++;
262 | token = Lor;
263 | } else {
264 | token = Or;
265 | }
266 | return;
267 | }
268 | else if (token == '&') {
269 | // parse '&' and '&&'
270 | if (*src == '&') {
271 | src ++;
272 | token = Lan;
273 | } else {
274 | token = And;
275 | }
276 | return;
277 | }
278 | else if (token == '^') {
279 | token = Xor;
280 | return;
281 | }
282 | else if (token == '%') {
283 | token = Mod;
284 | return;
285 | }
286 | else if (token == '*') {
287 | token = Mul;
288 | return;
289 | }
290 | else if (token == '[') {
291 | token = Brak;
292 | return;
293 | }
294 | else if (token == '?') {
295 | token = Cond;
296 | return;
297 | }
298 | else if (token == '~' || token == ';' || token == '{' || token == '}' || token == '(' || token == ')' || token == ']' || token == ',' || token == ':') {
299 | // directly return the character as token;
300 | return;
301 | }
302 | }
303 | }
304 |
305 | void match(int tk) {
306 | if (token == tk) {
307 | next();
308 | } else {
309 | printf("%d: expected token: %d\n", line, tk);
310 | exit(-1);
311 | }
312 | }
313 |
314 |
315 | void expression(int level) {
316 | // expressions have various format.
317 | // but majorly can be divided into two parts: unit and operator
318 | // for example `(char) *a[10] = (int *) func(b > 0 ? 10 : 20);
319 | // `a[10]` is an unit while `*` is an operator.
320 | // `func(...)` in total is an unit.
321 | // so we should first parse those unit and unary operators
322 | // and then the binary ones
323 | //
324 | // also the expression can be in the following types:
325 | //
326 | // 1. unit_unary ::= unit | unit unary_op | unary_op unit
327 | // 2. expr ::= unit_unary (bin_op unit_unary ...)
328 |
329 | // unit_unary()
330 | int *id;
331 | int tmp;
332 | int *addr;
333 | {
334 | if (!token) {
335 | printf("%d: unexpected token EOF of expression\n", line);
336 | exit(-1);
337 | }
338 | if (token == Num) {
339 | match(Num);
340 |
341 | // emit code
342 | *++text = IMM;
343 | *++text = token_val;
344 | expr_type = INT;
345 | }
346 | else if (token == '"') {
347 | // continous string "abc" "abc"
348 |
349 |
350 | // emit code
351 | *++text = IMM;
352 | *++text = token_val;
353 |
354 | match('"');
355 | // store the rest strings
356 | while (token == '"') {
357 | match('"');
358 | }
359 |
360 | // append the end of string character '\0', all the data are default
361 | // to 0, so just move data one position forward.
362 | data = (char *)(((int)data + sizeof(int)) & (-sizeof(int)));
363 | expr_type = PTR;
364 | }
365 | else if (token == Sizeof) {
366 | // sizeof is actually an unary operator
367 | // now only `sizeof(int)`, `sizeof(char)` and `sizeof(*...)` are
368 | // supported.
369 | match(Sizeof);
370 | match('(');
371 | expr_type = INT;
372 |
373 | if (token == Int) {
374 | match(Int);
375 | } else if (token == Char) {
376 | match(Char);
377 | expr_type = CHAR;
378 | }
379 |
380 | while (token == Mul) {
381 | match(Mul);
382 | expr_type = expr_type + PTR;
383 | }
384 |
385 | match(')');
386 |
387 | // emit code
388 | *++text = IMM;
389 | *++text = (expr_type == CHAR) ? sizeof(char) : sizeof(int);
390 |
391 | expr_type = INT;
392 | }
393 | else if (token == Id) {
394 | // there are several type when occurs to Id
395 | // but this is unit, so it can only be
396 | // 1. function call
397 | // 2. Enum variable
398 | // 3. global/local variable
399 | match(Id);
400 |
401 | id = current_id;
402 |
403 | if (token == '(') {
404 | // function call
405 | match('(');
406 |
407 | // pass in arguments
408 | tmp = 0; // number of arguments
409 | while (token != ')') {
410 | expression(Assign);
411 | *++text = PUSH;
412 | tmp ++;
413 |
414 | if (token == ',') {
415 | match(',');
416 | }
417 |
418 | }
419 | match(')');
420 |
421 | // emit code
422 | if (id[Class] == Sys) {
423 | // system functions
424 | *++text = id[Value];
425 | }
426 | else if (id[Class] == Fun) {
427 | // function call
428 | *++text = CALL;
429 | *++text = id[Value];
430 | }
431 | else {
432 | printf("%d: bad function call\n", line);
433 | exit(-1);
434 | }
435 |
436 | // clean the stack for arguments
437 | if (tmp > 0) {
438 | *++text = ADJ;
439 | *++text = tmp;
440 | }
441 | expr_type = id[Type];
442 | }
443 | else if (id[Class] == Num) {
444 | // enum variable
445 | *++text = IMM;
446 | *++text = id[Value];
447 | expr_type = INT;
448 | }
449 | else {
450 | // variable
451 | if (id[Class] == Loc) {
452 | *++text = LEA;
453 | *++text = index_of_bp - id[Value];
454 | }
455 | else if (id[Class] == Glo) {
456 | *++text = IMM;
457 | *++text = id[Value];
458 | }
459 | else {
460 | printf("%d: undefined variable\n", line);
461 | exit(-1);
462 | }
463 |
464 | // emit code, default behaviour is to load the value of the
465 | // address which is stored in `ax`
466 | expr_type = id[Type];
467 | *++text = (expr_type == Char) ? LC : LI;
468 | }
469 | }
470 | else if (token == '(') {
471 | // cast or parenthesis
472 | match('(');
473 | if (token == Int || token == Char) {
474 | tmp = (token == Char) ? CHAR : INT; // cast type
475 | match(token);
476 | while (token == Mul) {
477 | match(Mul);
478 | tmp = tmp + PTR;
479 | }
480 |
481 | match(')');
482 |
483 | expression(Inc); // cast has precedence as Inc(++)
484 |
485 | expr_type = tmp;
486 | } else {
487 | // normal parenthesis
488 | expression(Assign);
489 | match(')');
490 | }
491 | }
492 | else if (token == Mul) {
493 | // dereference *
494 | match(Mul);
495 | expression(Inc); // dereference has the same precedence as Inc(++)
496 |
497 | if (expr_type >= PTR) {
498 | expr_type = expr_type - PTR;
499 | } else {
500 | printf("%d: bad dereference\n", line);
501 | exit(-1);
502 | }
503 |
504 | *++text = (expr_type == CHAR) ? LC : LI;
505 | }
506 | else if (token == And) {
507 | // get the address of
508 | match(And);
509 | expression(Inc); // get the address of
510 | if (*text == LC || *text == LI) {
511 | text --;
512 | } else {
513 | printf("%d: bad address of\n", line);
514 | exit(-1);
515 | }
516 |
517 | expr_type = expr_type + PTR;
518 | }
519 | else if (token == '!') {
520 | // not
521 | match('!');
522 | expression(Inc);
523 |
524 | // emit code, use == 0
525 | *++text = PUSH;
526 | *++text = IMM;
527 | *++text = 0;
528 | *++text = EQ;
529 |
530 | expr_type = INT;
531 | }
532 | else if (token == '~') {
533 | // bitwise not
534 | match('~');
535 | expression(Inc);
536 |
537 | // emit code, use XOR -1
538 | *++text = PUSH;
539 | *++text = IMM;
540 | *++text = -1;
541 | *++text = XOR;
542 |
543 | expr_type = INT;
544 | }
545 | else if (token == Add) {
546 | // +var, do nothing
547 | match(Add);
548 | expression(Inc);
549 |
550 | expr_type = INT;
551 | }
552 | else if (token == Sub) {
553 | // -var
554 | match(Sub);
555 |
556 | if (token == Num) {
557 | *++text = IMM;
558 | *++text = -token_val;
559 | match(Num);
560 | } else {
561 |
562 | *++text = IMM;
563 | *++text = -1;
564 | *++text = PUSH;
565 | expression(Inc);
566 | *++text = MUL;
567 | }
568 |
569 | expr_type = INT;
570 | }
571 | else if (token == Inc || token == Dec) {
572 | tmp = token;
573 | match(token);
574 | expression(Inc);
575 | if (*text == LC) {
576 | *text = PUSH; // to duplicate the address
577 | *++text = LC;
578 | } else if (*text == LI) {
579 | *text = PUSH;
580 | *++text = LI;
581 | } else {
582 | printf("%d: bad lvalue of pre-increment\n", line);
583 | exit(-1);
584 | }
585 | *++text = PUSH;
586 | *++text = IMM;
587 | *++text = (expr_type > PTR) ? sizeof(int) : sizeof(char);
588 | *++text = (tmp == Inc) ? ADD : SUB;
589 | *++text = (expr_type == CHAR) ? SC : SI;
590 | }
591 | else {
592 | printf("%d: bad expression\n", line);
593 | exit(-1);
594 | }
595 | }
596 |
597 | // binary operator and postfix operators.
598 | {
599 | while (token >= level) {
600 | // handle according to current operator's precedence
601 | tmp = expr_type;
602 | if (token == Assign) {
603 | // var = expr;
604 | match(Assign);
605 | if (*text == LC || *text == LI) {
606 | *text = PUSH; // save the lvalue's pointer
607 | } else {
608 | printf("%d: bad lvalue in assignment\n", line);
609 | exit(-1);
610 | }
611 | expression(Assign);
612 |
613 | expr_type = tmp;
614 | *++text = (expr_type == CHAR) ? SC : SI;
615 | }
616 | else if (token == Cond) {
617 | // expr ? a : b;
618 | match(Cond);
619 | *++text = JZ;
620 | addr = ++text;
621 | expression(Assign);
622 | if (token == ':') {
623 | match(':');
624 | } else {
625 | printf("%d: missing colon in conditional\n", line);
626 | exit(-1);
627 | }
628 | *addr = (int)(text + 3);
629 | *++text = JMP;
630 | addr = ++text;
631 | expression(Cond);
632 | *addr = (int)(text + 1);
633 | }
634 | else if (token == Lor) {
635 | // logic or
636 | match(Lor);
637 | *++text = JNZ;
638 | addr = ++text;
639 | expression(Lan);
640 | *addr = (int)(text + 1);
641 | expr_type = INT;
642 | }
643 | else if (token == Lan) {
644 | // logic and
645 | match(Lan);
646 | *++text = JZ;
647 | addr = ++text;
648 | expression(Or);
649 | *addr = (int)(text + 1);
650 | expr_type = INT;
651 | }
652 | else if (token == Or) {
653 | // bitwise or
654 | match(Or);
655 | *++text = PUSH;
656 | expression(Xor);
657 | *++text = OR;
658 | expr_type = INT;
659 | }
660 | else if (token == Xor) {
661 | // bitwise xor
662 | match(Xor);
663 | *++text = PUSH;
664 | expression(And);
665 | *++text = XOR;
666 | expr_type = INT;
667 | }
668 | else if (token == And) {
669 | // bitwise and
670 | match(And);
671 | *++text = PUSH;
672 | expression(Eq);
673 | *++text = AND;
674 | expr_type = INT;
675 | }
676 | else if (token == Eq) {
677 | // equal ==
678 | match(Eq);
679 | *++text = PUSH;
680 | expression(Ne);
681 | *++text = EQ;
682 | expr_type = INT;
683 | }
684 | else if (token == Ne) {
685 | // not equal !=
686 | match(Ne);
687 | *++text = PUSH;
688 | expression(Lt);
689 | *++text = NE;
690 | expr_type = INT;
691 | }
692 | else if (token == Lt) {
693 | // less than
694 | match(Lt);
695 | *++text = PUSH;
696 | expression(Shl);
697 | *++text = LT;
698 | expr_type = INT;
699 | }
700 | else if (token == Gt) {
701 | // greater than
702 | match(Gt);
703 | *++text = PUSH;
704 | expression(Shl);
705 | *++text = GT;
706 | expr_type = INT;
707 | }
708 | else if (token == Le) {
709 | // less than or equal to
710 | match(Le);
711 | *++text = PUSH;
712 | expression(Shl);
713 | *++text = LE;
714 | expr_type = INT;
715 | }
716 | else if (token == Ge) {
717 | // greater than or equal to
718 | match(Ge);
719 | *++text = PUSH;
720 | expression(Shl);
721 | *++text = GE;
722 | expr_type = INT;
723 | }
724 | else if (token == Shl) {
725 | // shift left
726 | match(Shl);
727 | *++text = PUSH;
728 | expression(Add);
729 | *++text = SHL;
730 | expr_type = INT;
731 | }
732 | else if (token == Shr) {
733 | // shift right
734 | match(Shr);
735 | *++text = PUSH;
736 | expression(Add);
737 | *++text = SHR;
738 | expr_type = INT;
739 | }
740 | else if (token == Add) {
741 | // add
742 | match(Add);
743 | *++text = PUSH;
744 | expression(Mul);
745 |
746 | expr_type = tmp;
747 | if (expr_type > PTR) {
748 | // pointer type, and not `char *`
749 | *++text = PUSH;
750 | *++text = IMM;
751 | *++text = sizeof(int);
752 | *++text = MUL;
753 | }
754 | *++text = ADD;
755 | }
756 | else if (token == Sub) {
757 | // sub
758 | match(Sub);
759 | *++text = PUSH;
760 | expression(Mul);
761 | if (tmp > PTR && tmp == expr_type) {
762 | // pointer subtraction
763 | *++text = SUB;
764 | *++text = PUSH;
765 | *++text = IMM;
766 | *++text = sizeof(int);
767 | *++text = DIV;
768 | expr_type = INT;
769 | } else if (tmp > PTR) {
770 | // pointer movement
771 | *++text = PUSH;
772 | *++text = IMM;
773 | *++text = sizeof(int);
774 | *++text = MUL;
775 | *++text = SUB;
776 | expr_type = tmp;
777 | } else {
778 | // numeral subtraction
779 | *++text = SUB;
780 | expr_type = tmp;
781 | }
782 | }
783 | else if (token == Mul) {
784 | // multiply
785 | match(Mul);
786 | *++text = PUSH;
787 | expression(Inc);
788 | *++text = MUL;
789 | expr_type = tmp;
790 | }
791 | else if (token == Div) {
792 | // divide
793 | match(Div);
794 | *++text = PUSH;
795 | expression(Inc);
796 | *++text = DIV;
797 | expr_type = tmp;
798 | }
799 | else if (token == Mod) {
800 | // Modulo
801 | match(Mod);
802 | *++text = PUSH;
803 | expression(Inc);
804 | *++text = MOD;
805 | expr_type = tmp;
806 | }
807 | else if (token == Inc || token == Dec) {
808 | // postfix inc(++) and dec(--)
809 | // we will increase the value to the variable and decrease it
810 | // on `ax` to get its original value.
811 | if (*text == LI) {
812 | *text = PUSH;
813 | *++text = LI;
814 | }
815 | else if (*text == LC) {
816 | *text = PUSH;
817 | *++text = LC;
818 | }
819 | else {
820 | printf("%d: bad value in increment\n", line);
821 | exit(-1);
822 | }
823 |
824 | *++text = PUSH;
825 | *++text = IMM;
826 | *++text = (expr_type > PTR) ? sizeof(int) : sizeof(char);
827 | *++text = (token == Inc) ? ADD : SUB;
828 | *++text = (expr_type == CHAR) ? SC : SI;
829 | *++text = PUSH;
830 | *++text = IMM;
831 | *++text = (expr_type > PTR) ? sizeof(int) : sizeof(char);
832 | *++text = (token == Inc) ? SUB : ADD;
833 | match(token);
834 | }
835 | else if (token == Brak) {
836 | // array access var[xx]
837 | match(Brak);
838 | *++text = PUSH;
839 | expression(Assign);
840 | match(']');
841 |
842 | if (tmp > PTR) {
843 | // pointer, `not char *`
844 | *++text = PUSH;
845 | *++text = IMM;
846 | *++text = sizeof(int);
847 | *++text = MUL;
848 | }
849 | else if (tmp < PTR) {
850 | printf("%d: pointer type expected\n", line);
851 | exit(-1);
852 | }
853 | expr_type = tmp - PTR;
854 | *++text = ADD;
855 | *++text = (expr_type == CHAR) ? LC : LI;
856 | }
857 | else {
858 | printf("%d: compiler error, token = %d\n", line, token);
859 | exit(-1);
860 | }
861 | }
862 | }
863 | }
864 |
865 | void statement() {
866 | // there are 8 kinds of statements here:
867 | // 1. if (...) [else ]
868 | // 2. while (...)
869 | // 3. { }
870 | // 4. return xxx;
871 | // 5. ;
872 | // 6. expression; (expression end with semicolon)
873 |
874 | int *a, *b; // bess for branch control
875 |
876 | if (token == If) {
877 | // if (...) [else ]
878 | //
879 | // if (...)
880 | // JZ a
881 | //
882 | // else: JMP b
883 | // a:
884 | //
885 | // b: b:
886 | //
887 | //
888 | match(If);
889 | match('(');
890 | expression(Assign); // parse condition
891 | match(')');
892 |
893 | // emit code for if
894 | *++text = JZ;
895 | b = ++text;
896 |
897 | statement(); // parse statement
898 | if (token == Else) { // parse else
899 | match(Else);
900 |
901 | // emit code for JMP B
902 | *b = (int)(text + 3);
903 | *++text = JMP;
904 | b = ++text;
905 |
906 | statement();
907 | }
908 |
909 | *b = (int)(text + 1);
910 | }
911 | else if (token == While) {
912 | //
913 | // a: a:
914 | // while ()
915 | // JZ b
916 | //
917 | // JMP a
918 | // b: b:
919 | match(While);
920 |
921 | a = text + 1;
922 |
923 | match('(');
924 | expression(Assign);
925 | match(')');
926 |
927 | *++text = JZ;
928 | b = ++text;
929 |
930 | statement();
931 |
932 | *++text = JMP;
933 | *++text = (int)a;
934 | *b = (int)(text + 1);
935 | }
936 | else if (token == '{') {
937 | // { ... }
938 | match('{');
939 |
940 | while (token != '}') {
941 | statement();
942 | }
943 |
944 | match('}');
945 | }
946 | else if (token == Return) {
947 | // return [expression];
948 | match(Return);
949 |
950 | if (token != ';') {
951 | expression(Assign);
952 | }
953 |
954 | match(';');
955 |
956 | // emit code for return
957 | *++text = LEV;
958 | }
959 | else if (token == ';') {
960 | // empty statement
961 | match(';');
962 | }
963 | else {
964 | // a = b; or function_call();
965 | expression(Assign);
966 | match(';');
967 | }
968 | }
969 |
970 | void enum_declaration() {
971 | // parse enum [id] { a = 1, b = 3, ...}
972 | int i;
973 | i = 0;
974 | while (token != '}') {
975 | if (token != Id) {
976 | printf("%d: bad enum identifier %d\n", line, token);
977 | exit(-1);
978 | }
979 | next();
980 | if (token == Assign) {
981 | // like {a=10}
982 | next();
983 | if (token != Num) {
984 | printf("%d: bad enum initializer\n", line);
985 | exit(-1);
986 | }
987 | i = token_val;
988 | next();
989 | }
990 |
991 | current_id[Class] = Num;
992 | current_id[Type] = INT;
993 | current_id[Value] = i++;
994 |
995 | if (token == ',') {
996 | next();
997 | }
998 | }
999 | }
1000 |
1001 | void function_parameter() {
1002 | int type;
1003 | int params;
1004 | params = 0;
1005 | while (token != ')') {
1006 | // int name, ...
1007 | type = INT;
1008 | if (token == Int) {
1009 | match(Int);
1010 | } else if (token == Char) {
1011 | type = CHAR;
1012 | match(Char);
1013 | }
1014 |
1015 | // pointer type
1016 | while (token == Mul) {
1017 | match(Mul);
1018 | type = type + PTR;
1019 | }
1020 |
1021 | // parameter name
1022 | if (token != Id) {
1023 | printf("%d: bad parameter declaration\n", line);
1024 | exit(-1);
1025 | }
1026 | if (current_id[Class] == Loc) {
1027 | printf("%d: duplicate parameter declaration\n", line);
1028 | exit(-1);
1029 | }
1030 |
1031 | match(Id);
1032 | // store the local variable
1033 | current_id[BClass] = current_id[Class]; current_id[Class] = Loc;
1034 | current_id[BType] = current_id[Type]; current_id[Type] = type;
1035 | current_id[BValue] = current_id[Value]; current_id[Value] = params++; // index of current parameter
1036 |
1037 | if (token == ',') {
1038 | match(',');
1039 | }
1040 | }
1041 | index_of_bp = params+1;
1042 | }
1043 |
1044 | void function_body() {
1045 | // type func_name (...) {...}
1046 | // -->| |<--
1047 |
1048 | // ... {
1049 | // 1. local declarations
1050 | // 2. statements
1051 | // }
1052 |
1053 | int pos_local; // position of local variables on the stack.
1054 | int type;
1055 | pos_local = index_of_bp;
1056 |
1057 | while (token == Int || token == Char) {
1058 | // local variable declaration, just like global ones.
1059 | basetype = (token == Int) ? INT : CHAR;
1060 | match(token);
1061 |
1062 | while (token != ';') {
1063 | type = basetype;
1064 | while (token == Mul) {
1065 | match(Mul);
1066 | type = type + PTR;
1067 | }
1068 |
1069 | if (token != Id) {
1070 | // invalid declaration
1071 | printf("%d: bad local declaration\n", line);
1072 | exit(-1);
1073 | }
1074 | if (current_id[Class] == Loc) {
1075 | // identifier exists
1076 | printf("%d: duplicate local declaration\n", line);
1077 | exit(-1);
1078 | }
1079 | match(Id);
1080 |
1081 | // store the local variable
1082 | current_id[BClass] = current_id[Class]; current_id[Class] = Loc;
1083 | current_id[BType] = current_id[Type]; current_id[Type] = type;
1084 | current_id[BValue] = current_id[Value]; current_id[Value] = ++pos_local; // index of current parameter
1085 |
1086 | if (token == ',') {
1087 | match(',');
1088 | }
1089 | }
1090 | match(';');
1091 | }
1092 |
1093 | // save the stack size for local variables
1094 | *++text = ENT;
1095 | *++text = pos_local - index_of_bp;
1096 |
1097 | // statements
1098 | while (token != '}') {
1099 | statement();
1100 | }
1101 |
1102 | // emit code for leaving the sub function
1103 | *++text = LEV;
1104 | }
1105 |
1106 | void function_declaration() {
1107 | // type func_name (...) {...}
1108 | // | this part
1109 |
1110 | match('(');
1111 | function_parameter();
1112 | match(')');
1113 | match('{');
1114 | function_body();
1115 | //match('}');
1116 |
1117 | // unwind local variable declarations for all local variables.
1118 | current_id = symbols;
1119 | while (current_id[Token]) {
1120 | if (current_id[Class] == Loc) {
1121 | current_id[Class] = current_id[BClass];
1122 | current_id[Type] = current_id[BType];
1123 | current_id[Value] = current_id[BValue];
1124 | }
1125 | current_id = current_id + IdSize;
1126 | }
1127 | }
1128 |
1129 | void global_declaration() {
1130 | // int [*]id [; | (...) {...}]
1131 |
1132 |
1133 | int type; // tmp, actual type for variable
1134 | int i; // tmp
1135 |
1136 | basetype = INT;
1137 |
1138 | // parse enum, this should be treated alone.
1139 | if (token == Enum) {
1140 | // enum [id] { a = 10, b = 20, ... }
1141 | match(Enum);
1142 | if (token != '{') {
1143 | match(Id); // skip the [id] part
1144 | }
1145 | if (token == '{') {
1146 | // parse the assign part
1147 | match('{');
1148 | enum_declaration();
1149 | match('}');
1150 | }
1151 |
1152 | match(';');
1153 | return;
1154 | }
1155 |
1156 | // parse type information
1157 | if (token == Int) {
1158 | match(Int);
1159 | }
1160 | else if (token == Char) {
1161 | match(Char);
1162 | basetype = CHAR;
1163 | }
1164 |
1165 | // parse the comma seperated variable declaration.
1166 | while (token != ';' && token != '}') {
1167 | type = basetype;
1168 | // parse pointer type, note that there may exist `int ****x;`
1169 | while (token == Mul) {
1170 | match(Mul);
1171 | type = type + PTR;
1172 | }
1173 |
1174 | if (token != Id) {
1175 | // invalid declaration
1176 | printf("%d: bad global declaration\n", line);
1177 | exit(-1);
1178 | }
1179 | if (current_id[Class]) {
1180 | // identifier exists
1181 | printf("%d: duplicate global declaration\n", line);
1182 | exit(-1);
1183 | }
1184 | match(Id);
1185 | current_id[Type] = type;
1186 |
1187 | if (token == '(') {
1188 | current_id[Class] = Fun;
1189 | current_id[Value] = (int)(text + 1); // the memory address of function
1190 | function_declaration();
1191 | } else {
1192 | // variable declaration
1193 | current_id[Class] = Glo; // global variable
1194 | current_id[Value] = (int)data; // assign memory address
1195 | data = data + sizeof(int);
1196 | }
1197 |
1198 | if (token == ',') {
1199 | match(',');
1200 | }
1201 | }
1202 | next();
1203 | }
1204 |
1205 | void program() {
1206 | // get next token
1207 | next();
1208 | while (token > 0) {
1209 | global_declaration();
1210 | }
1211 | }
1212 |
1213 | int eval() {
1214 | int op, *tmp;
1215 | cycle = 0;
1216 | while (1) {
1217 | cycle ++;
1218 | op = *pc++; // get next operation code
1219 |
1220 | // print debug info
1221 | if (debug) {
1222 | printf("%d> %.4s", cycle,
1223 | & "LEA ,IMM ,JMP ,CALL,JZ ,JNZ ,ENT ,ADJ ,LEV ,LI ,LC ,SI ,SC ,PUSH,"
1224 | "OR ,XOR ,AND ,EQ ,NE ,LT ,GT ,LE ,GE ,SHL ,SHR ,ADD ,SUB ,MUL ,DIV ,MOD ,"
1225 | "OPEN,READ,CLOS,PRTF,MALC,MSET,MCMP,EXIT"[op * 5]);
1226 | if (op <= ADJ)
1227 | printf(" %d\n", *pc);
1228 | else
1229 | printf("\n");
1230 | }
1231 | if (op == IMM) {ax = *pc++;} // load immediate value to ax
1232 | else if (op == LC) {ax = *(char *)ax;} // load character to ax, address in ax
1233 | else if (op == LI) {ax = *(int *)ax;} // load integer to ax, address in ax
1234 | else if (op == SC) {ax = *(char *)*sp++ = ax;} // save character to address, value in ax, address on stack
1235 | else if (op == SI) {*(int *)*sp++ = ax;} // save integer to address, value in ax, address on stack
1236 | else if (op == PUSH) {*--sp = ax;} // push the value of ax onto the stack
1237 | else if (op == JMP) {pc = (int *)*pc;} // jump to the address
1238 | else if (op == JZ) {pc = ax ? pc + 1 : (int *)*pc;} // jump if ax is zero
1239 | else if (op == JNZ) {pc = ax ? (int *)*pc : pc + 1;} // jump if ax is zero
1240 | else if (op == CALL) {*--sp = (int)(pc+1); pc = (int *)*pc;} // call subroutine
1241 | //else if (op == RET) {pc = (int *)*sp++;} // return from subroutine;
1242 | else if (op == ENT) {*--sp = (int)bp; bp = sp; sp = sp - *pc++;} // make new stack frame
1243 | else if (op == ADJ) {sp = sp + *pc++;} // add esp,
1244 | else if (op == LEV) {sp = bp; bp = (int *)*sp++; pc = (int *)*sp++;} // restore call frame and PC
1245 | else if (op == LEA) {ax = (int)(bp + *pc++);} // load address for arguments.
1246 |
1247 | else if (op == OR) ax = *sp++ | ax;
1248 | else if (op == XOR) ax = *sp++ ^ ax;
1249 | else if (op == AND) ax = *sp++ & ax;
1250 | else if (op == EQ) ax = *sp++ == ax;
1251 | else if (op == NE) ax = *sp++ != ax;
1252 | else if (op == LT) ax = *sp++ < ax;
1253 | else if (op == LE) ax = *sp++ <= ax;
1254 | else if (op == GT) ax = *sp++ > ax;
1255 | else if (op == GE) ax = *sp++ >= ax;
1256 | else if (op == SHL) ax = *sp++ << ax;
1257 | else if (op == SHR) ax = *sp++ >> ax;
1258 | else if (op == ADD) ax = *sp++ + ax;
1259 | else if (op == SUB) ax = *sp++ - ax;
1260 | else if (op == MUL) ax = *sp++ * ax;
1261 | else if (op == DIV) ax = *sp++ / ax;
1262 | else if (op == MOD) ax = *sp++ % ax;
1263 |
1264 | else if (op == EXIT) { printf("exit(%d)", *sp); return *sp;}
1265 | else if (op == OPEN) { ax = open((char *)sp[1], sp[0]); }
1266 | else if (op == CLOS) { ax = close(*sp);}
1267 | else if (op == READ) { ax = read(sp[2], (char *)sp[1], *sp); }
1268 | else if (op == PRTF) { tmp = sp + pc[1]; ax = printf((char *)tmp[-1], tmp[-2], tmp[-3], tmp[-4], tmp[-5], tmp[-6]); }
1269 | else if (op == MALC) { ax = (int)malloc(*sp);}
1270 | else if (op == MSET) { ax = (int)memset((char *)sp[2], sp[1], *sp);}
1271 | else if (op == MCMP) { ax = memcmp((char *)sp[2], (char *)sp[1], *sp);}
1272 | else {
1273 | printf("unknown instruction:%d\n", op);
1274 | return -1;
1275 | }
1276 | }
1277 | }
1278 |
1279 | int main(int argc, char **argv)
1280 | {
1281 | int i, fd;
1282 | int *tmp;
1283 |
1284 | argc--;
1285 | argv++;
1286 |
1287 | // parse arguments
1288 | if (argc > 0 && **argv == '-' && (*argv)[1] == 's') {
1289 | assembly = 1;
1290 | --argc;
1291 | ++argv;
1292 | }
1293 | if (argc > 0 && **argv == '-' && (*argv)[1] == 'd') {
1294 | debug = 1;
1295 | --argc;
1296 | ++argv;
1297 | }
1298 | if (argc < 1) {
1299 | printf("usage: xc [-s] [-d] file ...\n");
1300 | return -1;
1301 | }
1302 |
1303 | if ((fd = open(*argv, 0)) < 0) {
1304 | printf("could not open(%s)\n", *argv);
1305 | return -1;
1306 | }
1307 |
1308 | poolsize = 256 * 1024; // arbitrary size
1309 | line = 1;
1310 |
1311 | // allocate memory
1312 | if (!(text = malloc(poolsize))) {
1313 | printf("could not malloc(%d) for text area\n", poolsize);
1314 | return -1;
1315 | }
1316 | if (!(data = malloc(poolsize))) {
1317 | printf("could not malloc(%d) for data area\n", poolsize);
1318 | return -1;
1319 | }
1320 | if (!(stack = malloc(poolsize))) {
1321 | printf("could not malloc(%d) for stack area\n", poolsize);
1322 | return -1;
1323 | }
1324 | if (!(symbols = malloc(poolsize))) {
1325 | printf("could not malloc(%d) for symbol table\n", poolsize);
1326 | return -1;
1327 | }
1328 |
1329 | memset(text, 0, poolsize);
1330 | memset(data, 0, poolsize);
1331 | memset(stack, 0, poolsize);
1332 | memset(symbols, 0, poolsize);
1333 |
1334 | old_text = text;
1335 |
1336 | src = "char else enum if int return sizeof while "
1337 | "open read close printf malloc memset memcmp exit void main";
1338 |
1339 | // add keywords to symbol table
1340 | i = Char;
1341 | while (i <= While) {
1342 | next();
1343 | current_id[Token] = i++;
1344 | }
1345 |
1346 | // add library to symbol table
1347 | i = OPEN;
1348 | while (i <= EXIT) {
1349 | next();
1350 | current_id[Class] = Sys;
1351 | current_id[Type] = INT;
1352 | current_id[Value] = i++;
1353 | }
1354 |
1355 | next(); current_id[Token] = Char; // handle void type
1356 | next(); idmain = current_id; // keep track of main
1357 |
1358 | if (!(src = old_src = malloc(poolsize))) {
1359 | printf("could not malloc(%d) for source area\n", poolsize);
1360 | return -1;
1361 | }
1362 | // read the source file
1363 | if ((i = read(fd, src, poolsize-1)) <= 0) {
1364 | printf("read() returned %d\n", i);
1365 | return -1;
1366 | }
1367 | src[i] = 0; // add EOF character
1368 | close(fd);
1369 |
1370 | program();
1371 |
1372 | if (!(pc = (int *)idmain[Value])) {
1373 | printf("main() not defined\n");
1374 | return -1;
1375 | }
1376 |
1377 | // dump_text();
1378 | if (assembly) {
1379 | // only for compile
1380 | return 0;
1381 | }
1382 |
1383 | // setup stack
1384 | sp = (int *)((int)stack + poolsize);
1385 | *--sp = EXIT; // call exit if main returns
1386 | *--sp = PUSH; tmp = sp;
1387 | *--sp = argc;
1388 | *--sp = (int)argv;
1389 | *--sp = (int)tmp;
1390 |
1391 | return eval();
1392 | }
1393 |
--------------------------------------------------------------------------------