├── .gitignore
├── project
    ├── build.properties
    └── build
    │   └── LetsBuildACompiler.scala
├── src
    ├── test
    │   └── scala
    │   │   └── CradleSpec.scala
    └── main
    │   └── scala
    │       └── Cradle.scala
├── README.textile
└── reference
    └── crenshaw-txt
        ├── readme.txt
        ├── tutor1.txt
        ├── tutor8.txt
        ├── tutor4.txt
        ├── tutor9.txt
        ├── tutor2.txt
        ├── tutor12.txt
        └── tutor3.txt


/.gitignore:
--------------------------------------------------------------------------------
1 | lib_managed
2 | project/boot
3 | target


--------------------------------------------------------------------------------
/project/build.properties:
--------------------------------------------------------------------------------
1 | #Project properties
2 | #Sat Nov 28 17:14:14 PST 2009
3 | project.organization=Alex Payne
4 | project.name=Let's Build A Compiler
5 | sbt.version=0.5.6
6 | project.version=1.0
7 | scala.version=2.7.7
8 | project.initialize=false
9 | 


--------------------------------------------------------------------------------
/project/build/LetsBuildACompiler.scala:
--------------------------------------------------------------------------------
1 | import sbt._
2 | 
3 | class LetsBuildACompilerProject(info: ProjectInfo) extends DefaultProject(info) {
4 | 	// repositories
5 | 	val scalaTools = "Scala-Tools" at "http://scala-tools.org/repo-releases"
6 | 
7 | 	// dependencies
8 | 	val specs = "org.scala-tools.testing" % "specs" % "1.6.1"
9 | }


--------------------------------------------------------------------------------
/src/test/scala/CradleSpec.scala:
--------------------------------------------------------------------------------
 1 | package net.al3x.letsbuildacompiler.test
 2 | 
 3 | import org.specs._
 4 | import scala.io.Source
 5 | 
 6 | object CradleSpec extends Specification {
 7 |   "the stupid cradle" should {
 8 |     var cradle = null
 9 | 
10 |     "emit assembly for a single number input" in {
11 |       val input = Source.fromString("1")
12 |       val cradle = new Cradle(input)
13 |       cradle.init
14 |       cradle.expression mustEqual "\tMOVE #1,D0"
15 |     }
16 |   }
17 | }


--------------------------------------------------------------------------------
/README.textile:
--------------------------------------------------------------------------------
 1 | h1. Let's Build A Compiler!
 2 | 
 3 | h2. Introduction
 4 | 
 5 | This an implementation of a simple compiler, built roughly in accordance with a "tutorial by Crenshaw, 1988":http://compilers.iecc.com/crenshaw/. An ASCII copy of the tutorial is included in the 'reference' directory.
 6 | 
 7 | Crenshaw implements his compiler in a particular edition of Pascal, long since obsolete. I'll attempt to implement mine in Scala (2.7.7).
 8 | 
 9 | h2. Progress
10 | 
11 | * 2009Nov28 -
12 | 
13 | h2. Authors
14 | 
15 | "Alex Payne":http://al3x.net/ (at the behest and encouragement of "Steve Jenson":http://saladwithsteve.com/).


--------------------------------------------------------------------------------
/src/main/scala/Cradle.scala:
--------------------------------------------------------------------------------
 1 | package net.al3x.letsbuildacompiler
 2 | 
 3 | import scala.io.Source
 4 | 
 5 | 
 6 | class Cradle(input: Source) {
 7 |   var look: Char = ' '
 8 | 
 9 |   // Read New Character From Input Stream
10 |   def getChar: Char = {
11 |     look = input.next
12 |     look
13 |   }
14 | 
15 |   // Report an Error
16 |   // throw new TypeOfException("error message")
17 | 
18 |   // Report Error and Halt
19 |   // throw new TypeOfException("error message"); exit(-1)
20 | 
21 |   // Report What Was Expected
22 |   def expected(s: String) = {
23 |     println("expected " + s)
24 |     System.exit(-1)
25 |   }
26 | 
27 |   def expected(c: Char) = {
28 |     println("expected " + c.toString)
29 |     System.exit(-1)
30 |   }
31 | 
32 |   // Match a Specific Input Character
33 |   def matchChar(c: Char) = {
34 |     if (look == c) {
35 |       getChar
36 |     } else {
37 |       expected(c)
38 |     }
39 |   }
40 | 
41 |   // Recognize an Alpha Character
42 |   // char.isLetter
43 | 
44 |   // Recognize a Decimal Digit
45 |   // char.isDigit
46 | 
47 |   // Get an Identifier
48 |   def getName: Char = {
49 |     if (!look.isLetter) {
50 |       expected("Name")
51 |     } else {
52 |       return look
53 |     }
54 |     getChar
55 |   }
56 | 
57 |   // Get a Number
58 |   def getNum: Char = {
59 |     if (!look.isDigit) {
60 |       expected("Integer")
61 |     } else {
62 |       return look
63 |     }
64 |     getChar
65 |   }
66 | 
67 |   // Output a String with Tab
68 |   def emit(s: String) = "\t" + s
69 | 
70 |   // Output a String with Tab and CRLF
71 |   def emitLn(s: String) = emit(s)
72 | 
73 |   // Parse and Translate a Math Expression
74 |   def expression = {
75 |     emitLn("MOVE #" + getNum + ",D0")
76 |   }
77 | 
78 |   // Initialize
79 |   def init = getChar
80 | }
81 | 
82 | 
83 | object Cradle {
84 |   def main(args: Array[String]) {
85 |     val c = new Cradle(Source.fromInputStream(System.in))
86 |     c.init
87 |     c.expression
88 |   }
89 | }
90 | 


--------------------------------------------------------------------------------
/reference/crenshaw-txt/readme.txt:
--------------------------------------------------------------------------------
 1 | TUTOR.ZIP
 2 | 
 3 | This file contains all of the installments of Jack Crenshaw's
 4 | tutorial on compiler construction, including the new Installment 15. 
 5 | The intended audience is those folks who are not computer scientists,
 6 | but who enjoy computing and have always wanted to know how compilers
 7 | work. A lot of compiler theory has been left out, but the practical
 8 | issues are covered. By the time you have completed the series, you
 9 | should be able to design and build your own working compiler. It will
10 | not be the world's best, nor will it put out incredibly tight code.
11 | Your product will probably never put Borland or MicroSoft out of
12 | business.  But it will work, and it will be yours.
13 | 
14 | A word about the file format: The files were originally created using
15 | Borland's DOS editor, Sprint.  Sprint could write to a text file only
16 | if you formatted the file to go to the selected printer.  I used the
17 | most common printer I could think of, the Epson MX-80, but even then
18 | the files ended up with printer control sequences at the beginning
19 | and end of each page.
20 | 
21 | To bring the files up to date and get myself positioned to continue
22 | the series, I recently (1994) converted all the files to work with
23 | Microsoft Word for Windows.  Unlike Sprint, Word allows you to write
24 | the file as a DOS text file.  Unfortunately, this gave me a new
25 | problem, because when Word is writing to a text file, it doesn't
26 | write hard page breaks or page numbers.  In other words, in six years
27 | we've gone from a file with page breaks and page numbers, but
28 | embedded escape sequences, to files with no embedded escape sequences
29 | but no page breaks or page numbers.  Isn't progress wonderful?
30 | 
31 | Of course, it's possible for me to insert the page numbers as
32 | straight text, rather than asking the editor to do it for me.  But
33 | since Word won't allow me to write page breaks to the file, we would
34 | end up with files with page numbers that may or may not fall at the
35 | ends of the pages, depending on your editor and your printer.  It
36 | seems to me that almost every file I've ever downloaded from
37 | CompuServe or BBS's that had such page numbering was incompatible
38 | with my printer, and gave me pages that were one line short or one
39 | line long, with the page numbers consequently walking up the page.  
40 | 
41 | So perhaps this new format is, after all, the safest one for general
42 | distribution.  The files as they exist will look just fine if read
43 | into any text editor capable of reading DOS text files.  Since most
44 | editors these days include rather sophisticated word processing
45 | capabilities, you should be able to get your editor to paginate for
46 | you, prior to printing.
47 | 
48 | I hope you like the tutorials.  Much thought went into them.
49 | 
50 | 
51 | 									Jack W. Crenshaw
52 | 
53 | 								CompuServe 72325,1327
54 | 


--------------------------------------------------------------------------------
/reference/crenshaw-txt/tutor1.txt:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | 
  4 | 
  5 | 
  6 | 
  7 | 
  8 | 
  9 | 
 10 | 
 11 | 
 12 | 
 13 | 
 14 | 
 15 | 
 16 | 
 17 | 
 18 | 
 19 | 
 20 | 
 21 | 
 22 | 
 23 | 
 24 | 
 25 | 
 26 | 
 27 | 
 28 |                             LET'S BUILD A COMPILER!
 29 | 
 30 |                                        By
 31 | 
 32 |                             Jack W. Crenshaw, Ph.D.
 33 | 
 34 |                                   24 July 1988
 35 | 
 36 | 
 37 |                               Part I: INTRODUCTION
 38 | 
 39 | 
 40 | *****************************************************************
 41 | *                                                               *
 42 | *                        COPYRIGHT NOTICE                       *
 43 | *                                                               *
 44 | *   Copyright (C) 1988 Jack W. Crenshaw. All rights reserved.   *
 45 | *                                                               *
 46 | *****************************************************************
 47 | 
 48 | 
 49 | INTRODUCTION
 50 | 
 51 | 
 52 | This series of articles is a tutorial on the theory  and practice
 53 | of  developing language parsers and compilers.    Before  we  are
 54 | finished,  we  will  have  covered  every   aspect   of  compiler
 55 | construction, designed a new programming  language,  and  built a
 56 | working compiler.
 57 | 
 58 | Though I am not a computer scientist by education (my Ph.D. is in
 59 | a different  field, Physics), I have been interested in compilers
 60 | for many years.  I have  bought  and tried to digest the contents
 61 | of virtually every  book  on  the  subject ever written.  I don't
 62 | mind  telling you that it was slow going.    Compiler  texts  are
 63 | written for Computer  Science  majors, and are tough sledding for
 64 | the rest of us.  But over the years a bit of it began to seep in.
 65 | What really caused it to jell was when I began  to  branch off on
 66 | my own and begin to try things on my own computer.  Now I plan to
 67 | share with you what I have  learned.    At the end of this series
 68 | you will by no means be  a  computer scientist, nor will you know
 69 | all the esoterics of  compiler  theory.    I intend to completely
 70 | ignore the more theoretical  aspects  of  the  subject.  What you
 71 | _WILL_ know is all  the  practical aspects that one needs to know
 72 | to build a working system.
 73 | 
 74 | This is a "learn-by-doing" series.  In the course of the series I
 75 | will be performing  experiments  on  a  computer.    You  will be
 76 | expected to follow along,  repeating  the  experiments that I do,
 77 | and  performing  some  on your own.  I will be using Turbo Pascal
 78 | 4.0 on a PC  clone.   I will periodically insert examples written
 79 | in TP.  These will be executable code, which you will be expected
 80 | to copy into your own computer and run.  If you don't have a copy
 81 | of  Turbo,  you  will be severely limited in how well you will be
 82 | able to follow what's going on.  If you don't have a copy, I urge
 83 | you to get one.  After  all,  it's an excellent product, good for
 84 | many other uses!
 85 | 
 86 | Some articles on compilers show you examples, or show you  (as in
 87 | the case of Small-C) a finished product, which you can  then copy
 88 | and  use without a whole lot of understanding of how it works.  I
 89 | hope to do much more  than  that.    I  hope to teach you HOW the
 90 | things get done,  so that you can go off on your own and not only
 91 | reproduce what I have done, but improve on it.
 92 |                               
 93 | This is admittedly an ambitious undertaking, and it won't be done
 94 | in  one page.  I expect to do it in the course  of  a  number  of
 95 | articles.    Each  article will cover a single aspect of compiler
 96 | theory,  and  will  pretty  much  stand  alone.   If  all  you're
 97 | interested in at a given time is one  aspect,  then  you  need to
 98 | look only at that one article.  Each article will be  uploaded as
 99 | it  is complete, so you will have to wait for the last one before
100 | you can consider yourself finished.  Please be patient.
101 | 
102 | 
103 | 
104 | The average text on  compiler  theory covers a lot of ground that
105 | we won't be covering here.  The typical sequence is:
106 | 
107 |  o An introductory chapter describing what a compiler is.
108 | 
109 |  o A chapter or two on syntax equations, using Backus-Naur Form
110 |    (BNF).
111 | 
112 |  o A chapter or two on lexical scanning, with emphasis on
113 |    deterministic and non-deterministic finite automata.
114 | 
115 |  o Several chapters on parsing theory, beginning with top-down
116 |    recursive descent, and ending with LALR parsers.
117 | 
118 |  o A chapter on intermediate languages, with emphasis on P-code
119 |    and similar reverse polish representations.
120 | 
121 |  o Many chapters on alternative ways to handle subroutines and
122 |    parameter passing, type declarations, and such.
123 | 
124 |  o A chapter toward the end on code generation, usually for some
125 |    imaginary CPU with a simple instruction set.  Most readers
126 |    (and in fact, most college classes) never make it this far.
127 | 
128 |  o A final chapter or two on optimization. This chapter often
129 |    goes unread, too.
130 | 
131 | 
132 | I'll  be taking a much different approach in  this  series.    To
133 | begin  with,  I  won't dwell long on options.  I'll be giving you
134 | _A_ way that works.  If you want  to  explore  options,  well and
135 | good ...  I  encourage  you  to do so ... but I'll be sticking to
136 | what I know.   I also will skip over most of the theory that puts
137 | people  to  sleep.  Don't get me  wrong:  I  don't  belittle  the
138 | theory, and it's vitally important  when it comes to dealing with
139 | the more tricky  parts  of  a  given  language.  But I believe in
140 | putting first things first.    Here we'll be dealing with the 95%
141 | of compiler techniques that don't need a lot of theory to handle.
142 | 
143 | I  also  will  discuss only one approach  to  parsing:  top-down,
144 | recursive descent parsing, which is the  _ONLY_  technique that's
145 | at  all   amenable  to  hand-crafting  a  compiler.    The  other
146 | approaches are only useful if you have a tool like YACC, and also
147 | don't care how much memory space the final product uses.
148 |                               
149 | I  also take a page from the work of Ron Cain, the author of  the
150 | original Small C.  Whereas almost all other compiler authors have
151 | historically  used  an  intermediate  language  like  P-code  and
152 | divided  the  compiler  into two parts (a front end that produces
153 | P-code,  and   a  back  end  that  processes  P-code  to  produce
154 | executable   object  code),  Ron  showed  us   that   it   is   a
155 | straightforward  matter  to  make  a  compiler  directly  produce
156 | executable  object  code,  in  the  form  of  assembler  language
157 | statements.  The code will _NOT_ be the world's tightest code ...
158 | producing optimized code is  a  much  more  difficult job. But it
159 | will work, and work reasonably well.  Just so that I  don't leave
160 | you with the impression that our end product will be worthless, I
161 | _DO_ intend to show you how  to  "soup up" the compiler with some
162 | optimization.
163 | 
164 | 
165 | 
166 | Finally, I'll be  using  some  tricks  that I've found to be most
167 | helpful in letting  me  understand what's going on without wading
168 | through a lot of boiler plate.  Chief among these  is  the use of
169 | single-character tokens, with no embedded spaces,  for  the early
170 | design work.  I figure that  if  I  can get a parser to recognize
171 | and deal with I-T-L, I can  get  it  to do the same with IF-THEN-
172 | ELSE.  And I can.  In the second "lesson,"   I'll  show  you just
173 | how easy it  is  to  extend  a  simple parser to handle tokens of
174 | arbitrary length.  As another  trick,  I  completely  ignore file
175 | I/O, figuring that  if  I  can  read source from the keyboard and
176 | output object to the screen, I can also do it from/to disk files.
177 | Experience  has  proven  that  once  a   translator   is  working
178 | correctly, it's a  straightforward  matter to redirect the I/O to
179 | files.    The last trick is that I make no attempt  to  do  error
180 | correction/recovery.   The   programs   we'll  be  building  will
181 | RECOGNIZE errors, and will not CRASH, but they  will  simply stop
182 | on the first error ... just like good ol' Turbo does.  There will
183 | be  other tricks that you'll see as you go. Most of them can't be
184 | found in any compiler textbook, but they work.
185 | 
186 | A word about style and efficiency.    As  you will see, I tend to
187 | write programs in  _VERY_  small, easily understood pieces.  None
188 | of the procedures we'll  be  working with will be more than about
189 | 15-20 lines long.  I'm a fervent devotee  of  the  KISS  (Keep It
190 | Simple, Sidney) school of software development.  I  try  to never
191 | do something tricky or  complex,  when  something simple will do.
192 | Inefficient?  Perhaps, but you'll like the  results.    As  Brian
193 | Kernighan has said,  FIRST  make  it  run, THEN make it run fast.
194 | If, later on,  you want to go back and tighten up the code in one
195 | of  our products, you'll be able to do so, since the code will be
196 | quite understandable. If you  do  so, however, I urge you to wait
197 | until the program is doing everything you want it to.
198 | 
199 | I  also  have  a  tendency  to  delay  building  a module until I
200 | discover that I need  it.    Trying  to anticipate every possible
201 | future contingency can  drive  you  crazy,  and  you'll generally
202 | guess wrong anyway.    In  this  modern day of screen editors and
203 | fast compilers, I don't hesitate to change a module when I feel I
204 | need a more powerful one.  Until then,  I'll  write  only  what I
205 | need.
206 | 
207 | One final caveat: One of the principles we'll be sticking to here
208 | is that we don't  fool  around with P-code or imaginary CPUs, but
209 | that we will start out on day one  producing  working, executable
210 | object code, at least in the form of  assembler  language source.
211 | However, you may not  like  my  choice  of assembler language ...
212 | it's 68000 code, which is what works on my system (under SK*DOS).
213 | I  think  you'll  find, though, that the translation to any other
214 | CPU such as the 80x86 will  be  quite obvious, though, so I don't
215 | see  a problem here.  In fact, I hope someone out there who knows
216 | the '86 language better than I do will offer  us  the  equivalent
217 | object code fragments as we need them.
218 | 
219 | 
220 | THE CRADLE
221 | 
222 | Every program needs some boiler  plate  ...  I/O  routines, error
223 | message routines, etc.   The  programs we develop here will be no
224 | exceptions.    I've  tried to hold  this  stuff  to  an  absolute
225 | minimum, however, so that we  can  concentrate  on  the important
226 | stuff without losing it  among  the  trees.  The code given below
227 | represents about the minimum that we need to  get  anything done.
228 | It consists of some I/O routines, an error-handling routine and a
229 | skeleton, null main program.   I  call  it  our  cradle.    As we
230 | develop other routines, we'll add them to the cradle, and add the
231 | calls to them as we  need to.  Make a copy of the cradle and save
232 | it, because we'll be using it more than once.
233 | 
234 | There are many different ways to organize the scanning activities
235 | of  a  parser.   In Unix systems, authors tend to  use  getc  and
236 | ungetc.  I've had very good luck with the  approach  shown  here,
237 | which is to use  a  single, global, lookahead character.  Part of
238 | the initialization procedure  (the  only part, so far!) serves to
239 | "prime  the  pump"  by reading the first character from the input
240 | stream.  No other special  techniques are required with Turbo 4.0
241 | ... each successive call to  GetChar will read the next character
242 | in the stream.
243 | 
244 | 
245 | {--------------------------------------------------------------}
246 | program Cradle;
247 | 
248 | {--------------------------------------------------------------}
249 | { Constant Declarations }
250 | 
251 | const TAB = ^I;
252 | 
253 | {--------------------------------------------------------------}
254 | { Variable Declarations }
255 | 
256 | var Look: char;              { Lookahead Character }
257 |                               
258 | {--------------------------------------------------------------}
259 | { Read New Character From Input Stream }
260 | 
261 | procedure GetChar;
262 | begin
263 |    Read(Look);
264 | end;
265 | 
266 | {--------------------------------------------------------------}
267 | { Report an Error }
268 | 
269 | procedure Error(s: string);
270 | begin
271 |    WriteLn;
272 |    WriteLn(^G, 'Error: ', s, '.');
273 | end;
274 | 
275 | 
276 | {--------------------------------------------------------------}
277 | { Report Error and Halt }
278 | 
279 | procedure Abort(s: string);
280 | begin
281 |    Error(s);
282 |    Halt;
283 | end;
284 | 
285 | 
286 | {--------------------------------------------------------------}
287 | { Report What Was Expected }
288 | 
289 | procedure Expected(s: string);
290 | begin
291 |    Abort(s + ' Expected');
292 | end;
293 | 
294 | {--------------------------------------------------------------}
295 | { Match a Specific Input Character }
296 | 
297 | procedure Match(x: char);
298 | begin
299 |    if Look = x then GetChar
300 |    else Expected('''' + x + '''');
301 | end;
302 | 
303 | 
304 | {--------------------------------------------------------------}
305 | { Recognize an Alpha Character }
306 | 
307 | function IsAlpha(c: char): boolean;
308 | begin
309 |    IsAlpha := upcase(c) in ['A'..'Z'];
310 | end;
311 |                               
312 | 
313 | {--------------------------------------------------------------}
314 | 
315 | { Recognize a Decimal Digit }
316 | 
317 | function IsDigit(c: char): boolean;
318 | begin
319 |    IsDigit := c in ['0'..'9'];
320 | end;
321 | 
322 | 
323 | {--------------------------------------------------------------}
324 | { Get an Identifier }
325 | 
326 | function GetName: char;
327 | begin
328 |    if not IsAlpha(Look) then Expected('Name');
329 |    GetName := UpCase(Look);
330 |    GetChar;
331 | end;
332 | 
333 | 
334 | {--------------------------------------------------------------}
335 | { Get a Number }
336 | 
337 | function GetNum: char;
338 | begin
339 |    if not IsDigit(Look) then Expected('Integer');
340 |    GetNum := Look;
341 |    GetChar;
342 | end;
343 | 
344 | 
345 | {--------------------------------------------------------------}
346 | { Output a String with Tab }
347 | 
348 | procedure Emit(s: string);
349 | begin
350 |    Write(TAB, s);
351 | end;
352 | 
353 | 
354 | 
355 | 
356 | {--------------------------------------------------------------}
357 | { Output a String with Tab and CRLF }
358 | 
359 | procedure EmitLn(s: string);
360 | begin
361 |    Emit(s);
362 |    WriteLn;
363 | end;
364 | 
365 | {--------------------------------------------------------------}
366 | { Initialize }
367 | 
368 | procedure Init;
369 | begin
370 |    GetChar;
371 | end;
372 | 
373 | 
374 | {--------------------------------------------------------------}
375 | { Main Program }
376 | 
377 | begin
378 |    Init;
379 | end.
380 | {--------------------------------------------------------------}
381 | 
382 | 
383 | That's it for this introduction.  Copy the code above into TP and
384 | compile it.  Make sure that it compiles and runs  correctly. Then
385 | proceed to the first lesson, which is on expression parsing.
386 | 
387 | 
388 | *****************************************************************
389 | *                                                               *
390 | *                        COPYRIGHT NOTICE                       *
391 | *                                                               *
392 | *   Copyright (C) 1988 Jack W. Crenshaw. All rights reserved.   *
393 | *                                                               *
394 | *****************************************************************
395 | 
396 | 
397 | 
398 | 
399 | 


--------------------------------------------------------------------------------
/reference/crenshaw-txt/tutor8.txt:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | 
  4 | 
  5 | 
  6 | 
  7 | 
  8 | 
  9 | 
 10 | 
 11 | 
 12 | 
 13 | 
 14 | 
 15 | 
 16 | 
 17 | 
 18 | 
 19 | 
 20 | 
 21 | 
 22 | 
 23 | 
 24 | 
 25 | 
 26 | 
 27 | 
 28 | 
 29 |                      LET'S BUILD A COMPILER!
 30 | 
 31 |                                 By
 32 | 
 33 |                      Jack W. Crenshaw, Ph.D.
 34 | 
 35 |                            2 April 1989
 36 | 
 37 | 
 38 |                   Part VIII: A LITTLE PHILOSOPHY
 39 | 
 40 | 
 41 | *****************************************************************
 42 | *                                                               *
 43 | *                        COPYRIGHT NOTICE                       *
 44 | *                                                               *
 45 | *   Copyright (C) 1989 Jack W. Crenshaw. All rights reserved.   *
 46 | *                                                               *
 47 | *****************************************************************
 48 | 
 49 | 
 50 | INTRODUCTION
 51 | 
 52 | This is going to be a  different  kind of session than the others
 53 | in our series on  parsing  and  compiler  construction.  For this
 54 | session, there won't be  any  experiments to do or code to write.
 55 | This  once,  I'd  like  to  just  talk  with  you  for  a  while.
 56 | Mercifully, it will be a short  session,  and then we can take up
 57 | where we left off, hopefully with renewed vigor.
 58 | 
 59 | When  I  was  in college, I found that I could  always  follow  a
 60 | prof's lecture a lot better if I knew where he was going with it.
 61 | I'll bet you were the same.
 62 | 
 63 | So I thought maybe it's about  time  I told you where we're going
 64 | with this series: what's coming up in future installments, and in
 65 | general what all  this  is  about.   I'll also share some general
 66 | thoughts concerning the usefulness of what we've been doing.
 67 | 
 68 | 
 69 | THE ROAD HOME
 70 | 
 71 | So far, we've  covered  the parsing and translation of arithmetic
 72 | expressions,  Boolean expressions, and combinations connected  by
 73 | relational  operators.    We've also done the  same  for  control
 74 | constructs.    In  all of this we've leaned heavily on the use of
 75 | top-down, recursive  descent  parsing,  BNF  definitions  of  the
 76 | syntax, and direct generation of assembly-language code.  We also
 77 | learned the value of  such  tricks  as single-character tokens to
 78 | help  us  see  the  forest  through  the  trees.    In  the  last
 79 | installment  we dealt with lexical scanning,  and  I  showed  you
 80 | simple but powerful ways to remove the single-character barriers.
 81 | 
 82 | Throughout the whole study, I've emphasized  the  KISS philosophy
 83 | ... Keep It Simple, Sidney ... and I hope by now  you've realized
 84 | just  how  simple  this stuff can really be.  While there are for
 85 | sure areas of compiler  theory  that  are truly intimidating, the
 86 | ultimate message of this series is that in practice you  can just
 87 | politely  sidestep   many  of  these  areas.    If  the  language
 88 | definition  cooperates  or,  as in this series, if you can define
 89 | the language as you go, it's possible to write down  the language
 90 | definition in BNF with reasonable ease.  And, as we've  seen, you
 91 | can crank out parse procedures from the BNF just about as fast as
 92 | you can type.
 93 | 
 94 | As our compiler has taken form, it's gotten more parts,  but each
 95 | part  is  quite small and simple, and  very  much  like  all  the
 96 | others.
 97 | 
 98 | At this point, we have many  of  the makings of a real, practical
 99 | compiler.  As a matter of  fact,  we  already have all we need to
100 | build a toy  compiler  for  a  language as powerful as, say, Tiny
101 | BASIC.  In the next couple of installments, we'll  go  ahead  and
102 | define that language.
103 | 
104 | To round out  the  series,  we  still  have a few items to cover.
105 | These include:
106 | 
107 |    o Procedure calls, with and without parameters
108 | 
109 |    o Local and global variables
110 | 
111 |    o Basic types, such as character and integer types
112 | 
113 |    o Arrays
114 | 
115 |    o Strings
116 | 
117 |    o User-defined types and structures
118 | 
119 |    o Tree-structured parsers and intermediate languages
120 | 
121 |    o Optimization
122 | 
123 | These will all be  covered  in  future  installments.  When we're
124 | finished, you'll have all the tools you need to design  and build
125 | your own languages, and the compilers to translate them.
126 | 
127 | I can't  design  those  languages  for  you,  but I can make some
128 | comments  and  recommendations.    I've  already  sprinkled  some
129 | throughout past installments.    You've  seen,  for  example, the
130 | control constructs I prefer.
131 | 
132 | These constructs are going  to  be part of the languages I build.
133 | I  have  three  languages in mind at this point, two of which you
134 | will see in installments to come:
135 | 
136 | TINY - A  minimal,  but  usable  language  on the order  of  Tiny
137 |        BASIC or Tiny C.  It won't be very practical, but  it will
138 |        have enough power to let you write and  run  real programs
139 |        that do something worthwhile.
140 | 
141 | KISS - The  language  I'm  building for my  own  use.    KISS  is
142 |        intended to be  a  systems programming language.  It won't
143 |        have strong typing  or  fancy data structures, but it will
144 |        support most of  the  things  I  want to do with a higher-
145 |        order language (HOL), except perhaps writing compilers.
146 |                               
147 | I've also  been  toying  for  years  with  the idea of a HOL-like
148 | assembler,  with  structured  control  constructs   and  HOL-like
149 | assignment statements.  That, in  fact, was the impetus behind my
150 | original foray into the jungles of compiler theory.  This one may
151 | never be built, simply  because  I've  learned that it's actually
152 | easier to implement a language like KISS, that only uses a subset
153 | of the CPU instructions.    As you know, assembly language can be
154 | bizarre  and  irregular  in the extreme, and a language that maps
155 | one-for-one onto it can be a real challenge.  Still,  I've always
156 | felt that the syntax used  in conventional assemblers is dumb ...
157 | why is
158 | 
159 |      MOVE.L A,B
160 | 
161 | better, or easier to translate, than
162 | 
163 |      B=A ?
164 | 
165 | I  think  it  would  be  an  interesting  exercise to  develop  a
166 | "compiler" that  would give the programmer complete access to and
167 | control over the full complement  of the CPU instruction set, and
168 | would allow you to generate  programs  as  efficient  as assembly
169 | language, without the pain  of  learning a set of mnemonics.  Can
170 | it be done?  I don't  know.  The  real question may be, "Will the
171 | resulting language be any  easier  to  write  than assembly"?  If
172 | not, there's no point in it.  I think that it  can  be  done, but
173 | I'm not completely sure yet how the syntax should look.
174 | 
175 | Perhaps you have some  comments  or suggestions on this one.  I'd
176 | love to hear them.
177 | 
178 | You probably won't be surprised to learn that I've already worked
179 | ahead in most  of the areas that we will cover.  I have some good
180 | news:  Things  never  get  much  harder than they've been so far.
181 | It's  possible  to  build a complete, working compiler for a real
182 | language, using nothing  but  the same kinds of techniques you've
183 | learned so far.  And THAT brings up some interesting questions.
184 | 
185 | 
186 | WHY IS IT SO SIMPLE?
187 | 
188 | Before embarking  on this series, I always thought that compilers
189 | were just naturally complex computer  programs  ...  the ultimate
190 | challenge.  Yet the things we have done here have  usually turned
191 | out to be quite simple, sometimes even trivial.
192 | 
193 | For awhile, I thought  is  was simply because I hadn't yet gotten
194 | into the meat  of  the  subject.    I had only covered the simple
195 | parts.  I will freely admit  to  you  that, even when I began the
196 | series,  I  wasn't  sure how far we would be able  to  go  before
197 | things got too complex to deal with in the ways  we  have so far.
198 | But at this point I've already  been  down the road far enough to
199 | see the end of it.  Guess what?
200 |                               
201 | 
202 |                      THERE ARE NO HARD PARTS!
203 | 
204 | 
205 | Then, I thought maybe it was because we were not  generating very
206 | good object  code.    Those  of  you  who have been following the
207 | series and trying sample compiles know that, while the code works
208 | and  is  rather  foolproof,  its  efficiency is pretty awful.   I
209 | figured that if we were  concentrating on turning out tight code,
210 | we would soon find all that missing complexity.
211 | 
212 | To  some  extent,  that one is true.  In particular, my first few
213 | efforts at trying to improve efficiency introduced  complexity at
214 | an alarming rate.  But since then I've been tinkering around with
215 | some simple optimizations and I've found some that result in very
216 | respectable code quality, WITHOUT adding a lot of complexity.
217 | 
218 | Finally, I thought that  perhaps  the  saving  grace was the "toy
219 | compiler" nature of the study.   I  have made no pretense that we
220 | were  ever  going  to be able to build a compiler to compete with
221 | Borland and Microsoft.  And yet, again, as I get deeper into this
222 | thing the differences are starting to fade away.
223 | 
224 | Just  to make sure you get the message here, let me state it flat
225 | out:
226 | 
227 |    USING THE TECHNIQUES WE'VE USED  HERE,  IT  IS  POSSIBLE TO
228 |    BUILD A PRODUCTION-QUALITY, WORKING COMPILER WITHOUT ADDING
229 |    A LOT OF COMPLEXITY TO WHAT WE'VE ALREADY DONE.
230 | 
231 | 
232 | Since  the series began I've received  some  comments  from  you.
233 | Most of them echo my own thoughts:  "This is easy!    Why  do the
234 | textbooks make it seem so hard?"  Good question.
235 | 
236 | Recently, I've gone back and looked at some of those texts again,
237 | and even bought and read some new ones.  Each  time,  I come away
238 | with the same feeling: These guys have made it seem too hard.
239 | 
240 | What's going on here?  Why does the whole thing seem difficult in
241 | the texts, but easy to us?    Are  we that much smarter than Aho,
242 | Ullman, Brinch Hansen, and all the rest?
243 | 
244 | Hardly.  But we  are  doing some things differently, and more and
245 | more  I'm  starting  to appreciate the value of our approach, and
246 | the way that  it  simplifies  things.    Aside  from  the obvious
247 | shortcuts that I outlined in Part I, like single-character tokens
248 | and console I/O, we have  made some implicit assumptions and done
249 | some things differently from those who have designed compilers in
250 | the past. As it turns out, our approach makes life a lot easier.
251 | 
252 | So why didn't all those other guys use it?
253 | 
254 | You have to remember the context of some of the  earlier compiler
255 | development.  These people were working with very small computers
256 | of  limited  capacity.      Memory  was  very  limited,  the  CPU
257 | instruction  set  was  minimal, and programs ran  in  batch  mode
258 | rather  than  interactively.   As it turns out, these caused some
259 | key design decisions that have  really  complicated  the designs.
260 | Until recently,  I hadn't realized how much of classical compiler
261 | design was driven by the available hardware.
262 | 
263 | Even in cases where these  limitations  no  longer  apply, people
264 | have  tended  to  structure their programs in the same way, since
265 | that is the way they were taught to do it.
266 | 
267 | In  our case, we have started with a blank sheet of paper.  There
268 | is a danger there, of course,  that  you will end up falling into
269 | traps that other people have long since learned to avoid.  But it
270 | also has allowed us to  take different approaches that, partly by
271 | design  and partly by pure dumb luck, have  allowed  us  to  gain
272 | simplicity.
273 | 
274 | Here are the areas that I think have  led  to  complexity  in the
275 | past:
276 | 
277 |   o  Limited RAM Forcing Multiple Passes
278 | 
279 |      I  just  read  "Brinch  Hansen  on  Pascal   Compilers"  (an
280 |      excellent book, BTW).  He  developed a Pascal compiler for a
281 |      PC, but he started the effort in 1981 with a 64K system, and
282 |      so almost every design decision  he made was aimed at making
283 |      the compiler fit  into  RAM.    To do this, his compiler has
284 |      three passes, one of which is the lexical scanner.  There is
285 |      no way he could, for  example, use the distributed scanner I
286 |      introduced  in  the last installment,  because  the  program
287 |      structure wouldn't allow it.  He also required  not  one but
288 |      two intermediate  languages,  to  provide  the communication
289 |      between phases.
290 | 
291 |      All the early compiler writers  had to deal with this issue:
292 |      Break the compiler up into enough parts so that it  will fit
293 |      in memory.  When  you  have multiple passes, you need to add
294 |      data structures to support the  information  that  each pass
295 |      leaves behind for the next.   That adds complexity, and ends
296 |      up driving the  design.    Lee's  book,  "The  Anatomy  of a
297 |      Compiler,"  mentions a FORTRAN compiler developed for an IBM
298 |      1401.  It had no fewer than 63 separate passes!  Needless to
299 |      say,  in a compiler like this  the  separation  into  phases
300 |      would dominate the design.
301 | 
302 |      Even in  situations  where  RAM  is  plentiful,  people have
303 |      tended  to  use  the same techniques because  that  is  what
304 |      they're familiar with.   It  wasn't  until Turbo Pascal came
305 |      along that we found how simple a compiler could  be  if  you
306 |      started with different assumptions.
307 | 
308 | 
309 |   o  Batch Processing
310 |                               
311 |      In the early days, batch  processing was the only choice ...
312 |      there was no interactive computing.   Even  today, compilers
313 |      run in essentially batch mode.
314 | 
315 |      In a mainframe compiler as  well  as  many  micro compilers,
316 |      considerable effort is expended on error recovery ... it can
317 |      consume as much as 30-40%  of  the  compiler  and completely
318 |      drive the design.  The idea is to avoid halting on the first
319 |      error, but rather to keep going at all costs,  so  that  you
320 |      can  tell  the  programmer about as many errors in the whole
321 |      program as possible.
322 | 
323 |      All of that harks back to the days of the  early mainframes,
324 |      where turnaround time was measured  in hours or days, and it
325 |      was important to squeeze every last ounce of information out
326 |      of each run.
327 | 
328 |      In this series, I've been very careful to avoid the issue of
329 |      error recovery, and instead our compiler  simply  halts with
330 |      an error message on  the  first error.  I will frankly admit
331 |      that it was mostly because I wanted to take the easy way out
332 |      and keep things simple.   But  this  approach,  pioneered by
333 |      Borland in Turbo Pascal, also has a lot going for it anyway.
334 |      Aside from keeping the  compiler  simple,  it also fits very
335 |      well  with   the  idea  of  an  interactive  system.    When
336 |      compilation is  fast, and especially when you have an editor
337 |      such as Borland's that  will  take you right to the point of
338 |      the error, then it makes a  lot  of sense to stop there, and
339 |      just restart the compilation after the error is fixed.
340 | 
341 | 
342 |   o  Large Programs
343 | 
344 |      Early compilers were designed to handle  large  programs ...
345 |      essentially infinite ones.    In those days there was little
346 |      choice;  the  idea  of  subroutine  libraries  and  separate
347 |      compilation  were  still  in  the  future.      Again,  this
348 |      assumption led to  multi-pass designs and intermediate files
349 |      to hold the results of partial processing.
350 | 
351 |      Brinch Hansen's  stated goal was that the compiler should be
352 |      able to compile itself.   Again, because of his limited RAM,
353 |      this drove him to a multi-pass design.  He needed  as little
354 |      resident compiler code as possible,  so  that  the necessary
355 |      tables and other data structures would fit into RAM.
356 | 
357 |      I haven't stated this one yet, because there  hasn't  been a
358 |      need  ... we've always just read and  written  the  data  as
359 |      streams, anyway.  But  for  the  record,  my plan has always
360 |      been that, in  a  production compiler, the source and object
361 |      data should all coexist  in  RAM with the compiler, a la the
362 |      early Turbo Pascals.  That's why I've been  careful  to keep
363 |      routines like GetChar  and  Emit  as  separate  routines, in
364 |      spite of their small size.   It  will be easy to change them
365 |      to read to and write from memory.
366 | 
367 | 
368 |   o  Emphasis on Efficiency
369 | 
370 |      John  Backus has stated that, when  he  and  his  colleagues
371 |      developed the original FORTRAN compiler, they KNEW that they
372 |      had to make it produce tight code.  In those days, there was
373 |      a strong sentiment against HOLs  and  in  favor  of assembly
374 |      language, and  efficiency was the reason.  If FORTRAN didn't
375 |      produce very good  code  by  assembly  standards,  the users
376 |      would simply refuse to use it.  For the record, that FORTRAN
377 |      compiler turned out to  be  one  of  the most efficient ever
378 |      built, in terms of code quality.  But it WAS complex!
379 | 
380 |      Today,  we have CPU power and RAM size  to  spare,  so  code
381 |      efficiency is not  so  much  of  an  issue.    By studiously
382 |      ignoring this issue, we  have  indeed  been  able to Keep It
383 |      Simple.    Ironically,  though, as I have said, I have found
384 |      some optimizations that we can  add  to  the  basic compiler
385 |      structure, without having to add a lot of complexity.  So in
386 |      this  case we get to have our cake and eat it too:  we  will
387 |      end up with reasonable code quality, anyway.
388 | 
389 | 
390 |   o  Limited Instruction Sets
391 | 
392 |      The early computers had primitive instruction sets.   Things
393 |      that  we  take  for granted, such as  stack  operations  and
394 |      indirect addressing, came only with great difficulty.
395 | 
396 |      Example: In most compiler designs, there is a data structure
397 |      called the literal pool.  The compiler  typically identifies
398 |      all literals used in the program, and collects  them  into a
399 |      single data structure.    All references to the literals are
400 |      done  indirectly  to  this  pool.    At  the   end   of  the
401 |      compilation, the  compiler  issues  commands  to  set  aside
402 |      storage and initialize the literal pool.
403 | 
404 |      We haven't had to address that  issue  at all.  When we want
405 |      to load a literal, we just do it, in line, as in
406 | 
407 |           MOVE #3,D0
408 | 
409 |      There is something to be said for the use of a literal pool,
410 |      particularly on a machine like  the 8086 where data and code
411 |      can  be separated.  Still, the whole  thing  adds  a  fairly
412 |      large amount of complexity with little in return.
413 | 
414 |      Of course, without the stack we would be lost.  In  a micro,
415 |      both  subroutine calls and temporary storage depend  heavily
416 |      on the stack, and  we  have used it even more than necessary
417 |      to ease expression parsing.
418 | 
419 | 
420 |   o  Desire for Generality
421 | 
422 |      Much of the content of the typical compiler text is taken up
423 |      with issues we haven't addressed here at all ... things like
424 |      automated  translation  of  grammars,  or generation of LALR
425 |      parse tables.  This is not simply because  the  authors want
426 |      to impress you.  There are good, practical  reasons  why the
427 |      subjects are there.
428 | 
429 |      We have been concentrating on the use of a recursive-descent
430 |      parser to parse a  deterministic  grammar,  i.e.,  a grammar
431 |      that is not ambiguous and, therefore, can be parsed with one
432 |      level of lookahead.  I haven't made much of this limitation,
433 |      but  the  fact  is  that  this represents a small subset  of
434 |      possible grammars.  In fact,  there is an infinite number of
435 |      grammars that we can't parse using our techniques.    The LR
436 |      technique is a more powerful one, and can deal with grammars
437 |      that we can't.
438 | 
439 |      In compiler theory, it's important  to know how to deal with
440 |      these  other  grammars,  and  how  to  transform  them  into
441 |      grammars  that  are  easier to deal with.  For example, many
442 |      (but not all) ambiguous  grammars  can  be  transformed into
443 |      unambiguous ones.  The way to do this is not always obvious,
444 |      though, and so many people  have  devoted  years  to develop
445 |      ways to transform them automatically.
446 | 
447 |      In practice, these  issues  turn out to be considerably less
448 |      important.  Modern languages tend  to be designed to be easy
449 |      to parse, anyway.   That  was a key motivation in the design
450 |      of Pascal.   Sure,  there are pathological grammars that you
451 |      would be hard pressed to write unambiguous BNF  for,  but in
452 |      the  real  world  the best answer is probably to avoid those
453 |      grammars!
454 | 
455 |      In  our  case,  of course, we have sneakily let the language
456 |      evolve  as  we  go, so we haven't painted ourselves into any
457 |      corners here.  You may not always have that luxury.   Still,
458 |      with a little  care  you  should  be able to keep the parser
459 |      simple without having to resort to automatic  translation of
460 |      the grammar.
461 | 
462 | 
463 | We have taken  a  vastly  different  approach in this series.  We
464 | started with a clean sheet  of  paper,  and  developed techniques
465 | that work in the context that  we  are in; that is, a single-user
466 | PC  with  rather  ample CPU power and RAM space.  We have limited
467 | ourselves to reasonable grammars that  are easy to parse, we have
468 | used the instruction set of the CPU to advantage, and we have not
469 | concerned ourselves with efficiency.  THAT's why it's been easy.
470 | 
471 | Does this mean that we are forever doomed  to  be  able  to build
472 | only toy compilers?   No, I don't think so.  As I've said, we can
473 | add  certain   optimizations   without   changing   the  compiler
474 | structure.  If we want to process large files, we can  always add
475 | file  buffering  to do that.  These  things  do  not  affect  the
476 | overall program design.
477 | 
478 | And I think  that's  a  key  factor.   By starting with small and
479 | limited  cases,  we  have been able to concentrate on a structure
480 | for  the  compiler  that is natural  for  the  job.    Since  the
481 | structure naturally fits the job, it is almost bound to be simple
482 | and transparent.   Adding  capability doesn't have to change that
483 | basic  structure.    We  can  simply expand things like the  file
484 | structure or add an optimization layer.  I guess  my  feeling  is
485 | that, back when resources were tight, the structures people ended
486 | up  with  were  artificially warped to make them work under those
487 | conditions, and weren't optimum  structures  for  the  problem at
488 | hand.
489 | 
490 | 
491 | CONCLUSION
492 | 
493 | Anyway, that's my arm-waving  guess  as to how we've been able to
494 | keep things simple.  We started with something simple and  let it
495 | evolve  naturally,  without  trying  to   force   it   into  some
496 | traditional mold.
497 | 
498 | We're going to  press on with this.  I've given you a list of the
499 | areas  we'll  be  covering in future installments.    With  those
500 | installments, you  should  be  able  to  build  complete, working
501 | compilers for just about any occasion, and build them simply.  If
502 | you REALLY want to build production-quality compilers,  you'll be
503 | able to do that, too.
504 | 
505 | For those of you who are chafing at the bit for more parser code,
506 | I apologize for this digression.  I just thought  you'd  like  to
507 | have things put  into  perspective  a  bit.  Next time, we'll get
508 | back to the mainstream of the tutorial.
509 | 
510 | So far, we've only looked at pieces of compilers,  and  while  we
511 | have  many  of  the  makings  of a complete language, we  haven't
512 | talked about how to put  it  all  together.    That  will  be the
513 | subject of our next  two  installments.  Then we'll press on into
514 | the new subjects I listed at the beginning of this installment.
515 | 
516 | See you then.
517 | 
518 | *****************************************************************
519 | *                                                               *
520 | *                        COPYRIGHT NOTICE                       *
521 | *                                                               *
522 | *   Copyright (C) 1989 Jack W. Crenshaw. All rights reserved.   *
523 | *                                                               *
524 | *****************************************************************
525 | 
526 | 


--------------------------------------------------------------------------------
/reference/crenshaw-txt/tutor4.txt:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | 
  4 | 
  5 | 
  6 | 
  7 | 
  8 | 
  9 | 
 10 | 
 11 | 
 12 | 
 13 | 
 14 | 
 15 | 
 16 | 
 17 | 
 18 | 
 19 | 
 20 | 
 21 | 
 22 | 
 23 | 
 24 | 
 25 | 
 26 | 
 27 | 
 28 | 
 29 |                      LET'S BUILD A COMPILER!
 30 | 
 31 |                                 By
 32 | 
 33 |                      Jack W. Crenshaw, Ph.D.
 34 | 
 35 |                            24 July 1988
 36 | 
 37 | 
 38 |                       Part IV: INTERPRETERS
 39 | 
 40 | 
 41 | *****************************************************************
 42 | *                                                               *
 43 | *                        COPYRIGHT NOTICE                       *
 44 | *                                                               *
 45 | *   Copyright (C) 1988 Jack W. Crenshaw. All rights reserved.   *
 46 | *                                                               *
 47 | *****************************************************************
 48 | 
 49 | 
 50 | INTRODUCTION
 51 | 
 52 | In the first three installments of this series,  we've  looked at
 53 | parsing and  compiling math expressions, and worked our way grad-
 54 | ually and methodically from dealing  with  very  simple one-term,
 55 | one-character "expressions" up through more general ones, finally
 56 | arriving at a very complete parser that could parse and translate
 57 | complete  assignment  statements,  with  multi-character  tokens,
 58 | embedded white space, and function calls.  This  time,  I'm going
 59 | to walk you through the process one more time, only with the goal
 60 | of interpreting rather than compiling object code.
 61 | 
 62 | Since this is a series on compilers, why should  we  bother  with
 63 | interpreters?  Simply because I want you to see how the nature of
 64 | the  parser changes as we change the goals.  I also want to unify
 65 | the concepts of the two types of translators, so that you can see
 66 | not only the differences, but also the similarities.
 67 | 
 68 | Consider the assignment statement
 69 | 
 70 |                x = 2 * y + 3
 71 | 
 72 | In a compiler, we want the target CPU to execute  this assignment
 73 | at EXECUTION time.  The translator itself doesn't  do  any arith-
 74 | metic ... it only issues the object code that will cause  the CPU
 75 | to do it when the code is executed.  For  the  example above, the
 76 | compiler would issue code to compute the expression and store the
 77 | results in variable x.
 78 | 
 79 | For an interpreter,  on  the  other  hand, no object code is gen-
 80 | erated.   Instead, the arithmetic is computed immediately, as the
 81 | parsing is going on.  For the example, by the time parsing of the
 82 | statement is complete, x will have a new value.
 83 | 
 84 | The approach we've been  taking  in  this  whole series is called
 85 | "syntax-driven translation."  As you are aware by now, the struc-
 86 | ture of the  parser  is  very  closely  tied to the syntax of the
 87 | productions we parse.  We  have built Pascal procedures that rec-
 88 | ognize every language  construct.   Associated with each of these
 89 | constructs (and procedures) is  a  corresponding  "action," which
 90 | does  whatever  makes  sense to do  once  a  construct  has  been
 91 | recognized.    In  our  compiler  so far, every  action  involves
 92 | emitting object code, to be executed later at execution time.  In
 93 | an interpreter, every action  involves  something  to be done im-
 94 | mediately.
 95 | 
 96 | What I'd like you to see here is that the  layout  ... the struc-
 97 | ture ... of  the  parser  doesn't  change.  It's only the actions
 98 | that change.   So  if  you  can  write an interpreter for a given
 99 | language, you can also write a compiler, and vice versa.  Yet, as
100 | you  will  see,  there  ARE  differences,  and  significant ones.
101 | Because the actions are different,  the  procedures  that  do the
102 | recognizing end up being written differently.    Specifically, in
103 | the interpreter  the recognizing procedures end up being coded as
104 | FUNCTIONS that return numeric values to their callers.    None of
105 | the parsing routines for our compiler did that.
106 | 
107 | Our compiler, in fact,  is  what we might call a "pure" compiler.
108 | Each time a construct is recognized, the object  code  is emitted
109 | IMMEDIATELY.  (That's one reason the code is not very efficient.)
110 | The interpreter we'll be building  here is a pure interpreter, in
111 | the sense that there is  no  translation,  such  as "tokenizing,"
112 | performed on the source code.  These represent  the  two extremes
113 | of translation.  In  the  real  world,  translators are rarely so
114 | pure, but tend to have bits of each technique.
115 | 
116 | I can think of  several  examples.    I've already mentioned one:
117 | most interpreters, such as Microsoft BASIC,  for  example, trans-
118 | late the source code (tokenize it) into an  intermediate  form so
119 | that it'll be easier to parse real time.
120 | 
121 | Another example is an assembler.  The purpose of an assembler, of
122 | course, is to produce object code, and it normally does that on a
123 | one-to-one basis: one object instruction per line of source code.
124 | But almost every assembler also permits expressions as arguments.
125 | In this case, the expressions  are  always  constant expressions,
126 | and  so the assembler isn't supposed to  issue  object  code  for
127 | them.  Rather,  it  "interprets" the expressions and computes the
128 | corresponding constant result, which is what it actually emits as
129 | object code.
130 | 
131 | As a matter of fact, we  could  use  a bit of that ourselves. The
132 | translator we built in the  previous  installment  will dutifully
133 | spit out object code  for  complicated  expressions,  even though
134 | every term in  the  expression  is  a  constant.  In that case it
135 | would be far better if the translator behaved a bit more  like an
136 | interpreter, and just computed the equivalent constant result.
137 | 
138 | There is  a concept in compiler theory called "lazy" translation.
139 | The  idea is that you typically don't just  emit  code  at  every
140 | action.  In fact, at the extreme you don't emit anything  at all,
141 | until  you  absolutely  have to.  To accomplish this, the actions
142 | associated with the parsing routines  typically  don't  just emit
143 | code.  Sometimes  they  do,  but  often  they  simply  return in-
144 | formation back to the caller.  Armed with  such  information, the
145 | caller can then make a better choice of what to do.
146 | 
147 | For example, given the statement
148 | 
149 |                x = x + 3 - 2 - (5 - 4)  ,
150 | 
151 | our compiler will dutifully spit  out a stream of 18 instructions
152 | to load each parameter into  registers,  perform  the arithmetic,
153 | and store the result.  A lazier evaluation  would  recognize that
154 | the arithmetic involving constants can  be  evaluated  at compile
155 | time, and would reduce the expression to
156 | 
157 |                x = x + 0  .
158 | 
159 | An  even  lazier  evaluation would then be smart enough to figure
160 | out that this is equivalent to
161 | 
162 |                x = x  ,
163 | 
164 | which  calls  for  no  action  at  all.   We could reduce 18  in-
165 | structions to zero!
166 | 
167 | Note that there is no chance of optimizing this way in our trans-
168 | lator as it stands, because every action takes place immediately.
169 | 
170 | Lazy  expression  evaluation  can  produce  significantly  better
171 | object code than  we  have  been  able  to  so  far.  I warn you,
172 | though: it complicates the parser code considerably, because each
173 | routine now has to make decisions as to whether  to  emit  object
174 | code or not.  Lazy evaluation is certainly not named that because
175 | it's easier on the compiler writer!
176 | 
177 | Since we're operating mainly on  the KISS principle here, I won't
178 | go  into much more depth on this subject.  I just want you to  be
179 | aware  that  you  can get some code optimization by combining the
180 | techniques of compiling and  interpreting.    In  particular, you
181 | should know that the parsing  routines  in  a  smarter translator
182 | will generally  return  things  to  their  caller,  and sometimes
183 | expect things as  well.    That's  the main reason for going over
184 | interpretation in this installment.
185 | 
186 | 
187 | THE INTERPRETER
188 | 
189 | OK, now that you know WHY we're going into all this, let's do it.
190 | Just to give you practice, we're going to start over with  a bare
191 | cradle and build up the translator all over again.  This time, of
192 | course, we can go a bit faster.
193 | 
194 | Since we're now going  to  do arithmetic, the first thing we need
195 | to do is to change function GetNum, which up till now  has always
196 | returned a character  (or  string).    Now, it's better for it to
197 | return an integer.    MAKE  A  COPY of the cradle (for goodness's
198 | sake, don't change the version  in  Cradle  itself!!)  and modify
199 | GetNum as follows:
200 | 
201 | 
202 | {--------------------------------------------------------------}
203 | { Get a Number }
204 | 
205 | function GetNum: integer;
206 | begin
207 |    if not IsDigit(Look) then Expected('Integer');
208 |    GetNum := Ord(Look) - Ord('0');
209 |    GetChar;
210 | end;
211 | {--------------------------------------------------------------}
212 | 
213 | 
214 | Now, write the following version of Expression:
215 | 
216 | 
217 | {---------------------------------------------------------------}
218 | { Parse and Translate an Expression }
219 | 
220 | function Expression: integer;
221 | begin
222 |    Expression := GetNum;
223 | end;
224 | {--------------------------------------------------------------}
225 | 
226 | 
227 | Finally, insert the statement
228 | 
229 | 
230 |    Writeln(Expression);
231 | 
232 | 
233 | at the end of the main program.  Now compile and test.
234 | 
235 | All this program  does  is  to  "parse"  and  translate  a single
236 | integer  "expression."    As always, you should make sure that it
237 | does that with the digits 0..9, and gives an  error  message  for
238 | anything else.  Shouldn't take you very long!
239 | 
240 | OK, now let's extend this to include addops.    Change Expression
241 | to read:
242 | 
243 | 
244 | {---------------------------------------------------------------}
245 | { Parse and Translate an Expression }
246 | 
247 | function Expression: integer;
248 | var Value: integer;
249 | begin
250 |    if IsAddop(Look) then
251 |       Value := 0
252 |    else
253 |       Value := GetNum;
254 |    while IsAddop(Look) do begin
255 |       case Look of
256 |        '+': begin
257 |                Match('+');
258 |                Value := Value + GetNum;
259 |             end;
260 |        '-': begin
261 |                Match('-');
262 |                Value := Value - GetNum;
263 |             end;
264 |       end;
265 |    end;
266 |    Expression := Value;
267 | end;
268 | {--------------------------------------------------------------}
269 | 
270 | 
271 | The structure of Expression, of  course,  parallels  what  we did
272 | before,  so  we  shouldn't have too much  trouble  debugging  it.
273 | There's  been  a  SIGNIFICANT  development, though, hasn't there?
274 | Procedures Add and Subtract went away!  The reason  is  that  the
275 | action to be taken  requires  BOTH arguments of the operation.  I
276 | could have chosen to retain the procedures and pass into them the
277 | value of the expression to date,  which  is Value.  But it seemed
278 | cleaner to me to  keep  Value as strictly a local variable, which
279 | meant that the code for Add and Subtract had to be moved in line.
280 | This result suggests  that,  while the structure we had developed
281 | was nice and  clean  for our simple-minded translation scheme, it
282 | probably  wouldn't do for use with lazy  evaluation.    That's  a
283 | little tidbit we'll probably want to keep in mind for later.
284 | 
285 | OK,  did the translator work?  Then let's  take  the  next  step.
286 | It's not hard to  figure  out what procedure Term should now look
287 | like.  Change every call to GetNum in function  Expression  to  a
288 | call to Term, and then enter the following form for Term:
289 | 
290 | 
291 | 
292 | 
293 | {---------------------------------------------------------------}
294 | { Parse and Translate a Math Term }
295 | 
296 | function Term: integer;
297 | var Value: integer;
298 | begin
299 |    Value := GetNum;
300 |    while Look in ['*', '/'] do begin
301 |       case Look of
302 |        '*': begin
303 |                Match('*');
304 |                Value := Value * GetNum;
305 |             end;
306 |        '/': begin
307 |                Match('/');
308 |                Value := Value div GetNum;
309 |             end;
310 |       end;
311 |    end;
312 |    Term := Value;
313 | end;
314 | {--------------------------------------------------------------}
315 | 
316 | Now, try it out.    Don't forget two things: first, we're dealing
317 | with integer division, so, for example, 1/3 should come out zero.
318 | Second, even  though we can output multi-digit results, our input
319 | is still restricted to single digits.
320 | 
321 | That seems like a silly restriction at this point, since  we have
322 | already  seen how easily function GetNum can  be  extended.    So
323 | let's go ahead and fix it right now.  The new version is
324 | 
325 | 
326 | {--------------------------------------------------------------}
327 | { Get a Number }
328 | 
329 | function GetNum: integer;
330 | var Value: integer;
331 | begin
332 |    Value := 0;
333 |    if not IsDigit(Look) then Expected('Integer');
334 |    while IsDigit(Look) do begin
335 |       Value := 10 * Value + Ord(Look) - Ord('0');
336 |       GetChar;
337 |    end;
338 |    GetNum := Value;
339 | end;
340 | {--------------------------------------------------------------}
341 | 
342 | 
343 | If you've compiled and  tested  this  version of the interpreter,
344 | the  next  step  is to install function Factor, complete with pa-
345 | renthesized  expressions.  We'll hold off a  bit  longer  on  the
346 | variable  names.    First, change the references  to  GetNum,  in
347 | function Term, so that they call Factor instead.   Now  code  the
348 | following version of Factor:
349 | 
350 | 
351 | 
352 | 
353 | {---------------------------------------------------------------}
354 | { Parse and Translate a Math Factor }
355 | 
356 | function Expression: integer; Forward;
357 | 
358 | function Factor: integer;
359 | begin
360 |    if Look = '(' then begin
361 |       Match('(');
362 |       Factor := Expression;
363 |       Match(')');
364 |       end
365 |    else
366 |        Factor := GetNum;
367 | end;
368 | {---------------------------------------------------------------}
369 | 
370 | That was pretty easy, huh?  We're rapidly closing in on  a useful
371 | interpreter.
372 | 
373 | 
374 | A LITTLE PHILOSOPHY
375 | 
376 | Before going any further, there's something I'd like  to  call to
377 | your attention.  It's a concept that we've been making use  of in
378 | all these sessions, but I haven't explicitly mentioned it up till
379 | now.  I think it's time, because it's a concept so useful, and so
380 | powerful,  that  it  makes all the difference  between  a  parser
381 | that's trivially easy, and one that's too complex to deal with.
382 | 
383 | In the early days of compiler technology, people  had  a terrible
384 | time  figuring  out  how to deal with things like operator prece-
385 | dence  ...  the  way  that  multiply  and  divide operators  take
386 | precedence over add and subtract, etc.  I remember a colleague of
387 | some  thirty years ago, and how excited he was to find out how to
388 | do it.  The technique used involved building two  stacks,    upon
389 | which you pushed each operator  or operand.  Associated with each
390 | operator was a precedence level,  and the rules required that you
391 | only actually performed an operation  ("reducing"  the  stack) if
392 | the precedence level showing on top of the stack was correct.  To
393 | make life more interesting,  an  operator  like ')' had different
394 | precedence levels, depending  upon  whether or not it was already
395 | on the stack.  You  had to give it one value before you put it on
396 | the stack, and another to decide when to take it  off.   Just for
397 | the experience, I worked all of  this  out for myself a few years
398 | ago, and I can tell you that it's very tricky.
399 | 
400 | We haven't  had  to  do  anything like that.  In fact, by now the
401 | parsing of an arithmetic statement should seem like child's play.
402 | How did we get so lucky?  And where did the precedence stacks go?
403 | 
404 | A similar thing is going on  in  our interpreter above.  You just
405 | KNOW that in  order  for  it  to do the computation of arithmetic
406 | statements (as opposed to the parsing of them), there have  to be
407 | numbers pushed onto a stack somewhere.  But where is the stack?
408 | 
409 | Finally,  in compiler textbooks, there are  a  number  of  places
410 | where  stacks  and  other structures are discussed.  In the other
411 | leading parsing method (LR), an explicit stack is used.  In fact,
412 | the technique is very  much  like the old way of doing arithmetic
413 | expressions.  Another concept  is  that of a parse tree.  Authors
414 | like to draw diagrams  of  the  tokens  in a statement, connected
415 | into a tree with  operators  at the internal nodes.  Again, where
416 | are the trees and stacks in our technique?  We haven't seen any.
417 | The answer in all cases is that the structures are  implicit, not
418 | explicit.    In  any computer language, there is a stack involved
419 | every  time  you  call  a  subroutine.  Whenever a subroutine  is
420 | called, the return address is pushed onto the CPU stack.   At the
421 | end of the subroutine, the address is popped back off and control
422 | is  transferred  there.   In a recursive language such as Pascal,
423 | there can also be local data pushed onto the stack, and  it, too,
424 | returns when it's needed.
425 | 
426 | For example,  function  Expression  contains  a  local  parameter
427 | called  Value, which it fills by a call to Term.  Suppose, in its
428 | next call to  Term  for  the  second  argument,  that  Term calls
429 | Factor, which recursively  calls  Expression  again.    That "in-
430 | stance" of Expression gets another value for its  copy  of Value.
431 | What happens  to  the  first  Value?    Answer: it's still on the
432 | stack, and  will  be  there  again  when  we return from our call
433 | sequence.
434 | 
435 | In other words, the reason things look so simple  is  that  we've
436 | been making maximum use of the resources of the  language.    The
437 | hierarchy levels  and  the  parse trees are there, all right, but
438 | they're hidden within the  structure  of  the parser, and they're
439 | taken care of by the order with which the various  procedures are
440 | called.  Now that you've seen how we do it, it's probably hard to
441 | imagine doing it  any other way.  But I can tell you that it took
442 | a lot of years for compiler writers to get that smart.  The early
443 | compilers were too complex  too  imagine.    Funny how things get
444 | easier with a little practice.
445 | 
446 | The reason  I've  brought  all  this up is as both a lesson and a
447 | warning.  The lesson: things can be easy when you do  them right.
448 | The warning: take a look at what you're doing.  If, as you branch
449 | out on  your  own,  you  begin to find a real need for a separate
450 | stack or tree structure, it may be time to ask yourself if you're
451 | looking at things the right way.  Maybe you just aren't using the
452 | facilities of the language as well as you could be.
453 | 
454 | 
455 | The next step is to add variable names.  Now,  though,  we have a
456 | slight problem.  For  the  compiler, we had no problem in dealing
457 | with variable names ... we just issued the names to the assembler
458 | and let the rest  of  the program take care of allocating storage
459 | for  them.  Here, on the other hand, we need to be able to  fetch
460 | the values of the variables and return them as the  return values
461 | of Factor.  We need a storage mechanism for these variables.
462 | 
463 | Back in the early days of personal computing,  Tiny  BASIC lived.
464 | It had  a  grand  total  of  26  possible variables: one for each
465 | letter of the  alphabet.    This  fits nicely with our concept of
466 | single-character tokens, so we'll  try  the  same  trick.  In the
467 | beginning of your  interpreter,  just  after  the  declaration of
468 | variable Look, insert the line:
469 | 
470 |                Table: Array['A'..'Z'] of integer;
471 | 
472 | We also need to initialize the array, so add this procedure:
473 | 
474 | 
475 | 
476 | 
477 | {---------------------------------------------------------------}
478 | { Initialize the Variable Area }
479 | 
480 | procedure InitTable;
481 | var i: char;
482 | begin
483 |    for i := 'A' to 'Z' do
484 |       Table[i] := 0;
485 | end;
486 | {---------------------------------------------------------------}
487 | 
488 | 
489 | You must also insert a call to InitTable, in procedure Init.
490 | DON'T FORGET to do that, or the results may surprise you!
491 | 
492 | Now that we have an array  of  variables, we can modify Factor to
493 | use it.  Since we don't have a way (so far) to set the variables,
494 | Factor  will always return zero values for  them,  but  let's  go
495 | ahead and extend it anyway.  Here's the new version:
496 | 
497 | 
498 | {---------------------------------------------------------------}
499 | { Parse and Translate a Math Factor }
500 | 
501 | function Expression: integer; Forward;
502 | 
503 | function Factor: integer;
504 | begin
505 |    if Look = '(' then begin
506 |       Match('(');
507 |       Factor := Expression;
508 |       Match(')');
509 |       end
510 |    else if IsAlpha(Look) then
511 |       Factor := Table[GetName]
512 |    else
513 |        Factor := GetNum;
514 | end;
515 | {---------------------------------------------------------------}
516 | 
517 | 
518 | As always, compile and test this version of the  program.    Even
519 | though all the variables are now zeros, at least we can correctly
520 | parse the complete expressions, as well as catch any badly formed
521 | expressions.
522 | 
523 | I suppose you realize the next step: we need to do  an assignment
524 | statement so we can  put  something INTO the variables.  For now,
525 | let's  stick  to  one-liners,  though  we will soon  be  handling
526 | multiple statements.
527 | 
528 | The assignment statement parallels what we did before:
529 | 
530 | 
531 | {--------------------------------------------------------------}
532 | { Parse and Translate an Assignment Statement }
533 |                              
534 | 
535 | 
536 | procedure Assignment;
537 | var Name: char;
538 | begin
539 |    Name := GetName;
540 |    Match('=');
541 |    Table[Name] := Expression;
542 | end;
543 | {--------------------------------------------------------------}
544 | 
545 | 
546 | To test this,  I  added  a  temporary write statement in the main
547 | program,  to  print out the value of A.  Then I  tested  it  with
548 | various assignments to it.
549 | 
550 | Of course, an interpretive language that can only accept a single
551 | line of program  is not of much value.  So we're going to want to
552 | handle multiple statements.  This  merely  means  putting  a loop
553 | around  the  call  to Assignment.  So let's do that now. But what
554 | should be the loop exit criterion?  Glad you  asked,  because  it
555 | brings up a point we've been able to ignore up till now.
556 | 
557 | One of the most tricky things  to  handle in any translator is to
558 | determine when to bail out of  a  given construct and go look for
559 | something else.  This hasn't been a problem for us so far because
560 | we've only allowed for  a  single kind of construct ... either an
561 | expression  or an assignment statement.   When  we  start  adding
562 | loops and different kinds of statements, you'll find that we have
563 | to be very careful that things terminate properly.  If we put our
564 | interpreter in a loop, we need a way to quit.    Terminating on a
565 | newline is no good, because that's what sends us back for another
566 | line.  We could always let an unrecognized character take us out,
567 | but that would cause every run to end in an error  message, which
568 | certainly seems uncool.
569 | 
570 | What we need  is  a  termination  character.  I vote for Pascal's
571 | ending period ('.').   A  minor  complication  is that Turbo ends
572 | every normal line  with  TWO characters, the carriage return (CR)
573 | and line feed (LF).   At  the  end  of  each line, we need to eat
574 | these characters before processing the next one.   A  natural way
575 | to do this would  be  with  procedure  Match, except that Match's
576 | error  message  prints  the character, which of course for the CR
577 | and/or  LF won't look so great.  What we need is a special proce-
578 | dure for this, which we'll no doubt be using over and over.  Here
579 | it is:
580 | 
581 | 
582 | {--------------------------------------------------------------}
583 | { Recognize and Skip Over a Newline }
584 | 
585 | procedure NewLine;
586 | begin
587 |    if Look = CR then begin
588 |       GetChar;
589 |       if Look = LF then
590 |          GetChar;
591 |    end;
592 | end;
593 | {--------------------------------------------------------------}
594 | 
595 | 
596 | Insert this procedure at any convenient spot ... I put  mine just
597 | after Match.  Now, rewrite the main program to look like this:
598 | 
599 | 
600 | {--------------------------------------------------------------}
601 | { Main Program }
602 | 
603 | begin
604 |    Init;
605 |    repeat
606 |       Assignment;
607 |       NewLine;
608 |    until Look = '.';
609 | end.
610 | {--------------------------------------------------------------}
611 | 
612 | 
613 | Note that the  test for a CR is now gone, and that there are also
614 | no  error tests within NewLine itself.   That's  OK,  though  ...
615 | whatever is left over in terms of bogus characters will be caught
616 | at the beginning of the next assignment statement.
617 | 
618 | Well, we now have a functioning interpreter.  It doesn't do  us a
619 | lot of  good,  however,  since  we have no way to read data in or
620 | write it out.  Sure would help to have some I/O!
621 | 
622 | Let's wrap this session  up,  then,  by  adding the I/O routines.
623 | Since we're  sticking to single-character tokens, I'll use '?' to
624 | stand for a read statement, and  '!'  for a write, with the char-
625 | acter  immediately  following  them  to  be used as  a  one-token
626 | "parameter list."  Here are the routines:
627 | 
628 | {--------------------------------------------------------------}
629 | { Input Routine }
630 | 
631 | procedure Input;
632 | begin
633 |    Match('?');
634 |    Read(Table[GetName]);
635 | end;
636 | 
637 | 
638 | {--------------------------------------------------------------}
639 | { Output Routine }
640 | 
641 | procedure Output;
642 | begin
643 |    Match('!');
644 |    WriteLn(Table[GetName]);
645 | end;
646 | {--------------------------------------------------------------}
647 | 
648 | They aren't very fancy, I admit ... no prompt character on input,
649 | for example ... but they get the job done.
650 | 
651 | The corresponding changes in  the  main  program are shown below.
652 | Note that we use the usual  trick  of a case statement based upon
653 | the current lookahead character, to decide what to do.
654 | 
655 | 
656 | {--------------------------------------------------------------}
657 | { Main Program }
658 | 
659 | begin
660 |    Init;
661 |    repeat
662 |       case Look of
663 |        '?': Input;
664 |        '!': Output;
665 |        else Assignment;
666 |       end;
667 |       NewLine;
668 |    until Look = '.';
669 | end.
670 | {--------------------------------------------------------------}
671 | 
672 | 
673 | You have now completed a  real, working interpreter.  It's pretty
674 | sparse, but it works just like the "big boys."  It includes three
675 | kinds of program statements  (and  can  tell the difference!), 26
676 | variables,  and  I/O  statements.  The only things that it lacks,
677 | really, are control statements,  subroutines,    and some kind of
678 | program editing function.  The program editing part, I'm going to
679 | pass on.  After all, we're  not  here  to build a product, but to
680 | learn  things.    The control statements, we'll cover in the next
681 | installment, and the subroutines soon  after.  I'm anxious to get
682 | on with that, so we'll leave the interpreter as it stands.
683 | 
684 | I hope that by  now  you're convinced that the limitation of sin-
685 | gle-character names  and the processing of white space are easily
686 | taken  care  of, as we did in the last session.   This  time,  if
687 | you'd like to play around with these extensions, be my  guest ...
688 | they're  "left as an exercise for the student."    See  you  next
689 | time.
690 | 
691 | *****************************************************************
692 | *                                                               *
693 | *                        COPYRIGHT NOTICE                       *
694 | *                                                               *
695 | *   Copyright (C) 1988 Jack W. Crenshaw. All rights reserved.   *
696 | *                                                               *
697 | *****************************************************************
698 | 
699 |  1 --
700 | 
701 | 
702 | 


--------------------------------------------------------------------------------
/reference/crenshaw-txt/tutor9.txt:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | 
  4 | 
  5 | 
  6 | 
  7 | 
  8 | 
  9 | 
 10 | 
 11 | 
 12 | 
 13 | 
 14 | 
 15 | 
 16 | 
 17 | 
 18 | 
 19 | 
 20 | 
 21 | 
 22 | 
 23 | 
 24 | 
 25 | 
 26 | 
 27 | 
 28 | 
 29 |                      LET'S BUILD A COMPILER!
 30 | 
 31 |                                 By
 32 | 
 33 |                      Jack W. Crenshaw, Ph.D.
 34 | 
 35 |                           16 April 1989
 36 | 
 37 | 
 38 |                        Part IX: A TOP VIEW
 39 | 
 40 | 
 41 | *****************************************************************
 42 | *                                                               *
 43 | *                        COPYRIGHT NOTICE                       *
 44 | *                                                               *
 45 | *   Copyright (C) 1989 Jack W. Crenshaw. All rights reserved.   *
 46 | *                                                               *
 47 | *****************************************************************
 48 | 
 49 | 
 50 | INTRODUCTION
 51 | 
 52 | In  the  previous  installments,  we  have  learned  many of  the
 53 | techniques required to  build  a full-blown compiler.  We've done
 54 | both  assignment   statements   (with   Boolean   and  arithmetic
 55 | expressions),  relational operators, and control constructs.   We
 56 | still haven't  addressed procedure or function calls, but even so
 57 | we  could  conceivably construct a  mini-language  without  them.
 58 | I've  always  thought  it would be fun to see just  how  small  a
 59 | language  one  could  build  that  would still be useful.   We're
 60 | ALMOST in a position to do that now.  The  problem  is: though we
 61 | know  how  to  parse and translate the constructs, we still don't
 62 | know quite how to put them all together into a language.
 63 | 
 64 | In those earlier installments, the  development  of  our programs
 65 | had  a decidedly bottom-up flavor.  In  the  case  of  expression
 66 | parsing,  for  example,  we  began  with  the  very lowest  level
 67 | constructs, the individual constants  and  variables,  and worked
 68 | our way up to more complex expressions.
 69 | 
 70 | Most people regard  the  top-down design approach as being better
 71 | than  the  bottom-up  one.  I do too,  but  the  way  we  did  it
 72 | certainly seemed natural enough for the kinds of  things  we were
 73 | parsing.
 74 | 
 75 | You mustn't get  the  idea, though, that the incremental approach
 76 | that  we've  been  using  in  all these tutorials  is  inherently
 77 | bottom-up.  In  this  installment  I'd  like to show you that the
 78 | approach can work just as well when applied from the top down ...
 79 | maybe better.  We'll consider languages such as C and Pascal, and
 80 | see how complete compilers can be built starting from the top.
 81 | 
 82 | In the next installment, we'll  apply the same technique to build
 83 | a  complete  translator  for a subset of the KISS language, which
 84 | I'll be  calling  TINY.    But one of my goals for this series is
 85 | that you will  not only be able to see how a compiler for TINY or
 86 | KISS  works,  but  that you will also be able to design and build
 87 | compilers for your own languages.  The C and Pascal examples will
 88 | help.    One  thing I'd like you  to  see  is  that  the  natural
 89 | structure of the compiler depends very much on the language being
 90 | translated, so the simplicity and  ease  of  construction  of the
 91 | compiler  depends  very  much  on  letting the language  set  the
 92 | program structure.
 93 |                               
 94 | It's  a bit much to produce a full C or Pascal compiler here, and
 95 | we won't try.   But we can flesh out the top levels far enough so
 96 | that you can see how it goes.
 97 | 
 98 | Let's get started.
 99 | 
100 | 
101 | THE TOP LEVEL
102 | 
103 | One of the biggest  mistakes  people make in a top-down design is
104 | failing  to start at the true top.  They think they know what the
105 | overall structure of the  design  should be, so they go ahead and
106 | write it down.
107 | 
108 | Whenever  I  start a new design, I always like to do  it  at  the
109 | absolute beginning.   In  program design language (PDL), this top
110 | level looks something like:
111 | 
112 | 
113 |      begin
114 |         solve the problem
115 |      end
116 | 
117 | 
118 | OK, I grant  you that this doesn't give much of a hint as to what
119 | the next level is, but I  like  to  write it down anyway, just to
120 | give me that warm feeling that I am indeed starting at the top.
121 | 
122 | For our problem, the overall function of a compiler is to compile
123 | a complete program.  Any definition of the  language,  written in
124 | BNF,  begins here.  What does the top level BNF look like?  Well,
125 | that depends quite a bit on the language to be translated.  Let's
126 | take a look at Pascal.
127 | 
128 | 
129 | THE STRUCTURE OF PASCAL
130 | 
131 | Most  texts  for  Pascal  include  a   BNF   or  "railroad-track"
132 | definition of the language.  Here are the first few lines of one:
133 | 
134 | 
135 |      <program> ::= <program-header> <block> '.'
136 | 
137 |      <program-header> ::= PROGRAM <ident>
138 | 
139 |      <block> ::= <declarations> <statements>
140 | 
141 | 
142 | We can write recognizers  to  deal  with  each of these elements,
143 | just as we've done before.  For each one, we'll use  our familiar
144 | single-character tokens to represent the input, then flesh things
145 | out a little at a time.    Let's begin with the first recognizer:
146 | the program itself.
147 |                               
148 | To translate this, we'll  start  with a fresh copy of the Cradle.
149 | Since we're back to single-character  names, we'll just use a 'p'
150 | to stand for 'PROGRAM.'
151 | 
152 | To a fresh copy of the cradle, add the following code, and insert
153 | a call to it from the main program:
154 | 
155 | 
156 | {--------------------------------------------------------------}
157 | { Parse and Translate A Program }
158 | 
159 | procedure Prog;
160 | var  Name: char;
161 | begin
162 |    Match('p');            { Handles program header part }
163 |    Name := GetName;
164 |    Prolog(Name);
165 |    Match('.');
166 |    Epilog(Name);
167 | end;
168 | {--------------------------------------------------------------}
169 | 
170 | 
171 | The procedures  Prolog and Epilog perform whatever is required to
172 | let the program interface with the operating system,  so  that it
173 | can execute as a program.  Needless to  say,  this  part  will be
174 | VERY OS-dependent.  Remember, I've been emitting code for a 68000
175 | running under the OS I use, which is SK*DOS.   I  realize most of
176 | you are using PC's  and  would rather see something else, but I'm
177 | in this thing too deep to change now!
178 | 
179 | Anyhow, SK*DOS is a  particularly  easy OS to interface to.  Here
180 | is the code for Prolog and Epilog:
181 | 
182 | 
183 | {--------------------------------------------------------------}
184 | { Write the Prolog }
185 | 
186 | procedure Prolog;
187 | begin
188 |    EmitLn('WARMST EQU $A01E');
189 | end;
190 | 
191 | 
192 | {--------------------------------------------------------------}
193 | { Write the Epilog }
194 | 
195 | procedure Epilog(Name: char);
196 | begin
197 |    EmitLn('DC WARMST');
198 |    EmitLn('END ' + Name);
199 | end;
200 | {--------------------------------------------------------------}
201 |                               
202 | As usual, add  this  code  and  try  out the "compiler."  At this
203 | point, there is only one legal input:
204 | 
205 | 
206 |      px.   (where x is any single letter, the program name)
207 | 
208 | 
209 | Well,  as  usual  our first effort is rather unimpressive, but by
210 | now  I'm sure you know that things  will  get  more  interesting.
211 | There is one important thing to  note:   THE OUTPUT IS A WORKING,
212 | COMPLETE, AND EXECUTABLE PROGRAM (at least after it's assembled).
213 | 
214 | This  is  very  important.  The  nice  feature  of  the  top-down
215 | approach is that at any stage you can  compile  a  subset  of the
216 | complete language and get  a  program that will run on the target
217 | machine.    From here on, then, we  need  only  add  features  by
218 | fleshing out the language constructs.  It's all  very  similar to
219 | what we've been doing all along, except that we're approaching it
220 | from the other end.
221 | 
222 | 
223 | FLESHING IT OUT
224 | 
225 | To flesh out  the  compiler,  we  only have to deal with language
226 | features  one by one.  I like to start with a stub procedure that
227 | does  nothing, then add detail in  incremental  fashion.    Let's
228 | begin  by  processing  a block, in accordance with its PDL above.
229 | We can do this in two stages.  First, add the null procedure:
230 | 
231 | 
232 | {--------------------------------------------------------------}
233 | { Parse and Translate a Pascal Block }
234 | 
235 | procedure DoBlock(Name: char);
236 | begin
237 | end;
238 | {--------------------------------------------------------------}
239 | 
240 | 
241 | and modify Prog to read:
242 | 
243 | 
244 | {--------------------------------------------------------------}
245 | { Parse and Translate A Program }
246 | 
247 | procedure Prog;
248 | var  Name: char;
249 | begin
250 |    Match('p');
251 |    Name := GetName;
252 |    Prolog;
253 |    DoBlock(Name);
254 |    Match('.');
255 |    Epilog(Name);
256 | end;
257 | {--------------------------------------------------------------}
258 | 
259 | 
260 | That certainly  shouldn't change the behavior of the program, and
261 | it doesn't.  But now the  definition  of Prog is complete, and we
262 | can proceed to flesh out DoBlock.  That's done right from its BNF
263 | definition:
264 | 
265 | 
266 | {--------------------------------------------------------------}
267 | { Parse and Translate a Pascal Block }
268 | 
269 | procedure DoBlock(Name: char);
270 | begin
271 |    Declarations;
272 |    PostLabel(Name);
273 |    Statements;
274 | end;
275 | {--------------------------------------------------------------}
276 | 
277 | 
278 | The  procedure  PostLabel  was  defined  in  the  installment  on
279 | branches.  Copy it into your cradle.
280 | 
281 | I probably need to  explain  the  reason  for inserting the label
282 | where I have.  It has to do with the operation of SK*DOS.  Unlike
283 | some OS's,  SK*DOS allows the entry point to the main  program to
284 | be  anywhere in the program.  All you have to do is to give  that
285 | point a name.  The call  to  PostLabel puts that name just before
286 | the first executable statement  in  the  main  program.  How does
287 | SK*DOS know which of the many labels is the entry point, you ask?
288 | It's the one that matches the END statement  at  the  end  of the
289 | program.
290 | 
291 | OK,  now  we  need  stubs  for  the  procedures Declarations  and
292 | Statements.  Make them null procedures as we did before.
293 | 
294 | Does the program  still run the same?  Then we can move on to the
295 | next stage.
296 | 
297 | 
298 | DECLARATIONS
299 | 
300 | The BNF for Pascal declarations is:
301 | 
302 | 
303 |      <declarations> ::= ( <label list>    |
304 |                           <constant list> |
305 |                           <type list>     |
306 |                           <variable list> |
307 |                           <procedure>     |
308 |                           <function>         )*
309 |                               
310 | 
311 | (Note  that  I'm  using the more liberal definition used by Turbo
312 | Pascal.  In the standard Pascal definition, each  of  these parts
313 | must be in a specific order relative to the rest.)
314 | 
315 | As  usual,  let's  let a single character represent each of these
316 | declaration types.  The new form of Declarations is:
317 | 
318 | 
319 | {--------------------------------------------------------------}
320 | { Parse and Translate the Declaration Part }
321 | 
322 | procedure Declarations;
323 | begin
324 |    while Look in ['l', 'c', 't', 'v', 'p', 'f'] do
325 |       case Look of
326 |        'l': Labels;
327 |        'c': Constants;
328 |        't': Types;
329 |        'v': Variables;
330 |        'p': DoProcedure;
331 |        'f': DoFunction;
332 |       end;
333 | end;
334 | {--------------------------------------------------------------}
335 | 
336 | 
337 | Of course, we need stub  procedures for each of these declaration
338 | types.  This time,  they  can't  quite  be null procedures, since
339 | otherwise we'll end up with an infinite While loop.  At  the very
340 | least, each recognizer must  eat  the  character that invokes it.
341 | Insert the following procedures:
342 | 
343 | 
344 | {--------------------------------------------------------------}
345 | { Process Label Statement }
346 | 
347 | procedure Labels;
348 | begin
349 |    Match('l');
350 | end;
351 | 
352 | 
353 | {--------------------------------------------------------------}
354 | { Process Const Statement }
355 | 
356 | procedure Constants;
357 | begin
358 |    Match('c');
359 | end;
360 | 
361 | 
362 | {--------------------------------------------------------------}
363 | { Process Type Statement }
364 | procedure Types;
365 | begin
366 |    Match('t');
367 | end;
368 | 
369 | 
370 | {--------------------------------------------------------------}
371 | { Process Var Statement }
372 | 
373 | procedure Variables;
374 | begin
375 |    Match('v');
376 | end;
377 | 
378 | 
379 | {--------------------------------------------------------------}
380 | { Process Procedure Definition }
381 | 
382 | procedure DoProcedure;
383 | begin
384 |    Match('p');
385 | end;
386 | 
387 | 
388 | {--------------------------------------------------------------}
389 | { Process Function Definition }
390 | 
391 | procedure DoFunction;
392 | begin
393 |    Match('f');
394 | end;
395 | {--------------------------------------------------------------}
396 | 
397 | 
398 | Now try out the  compiler  with a few representative inputs.  You
399 | can  mix  the  declarations any way you like, as long as the last
400 | character  in  the  program is'.' to  indicate  the  end  of  the
401 | program.  Of course,  none  of  the declarations actually declare
402 | anything, so you don't need  (and can't use) any characters other
403 | than those standing for the keywords.
404 | 
405 | We can flesh out the statement  part  in  a similar way.  The BNF
406 | for it is:
407 | 
408 | 
409 |      <statements> ::= <compound statement>
410 | 
411 |      <compound statement> ::= BEGIN <statement>
412 |                                    (';' <statement>) END
413 | 
414 | 
415 | Note that statements can  begin  with  any identifier except END.
416 | So the first stub form of procedure Statements is:
417 |                               
418 | 
419 | {--------------------------------------------------------------}
420 | { Parse and Translate the Statement Part }
421 | 
422 | procedure Statements;
423 | begin
424 |    Match('b');
425 |    while Look <> 'e' do
426 |       GetChar;
427 |    Match('e');
428 | end;
429 | {--------------------------------------------------------------}
430 | 
431 | 
432 | At  this  point  the  compiler   will   accept   any   number  of
433 | declarations, followed by the  BEGIN  block  of the main program.
434 | This  block  itself  can contain any characters at all (except an
435 | END), but it must be present.
436 | 
437 | The simplest form of input is now
438 | 
439 |      'pxbe.'
440 | 
441 | Try  it.    Also  try  some  combinations  of  this.   Make  some
442 | deliberate errors and see what happens.
443 | 
444 | At this point you should be beginning to see the drill.  We begin
445 | with a stub translator to process a program, then  we  flesh  out
446 | each procedure in turn,  based  upon its BNF definition.  Just as
447 | the lower-level BNF definitions add detail and elaborate upon the
448 | higher-level ones, the lower-level  recognizers  will  parse more
449 | detail  of  the  input  program.    When  the  last stub has been
450 | expanded,  the  compiler  will  be  complete.    That's  top-down
451 | design/implementation in its purest form.
452 | 
453 | You might note that even though we've been adding procedures, the
454 | output of the program hasn't changed.  That's as  it  should  be.
455 | At these  top  levels  there  is  no  emitted code required.  The
456 | recognizers are  functioning as just that: recognizers.  They are
457 | accepting input sentences, catching bad ones, and channeling good
458 | input to the right places, so  they  are  doing their job.  If we
459 | were to pursue this a bit longer, code would start to appear.
460 | 
461 | The  next  step  in our expansion should  probably  be  procedure
462 | Statements.  The Pascal definition is:
463 | 
464 | 
465 |     <statement> ::= <simple statement> | <structured statement>
466 | 
467 |     <simple statement> ::= <assignment> | <procedure call> | null
468 | 
469 |     <structured statement> ::= <compound statement> |
470 |                                <if statement>       |
471 |                                <case statement>     |
472 |                                <while statement>    |
473 |                                <repeat statement>   |
474 |                                <for statement>      |
475 |                                <with statement>
476 | 
477 | 
478 | These  are  starting  to look familiar.  As a matter of fact, you
479 | have already gone  through  the process of parsing and generating
480 | code for both assignment statements and control structures.  This
481 | is where the top level meets our bottom-up  approach  of previous
482 | sessions.  The constructs will be a little  different  from those
483 | we've  been  using  for KISS, but the differences are nothing you
484 | can't handle.
485 | 
486 | I  think  you can get the picture now as to the  procedure.    We
487 | begin with a complete BNF  description of the language.  Starting
488 | at  the  top  level, we code  up  the  recognizer  for  that  BNF
489 | statement, using stubs  for  the next-level recognizers.  Then we
490 | flesh those lower-level statements out one by one.
491 | 
492 | As it happens, the definition of Pascal is  very  compatible with
493 | the  use of BNF, and BNF descriptions  of  the  language  abound.
494 | Armed  with  such   a   description,  you  will  find  it  fairly
495 | straightforward to continue the process we've begun.
496 | 
497 | You  might  have  a go at fleshing a few of these constructs out,
498 | just  to get a feel for it.  I don't expect you  to  be  able  to
499 | complete a Pascal compiler  here  ...  there  are too many things
500 | such  as  procedures  and types that we haven't addressed yet ...
501 | but  it  might  be helpful to try some of the more familiar ones.
502 | It will do  you  good  to  see executable programs coming out the
503 | other end.
504 | 
505 | If I'm going to address those issues that we haven't covered yet,
506 | I'd rather  do  it  in  the context of KISS.  We're not trying to
507 | build a complete Pascal  compiler  just yet, so I'm going to stop
508 | the expansion of Pascal here.    Let's  take  a  look  at  a very
509 | different language.
510 | 
511 | 
512 | THE STRUCTURE OF C
513 | 
514 | The C language is quite another matter, as you'll see.   Texts on
515 | C  rarely  include  a BNF definition of  the  language.  Probably
516 | that's because the language is quite hard to write BNF for.
517 | 
518 | One reason I'm showing you these structures now is so that  I can
519 | impress upon you these two facts:
520 | 
521 |  (1) The definition of  the  language drives the structure of the
522 |      compiler.  What works for one language may be a disaster for
523 |      another.    It's  a very bad idea to try to  force  a  given
524 |      structure upon the compiler.  Rather, you should let the BNF
525 |      drive the structure, as we have done here.
526 |                              
527 |  (2) A language that is hard to write BNF for  will  probably  be
528 |      hard  to  write  a compiler for, as well.  C  is  a  popular
529 |      language,  and  it  has  a  reputation  for  letting you  do
530 |      virtually  anything that is possible to  do.    Despite  the
531 |      success of Small C, C is _NOT_ an easy language to parse.
532 | 
533 | 
534 | A C program has  less  structure than its Pascal counterpart.  At
535 | the top level, everything in C is a static declaration, either of
536 | data or of a function.  We can capture this thought like this:
537 | 
538 | 
539 |      <program> ::= ( <global declaration> )*
540 | 
541 |      <global declaration> ::= <data declaration>  |
542 |                               <function>
543 | 
544 | In Small C, functions  can  only have the default type int, which
545 | is not declared.  This makes  the  input easy to parse: the first
546 | token is either "int," "char," or the name  of  a  function.   In
547 | Small  C, the preprocessor commands are  also  processed  by  the
548 | compiler proper, so the syntax becomes:
549 | 
550 | 
551 |      <global declaration> ::= '#' <preprocessor command>  |
552 |                               'int' <data list>           |
553 |                               'char' <data list>          |
554 |                               <ident> <function body>     |
555 | 
556 | 
557 | Although we're really more interested in full C  here,  I'll show
558 | you the  code corresponding to this top-level structure for Small
559 | C.
560 | 
561 | 
562 | {--------------------------------------------------------------}
563 | { Parse and Translate A Program }
564 | 
565 | procedure Prog;
566 | begin
567 |    while Look <> ^Z do begin
568 |       case Look of
569 |        '#': PreProc;
570 |        'i': IntDecl;
571 |        'c': CharDecl;
572 |       else DoFunction(Int);
573 |       end;
574 |    end;
575 | end;
576 | {--------------------------------------------------------------}
577 | 
578 | Note that I've had to use a ^Z to indicate the end of the source.
579 | C has no keyword such as END or the '.' to otherwise indicate the
580 | end.
581 |                              
582 | With full C,  things  aren't  even  this easy.  The problem comes
583 | about because in full C, functions can also have types.   So when
584 | the compiler sees a  keyword  like  "int,"  it still doesn't know
585 | whether to expect a  data  declaration  or a function definition.
586 | Things get more  complicated  since  the  next token may not be a
587 | name  ... it may start with an '*' or '(', or combinations of the
588 | two.
589 | 
590 | More specifically, the BNF for full C begins with:
591 | 
592 | 
593 |      <program> ::= ( <top-level decl> )*
594 | 
595 |      <top-level decl> ::= <function def> | <data decl>
596 | 
597 |      <data decl> ::= [<class>] <type> <decl-list>
598 | 
599 |      <function def> ::= [<class>] [<type>] <function decl>
600 | 
601 | 
602 | You  can  now  see the problem:   The  first  two  parts  of  the
603 | declarations for data and functions can be the same.   Because of
604 | the  ambiguity  in  the grammar as  written  above,  it's  not  a
605 | suitable  grammar  for  a  recursive-descent  parser.     Can  we
606 | transform it into one that is suitable?  Yes, with a little work.
607 | Suppose we write it this way:
608 | 
609 | 
610 |      <top-level decl> ::= [<class>] <decl>
611 | 
612 |      <decl> ::= <type> <typed decl> | <function decl>
613 | 
614 |      <typed decl> ::= <data list> | <function decl>
615 | 
616 | 
617 | We  can  build  a  parsing  routine  for  the   class   and  type
618 | definitions, and have them store away their findings  and  go on,
619 | without their ever having to "know" whether a function or  a data
620 | declaration is being processed.
621 | 
622 | To begin, key in the following version of the main program:
623 | 
624 | 
625 | {--------------------------------------------------------------}
626 | { Main Program }
627 | 
628 | begin
629 |    Init;
630 |    while Look <> ^Z do begin
631 |       GetClass;
632 |       GetType;
633 |       TopDecl;
634 |    end;
635 | end.
636 | 
637 | {--------------------------------------------------------------}
638 | 
639 | 
640 | For the first round, just make the three procedures stubs that do
641 | nothing _BUT_ call GetChar.
642 | 
643 | Does this program work?  Well, it would be hard put NOT to, since
644 | we're not really asking it to do anything.  It's been said that a
645 | C compiler will accept virtually any input without choking.  It's
646 | certainly true of THIS  compiler,  since in effect all it does is
647 | to eat input characters until it finds a ^Z.
648 | 
649 | Next, let's make  GetClass  do something worthwhile.  Declare the
650 | global variable
651 | 
652 | 
653 |      var Class: char;
654 | 
655 | 
656 | and change GetClass to do the following:
657 | 
658 | 
659 | {--------------------------------------------------------------}
660 | {  Get a Storage Class Specifier }
661 | 
662 | Procedure GetClass;
663 | begin
664 |    if Look in ['a', 'x', 's'] then begin
665 |       Class := Look;
666 |       GetChar;
667 |       end
668 |    else Class := 'a';
669 | end;
670 | {--------------------------------------------------------------}
671 | 
672 | 
673 | Here, I've used three  single  characters  to represent the three
674 | storage classes "auto," "extern,"  and  "static."   These are not
675 | the only three possible classes ... there are also "register" and
676 | "typedef," but this should  give  you the picture.  Note that the
677 | default class is "auto."
678 | 
679 | We  can  do  a  similar  thing  for  types.   Enter the following
680 | procedure next:
681 | 
682 | 
683 | {--------------------------------------------------------------}
684 | {  Get a Type Specifier }
685 | 
686 | procedure GetType;
687 | begin
688 |    Typ := ' ';
689 |    if Look = 'u' then begin
690 |       Sign := 'u';
691 |       Typ := 'i';
692 |       GetChar;
693 |       end
694 |    else Sign := 's';
695 |    if Look in ['i', 'l', 'c'] then begin
696 |       Typ := Look;
697 |       GetChar;
698 |    end;
699 | end;
700 | {--------------------------------------------------------------}
701 | 
702 | Note that you must add two more global variables, Sign and Typ.
703 | 
704 | With these two procedures in place, the compiler will process the
705 | class and type definitions and store away their findings.  We can
706 | now process the rest of the declaration.
707 | 
708 | We  are by no means out of the woods yet, because there are still
709 | many complexities just in the definition of the  type,  before we
710 | even get to the actual data or function names.  Let's pretend for
711 | the moment that we have passed all those gates, and that the next
712 | thing in the  input stream is a name.  If the name is followed by
713 | a left paren, we have a function declaration.  If not, we have at
714 | least one data item,  and  possibly a list, each element of which
715 | can have an initializer.
716 | 
717 | Insert the following version of TopDecl:
718 | 
719 | 
720 | {--------------------------------------------------------------}
721 | { Process a Top-Level Declaration }
722 | 
723 | procedure TopDecl;
724 | var Name: char;
725 | begin
726 |    Name := Getname;
727 |    if Look = '(' then
728 |       DoFunc(Name)
729 |    else
730 |       DoData(Name);
731 | end;
732 | {--------------------------------------------------------------}
733 | 
734 | 
735 | (Note that, since we have already read the name, we must  pass it
736 | along to the appropriate routine.)
737 | 
738 | Finally, add the two procedures DoFunc and DoData:
739 | 
740 | 
741 | {--------------------------------------------------------------}
742 | { Process a Function Definition }
743 | 
744 | procedure DoFunc(n: char);
745 | begin
746 |    Match('(');
747 |    Match(')');
748 |    Match('{');
749 |    Match('}');
750 |    if Typ = ' ' then Typ := 'i';
751 |    Writeln(Class, Sign, Typ, ' function ', n);
752 | end;
753 | 
754 | {--------------------------------------------------------------}
755 | { Process a Data Declaration }
756 | 
757 | procedure DoData(n: char);
758 | begin
759 |    if Typ = ' ' then Expected('Type declaration');
760 |    Writeln(Class, Sign, Typ, ' data ', n);
761 |    while Look = ',' do begin
762 |       Match(',');
763 |       n := GetName;
764 |       WriteLn(Class, Sign, Typ, ' data ', n);
765 |    end;
766 |    Match(';');
767 | end;
768 | {--------------------------------------------------------------}
769 | 
770 | 
771 | Since  we're  still  a long way from producing executable code, I
772 | decided to just have these two routines tell us what they found.
773 | 
774 | OK, give this program a try.    For data declarations, it's OK to
775 | give a list separated by commas.  We  can't  process initializers
776 | as yet.  We also can't process argument lists for  the functions,
777 | but the "(){}" characters should be there.
778 | 
779 | We're still a _VERY_ long way from having a C compiler,  but what
780 | we have is starting to process the right kinds of inputs,  and is
781 | recognizing both good  and  bad  inputs.    In  the  process, the
782 | natural structure of the compiler is starting to take form.
783 | 
784 | Can we continue this until we have something that acts  more like
785 | a compiler. Of course we can.  Should we?  That's another matter.
786 | I don't know about you, but I'm beginning to get dizzy, and we've
787 | still  got  a  long  way  to  go  to  even  get  past   the  data
788 | declarations.
789 | 
790 | At  this  point,  I think you can see how the  structure  of  the
791 | compiler evolves from the language  definition.    The structures
792 | we've seen for our  two  examples, Pascal and C, are as different
793 | as night and day.  Pascal was designed at least partly to be easy
794 | to parse, and that's  reflected  in the compiler.  In general, in
795 | Pascal there is more structure and we have a better idea  of what
796 | kinds of constructs to expect at any point.  In  C,  on the other
797 | hand,  the  program  is  essentially  a  list   of  declarations,
798 | terminated only by the end of file.
799 | 
800 | We  could  pursue  both  of  these structures much  farther,  but
801 | remember that our purpose here is  not  to  build a Pascal or a C
802 | compiler, but rather to study compilers in general.  For those of
803 | you  who DO want to deal with Pascal or C, I hope I've given  you
804 | enough of a start so that you can  take  it  from  here (although
805 | you'll soon need some of the stuff we still haven't  covered yet,
806 | such as typing and procedure calls).    For the rest of you, stay
807 | with me through the next installment.  There, I'll be leading you
808 | through the development of a complete compiler for TINY, a subset
809 | of KISS.
810 | 
811 | See you then.
812 | 
813 | 
814 | *****************************************************************
815 | *                                                               *
816 | *                        COPYRIGHT NOTICE                       *
817 | *                                                               *
818 | *   Copyright (C) 1989 Jack W. Crenshaw. All rights reserved.   *
819 | *                                                               *
820 | *****************************************************************
821 | 
822 | 


--------------------------------------------------------------------------------
/reference/crenshaw-txt/tutor2.txt:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | 
  4 | 
  5 | 
  6 | 
  7 | 
  8 | 
  9 | 
 10 | 
 11 | 
 12 | 
 13 | 
 14 | 
 15 | 
 16 | 
 17 | 
 18 | 
 19 | 
 20 | 
 21 | 
 22 | 
 23 | 
 24 | 
 25 | 
 26 | 
 27 | 
 28 | 
 29 | 
 30 |                      LET'S BUILD A COMPILER!
 31 | 
 32 |                                 By
 33 | 
 34 |                      Jack W. Crenshaw, Ph.D.
 35 | 
 36 |                            24 July 1988
 37 | 
 38 | 
 39 |                    Part II: EXPRESSION PARSING
 40 | 
 41 | 
 42 | *****************************************************************
 43 | *                                                               *
 44 | *                        COPYRIGHT NOTICE                       *
 45 | *                                                               *
 46 | *   Copyright (C) 1988 Jack W. Crenshaw. All rights reserved.   *
 47 | *                                                               *
 48 | *****************************************************************
 49 | 
 50 | 
 51 | GETTING STARTED
 52 | 
 53 | If you've read the introduction document to this series, you will
 54 | already know what  we're  about.    You will also have copied the
 55 | cradle software  into your Turbo Pascal system, and have compiled
 56 | it.  So you should be ready to go.
 57 | 
 58 | 
 59 | The purpose of this article is for us to learn  how  to parse and
 60 | translate mathematical expressions.  What we would like to see as
 61 | output is a series of assembler-language statements  that perform
 62 | the desired actions.    For purposes of definition, an expression
 63 | is the right-hand side of an equation, as in
 64 | 
 65 |                x = 2*y + 3/(4*z)
 66 | 
 67 | In the early going, I'll be taking things in _VERY_  small steps.
 68 | That's  so  that  the beginners among you won't get totally lost.
 69 | There are also  some  very  good  lessons to be learned early on,
 70 | that will serve us well later.  For the more experienced readers:
 71 | bear with me.  We'll get rolling soon enough.
 72 | 
 73 | SINGLE DIGITS
 74 | 
 75 | In keeping with the whole theme of this series (KISS, remember?),
 76 | let's start with the absolutely most simple case we can think of.
 77 | That, to me, is an expression consisting of a single digit.
 78 | 
 79 | Before starting to code, make sure you have a  baseline  copy  of
 80 | the  "cradle" that I gave last time.  We'll be using it again for
 81 | other experiments.  Then add this code:
 82 | 
 83 | 
 84 | {---------------------------------------------------------------}
 85 | { Parse and Translate a Math Expression }
 86 | 
 87 | procedure Expression;
 88 | begin
 89 |    EmitLn('MOVE #' + GetNum + ',D0')
 90 | end;
 91 | {---------------------------------------------------------------}
 92 | 
 93 | 
 94 | And add the  line  "Expression;"  to  the main program so that it
 95 | reads:
 96 |                               
 97 | 
 98 | {---------------------------------------------------------------}
 99 | begin
100 |    Init;
101 |    Expression;
102 | end.
103 | {---------------------------------------------------------------}
104 | 
105 | 
106 | Now  run  the  program. Try any single-digit number as input. You
107 | should get a single line of assembler-language output.    Now try
108 | any  other character as input, and you'll  see  that  the  parser
109 | properly reports an error.
110 | 
111 | 
112 | CONGRATULATIONS! You have just written a working translator!
113 | 
114 | OK, I grant you that it's pretty limited. But don't brush  it off
115 | too  lightly.  This little "compiler" does,  on  a  very  limited
116 | scale,  exactly  what  any larger compiler does:    it  correctly
117 | recognizes legal  statements in the input "language" that we have
118 | defined for it, and  it  produces  correct,  executable assembler
119 | code,  suitable  for  assembling  into  object  format.  Just  as
120 | importantly,  it correctly  recognizes  statements  that  are NOT
121 | legal, and gives a  meaningful  error message.  Who could ask for
122 | more?  As we expand our  parser,  we'd better make sure those two
123 | characteristics always hold true.
124 | 
125 | There  are  some  other  features  of  this  tiny  program  worth
126 | mentioning.    First,  you  can  see that we don't separate  code
127 | generation from parsing ...  as  soon as the parser knows what we
128 | want  done, it generates the object code directly.    In  a  real
129 | compiler, of course, the reads in GetChar would be  from  a  disk
130 | file, and the writes to another  disk  file, but this way is much
131 | easier to deal with while we're experimenting.
132 | 
133 | Also note that an expression must leave a result somewhere.  I've
134 | chosen the  68000  register  DO.    I  could have made some other
135 | choices, but this one makes sense.
136 | 
137 | 
138 | BINARY EXPRESSIONS
139 | 
140 | Now that we have that under our belt,  let's  branch  out  a bit.
141 | Admittedly, an "expression" consisting of only  one  character is
142 | not going to meet our needs for long, so let's see what we can do
143 | to extend it. Suppose we want to handle expressions of the form:
144 | 
145 |                          1+2
146 |      or                  4-3
147 |      or, in general, <term> +/- <term>
148 | 
149 | (That's a bit of Backus-Naur Form, or BNF.)
150 |                               
151 | To do this we need a procedure that recognizes a term  and leaves
152 | its   result   somewhere,  and  another   that   recognizes   and
153 | distinguishes  between   a  '+'  and  a  '-'  and  generates  the
154 | appropriate code.  But if Expression is going to leave its result
155 | in DO, where should Term leave its result?    Answer:    the same
156 | place.  We're  going  to  have  to  save the first result of Term
157 | somewhere before we get the next one.
158 | 
159 | OK, basically what we want to  do  is have procedure Term do what
160 | Expression was doing before.  So just RENAME procedure Expression
161 | as Term, and enter the following new version of Expression:
162 | 
163 | 
164 | 
165 | 
166 | {---------------------------------------------------------------}
167 | { Parse and Translate an Expression }
168 | 
169 | procedure Expression;
170 | begin
171 |    Term;
172 |    EmitLn('MOVE D0,D1');
173 |    case Look of
174 |     '+': Add;
175 |     '-': Subtract;
176 |    else Expected('Addop');
177 |    end;
178 | end;
179 | {--------------------------------------------------------------}
180 | 
181 | 
182 | Next, just above Expression enter these two procedures:
183 | 
184 | 
185 | {--------------------------------------------------------------}
186 | { Recognize and Translate an Add }
187 | 
188 | procedure Add;
189 | begin
190 |    Match('+');
191 |    Term;
192 |    EmitLn('ADD D1,D0');
193 | end;
194 | 
195 | 
196 | {-------------------------------------------------------------}
197 | { Recognize and Translate a Subtract }
198 | 
199 | procedure Subtract;
200 | begin
201 |    Match('-');
202 |    Term;
203 |    EmitLn('SUB D1,D0');
204 | end;
205 | {-------------------------------------------------------------}
206 |                               
207 | 
208 | When you're finished with that,  the order of the routines should
209 | be:
210 | 
211 |  o Term (The OLD Expression)
212 |  o Add
213 |  o Subtract
214 |  o Expression
215 | 
216 | Now run the program.  Try any combination you can think of of two
217 | single digits,  separated  by  a  '+' or a '-'.  You should get a
218 | series of four assembler-language instructions out  of  each run.
219 | Now  try  some  expressions with deliberate errors in them.  Does
220 | the parser catch the errors?
221 | 
222 | Take  a  look  at the object  code  generated.    There  are  two
223 | observations we can make.  First, the code generated is  NOT what
224 | we would write ourselves.  The sequence
225 | 
226 |         MOVE #n,D0
227 |         MOVE D0,D1
228 | 
229 | is inefficient.  If we were  writing  this code by hand, we would
230 | probably just load the data directly to D1.
231 | 
232 | There is a  message  here:  code  generated by our parser is less
233 | efficient  than the code we would write by hand.  Get used to it.
234 | That's going to be true throughout this series.  It's true of all
235 | compilers to some extent.  Computer scientists have devoted whole
236 | lifetimes to the issue of code optimization, and there are indeed
237 | things that can be done to improve the quality  of  code  output.
238 | Some compilers do quite well, but  there  is a heavy price to pay
239 | in complexity, and it's  a  losing  battle  anyway ... there will
240 | probably never come a time when  a  good  assembler-language pro-
241 | grammer can't out-program a compiler.    Before  this  session is
242 | over, I'll briefly mention some ways that we can do a  little op-
243 | timization,  just  to  show you that we can indeed improve things
244 | without too much trouble.  But remember, we're here to learn, not
245 | to see how tight we can make  the  object  code.    For  now, and
246 | really throughout  this  series  of  articles,  we'll  studiously
247 | ignore optimization and  concentrate  on  getting  out  code that
248 | works.
249 | 
250 | Speaking of which: ours DOESN'T!  The code is _WRONG_!  As things
251 | are working  now, the subtraction process subtracts D1 (which has
252 | the FIRST argument in it) from D0 (which has the second).  That's
253 | the wrong way, so we end up with the wrong  sign  for the result.
254 | So let's fix up procedure Subtract with a  sign-changer,  so that
255 | it reads
256 | 
257 | 
258 | {-------------------------------------------------------------}
259 | { Recognize and Translate a Subtract }
260 | 
261 | procedure Subtract;
262 | begin
263 |    Match('-');
264 |    Term;
265 |    EmitLn('SUB D1,D0');
266 |    EmitLn('NEG D0');
267 | end;
268 | {-------------------------------------------------------------}
269 | 
270 | 
271 | Now  our  code  is even less efficient, but at least it gives the
272 | right answer!  Unfortunately, the  rules that give the meaning of
273 | math expressions require that the terms in an expression come out
274 | in an inconvenient  order  for  us.    Again, this is just one of
275 | those facts of life you learn to live with.   This  one will come
276 | back to haunt us when we get to division.
277 | 
278 | OK,  at this point we have a parser that can recognize the sum or
279 | difference of two digits.    Earlier,  we  could only recognize a
280 | single digit.  But  real  expressions can have either form (or an
281 | infinity of others).  For kicks, go back and run the program with
282 | the single input line '1'.
283 | 
284 | Didn't work, did it?   And  why  should  it?    We  just finished
285 | telling  our  parser  that the only kinds of expressions that are
286 | legal are those  with  two  terms.    We  must  rewrite procedure
287 | Expression to be a lot more broadminded, and this is where things
288 | start to take the shape of a real parser.
289 | 
290 | 
291 | 
292 | 
293 | GENERAL EXPRESSIONS
294 | 
295 | In the  REAL  world,  an  expression  can  consist of one or more
296 | terms, separated  by  "addops"  ('+'  or  '-').   In BNF, this is
297 | written
298 | 
299 |           <expression> ::= <term> [<addop> <term>]*
300 | 
301 | 
302 | We  can  accomodate  this definition of an  expression  with  the
303 | addition of a simple loop to procedure Expression:
304 | 
305 | 
306 | {---------------------------------------------------------------}
307 | { Parse and Translate an Expression }
308 | 
309 | procedure Expression;
310 | begin
311 |    Term;
312 |    while Look in ['+', '-'] do begin
313 |       EmitLn('MOVE D0,D1');
314 |       case Look of
315 |        '+': Add;
316 |        '-': Subtract;
317 |       else Expected('Addop');
318 |       end;
319 |    end;
320 | end;
321 | {--------------------------------------------------------------}
322 | 
323 | 
324 | NOW we're getting somewhere!   This version handles any number of
325 | terms, and it only cost us two extra lines of code.  As we go on,
326 | you'll discover that this is characteristic  of  top-down parsers
327 | ... it only takes a few lines of code to accomodate extensions to
328 | the  language.    That's  what  makes  our  incremental  approach
329 | possible.  Notice, too, how well the code of procedure Expression
330 | matches the BNF definition.   That, too, is characteristic of the
331 | method.  As you get proficient in the approach, you'll  find that
332 | you can turn BNF into parser code just about as  fast  as you can
333 | type!
334 | 
335 | OK, compile the new version of our parser, and give it a try.  As
336 | usual,  verify  that  the  "compiler"   can   handle   any  legal
337 | expression,  and  will  give a meaningful error  message  for  an
338 | illegal one.  Neat, eh?  You might note that in our test version,
339 | any error message comes  out  sort of buried in whatever code had
340 | already been  generated. But remember, that's just because we are
341 | using  the  CRT  as  our  "output  file"  for   this   series  of
342 | experiments.  In a production version, the two  outputs  would be
343 | separated ... one to the output file, and one to the screen.
344 | 
345 | 
346 | USING THE STACK
347 | 
348 | At  this  point  I'm going to  violate  my  rule  that  we  don't
349 | introduce any complexity until  it's  absolutely  necessary, long
350 | enough to point out a problem with the code we're generating.  As
351 | things stand now, the parser  uses D0 for the "primary" register,
352 | and D1 as  a place to store the partial sum.  That works fine for
353 | now,  because  as  long as we deal with only the "addops" '+' and
354 | '-', any new term can be added in as soon as it is found.  But in
355 | general that isn't true.  Consider, for example, the expression
356 | 
357 |                1+(2-(3+(4-5)))
358 |                               
359 | If we put the '1' in D1, where  do  we  put  the  '2'?    Since a
360 | general expression can have any degree of complexity, we're going
361 | to run out of registers fast!
362 | 
363 | Fortunately,  there's  a  simple  solution.    Like  every modern
364 | microprocessor, the 68000 has a stack, which is the perfect place
365 | to save a variable number of items. So instead of moving the term
366 | in D0 to  D1, let's just push it onto the stack.  For the benefit
367 | of  those unfamiliar with 68000 assembler  language,  a  push  is
368 | written
369 | 
370 |                -(SP)
371 | 
372 | and a pop,     (SP)+ .
373 | 
374 | 
375 | So let's change the EmitLn in Expression to read:
376 | 
377 |                EmitLn('MOVE D0,-(SP)');
378 | 
379 | and the two lines in Add and Subtract to
380 | 
381 |                EmitLn('ADD (SP)+,D0')
382 | 
383 | and            EmitLn('SUB (SP)+,D0'),
384 | 
385 | respectively.  Now try the parser again and make sure  we haven't
386 | broken it.
387 | 
388 | Once again, the generated code is less efficient than before, but
389 | it's a necessary step, as you'll see.
390 | 
391 | 
392 | MULTIPLICATION AND DIVISION
393 | 
394 | Now let's get down to some REALLY serious business.  As  you  all
395 | know,  there  are  other  math   operators   than   "addops"  ...
396 | expressions can also have  multiply  and  divide operations.  You
397 | also  know  that  there  is  an implied operator  PRECEDENCE,  or
398 | hierarchy, associated with expressions, so that in  an expression
399 | like
400 | 
401 |                     2 + 3 * 4,
402 | 
403 | we know that we're supposed to multiply FIRST, then  add.    (See
404 | why we needed the stack?)
405 | 
406 | In the early days of compiler technology, people used some rather
407 | complex techniques to insure that the  operator  precedence rules
408 | were  obeyed.    It turns out,  though,  that  none  of  this  is
409 | necessary ... the rules can be accommodated quite  nicely  by our
410 | top-down  parsing technique.  Up till now,  the  only  form  that
411 | we've considered for a term is that of a  single  decimal  digit.
412 | 
413 | More generally, we  can  define  a  term as a PRODUCT of FACTORS;
414 | i.e.,
415 | 
416 |           <term> ::= <factor>  [ <mulop> <factor ]*
417 | 
418 | What  is  a factor?  For now, it's what a term used to be  ...  a
419 | single digit.
420 | 
421 | Notice the symmetry: a  term  has the same form as an expression.
422 | As a matter of fact, we can  add  to  our  parser  with  a little
423 | judicious  copying and renaming.  But  to  avoid  confusion,  the
424 | listing below is the complete set of parsing routines.  (Note the
425 | way we handle the reversal of operands in Divide.)
426 | 
427 | 
428 | {---------------------------------------------------------------}
429 | { Parse and Translate a Math Factor }
430 | 
431 | procedure Factor;
432 | begin
433 |    EmitLn('MOVE #' + GetNum + ',D0')
434 | end;
435 | 
436 | 
437 | {--------------------------------------------------------------}
438 | { Recognize and Translate a Multiply }
439 | 
440 | procedure Multiply;
441 | begin
442 |    Match('*');
443 |    Factor;
444 |    EmitLn('MULS (SP)+,D0');
445 | end;
446 | 
447 | 
448 | {-------------------------------------------------------------}
449 | { Recognize and Translate a Divide }
450 | 
451 | procedure Divide;
452 | begin
453 |    Match('/');
454 |    Factor;
455 |    EmitLn('MOVE (SP)+,D1');
456 |    EmitLn('DIVS D1,D0');
457 | end;
458 | 
459 | 
460 | {---------------------------------------------------------------}
461 | { Parse and Translate a Math Term }
462 | 
463 | procedure Term;
464 | begin
465 |    Factor;
466 |    while Look in ['*', '/'] do begin
467 |       EmitLn('MOVE D0,-(SP)');
468 |       case Look of
469 |        '*': Multiply;
470 |        '/': Divide;
471 |       else Expected('Mulop');
472 |       end;
473 |    end;
474 | end;
475 | 
476 | 
477 | 
478 | 
479 | {--------------------------------------------------------------}
480 | { Recognize and Translate an Add }
481 | 
482 | procedure Add;
483 | begin
484 |    Match('+');
485 |    Term;
486 |    EmitLn('ADD (SP)+,D0');
487 | end;
488 | 
489 | 
490 | {-------------------------------------------------------------}
491 | { Recognize and Translate a Subtract }
492 | 
493 | procedure Subtract;
494 | begin
495 |    Match('-');
496 |    Term;
497 |    EmitLn('SUB (SP)+,D0');
498 |    EmitLn('NEG D0');
499 | end;
500 | 
501 | 
502 | {---------------------------------------------------------------}
503 | { Parse and Translate an Expression }
504 | 
505 | procedure Expression;
506 | begin
507 |    Term;
508 |    while Look in ['+', '-'] do begin
509 |       EmitLn('MOVE D0,-(SP)');
510 |       case Look of
511 |        '+': Add;
512 |        '-': Subtract;
513 |       else Expected('Addop');
514 |       end;
515 |    end;
516 | end;
517 | {--------------------------------------------------------------}
518 | 
519 | 
520 | Hot dog!  A NEARLY functional parser/translator, in only 55 lines
521 | of Pascal!  The output is starting to look really useful,  if you
522 | continue to overlook the inefficiency,  which  I  hope  you will.
523 | Remember, we're not trying to produce tight code here.
524 | 
525 | 
526 | PARENTHESES
527 | 
528 | We  can  wrap  up this part of the parser with  the  addition  of
529 | parentheses with  math expressions.  As you know, parentheses are
530 | a  mechanism to force a desired operator  precedence.    So,  for
531 | example, in the expression
532 | 
533 |                2*(3+4) ,
534 | 
535 | the parentheses force the addition  before  the  multiply.   Much
536 | more importantly, though, parentheses  give  us  a  mechanism for
537 | defining expressions of any degree of complexity, as in
538 | 
539 |                (1+2)/((3+4)+(5-6))
540 | 
541 | The  key  to  incorporating  parentheses  into our parser  is  to
542 | realize that  no matter how complicated an expression enclosed by
543 | parentheses may be,  to  the  rest  of  the world it looks like a
544 | simple factor.  That is, one of the forms for a factor is:
545 | 
546 |           <factor> ::= (<expression>)
547 | 
548 | This is where the recursion comes in. An expression can contain a
549 | factor which contains another expression which contains a factor,
550 | etc., ad infinitum.
551 | 
552 | Complicated or not, we can take care of this by adding just a few
553 | lines of Pascal to procedure Factor:
554 |                              
555 | 
556 | {---------------------------------------------------------------}
557 | { Parse and Translate a Math Factor }
558 | 
559 | procedure Expression; Forward;
560 | 
561 | procedure Factor;
562 | begin
563 |    if Look = '(' then begin
564 |       Match('(');
565 |       Expression;
566 |       Match(')');
567 |       end
568 |    else
569 |       EmitLn('MOVE #' + GetNum + ',D0');
570 | end;
571 | {--------------------------------------------------------------}
572 | 
573 | 
574 | Note again how easily we can extend the parser, and how  well the
575 | Pascal code matches the BNF syntax.
576 | 
577 | As usual, compile the new version and make sure that it correctly
578 | parses  legal sentences, and flags illegal  ones  with  an  error
579 | message.
580 | 
581 | 
582 | UNARY MINUS
583 | 
584 | At  this  point,  we have a parser that can handle just about any
585 | expression, right?  OK, try this input sentence:
586 | 
587 |                          -1
588 | 
589 | WOOPS!  It doesn't work, does it?   Procedure  Expression expects
590 | everything to start with an integer, so it coughs up  the leading
591 | minus  sign.  You'll find that +3 won't  work  either,  nor  will
592 | something like
593 | 
594 |                     -(3-2) .
595 | 
596 | There  are  a  couple of ways to fix the problem.    The  easiest
597 | (although not necessarily the best)  way is to stick an imaginary
598 | leading zero in  front  of  expressions  of this type, so that -3
599 | becomes 0-3.  We can easily patch this into our  existing version
600 | of Expression:
601 | 
602 | 
603 | 
604 | {---------------------------------------------------------------}
605 | { Parse and Translate an Expression }
606 | 
607 | procedure Expression;
608 | begin
609 |    if IsAddop(Look) then
610 |       EmitLn('CLR D0')
611 |    else
612 |       Term;
613 |    while IsAddop(Look) do begin
614 |       EmitLn('MOVE D0,-(SP)');
615 |       case Look of
616 |        '+': Add;
617 |        '-': Subtract;
618 |       else Expected('Addop');
619 |       end;
620 |    end;
621 | end;
622 | {--------------------------------------------------------------}
623 |  
624 | 
625 | I TOLD you that making changes  was  easy!   This time it cost us
626 | only  three  new lines of Pascal.   Note  the  new  reference  to
627 | function IsAddop.  Since the test for an addop appeared  twice, I
628 | chose  to  embed  it in the new function.  The  form  of  IsAddop
629 | should be apparent from that for IsAlpha.  Here it is:
630 | 
631 | 
632 | {--------------------------------------------------------------}
633 | { Recognize an Addop }
634 | 
635 | function IsAddop(c: char): boolean;
636 | begin
637 |    IsAddop := c in ['+', '-'];
638 | end;
639 | {--------------------------------------------------------------}
640 | 
641 | 
642 | OK, make these changes to the program and recompile.   You should
643 | also include IsAddop in your baseline copy of the cradle.   We'll
644 | be needing  it  again  later.   Now try the input -1 again.  Wow!
645 | The efficiency of the code is  pretty  poor ... six lines of code
646 | just for loading a simple constant ... but at least it's correct.
647 | Remember, we're not trying to replace Turbo Pascal here.
648 | 
649 | At this point we're just about finished with the structure of our
650 | expression parser.   This version of the program should correctly
651 | parse and compile just about any expression you care to  throw at
652 | it.    It's still limited in that  we  can  only  handle  factors
653 | involving single decimal digits.    But I hope that by now you're
654 | starting  to  get  the  message  that we can  accomodate  further
655 | extensions  with  just  some  minor  changes to the parser.   You
656 | probably won't be  surprised  to  hear  that a variable or even a
657 | function call is just another kind of a factor.
658 |                              
659 | In  the next session, I'll show you just how easy it is to extend
660 | our parser to take care of  these  things too, and I'll also show
661 | you just  how easily we can accomodate multicharacter numbers and
662 | variable names.  So you see,  we're  not  far at all from a truly
663 | useful parser.
664 | 
665 | 
666 | 
667 | 
668 | A WORD ABOUT OPTIMIZATION
669 | 
670 | Earlier in this session, I promised to give you some hints  as to
671 | how we can improve the quality of the generated code.  As I said,
672 | the  production of tight code is not the  main  purpose  of  this
673 | series of articles.  But you need to at least know that we aren't
674 | just  wasting our time here ... that we  can  indeed  modify  the
675 | parser further to  make  it produce better code, without throwing
676 | away everything we've done to date.  As usual, it turns  out that
677 | SOME optimization is not that difficult to do ... it simply takes
678 | some extra code in the parser.
679 | 
680 | There are two basic approaches we can take:
681 | 
682 |   o Try to fix up the code after it's generated
683 | 
684 |     This is  the concept of "peephole" optimization.  The general
685 |     idea it that we  know  what  combinations of instructions the
686 |     compiler  is  going  to generate, and we also know which ones
687 |     are pretty bad (such as the code for -1, above).    So all we
688 |     do  is  to   scan   the  produced  code,  looking  for  those
689 |     combinations, and replacing  them  by better ones.  It's sort
690 |     of   a   macro   expansion,   in   reverse,   and   a  fairly
691 |     straightforward  exercise  in   pattern-matching.   The  only
692 |     complication,  really, is that there may be  a  LOT  of  such
693 |     combinations to look for.  It's called  peephole optimization
694 |     simply because it only looks at a small group of instructions
695 |     at a time.  Peephole  optimization can have a dramatic effect
696 |     on  the  quality  of the code,  with  little  change  to  the
697 |     structure of the compiler  itself.   There is a price to pay,
698 |     though,  in  both  the  speed,   size, and complexity of  the
699 |     compiler.  Looking for all those combinations calls for a lot
700 |     of IF tests, each one of which is a source of error.  And, of
701 |     course, it takes time.
702 | 
703 |      In  the  classical  implementation  of a peephole optimizer,
704 |     it's done as a second pass to the compiler.  The  output code
705 |     is  written  to  disk,  and  then  the  optimizer  reads  and
706 |     processes the disk file again.  As a matter of fact,  you can
707 |     see that the optimizer could  even be a separate PROGRAM from
708 |     the compiler proper.  Since the optimizer only  looks  at the
709 |     code through a  small  "window"  of  instructions  (hence the
710 |     name), a better implementation would be to simply buffer up a
711 |     few lines of output, and scan the buffer after each EmitLn.
712 | 
713 |   o Try to generate better code in the first place
714 |                              
715 |     This approach calls for us to look for  special  cases BEFORE
716 |     we Emit them.  As a trivial example,  we  should  be  able to
717 |     identify a constant zero,  and  Emit a CLR instead of a load,
718 |     or even do nothing at all, as in an add of zero, for example.
719 |     Closer to home, if we had chosen to recognize the unary minus
720 |     in Factor  instead of in Expression, we could treat constants
721 |     like -1 as ordinary constants,  rather  then  generating them
722 |     from  positive  ones.   None of these things are difficult to
723 |     deal with ... they only add extra tests in the code, which is
724 |     why  I  haven't  included them in our program.  The way I see
725 |     it, once we get to the point that we have a working compiler,
726 |     generating useful code  that  executes, we can always go back
727 |     and tweak the thing to tighten up the code produced.   That's
728 |     why there are Release 2.0's in the world.
729 | 
730 | There IS one more type  of  optimization  worth  mentioning, that
731 | seems to promise pretty tight code without too much hassle.  It's
732 | my "invention" in the  sense  that I haven't seen it suggested in
733 | print anywhere, though I have  no  illusions  that  it's original
734 | with me.
735 | 
736 | This  is to avoid such a heavy use of the stack, by making better
737 | use of the CPU registers.  Remember back when we were  doing only
738 | addition  and  subtraction,  that we used registers  D0  and  D1,
739 | rather than the stack?  It worked, because with  only  those  two
740 | operations, the "stack" never needs more than two entries.
741 | 
742 | Well,  the 68000 has eight data registers.  Why not use them as a
743 | privately managed stack?  The key is to recognize  that,  at  any
744 | point in its processing,  the  parser KNOWS how many items are on
745 | the  stack, so it can indeed manage it properly.  We can define a
746 | private "stack pointer" that keeps  track  of  which  stack level
747 | we're at, and addresses the  corresponding  register.   Procedure
748 | Factor,  for  example,  would  not  cause data to be loaded  into
749 | register  D0,  but   into  whatever  the  current  "top-of-stack"
750 | register happened to be.
751 | 
752 | What we're doing in effect is to replace the CPU's RAM stack with
753 | a  locally  managed  stack  made  up  of  registers.    For  most
754 | expressions, the stack level  will  never  exceed eight, so we'll
755 | get pretty good code out.  Of course, we also  have  to deal with
756 | those  odd cases where the stack level  DOES  exceed  eight,  but
757 | that's no problem  either.    We  simply let the stack spill over
758 | into the CPU  stack.    For  levels  beyond eight, the code is no
759 | worse  than  what  we're generating now, and for levels less than
760 | eight, it's considerably better.
761 | 
762 | For the record, I  have  implemented  this  concept, just to make
763 | sure  it  works  before  I  mentioned  it to you.  It does.    In
764 | practice, it turns out that you can't really use all eight levels
765 | ... you need at least one register free to  reverse  the  operand
766 | order for division  (sure  wish  the  68000 had an XTHL, like the
767 | 8080!).  For expressions  that  include  function calls, we would
768 | also need a register reserved for them. Still, there  is  a  nice
769 | improvement in code size for most expressions.
770 | 
771 | So, you see, getting  better  code  isn't  that difficult, but it
772 | does add complexity to the our translator ...  complexity  we can
773 | do without at this point.  For that reason,  I  STRONGLY  suggest
774 | that we continue to ignore efficiency issues for the rest of this
775 | series,  secure  in  the knowledge that we can indeed improve the
776 | code quality without throwing away what we've done.
777 | 
778 | Next lesson, I'll show you how to deal with variables factors and
779 | function calls.  I'll also show you just how easy it is to handle
780 | multicharacter tokens and embedded white space.
781 | 
782 | *****************************************************************
783 | *                                                               *
784 | *                        COPYRIGHT NOTICE                       *
785 | *                                                               *
786 | *   Copyright (C) 1988 Jack W. Crenshaw. All rights reserved.   *
787 | *                                                               *
788 | *****************************************************************
789 |  
790 | 
791 | 
792 | 
793 | 


--------------------------------------------------------------------------------
/reference/crenshaw-txt/tutor12.txt:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | 
  4 | 
  5 | 
  6 | 
  7 | 
  8 | 
  9 | 
 10 | 
 11 | 
 12 | 
 13 | 
 14 | 
 15 | 
 16 | 
 17 | 
 18 | 
 19 | 
 20 | 
 21 | 
 22 | 
 23 | 
 24 | 
 25 | 
 26 | 
 27 | 
 28 |                      LET'S BUILD A COMPILER!
 29 | 
 30 |                                 By
 31 | 
 32 |                      Jack W. Crenshaw, Ph.D.
 33 | 
 34 |                            5 June 1989
 35 | 
 36 | 
 37 |                        Part XII: MISCELLANY
 38 | 
 39 | 
 40 | *****************************************************************
 41 | *                                                               *
 42 | *                        COPYRIGHT NOTICE                       *
 43 | *                                                               *
 44 | *   Copyright (C) 1989 Jack W. Crenshaw. All rights reserved.   *
 45 | *                                                               *
 46 | *****************************************************************
 47 | 
 48 | 
 49 | INTRODUCTION
 50 | 
 51 | This installment is another one  of  those  excursions  into side
 52 | alleys  that  don't  seem to fit  into  the  mainstream  of  this
 53 | tutorial  series.    As I mentioned last time, it was while I was
 54 | writing this installment that I realized some changes  had  to be
 55 | made  to  the  compiler structure.  So I had to digress from this
 56 | digression long enough to develop the new structure  and  show it
 57 | to you.
 58 | 
 59 | Now that that's behind us, I can tell you what I  set  out  to in
 60 | the first place.  This shouldn't  take  long, and then we can get
 61 | back into the mainstream.
 62 | 
 63 | Several people have asked  me  about  things that other languages
 64 | provide, but so far I haven't addressed in this series.   The two
 65 | biggies are semicolons and  comments.    Perhaps  you've wondered
 66 | about them, too, and  wondered  how things would change if we had
 67 | to  deal with them.  Just so you can proceed with what's to come,
 68 | without being  bothered by that nagging feeling that something is
 69 | missing, we'll address such issues here.
 70 | 
 71 | 
 72 | SEMICOLONS
 73 | 
 74 | Ever since the introduction of Algol, semicolons have been a part
 75 | of  almost every modern language.  We've all  used  them  to  the
 76 | point that they are taken for  granted.   Yet I suspect that more
 77 | compilation errors have  occurred  due  to  misplaced  or missing
 78 | semicolons  than  any  other single cause.  And if we had a penny
 79 | for  every  extra  keystroke programmers have used  to  type  the
 80 | little rascals, we could pay off the national debt.
 81 | 
 82 | Having  been  brought  up with FORTRAN, it took me a long time to
 83 | get used to using semicolons, and to tell the  truth  I've  never
 84 | quite understood why they  were  necessary.    Since I program in
 85 | Pascal, and since the use of semicolons in Pascal is particularly
 86 | tricky,  that one little character is still  by  far  my  biggest
 87 | source of errors.
 88 | 
 89 | When  I  began  developing  KISS,  I resolved to  question  EVERY
 90 | construct in other languages, and to try to avoid the most common
 91 | problems that occur with them.  That puts the semicolon very high
 92 | on my hit list.
 93 | 
 94 | To  understand  the  role of the semicolon, you have to look at a
 95 | little history.
 96 | 
 97 | Early programming languages were line-oriented.  In  FORTRAN, for
 98 | example, various parts  of  the statement had specific columns or
 99 | fields that they had to appear in.  Since  some  statements  were
100 | too  long for one line, the  "continuation  card"  mechanism  was
101 | provided to let  the  compiler  know  that a given card was still
102 | part of the previous  line.   The mechanism survives to this day,
103 | even though punched cards are now things of the distant past.
104 | 
105 | When  other  languages  came  along,  they  also  adopted various
106 | mechanisms for dealing with multiple-line statements.  BASIC is a
107 | good  example.  It's important to  recognize,  though,  that  the
108 | FORTRAN  mechanism  was   not   so  much  required  by  the  line
109 | orientation of that  language,  as by the column-orientation.  In
110 | those versions of FORTRAN  where  free-form  input  is permitted,
111 | it's no longer needed.
112 | 
113 | When the fathers  of  Algol introduced that language, they wanted
114 | to get away  from  line-oriented programs like FORTRAN and BASIC,
115 | and allow for free-form input.   This included the possibility of
116 | stringing multiple statements on a single line, as in
117 | 
118 | 
119 |      a=b; c=d; e=e+1;
120 | 
121 | 
122 | In cases like this,  the  semicolon is almost REQUIRED.  The same
123 | line, without the semicolons, just looks "funny":
124 | 
125 | 
126 |      a=b c= d e=e+1
127 | 
128 | I suspect that this is the major ... perhaps ONLY ...  reason for
129 | semicolons: to keep programs from looking funny.
130 | 
131 | But  the  idea  of stringing multiple statements  together  on  a
132 | single  line  is  a  dubious  one  at  best.  It's not very  good
133 | programming  style,  and  harks back to  the  days  when  it  was
134 | considered improtant to conserve cards.  In these  days  of CRT's
135 | and indented code, the clarity of programs is  far  better served
136 | by  keeping statements separate.  It's still  nice  to  have  the
137 | OPTION  of  multiple  statements,  but  it seems a shame to  keep
138 | programmers  in  slavery  to the semicolon, just to keep that one
139 | rare case from "looking funny."
140 | 
141 | When I started in with KISS, I tried  to  keep  an  open mind.  I
142 | decided that I would use  semicolons when it became necessary for
143 | the parser, but not until then.  I figured this would happen just
144 | about  the time I added the ability  to  spread  statements  over
145 | multiple lines.  But, as you  can  see, that never happened.  The
146 | TINY compiler is perfectly  happy  to  parse the most complicated
147 | statement, spread over any number of lines, without semicolons.
148 | 
149 | Still, there are people  who  have  used  semicolons for so long,
150 | they feel naked  without them.  I'm one of them.  Once I had KISS
151 | defined sufficiently well, I began to write a few sample programs
152 | in the language.    I  discovered,  somewhat to my horror, that I
153 | kept  putting  semicolons  in anyway.   So  now  I'm  facing  the
154 | prospect of a NEW  rash  of  compiler  errors, caused by UNWANTED
155 | semicolons.  Phooey!
156 | 
157 | Perhaps more to the point, there are readers out  there  who  are
158 | designing their own languages, which may  include  semicolons, or
159 | who  want to use the techniques of  these  tutorials  to  compile
160 | conventional languages like  C.    In  either case, we need to be
161 | able to deal with semicolons.
162 | 
163 | 
164 | SYNTACTIC SUGAR
165 | 
166 | This whole discussion brings  up  the  issue of "syntactic sugar"
167 | ... constructs that are added to a language, not because they are
168 | needed, but because they help make the programs look right to the
169 | programmer.    After  all, it's nice  to  have  a  small,  simple
170 | compiler,    but  it  would  be  of  little  use if the resulting
171 | language  were  cryptic  and hard to program.  The language FORTH
172 | comes  to mind (a premature OUCH! for the  barrage  I  know  that
173 | one's going to fetch me).  If we can add features to the language
174 | that  make the programs easier to read  and  understand,  and  if
175 | those features  help keep the programmer from making errors, then
176 | we should do so.    Particularly if the constructs don't add much
177 | to the complexity of the language or its compiler.
178 | 
179 | The  semicolon  could  be considered an example,  but  there  are
180 | plenty of others, such as the 'THEN' in a IF-statement,  the 'DO'
181 | in a WHILE-statement,  and  even the 'PROGRAM' statement, which I
182 | came within a gnat's eyelash of leaving out  of  TINY.    None of
183 | these tokens  add  much  to  the  syntax  of the language ... the
184 | compiler can figure out  what's  going on without them.  But some
185 | folks feel that they  DO  add to the readability of programs, and
186 | that can be very important.
187 | 
188 | There are two schools of thought on this subject, which  are well
189 | represented by two of our most popular languages, C and Pascal.
190 | 
191 | To  the minimalists, all such sugar should be  left  out.    They
192 | argue that it clutters up the language and adds to the  number of
193 | keystrokes  programmers  must type.   Perhaps  more  importantly,
194 | every extra token or keyword represents a trap laying in wait for
195 | the inattentive programmer.  If you leave out  a  token, misplace
196 | it, or misspell it, the compiler  will  get you.  So these people
197 | argue that the best approach is to get rid of such things.  These
198 | folks tend to like C, which has a minimum of unnecessary keywords
199 | and punctuation.
200 | 
201 | Those from the other school tend to like Pascal.  They argue that
202 | having to type a few extra characters is a small price to pay for
203 | legibility.    After  all, humans have to read the programs, too.
204 | Their best argument is that each such construct is an opportunity
205 | to tell the compiler that you really mean for it  to  do what you
206 | said to.  The sugary tokens serve as useful landmarks to help you
207 | find your way.
208 | 
209 | The differences are well represented by the two  languages.   The
210 | most oft-heard complaint about  C  is  that  it is too forgiving.
211 | When you make a mistake in C, the  erroneous  code  is  too often
212 | another  legal  C  construct.    So  the  compiler  just  happily
213 | continues to compile, and  leaves  you  to  find the error during
214 | debug.    I guess that's why debuggers  are  so  popular  with  C
215 | programmers.
216 | 
217 | On the  other  hand,  if  a  Pascal  program compiles, you can be
218 | pretty  sure that the program will do what you told it.  If there
219 | is an error at run time, it's probably a design error.
220 | 
221 | The  best  example  of  useful  sugar  is  the semicolon  itself.
222 | Consider the code fragment:
223 | 
224 | 
225 |      a=1+(2*b+c)   b...
226 | 
227 | 
228 | Since there is no operator connecting the token 'b' with the rest
229 | of the  statement, the compiler will conclude that the expression
230 | ends  with  the  ')', and the 'b'  is  the  beginning  of  a  new
231 | statement.    But  suppose  I  have simply left out the  intended
232 | operator, and I really want to say:
233 | 
234 | 
235 |      a=1+(2*b+c)*b...
236 | 
237 | 
238 | In  this  case  the compiler will get an error, all right, but it
239 | won't be very meaningful  since  it will be expecting an '=' sign
240 | after the 'b' that really shouldn't be there.
241 | 
242 | If, on the other hand, I include a semicolon after the  'b', THEN
243 | there  can  be no doubt where I  intend  the  statement  to  end.
244 | Syntactic  sugar,  then,  can  serve  a  very  useful purpose  by
245 | providing some additional insurance that we remain on track.
246 | 
247 | I find  myself  somewhere  in  the middle of all this.  I tend to
248 | favor the Pascal-ers' view ... I'd much rather find  my  bugs  at
249 | compile time rather than run time.  But I also hate to just throw
250 | verbosity  in  for  no apparent reason, as in COBOL.  So far I've
251 | consistently left most of the Pascal sugar out of KISS/TINY.  But
252 | I certainly have no strong feelings either way, and  I  also  can
253 | see the value of sprinkling a little sugar around  just  for  the
254 | extra  insurance  that  it  brings.    If  you like  this  latter
255 | approach, things like that are easy to add.  Just  remember that,
256 | like  the semicolon, each item of sugar  is  something  that  can
257 | potentially cause a compile error by its omission.
258 | 
259 | 
260 | DEALING WITH SEMICOLONS
261 | 
262 | There  are  two  distinct  ways  in which semicolons are used  in
263 | popular  languages.    In Pascal, the semicolon is regarded as an
264 | statement SEPARATOR.  No semicolon  is  required  after  the last
265 | statement in a block.  The syntax is:
266 | 
267 | 
268 |      <block> ::= <statement> ( ';' <statement>)*
269 | 
270 |      <statement> ::= <assignment> | <if> | <while> ... | null
271 | 
272 | 
273 | (The null statement is IMPORTANT!)
274 | 
275 | Pascal  also defines some semicolons in  other  places,  such  as
276 | after the PROGRAM statement.
277 | 
278 | In  C  and  Ada, on the other hand, the semicolon is considered a
279 | statement TERMINATOR,  and  follows  all  statements  (with  some
280 | embarrassing and confusing  exceptions).   The syntax for this is
281 | simply:
282 | 
283 | 
284 |      <block> ::= ( <statement> ';')*
285 | 
286 | 
287 | Of  the two syntaxes, the Pascal one seems on the face of it more
288 | rational, but experience has shown  that it leads to some strange
289 | difficulties.  People get  so  used  to  typing a semicolon after
290 | every  statement  that  they tend to  type  one  after  the  last
291 | statement in a block, also.  That usually doesn't cause  any harm
292 | ...  it  just gets treated as a  null  statement.    Many  Pascal
293 | programmers, including yours truly,  do  just  that. But there is
294 | one  place you absolutely CANNOT type  a  semicolon,  and  that's
295 | right before an ELSE.  This little gotcha  has  cost  me  many an
296 | extra  compilation,  particularly  when  the  ELSE  is  added  to
297 | existing code.    So  the  C/Ada  choice  turns out to be better.
298 | Apparently Nicklaus Wirth thinks so, too:  In his  Modula  2,  he
299 | abandoned the Pascal approach.
300 | 
301 | Given either of these two syntaxes, it's an easy matter (now that
302 | we've  reorganized  the  parser!) to add these  features  to  our
303 | parser.  Let's take the last case first, since it's simpler.
304 | 
305 | To begin, I've made things easy by introducing a new recognizer:
306 | 
307 | 
308 | {--------------------------------------------------------------}
309 | { Match a Semicolon }
310 | 
311 | procedure Semi;
312 | begin
313 |    MatchString(';');
314 | end;
315 | {--------------------------------------------------------------}
316 | 
317 | 
318 | This procedure works very much like our old Match.  It insists on
319 | finding a semicolon as the next token.  Having found it, it skips
320 | to the next one.
321 | 
322 | Since a  semicolon follows a statement, procedure Block is almost
323 | the only one we need to change:
324 | 
325 | 
326 | {--------------------------------------------------------------}
327 | { Parse and Translate a Block of Statements }
328 | 
329 | procedure Block;
330 | begin
331 |    Scan;
332 |    while not(Token in ['e', 'l']) do begin
333 |       case Token of
334 |        'i': DoIf;
335 |        'w': DoWhile;
336 |        'R': DoRead;
337 |        'W': DoWrite;
338 |        'x': Assignment;
339 |       end;
340 |       Semi;
341 |       Scan;
342 |    end;
343 | end;
344 | {--------------------------------------------------------------}
345 | 
346 | 
347 | Note carefully the subtle change in the case statement.  The call
348 | to  Assignment  is now guarded by a test on Token.   This  is  to
349 | avoid calling Assignment when the  token  is  a  semicolon (which
350 | could happen if the statement is null).
351 | 
352 | Since declarations are also  statements,  we  also  need to add a
353 | call to Semi within procedure TopDecls:
354 | 
355 | 
356 | {--------------------------------------------------------------}
357 | { Parse and Translate Global Declarations }
358 | 
359 | procedure TopDecls;
360 | begin
361 |    Scan;
362 |    while Token = 'v' do begin
363 |       Alloc;
364 |       while Token = ',' do
365 |          Alloc;
366 |       Semi;
367 |    end;
368 | end;
369 | {--------------------------------------------------------------}
370 | 
371 | 
372 | Finally, we need one for the PROGRAM statement:
373 | 
374 | 
375 | {--------------------------------------------------------------}
376 | { Main Program }
377 | 
378 | begin
379 |    Init;
380 |    MatchString('PROGRAM');
381 |    Semi;
382 |    Header;
383 |    TopDecls;
384 |    MatchString('BEGIN');
385 |    Prolog;
386 |    Block;
387 |    MatchString('END');
388 |    Epilog;
389 | end.
390 | {--------------------------------------------------------------}
391 | 
392 | 
393 | It's as easy as that.  Try it with a copy of TINY and see how you
394 | like it.
395 | 
396 | The Pascal version  is  a  little  trickier,  but  it  still only
397 | requires  minor  changes,  and those only to procedure Block.  To
398 | keep things as simple as possible, let's split the procedure into
399 | two parts.  The following procedure handles just one statement:
400 | 
401 | 
402 | {--------------------------------------------------------------}
403 | { Parse and Translate a Single Statement }
404 | 
405 | procedure Statement;
406 | begin
407 |    Scan;
408 |    case Token of
409 |     'i': DoIf;
410 |     'w': DoWhile;
411 |     'R': DoRead;
412 |     'W': DoWrite;
413 |     'x': Assignment;
414 |    end;
415 | end;
416 | {--------------------------------------------------------------}
417 | 
418 | 
419 | Using this procedure, we can now rewrite Block like this:
420 | 
421 | 
422 | {--------------------------------------------------------------}
423 | { Parse and Translate a Block of Statements }
424 | 
425 | procedure Block;
426 | begin
427 |    Statement;
428 |    while Token = ';' do begin
429 |       Next;
430 |       Statement;
431 |    end;
432 | end;
433 | {--------------------------------------------------------------}
434 | 
435 | 
436 | That  sure  didn't  hurt, did it?  We can now parse semicolons in
437 | Pascal-like fashion.
438 | 
439 | 
440 | A COMPROMISE
441 | 
442 | Now that we know how to deal with semicolons, does that mean that
443 | I'm going to put them in KISS/TINY?  Well, yes and  no.    I like
444 | the extra sugar and the security that comes with knowing for sure
445 | where the  ends  of  statements  are.    But I haven't changed my
446 | dislike for the compilation errors associated with semicolons.
447 | 
448 | So I have what I think is a nice compromise: Make them OPTIONAL!
449 | 
450 | Consider the following version of Semi:
451 | 
452 | 
453 | {--------------------------------------------------------------}
454 | { Match a Semicolon }
455 | 
456 | procedure Semi;
457 | begin
458 |    if Token = ';' then Next;
459 | end;
460 | {--------------------------------------------------------------}
461 | 
462 | 
463 | This procedure will ACCEPT a semicolon whenever it is called, but
464 | it won't INSIST on one.  That means that when  you  choose to use
465 | semicolons, the compiler  will  use the extra information to help
466 | keep itself on track.  But if you omit one (or omit them all) the
467 | compiler won't complain.  The best of both worlds.
468 | 
469 | Put this procedure in place in the first version of  your program
470 | (the  one for C/Ada syntax), and you have  the  makings  of  TINY
471 | Version 1.2.
472 | 
473 | 
474 | COMMENTS
475 | 
476 | Up  until  now  I have carefully avoided the subject of comments.
477 | You would think that this would be an easy subject ... after all,
478 | the compiler doesn't have to deal with comments at all; it should
479 | just ignore them.  Well, sometimes that's true.
480 | 
481 | Comments can be just about as easy or as difficult as  you choose
482 | to make them.    At  one  extreme,  we can arrange things so that
483 | comments  are  intercepted  almost  the  instant  they  enter the
484 | compiler.  At the  other,  we can treat them as lexical elements.
485 | Things  tend to get interesting when  you  consider  things  like
486 | comment delimiters contained in quoted strings.
487 | 
488 | 
489 | SINGLE-CHARACTER DELIMITERS
490 | 
491 | Here's an example.  Suppose we assume the  Turbo  Pascal standard
492 | and use curly braces for comments.  In this case we  have single-
493 | character delimiters, so our parsing is a little easier.
494 | 
495 | One  approach  is  to  strip  the  comments  out the  instant  we
496 | encounter them in the input stream; that is,  right  in procedure
497 | GetChar.    To  do  this,  first  change  the  name of GetChar to
498 | something else, say GetCharX.  (For the record, this is  going to
499 | be a TEMPORARY change, so best not do this with your only copy of
500 | TINY.  I assume you understand that you should  always  do  these
501 | experiments with a working copy.)
502 | 
503 | Now, we're going to need a  procedure  to skip over comments.  So
504 | key in the following one:
505 | 
506 | 
507 | {--------------------------------------------------------------}
508 | { Skip A Comment Field }
509 | 
510 | procedure SkipComment;
511 | begin
512 |    while Look <> '}' do
513 |       GetCharX;
514 |    GetCharX;
515 | end;
516 | {--------------------------------------------------------------}
517 | 
518 | 
519 | Clearly, what this procedure is going to do is to simply read and
520 | discard characters from the input  stream, until it finds a right
521 | curly brace.  Then it reads one more character and returns  it in
522 | Look.
523 | 
524 | Now we can  write  a  new  version of GetChar that SkipComment to
525 | strip out comments:
526 | 
527 | 
528 | {--------------------------------------------------------------}
529 | { Get Character from Input Stream }
530 | { Skip Any Comments }
531 | 
532 | procedure GetChar;
533 | begin
534 |    GetCharX;
535 |    if Look = '{' then SkipComment;
536 | end;
537 | {--------------------------------------------------------------}
538 | 
539 | 
540 | Code this up  and  give  it  a  try.    You'll find that you can,
541 | indeed, bury comments anywhere you like.  The comments never even
542 | get into the parser proper ... every call to GetChar just returns
543 | any character that's NOT part of a comment.
544 | 
545 | As a matter of fact, while  this  approach gets the job done, and
546 | may even be  perfectly  satisfactory  for  you, it does its job a
547 | little  TOO  well.    First  of all, most  programming  languages
548 | specify that a comment should be treated like a  space,  so  that
549 | comments aren't allowed  to  be embedded in, say, variable names.
550 | This current version doesn't care WHERE you put comments.
551 | 
552 | Second, since the  rest  of  the  parser can't even receive a '{'
553 | character, you will not be allowed to put one in a quoted string.
554 | 
555 | Before you turn up your nose at this simplistic solution, though,
556 | I should point out  that  as respected a compiler as Turbo Pascal
557 | also won't allow  a  '{' in a quoted string.  Try it.  And as for
558 | embedding a comment in an  identifier, I can't imagine why anyone
559 | would want to do such a  thing,  anyway, so the question is moot.
560 | For 99% of all  applications,  what I've just shown you will work
561 | just fine.
562 | 
563 | But,  if  you  want  to  be  picky  about it  and  stick  to  the
564 | conventional treatment, then we  need  to  move  the interception
565 | point downstream a little further.
566 | 
567 | To  do  this,  first change GetChar back to the way  it  was  and
568 | change the name called in SkipComment.  Then, let's add  the left
569 | brace as a possible whitespace character:
570 | 
571 | 
572 | {--------------------------------------------------------------}
573 | { Recognize White Space }
574 | 
575 | function IsWhite(c: char): boolean;
576 | begin
577 |    IsWhite := c in [' ', TAB, CR, LF, '{'];
578 | end;
579 | {--------------------------------------------------------------}
580 | 
581 | 
582 | Now, we can deal with comments in procedure SkipWhite:
583 | 
584 | 
585 | {--------------------------------------------------------------}
586 | { Skip Over Leading White Space }
587 | 
588 | procedure SkipWhite;
589 | begin
590 |    while IsWhite(Look) do begin
591 |       if Look = '{' then
592 |          SkipComment
593 |       else
594 |          GetChar;
595 |    end;
596 | end;
597 | {--------------------------------------------------------------}
598 | 
599 | 
600 | Note  that SkipWhite is written so that we  will  skip  over  any
601 | combination of whitespace characters and comments, in one call.
602 | 
603 | OK, give this one a try, too.   You'll  find  that  it will let a
604 | comment serve to delimit tokens.  It's worth mentioning that this
605 | approach also gives us the  ability to handle curly braces within
606 | quoted strings, since within such  strings we will not be testing
607 | for or skipping over whitespace.
608 | 
609 | There's one last  item  to  deal  with:  Nested  comments.   Some
610 | programmers like the idea  of  nesting  comments, since it allows
611 | you to comment out code during debugging.  The  code  I've  given
612 | here won't allow that and, again, neither will Turbo Pascal.
613 | 
614 | But the fix is incredibly easy.  All  we  need  to  do is to make
615 | SkipComment recursive:
616 | 
617 | 
618 | {--------------------------------------------------------------}
619 | { Skip A Comment Field }
620 | 
621 | procedure SkipComment;
622 | begin
623 |    while Look <> '}' do begin
624 |       GetChar;
625 |       if Look = '{' then SkipComment;
626 |    end;
627 |    GetChar;
628 | end;
629 | {--------------------------------------------------------------}
630 | 
631 | 
632 | That does it.  As  sophisticated a comment-handler as you'll ever
633 | need.
634 | 
635 | 
636 | MULTI-CHARACTER DELIMITERS
637 | 
638 | That's all well and  good  for cases where a comment is delimited
639 | by single  characters,  but  what  about  the  cases such as C or
640 | standard Pascal, where two  characters  are  required?  Well, the
641 | principles are still the same, but we have to change our approach
642 | quite a bit.  I'm sure it won't surprise you to learn that things
643 | get harder in this case.
644 | 
645 | For the multi-character situation, the  easiest thing to do is to
646 | intercept the left delimiter  back  at the GetChar stage.  We can
647 | "tokenize" it right there, replacing it by a single character.
648 | 
649 | Let's assume we're using the C delimiters '/*' and '*/'.   First,
650 | we  need  to  go back to the "GetCharX' approach.  In yet another
651 | copy of your compiler, rename  GetChar to GetCharX and then enter
652 | the following new procedure GetChar:
653 | 
654 | 
655 | {--------------------------------------------------------------}
656 | { Read New Character.  Intercept '/*' }
657 | 
658 | procedure GetChar;
659 | begin
660 |    if TempChar <> ' ' then begin
661 |       Look := TempChar;
662 |       TempChar := ' ';
663 |       end
664 |    else begin
665 |       GetCharX;
666 |       if Look = '/' then begin
667 |          Read(TempChar);
668 |          if TempChar = '*' then begin
669 |             Look := '{';
670 |             TempChar := ' ';
671 |          end;
672 |       end;
673 |    end;
674 | end;
675 | {--------------------------------------------------------------}
676 | 
677 | 
678 | As you can see, what this procedure does is  to  intercept  every
679 | occurrence of '/'.  It then examines the NEXT  character  in  the
680 | stream.  If the character  is  a  '*',  then  we  have  found the
681 | beginning  of  a  comment,  and  GetChar  will  return  a  single
682 | character replacement for it.   (For  simplicity,  I'm  using the
683 | same '{' character  as I did for Pascal.  If you were writing a C
684 | compiler, you'd no doubt want to pick some other character that's
685 | not  used  elsewhere  in C.  Pick anything you like ... even $FF,
686 | anything that's unique.)
687 | 
688 | If the character  following  the  '/'  is NOT a '*', then GetChar
689 | tucks it away in the new global TempChar, and  returns  the  '/'.
690 | 
691 | Note that you need to declare this new variable and initialize it
692 | to ' '.  I like to do  things  like  that  using the Turbo "typed
693 | constant" construct:
694 | 
695 | 
696 |      const TempChar: char = ' ';
697 | 
698 | 
699 | Now we need a new version of SkipComment:
700 | 
701 | 
702 | {--------------------------------------------------------------}
703 | { Skip A Comment Field }
704 | 
705 | procedure SkipComment;
706 | begin
707 |    repeat
708 |       repeat
709 |          GetCharX;
710 |       until Look = '*';
711 |       GetCharX;
712 |    until Look = '/';
713 |    GetChar;
714 | end;
715 | {--------------------------------------------------------------}
716 | 
717 | 
718 | A  few  things  to  note:  first  of  all, function  IsWhite  and
719 | procedure SkipWhite  don't  need  to  be  changed,  since GetChar
720 | returns the '{' token.  If you change that token  character, then
721 | of  course you also need to change the  character  in  those  two
722 | routines.
723 | 
724 | Second, note that  SkipComment  doesn't call GetChar in its loop,
725 | but  GetCharX.    That  means   that  the  trailing  '/'  is  not
726 | intercepted and  is seen by SkipComment.  Third, although GetChar
727 | is the  procedure  doing  the  work,  we  can still deal with the
728 | comment  characters  embedded  in  a  quoted  string,  by calling
729 | GetCharX  instead  of  GetChar  while  we're  within  the string.
730 | Finally,  note  that  we can again provide for nested comments by
731 | adding a single statement to SkipComment, just as we did before.
732 | 
733 | 
734 | ONE-SIDED COMMENTS
735 | 
736 | So far I've shown you  how  to  deal  with  any  kind  of comment
737 | delimited on the left and the  right.   That only leaves the one-
738 | sided comments like those in assembler language or  in  Ada, that
739 | are terminated by the end of the line.  In a  way,  that  case is
740 | easier.   The only procedure that would need  to  be  changed  is
741 | SkipComment, which must now terminate at the newline characters:
742 | 
743 | 
744 | {--------------------------------------------------------------}
745 | { Skip A Comment Field }
746 | 
747 | procedure SkipComment;
748 | begin
749 |    repeat
750 |       GetCharX;
751 |    until Look = CR;
752 |    GetChar;
753 | end;
754 | {--------------------------------------------------------------}
755 | 
756 | 
757 | If the leading character is  a  single  one,  as  in  the  ';' of
758 | assembly language, then we're essentially done.  If  it's  a two-
759 | character token, as in the '--'  of  Ada, we need only modify the
760 | tests  within  GetChar.   Either way, it's an easier problem than
761 | the balanced case.
762 | 
763 | 
764 | CONCLUSION
765 | 
766 | At this point we now have the ability to deal with  both comments
767 | and semicolons, as well as other kinds of syntactic sugar.   I've
768 | shown  you several ways to deal with  each,  depending  upon  the
769 | convention  desired.    The  only  issue left is: which of  these
770 | conventions should we use in KISS/TINY?
771 | 
772 | For the reasons that I've given as we went  along,  I'm  choosing
773 | the following:
774 | 
775 | 
776 |  (1) Semicolons are TERMINATORS, not separators
777 | 
778 |  (2) Semicolons are OPTIONAL
779 | 
780 |  (3) Comments are delimited by curly braces
781 | 
782 |  (4) Comments MAY be nested
783 | 
784 | 
785 | Put the code corresponding to these cases into your copy of TINY.
786 | You now have TINY Version 1.2.
787 | 
788 | Now that we  have  disposed  of  these  sideline  issues,  we can
789 | finally get back into the mainstream.  In  the  next installment,
790 | we'll talk  about procedures and parameter passing, and we'll add
791 | these important features to TINY.  See you then.
792 | 
793 | 
794 | *****************************************************************
795 | *                                                               *
796 | *                        COPYRIGHT NOTICE                       *
797 | *                                                               *
798 | *   Copyright (C) 1989 Jack W. Crenshaw. All rights reserved.   *
799 | *                                                               *
800 | *****************************************************************
801 | 
802 | 


--------------------------------------------------------------------------------
/reference/crenshaw-txt/tutor3.txt:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | 
  4 | 
  5 | 
  6 | 
  7 | 
  8 | 
  9 | 
 10 | 
 11 | 
 12 | 
 13 | 
 14 | 
 15 | 
 16 | 
 17 | 
 18 | 
 19 | 
 20 | 
 21 | 
 22 | 
 23 | 
 24 | 
 25 | 
 26 | 
 27 | 
 28 | 
 29 |                      LET'S BUILD A COMPILER!
 30 | 
 31 |                                 By
 32 | 
 33 |                      Jack W. Crenshaw, Ph.D.
 34 | 
 35 |                             4 Aug 1988
 36 | 
 37 | 
 38 |                     Part III: MORE EXPRESSIONS
 39 | 
 40 | 
 41 | *****************************************************************
 42 | *                                                               *
 43 | *                        COPYRIGHT NOTICE                       *
 44 | *                                                               *
 45 | *   Copyright (C) 1988 Jack W. Crenshaw. All rights reserved.   *
 46 | *                                                               *
 47 | *****************************************************************
 48 | 
 49 | 
 50 | INTRODUCTION
 51 | 
 52 | In the last installment, we examined the techniques used to parse
 53 | and  translate a general math expression.  We  ended  up  with  a
 54 | simple parser that  could handle arbitrarily complex expressions,
 55 | with two restrictions:
 56 | 
 57 |   o No variables were allowed, only numeric factors
 58 | 
 59 |   o The numeric factors were limited to single digits
 60 | 
 61 | In this installment, we'll get  rid of those restrictions.  We'll
 62 | also extend what  we've  done  to  include  assignment statements
 63 | function  calls  and.    Remember,   though,   that   the  second
 64 | restriction was  mainly self-imposed  ... a choice of convenience
 65 | on our part, to make life easier and to let us concentrate on the
 66 | fundamental concepts.    As  you'll  see  in  a bit, it's an easy
 67 | restriction to get rid of, so don't get  too  hung  up  about it.
 68 | We'll use the trick when it serves us to do so, confident that we
 69 | can discard it when we're ready to.
 70 | 
 71 | 
 72 | VARIABLES
 73 | 
 74 | Most expressions  that we see in practice involve variables, such
 75 | as
 76 | 
 77 |                b * b + 4 * a * c
 78 | 
 79 | No  parser is much good without being able  to  deal  with  them.
 80 | Fortunately, it's also quite easy to do.
 81 | 
 82 | Remember that in our parser as it currently stands, there are two
 83 | kinds of  factors  allowed:  integer  constants  and  expressions
 84 | within parentheses.  In BNF notation,
 85 | 
 86 |      <factor> ::= <number> | (<expression>)
 87 | 
 88 | The '|' stands for "or", meaning of course that either form  is a
 89 | legal form for a factor.   Remember,  too, that we had no trouble
 90 | knowing which was which  ...  the  lookahead  character is a left
 91 | paren '(' in one case, and a digit in the other.
 92 |                               
 93 | It probably won't come as too much of a surprise that  a variable
 94 | is just another kind of factor.    So  we extend the BNF above to
 95 | read:
 96 | 
 97 | 
 98 |      <factor> ::= <number> | (<expression>) | <variable>
 99 | 
100 | 
101 | Again, there is no  ambiguity:  if  the  lookahead character is a
102 | letter,  we  have  a variable; if a digit, we have a number. Back
103 | when we translated the number, we just issued code  to  load  the
104 | number,  as immediate data, into D0.  Now we do the same, only we
105 | load a variable.
106 | 
107 | A minor complication in the  code generation arises from the fact
108 | that most  68000 operating systems, including the SK*DOS that I'm
109 | using, require the code to be  written  in "position-independent"
110 | form, which  basically means that everything is PC-relative.  The
111 | format for a load in this language is
112 | 
113 |                MOVE X(PC),D0
114 | 
115 | where X is, of course, the variable name.  Armed with that, let's
116 | modify the current version of Factor to read:
117 | 
118 | 
119 | {---------------------------------------------------------------}
120 | { Parse and Translate a Math Factor }
121 | 
122 | procedure Expression; Forward;
123 | 
124 | procedure Factor;
125 | begin
126 |    if Look = '(' then begin
127 |       Match('(');
128 |       Expression;
129 |       Match(')');
130 |       end
131 |    else if IsAlpha(Look) then
132 |       EmitLn('MOVE ' + GetName + '(PC),D0')
133 |    else
134 |       EmitLn('MOVE #' + GetNum + ',D0');
135 | end;
136 | {--------------------------------------------------------------}
137 | 
138 | 
139 | I've  remarked before how easy it is to  add  extensions  to  the
140 | parser, because of  the  way  it's  structured.  You can see that
141 | this  still  holds true here.  This time it cost us  all  of  two
142 | extra lines of code.  Notice, too, how the if-else-else structure
143 | exactly parallels the BNF syntax equation.
144 | 
145 | OK, compile and test this new version of the parser.  That didn't
146 | hurt too badly, did it?
147 |                               
148 | 
149 | FUNCTIONS
150 | 
151 | There is only one  other  common kind of factor supported by most
152 | languages: the function call.  It's really too early  for  us  to
153 | deal with functions well,  because  we  haven't yet addressed the
154 | issue of parameter passing.  What's more, a "real" language would
155 | include a mechanism to  support  more than one type, one of which
156 | should be a function type.  We haven't gotten there  yet, either.
157 | But I'd still like to deal with functions  now  for  a  couple of
158 | reasons.    First,  it  lets  us  finally  wrap  up the parser in
159 | something very close to its final form, and second, it  brings up
160 | a new issue which is very much worth talking about.
161 | 
162 | Up  till  now,  we've  been  able  to  write  what  is  called  a
163 | "predictive parser."  That  means  that at any point, we can know
164 | by looking at the current  lookahead character exactly what to do
165 | next.  That isn't the case when we add functions.  Every language
166 | has some naming rules  for  what  constitutes a legal identifier.
167 | For the present, ours is simply that it  is  one  of  the letters
168 | 'a'..'z'.  The  problem  is  that  a variable name and a function
169 | name obey  the  same  rules.   So how can we tell which is which?
170 | One way is to require that they each be declared before  they are
171 | used.    Pascal  takes that approach.  The other is that we might
172 | require a function to be followed by a (possibly empty) parameter
173 | list.  That's the rule used in C.
174 | 
175 | Since  we  don't  yet have a mechanism for declaring types, let's
176 | use the C  rule for now.  Since we also don't have a mechanism to
177 | deal  with parameters, we can only handle  empty  lists,  so  our
178 | function calls will have the form
179 | 
180 |                     x()  .
181 | 
182 | Since  we're  not  dealing  with  parameter lists yet,  there  is
183 | nothing  to do but to call the function, so we need only to issue
184 | a BSR (call) instead of a MOVE.
185 | 
186 | Now that there are two  possibilities for the "If IsAlpha" branch
187 | of the test in Factor, let's treat them in a  separate procedure.
188 | Modify Factor to read:
189 | 
190 | 
191 | {---------------------------------------------------------------}
192 | { Parse and Translate a Math Factor }
193 | 
194 | procedure Expression; Forward;
195 | 
196 | procedure Factor;
197 | begin
198 |    if Look = '(' then begin
199 |       Match('(');
200 |       Expression;
201 |       Match(')');
202 |       end
203 |    else if IsAlpha(Look) then
204 |       Ident
205 |    else
206 |       EmitLn('MOVE #' + GetNum + ',D0');
207 | end;
208 | {--------------------------------------------------------------}
209 | 
210 | 
211 | and insert before it the new procedure
212 | 
213 | 
214 | {---------------------------------------------------------------}
215 | { Parse and Translate an Identifier }
216 | 
217 | procedure Ident;
218 | var Name: char;
219 | begin
220 |    Name := GetName;
221 |    if Look = '(' then begin
222 |       Match('(');
223 |       Match(')');
224 |       EmitLn('BSR ' + Name);
225 |       end
226 |    else
227 |       EmitLn('MOVE ' + Name + '(PC),D0')
228 | end;
229 | {---------------------------------------------------------------}
230 | 
231 | 
232 | OK, compile and  test  this  version.  Does  it  parse  all legal
233 | expressions?  Does it correctly flag badly formed ones?
234 | 
235 | The important thing to notice is that even though  we  no  longer
236 | have  a predictive parser, there is  little  or  no  complication
237 | added with the recursive descent approach that we're  using.   At
238 | the point where  Factor  finds an identifier (letter), it doesn't
239 | know whether it's a variable name or a function name, nor does it
240 | really care.  It simply passes it on to Ident and leaves it up to
241 | that procedure to figure it out.  Ident, in  turn,  simply  tucks
242 | away the identifier and then reads one more  character  to decide
243 | which kind of identifier it's dealing with.
244 | 
245 | Keep this approach in mind.  It's a very powerful concept, and it
246 | should be used  whenever  you  encounter  an  ambiguous situation
247 | requiring further lookahead.   Even  if  you  had to look several
248 | tokens ahead, the principle would still work.
249 | 
250 | 
251 | MORE ON ERROR HANDLING
252 | 
253 | As long as we're talking  philosophy,  there's  another important
254 | issue to point out:  error  handling.    Notice that although the
255 | parser correctly rejects (almost)  every malformed  expression we
256 | can  throw at it, with a meaningful  error  message,  we  haven't
257 | really had to  do much work to make that happen.  In fact, in the
258 | whole parser per se (from  Ident  through  Expression)  there are
259 | only two calls to the error routine, Expected.  Even those aren't
260 | necessary ... if you'll look again in Term and Expression, you'll
261 | see that those statements can't be reached.  I put them  in early
262 | on as a  bit  of  insurance,  but  they're no longer needed.  Why
263 | don't you delete them now?
264 | 
265 | So how did we get this nice error handling  virtually  for  free?
266 | It's simply  that  I've  carefully  avoided  reading  a character
267 | directly  using  GetChar.  Instead,  I've  relied  on  the  error
268 | handling in GetName,  GetNum,  and  Match  to  do  all  the error
269 | checking for me.    Astute  readers  will notice that some of the
270 | calls to Match (for example, the ones in Add  and  Subtract)  are
271 | also unnecessary ... we already know what the character is by the
272 | time  we get there ... but it maintains  a  certain  symmetry  to
273 | leave them in, and  the  general rule to always use Match instead
274 | of GetChar is a good one.
275 | 
276 | I mentioned an "almost" above.   There  is a case where our error
277 | handling  leaves a bit to be desired.  So far we haven't told our
278 | parser what and  end-of-line  looks  like,  or  what  to  do with
279 | embedded  white  space.  So  a  space  character  (or  any  other
280 | character not part of the recognized character set) simply causes
281 | the parser to terminate, ignoring the unrecognized characters.
282 | 
283 | It  could  be  argued  that  this is reasonable behavior at  this
284 | point.  In a "real"  compiler, there is usually another statement
285 | following the one we're working on, so any characters not treated
286 | as part of our expression will either be used for or  rejected as
287 | part of the next one.
288 | 
289 | But  it's  also a very easy thing to fix up, even  if  it's  only
290 | temporary.   All  we  have  to  do  is assert that the expression
291 | should end with an end-of-line , i.e., a carriage return.
292 | 
293 | To see what I'm talking about, try the input line
294 | 
295 |                1+2 <space> 3+4
296 | 
297 | See  how the space was treated as a terminator?  Now, to make the
298 | compiler properly flag this, add the line
299 | 
300 |                if Look <> CR then Expected('Newline');
301 | 
302 | in the main  program,  just  after  the call to Expression.  That
303 | catches anything left over in the input stream.  Don't  forget to
304 | define CR in the const statement:
305 | 
306 |                CR = ^M;
307 | 
308 | As usual, recompile the program and verify that it does what it's
309 | supposed to.
310 | 
311 | 
312 | ASSIGNMENT STATEMENTS
313 | 
314 | OK,  at  this  point we have a parser that works very nicely. I'd
315 | like to  point  out  that  we  got  it  using  only  88  lines of
316 | executable code, not  counting  what  was  in  the  cradle.   The
317 | compiled  object  file  is  a  whopping  4752  bytes.   Not  bad,
318 | considering we weren't trying very  hard  to  save  either source
319 | code or object size.  We just stuck to the KISS principle.
320 | 
321 | Of course, parsing an expression  is not much good without having
322 | something to do with it afterwards.  Expressions USUALLY (but not
323 | always) appear in assignment statements, in the form
324 | 
325 |           <Ident> = <Expression>
326 | 
327 | We're only a breath  away  from being able to parse an assignment
328 | statement, so let's take that  last  step.  Just  after procedure
329 | Expression, add the following new procedure:
330 | 
331 | 
332 | {--------------------------------------------------------------}
333 | { Parse and Translate an Assignment Statement }
334 | 
335 | procedure Assignment;
336 | var Name: char;
337 | begin
338 |    Name := GetName;
339 |    Match('=');
340 |    Expression;
341 |    EmitLn('LEA ' + Name + '(PC),A0');
342 |    EmitLn('MOVE D0,(A0)')
343 | end;
344 | {--------------------------------------------------------------}
345 | 
346 | 
347 | Note again that the  code  exactly parallels the BNF.  And notice
348 | further that  the error checking was painless, handled by GetName
349 | and Match.
350 | 
351 | The reason for the two  lines  of  assembler  has  to  do  with a
352 | peculiarity in the  68000,  which requires this kind of construct
353 | for PC-relative code.
354 | 
355 | Now change the call to Expression, in the main program, to one to
356 | Assignment.  That's all there is to it.
357 | 
358 | Son of a gun!  We are actually  compiling  assignment statements.
359 | If those were the only kind of statements in a language, all we'd
360 | have to  do  is  put  this in a loop and we'd have a full-fledged
361 | compiler!
362 | 
363 | Well, of course they're not the only kind.  There are also little
364 | items  like  control  statements  (IFs  and  loops),  procedures,
365 | declarations, etc.  But cheer  up.    The  arithmetic expressions
366 | that we've been dealing with are among the most challenging  in a
367 | language.      Compared  to  what  we've  already  done,  control
368 | statements  will be easy.  I'll be covering  them  in  the  fifth
369 | installment.  And the other statements will all fall in  line, as
370 | long as we remember to KISS.
371 | 
372 | 
373 | MULTI-CHARACTER TOKENS
374 | 
375 | Throughout  this   series,   I've   been   carefully  restricting
376 | everything  we  do  to  single-character  tokens,  all  the while
377 | assuring  you  that  it wouldn't be difficult to extend to multi-
378 | character ones.    I  don't  know if you believed me or not ... I
379 | wouldn't  really blame you if you were a  bit  skeptical.    I'll
380 | continue  to use  that approach in  the  sessions  which  follow,
381 | because it helps keep complexity away.    But I'd like to back up
382 | those  assurances, and wrap up this portion  of  the  parser,  by
383 | showing you  just  how  easy  that  extension  really is.  In the
384 | process, we'll also provide for embedded white space.  Before you
385 | make  the  next  few changes, though, save the current version of
386 | the parser away under another name.  I have some more uses for it
387 | in  the  next  installment, and we'll be working with the single-
388 | character version.
389 | 
390 | Most compilers separate out the handling of the input stream into
391 | a separate module called  the  lexical scanner.  The idea is that
392 | the scanner deals with all the character-by-character  input, and
393 | returns the separate units  (tokens)  of  the  stream.  There may
394 | come a time when we'll want  to  do something like that, too, but
395 | for  now  there  is  no  need. We can handle the  multi-character
396 | tokens that we need by very slight and  very  local modifications
397 | to GetName and GetNum.
398 | 
399 | The usual definition of an identifier is that the first character
400 | must be a letter, but the rest can be  alphanumeric  (letters  or
401 | numbers).  To  deal  with  this,  we  need  one  other recognizer
402 | function
403 | 
404 | 
405 | {--------------------------------------------------------------}
406 | { Recognize an Alphanumeric }
407 | 
408 | function IsAlNum(c: char): boolean;
409 | begin
410 |    IsAlNum := IsAlpha(c) or IsDigit(c);
411 | end;
412 | {--------------------------------------------------------------}
413 | 
414 | 
415 | Add this function to your parser.  I put mine just after IsDigit.
416 | While you're  at  it,  might  as  well  include it as a permanent
417 | member of Cradle, too.
418 |                               
419 | Now, we need  to  modify  function  GetName  to  return  a string
420 | instead of a character:
421 | 
422 | 
423 | {--------------------------------------------------------------}
424 | { Get an Identifier }
425 | 
426 | function GetName: string;
427 | var Token: string;
428 | begin
429 |    Token := '';
430 |    if not IsAlpha(Look) then Expected('Name');
431 |    while IsAlNum(Look) do begin
432 |       Token := Token + UpCase(Look);
433 |       GetChar;
434 |    end;
435 |    GetName := Token;
436 | end;
437 | {--------------------------------------------------------------}
438 | 
439 | 
440 | Similarly, modify GetNum to read:
441 | 
442 | 
443 | {--------------------------------------------------------------}
444 | { Get a Number }
445 | 
446 | function GetNum: string;
447 | var Value: string;
448 | begin
449 |    Value := '';
450 |    if not IsDigit(Look) then Expected('Integer');
451 |    while IsDigit(Look) do begin
452 |       Value := Value + Look;
453 |       GetChar;
454 |    end;
455 |    GetNum := Value;
456 | end;
457 | {--------------------------------------------------------------}
458 | 
459 | 
460 | Amazingly enough, that  is  virtually all the changes required to
461 | the  parser!  The local variable Name  in  procedures  Ident  and
462 | Assignment was originally declared as  "char",  and  must  now be
463 | declared string[8].  (Clearly,  we  could  make the string length
464 | longer if we chose, but most assemblers limit the length anyhow.)
465 | Make  this  change,  and  then  recompile and test. _NOW_ do  you
466 | believe that it's a simple change?
467 | 
468 | 
469 | WHITE SPACE
470 | 
471 | Before we leave this parser for awhile, let's  address  the issue
472 | of  white  space.   As it stands now, the parser  will  barf  (or
473 | simply terminate) on a single space  character  embedded anywhere
474 | in  the input stream.  That's pretty  unfriendly  behavior.    So
475 | let's "productionize" the thing  a  bit  by eliminating this last
476 | restriction.
477 | 
478 | The  key  to easy handling of white space is to come  up  with  a
479 | simple rule for how the parser should treat the input stream, and
480 | to  enforce that rule everywhere.  Up  till  now,  because  white
481 | space wasn't permitted, we've been able to assume that after each
482 | parsing action, the lookahead character  Look  contains  the next
483 | meaningful  character,  so  we could test it  immediately.    Our
484 | design was based upon this principle.
485 | 
486 | It still sounds like a good rule to me, so  that's  the one we'll
487 | use.    This  means  that  every routine that advances the  input
488 | stream must skip over white space, and leave  the  next non-white
489 | character in Look.   Fortunately,  because  we've been careful to
490 | use GetName, GetNum, and Match  for most of our input processing,
491 | it is  only  those  three  routines  (plus  Init) that we need to
492 | modify.
493 | 
494 | Not  surprisingly,  we  start  with  yet  another  new recognizer
495 | routine:
496 | 
497 | 
498 | {--------------------------------------------------------------}
499 | { Recognize White Space }
500 | 
501 | function IsWhite(c: char): boolean;
502 | begin
503 |    IsWhite := c in [' ', TAB];
504 | end;
505 | {--------------------------------------------------------------}
506 | 
507 | 
508 | We  also need a routine that  will  eat  white-space  characters,
509 | until it finds a non-white one:
510 | 
511 | 
512 | {--------------------------------------------------------------}
513 | { Skip Over Leading White Space }
514 | 
515 | procedure SkipWhite;
516 | begin
517 |    while IsWhite(Look) do
518 |       GetChar;
519 | end;
520 | {--------------------------------------------------------------}
521 | 
522 | 
523 | Now,  add calls to SkipWhite to Match,  GetName,  and  GetNum  as
524 | shown below:
525 | 
526 | 
527 | {--------------------------------------------------------------}
528 | { Match a Specific Input Character }
529 | 
530 | procedure Match(x: char);
531 | begin
532 |    if Look <> x then Expected('''' + x + '''')
533 |    else begin
534 |       GetChar;
535 |       SkipWhite;
536 |    end;
537 | end;
538 | 
539 | 
540 | {--------------------------------------------------------------}
541 | { Get an Identifier }
542 | 
543 | function GetName: string;
544 | var Token: string;
545 | begin
546 |    Token := '';
547 |    if not IsAlpha(Look) then Expected('Name');
548 |    while IsAlNum(Look) do begin
549 |       Token := Token + UpCase(Look);
550 |       GetChar;
551 |    end;
552 |    GetName := Token;
553 |    SkipWhite;
554 | end;
555 | 
556 | 
557 | {--------------------------------------------------------------}
558 | { Get a Number }
559 | 
560 | function GetNum: string;
561 | var Value: string;
562 | begin
563 |    Value := '';
564 |    if not IsDigit(Look) then Expected('Integer');
565 |    while IsDigit(Look) do begin
566 |       Value := Value + Look;
567 |       GetChar;
568 |    end;
569 |    GetNum := Value;
570 |    SkipWhite;
571 | end;
572 | {--------------------------------------------------------------}
573 | 
574 | (Note  that  I  rearranged  Match  a  bit,  without changing  the
575 | functionality.)
576 | 
577 | Finally, we need to skip over leading blanks where we  "prime the
578 | pump" in Init:
579 |                              
580 | {--------------------------------------------------------------}
581 | { Initialize }
582 | 
583 | procedure Init;
584 | begin
585 |    GetChar;
586 |    SkipWhite;
587 | end;
588 | {--------------------------------------------------------------}
589 | 
590 | 
591 | Make these changes and recompile the program.  You will find that
592 | you will have to move Match below SkipWhite, to  avoid  an  error
593 | message from the Pascal compiler.  Test the program as  always to
594 | make sure it works properly.
595 | 
596 | Since we've made quite  a  few  changes  during this session, I'm
597 | reproducing the entire parser below:
598 | 
599 | 
600 | {--------------------------------------------------------------}
601 | program parse;
602 | 
603 | {--------------------------------------------------------------}
604 | { Constant Declarations }
605 | 
606 | const TAB = ^I;
607 |        CR = ^M;
608 | 
609 | {--------------------------------------------------------------}
610 | { Variable Declarations }
611 | 
612 | var Look: char;              { Lookahead Character }
613 | 
614 | {--------------------------------------------------------------}
615 | { Read New Character From Input Stream }
616 | 
617 | procedure GetChar;
618 | begin
619 |    Read(Look);
620 | end;
621 | 
622 | {--------------------------------------------------------------}
623 | { Report an Error }
624 | 
625 | procedure Error(s: string);
626 | begin
627 |    WriteLn;
628 |    WriteLn(^G, 'Error: ', s, '.');
629 | end;
630 | 
631 | 
632 | {--------------------------------------------------------------}
633 | { Report Error and Halt }
634 |                              
635 | procedure Abort(s: string);
636 | begin
637 |    Error(s);
638 |    Halt;
639 | end;
640 | 
641 | 
642 | {--------------------------------------------------------------}
643 | { Report What Was Expected }
644 | 
645 | procedure Expected(s: string);
646 | begin
647 |    Abort(s + ' Expected');
648 | end;
649 | 
650 | 
651 | {--------------------------------------------------------------}
652 | { Recognize an Alpha Character }
653 | 
654 | function IsAlpha(c: char): boolean;
655 | begin
656 |    IsAlpha := UpCase(c) in ['A'..'Z'];
657 | end;
658 | 
659 | 
660 | {--------------------------------------------------------------}
661 | { Recognize a Decimal Digit }
662 | 
663 | function IsDigit(c: char): boolean;
664 | begin
665 |    IsDigit := c in ['0'..'9'];
666 | end;
667 | 
668 | 
669 | {--------------------------------------------------------------}
670 | { Recognize an Alphanumeric }
671 | 
672 | function IsAlNum(c: char): boolean;
673 | begin
674 |    IsAlNum := IsAlpha(c) or IsDigit(c);
675 | end;
676 | 
677 | 
678 | {--------------------------------------------------------------}
679 | { Recognize an Addop }
680 | 
681 | function IsAddop(c: char): boolean;
682 | begin
683 |    IsAddop := c in ['+', '-'];
684 | end;
685 | 
686 | 
687 | {--------------------------------------------------------------}
688 | { Recognize White Space }
689 |                              
690 | function IsWhite(c: char): boolean;
691 | begin
692 |    IsWhite := c in [' ', TAB];
693 | end;
694 | 
695 | 
696 | {--------------------------------------------------------------}
697 | { Skip Over Leading White Space }
698 | 
699 | procedure SkipWhite;
700 | begin
701 |    while IsWhite(Look) do
702 |       GetChar;
703 | end;
704 | 
705 | 
706 | {--------------------------------------------------------------}
707 | { Match a Specific Input Character }
708 | 
709 | procedure Match(x: char);
710 | begin
711 |    if Look <> x then Expected('''' + x + '''')
712 |    else begin
713 |       GetChar;
714 |       SkipWhite;
715 |    end;
716 | end;
717 | 
718 | 
719 | {--------------------------------------------------------------}
720 | { Get an Identifier }
721 | 
722 | function GetName: string;
723 | var Token: string;
724 | begin
725 |    Token := '';
726 |    if not IsAlpha(Look) then Expected('Name');
727 |    while IsAlNum(Look) do begin
728 |       Token := Token + UpCase(Look);
729 |       GetChar;
730 |    end;
731 |    GetName := Token;
732 |    SkipWhite;
733 | end;
734 | 
735 | 
736 | {--------------------------------------------------------------}
737 | { Get a Number }
738 | 
739 | function GetNum: string;
740 | var Value: string;
741 | begin
742 |    Value := '';
743 |    if not IsDigit(Look) then Expected('Integer');
744 |    while IsDigit(Look) do begin
745 |       Value := Value + Look;
746 |       GetChar;
747 |    end;
748 |    GetNum := Value;
749 |    SkipWhite;
750 | end;
751 | 
752 | 
753 | {--------------------------------------------------------------}
754 | { Output a String with Tab }
755 | 
756 | procedure Emit(s: string);
757 | begin
758 |    Write(TAB, s);
759 | end;
760 | 
761 | 
762 | {--------------------------------------------------------------}
763 | { Output a String with Tab and CRLF }
764 | 
765 | procedure EmitLn(s: string);
766 | begin
767 |    Emit(s);
768 |    WriteLn;
769 | end;
770 | 
771 | 
772 | {---------------------------------------------------------------}
773 | { Parse and Translate a Identifier }
774 | 
775 | procedure Ident;
776 | var Name: string[8];
777 | begin
778 |    Name:= GetName;
779 |    if Look = '(' then begin
780 |       Match('(');
781 |       Match(')');
782 |       EmitLn('BSR ' + Name);
783 |       end
784 |    else
785 |       EmitLn('MOVE ' + Name + '(PC),D0');
786 | end;
787 | 
788 | 
789 | {---------------------------------------------------------------}
790 | { Parse and Translate a Math Factor }
791 | 
792 | procedure Expression; Forward;
793 | 
794 | procedure Factor;
795 | begin
796 |    if Look = '(' then begin
797 |       Match('(');
798 |       Expression;
799 |       Match(')');
800 |       end
801 |    else if IsAlpha(Look) then
802 |       Ident
803 |    else
804 |       EmitLn('MOVE #' + GetNum + ',D0');
805 | end;
806 | 
807 | 
808 | {--------------------------------------------------------------}
809 | { Recognize and Translate a Multiply }
810 | 
811 | procedure Multiply;
812 | begin
813 |    Match('*');
814 |    Factor;
815 |    EmitLn('MULS (SP)+,D0');
816 | end;
817 | 
818 | 
819 | {-------------------------------------------------------------}
820 | { Recognize and Translate a Divide }
821 | 
822 | procedure Divide;
823 | begin
824 |    Match('/');
825 |    Factor;
826 |    EmitLn('MOVE (SP)+,D1');
827 |    EmitLn('EXS.L D0');
828 |    EmitLn('DIVS D1,D0');
829 | end;
830 | 
831 | 
832 | {---------------------------------------------------------------}
833 | { Parse and Translate a Math Term }
834 | 
835 | procedure Term;
836 | begin
837 |    Factor;
838 |    while Look in ['*', '/'] do begin
839 |       EmitLn('MOVE D0,-(SP)');
840 |       case Look of
841 |        '*': Multiply;
842 |        '/': Divide;
843 |       end;
844 |    end;
845 | end;
846 | 
847 | 
848 | {--------------------------------------------------------------}
849 | { Recognize and Translate an Add }
850 | 
851 | procedure Add;
852 | begin
853 |    Match('+');
854 |    Term;
855 |    EmitLn('ADD (SP)+,D0');
856 | end;
857 | 
858 | 
859 | {-------------------------------------------------------------}
860 | { Recognize and Translate a Subtract }
861 | 
862 | procedure Subtract;
863 | begin
864 |    Match('-');
865 |    Term;
866 |    EmitLn('SUB (SP)+,D0');
867 |    EmitLn('NEG D0');
868 | end;
869 | 
870 | 
871 | {---------------------------------------------------------------}
872 | { Parse and Translate an Expression }
873 | 
874 | procedure Expression;
875 | begin
876 |    if IsAddop(Look) then
877 |       EmitLn('CLR D0')
878 |    else
879 |       Term;
880 |    while IsAddop(Look) do begin
881 |       EmitLn('MOVE D0,-(SP)');
882 |       case Look of
883 |        '+': Add;
884 |        '-': Subtract;
885 |       end;
886 |    end;
887 | end;
888 | 
889 | 
890 | {--------------------------------------------------------------}
891 | { Parse and Translate an Assignment Statement }
892 | 
893 | procedure Assignment;
894 | var Name: string[8];
895 | begin
896 |    Name := GetName;
897 |    Match('=');
898 |    Expression;
899 |    EmitLn('LEA ' + Name + '(PC),A0');
900 |    EmitLn('MOVE D0,(A0)')
901 | end;
902 | 
903 | 
904 | {--------------------------------------------------------------}
905 | { Initialize }
906 |                              
907 | procedure Init;
908 | begin
909 |    GetChar;
910 |    SkipWhite;
911 | end;
912 | 
913 | 
914 | {--------------------------------------------------------------}
915 | { Main Program }
916 | 
917 | begin
918 |    Init;
919 |    Assignment;
920 |    If Look <> CR then Expected('NewLine');
921 | end.
922 | {--------------------------------------------------------------}
923 | 
924 | 
925 | Now the parser is complete.  It's got every feature we can put in
926 | a  one-line "compiler."  Tuck it away in a safe place.  Next time
927 | we'll move on to a new subject, but we'll still be  talking about
928 | expressions for quite awhile.  Next installment, I plan to talk a
929 | bit about interpreters as opposed  to compilers, and show you how
930 | the structure of the parser changes a bit as we change  what sort
931 | of action has to be taken.  The information we pick up there will
932 | serve  us in good stead later on, even if you have no interest in
933 | interpreters.  See you next time.
934 | 
935 | 
936 | *****************************************************************
937 | *                                                               *
938 | *                        COPYRIGHT NOTICE                       *
939 | *                                                               *
940 | *   Copyright (C) 1988 Jack W. Crenshaw. All rights reserved.   *
941 | *                                                               *
942 | *****************************************************************
943 | 
944 | 
945 | 
946 | 
947 | 


--------------------------------------------------------------------------------