├── .gdbinit
├── .gitignore
├── ChangeLog
├── Makefile
├── README.md
└── tokthr.S


/.gdbinit:
--------------------------------------------------------------------------------
 1 | b *0x08048075
 2 | display/i $pc
 3 | display/a ((void**)$esp)[2]
 4 | display/a ((void**)$esp)[1]
 5 | display/a ((void**)$esp)[0]
 6 | display/a $esi
 7 | display/a $eax
 8 | 
 9 | define rstack
10 |   p/a ((void**)$ebp)[0]@3
11 | end
12 | document rstack
13 |   Prints the top three entries of the return stack, 
14 |   topmost first.
15 | end
16 | 
17 | define fb
18 |   b execute_eax if $esi == &$arg0
19 | end
20 | document fb
21 |   Sets a breakpoint to stop the interpreter when it enters a given Forth word.
22 | end


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | tokthr
2 | tokthr-disassembly
3 | tokthr-symbols
4 | 


--------------------------------------------------------------------------------
/ChangeLog:
--------------------------------------------------------------------------------
 1 | 2007-11-16  Kragen Javier Sitaker  <kragen@thrifty>
 2 | 
 3 | 	* Changelog: added.
 4 | 
 5 | 	* tokthr2.S: added read, gets, fgets, tib, tibmax, tibsize, asm
 6 | 	macro 'create'.  Now we can read input.  Added fif, floop, felse
 7 | 	macros to calculate jump offsets for me; no more miscalculated
 8 | 	jump offsets yielding puzzling bugs.
 9 | 
10 | 	* Makefile: somewhere along the line added this.
11 | 
12 | 2007-11-15  Kragen Javier Sitaker  <kragen@thrifty>
13 | 
14 | 	* tokthr2.S: Bumped version from 2 to 3.  Added run-time
15 | 	dictionary of names.  Moved "instructions" (main program) to end
16 | 	of file.  For dictionary handling, added count, cr, words, dict,
17 | 	dictp, dictsize, nextwords, pastdict?, <, >=, 0=, cbcmp, c@+,
18 | 	bcmp, memcmp, unloop, find, r2@, 2swap.  Added "s: " label to .s
19 | 	output.
20 | 
21 | 2007-11-12..14  Kragen Javier Sitaker  <kragen@thrifty>
22 | 
23 | 	* tokthr2.S: Added "version 2", many comments.  Wrote
24 | 	define_bytecode, defasm, defbytes macros so I can stop writing
25 | 	bytecode numerically.  Shortened tail end of primitives with
26 | 	pusheax and pushedxeax.  Changed execute to execute a token.
27 | 	Implemented DO LOOP.  Removed direct threading, switched to a
28 | 	variant of "bit threading".  Added make_counted_string macro and
29 | 	words that will later be called -1 + 1- rot -rot tuck emit u. (u.)
30 | 	((u.)) ~ 1+ negate . printstack pick cells * depth - / cellsize
31 | 	nip i 2dup.
32 | 
33 | 	* .gdbinit: somewhere along the line added this.
34 | 
35 | 2007-11-11  Kragen Javier Sitaker  <kragen@thrifty>
36 | 
37 | 	tokthr.S: Tried out various things to see if they could shrink the
38 | 	machine code.
39 | 
40 | 2007-11-10  Kragen Javier Sitaker  <kragen@thrifty>
41 | 
42 | 	* tokthr.S: Have most of the basic primitives and have written a
43 | 	little bit of bytecode, but nothing that does anything
44 | 	interesting.
45 | 
46 | 2007-11-09  Kragen Javier Sitaker  <kragen@thrifty>
47 | 
48 | 	* tokthr.S: Prototyped four-instruction token-threaded "next"
49 | 	routine and wrote hello-world program with it.


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | all = tokthr tokthr-symbols tokthr-disassembly
 2 | all: $(all)
 3 | clean:
 4 | 	rm -f $(all) *~ .*~
 5 | tokthr: tokthr.S
 6 | 	gcc -m32 -nostdlib -static -o $@ $<
 7 | tokthr-symbols: tokthr
 8 | 	(echo '-*- auto-revert -*-'; nm $<) > $@
 9 | tokthr-disassembly: tokthr
10 | 	(echo '-*- auto-revert -*-'; objdump -D $<) > $@
11 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | tokthr
 2 | ======
 3 | 
 4 | > That’s the longest “hello world” I’ve *ever seen*! -- [Bob Appleyard] [0]
 5 | 
 6 | This is a tiny token-threaded Forth interpreter that I started in 2007
 7 | and haven’t finished yet.  It’s about 1000 lines of x86 assembly,
 8 | using gas’s macro capabilities, and it compiles to about 1400 bytes of
 9 | executable, 399 of which are x86 code.  (The rest is bytecode.)
10 | 
11 | I don’t have a good name for it yet, and I don’t know if I’ll ever
12 | finish it.  It’s in the public domain.
13 | 
14 | [0]: http://www.reddit.com/r/programming/comments/8vj71/small_is_beautiful/c0aki5s "Comment on Reddit when someone pointed to an earlier version of this as an exemplar of smallness"
15 | 


--------------------------------------------------------------------------------
/tokthr.S:
--------------------------------------------------------------------------------
   1 | # Simple token-threaded "Forth" interpreter. -*- asm -*-
   2 | # Version 3.
   3 | # by Kragen Javier Sitaker; dedicated to the public domain,
   4 | # i.e. I relinquish whatever exclusive rights copyright law
   5 | # gives me with regard to this work.
   6 | # Major parts taken from Richard W.M. Jones's public-domain
   7 | # JONESFORTH 42 by Richard W.M. Jones <rich@annexia.org>
   8 | # http://annexia.org/forth
   9 | 
  10 | # As of version 2, this program just outputs the following under
  11 | # Linux:
  12 | # hell, world, hello
  13 | # -120 1 104 
  14 | # s: 102 101 100 
  15 | # 397 
  16 | # lit8 lit ret execute (if) (else) (loop) ! @ c! c@ rp@ rp! r> >r sp@ sp! pop dup over swap 0< & | ^ um+ /% um* r@ syscall5 syscall3 syscall1 bye 0 1 type hello world comma count cr -1 + 1- rot -rot tuck emit u. (u.) ((u.)) ~ 1+ negate . scolon .s pick cells * depth - / cellsize nip (do) i 2dup 2drop dict dictp dictsize nextword pastdict? words < >= 0= cbcmp c@+ bcmp memcmp unloop find r2@ 2swap 
  17 | # hello134517863 
  18 | # , 0 
  19 | # 134517710 
  20 | 
  21 | # to compile:   
  22 | # gcc -m32 -nostdlib -static -o tokthr tokthr.S
  23 | 
  24 | 
  25 | ### Why Small Things are Interesting
  26 | 
  27 | # There are still a lot of computers out there that have tens of
  28 | # kilobytes of memory or less, and they cost a lot less than,
  29 | # say, a cellphone.  Cellphones are apparently still too
  30 | # expensive for half the world's population.  I want to see how
  31 | # close I can get to having a comfortable programming
  32 | # environment in a smaller device.
  33 | 
  34 | # Some smallish microcontroller chips from five different
  35 | # manufacturers, with current Digi-Key prices:
  36 | # Name              bytes RAM  bytes ROM  MHz  price    
  37 | # ATtiny2313        128        2048       20   US$1.38  
  38 | # ATMega48-20AU     512        4096       20   US$1.62  
  39 | # MSP430F1111AIPW   128        2264       8    US$2.43  
  40 | # LPC2101           2048       8192       70   US$2.52  
  41 | # H8/300H Tiny      1536       8192       12   US$3.58  
  42 | # M16C/R8C/Tiny/1B  1024       16384      12   US$3.54  
  43 | # SX28AC/SS         136        3072       50   US$2.79  
  44 | 
  45 | # There are essentially no 386-compatible devices in this price
  46 | # range as far as I can tell; for why I'm not worried about
  47 | # that, see the section "How Small This Is".
  48 | 
  49 | # More practically and short-termly, small projects can take
  50 | # less time to finish, and I feel like I need to learn about
  51 | # different approaches to implementing programming languages.
  52 | 
  53 | ### Why This is Small
  54 | 
  55 | # The normal Forth representation of a function is as an array
  56 | # of pointers to the other functions it calls, in sequence; a
  57 | # few of those other functions may move the interpreter pointer
  58 | # around in that array, or snarf up a constant that's stored in
  59 | # the array, or stuff like that, but for the most part, the
  60 | # functions just get called in sequence.  This is called
  61 | # "threaded code" and it's fairly compact, especially on 16-bit
  62 | # systems where the pointers are only two bytes.
  63 | 
  64 | # A traditional approach taken by Forth implementations to
  65 | # reduce code size even further is called "token threading".
  66 | # Rather than making arrays of 16-bit or 32-bit pointers, they
  67 | # make lists of 8-bit indices into an array of pointers.  This
  68 | # has two advantages:
  69 | 
  70 | # 1. the indices are one fourth the size of a list of 32-bit
  71 | #    pointers;
  72 | # 2. it is possible to save these lists of indices somewhere
  73 | #    outside of memory and continue to use them even after
  74 | #    making some changes to the code, as long as the same
  75 | #    indices in the table have the same meanings.  So, for
  76 | #    example, you could write some boot firmware in this
  77 | #    "bytecode".
  78 | 
  79 | # It also has some disadvantages:
  80 | # 1. You run out of space in the table.  Even a fairly minimal
  81 | #    full Forth system contains close to 256 subroutines.  You
  82 | #    can mitigate this by packing, say, two 12-bit pointers
  83 | #    every three bytes, or maybe by having a special bytecode
  84 | #    that looks up the next byte in an extended table.
  85 | # 2. It's slower and makes the machine-code part of the program
  86 | #    take more space.  The traditional LODSW; JMP AX version of
  87 | #    $NEXT from the eForth Model, which fetches and executes the
  88 | #    next execution token in the threaded list, is three bytes
  89 | #    and two instructions; my 'next' here is 41 bytes and 14
  90 | #    instructions, which is big enough that I jump to it (2
  91 | #    bytes) rather than making an assembler macro.  Which blows
  92 | #    your branch target buffers to pieces.  Oh well.  The
  93 | #    performance penalty is probably two orders of magnitude
  94 | #    over native code, but I haven't measured it yet.  I
  95 | #    measured an earlier version on my 700MHz PIII laptop on an
  96 | #    empty loop at about a factor of 3.5 over simple
  97 | #    direct-threading, which in turn is on the order of 10 times
  98 | #    slower than machine code.
  99 | 
 100 | # Anyway, so this is an example program built using this
 101 | # technique.  It implements two Forthlike stacks and interpreted
 102 | # subroutines, but not yet the ability to define new subroutines
 103 | # at run-time.
 104 | 
 105 | ### What's Here
 106 | 
 107 | # I've implemented all of the primitives from C. H. Ting and
 108 | # Bill Muench's public-domain (?) eForth Model 1.0, except for
 109 | # the following:
 110 | # - I haven't implemented their lowercase "next" (as in FOR
 111 | #   NEXT) because I think it's a bad idea, it's complex, and it
 112 | #   can be implemented at a higher level if you really need it;
 113 | #   instead, I implemented (loop).        
 114 | # - I didn't implement !IO because it's a no-op in this context;
 115 | # - I haven't yet implemented ?RX, although I think it's
 116 | #   possible to implement it on top of syscall5, using select();
 117 | # Additionally, I implemented multiply and divide primitives.
 118 | 
 119 | # However, some of it is untested and therefore probably broken.
 120 | # Procedure call and return and the system calls do work.
 121 | 
 122 | # Currently registers are used as follows:
 123 | # %esi --- interpreter pointer; points to next byte to execute
 124 | # %ebp --- return stack pointer; points to last thing pushed.  This stack, 
 125 | #          like the other one, grows downwards.
 126 | # %esp --- data stack pointer; points to last thing pushed.  This is
 127 | #          the processor's standard stack pointer; "push" and "pop"
 128 | #          instructions use it, which makes assembly code to use it
 129 | #          quite concise.  The Intel "call" and "ret" instructions
 130 | #          would also use this stack, but they aren't used in this
 131 | #          program.
 132 | # flags --- the "down" direction flag must be cleared.
 133 | 
 134 | # It's probably missing a couple of primitives needed because of
 135 | # the token-threading implementation strategy; the address of
 136 | # the token table probably needs to be knowable, at least.
 137 | 
 138 | # Direct and indirect threading, the normal Forth approaches to
 139 | # allowing unrestricted coexistence of words written in assembly
 140 | # language and interpreted Forth, both had heavy space costs
 141 | # here --- close to 100% for the bytecode currently in the
 142 | # system.  So the inner interpreter checks, for each bytecode,
 143 | # whether it is in the range of bytecodes whose interpretations
 144 | # are in native code, and picks the relevant code path.  This
 145 | # avoids consuming any space per-word for this distinction, but with
 146 | # what appears to be a heavy performance cost.
 147 | 
 148 | # This is similar to an approach called "bit threading"; as
 149 | # explained by Jeff Fox in a comp.lang.forth message
 150 | # 2007-05-13, on thread "Cases where Forth seems a little
 151 | # clunky":
 152 | 
 153 | #       I have seen hardware and software implemenations of
 154 | #       bit-threading where the msb of the address space
 155 | #       selects between threaded code address lists and
 156 | #       addresses of CODE subroutines. In both cases 0 is a
 157 | #       valid address and negative addresses are valid. I think
 158 | #       this applied to Novix.
 159 | 
 160 | # Except that this approach uses a fraction of a bit for most
 161 | # tokens instead of a whole bit.
 162 | 
 163 | ### How Small This Is
 164 | 
 165 | # As a point of comparison, eForth 1.0's machine-code part seems
 166 | # to be 171 instructions and 399 bytes, including some data
 167 | # that's mixed in there with it.
 168 | 
 169 | # As of version 3, the machine-code part of this interpreter is
 170 | # 239 bytes in 129 machine-code instructions, plus 19 bytes of
 171 | # read-only data, 172 bytes of token table for the 86
 172 | # currently-defined words (30 of which are primitive), 397
 173 | # bytes of their names, and another 370 bytes of bytecode in
 174 | # the 56 bytecode words (including a couple of data values
 175 | # scattered around), and then another 56 bytes in the main
 176 | # program, for a total of 1253 bytes.  So the program is
 177 | # already less than half written in assembly, in terms of
 178 | # object-code size.
 179 | 
 180 | # As of version 3, the machine-code part of this program uses
 181 | # only 25 different instructions: cld; jmp, jz, jnc, jbe; push,
 182 | # pop, lodsb, lodsl, xchg, mov; cmp, movsbl, inc, and, xor, or,
 183 | # lea, cdq, rcl, add, idiv, imul; and int.  Interestingly, many
 184 | # of them are only used once.
 185 | 
 186 | # Just before version 3, the non-comment lines were 14%
 187 | # assembler macros, 51% assembly language, and 35% bytecode.
 188 | 
 189 | # From Brad Rodriguez's January/February 1993 Computer Journal
 190 | # article, "Moving Forth: Part 1: Design Decisions in the Forth
 191 | # Kernel":
 192 | #       You can expect the usual 16-bit Forth kernel (see below)
 193 | #       to occupy about 8K bytes of program space. For a full
 194 | #       kernel that can compile Forth definitions, you should
 195 | #       allow a minimum of 1K byte of RAM.
 196 | 
 197 | # I'm pretty sure I can beat the 8K requirement by quite a bit
 198 | # and still be able to compile Forth definitions --- I'm hoping
 199 | # for a factor of 4.  Consider, from Jeff Fox's "Thoughtful
 200 | # Programming", chapter 3: http://www.ultratechnology.com/forth3.htm
 201 | 
 202 | #       People assume that since Chuck has refined his Forth down to
 203 | #       about a 1K object that this means he has just stripped his
 204 | #       Forth down to a 1K kernel that will boot like in the old days
 205 | #       and that he is going to compile a complete Forth system on top
 206 | #       of the 1K before he starts an application. This is wrong. The
 207 | #       complete Forth system is 1K, and the reason for that is
 208 | #       maximize Chuck's productivity. What stops people from doing
 209 | #       what they need to do to solve a problem is all the time spend
 210 | #       solving all the related sub-problems that pop up as a result
 211 | #       of complex interconnections between components. To maximize
 212 | #       his productivity Chuck minimizes the number of these side
 213 | #       problems that pop up. Keep it simple, and don't get to where
 214 | #       you are spending 90% or 99% or you time dealing with related
 215 | #       sub-problems. Avoid unsolvable problems, don't waste your time
 216 | #       trying to solve them.
 217 | 
 218 | # Consider also this quote from Elizabeth Rather:
 219 | 
 220 | #       As you have seen, so much depends on the specific
 221 | #       machine architecture. We implemented a TTC [token
 222 | #       threaded] Forth on some low-end AVR processors with
 223 | #       very limited code space, and it ran faster than a
 224 | #       native-code 8051 at comparable clock speed.
 225 | 
 226 | # (2007-06-23 comp.lang.forth post on thread "Build your own
 227 | # Forth for Microchip PIC (Episode 838): Threading")
 228 | # http://objectmix.com/forth/168105-build-your-own-forth-microchip-pic-episode-838-threading.html
 229 | 
 230 | # Consider also this quote from the abstract of Frank
 231 | # Sergeant's 1991 "A 3-Instruction Forth for Embedded Systems
 232 | # Work: Illustrated on the Motorola MC68HC11":
 233 | 
 234 | #       A 3- instruction Forth makes Forth affordable for
 235 | #       target systems with very limited memory. It can be
 236 | #       brought up quickly on strange new hardware. You don't
 237 | #       have to do without Forth because of memory or time
 238 | #       limitations. It only takes 66 bytes for the Motorola
 239 | #       MC68HC11. Full source is provided. . . . The absolute
 240 | #       minimum the target must do, it seems to me, is fetch a
 241 | #       byte, store a byte, and call a subroutine.
 242 | 
 243 | # http://pygmy.utoh.org/3ins4th.html
 244 | 
 245 | # As I said before, there are no small, cheap 386s, and so of
 246 | # course the code size of this version is an approximation, and
 247 | # it won't be a simple recompile to port it to one of these
 248 | # other architectures; but 129 instructions' worth of assembly
 249 | # are probably not that big a deal to rewrite for a new
 250 | # platform.  (I'll probably want to write multiply and divide
 251 | # routines, though.)
 252 | 
 253 | ### Other Things I Tried
 254 | 
 255 | # I tried switching to caching the top of the data stack in a
 256 | # register, on the theory that it would shorten things like
 257 | # 'and'.  Currently 'and' is pop %eax; pop %ebx; and %ebx,
 258 | # %eax; push %eax; jmp next.  If top of stack is cached in %eax
 259 | # instead of being stored in memory, this becomes pop %ebx; and
 260 | # %ebx, %eax; jmp next, which is considerably shorter.
 261 | # However, most things don't change, and other things become
 262 | # longer due to the extra work to save top-of-stack.  I tried
 263 | # using both %ebx and %eax as the top-of-stack cache.
 264 | 
 265 | # In the version using %ebx as top-of-stack, the total size of
 266 | # the machine-code part was 216 bytes, 115 instructions,
 267 | # compared to 197 bytes, 112 instructions for the version using
 268 | # the current strategy.  In the version using %eax as
 269 | # top-of-stack, it was only 215 bytes, but that's still worse
 270 | # than 197.
 271 | 
 272 | # In previous versions, all routines were machine-code routines
 273 | # that you could just jmp to.  High-level bytecode words began
 274 | # with "call dolist", which took the saved %eip off the stack
 275 | # and stuck it in %esi.  Unfortunately, that added 5 bytes to
 276 | # each bytecode word; in version 2, the bytecode region is 221
 277 | # bytes and contains 36 word definitions (the 30 machine-code
 278 | # primitives aren't defined there) --- so 5 bytes each would
 279 | # have been 180 bytes of overhead, or 82%!  It also would have
 280 | # required a region to be both executable and writable to
 281 | # support run-time routine definition, which is kind of a pain
 282 | # thse days, and also on Harvard-architecture microcontrollers.
 283 | 
 284 | # In previous versions, the token-table entries were 32 bits
 285 | # each (instead of 16 bits as they are now), which added
 286 | # another 2 bytes of overhead per word.  In version 2, there
 287 | # are currently 66 words, so that's another 132 bytes of shaved
 288 | # overhead.
 289 | 
 290 | ### Performance (Speed)
 291 | 
 292 | # To version 2, I added a simple program to print out a
 293 | # badly-formatted 8-bit extended ASCII table; it was 11
 294 | # bytecode operations long.  It executed 81615 bytecodes in all
 295 | # (according to gdb).  On my 700MHz PIII-Coppermine laptop from
 296 | # 1999, 'time' reports CPU times varying from 4-12
 297 | # milliseconds.  So it seems like it can execute about 10 000
 298 | # bytecodes in a millisecond, or about 10 million bytecodes in
 299 | # a second --- about 100ns per bytecode.
 300 | 
 301 | # It also made 1459 system calls.
 302 | 
 303 | # That's slightly faster than Python's bytecode engine (maybe
 304 | # --- probably within measurement error --- and anyway Python's
 305 | # bytecodes are larger-grained), and an order of magnitude
 306 | # slower than simple direct-threading, and about three times
 307 | # slower than direct-threading with a simple bytecode
 308 | # indirection layer.
 309 | 
 310 | # I would be surprised if version 3 had any measurable change
 311 | # in speed from version 2.
 312 | 
 313 | # To version 3, I added a little loop to repeatedly search the
 314 | # 86-word wordlist for ", ", which isn't there.  I was able to
 315 | # do this search 10 000 times in 4.59 CPU seconds, which is
 316 | # 5.34 microseconds per comparison, or about 3700 CPU cycles.
 317 | # If we eventually have 300 words in the wordlist, each search
 318 | # will therefore take 1.6 milliseconds in the worst case, so
 319 | # the system won't be able to compile or interpret more than
 320 | # about 50-100 lines of code per second.  It should probably be
 321 | # pretty easy to fix this particular performance problem
 322 | # (recode wordlist search in asm, restructure, or whatever) if
 323 | # it comes up, but it's a bit worrisome...
 324 | 
 325 | ### What's Wrong With This Program
 326 | 
 327 | # - It's a long way from doing anything useful.
 328 | # - There's 21 instructions of unused code which may be broken.
 329 | #   In Version 3, these words are in use and so probably work:
 330 | #   hello, sub1, type, comma, world, newline, dolit_s8, dot,
 331 | #   bye, exit, branch_on_0, c_bang, drop, dup, swap, negative,
 332 | #   umplus, divmod, syscall5, syscall3, zero, syscall1, rpop,
 333 | #   rpush, one, dolit_32, neg1, add, emit, tuck, udot,
 334 | #   udot_nospc, udot_nonzero, branch, invert, add1, negate,
 335 | #   xor, printstack, cells, mul, depth, div, sub, cellsize,
 336 | #   nip, pick, twodup, i, _do, r_at, over, at, sp_at, bang,
 337 | #   twoswap, r_2at, find, unloop, memcmp, bcmp, c_at_inc,
 338 | #   cbcmp, zeq, ge, lt, words, pastdict, nextword, dictsize,
 339 | #   dictp, dict, c_at.  These are not tested, and therefore may be
 340 | #   broken: execute, rp_at, rp_bang, sp_bang, and, or.
 341 | # - There's no dictionary structure yet.
 342 | # - It probably needs another couple of primitives.
 343 | # - There's no checking for stack overflow or underflow, but
 344 | #   they will break things.
 345 | # - It's slow; see above about performance.
 346 | 
 347 | ### The Beginning of the Program
 348 | 
 349 | # .. include system library header so we know __NR_exit = 1 and
 350 | # __NR_write = 4
 351 | #include <asm/unistd.h>
 352 | 
 353 | ### The token table and dictionary
 354 | 
 355 |         # To save space, we're trying to avoid storing pointers
 356 |         # as much as possible.  So most of the code is
 357 |         # represented as "tokens", which are offsets into the
 358 |         # "token table", which contains 16-bit offsets into
 359 |         # either the machine-code primitives or the data
 360 |         # segment.
 361 | 
 362 |         # The "dictionary" is stored in this program as a
 363 |         # sequence of strings, stuffed next to each other with
 364 |         # no intervening pointers at all.  The idea is that if
 365 |         # you want to execute or compile a word, you walk
 366 |         # through the dictionary, examining each string in turn
 367 |         # to see if it's the right word.  If so, then the
 368 |         # number of strings you rejected is the index into the
 369 |         # token table.
 370 | 
 371 |         # This implies that the dictionary needs to be stored
 372 |         # somewhere where it can expand without overwriting
 373 |         # other stuff.  So I'm putting it in its own section
 374 |         # for the time being.  I'm pretty sure that means it'll
 375 |         # get at least a page, which is enough space to define
 376 |         # at least several hundred words.  Maybe at some future
 377 |         # time I'll copy it into .bss at boot time instead.
 378 | 
 379 |         .data 1                 # Start putting stuff in data subsection 1
 380 |         .align 4
 381 |         ## table to define the "bytecode" instructions
 382 | token_table:
 383 |         # "a" means "allocatable", "w" means "writable"
 384 |         .section .dictionary, "aw"
 385 | dictionary:
 386 | 
 387 |         ## I was frustrated with the unreadability of my
 388 |         ## bytecode lists; I was counting token table entries
 389 |         ## by hand and writing bytecodes numerically.  So I
 390 |         ## wrote a macro to help.
 391 | 
 392 |         ## Note that we are using a separate .subsection
 393 |         ## directive because gas 2.17 doesn't support putting
 394 |         ## that in the .pushsection line, even though it is
 395 |         ## documented to do so; see message from Maciej
 396 |         ## W. Rozycki on 2007-10-11, subject "Re: How to use
 397 |         ## .pushsection?",
 398 |         ## http://sourceware.org/ml/binutils/2007-10/msg00176.html
 399 |         ## for more details)
 400 | 
 401 |         ## The first few entries in the table of bytecodes are
 402 |         ## all defined in machine code; the rest are all
 403 |         ## defined in bytecode.  The inner interpreter examines
 404 |         ## each bytecode to determine which category it falls
 405 |         ## in in order to figure out how to execute it,
 406 |         ## including what base address to add its offset to.
 407 |         ## This sucks for extensibility but rocks for
 408 |         ## compactness.
 409 | 
 410 |         .macro countedstring name
 411 |         .byte stringlength\@
 412 | 1:      .ascii "\name"
 413 |         ## Here we count the length of the string --- computers
 414 |         ## are for counting bytes so people don't have to!
 415 |         stringlength\@ = . - 1b
 416 |         .endm
 417 | 
 418 |         .macro define_bytecode name, realname, origin
 419 |         .pushsection .data      # save current position, go to data section
 420 |         .subsection 1           # and subsection 1, where we put the addrs
 421 |         b_\name = (. - token_table) / 2 # define b_foo as the index of this ptr
 422 |         .ifeq b_\name - 256
 423 |         .error "\name got bytecode 256"
 424 |         .endif
 425 |         .short \name - \origin # insert offset which will be resolved next
 426 |         .popsection             # return to where we were, and
 427 |         .pushsection .dictionary
 428 |         countedstring "\realname"
 429 |         .popsection
 430 | \name:                          # define the name
 431 |         .endm
 432 |         .macro defasm name, realname
 433 |         define_bytecode \name, "\realname", machine_code_primitives
 434 |         .endm
 435 |         .macro defbytes name, realname
 436 |         define_bytecode \name, "\realname", bytecode_start
 437 |         .endm
 438 | 
 439 | ### The Return Stack
 440 | # We put Forth return addresses here, but programs can also use
 441 | # it for other purposes.
 442 | 
 443 |         .bss
 444 |         .space 4096
 445 | initial_return_stack_pointer:   
 446 | 
 447 | ### Initialization
 448 |                 
 449 |         .text                   # the following stuff goes in the text segment
 450 |         .global _start          # declare _start as a global symbol 
 451 |                                 # (otherwise ld won't be able to find it)
 452 | _start:                         # this is the entry point for ELF I guess
 453 |         cld                     # clear direction flag; unnecessary?
 454 |         mov $initial_return_stack_pointer, %ebp
 455 |         mov $instructions, %esi # %esi is the interpreter pointer register
 456 |         jmp next                # and now we start the interpreter.
 457 |                                 # (somewhat silly since we could just
 458 |                                 # fall through..)
 459 | 
 460 | ### The Machine-Code Primitives
 461 | # Also next (aka the address interpreter or inner interpreter)
 462 | # is in this section.
 463 | 
 464 | machine_code_primitives:
 465 |                 
 466 | # dolit_s8 takes a signed 8-bit literal from the instruction
 467 | # stream and pushes it onto the stack.
 468 | 
 469 |         defasm dolit_s8, "lit8"
 470 |         lodsb
 471 |         movsbl %al, %eax
 472 |         jmp pusheax
 473 | 
 474 |         defasm dolit_32, "lit"  # more general dolit
 475 |         lodsl
 476 |         jmp pusheax
 477 |         
 478 |         defasm exit, "ret"      # Return from a colon defn.
 479 |         xchg %ebp, %esp
 480 |         pop %esi
 481 |         xchg %ebp, %esp
 482 |         jmp next
 483 | 
 484 |         defasm execute, "execute" # Run xt on data stack.
 485 |         pop %eax                # Here 'xt' is the one-byte token.
 486 |         jmp execute_eax
 487 | 
 488 | # Branch if top of stack is 0 (implementing IF).
 489 | # Both branch instructions take a signed byte offset from the bytecode
 490 | # stream.
 491 |         defasm branch_on_0, "(if)"
 492 |         pop %eax
 493 |         and %eax, %eax
 494 |         jz branch
 495 |         inc %esi                # skip 1-byte jump offset
 496 |         jmp next
 497 | 
 498 |         defasm branch, "(else)"
 499 |         lodsb
 500 |         movsbl %al, %eax        # same insn size as cbtw; cwde
 501 |         add %eax, %esi
 502 |         jmp next        
 503 | 
 504 | # (loop) is what we do at the end of a DO LOOP.
 505 | 
 506 | # DO puts a number on top of the return stack that is zero minus
 507 | # the number of iterations remaining.  So when it finally
 508 | # reaches zero, we're done.  It also put a number underneath
 509 | # that on the return stack from which you can recover the
 510 | # iteration counter.
 511 | 
 512 | # This scheme is mostly due to F-83, except that in F-83 it
 513 | # reached 0x8000 instead of 0, which seemed perverse to me.
 514 | 
 515 | # This isn't strictly necessary as part of the minimal primitive
 516 | # set, but it seemed like it would make inner loops maybe ten
 517 | # times faster, and as with most words that do return-stack
 518 | # manipulation, the size penalty is actually negative.  (This
 519 | # version is 14 bytes; in bytecode it would be 17.)
 520 | 
 521 | # ( -- ) ( R: -1 adjustment -- ) in end-of-loop case, and skip
 522 | # the interpreter pointer over the jump offset.
 523 | # ( -- ) ( R: counter -- counter+1 ) in normal case,  and adjust
 524 | # interpreter pointer by number of bytes stored after call to
 525 | # this routine.
 526 | 
 527 |         defasm _loop, "(loop)"
 528 |         xor %eax, %eax          # mov $1, %eax is 5 bytes
 529 |         inc %eax
 530 |         add %eax, (%ebp)
 531 |         jnc branch              # if no carry, go branch
 532 |         # If there was a carry, we're done!
 533 |         add $8, %ebp            # drop loop-sys from rstack
 534 |         lodsb                   # skip jump offset
 535 |         jmp next
 536 | 
 537 | # Store a cell.
 538 |         defasm bang, "!"
 539 |         pop %ebx
 540 |         pop (%ebx)              # I'm amazed this is legal
 541 |         jmp next
 542 | 
 543 | # Fetch a cell.
 544 |         defasm at, "@"
 545 |         pop %ebx
 546 |         push (%ebx)             # I'm amazed this is legal too
 547 |         jmp next
 548 | 
 549 | # Store a byte.
 550 |         defasm c_bang, "c!"
 551 |         pop %ebx
 552 |         pop %eax
 553 |         mov %al, (%ebx)         # push and pop don't do bytes
 554 |         jmp next
 555 | 
 556 | # Fetch a byte.
 557 |         defasm c_at, "c@"
 558 |         pop %ebx
 559 |         xor %eax, %eax
 560 |         mov (%ebx), %al
 561 |         jmp pusheax
 562 |         
 563 | # Get the return stack pointer.
 564 |         defasm rp_at, "rp@"
 565 |         push %ebp
 566 |         jmp next
 567 |         
 568 | # Set the return stack pointer.
 569 |         defasm rp_bang, "rp!"
 570 |         pop %ebp
 571 |         jmp next
 572 |         
 573 | # Pop the return stack to the data stack
 574 |         defasm rpop, "r>"
 575 |         xchg %esp, %ebp
 576 |         pop %eax
 577 |         xchg %esp, %ebp
 578 |         jmp pusheax
 579 |         
 580 | # Push the return stack from the data stack
 581 |         defasm rpush, ">r"
 582 |         lea -4(%ebp), %ebp
 583 |         pop (%ebp)
 584 |         jmp next
 585 | 
 586 | # Get the data stack pointer (before it gets pushed).
 587 |         defasm sp_at, "sp@"
 588 |         push %esp               # safe on 286 and later
 589 |         jmp next
 590 | 
 591 | # Set the data stack pointer.
 592 |         defasm sp_bang, "sp!"
 593 |         pop %esp
 594 |         jmp next
 595 |         
 596 | # Pop the stack.
 597 |         defasm drop, "pop"
 598 |         pop %eax
 599 |         jmp next
 600 |         
 601 | # Push a copy of TOS.
 602 | # eForth 1.0 used BX to index the stack here, for a couple of
 603 | # reasons: on the 8086, SP got decremented prior to the fetch,
 604 | # and also wasn't valid as a base or index register.
 605 |         defasm dup, "dup"
 606 |         pop %eax
 607 |         push %eax
 608 |         jmp pusheax
 609 |         
 610 | # Stack manipulation ( w1 w2 -- w1 w2 w1 )
 611 | # technically not necessary, but it's so easy and tiny
 612 |         defasm over, "over"
 613 |         push 4(%esp)           
 614 |         jmp next
 615 | 
 616 | # Swap top two stack items ("exch" in PostScript)
 617 |         defasm swap, "swap"
 618 |         pop %edx
 619 |         pop %eax
 620 | #       jmp pushedxeax       fall through because pushedxeax is next
 621 | 
 622 | # pusheax and pushedxeax: a prologue to 'next' that first pushes %edx
 623 | # and %eax, or just %eax.
 624 | 
 625 | # For a net savings of 13 bytes, last I checked, in all those
 626 | # primitives that finish up by pushing something!  Clever trick from
 627 | # F-83's 1PUSH and 2PUSH.
 628 | pushedxeax:     
 629 |         push %edx
 630 | pusheax:        
 631 |         push %eax
 632 |         # now we fall through to 'next'
 633 |                 
 634 | # "next" fetches the next bytecode and runs it.  It's placed
 635 | # here in the middle of the bytecode definitions so that more
 636 | # of them can use the short two-byte jump form to get to it.
 637 | 
 638 | next:
 639 |         xor %eax, %eax          # set %eax to 0
 640 |         xor %ebx, %ebx          # clear high half of %ebx
 641 |         lodsb                   # load %al from where %esi points
 642 |                                 # (%esi is the interpreter pointer)
 643 | execute_eax:
 644 |         ## load offset of new word into %ebx
 645 |         mov token_table(,%eax,2), %bx  # bx := token_table[eax * 2bytes]
 646 |         cmp $last_asm_bytecode, %eax
 647 |         jbe next_primitive      # if primitive, handle primitive word
 648 |         ## otherwise, handle a bytecode definition or "colon list"
 649 |         # save old %esi on return stack
 650 |         xchg %ebp, %esp
 651 |         push %esi
 652 |         xchg %ebp, %esp
 653 |         lea bytecode_start(%ebx), %esi
 654 |         jmp next
 655 | 
 656 | next_primitive:
 657 |         lea machine_code_primitives(%ebx), %ebx
 658 |         jmp *%ebx
 659 |         
 660 | 
 661 | # Push true if n negative. ( n -- f )
 662 |         defasm negative, "0<"
 663 |         pop %eax                
 664 |         cdq
 665 |         push %edx
 666 |         jmp next
 667 | 
 668 | # Bitwise operators:
 669 |         defasm and, "&"
 670 |         pop %eax
 671 |         pop %ebx
 672 |         and %ebx, %eax
 673 |         jmp pusheax
 674 | 
 675 |         defasm or, "|"
 676 |         pop %eax
 677 |         pop %ebx
 678 |         or %ebx, %eax
 679 |         jmp pusheax
 680 | 
 681 |         defasm xor, "^"
 682 |         pop %eax
 683 |         pop %ebx
 684 |         xor %ebx, %eax
 685 |         jmp pusheax
 686 |         
 687 | # add two unsigned numbers, returning sum and carry.
 688 | # ( u1 u2 -- u3 cy )
 689 |         defasm umplus, "um+"
 690 |         xor %eax, %eax
 691 |         pop %edx
 692 |         pop %ebx
 693 |         add %ebx, %edx
 694 |         rcl $1, %eax
 695 |         jmp pushedxeax
 696 | 
 697 | # Divide double-precision by single-precision, unsigned (?).
 698 | # UM/MOD from eForth.  ( udl udh un -- ur uq )
 699 |         defasm divmod, "/%"
 700 |         pop %ebx
 701 |         pop %edx
 702 |         pop %eax
 703 |         idiv %ebx
 704 |         jmp pushedxeax
 705 | 
 706 | # Multiply two single-precision numbers, giving a double-
 707 | # precision result. ( d1 d2 -- udl udh )
 708 |         defasm mmul, "um*"
 709 |         pop %eax
 710 |         pop %ebx
 711 |         imul %ebx
 712 |         push %eax
 713 |         push %edx
 714 |         jmp next
 715 | 
 716 | # Copy the top of the return stack onto the data stack.
 717 |         defasm r_at, "r@"
 718 |         push (%ebp)
 719 |         jmp next
 720 | 
 721 | # syscall5:   
 722 | # Linux system call with up to 5 arguments
 723 | # This is no longer the fashionable way to make system calls
 724 | # in Linux.  Now you're supposed to use SYSENTER on newer
 725 | # CPUs, and rather than have you figure out which one to use,
 726 | # the kernel mmaps a chunk of code called a VDSO into your
 727 | # memory space at a random address and tells you where to
 728 | # find it using the ELF auxiliary vector.  Then you're
 729 | # supposed to invoke the dynamic linker or something to parse
 730 | # the ELF executable mysteriously manifested in this way by
 731 | # the kernel, and then resolve an undefined symbol in libc
 732 | # into calls to it.  See "What is linux-gate.so.1?"
 733 | # http://www.trilithium.com/johan/2005/08/linux-gate/
 734 | # "The Linux kernel: System Calls" by Andries Brouwer, 2003-02-01
 735 | # http://www.win.tue.nl/%7Eaeb/linux/lk/lk-4.html
 736 | # "About ELF Auxiliary Vectors" by Manu Garg
 737 | # http://manugarg.googlepages.com/aboutelfauxiliaryvectors
 738 | 
 739 | # But the old int $0x80 approach still works, thank goodness,
 740 | # because all of that is *way* more than these ten
 741 | # instructions.
 742 |         defasm syscall5, "syscall5"
 743 |         pop %edi
 744 |         ## we have to save %esi for the interpreter
 745 |         mov %esi, -4(%ebp)
 746 |         pop %esi
 747 |         pop %edx
 748 |         pop %ecx
 749 |         pop %ebx
 750 |         pop %eax
 751 |         int $0x80
 752 |         mov -4(%ebp), %esi
 753 |         jmp pusheax
 754 | 
 755 |         last_asm_bytecode = b_syscall5
 756 | 
 757 | ### Basic Interpreted Words
 758 |         ## a macro for defining interpreted words
 759 |         ## Because after I left off b_exit once, I wasted a long
 760 |         ## time trying to figure out what was wrong, so I use this when I can:
 761 |         .macro def name, realname, bytes:vararg
 762 |         defbytes \name, "\realname"
 763 |         .byte \bytes
 764 |         .byte b_exit
 765 |         .endm
 766 |         ## Macros for conditional branch and loop:
 767 |         ## Because I am tired of tracking down bugs due to 
 768 |         ## getting the jump offsets wrong.
 769 |         .macro fif, target      # if, or end of while loop
 770 |         .byte b_branch_on_0, \target - . - 1
 771 |         .endm
 772 |         .macro floop, target    # do loop
 773 |         .byte b__loop, \target - . - 1
 774 |         .endm
 775 |         .macro felse, target    # else, unconditional jump
 776 |         .byte b_branch, \target - . - 1
 777 |         .endm
 778 | 
 779 |         .data 2                 # separate subsection from token table
 780 | bytecode_start:
 781 | # System call with three arguments.
 782 |         def syscall3, "syscall3", b_zero, b_zero, b_syscall5
 783 | # System call with one argument.
 784 |         def syscall1, "syscall1", b_zero, b_zero, b_syscall3
 785 |         def bye, "bye", b_dolit_s8,__NR_exit, b_zero, b_syscall1 # exit program
 786 |         def zero, "0", b_dolit_s8,0            # push 0
 787 |         def one, "1", b_dolit_s8,1
 788 |         
 789 | # This word outputs a string whose address and count are on 
 790 | # the stack.  ( b u -- )
 791 | 
 792 |         defbytes type, "type"
 793 |         .byte b_rpush, b_rpush  # move two args onto rstack
 794 |                                 # system call is __NR_write:    
 795 |         .byte b_dolit_s8,__NR_write
 796 |         .byte b_one             # push constant 1: stdout
 797 |         .byte b_rpop, b_rpop    # move two args back from rstack
 798 |         .byte b_syscall3        # call syscall with 3 args
 799 |         .byte b_drop            # discard return value
 800 |         .byte b_exit            # return
 801 | 
 802 | # The next few words exist just to poke string addresses
 803 | # and lengths onto the stack so "type" can print them.
 804 |         .macro def_counted_string name, contents
 805 |         defbytes \name, "\name"
 806 |         .byte b_dolit_32        # dolit_32 pushes a 32-bit
 807 |         .int string_\name       # literal --- an addr, here
 808 |                                 # now push literal length and return
 809 |         .byte b_count, b_exit
 810 |         .pushsection .rodata    # define the actual string:
 811 | string_\name:
 812 |         countedstring "\contents"
 813 |         .popsection
 814 |         .endm
 815 | 
 816 |         def_counted_string hello, "hello"
 817 |         def_counted_string world, "world"
 818 |         def_counted_string comma, ", "
 819 | 
 820 |         # convert a counted string in memory to an address and
 821 |         # count on the stack
 822 |         def count, "count", b_dup, b_add1, b_swap, b_c_at
 823 | 
 824 |         def cr, "cr", b_dolit_s8, '\n, b_emit
 825 | 
 826 | ### Some More Basic Words
 827 | 
 828 |         def neg1, "-1", b_dolit_s8, -1   # ( -- -1 )
 829 |         def add, "+", b_umplus, b_drop  # ( a b -- a+b ) drop the carry
 830 |         def sub1, "1-", b_neg1, b_add    # ( n -- n-1 )
 831 |         def rot, "rot", b_rpush, b_swap, b_rpop, b_swap # ( a b c -- b c a )
 832 |         def unrot, "-rot", b_rot, b_rot    # ( a b c -- c a b )
 833 |         def tuck, "tuck", b_dup, b_unrot    # ( a b -- b a b )
 834 | 
 835 | # emit: output a single byte.  eForth calls this "TX!".
 836 | 
 837 | # This version is 11 bytes, including the buffer byte, plus the 2-byte
 838 | # token table pointer. a machine-code version I wrote the other day
 839 | # was 28 bytes.  However, I also added rot, unrot, and tuck to support
 840 | # this function, and they total 11 bytes, plus 6 bytes of overhead.
 841 | # For a total of 11+2+11+6 = 30 bytes.  Not winning yet on size over
 842 | # x86 asm!  But we're getting close.
 843 | 
 844 | emit_buffer:
 845 |         .byte 0
 846 |         defbytes emit, "emit"
 847 |         .byte b_dolit_32
 848 |         .int emit_buffer
 849 |         .byte b_tuck            # save a copy of address for b_type
 850 |         .byte b_c_bang          # store into emit buffer
 851 |         .byte b_one, b_type, b_exit # output one-byte buffer
 852 | 
 853 | ### "u." prints out an unsigned number.
 854 | # I had a version of this in x86 machine code in 52 bytes (23
 855 | # instructions), essentially exactly the same code as here.
 856 | # This is 31 bytes, plus 6 bytes of overhead, plus I had 
 857 | # to define b_divmod (9 bytes plus 2 bytes overhead).  Now we are
 858 | # starting to win!
 859 | 
 860 |         defbytes udot, "u."            # print space after number
 861 |         .byte b_udot_nospc, b_dolit_s8, 0x20, b_emit, b_exit 
 862 |         defbytes udot_nospc, "(u.)"    # print number without space
 863 |         .byte b_dup
 864 |         fif 1f
 865 |         .byte b_udot_nonzero, b_exit
 866 | 1:      .byte b_drop, b_dolit_s8, '0, b_emit, b_exit
 867 |         defbytes udot_nonzero, "((u.))"
 868 |         .byte b_zero, b_dolit_s8,10, b_divmod # divide by 10
 869 |         .byte b_dup
 870 |         fif 2f                  # recurse if nonzero
 871 |         .byte b_udot_nonzero
 872 |         felse 3f
 873 | 2:      .byte b_drop            # drop zero quotient
 874 | 3:      .byte b_dolit_s8, '0, b_add, b_emit # print digit
 875 |         .byte b_exit
 876 |         
 877 | ### Add signed numeric output, ".".  This cost 20 bytes plus 8 bytes
 878 | # of overhead, but added some fundamental numeric operations; only 12
 879 | # of those 28 bytes are specific to "."
 880 | 
 881 |         # logical bitwise not
 882 |         def invert, "~", b_dolit_s8, -1, b_xor
 883 |         def add1, "1+", b_one, b_add
 884 |         # arithmetic negation
 885 |         def negate, "negate", b_invert, b_add1
 886 |         # print signed number
 887 |         defbytes dot, "."
 888 |         .byte b_dup, b_negative
 889 |         fif 1f
 890 |         .byte b_dolit_s8, '-, b_emit, b_negate # in the negative case
 891 | 1:      .byte b_udot, b_exit
 892 | 
 893 | ### Obviously the next thing to do is to add ".S", print the
 894 | # stack, so that I can stop having to investigate problems by
 895 | # using gdb.
 896 | 
 897 | # The bytecode for this consumed 78 bytes in 12 words, plus a
 898 | # new 8-byte primitive (mmul) and a new 14-byte primitive
 899 | # (_loop), plus six bytes in the initialization routine, for 28
 900 | # bytes of overhead and a total of 78+28+14+8+6 = 134 bytes.
 901 | # This is definitely not a size win over machine code!  Machine
 902 | # code would only be 22 bytes in 7 instructions, if there were a
 903 | # way to just CALL the "." routine from machine code, which
 904 | # there isn't.
 905 |         
 906 | # However, the words added were  cells * - / cellsize nip (do)
 907 | # (loop) 2dup  which are all generally useful, and  
 908 | # depth pick .s  which are more special-purpose.
 909 | # The special-purpose words are 35 out of those 78 bytes.  
 910 | 
 911 | # PRINTSTACK itself is only 15 bytes, and there's hope that the
 912 | # 6 bytes of PICK and the 14 bytes of DEPTH will be useful in
 913 | # other debugging routines.
 914 | 
 915 | # I'm not happy with (do) and (loop), only because (do)
 916 | # implements dpANS DO, not dpANS ?DO, so it loops many times
 917 | # when it should loop zero times;
 918 | 
 919 |         def_counted_string scolon, "s: "
 920 |         defbytes printstack, ".s"
 921 |         .byte b_scolon, b_type
 922 |         .byte b_depth, b_zero   # loop limits
 923 |         .byte b_twodup, b_xor
 924 |         fif 1f                  # skip loop if stack empty
 925 |         .byte b__do
 926 | 2:      .byte b_i, b_pick, b_dot # DO I PICK . LOOP
 927 |         floop 2b
 928 | 1:      .byte b_cr, b_exit
 929 |         def pick, "pick", b_add1, b_cells, b_sp_at, b_add, b_at
 930 |         def cells, "cells", b_cellsize, b_mul
 931 |         def mul, "*", b_mmul, b_drop # drop upper 32 bits of multiplication result
 932 | bottom_of_stack:
 933 |         .int 0
 934 |         defbytes depth, "depth"
 935 |         .byte b_sp_at, b_dolit_32
 936 |         .int bottom_of_stack
 937 |         .byte b_at, b_swap, b_sub, b_zero, b_cellsize, b_div, b_exit
 938 | 
 939 |         def sub, "-", b_negate, b_add # subtract ( a b -- a-b )
 940 |         def div, "/", b_divmod, b_nip # int divide ( ul uh n -- quotient )
 941 |         def cellsize, "cellsize", b_dolit_s8,4   # ( -- 4 )
 942 |         def nip, "nip", b_swap, b_drop  # stack manipulation ( a b -- b )
 943 | 
 944 |         # 10 0 DO ... LOOP loops 0, 1...9.
 945 |         # _do sets up return stack for _loop
 946 |         # similar to F83: ( limit initial -- ) ( R: X -- X initial-limit limit )
 947 |         defbytes _do, "(do)"
 948 |         .byte b_over, b_sub,  b_swap,  b_rpop,  b_swap, b_rpush
 949 |         .byte b_swap, b_rpush,  b_rpush,  b_exit
 950 |         ##  return loop counter
 951 |         def i, "i", b_rpop, b_rpop, b_r_at, b_over, b_rpush, b_add, b_swap, b_rpush
 952 | 
 953 |         def twodup, "2dup", b_over, b_over
 954 |         def twodrop, "2drop", b_drop, b_drop
 955 | 
 956 | # Now some stuff for dealing with the dictionary.
 957 | 
 958 | # This stuff was from 804930a to 8049396, 140 bytes.  In that we got:
 959 | # - new words: dict dictp dictsize nextword pastdict? words < >= 0=
 960 | #   cbcmp c@+ bcmp memcmp unloop find r2@ 2swap
 961 | # - less concretely:
 962 | #   - the ability to list of words in the dictionary;
 963 | #   - the ability to find words in the dictionary;
 964 | #   - <, >=, and 0= numerical comparisons;
 965 | #   - cbcmp, bcmp, and memcmp memory manipulations;
 966 | #   - 2swap and r2@ stack manipulations;
 967 | #   - unloop loop control;
 968 | #   - c@+ for iterating over memory.
 969 | # That's 17 new words, averaging 8.2 bytecodes each.
 970 | 
 971 | dictionary_pointer:             
 972 |         .int end_of_dictionary
 973 |         defbytes dict, "dict"
 974 |         .byte b_dolit_32
 975 |         .int dictionary
 976 |         .byte b_exit
 977 |         defbytes dictp, "dictp"
 978 |         .byte b_dolit_32
 979 |         .int dictionary_pointer
 980 |         .byte b_exit
 981 |         def dictsize, "dictsize", b_dictp, b_at, b_dict, b_sub
 982 |         def nextword, "nextword", b_dup, b_c_at, b_add, b_add1
 983 |         def pastdict, "pastdict?", b_dictp, b_at, b_ge
 984 |         defbytes words, "words"
 985 |         .byte b_dict
 986 | 1:      .byte b_dup, b_count, b_type, b_dolit_s8,32, b_emit
 987 |         .byte b_nextword
 988 |         .byte b_dup, b_pastdict
 989 |         fif 1b
 990 |         .byte b_exit
 991 |         def lt, "<", b_sub, b_negative
 992 |         def ge, ">=", b_lt, b_zeq
 993 |         # logical not: return true for 0, false (0) otherwise
 994 |         defbytes zeq, "0="
 995 |         fif 1f
 996 |         .byte b_zero, b_exit
 997 | 1:      .byte b_neg1, b_exit
 998 | 
 999 |         # To find a word in the dictionary:
1000 |         # - move the word onto the return stack, and get the dictionary pointer
1001 |         # - then loop:
1002 |         #   - see if the word is at the current place
1003 |         #   - if so, clean up and return that place
1004 |         #   - otherwise, go to the next word
1005 |         #   - and repeat if we're still in the dictionary
1006 |         # - then clean up the stacks and return 0
1007 |         # Tells whether a counted string equals an address-and-count string.
1008 |         # 0 for equal, nonzero for unequal.
1009 |         # ( c-addr1 c-addr2 u -- n )
1010 |         def cbcmp, "cbcmp", b_rot, b_count, b_twoswap, b_bcmp
1011 |         # like F21 @A+: ( c-addr -- c-addr+1 char )
1012 |         def c_at_inc, "c@+", b_dup, b_add1, b_swap, b_c_at
1013 |         # Keep in mind memcmp() in libc is only 30 bytes long.
1014 |         # This bcmp is a little different from C memcmp or bcmp in
1015 |         # that it compares two lengths.
1016 |         defbytes bcmp, "bcmp" # ( c-addr1 u1 c-addr2 u2 -- n )
1017 |         .byte b_rot, b_over, b_xor
1018 |         fif 3f
1019 |         .byte b_twodrop, b_drop, b_one, b_exit
1020 | 3:      .byte b_memcmp, b_exit
1021 |         defbytes memcmp, "memcmp"  # ( c-addr1 c-addr2 u -- n )
1022 |         .byte b_zero, b__do
1023 | 2:      .byte b_c_at_inc, b_rot, b_c_at_inc, b_rot
1024 |         .byte b_sub, b_dup # - dup if
1025 |         fif 1f
1026 |         .byte b_unrot, b_twodrop, b_unloop, b_exit
1027 | 1:      .byte b_drop, b_swap
1028 |         floop 2b
1029 |         .byte b_twodrop, b_zero, b_exit
1030 |         # this should probably go with the other do loop stuff
1031 |         def unloop, "unloop", b_rpop, b_rpop, b_rpop, b_twodrop, b_rpush
1032 | 
1033 |         # FIND: ( c-addr u -- token 1 | 0 )
1034 |         
1035 |         # 30 bytes; jonesforth 42's asm version is 56 bytes.  It's
1036 |         # fairly directly comparable, although jonesforth's FIND has
1037 |         # to do bit-masking and includes its own inline NEXTWORD and
1038 |         # CBCMP, which are actually fairly large here.
1039 |         
1040 |         defbytes find, "find"
1041 |         .byte b_rpush, b_rpush, b_zero, b_dict
1042 | 1:      .byte b_dup, b_r_2at, b_cbcmp, b_zeq            # start loop
1043 |         fif 2f                  # bcmp 0= if
1044 |         .byte b_rpop, b_rpop, b_twodrop, b_drop, b_one, b_exit
1045 | 2:      .byte b_swap, b_add1, b_swap, b_nextword, b_dup, b_pastdict
1046 |         fif 1b
1047 |         .byte b_rpop, b_rpop, b_twodrop, b_twodrop, b_zero, b_exit
1048 | 
1049 |         # copy two cells from return stack
1050 |         defbytes r_2at, "r2@"
1051 |         .byte b_rpop, b_rpop, b_r_at, b_over, b_rpush, b_rot, b_rpush, b_exit
1052 |         # ( a b c d -- c d a b )
1053 |         def twoswap, "2swap", b_rpush, b_unrot, b_rpop, b_unrot
1054 | 
1055 |         .macro create, name
1056 |         defbytes \name, "\name"
1057 |         .byte b_dolit_32
1058 |         .int 1f
1059 |         .byte b_exit
1060 | 1:      
1061 |         .endm
1062 |         
1063 |         create "tib"
1064 |         _tibmax = 80
1065 |         .space _tibmax
1066 |         defbytes tibmax, "tibmax"
1067 |         .byte b_dolit_32
1068 |         .int _tibmax
1069 |         .byte b_exit
1070 |         create "tibsize"
1071 |         .int 0
1072 | 
1073 |         defbytes fgets, "fgets"
1074 |         .byte b_tib, b_tibmax, b_read, b_tibsize, b_bang # XXX handle errors
1075 |         .byte b_tib, b_tibsize, b_at, b_exit
1076 |         def gets, "gets", b_zero, b_fgets
1077 | 
1078 |         defbytes read, "read"
1079 |         .byte b_rpush, b_rpush, b_dolit_s8,__NR_read
1080 |         .byte b_zero             # fd 0: stdin
1081 |         .byte b_rpop, b_rpop, b_syscall3, b_exit
1082 | 
1083 |         def bl, "bl", b_dolit_s8,32 # space, blank
1084 | 
1085 |         # parse parses out a token of input from a string and leaves
1086 |         # the token's address and length atop the stack
1087 |         # ( c-addr u -- c-addr+n u-n c-addr2 u2 )
1088 |         defbytes parse, "parse"
1089 |         .byte b_skipwhitespace
1090 |         ##  XXX finish him!
1091 |         .byte b_exit
1092 | 
1093 |         defbytes skipwhitespace, "-wsp"
1094 | 1:      .byte b_dup, b_zeq
1095 |         fif 3f                  # return empty tail
1096 |         .byte b_exit
1097 | 3:      .byte b_sub1, b_swap, b_c_at_inc, b_whitespace
1098 |         fif 2f                  # escape loop if not whitespace
1099 |         .byte b_swap
1100 |         felse 1b
1101 | 2:      .byte b_sub1, b_swap, b_add1, b_exit
1102 | 
1103 |         ## costs 9+5 bytes
1104 |         def_counted_string wsps, " \n\t"
1105 |         defbytes whitespace, "wsp"
1106 |         .byte b_dup, b_bl, b_xor
1107 |         fif 1f
1108 |         .byte b_dup, b_dolit_s8,'\n, b_xor
1109 |         fif 1f
1110 |         .byte b_dup, b_dolit_s8,'\t, b_xor
1111 |         fif 1f
1112 |         .byte b_drop, b_zero, b_exit
1113 | 1:      .byte b_drop, b_one, b_exit
1114 | 
1115 | #         def repl, "repl", b_gets, b_eval, b_exit
1116 | 
1117 | #         defbytes eval, "eval"
1118 | # 2:      .byte b_parse
1119 | #         .byte b_dup, b_zeq
1120 | #         fif 3f                  # escape from loop
1121 | #         .byte b_rot, b_rpush, b_rot, b_rpush, b_find
1122 | #         fif 1f
1123 | #         .byte b_execute
1124 | # 1:      .byte b_rpop, b_rpop
1125 | #         felse 2b
1126 | # 3:      .byte b_2drop, b_2drop, b_exit
1127 | 
1128 |         .data 3
1129 | instructions:
1130 |         # And here is the actual "main program" in that bytecode.
1131 |         .byte b_sp_at, b_dolit_32
1132 |         .int bottom_of_stack    # variable to remember initial stack bottom
1133 |         .byte b_bang            # initialize that variable
1134 |         .byte b_hello           # string "hello" and count
1135 |         .byte b_sub1            # subtract 1 from count: "hell"
1136 |         .byte b_type            # spit it out
1137 |         .byte b_comma, b_type, b_world, b_type # ", world"
1138 |         .byte b_comma, b_type, b_hello, b_type, b_cr
1139 |         # test the "dot" command to print out numbers
1140 |         .byte b_dolit_s8, -120, b_dot
1141 |         # test positive numbers and "depth" command
1142 |         .byte b_dolit_s8, 104, b_depth, b_dot, b_dot, b_cr
1143 |         # test printstack
1144 |         .byte b_dolit_s8,100, b_dolit_s8,101, b_dolit_s8,102, b_printstack
1145 |         # test dictsize
1146 |         .byte b_dictsize, b_dot, b_cr
1147 |         .byte b_words, b_cr
1148 |         .byte b_hello, b_twodup, b_type, b_find, b_printstack
1149 |         .byte b_comma, b_twodup, b_type, b_find, b_printstack
1150 |         .byte b_dict, b_dot, b_cr
1151 |         .byte b_dolit_s8, '?, b_emit, b_bl, b_emit, b_gets, b_twodup
1152 |         .byte b_zero, b__do
1153 | 1:      .byte b_c_at_inc, b_whitespace, b_dot
1154 |         floop 1b
1155 |         .byte b_drop
1156 |         .byte b_skipwhitespace, b_type
1157 |         .byte b_bye
1158 | 
1159 |         # At end of the assembly program, we initialize the
1160 |         # end_of_dictionary pointer by putting it at the end of the
1161 |         # assembled .dictionary section:
1162 |         .section .dictionary
1163 | end_of_dictionary:      
1164 | 


--------------------------------------------------------------------------------