├── .gdbinit ├── .gitignore ├── ChangeLog ├── Makefile ├── README.md └── tokthr.S /.gdbinit: -------------------------------------------------------------------------------- 1 | b *0x08048075 2 | display/i $pc 3 | display/a ((void**)$esp)[2] 4 | display/a ((void**)$esp)[1] 5 | display/a ((void**)$esp)[0] 6 | display/a $esi 7 | display/a $eax 8 | 9 | define rstack 10 | p/a ((void**)$ebp)[0]@3 11 | end 12 | document rstack 13 | Prints the top three entries of the return stack, 14 | topmost first. 15 | end 16 | 17 | define fb 18 | b execute_eax if $esi == &$arg0 19 | end 20 | document fb 21 | Sets a breakpoint to stop the interpreter when it enters a given Forth word. 22 | end -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | tokthr 2 | tokthr-disassembly 3 | tokthr-symbols 4 | -------------------------------------------------------------------------------- /ChangeLog: -------------------------------------------------------------------------------- 1 | 2007-11-16 Kragen Javier Sitaker 2 | 3 | * Changelog: added. 4 | 5 | * tokthr2.S: added read, gets, fgets, tib, tibmax, tibsize, asm 6 | macro 'create'. Now we can read input. Added fif, floop, felse 7 | macros to calculate jump offsets for me; no more miscalculated 8 | jump offsets yielding puzzling bugs. 9 | 10 | * Makefile: somewhere along the line added this. 11 | 12 | 2007-11-15 Kragen Javier Sitaker 13 | 14 | * tokthr2.S: Bumped version from 2 to 3. Added run-time 15 | dictionary of names. Moved "instructions" (main program) to end 16 | of file. For dictionary handling, added count, cr, words, dict, 17 | dictp, dictsize, nextwords, pastdict?, <, >=, 0=, cbcmp, c@+, 18 | bcmp, memcmp, unloop, find, r2@, 2swap. Added "s: " label to .s 19 | output. 20 | 21 | 2007-11-12..14 Kragen Javier Sitaker 22 | 23 | * tokthr2.S: Added "version 2", many comments. Wrote 24 | define_bytecode, defasm, defbytes macros so I can stop writing 25 | bytecode numerically. Shortened tail end of primitives with 26 | pusheax and pushedxeax. Changed execute to execute a token. 27 | Implemented DO LOOP. Removed direct threading, switched to a 28 | variant of "bit threading". Added make_counted_string macro and 29 | words that will later be called -1 + 1- rot -rot tuck emit u. (u.) 30 | ((u.)) ~ 1+ negate . printstack pick cells * depth - / cellsize 31 | nip i 2dup. 32 | 33 | * .gdbinit: somewhere along the line added this. 34 | 35 | 2007-11-11 Kragen Javier Sitaker 36 | 37 | tokthr.S: Tried out various things to see if they could shrink the 38 | machine code. 39 | 40 | 2007-11-10 Kragen Javier Sitaker 41 | 42 | * tokthr.S: Have most of the basic primitives and have written a 43 | little bit of bytecode, but nothing that does anything 44 | interesting. 45 | 46 | 2007-11-09 Kragen Javier Sitaker 47 | 48 | * tokthr.S: Prototyped four-instruction token-threaded "next" 49 | routine and wrote hello-world program with it. -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | all = tokthr tokthr-symbols tokthr-disassembly 2 | all: $(all) 3 | clean: 4 | rm -f $(all) *~ .*~ 5 | tokthr: tokthr.S 6 | gcc -m32 -nostdlib -static -o $@ $< 7 | tokthr-symbols: tokthr 8 | (echo '-*- auto-revert -*-'; nm $<) > $@ 9 | tokthr-disassembly: tokthr 10 | (echo '-*- auto-revert -*-'; objdump -D $<) > $@ 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | tokthr 2 | ====== 3 | 4 | > That’s the longest “hello world” I’ve *ever seen*! -- [Bob Appleyard] [0] 5 | 6 | This is a tiny token-threaded Forth interpreter that I started in 2007 7 | and haven’t finished yet. It’s about 1000 lines of x86 assembly, 8 | using gas’s macro capabilities, and it compiles to about 1400 bytes of 9 | executable, 399 of which are x86 code. (The rest is bytecode.) 10 | 11 | I don’t have a good name for it yet, and I don’t know if I’ll ever 12 | finish it. It’s in the public domain. 13 | 14 | [0]: http://www.reddit.com/r/programming/comments/8vj71/small_is_beautiful/c0aki5s "Comment on Reddit when someone pointed to an earlier version of this as an exemplar of smallness" 15 | -------------------------------------------------------------------------------- /tokthr.S: -------------------------------------------------------------------------------- 1 | # Simple token-threaded "Forth" interpreter. -*- asm -*- 2 | # Version 3. 3 | # by Kragen Javier Sitaker; dedicated to the public domain, 4 | # i.e. I relinquish whatever exclusive rights copyright law 5 | # gives me with regard to this work. 6 | # Major parts taken from Richard W.M. Jones's public-domain 7 | # JONESFORTH 42 by Richard W.M. Jones 8 | # http://annexia.org/forth 9 | 10 | # As of version 2, this program just outputs the following under 11 | # Linux: 12 | # hell, world, hello 13 | # -120 1 104 14 | # s: 102 101 100 15 | # 397 16 | # lit8 lit ret execute (if) (else) (loop) ! @ c! c@ rp@ rp! r> >r sp@ sp! pop dup over swap 0< & | ^ um+ /% um* r@ syscall5 syscall3 syscall1 bye 0 1 type hello world comma count cr -1 + 1- rot -rot tuck emit u. (u.) ((u.)) ~ 1+ negate . scolon .s pick cells * depth - / cellsize nip (do) i 2dup 2drop dict dictp dictsize nextword pastdict? words < >= 0= cbcmp c@+ bcmp memcmp unloop find r2@ 2swap 17 | # hello134517863 18 | # , 0 19 | # 134517710 20 | 21 | # to compile: 22 | # gcc -m32 -nostdlib -static -o tokthr tokthr.S 23 | 24 | 25 | ### Why Small Things are Interesting 26 | 27 | # There are still a lot of computers out there that have tens of 28 | # kilobytes of memory or less, and they cost a lot less than, 29 | # say, a cellphone. Cellphones are apparently still too 30 | # expensive for half the world's population. I want to see how 31 | # close I can get to having a comfortable programming 32 | # environment in a smaller device. 33 | 34 | # Some smallish microcontroller chips from five different 35 | # manufacturers, with current Digi-Key prices: 36 | # Name bytes RAM bytes ROM MHz price 37 | # ATtiny2313 128 2048 20 US$1.38 38 | # ATMega48-20AU 512 4096 20 US$1.62 39 | # MSP430F1111AIPW 128 2264 8 US$2.43 40 | # LPC2101 2048 8192 70 US$2.52 41 | # H8/300H Tiny 1536 8192 12 US$3.58 42 | # M16C/R8C/Tiny/1B 1024 16384 12 US$3.54 43 | # SX28AC/SS 136 3072 50 US$2.79 44 | 45 | # There are essentially no 386-compatible devices in this price 46 | # range as far as I can tell; for why I'm not worried about 47 | # that, see the section "How Small This Is". 48 | 49 | # More practically and short-termly, small projects can take 50 | # less time to finish, and I feel like I need to learn about 51 | # different approaches to implementing programming languages. 52 | 53 | ### Why This is Small 54 | 55 | # The normal Forth representation of a function is as an array 56 | # of pointers to the other functions it calls, in sequence; a 57 | # few of those other functions may move the interpreter pointer 58 | # around in that array, or snarf up a constant that's stored in 59 | # the array, or stuff like that, but for the most part, the 60 | # functions just get called in sequence. This is called 61 | # "threaded code" and it's fairly compact, especially on 16-bit 62 | # systems where the pointers are only two bytes. 63 | 64 | # A traditional approach taken by Forth implementations to 65 | # reduce code size even further is called "token threading". 66 | # Rather than making arrays of 16-bit or 32-bit pointers, they 67 | # make lists of 8-bit indices into an array of pointers. This 68 | # has two advantages: 69 | 70 | # 1. the indices are one fourth the size of a list of 32-bit 71 | # pointers; 72 | # 2. it is possible to save these lists of indices somewhere 73 | # outside of memory and continue to use them even after 74 | # making some changes to the code, as long as the same 75 | # indices in the table have the same meanings. So, for 76 | # example, you could write some boot firmware in this 77 | # "bytecode". 78 | 79 | # It also has some disadvantages: 80 | # 1. You run out of space in the table. Even a fairly minimal 81 | # full Forth system contains close to 256 subroutines. You 82 | # can mitigate this by packing, say, two 12-bit pointers 83 | # every three bytes, or maybe by having a special bytecode 84 | # that looks up the next byte in an extended table. 85 | # 2. It's slower and makes the machine-code part of the program 86 | # take more space. The traditional LODSW; JMP AX version of 87 | # $NEXT from the eForth Model, which fetches and executes the 88 | # next execution token in the threaded list, is three bytes 89 | # and two instructions; my 'next' here is 41 bytes and 14 90 | # instructions, which is big enough that I jump to it (2 91 | # bytes) rather than making an assembler macro. Which blows 92 | # your branch target buffers to pieces. Oh well. The 93 | # performance penalty is probably two orders of magnitude 94 | # over native code, but I haven't measured it yet. I 95 | # measured an earlier version on my 700MHz PIII laptop on an 96 | # empty loop at about a factor of 3.5 over simple 97 | # direct-threading, which in turn is on the order of 10 times 98 | # slower than machine code. 99 | 100 | # Anyway, so this is an example program built using this 101 | # technique. It implements two Forthlike stacks and interpreted 102 | # subroutines, but not yet the ability to define new subroutines 103 | # at run-time. 104 | 105 | ### What's Here 106 | 107 | # I've implemented all of the primitives from C. H. Ting and 108 | # Bill Muench's public-domain (?) eForth Model 1.0, except for 109 | # the following: 110 | # - I haven't implemented their lowercase "next" (as in FOR 111 | # NEXT) because I think it's a bad idea, it's complex, and it 112 | # can be implemented at a higher level if you really need it; 113 | # instead, I implemented (loop). 114 | # - I didn't implement !IO because it's a no-op in this context; 115 | # - I haven't yet implemented ?RX, although I think it's 116 | # possible to implement it on top of syscall5, using select(); 117 | # Additionally, I implemented multiply and divide primitives. 118 | 119 | # However, some of it is untested and therefore probably broken. 120 | # Procedure call and return and the system calls do work. 121 | 122 | # Currently registers are used as follows: 123 | # %esi --- interpreter pointer; points to next byte to execute 124 | # %ebp --- return stack pointer; points to last thing pushed. This stack, 125 | # like the other one, grows downwards. 126 | # %esp --- data stack pointer; points to last thing pushed. This is 127 | # the processor's standard stack pointer; "push" and "pop" 128 | # instructions use it, which makes assembly code to use it 129 | # quite concise. The Intel "call" and "ret" instructions 130 | # would also use this stack, but they aren't used in this 131 | # program. 132 | # flags --- the "down" direction flag must be cleared. 133 | 134 | # It's probably missing a couple of primitives needed because of 135 | # the token-threading implementation strategy; the address of 136 | # the token table probably needs to be knowable, at least. 137 | 138 | # Direct and indirect threading, the normal Forth approaches to 139 | # allowing unrestricted coexistence of words written in assembly 140 | # language and interpreted Forth, both had heavy space costs 141 | # here --- close to 100% for the bytecode currently in the 142 | # system. So the inner interpreter checks, for each bytecode, 143 | # whether it is in the range of bytecodes whose interpretations 144 | # are in native code, and picks the relevant code path. This 145 | # avoids consuming any space per-word for this distinction, but with 146 | # what appears to be a heavy performance cost. 147 | 148 | # This is similar to an approach called "bit threading"; as 149 | # explained by Jeff Fox in a comp.lang.forth message 150 | # 2007-05-13, on thread "Cases where Forth seems a little 151 | # clunky": 152 | 153 | # I have seen hardware and software implemenations of 154 | # bit-threading where the msb of the address space 155 | # selects between threaded code address lists and 156 | # addresses of CODE subroutines. In both cases 0 is a 157 | # valid address and negative addresses are valid. I think 158 | # this applied to Novix. 159 | 160 | # Except that this approach uses a fraction of a bit for most 161 | # tokens instead of a whole bit. 162 | 163 | ### How Small This Is 164 | 165 | # As a point of comparison, eForth 1.0's machine-code part seems 166 | # to be 171 instructions and 399 bytes, including some data 167 | # that's mixed in there with it. 168 | 169 | # As of version 3, the machine-code part of this interpreter is 170 | # 239 bytes in 129 machine-code instructions, plus 19 bytes of 171 | # read-only data, 172 bytes of token table for the 86 172 | # currently-defined words (30 of which are primitive), 397 173 | # bytes of their names, and another 370 bytes of bytecode in 174 | # the 56 bytecode words (including a couple of data values 175 | # scattered around), and then another 56 bytes in the main 176 | # program, for a total of 1253 bytes. So the program is 177 | # already less than half written in assembly, in terms of 178 | # object-code size. 179 | 180 | # As of version 3, the machine-code part of this program uses 181 | # only 25 different instructions: cld; jmp, jz, jnc, jbe; push, 182 | # pop, lodsb, lodsl, xchg, mov; cmp, movsbl, inc, and, xor, or, 183 | # lea, cdq, rcl, add, idiv, imul; and int. Interestingly, many 184 | # of them are only used once. 185 | 186 | # Just before version 3, the non-comment lines were 14% 187 | # assembler macros, 51% assembly language, and 35% bytecode. 188 | 189 | # From Brad Rodriguez's January/February 1993 Computer Journal 190 | # article, "Moving Forth: Part 1: Design Decisions in the Forth 191 | # Kernel": 192 | # You can expect the usual 16-bit Forth kernel (see below) 193 | # to occupy about 8K bytes of program space. For a full 194 | # kernel that can compile Forth definitions, you should 195 | # allow a minimum of 1K byte of RAM. 196 | 197 | # I'm pretty sure I can beat the 8K requirement by quite a bit 198 | # and still be able to compile Forth definitions --- I'm hoping 199 | # for a factor of 4. Consider, from Jeff Fox's "Thoughtful 200 | # Programming", chapter 3: http://www.ultratechnology.com/forth3.htm 201 | 202 | # People assume that since Chuck has refined his Forth down to 203 | # about a 1K object that this means he has just stripped his 204 | # Forth down to a 1K kernel that will boot like in the old days 205 | # and that he is going to compile a complete Forth system on top 206 | # of the 1K before he starts an application. This is wrong. The 207 | # complete Forth system is 1K, and the reason for that is 208 | # maximize Chuck's productivity. What stops people from doing 209 | # what they need to do to solve a problem is all the time spend 210 | # solving all the related sub-problems that pop up as a result 211 | # of complex interconnections between components. To maximize 212 | # his productivity Chuck minimizes the number of these side 213 | # problems that pop up. Keep it simple, and don't get to where 214 | # you are spending 90% or 99% or you time dealing with related 215 | # sub-problems. Avoid unsolvable problems, don't waste your time 216 | # trying to solve them. 217 | 218 | # Consider also this quote from Elizabeth Rather: 219 | 220 | # As you have seen, so much depends on the specific 221 | # machine architecture. We implemented a TTC [token 222 | # threaded] Forth on some low-end AVR processors with 223 | # very limited code space, and it ran faster than a 224 | # native-code 8051 at comparable clock speed. 225 | 226 | # (2007-06-23 comp.lang.forth post on thread "Build your own 227 | # Forth for Microchip PIC (Episode 838): Threading") 228 | # http://objectmix.com/forth/168105-build-your-own-forth-microchip-pic-episode-838-threading.html 229 | 230 | # Consider also this quote from the abstract of Frank 231 | # Sergeant's 1991 "A 3-Instruction Forth for Embedded Systems 232 | # Work: Illustrated on the Motorola MC68HC11": 233 | 234 | # A 3- instruction Forth makes Forth affordable for 235 | # target systems with very limited memory. It can be 236 | # brought up quickly on strange new hardware. You don't 237 | # have to do without Forth because of memory or time 238 | # limitations. It only takes 66 bytes for the Motorola 239 | # MC68HC11. Full source is provided. . . . The absolute 240 | # minimum the target must do, it seems to me, is fetch a 241 | # byte, store a byte, and call a subroutine. 242 | 243 | # http://pygmy.utoh.org/3ins4th.html 244 | 245 | # As I said before, there are no small, cheap 386s, and so of 246 | # course the code size of this version is an approximation, and 247 | # it won't be a simple recompile to port it to one of these 248 | # other architectures; but 129 instructions' worth of assembly 249 | # are probably not that big a deal to rewrite for a new 250 | # platform. (I'll probably want to write multiply and divide 251 | # routines, though.) 252 | 253 | ### Other Things I Tried 254 | 255 | # I tried switching to caching the top of the data stack in a 256 | # register, on the theory that it would shorten things like 257 | # 'and'. Currently 'and' is pop %eax; pop %ebx; and %ebx, 258 | # %eax; push %eax; jmp next. If top of stack is cached in %eax 259 | # instead of being stored in memory, this becomes pop %ebx; and 260 | # %ebx, %eax; jmp next, which is considerably shorter. 261 | # However, most things don't change, and other things become 262 | # longer due to the extra work to save top-of-stack. I tried 263 | # using both %ebx and %eax as the top-of-stack cache. 264 | 265 | # In the version using %ebx as top-of-stack, the total size of 266 | # the machine-code part was 216 bytes, 115 instructions, 267 | # compared to 197 bytes, 112 instructions for the version using 268 | # the current strategy. In the version using %eax as 269 | # top-of-stack, it was only 215 bytes, but that's still worse 270 | # than 197. 271 | 272 | # In previous versions, all routines were machine-code routines 273 | # that you could just jmp to. High-level bytecode words began 274 | # with "call dolist", which took the saved %eip off the stack 275 | # and stuck it in %esi. Unfortunately, that added 5 bytes to 276 | # each bytecode word; in version 2, the bytecode region is 221 277 | # bytes and contains 36 word definitions (the 30 machine-code 278 | # primitives aren't defined there) --- so 5 bytes each would 279 | # have been 180 bytes of overhead, or 82%! It also would have 280 | # required a region to be both executable and writable to 281 | # support run-time routine definition, which is kind of a pain 282 | # thse days, and also on Harvard-architecture microcontrollers. 283 | 284 | # In previous versions, the token-table entries were 32 bits 285 | # each (instead of 16 bits as they are now), which added 286 | # another 2 bytes of overhead per word. In version 2, there 287 | # are currently 66 words, so that's another 132 bytes of shaved 288 | # overhead. 289 | 290 | ### Performance (Speed) 291 | 292 | # To version 2, I added a simple program to print out a 293 | # badly-formatted 8-bit extended ASCII table; it was 11 294 | # bytecode operations long. It executed 81615 bytecodes in all 295 | # (according to gdb). On my 700MHz PIII-Coppermine laptop from 296 | # 1999, 'time' reports CPU times varying from 4-12 297 | # milliseconds. So it seems like it can execute about 10 000 298 | # bytecodes in a millisecond, or about 10 million bytecodes in 299 | # a second --- about 100ns per bytecode. 300 | 301 | # It also made 1459 system calls. 302 | 303 | # That's slightly faster than Python's bytecode engine (maybe 304 | # --- probably within measurement error --- and anyway Python's 305 | # bytecodes are larger-grained), and an order of magnitude 306 | # slower than simple direct-threading, and about three times 307 | # slower than direct-threading with a simple bytecode 308 | # indirection layer. 309 | 310 | # I would be surprised if version 3 had any measurable change 311 | # in speed from version 2. 312 | 313 | # To version 3, I added a little loop to repeatedly search the 314 | # 86-word wordlist for ", ", which isn't there. I was able to 315 | # do this search 10 000 times in 4.59 CPU seconds, which is 316 | # 5.34 microseconds per comparison, or about 3700 CPU cycles. 317 | # If we eventually have 300 words in the wordlist, each search 318 | # will therefore take 1.6 milliseconds in the worst case, so 319 | # the system won't be able to compile or interpret more than 320 | # about 50-100 lines of code per second. It should probably be 321 | # pretty easy to fix this particular performance problem 322 | # (recode wordlist search in asm, restructure, or whatever) if 323 | # it comes up, but it's a bit worrisome... 324 | 325 | ### What's Wrong With This Program 326 | 327 | # - It's a long way from doing anything useful. 328 | # - There's 21 instructions of unused code which may be broken. 329 | # In Version 3, these words are in use and so probably work: 330 | # hello, sub1, type, comma, world, newline, dolit_s8, dot, 331 | # bye, exit, branch_on_0, c_bang, drop, dup, swap, negative, 332 | # umplus, divmod, syscall5, syscall3, zero, syscall1, rpop, 333 | # rpush, one, dolit_32, neg1, add, emit, tuck, udot, 334 | # udot_nospc, udot_nonzero, branch, invert, add1, negate, 335 | # xor, printstack, cells, mul, depth, div, sub, cellsize, 336 | # nip, pick, twodup, i, _do, r_at, over, at, sp_at, bang, 337 | # twoswap, r_2at, find, unloop, memcmp, bcmp, c_at_inc, 338 | # cbcmp, zeq, ge, lt, words, pastdict, nextword, dictsize, 339 | # dictp, dict, c_at. These are not tested, and therefore may be 340 | # broken: execute, rp_at, rp_bang, sp_bang, and, or. 341 | # - There's no dictionary structure yet. 342 | # - It probably needs another couple of primitives. 343 | # - There's no checking for stack overflow or underflow, but 344 | # they will break things. 345 | # - It's slow; see above about performance. 346 | 347 | ### The Beginning of the Program 348 | 349 | # .. include system library header so we know __NR_exit = 1 and 350 | # __NR_write = 4 351 | #include 352 | 353 | ### The token table and dictionary 354 | 355 | # To save space, we're trying to avoid storing pointers 356 | # as much as possible. So most of the code is 357 | # represented as "tokens", which are offsets into the 358 | # "token table", which contains 16-bit offsets into 359 | # either the machine-code primitives or the data 360 | # segment. 361 | 362 | # The "dictionary" is stored in this program as a 363 | # sequence of strings, stuffed next to each other with 364 | # no intervening pointers at all. The idea is that if 365 | # you want to execute or compile a word, you walk 366 | # through the dictionary, examining each string in turn 367 | # to see if it's the right word. If so, then the 368 | # number of strings you rejected is the index into the 369 | # token table. 370 | 371 | # This implies that the dictionary needs to be stored 372 | # somewhere where it can expand without overwriting 373 | # other stuff. So I'm putting it in its own section 374 | # for the time being. I'm pretty sure that means it'll 375 | # get at least a page, which is enough space to define 376 | # at least several hundred words. Maybe at some future 377 | # time I'll copy it into .bss at boot time instead. 378 | 379 | .data 1 # Start putting stuff in data subsection 1 380 | .align 4 381 | ## table to define the "bytecode" instructions 382 | token_table: 383 | # "a" means "allocatable", "w" means "writable" 384 | .section .dictionary, "aw" 385 | dictionary: 386 | 387 | ## I was frustrated with the unreadability of my 388 | ## bytecode lists; I was counting token table entries 389 | ## by hand and writing bytecodes numerically. So I 390 | ## wrote a macro to help. 391 | 392 | ## Note that we are using a separate .subsection 393 | ## directive because gas 2.17 doesn't support putting 394 | ## that in the .pushsection line, even though it is 395 | ## documented to do so; see message from Maciej 396 | ## W. Rozycki on 2007-10-11, subject "Re: How to use 397 | ## .pushsection?", 398 | ## http://sourceware.org/ml/binutils/2007-10/msg00176.html 399 | ## for more details) 400 | 401 | ## The first few entries in the table of bytecodes are 402 | ## all defined in machine code; the rest are all 403 | ## defined in bytecode. The inner interpreter examines 404 | ## each bytecode to determine which category it falls 405 | ## in in order to figure out how to execute it, 406 | ## including what base address to add its offset to. 407 | ## This sucks for extensibility but rocks for 408 | ## compactness. 409 | 410 | .macro countedstring name 411 | .byte stringlength\@ 412 | 1: .ascii "\name" 413 | ## Here we count the length of the string --- computers 414 | ## are for counting bytes so people don't have to! 415 | stringlength\@ = . - 1b 416 | .endm 417 | 418 | .macro define_bytecode name, realname, origin 419 | .pushsection .data # save current position, go to data section 420 | .subsection 1 # and subsection 1, where we put the addrs 421 | b_\name = (. - token_table) / 2 # define b_foo as the index of this ptr 422 | .ifeq b_\name - 256 423 | .error "\name got bytecode 256" 424 | .endif 425 | .short \name - \origin # insert offset which will be resolved next 426 | .popsection # return to where we were, and 427 | .pushsection .dictionary 428 | countedstring "\realname" 429 | .popsection 430 | \name: # define the name 431 | .endm 432 | .macro defasm name, realname 433 | define_bytecode \name, "\realname", machine_code_primitives 434 | .endm 435 | .macro defbytes name, realname 436 | define_bytecode \name, "\realname", bytecode_start 437 | .endm 438 | 439 | ### The Return Stack 440 | # We put Forth return addresses here, but programs can also use 441 | # it for other purposes. 442 | 443 | .bss 444 | .space 4096 445 | initial_return_stack_pointer: 446 | 447 | ### Initialization 448 | 449 | .text # the following stuff goes in the text segment 450 | .global _start # declare _start as a global symbol 451 | # (otherwise ld won't be able to find it) 452 | _start: # this is the entry point for ELF I guess 453 | cld # clear direction flag; unnecessary? 454 | mov $initial_return_stack_pointer, %ebp 455 | mov $instructions, %esi # %esi is the interpreter pointer register 456 | jmp next # and now we start the interpreter. 457 | # (somewhat silly since we could just 458 | # fall through..) 459 | 460 | ### The Machine-Code Primitives 461 | # Also next (aka the address interpreter or inner interpreter) 462 | # is in this section. 463 | 464 | machine_code_primitives: 465 | 466 | # dolit_s8 takes a signed 8-bit literal from the instruction 467 | # stream and pushes it onto the stack. 468 | 469 | defasm dolit_s8, "lit8" 470 | lodsb 471 | movsbl %al, %eax 472 | jmp pusheax 473 | 474 | defasm dolit_32, "lit" # more general dolit 475 | lodsl 476 | jmp pusheax 477 | 478 | defasm exit, "ret" # Return from a colon defn. 479 | xchg %ebp, %esp 480 | pop %esi 481 | xchg %ebp, %esp 482 | jmp next 483 | 484 | defasm execute, "execute" # Run xt on data stack. 485 | pop %eax # Here 'xt' is the one-byte token. 486 | jmp execute_eax 487 | 488 | # Branch if top of stack is 0 (implementing IF). 489 | # Both branch instructions take a signed byte offset from the bytecode 490 | # stream. 491 | defasm branch_on_0, "(if)" 492 | pop %eax 493 | and %eax, %eax 494 | jz branch 495 | inc %esi # skip 1-byte jump offset 496 | jmp next 497 | 498 | defasm branch, "(else)" 499 | lodsb 500 | movsbl %al, %eax # same insn size as cbtw; cwde 501 | add %eax, %esi 502 | jmp next 503 | 504 | # (loop) is what we do at the end of a DO LOOP. 505 | 506 | # DO puts a number on top of the return stack that is zero minus 507 | # the number of iterations remaining. So when it finally 508 | # reaches zero, we're done. It also put a number underneath 509 | # that on the return stack from which you can recover the 510 | # iteration counter. 511 | 512 | # This scheme is mostly due to F-83, except that in F-83 it 513 | # reached 0x8000 instead of 0, which seemed perverse to me. 514 | 515 | # This isn't strictly necessary as part of the minimal primitive 516 | # set, but it seemed like it would make inner loops maybe ten 517 | # times faster, and as with most words that do return-stack 518 | # manipulation, the size penalty is actually negative. (This 519 | # version is 14 bytes; in bytecode it would be 17.) 520 | 521 | # ( -- ) ( R: -1 adjustment -- ) in end-of-loop case, and skip 522 | # the interpreter pointer over the jump offset. 523 | # ( -- ) ( R: counter -- counter+1 ) in normal case, and adjust 524 | # interpreter pointer by number of bytes stored after call to 525 | # this routine. 526 | 527 | defasm _loop, "(loop)" 528 | xor %eax, %eax # mov $1, %eax is 5 bytes 529 | inc %eax 530 | add %eax, (%ebp) 531 | jnc branch # if no carry, go branch 532 | # If there was a carry, we're done! 533 | add $8, %ebp # drop loop-sys from rstack 534 | lodsb # skip jump offset 535 | jmp next 536 | 537 | # Store a cell. 538 | defasm bang, "!" 539 | pop %ebx 540 | pop (%ebx) # I'm amazed this is legal 541 | jmp next 542 | 543 | # Fetch a cell. 544 | defasm at, "@" 545 | pop %ebx 546 | push (%ebx) # I'm amazed this is legal too 547 | jmp next 548 | 549 | # Store a byte. 550 | defasm c_bang, "c!" 551 | pop %ebx 552 | pop %eax 553 | mov %al, (%ebx) # push and pop don't do bytes 554 | jmp next 555 | 556 | # Fetch a byte. 557 | defasm c_at, "c@" 558 | pop %ebx 559 | xor %eax, %eax 560 | mov (%ebx), %al 561 | jmp pusheax 562 | 563 | # Get the return stack pointer. 564 | defasm rp_at, "rp@" 565 | push %ebp 566 | jmp next 567 | 568 | # Set the return stack pointer. 569 | defasm rp_bang, "rp!" 570 | pop %ebp 571 | jmp next 572 | 573 | # Pop the return stack to the data stack 574 | defasm rpop, "r>" 575 | xchg %esp, %ebp 576 | pop %eax 577 | xchg %esp, %ebp 578 | jmp pusheax 579 | 580 | # Push the return stack from the data stack 581 | defasm rpush, ">r" 582 | lea -4(%ebp), %ebp 583 | pop (%ebp) 584 | jmp next 585 | 586 | # Get the data stack pointer (before it gets pushed). 587 | defasm sp_at, "sp@" 588 | push %esp # safe on 286 and later 589 | jmp next 590 | 591 | # Set the data stack pointer. 592 | defasm sp_bang, "sp!" 593 | pop %esp 594 | jmp next 595 | 596 | # Pop the stack. 597 | defasm drop, "pop" 598 | pop %eax 599 | jmp next 600 | 601 | # Push a copy of TOS. 602 | # eForth 1.0 used BX to index the stack here, for a couple of 603 | # reasons: on the 8086, SP got decremented prior to the fetch, 604 | # and also wasn't valid as a base or index register. 605 | defasm dup, "dup" 606 | pop %eax 607 | push %eax 608 | jmp pusheax 609 | 610 | # Stack manipulation ( w1 w2 -- w1 w2 w1 ) 611 | # technically not necessary, but it's so easy and tiny 612 | defasm over, "over" 613 | push 4(%esp) 614 | jmp next 615 | 616 | # Swap top two stack items ("exch" in PostScript) 617 | defasm swap, "swap" 618 | pop %edx 619 | pop %eax 620 | # jmp pushedxeax fall through because pushedxeax is next 621 | 622 | # pusheax and pushedxeax: a prologue to 'next' that first pushes %edx 623 | # and %eax, or just %eax. 624 | 625 | # For a net savings of 13 bytes, last I checked, in all those 626 | # primitives that finish up by pushing something! Clever trick from 627 | # F-83's 1PUSH and 2PUSH. 628 | pushedxeax: 629 | push %edx 630 | pusheax: 631 | push %eax 632 | # now we fall through to 'next' 633 | 634 | # "next" fetches the next bytecode and runs it. It's placed 635 | # here in the middle of the bytecode definitions so that more 636 | # of them can use the short two-byte jump form to get to it. 637 | 638 | next: 639 | xor %eax, %eax # set %eax to 0 640 | xor %ebx, %ebx # clear high half of %ebx 641 | lodsb # load %al from where %esi points 642 | # (%esi is the interpreter pointer) 643 | execute_eax: 644 | ## load offset of new word into %ebx 645 | mov token_table(,%eax,2), %bx # bx := token_table[eax * 2bytes] 646 | cmp $last_asm_bytecode, %eax 647 | jbe next_primitive # if primitive, handle primitive word 648 | ## otherwise, handle a bytecode definition or "colon list" 649 | # save old %esi on return stack 650 | xchg %ebp, %esp 651 | push %esi 652 | xchg %ebp, %esp 653 | lea bytecode_start(%ebx), %esi 654 | jmp next 655 | 656 | next_primitive: 657 | lea machine_code_primitives(%ebx), %ebx 658 | jmp *%ebx 659 | 660 | 661 | # Push true if n negative. ( n -- f ) 662 | defasm negative, "0<" 663 | pop %eax 664 | cdq 665 | push %edx 666 | jmp next 667 | 668 | # Bitwise operators: 669 | defasm and, "&" 670 | pop %eax 671 | pop %ebx 672 | and %ebx, %eax 673 | jmp pusheax 674 | 675 | defasm or, "|" 676 | pop %eax 677 | pop %ebx 678 | or %ebx, %eax 679 | jmp pusheax 680 | 681 | defasm xor, "^" 682 | pop %eax 683 | pop %ebx 684 | xor %ebx, %eax 685 | jmp pusheax 686 | 687 | # add two unsigned numbers, returning sum and carry. 688 | # ( u1 u2 -- u3 cy ) 689 | defasm umplus, "um+" 690 | xor %eax, %eax 691 | pop %edx 692 | pop %ebx 693 | add %ebx, %edx 694 | rcl $1, %eax 695 | jmp pushedxeax 696 | 697 | # Divide double-precision by single-precision, unsigned (?). 698 | # UM/MOD from eForth. ( udl udh un -- ur uq ) 699 | defasm divmod, "/%" 700 | pop %ebx 701 | pop %edx 702 | pop %eax 703 | idiv %ebx 704 | jmp pushedxeax 705 | 706 | # Multiply two single-precision numbers, giving a double- 707 | # precision result. ( d1 d2 -- udl udh ) 708 | defasm mmul, "um*" 709 | pop %eax 710 | pop %ebx 711 | imul %ebx 712 | push %eax 713 | push %edx 714 | jmp next 715 | 716 | # Copy the top of the return stack onto the data stack. 717 | defasm r_at, "r@" 718 | push (%ebp) 719 | jmp next 720 | 721 | # syscall5: 722 | # Linux system call with up to 5 arguments 723 | # This is no longer the fashionable way to make system calls 724 | # in Linux. Now you're supposed to use SYSENTER on newer 725 | # CPUs, and rather than have you figure out which one to use, 726 | # the kernel mmaps a chunk of code called a VDSO into your 727 | # memory space at a random address and tells you where to 728 | # find it using the ELF auxiliary vector. Then you're 729 | # supposed to invoke the dynamic linker or something to parse 730 | # the ELF executable mysteriously manifested in this way by 731 | # the kernel, and then resolve an undefined symbol in libc 732 | # into calls to it. See "What is linux-gate.so.1?" 733 | # http://www.trilithium.com/johan/2005/08/linux-gate/ 734 | # "The Linux kernel: System Calls" by Andries Brouwer, 2003-02-01 735 | # http://www.win.tue.nl/%7Eaeb/linux/lk/lk-4.html 736 | # "About ELF Auxiliary Vectors" by Manu Garg 737 | # http://manugarg.googlepages.com/aboutelfauxiliaryvectors 738 | 739 | # But the old int $0x80 approach still works, thank goodness, 740 | # because all of that is *way* more than these ten 741 | # instructions. 742 | defasm syscall5, "syscall5" 743 | pop %edi 744 | ## we have to save %esi for the interpreter 745 | mov %esi, -4(%ebp) 746 | pop %esi 747 | pop %edx 748 | pop %ecx 749 | pop %ebx 750 | pop %eax 751 | int $0x80 752 | mov -4(%ebp), %esi 753 | jmp pusheax 754 | 755 | last_asm_bytecode = b_syscall5 756 | 757 | ### Basic Interpreted Words 758 | ## a macro for defining interpreted words 759 | ## Because after I left off b_exit once, I wasted a long 760 | ## time trying to figure out what was wrong, so I use this when I can: 761 | .macro def name, realname, bytes:vararg 762 | defbytes \name, "\realname" 763 | .byte \bytes 764 | .byte b_exit 765 | .endm 766 | ## Macros for conditional branch and loop: 767 | ## Because I am tired of tracking down bugs due to 768 | ## getting the jump offsets wrong. 769 | .macro fif, target # if, or end of while loop 770 | .byte b_branch_on_0, \target - . - 1 771 | .endm 772 | .macro floop, target # do loop 773 | .byte b__loop, \target - . - 1 774 | .endm 775 | .macro felse, target # else, unconditional jump 776 | .byte b_branch, \target - . - 1 777 | .endm 778 | 779 | .data 2 # separate subsection from token table 780 | bytecode_start: 781 | # System call with three arguments. 782 | def syscall3, "syscall3", b_zero, b_zero, b_syscall5 783 | # System call with one argument. 784 | def syscall1, "syscall1", b_zero, b_zero, b_syscall3 785 | def bye, "bye", b_dolit_s8,__NR_exit, b_zero, b_syscall1 # exit program 786 | def zero, "0", b_dolit_s8,0 # push 0 787 | def one, "1", b_dolit_s8,1 788 | 789 | # This word outputs a string whose address and count are on 790 | # the stack. ( b u -- ) 791 | 792 | defbytes type, "type" 793 | .byte b_rpush, b_rpush # move two args onto rstack 794 | # system call is __NR_write: 795 | .byte b_dolit_s8,__NR_write 796 | .byte b_one # push constant 1: stdout 797 | .byte b_rpop, b_rpop # move two args back from rstack 798 | .byte b_syscall3 # call syscall with 3 args 799 | .byte b_drop # discard return value 800 | .byte b_exit # return 801 | 802 | # The next few words exist just to poke string addresses 803 | # and lengths onto the stack so "type" can print them. 804 | .macro def_counted_string name, contents 805 | defbytes \name, "\name" 806 | .byte b_dolit_32 # dolit_32 pushes a 32-bit 807 | .int string_\name # literal --- an addr, here 808 | # now push literal length and return 809 | .byte b_count, b_exit 810 | .pushsection .rodata # define the actual string: 811 | string_\name: 812 | countedstring "\contents" 813 | .popsection 814 | .endm 815 | 816 | def_counted_string hello, "hello" 817 | def_counted_string world, "world" 818 | def_counted_string comma, ", " 819 | 820 | # convert a counted string in memory to an address and 821 | # count on the stack 822 | def count, "count", b_dup, b_add1, b_swap, b_c_at 823 | 824 | def cr, "cr", b_dolit_s8, '\n, b_emit 825 | 826 | ### Some More Basic Words 827 | 828 | def neg1, "-1", b_dolit_s8, -1 # ( -- -1 ) 829 | def add, "+", b_umplus, b_drop # ( a b -- a+b ) drop the carry 830 | def sub1, "1-", b_neg1, b_add # ( n -- n-1 ) 831 | def rot, "rot", b_rpush, b_swap, b_rpop, b_swap # ( a b c -- b c a ) 832 | def unrot, "-rot", b_rot, b_rot # ( a b c -- c a b ) 833 | def tuck, "tuck", b_dup, b_unrot # ( a b -- b a b ) 834 | 835 | # emit: output a single byte. eForth calls this "TX!". 836 | 837 | # This version is 11 bytes, including the buffer byte, plus the 2-byte 838 | # token table pointer. a machine-code version I wrote the other day 839 | # was 28 bytes. However, I also added rot, unrot, and tuck to support 840 | # this function, and they total 11 bytes, plus 6 bytes of overhead. 841 | # For a total of 11+2+11+6 = 30 bytes. Not winning yet on size over 842 | # x86 asm! But we're getting close. 843 | 844 | emit_buffer: 845 | .byte 0 846 | defbytes emit, "emit" 847 | .byte b_dolit_32 848 | .int emit_buffer 849 | .byte b_tuck # save a copy of address for b_type 850 | .byte b_c_bang # store into emit buffer 851 | .byte b_one, b_type, b_exit # output one-byte buffer 852 | 853 | ### "u." prints out an unsigned number. 854 | # I had a version of this in x86 machine code in 52 bytes (23 855 | # instructions), essentially exactly the same code as here. 856 | # This is 31 bytes, plus 6 bytes of overhead, plus I had 857 | # to define b_divmod (9 bytes plus 2 bytes overhead). Now we are 858 | # starting to win! 859 | 860 | defbytes udot, "u." # print space after number 861 | .byte b_udot_nospc, b_dolit_s8, 0x20, b_emit, b_exit 862 | defbytes udot_nospc, "(u.)" # print number without space 863 | .byte b_dup 864 | fif 1f 865 | .byte b_udot_nonzero, b_exit 866 | 1: .byte b_drop, b_dolit_s8, '0, b_emit, b_exit 867 | defbytes udot_nonzero, "((u.))" 868 | .byte b_zero, b_dolit_s8,10, b_divmod # divide by 10 869 | .byte b_dup 870 | fif 2f # recurse if nonzero 871 | .byte b_udot_nonzero 872 | felse 3f 873 | 2: .byte b_drop # drop zero quotient 874 | 3: .byte b_dolit_s8, '0, b_add, b_emit # print digit 875 | .byte b_exit 876 | 877 | ### Add signed numeric output, ".". This cost 20 bytes plus 8 bytes 878 | # of overhead, but added some fundamental numeric operations; only 12 879 | # of those 28 bytes are specific to "." 880 | 881 | # logical bitwise not 882 | def invert, "~", b_dolit_s8, -1, b_xor 883 | def add1, "1+", b_one, b_add 884 | # arithmetic negation 885 | def negate, "negate", b_invert, b_add1 886 | # print signed number 887 | defbytes dot, "." 888 | .byte b_dup, b_negative 889 | fif 1f 890 | .byte b_dolit_s8, '-, b_emit, b_negate # in the negative case 891 | 1: .byte b_udot, b_exit 892 | 893 | ### Obviously the next thing to do is to add ".S", print the 894 | # stack, so that I can stop having to investigate problems by 895 | # using gdb. 896 | 897 | # The bytecode for this consumed 78 bytes in 12 words, plus a 898 | # new 8-byte primitive (mmul) and a new 14-byte primitive 899 | # (_loop), plus six bytes in the initialization routine, for 28 900 | # bytes of overhead and a total of 78+28+14+8+6 = 134 bytes. 901 | # This is definitely not a size win over machine code! Machine 902 | # code would only be 22 bytes in 7 instructions, if there were a 903 | # way to just CALL the "." routine from machine code, which 904 | # there isn't. 905 | 906 | # However, the words added were cells * - / cellsize nip (do) 907 | # (loop) 2dup which are all generally useful, and 908 | # depth pick .s which are more special-purpose. 909 | # The special-purpose words are 35 out of those 78 bytes. 910 | 911 | # PRINTSTACK itself is only 15 bytes, and there's hope that the 912 | # 6 bytes of PICK and the 14 bytes of DEPTH will be useful in 913 | # other debugging routines. 914 | 915 | # I'm not happy with (do) and (loop), only because (do) 916 | # implements dpANS DO, not dpANS ?DO, so it loops many times 917 | # when it should loop zero times; 918 | 919 | def_counted_string scolon, "s: " 920 | defbytes printstack, ".s" 921 | .byte b_scolon, b_type 922 | .byte b_depth, b_zero # loop limits 923 | .byte b_twodup, b_xor 924 | fif 1f # skip loop if stack empty 925 | .byte b__do 926 | 2: .byte b_i, b_pick, b_dot # DO I PICK . LOOP 927 | floop 2b 928 | 1: .byte b_cr, b_exit 929 | def pick, "pick", b_add1, b_cells, b_sp_at, b_add, b_at 930 | def cells, "cells", b_cellsize, b_mul 931 | def mul, "*", b_mmul, b_drop # drop upper 32 bits of multiplication result 932 | bottom_of_stack: 933 | .int 0 934 | defbytes depth, "depth" 935 | .byte b_sp_at, b_dolit_32 936 | .int bottom_of_stack 937 | .byte b_at, b_swap, b_sub, b_zero, b_cellsize, b_div, b_exit 938 | 939 | def sub, "-", b_negate, b_add # subtract ( a b -- a-b ) 940 | def div, "/", b_divmod, b_nip # int divide ( ul uh n -- quotient ) 941 | def cellsize, "cellsize", b_dolit_s8,4 # ( -- 4 ) 942 | def nip, "nip", b_swap, b_drop # stack manipulation ( a b -- b ) 943 | 944 | # 10 0 DO ... LOOP loops 0, 1...9. 945 | # _do sets up return stack for _loop 946 | # similar to F83: ( limit initial -- ) ( R: X -- X initial-limit limit ) 947 | defbytes _do, "(do)" 948 | .byte b_over, b_sub, b_swap, b_rpop, b_swap, b_rpush 949 | .byte b_swap, b_rpush, b_rpush, b_exit 950 | ## return loop counter 951 | def i, "i", b_rpop, b_rpop, b_r_at, b_over, b_rpush, b_add, b_swap, b_rpush 952 | 953 | def twodup, "2dup", b_over, b_over 954 | def twodrop, "2drop", b_drop, b_drop 955 | 956 | # Now some stuff for dealing with the dictionary. 957 | 958 | # This stuff was from 804930a to 8049396, 140 bytes. In that we got: 959 | # - new words: dict dictp dictsize nextword pastdict? words < >= 0= 960 | # cbcmp c@+ bcmp memcmp unloop find r2@ 2swap 961 | # - less concretely: 962 | # - the ability to list of words in the dictionary; 963 | # - the ability to find words in the dictionary; 964 | # - <, >=, and 0= numerical comparisons; 965 | # - cbcmp, bcmp, and memcmp memory manipulations; 966 | # - 2swap and r2@ stack manipulations; 967 | # - unloop loop control; 968 | # - c@+ for iterating over memory. 969 | # That's 17 new words, averaging 8.2 bytecodes each. 970 | 971 | dictionary_pointer: 972 | .int end_of_dictionary 973 | defbytes dict, "dict" 974 | .byte b_dolit_32 975 | .int dictionary 976 | .byte b_exit 977 | defbytes dictp, "dictp" 978 | .byte b_dolit_32 979 | .int dictionary_pointer 980 | .byte b_exit 981 | def dictsize, "dictsize", b_dictp, b_at, b_dict, b_sub 982 | def nextword, "nextword", b_dup, b_c_at, b_add, b_add1 983 | def pastdict, "pastdict?", b_dictp, b_at, b_ge 984 | defbytes words, "words" 985 | .byte b_dict 986 | 1: .byte b_dup, b_count, b_type, b_dolit_s8,32, b_emit 987 | .byte b_nextword 988 | .byte b_dup, b_pastdict 989 | fif 1b 990 | .byte b_exit 991 | def lt, "<", b_sub, b_negative 992 | def ge, ">=", b_lt, b_zeq 993 | # logical not: return true for 0, false (0) otherwise 994 | defbytes zeq, "0=" 995 | fif 1f 996 | .byte b_zero, b_exit 997 | 1: .byte b_neg1, b_exit 998 | 999 | # To find a word in the dictionary: 1000 | # - move the word onto the return stack, and get the dictionary pointer 1001 | # - then loop: 1002 | # - see if the word is at the current place 1003 | # - if so, clean up and return that place 1004 | # - otherwise, go to the next word 1005 | # - and repeat if we're still in the dictionary 1006 | # - then clean up the stacks and return 0 1007 | # Tells whether a counted string equals an address-and-count string. 1008 | # 0 for equal, nonzero for unequal. 1009 | # ( c-addr1 c-addr2 u -- n ) 1010 | def cbcmp, "cbcmp", b_rot, b_count, b_twoswap, b_bcmp 1011 | # like F21 @A+: ( c-addr -- c-addr+1 char ) 1012 | def c_at_inc, "c@+", b_dup, b_add1, b_swap, b_c_at 1013 | # Keep in mind memcmp() in libc is only 30 bytes long. 1014 | # This bcmp is a little different from C memcmp or bcmp in 1015 | # that it compares two lengths. 1016 | defbytes bcmp, "bcmp" # ( c-addr1 u1 c-addr2 u2 -- n ) 1017 | .byte b_rot, b_over, b_xor 1018 | fif 3f 1019 | .byte b_twodrop, b_drop, b_one, b_exit 1020 | 3: .byte b_memcmp, b_exit 1021 | defbytes memcmp, "memcmp" # ( c-addr1 c-addr2 u -- n ) 1022 | .byte b_zero, b__do 1023 | 2: .byte b_c_at_inc, b_rot, b_c_at_inc, b_rot 1024 | .byte b_sub, b_dup # - dup if 1025 | fif 1f 1026 | .byte b_unrot, b_twodrop, b_unloop, b_exit 1027 | 1: .byte b_drop, b_swap 1028 | floop 2b 1029 | .byte b_twodrop, b_zero, b_exit 1030 | # this should probably go with the other do loop stuff 1031 | def unloop, "unloop", b_rpop, b_rpop, b_rpop, b_twodrop, b_rpush 1032 | 1033 | # FIND: ( c-addr u -- token 1 | 0 ) 1034 | 1035 | # 30 bytes; jonesforth 42's asm version is 56 bytes. It's 1036 | # fairly directly comparable, although jonesforth's FIND has 1037 | # to do bit-masking and includes its own inline NEXTWORD and 1038 | # CBCMP, which are actually fairly large here. 1039 | 1040 | defbytes find, "find" 1041 | .byte b_rpush, b_rpush, b_zero, b_dict 1042 | 1: .byte b_dup, b_r_2at, b_cbcmp, b_zeq # start loop 1043 | fif 2f # bcmp 0= if 1044 | .byte b_rpop, b_rpop, b_twodrop, b_drop, b_one, b_exit 1045 | 2: .byte b_swap, b_add1, b_swap, b_nextword, b_dup, b_pastdict 1046 | fif 1b 1047 | .byte b_rpop, b_rpop, b_twodrop, b_twodrop, b_zero, b_exit 1048 | 1049 | # copy two cells from return stack 1050 | defbytes r_2at, "r2@" 1051 | .byte b_rpop, b_rpop, b_r_at, b_over, b_rpush, b_rot, b_rpush, b_exit 1052 | # ( a b c d -- c d a b ) 1053 | def twoswap, "2swap", b_rpush, b_unrot, b_rpop, b_unrot 1054 | 1055 | .macro create, name 1056 | defbytes \name, "\name" 1057 | .byte b_dolit_32 1058 | .int 1f 1059 | .byte b_exit 1060 | 1: 1061 | .endm 1062 | 1063 | create "tib" 1064 | _tibmax = 80 1065 | .space _tibmax 1066 | defbytes tibmax, "tibmax" 1067 | .byte b_dolit_32 1068 | .int _tibmax 1069 | .byte b_exit 1070 | create "tibsize" 1071 | .int 0 1072 | 1073 | defbytes fgets, "fgets" 1074 | .byte b_tib, b_tibmax, b_read, b_tibsize, b_bang # XXX handle errors 1075 | .byte b_tib, b_tibsize, b_at, b_exit 1076 | def gets, "gets", b_zero, b_fgets 1077 | 1078 | defbytes read, "read" 1079 | .byte b_rpush, b_rpush, b_dolit_s8,__NR_read 1080 | .byte b_zero # fd 0: stdin 1081 | .byte b_rpop, b_rpop, b_syscall3, b_exit 1082 | 1083 | def bl, "bl", b_dolit_s8,32 # space, blank 1084 | 1085 | # parse parses out a token of input from a string and leaves 1086 | # the token's address and length atop the stack 1087 | # ( c-addr u -- c-addr+n u-n c-addr2 u2 ) 1088 | defbytes parse, "parse" 1089 | .byte b_skipwhitespace 1090 | ## XXX finish him! 1091 | .byte b_exit 1092 | 1093 | defbytes skipwhitespace, "-wsp" 1094 | 1: .byte b_dup, b_zeq 1095 | fif 3f # return empty tail 1096 | .byte b_exit 1097 | 3: .byte b_sub1, b_swap, b_c_at_inc, b_whitespace 1098 | fif 2f # escape loop if not whitespace 1099 | .byte b_swap 1100 | felse 1b 1101 | 2: .byte b_sub1, b_swap, b_add1, b_exit 1102 | 1103 | ## costs 9+5 bytes 1104 | def_counted_string wsps, " \n\t" 1105 | defbytes whitespace, "wsp" 1106 | .byte b_dup, b_bl, b_xor 1107 | fif 1f 1108 | .byte b_dup, b_dolit_s8,'\n, b_xor 1109 | fif 1f 1110 | .byte b_dup, b_dolit_s8,'\t, b_xor 1111 | fif 1f 1112 | .byte b_drop, b_zero, b_exit 1113 | 1: .byte b_drop, b_one, b_exit 1114 | 1115 | # def repl, "repl", b_gets, b_eval, b_exit 1116 | 1117 | # defbytes eval, "eval" 1118 | # 2: .byte b_parse 1119 | # .byte b_dup, b_zeq 1120 | # fif 3f # escape from loop 1121 | # .byte b_rot, b_rpush, b_rot, b_rpush, b_find 1122 | # fif 1f 1123 | # .byte b_execute 1124 | # 1: .byte b_rpop, b_rpop 1125 | # felse 2b 1126 | # 3: .byte b_2drop, b_2drop, b_exit 1127 | 1128 | .data 3 1129 | instructions: 1130 | # And here is the actual "main program" in that bytecode. 1131 | .byte b_sp_at, b_dolit_32 1132 | .int bottom_of_stack # variable to remember initial stack bottom 1133 | .byte b_bang # initialize that variable 1134 | .byte b_hello # string "hello" and count 1135 | .byte b_sub1 # subtract 1 from count: "hell" 1136 | .byte b_type # spit it out 1137 | .byte b_comma, b_type, b_world, b_type # ", world" 1138 | .byte b_comma, b_type, b_hello, b_type, b_cr 1139 | # test the "dot" command to print out numbers 1140 | .byte b_dolit_s8, -120, b_dot 1141 | # test positive numbers and "depth" command 1142 | .byte b_dolit_s8, 104, b_depth, b_dot, b_dot, b_cr 1143 | # test printstack 1144 | .byte b_dolit_s8,100, b_dolit_s8,101, b_dolit_s8,102, b_printstack 1145 | # test dictsize 1146 | .byte b_dictsize, b_dot, b_cr 1147 | .byte b_words, b_cr 1148 | .byte b_hello, b_twodup, b_type, b_find, b_printstack 1149 | .byte b_comma, b_twodup, b_type, b_find, b_printstack 1150 | .byte b_dict, b_dot, b_cr 1151 | .byte b_dolit_s8, '?, b_emit, b_bl, b_emit, b_gets, b_twodup 1152 | .byte b_zero, b__do 1153 | 1: .byte b_c_at_inc, b_whitespace, b_dot 1154 | floop 1b 1155 | .byte b_drop 1156 | .byte b_skipwhitespace, b_type 1157 | .byte b_bye 1158 | 1159 | # At end of the assembly program, we initialize the 1160 | # end_of_dictionary pointer by putting it at the end of the 1161 | # assembled .dictionary section: 1162 | .section .dictionary 1163 | end_of_dictionary: 1164 | --------------------------------------------------------------------------------