├── .gitattributes ├── .gitignore ├── LICENCE.txt ├── Makefile ├── README.md ├── stage-0 ├── .gitignore ├── Makefile ├── README.md └── unhex.x ├── stage-1 ├── .gitignore ├── Makefile ├── README.md ├── elfify.x ├── unhexl.ts.x └── unhexl.ts.xl ├── stage-2 ├── .gitignore ├── Makefile ├── README.md ├── as.s └── as.ts.xl ├── stage-3 ├── .gitignore ├── Makefile ├── README.md ├── as.s ├── ld.s ├── test1.s ├── test2.s └── test3.s ├── stage-4 ├── .gitignore ├── Makefile ├── README.txt ├── char.s ├── crt0.s ├── ctype.s ├── error.s ├── exit.c ├── expr.s ├── i386.s ├── imath.s ├── input.c ├── main.s ├── malloc.c ├── memory.s ├── output.c ├── scanner.s ├── signal.c ├── stdarg.c ├── stdio.s ├── stmt.s ├── string.s ├── string2.c ├── symtab.s └── unistd.s └── stage-5 ├── .gitignore ├── Makefile ├── README.txt ├── cc.c ├── cli.c ├── cmp.c ├── codegen.c ├── cpp-tests ├── Makefile ├── builtin.c ├── builtin.i ├── directive.c ├── directive.i ├── empty.c ├── empty.i ├── fn.c ├── fn.i ├── glue.c ├── hash.c ├── hash.i ├── include.c ├── include.i ├── includemacro.c ├── includemacro.i ├── macros.c ├── macros.i ├── nocpp.c ├── nocpp.i ├── obj.c ├── obj.i ├── rescan.c ├── rescan.i ├── simple.c ├── simple.i ├── suppress.c ├── suppress.i └── vers2.h ├── cpp.c ├── cpptype.c ├── eval.c ├── expr.c ├── i386.c ├── include ├── bits │ ├── eof.h │ ├── file.h │ ├── file_access.h │ ├── null.h │ ├── size_t.h │ ├── std_streams.h │ ├── string.h │ ├── struct_tm.h │ └── time_t.h ├── errno.h ├── rbc_init.h ├── stdio.h ├── string.h └── time.h ├── macros.c ├── main.c ├── node.c ├── nodenew.c ├── pvector.c ├── pvector.h ├── scanbase.c ├── scanner.c ├── stmt.c ├── symtab.c ├── timeconv.c └── type.c /.gitattributes: -------------------------------------------------------------------------------- 1 | *.x linguist-language=text 2 | *.xl linguist-language=gas 3 | -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | .*.swp 2 | bin/ 3 | lib/ 4 | include/ 5 | !stage-*/include/ 6 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Makefile 2 | 3 | # Copyright (C) 2009, 2011, 2012, 2013, 2020 4 | # Richard Smith 5 | # All rights reserved. 6 | 7 | STAGES = 0 1 2 3 4 5 8 | 9 | SHELL = /bin/sh 10 | PATH = . 11 | 12 | RM = /bin/rm 13 | MKDIR = /bin/mkdir 14 | MAKE = /usr/bin/make 15 | 16 | BINDIR = bin 17 | LIBDIR = lib 18 | INCDIR = include 19 | 20 | world: 21 | $(RM) -rf $(BINDIR) $(LIBDIR) $(INCDIR) 22 | $(MAKE) init 23 | set -e; for n in $(STAGES); do $(MAKE) -r -C stage-$$n $@; done 24 | 25 | init: 26 | $(MKDIR) -p $(BINDIR) $(LIBDIR) $(INCDIR) 27 | 28 | check: 29 | set -e; for n in $(STAGES); do $(MAKE) -r -C stage-$$n $@; done 30 | 31 | clean: 32 | set -e; for n in $(STAGES); do $(MAKE) -r -C stage-$$n $@; done 33 | $(RM) -rf $(BINDIR) $(LIBDIR) $(INCDIR) 34 | 35 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Bootstrap Experiment 2 | 3 | In this experiment I aim to develop from the ground up a working compiler, 4 | assembler, linker and library, for a C-like language. I start with a minimal 5 | program capable of generating itself from its source and gradually develop 6 | higher level tools and abstractions, as follows. 7 | 8 | The programs produced in this project are 32-bit ELF executables which run on 9 | a Linux kernel running on an Intel x86 processor. They run fine on modern 10 | 64-bit systems. 11 | 12 | 13 | ## Stage 0 – `unhex` 14 | 15 | The starting point of the experiment is a tiny program for packing hexadecimal 16 | octets into binary.
17 | 18 | ## Stage 1 – `unhexl` & `elfify` 19 | 20 | This stage adds a tool to wrap a text section into a minimal ELF executable, 21 | as well as further developing the `unhex` program to support labels, 22 | references to earlier labels, and a freer input format that allows comments. 23 | 24 | ## Stage 2 – `as` 25 | 26 | Here we introduce a light-weight assembler, written in machine code using no 27 | forward jumps. It generates a text section that can be wrapped with the stage 28 | 1 `elfify` program to produce an executable. 29 | 30 | ## Stage 3 – `as` & `ld` 31 | 32 | The assembler is rewritten in assembler language and is joined by a linker, 33 | which together allow for separate compilation units. 34 | 35 | ## Stage 4 – `cc`, `crt0.o` & `libc.o` 36 | 37 | The project's first compiler is added at this stage. Its input language is a 38 | typeless subset of C similar to B, and it emits assembler language. We also 39 | build a startup file (`crt0.o`) and the start of a simple C library. 40 | 41 | ## Stage 5 – `ccx`, `cpp`, `cc` & `cmp` 42 | 43 | The compiler is rewritten in its source language, and a type system added. We 44 | use it to implement a fairly standards-compliant C preprocessor, and a 45 | compiler driver (`cc`) that spawns the `cpp`, `ccx` (the compiler proper), 46 | `as` and `ld`. Finally, `cmp` is a POSIX compliant utility to compare two 47 | files, which is used in a preprocessor test suite. 48 | 49 | ## Licensing 50 | 51 | The code in this project is copyright (C) Richard Smith, 2009–2021, and 52 | is licensed for use under version 3 or later of the [GNU General Public 53 | License](LICENCE.txt), a copy of which can be found in the file `LICENCE.txt`. 54 | The documentation in these `README` files is licensed under the 55 | [Creative Commons BY-NC-SA licence, 56 | version 4](https://creativecommons.org/licenses/by-nc/4.0/).
57 | -------------------------------------------------------------------------------- /stage-0/.gitignore: -------------------------------------------------------------------------------- 1 | .*.swp 2 | unhex 3 | -------------------------------------------------------------------------------- /stage-0/Makefile: -------------------------------------------------------------------------------- 1 | # stage-0/Makefile 2 | 3 | # Copyright (C) 2009, 2011, 2015, 2020 Richard Smith 4 | # All rights reserved. 5 | 6 | SHELL = /bin/sh 7 | 8 | CHMOD = /bin/chmod 9 | RM = /bin/rm 10 | CP = /bin/cp 11 | CMP = /usr/bin/cmp 12 | MAKE = /usr/bin/make 13 | CAT = /bin/cat 14 | PRINTF = /usr/bin/printf 15 | 16 | BINDIR = ../bin 17 | PATH = $(BINDIR) 18 | 19 | all: init unhex 20 | 21 | init: 22 | @test -d $(BINDIR) || $(MAKE) -C .. init 23 | 24 | unhex: init unhex.x 25 | for x in `$(CAT) unhex.x`; do $(PRINTF) \\x$$x; done > unhex 26 | $(CHMOD) a+x unhex 27 | 28 | check: check-unhex 29 | 30 | check-unhex: 31 | ./unhex < unhex.x > unhex2 32 | $(CMP) unhex unhex2 33 | $(RM) unhex2 34 | 35 | install: unhex 36 | $(CP) unhex $(BINDIR) 37 | 38 | clean: 39 | $(RM) -f unhex unhex2 40 | 41 | world: 42 | set -e; for TARGET in clean init all check install; do \ 43 | $(MAKE) $$TARGET; \ 44 | done 45 | -------------------------------------------------------------------------------- /stage-0/README.md: -------------------------------------------------------------------------------- 1 | # Bootstrap: Stage 0 2 | 3 | The starting point for this bootstrap experiment is the `unhex` program. 4 | It is a very simple program for converting a stream of hexadecimal 5 | octets on standard input into a binary file written to standard output. 6 | 7 | > Usage: `unhex < test.x > test` 8 | 9 | where `.x` is used as the canonical extension for its input files. 
The 10 | source file format is very restrictive: 11 | 12 | ```ebnf 13 | XDIGIT ::= [0-9A-F] 14 | CHAR ::= any character 15 | 16 | octet ::= XDIGIT XDIGIT CHAR 17 | file ::= octet* 18 | ``` 19 | 20 | This format is exceptionally easy to parse, which was the whole idea. 21 | By allowing an arbitrary third character it allows some degree of source 22 | code prettification by using spaces, new lines or other punctuation 23 | marks. 24 | 25 | Any deviation from this format will result in garbage being written 26 | to the output stream, as no error checking is done. In particular, 27 | there must not be any trailing space on lines, nor can there be blank 28 | lines. 29 | 30 | The `unhex.x` file contains the hexadecimal octets for `unhex`. Processing 31 | it with `unhex` yields another copy of `unhex`, which we check is identical 32 | to the initial copy as a way of testing that the program is working. 33 | 34 | The program is deliberately minimal. Of necessity, it starts with an 35 | ELF header (52 bytes), followed by one program header for the whole file 36 | (32 bytes). The executable code is at the end (109 bytes). There are no 37 | section headers and no `.shstrtab` section, which together mean that the 38 | binutils diagnostic tools (`objdump`, etc.) are of limited use on it. 39 | 40 | Conceptually the program should have been written using some lower-level 41 | technique, such as with a hex-editor. But instead, the Makefile 42 | contains a simple one-line shell script to perform the same action as 43 | `unhex`, which is used to create the first `unhex` binary.
44 | -------------------------------------------------------------------------------- /stage-0/unhex.x: -------------------------------------------------------------------------------- 1 | 7F 45 4C 46 01 01 01 00 2 | 00 00 00 00 00 00 00 00 3 | 02 00 03 00 01 00 00 00 4 | 6F 80 04 08 34 00 00 00 5 | 00 00 00 00 00 00 00 00 6 | 34 00 7 | 20 00 01 00 8 | 00 00 00 00 9 | 00 00 10 | 01 00 00 00 00 00 00 00 11 | 00 80 04 08 00 80 04 08 12 | C1 00 00 00 C1 00 00 00 13 | 05 00 00 00 00 10 00 00 14 | 2C 30 15 | 5D 16 | C3 17 | 55 18 | 89 E5 19 | 8B 45 08 20 | 3C 41 21 | 7C F2 22 | 2C 37 23 | EB F0 24 | 89 C3 25 | B8 01 00 00 00 26 | CD 80 27 | 89 E5 28 | 50 29 | BA 03 00 00 00 30 | 8D 4D FC 31 | BB 00 00 00 00 32 | B8 03 00 00 00 33 | CD 80 34 | 3D 00 00 00 00 35 | 7E D9 36 | E8 C6 FF FF FF 37 | 88 C5 38 | 8B 45 FC 39 | 88 E0 40 | 50 41 | E8 B9 FF FF FF 42 | 83 C4 04 43 | B1 04 44 | D2 E5 45 | 00 C5 46 | 88 6D FC 47 | BA 01 00 00 00 48 | 8D 4D FC 49 | BB 01 00 00 00 50 | B8 04 00 00 00 51 | CD 80 52 | EB B1 53 | -------------------------------------------------------------------------------- /stage-1/.gitignore: -------------------------------------------------------------------------------- 1 | .*.swp 2 | unhex 3 | unhexl 4 | elfify 5 | -------------------------------------------------------------------------------- /stage-1/Makefile: -------------------------------------------------------------------------------- 1 | # stage-1/Makefile 2 | 3 | # Copyright (C) 2009, 2011, 2020 Richard Smith 4 | # All rights reserved. 5 | 6 | SHELL = /bin/sh 7 | 8 | CHMOD = /bin/chmod 9 | RM = /bin/rm 10 | CP = /bin/cp 11 | CMP = /usr/bin/cmp 12 | MAKE = /usr/bin/make 13 | 14 | BINDIR = ../bin 15 | PATH = $(BINDIR) 16 | 17 | 18 | all: init unhexl elfify 19 | 20 | init: 21 | @test -d $(BINDIR) || $(MAKE) -C .. 
init 22 | @test -x $(BINDIR)/unhex || $(MAKE) -C ../stage-0 install 23 | 24 | elfify: elfify.x 25 | unhex < elfify.x > elfify 26 | $(CHMOD) a+x elfify 27 | 28 | unhexl: elfify unhexl.ts.x 29 | unhex < unhexl.ts.x > unhexl.ts 30 | ./elfify unhexl.ts > unhexl 31 | $(CHMOD) a+x unhexl 32 | $(RM) unhexl.ts 33 | 34 | check: check-unhexl 35 | 36 | check-unhexl: unhexl elfify unhexl.ts.xl 37 | ./unhexl < unhexl.ts.xl > unhexl.ts 38 | ./elfify unhexl.ts > unhexl2 39 | $(CHMOD) a+x unhexl2 40 | ./unhexl2 < unhexl.ts.xl > unhexl2.ts 41 | $(CMP) unhexl.ts unhexl2.ts 42 | ./elfify unhexl2.ts > unhexl3 43 | $(CMP) unhexl2 unhexl3 44 | $(RM) unhexl.ts unhexl2.ts unhexl2 unhexl3 45 | 46 | install: unhexl elfify 47 | $(CP) unhexl elfify $(BINDIR) 48 | $(RM) -f $(BINDIR)/unhex 49 | 50 | clean: 51 | $(RM) -f unhexl elfify unhexl.ts unhexl2.ts unhexl2 unhexl3 52 | 53 | world: 54 | set -e; for TARGET in clean init all check install; do \ 55 | $(MAKE) $$TARGET; \ 56 | done 57 | -------------------------------------------------------------------------------- /stage-1/README.md: -------------------------------------------------------------------------------- 1 | # Bootstrap: Stage 1 2 | 3 | In writing the stage 0 `unhex` tool, the two most tedious and error-prone 4 | tasks were generating valid ELF headers, which entailed keeping track 5 | of the size of the `.text` section and the location of the entry point, 6 | and calculating the file offsets used as arguments to various `JMP` and 7 | `Jcc` statements. These two tasks were particularly prone to introduce 8 | errors as the code was modified, perhaps to correct some error found 9 | during testing. Keeping all of the offsets and sizes updated proved 10 | rather more onerous than manually converting the assembly language 11 | into hexadecimal values. 12 | 13 | Therefore stage 1 adds two new tools, `unhexl` and `elfify`, to handle 14 | these tasks. The first, `unhexl`, is a significantly improved version of 15 | the stage 0 `unhex`. 
It allows arbitrary white-space and comments. 16 | More importantly, it allows labels to be defined and referenced – that 17 | is what the `l` at the end of the program name refers to. 18 | 19 | The grammar is: 20 | 21 | ```ebnf 22 | WS ::= [ \t\n] 23 | XDIGIT ::= [0-9A-F] 24 | LCHAR ::= [0-9A-Za-z_] 25 | LSTART ::= [^ \t\n0-9A-F#] 26 | LREFEND ::= [^:0-9A-Za-z_] 27 | CHAR ::= any character 28 | 29 | comment ::= '#' CHAR* '\n' 30 | octet ::= XDIGIT XDIGIT 31 | label ::= LSTART LCHAR+ 32 | ldef ::= label ':' 33 | lref ::= label LREFEND 34 | 35 | file ::= ( comment | octet | lref | ldef | WS* )* 36 | ``` 37 | 38 | In order to keep the grammar simple, only upper case letters are 39 | accepted in hexadecimal octets and labels must not start with a 40 | valid hexadecimal digit. It is suggested that labels start with 41 | a '.' or a lower case letter. 42 | 43 | Label references are converted into little-endian 32-bit offsets 44 | relative to the end of the address being written. For example, the 45 | following x86 assembly will generate an infinite loop with the label 46 | reference expanding to `FB FF FF FF` (or -5 in decimal). 47 | 48 | ```asm 49 | foo: 50 | E9 foo 51 | ``` 52 | 53 | The program takes its source on standard input and does a single pass 54 | over it writing data to standard output. 55 | 56 | > Usage: `unhexl < test.xl > test` 57 | 58 | The fact that it is a single pass means references can only be made to 59 | labels already defined. The lack of dynamic memory allocation in stage 60 | 1 means the number of labels is limited to 256. For the same reason, 61 | the line length is limited to 80 characters. These constraints are not 62 | checked by the stage 1 program – failure to stick to 80 characters per 63 | line or 256 labels *will* result in a buffer overflow. Some other 64 | errors, including various syntax errors, are flagged by a non-zero return 65 | status, but basically, only minimal error checking is done.
66 | 67 | The second program, `elfify`, takes a `.text` section and converts it into 68 | a stand-alone ELF program. Unlike the stage 0 `unhex`, this program has 69 | section headers and a minimal `.shstrtab` section so that tools such as 70 | `objdump -d` will work on it. It also adds these to any executable it 71 | creates. 72 | 73 | Because `elfify` needs to place the size of the `.text` section 74 | in the ELF program header, it cannot act as a straightforward filter 75 | on standard input. (Placing the program header at the end of the file 76 | does not help because the program header's offset is needed in the ELF 77 | header.) Instead it takes the name of the file containing the `.text` 78 | section as its only command line argument. 79 | 80 | > Usage: `elfify test.ts > test` 81 | 82 | where `.ts` is used as the canonical extension for a `.text` section. 83 | 84 | As `elfify` does not parse the `.text` section, it cannot work out where 85 | the entry point is: it just assumes that the entry point is 5 bytes 86 | before the end of the `.text` section. It is therefore suggested that all 87 | programs end with a jump to the real entry point. On x86, a 32-bit 88 | relative jump requires precisely 5 bytes and can easily be generated 89 | by `unhexl` by ending the file with: 90 | 91 | ```asm 92 | E9 main 93 | ``` 94 | 95 | We check the stage 1 tools are working correctly by using them to build 96 | a new copy of `unhexl` from source in its own input language. (This is 97 | why there are two copies of the source for `unhexl`: a `.ts.x` file for 98 | processing with the stage 0 `unhex`, and a `.ts.xl` file for use with the 99 | stage 1 `unhexl`.) This is repeated, and the second and third generation 100 | unhexl binaries are required to be identical.
101 | -------------------------------------------------------------------------------- /stage-1/elfify.x: -------------------------------------------------------------------------------- 1 | 7F 45 4C 46 01 01 01 00 2 | 00 00 00 00 00 00 00 00 3 | 02 00 03 00 01 00 00 00 4 | 0F 82 04 08 34 00 00 00 5 | 54 00 00 00 00 00 00 00 6 | 34 00 7 | 20 00 01 00 8 | 28 00 03 00 9 | 02 00 10 | 01 00 00 00 00 00 00 00 11 | 00 80 04 08 00 80 04 08 12 | 14 02 00 00 14 02 00 00 13 | 05 00 00 00 00 10 00 00 14 | 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 15 | 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 16 | 01 00 00 00 01 00 00 00 06 00 00 00 E0 80 04 08 E0 00 00 00 17 | 34 01 00 00 00 00 00 00 00 00 00 00 04 00 00 00 00 00 00 00 18 | 07 00 00 00 03 00 00 00 00 00 00 00 00 00 00 00 CC 00 00 00 19 | 14 00 00 00 00 00 00 00 00 00 00 00 01 00 00 00 00 00 00 00 20 | 00 2E 74 65 78 74 00 2E 73 68 73 74 72 74 61 62 00 00 00 00 21 | 55 22 | 89 E5 23 | 8B 75 04 24 | 5D 25 | C3 26 | 55 27 | 89 E5 28 | BA 04 00 00 00 29 | 8D 4D 08 30 | BB 01 00 00 00 31 | B8 04 00 00 00 32 | CD 80 33 | 5D 34 | C3 35 | BB 00 00 00 00 36 | B8 01 00 00 00 37 | CD 80 38 | BB 01 00 00 00 39 | B8 01 00 00 00 40 | CD 80 41 | 89 E5 42 | 83 7D 00 01 43 | 7E EC 44 | B9 00 00 00 00 45 | 8B 5D 08 46 | B8 05 00 00 00 47 | CD 80 48 | 83 F8 00 49 | 7C D8 50 | 50 51 | 81 EC 00 01 00 00 52 | 89 E1 53 | 8B 5D FC 54 | B8 6C 00 00 00 55 | CD 80 56 | E8 93 FF FF FF 57 | 81 EE 4D 01 00 00 58 | BA 18 00 00 00 59 | 89 F1 60 | BB 01 00 00 00 61 | B8 04 00 00 00 62 | CD 80 63 | 8B 85 10 FF FF FF 64 | 81 C0 DB 80 04 08 65 | 50 66 | E8 70 FF FF FF 67 | 83 C4 04 68 | 83 C6 1C 69 | BA 28 00 00 00 70 | 89 F1 71 | BB 01 00 00 00 72 | B8 04 00 00 00 73 | CD 80 74 | 8B 85 10 FF FF FF 75 | 81 C0 E0 00 00 00 76 | 50 77 | E8 45 FF FF FF 78 | E8 40 FF FF FF 79 | 83 C4 04 80 | 83 C6 30 81 | BA 44 00 00 00 82 | 89 F1 83 | BB 01 00 00 00 84 | B8 04 00 00 00 85 | CD 80 86 | FF B5 10 FF FF FF 87 | E8 1C FF FF FF 88 | 
83 C4 04 89 | 83 C6 48 90 | BA 4C 00 00 00 91 | 89 F1 92 | BB 01 00 00 00 93 | B8 04 00 00 00 94 | CD 80 95 | 89 E1 96 | BA 00 01 00 00 97 | 8B 5D FC 98 | B8 03 00 00 00 99 | CD 80 100 | 83 F8 00 101 | 0F 8E 02 FF FF FF 102 | 89 C2 103 | BB 01 00 00 00 104 | B8 04 00 00 00 105 | CD 80 106 | EB D8 107 | E9 05 FF FF FF 108 | -------------------------------------------------------------------------------- /stage-1/unhexl.ts.x: -------------------------------------------------------------------------------- 1 | 55 2 | 89 E5 3 | 8B 45 08 4 | 3C 20 5 | 74 0A 6 | 3C 09 7 | 74 06 8 | 3C 0A 9 | 74 02 10 | 31 C0 11 | 5D 12 | C3 13 | 55 14 | 89 E5 15 | 8B 45 08 16 | 3C 30 17 | 7C 18 18 | 3C 3A 19 | 7C 16 20 | 3C 41 21 | 7C 10 22 | 3C 5B 23 | 7C 0E 24 | 3C 5F 25 | 74 0A 26 | 3C 61 27 | 7C 04 28 | 3C 7B 29 | 7C 02 30 | 31 C0 31 | 5D 32 | C3 33 | 55 34 | 89 E5 35 | 8B 45 08 36 | 3C 30 37 | 7C 0C 38 | 3C 3A 39 | 7C 0F 40 | 3C 41 41 | 7C 04 42 | 3C 46 43 | 7E 0B 44 | B8 FF FF FF FF 45 | EB 06 46 | 2C 30 47 | EB 02 48 | 2C 37 49 | 5D 50 | C3 51 | 55 52 | 89 E5 53 | BA 01 00 00 00 54 | 31 DB 55 | B8 03 00 00 00 56 | CD 80 57 | 83 F8 01 58 | 0F 85 3F 01 00 00 59 | 5D 60 | C3 61 | 89 E5 62 | 81 EC 58 10 00 00 63 | 8D 85 A8 EF FF FF 64 | 89 45 A8 65 | C7 45 FC 00 00 00 00 66 | BA 01 00 00 00 67 | 8D 4D AC 68 | 31 DB 69 | B8 03 00 00 00 70 | CD 80 71 | 83 F8 00 72 | 0F 8C 0B 01 00 00 73 | 89 C3 74 | 0F 84 08 01 00 00 75 | 8A 45 AC 76 | 50 77 | E8 40 FF FF FF 78 | 83 F8 00 79 | 5A 80 | 75 CF 81 | 80 7D AC 23 82 | 74 4D 83 | 52 84 | E8 6A FF FF FF 85 | 5B 86 | 3C FF 87 | 74 55 88 | 50 89 | 8D 4D AD 90 | E8 81 FF FF FF 91 | 5B 92 | FF 75 AD 93 | E8 53 FF FF FF 94 | 5A 95 | 83 F8 FF 96 | 0F 84 C7 00 00 00 97 | C6 C1 04 98 | D2 E3 99 | 00 D8 100 | 50 101 | BA 01 00 00 00 102 | 89 E0 103 | 8D 08 104 | BB 01 00 00 00 105 | B8 04 00 00 00 106 | CD 80 107 | 5A 108 | FF 45 FC 109 | E9 7C FF FF FF 110 | 8D 4D AC 111 | E8 40 FF FF FF 112 | 80 7D AC 0A 113 | 75 F2 114 | E9 69 FF FF FF 115 | 8D 4D AC 
116 | 41 117 | E8 2C FF FF FF 118 | FF 31 119 | E8 DA FE FF FF 120 | 5B 121 | 83 F8 00 122 | 75 ED 123 | 80 39 3A 124 | 9C 125 | C6 01 00 126 | 41 127 | 8D 75 AC 128 | 29 F1 129 | 83 F9 12 130 | 0F 8F 62 00 00 00 131 | 9D 132 | 75 1F 133 | 8D 5D A8 134 | 8B 3B 135 | 39 F7 136 | 0F 8D 52 00 00 00 137 | F3 A4 138 | 8B 45 FC 139 | 8B 3B 140 | 89 47 0C 141 | 83 03 10 142 | E9 1B FF FF FF 143 | 8D BD A8 EF FF FF 144 | 3B 7D A8 145 | 7D 35 146 | 51 147 | 56 148 | 57 149 | F3 A6 150 | 5F 151 | 5E 152 | 59 153 | 74 05 154 | 83 C7 10 155 | EB EC 156 | 83 45 FC 04 157 | 8B 47 0C 158 | 2B 45 FC 159 | 50 160 | BA 04 00 00 00 161 | 89 E0 162 | 8D 08 163 | BB 01 00 00 00 164 | B8 04 00 00 00 165 | CD 80 166 | 58 167 | E9 DB FE FF FF 168 | BB 01 00 00 00 169 | B8 01 00 00 00 170 | CD 80 171 | E9 B2 FE FF FF 172 | -------------------------------------------------------------------------------- /stage-1/unhexl.ts.xl: -------------------------------------------------------------------------------- 1 | # unhexl.ts.xl 2 | 3 | # Copyright (C) 2009, 2011 Richard Smith 4 | # All rights reserved. 5 | 6 | #### # Function: bool isws(char) 7 | # Tests whether its argument is in [ \t\n] 8 | 9 | # As with many of the functions here, it is turned upside down 10 | # so the entry point is in the middle. This is because unhexl 11 | # is limited to jumps up the file. 
12 | .L1: 13 | 5D # POP %ebp 14 | C3 # RET 15 | isws: 16 | 55 # PUSH %ebp 17 | 89 E5 # MOVL %esp, %ebp 18 | 8B 45 08 # MOVL 8(%ebp), %eax 19 | 3C 20 # CMPB $0x20, %al # ' ' 20 | 0F 84 .L1 # JE .L1 21 | 3C 09 # CMPB $0x09, %al # '\t' 22 | 0F 84 .L1 # JE .L1 23 | 3C 0A # CMPB $0x0A, %al # '\n' 24 | 0F 84 .L1 # JE .L1 25 | 31 C0 # XORL %eax, %eax 26 | E9 .L1 # JMP .L1 27 | 28 | #### # Function: bool islchr(char) 29 | # Tests whether its argument is in [0-9A-Za-z_] 30 | .L2: 31 | 31 C0 # XORL %eax, %eax 32 | .L3: 33 | 5D # POP %ebp 34 | C3 # RET 35 | islchr: 36 | 55 # PUSH %ebp 37 | 89 E5 # MOVL %esp, %ebp 38 | 8B 45 08 # MOVL 8(%ebp), %eax 39 | 3C 30 # CMPB $0x30, %al # '0' 40 | 0F 8C .L2 # JL .L2 41 | 3C 39 # CMPB $0x39, %al # '9' 42 | 0F 8E .L3 # JLE .L3 43 | 3C 41 # CMPB $0x41, %al # 'A' 44 | 0F 8C .L2 # JL .L2 45 | 3C 5A # CMPB $0x5A, %al # 'Z' 46 | 0F 8E .L3 # JLE .L3 47 | 3C 5F # CMPB $0x5F, %al # '_' 48 | 0F 84 .L3 # JE .L3 49 | 3C 61 # CMPB $0x61, %al # 'a' 50 | 0F 8C .L2 # JL .L2 51 | 3C 7A # CMPB $0x7A, %al # 'z' 52 | 0F 8E .L3 # JLE .L3 53 | E9 .L2 # JMP .L2 54 | 55 | #### # Function: int xchr(char) 56 | # Tests whether its argument is a character in [0-9A-F], and if so, 57 | # converts it to its numeric value (0-15, in %al); otherwise returns -1. 58 | .L6: 59 | 2C 37 # SUBB $0x37, %al # 'A'-10 60 | .L7: 61 | 5D # POP %ebp 62 | C3 # RET 63 | .L4: 64 | B8 FF FF FF FF # MOVL $-1, %eax 65 | E9 .L7 # JMP .L7 66 | .L5: 67 | 2C 30 # SUBB $0x30, %al # '0' 68 | E9 .L7 # JMP .L7 69 | xchr: 70 | 55 # PUSH %ebp 71 | 89 E5 # MOVL %esp, %ebp 72 | 8B 45 08 # MOVL 8(%ebp), %eax 73 | 3C 30 # CMPB $0x30, %al # '0' 74 | 0F 8C .L4 # JL .L4 75 | 3C 39 # CMPB $0x39, %al # '9' 76 | 0F 8E .L5 # JLE .L5 77 | 3C 41 # CMPB $0x41, %al # 'A' 78 | 0F 8C .L4 # JL .L4 79 | 3C 46 # CMPB $0x46, %al # 'F' 80 | 0F 8E .L6 # JLE .L6 81 | E9 .L4 # JMP .L4 82 | 83 | #### # Not a proper function.
84 | # Exits program: `error` exits with status 1; `success` exits with the status already in %ebx 85 | error: 86 | BB 01 00 00 00 # MOVL $1, %ebx 87 | success: 88 | B8 01 00 00 00 # MOVL $1, %eax 89 | CD 80 # INT $0x80 90 | 91 | #### # Function: void readone( [%ecx] char* ) 92 | # Reads one byte into (%ecx) which should already be set. 93 | # Clobbers %edx, %ebx and %eax. 94 | # Exits on failure. 95 | readone: 96 | 55 # PUSH %ebp 97 | 89 E5 # MOVL %esp, %ebp 98 | BA 01 00 00 00 # MOVL $1, %edx 99 | 31 DB # XORL %ebx, %ebx 100 | B8 03 00 00 00 # MOVL $3, %eax 101 | CD 80 # INT $0x80 102 | 83 F8 01 # CMPL $1, %eax 103 | 0F 85 error # JNE error 104 | 5D # POP %ebp 105 | C3 # RET 106 | 107 | #### # The main function. 108 | # Stack is arranged as follows: 109 | # 110 | # -4(%ebp) int addr 111 | # -84(%ebp) char buffer[80] 112 | # -88(%ebp) label* label_end 113 | # -4184(%ebp) label labels[256] 114 | # 115 | # where label is a { char name[12]; int addr }. 116 | 117 | ret: 118 | # This ret is labelled to allow various bits of main to 119 | # jump up to it in order to effect a forwards jump. 120 | 31 C0 # XORL %eax, %eax 121 | C3 # RET 122 | 123 | # --- Test for a comment. 124 | # If found, skip over comment line until we've read a LF 125 | # At end of section, %eax=1 iff we read a comment. 126 | # If %eax=0, all other registers are unaltered. 127 | comment: 128 | 80 7D AC 23 # CMPB $0x23, -84(%ebp) # '#' 129 | 0F 85 ret # JNE ret 130 | .L10: 131 | 8D 4D AC # LEA -84(%ebp), %ecx 132 | E8 readone # CALL readone 133 | 80 7D AC 0A # CMPB $0x0A, -84(%ebp) # '\n' 134 | 0F 85 .L10 # JNE .L10 135 | B8 01 00 00 00 # MOVL $1, %eax 136 | C3 # RET 137 | 138 | # --- Test for an octet. 139 | octet: 140 | FF 75 AC # PUSH -84(%ebp) 141 | E8 xchr # CALL xchr 142 | 5B # POP %ebx 143 | 3C FF # CMPB $-1, %al 144 | 0F 84 ret # JE ret 145 | 146 | # Yes, we do.
Read the next byte 147 | 50 # PUSH %eax 148 | 8D 4D AD # LEA -83(%ebp), %ecx 149 | E8 readone # CALL readone 150 | 5B # POP %ebx 151 | 152 | # Process it 153 | FF 75 AD # PUSH -83(%ebp) 154 | E8 xchr # CALL xchr 155 | 5A # POP %edx 156 | 83 F8 FF # CMPL $-1, %eax 157 | 0F 84 error # JE error 158 | C6 C1 04 # MOVB $4, %cl 159 | D2 E3 # SALB %cl, %bl 160 | 00 D8 # ADDB %bl, %al 161 | 162 | # Byte is now in %al; let's write it 163 | 50 # PUSH %eax 164 | BA 01 00 00 00 # MOVL $1, %edx 165 | 89 E0 # MOVL %esp, %eax 166 | 8D 08 # LEA (%eax), %ecx 167 | BB 01 00 00 00 # MOVL $1, %ebx 168 | B8 04 00 00 00 # MOVL $4, %eax 169 | CD 80 # INT $0x80 170 | 5A # POP %edx 171 | 172 | # Increment the address and return 173 | FF 45 FC # INCL -4(%ebp) 174 | B8 01 00 00 00 # MOVL $1, %eax 175 | C3 # RET 176 | 177 | 178 | # Parts of the label section 179 | labeldef: 180 | # Check that we're not about to over run the label store, 181 | # and then store the label 182 | 8D 5D A8 # LEA -88(%ebp), %ebx 183 | 8B 3B # MOVL (%ebx), %edi 184 | 39 DF # CMPL %ebx, %edi -- label_end >= &label_end means labels[] is full 185 | 0F 8D error # JGE error 186 | F3 # REP 187 | A4 # MOVSB 188 | 8B 45 FC # MOVL -4(%ebp), %eax 189 | 8B 3B # MOVL (%ebx), %edi 190 | 89 47 0C # MOVL %eax, 12(%edi) 191 | 83 03 10 # ADDL $16, (%ebx) 192 | B8 01 00 00 00 # MOVL $1, %eax 193 | C3 # RET 194 | 195 | labelref: 196 | # Look up the label 197 | 8D BD 98 EF FF FF # LEA -4200(%ebp), %edi 198 | .L14: 199 | 83 C7 10 # ADDL $16, %edi 200 | 3B 7D A8 # CMPL -88(%ebp), %edi 201 | 0F 8D error # JGE error 202 | 51 # PUSH %ecx 203 | 56 # PUSH %esi 204 | 57 # PUSH %edi 205 | F3 # REPE 206 | A6 # CMPSB 207 | 5F # POP %edi 208 | 5E # POP %esi 209 | 59 # POP %ecx 210 | 0F 85 .L14 # JNE .L14 211 | 212 | # Found it.
Increment address by four and print offset 213 | 83 45 FC 04 # ADDL $4, -4(%ebp) 214 | 8B 47 0C # MOVL 12(%edi), %eax 215 | 2B 45 FC # SUBL -4(%ebp), %eax 216 | 50 # PUSH %eax 217 | BA 04 00 00 00 # MOVL $4, %edx 218 | 89 E0 # MOVL %esp, %eax 219 | 8D 08 # LEA (%eax), %ecx 220 | BB 01 00 00 00 # MOVL $1, %ebx 221 | B8 04 00 00 00 # MOVL $4, %eax 222 | CD 80 # INT $0x80 223 | 58 # POP %eax 224 | B8 01 00 00 00 # MOVL $1, %eax 225 | C3 # RET 226 | 227 | # --- Test for a label (either definition or reference). 228 | label: 229 | # Read a label 230 | 8D 4D AC # LEA -84(%ebp), %ecx 231 | .L12: 232 | 41 # INCL %ecx 233 | E8 readone # CALL readone 234 | FF 31 # PUSH (%ecx) 235 | E8 islchr # CALL islchr 236 | 5B # POP %ebx 237 | 83 F8 00 # CMPL $0, %eax 238 | 0F 85 .L12 # JNE .L12 239 | 240 | # (%ecx) is now something other than lchr. Is it a colon? 241 | # Also, null terminate, load %esi with start of string, and 242 | # %ecx with its length inc. NUL. 243 | 80 39 3A # CMPB $0x3A, (%ecx) 244 | 9C # PUSHF 245 | C6 01 00 # MOVB $0, (%ecx) 246 | 41 # INCL %ecx 247 | 8D 75 AC # LEA -84(%ebp), %esi 248 | 29 F1 # SUBL %esi, %ecx 249 | 83 F9 12 # CMPL $0x12, %ecx -- NOTE(review): 0x12 is 18, but name[] is 12 bytes; 0C probably intended 250 | 0F 8F error # JG error 251 | 9D # POPF 252 | 0F 85 labelref # JNE labelref 253 | E9 labeldef # JMP labeldef 254 | 255 | # --- The main loop 256 | main: 257 | 89 E5 # MOVL %esp, %ebp 258 | 81 EC 58 10 00 00 # SUBL $4184, %esp 259 | 8D 85 A8 EF FF FF # LEA -4184(%ebp), %eax 260 | 89 45 A8 # MOVL %eax, -88(%ebp) 261 | C7 45 FC 00 00 00 00 # MOVL $0, -4(%ebp) 262 | 263 | .L8: 264 | # Read one byte (not with readone because EOF is permitted) 265 | BA 01 00 00 00 # MOVL $1, %edx 266 | 8D 4D AC # LEA -84(%ebp), %ecx 267 | 31 DB # XORL %ebx, %ebx 268 | B8 03 00 00 00 # MOVL $3, %eax 269 | CD 80 # INT $0x80 270 | 83 F8 00 # CMPL $0, %eax 271 | 0F 8C error # JL error 272 | 89 C3 # MOVL %eax, %ebx 273 | 0F 84 success # JE success 274 | 275 | # Is the byte white space?
If so, loop back 276 | 8A 45 AC # MOVB -84(%ebp), %al 277 | 50 # PUSH %eax 278 | E8 isws # CALL isws 279 | 83 F8 00 # CMPL $0, %eax 280 | 5A # POP %edx 281 | 0F 85 .L8 # JNE .L8 282 | 283 | # We have a byte. What is it? 284 | E8 comment # CALL comment 285 | 83 F8 00 # CMP $0, %eax 286 | 0F 85 .L8 # JNE .L8 287 | 288 | E8 octet # CALL octet 289 | 83 F8 00 # CMP $0, %eax 290 | 0F 85 .L8 # JNE .L8 291 | 292 | E8 label # CALL label 293 | 83 F8 00 # CMP $0, %eax 294 | 0F 85 .L8 # JNE .L8 295 | 296 | E9 error # JMP error 297 | 298 | #### # And finally, the entry point. 299 | # Last per requirement for elfify. 300 | E9 main # JMP main 301 | -------------------------------------------------------------------------------- /stage-2/.gitignore: -------------------------------------------------------------------------------- 1 | .*.swp 2 | unhexl 3 | elfify 4 | as 5 | -------------------------------------------------------------------------------- /stage-2/Makefile: -------------------------------------------------------------------------------- 1 | # stage-2/Makefile 2 | 3 | # Copyright (C) 2010, 2011, 2020 Richard Smith 4 | # All rights reserved. 5 | 6 | SHELL = /bin/sh 7 | 8 | CHMOD = /bin/chmod 9 | RM = /bin/rm 10 | CP = /bin/cp 11 | CMP = /usr/bin/cmp 12 | MAKE = /usr/bin/make 13 | 14 | BINDIR = ../bin 15 | PATH = $(BINDIR) 16 | 17 | all: init as 18 | 19 | init: 20 | @test -d $(BINDIR) || $(MAKE) -C .. 
init 21 | @test -x $(BINDIR)/unhexl || $(MAKE) -C ../stage-1 install 22 | 23 | as: as.ts.xl 24 | unhexl < as.ts.xl > as.ts 25 | elfify as.ts > as 26 | $(CHMOD) a+x as 27 | $(RM) as.ts 28 | 29 | check: check-as 30 | 31 | check-as: as as.s 32 | ./as as.s > as2.ts 33 | elfify as2.ts > as2 34 | $(CHMOD) a+x as2 35 | ./as2 as.s > as3.ts 36 | $(CMP) as2.ts as3.ts 37 | $(RM) as2.ts as3.ts as2 38 | 39 | install: as 40 | $(CP) as $(BINDIR)/as0 41 | $(RM) -f $(BINDIR)/unhexl 42 | 43 | clean: 44 | $(RM) -f as as.ts as2.ts as3.ts as2 45 | 46 | world: 47 | set -e; for TARGET in clean init all check install; do \ 48 | $(MAKE) $$TARGET; \ 49 | done 50 | -------------------------------------------------------------------------------- /stage-2/README.md: -------------------------------------------------------------------------------- 1 | # Bootstrap: Stage 2 2 | 3 | Although the two stage 1 tools, `unhexl` and `elfify`, certainly eased the 4 | process of writing code, manually assembling opcodes is still painful, 5 | especially the encoding of ModR/M bytes for instructions such as `MOV`. 6 | After the need to manually track file offsets for jumps, badly-encoded 7 | opcodes were the most frequent source of error when writing the stage 1 8 | tools. As the stage 1 tools have already alleviated the need to 9 | handle offsets manually, the next logical step is a lightweight 10 | assembler. This is the main tool introduced in this stage. 11 | 12 | The stage 2 assembler makes two passes over the assembler source code, 13 | the first building up a symbol table and the second writing out machine 14 | code. This means that forwards jumps are supported (unlike in the stage 15 | 1 `unhexl`). The supported instruction set is loosely based on the Intel 16 | 8086 instruction set, but with 32-bit addressing (which the 8086, a 17 | 16-bit microprocessor, lacked). Some more recent instructions are added 18 | such as the far conditional jumps (opcode `0F 8x`, introduced in the 386). 
19 | 20 | For many instructions, we simplify the implementation by only supporting 21 | the general version (e.g. the far jump) without the corresponding 22 | special cases (e.g. near or short jumps). Mandatory instruction size 23 | suffixes (`L` for 32-bit, `B` for 8-bit) are used to limit each mnemonic to 24 | a single opcode family; no 16-bit instructions are supported. There is 25 | no support for addressing that involves a SIB byte – so `(%ebp)` is not 26 | supported, though `0(%ebp)` is. 27 | 28 | Labels are restricted to 11 characters, and a maximum of 256 labels are 29 | allowed. When a label appears as an argument to a mnemonic, it is 30 | always treated as a 32-bit program counter relative (pcrel) offset. 31 | This is correct for `JMP`, `Jcc` and `CALL`, but wrong when trying to locate 32 | data compiled into the program. There is a `.hex` directive that takes a 33 | stream of hexadecimal octets (without their '0x' prefix); it is useful 34 | for including data into the object file, or manually assembling an 35 | unsupported instruction. This is the only assembler directive 36 | implemented in the stage-2 assembler. 37 | 38 | The assembler uses AT&T syntax, with the source before the destination 39 | for operations with two operands, as that is the *de facto* standard in 40 | the Unix environment. This introduces some complications into the 41 | assembler: for example, in `MOVL addr, %eax`, `addr` is to be interpreted 42 | as a pcrel address, which means relative to the end of the instruction. 43 | However, when `addr` is parsed, the instruction length is unknown: 44 | had the destination been `4(%ebp)`, that would have been longer. This 45 | requires some look-ahead in the parser. 46 | 47 | Instructions are supposed to be delimited by either a new line or a 48 | semicolon; however, this has been relaxed to allow prefixes (such as `REP`) to 49 | be treated as instructions with no arguments (like `NOP`).
Thus 50 | `REP SCASB` is valid, though the assembler believes it to be two 51 | instructions, not one with a prefix. The full grammar is: 52 | 53 | ```ebnf 54 | HWS ::= [ \t] 55 | DIGIT ::= [0-9] 56 | NZDIGIT ::= [1-9] 57 | XDIGIT ::= [0-9A-F] 58 | LCHAR ::= [0-9A-Za-z_] 59 | LSTART ::= [.A-Za-z_] 60 | CHAR ::= any character 61 | 62 | comment ::= '#' CHAR* '\n' 63 | endline ::= HWS* ( comment | '\n' | ';' ) 64 | identifier ::= LSTART LCHAR+ 65 | labeldef ::= identifier ':' 66 | mnemonic ::= identifier # from the list of known mnemonics 67 | integer ::= ( '0' 'x' XDIGIT+ | NZDIGIT DIGIT* | '0' ) 68 | immediate ::= '$' integer 69 | regname8 ::= 'al' | 'cl' | 'dl' | 'bl' | 'ah' | 'ch' | 'dh' | 'bh' 70 | regname32 ::= 'eax' | 'ecx' | 'edx' | 'ebx' | 'esp' | 'ebp' | 'esi' | 'edi' 71 | register ::= '%' ( regname8 | regname32 ) 72 | regmem ::= register | integer? '(' '%' regname32 ')' 73 | argument ::= HWS* ( immediate | identifier | regmem ) 74 | arguments ::= argument HWS* ',' arguments | argument 75 | instruction ::= mnemonic arguments endline? 76 | octet ::= HWS* XDIGIT XDIGIT 77 | hexbytes ::= '.hex' octet* endline 78 | directive ::= hexbytes 79 | file ::= labeldef | instruction | directive | endline 80 | ``` 81 | 82 | The list of supported mnemonics is: 83 | 84 | ``` 85 | ADCx ADDx ANDx BSFL BSRL CALL CBW CDQ CLC CLD CMPx CMPSx 86 | CWDE DECx DIVx HLT IDIVx IMULx INCx INT Jcc JMP LEA LEAVE 87 | LODSx MOVx MOVSx MULx NEGx NOP NOTx ORx POP POPF PUSH 88 | PUSHF REP REPE REPNE RET SALx SARx SBBx SCASx SHLx SHRx 89 | STC STD STOSx SUBx TESTL XCHGL XORx. 90 | ``` 91 | 92 | In that list, `x` represents a size suffix `L` or `B`, and `cc` is a 93 | condition (`A`, `AE`, `B`, `BE`, `C`, `E`, `G`, `GE`, `L`, `LE`, `O`, `P`, 94 | `PE`, `PO`, `S`, `Z`, together with the negative `Ncc` versions, 95 | [except for `PE` and `PO`]). Some instructions have implicit arguments, 96 | and they *must not* be specified in the source. 
The shift opcodes 97 | (`SAL`, `SAR`, `SHL`, `SHR`) always shift by `%cl` bits, and the 98 | multiplication-like opcodes (`MUL`, `IMUL`, `DIV`, `IDIV`) 99 | always act on `%edx:%eax` (in 32-bit mode) or `%ax` (in 8-bit mode). 100 | 101 | Not all of the instructions normally represented by these mnemonics are 102 | supported. `INT` only takes an 8-bit immediate; `CALL`, `JMP` and `Jcc` 103 | take a program counter relative 32-bit immediate; the unary arithmetics, 104 | `INCx`, `DECx`, `NEGx`, `NOTx`, `SALx`, `SHLx`, `SARx`, `SHRx`, `MULx`, 105 | `IMULx`, `DIVx` and `IDIVx`, take a single 8- or 32-bit r/m operand 106 | (matching the regmem production); and `PUSH` and `POP` take a 32-bit 107 | r/m. `LEA` takes a 32-bit r/m followed by a 32-bit register; the binary 108 | arithmetics, `MOVx`, `ADDx`, `SUBx`, `ADCx`, `SBBx`, `CMPx`, `ANDx`, 109 | `ORx` and `XORx` take either a r/m and a register (in either order) 110 | or an immediate followed by a r/m, all of the appropriate size. The 111 | remaining mnemonics have no operands. 112 | 113 | Because of the need to make two passes over the source, it takes the 114 | name of the source code file as its only command line argument; a 115 | `.text` section is printed on standard output. 116 | 117 | > Usage: `as test.s > test.ts` 118 | 119 | The output is not a valid executable – it is just the `.text` section. It 120 | therefore needs to be used in conjunction with the stage 1 `elfify` tool to 121 | produce an executable.
122 | -------------------------------------------------------------------------------- /stage-3/.gitignore: -------------------------------------------------------------------------------- 1 | .*.swp 2 | elfify 3 | as0 4 | as 5 | ld 6 | -------------------------------------------------------------------------------- /stage-3/Makefile: -------------------------------------------------------------------------------- 1 | # stage-3/Makefile 2 | 3 | # Copyright (C) 2011, 2012, 2013, 2020 Richard Smith 4 | # All rights reserved. 5 | 6 | SHELL = /bin/sh 7 | 8 | CHMOD = /bin/chmod 9 | RM = /bin/rm 10 | CP = /bin/cp 11 | CMP = /usr/bin/cmp 12 | MAKE = /usr/bin/make 13 | 14 | BINDIR = ../bin 15 | PATH = $(BINDIR) 16 | 17 | all: init as ld 18 | 19 | init: 20 | @test -d $(BINDIR) || $(MAKE) -C .. init 21 | @test -x $(BINDIR)/as0 || $(MAKE) -C ../stage-2 install 22 | 23 | # as0 is the assembler installed by stage 2. 24 | # as1 is this stage's assembler (written in asm), assembled by as0. 25 | # as is the same source, assembled by itself (by as1), and therefore similar. 26 | # as2 is a test assembler produced by as; it should be binary identical to as. 27 | 28 | # The same conventions apply to ld, except that there is no ld0.
29 | 30 | as1: as.s 31 | as0 as.s > as.ts 32 | elfify as.ts > as1 33 | $(CHMOD) a+x as1 34 | $(RM) as.ts 35 | 36 | ld1: ld.s 37 | as0 ld.s > ld.ts 38 | elfify ld.ts > ld1 39 | $(CHMOD) a+x ld1 40 | $(RM) ld.ts 41 | 42 | check: check-as check-ld check-sep 43 | 44 | as: as1 ld1 as.s 45 | ./as1 as.s 46 | ./ld1 -o as as.o 47 | $(RM) as.o 48 | 49 | ld: as1 ld1 ld.s 50 | ./as1 ld.s 51 | ./ld1 -o ld ld.o 52 | $(RM) ld.o 53 | 54 | check-as: as ld as.s 55 | $(RM) -f as.o 56 | ./as as.s 57 | ./ld -o as2 as.o 58 | $(CMP) as as2 59 | $(RM) as.o as2 60 | 61 | check-ld: as ld ld.s 62 | $(RM) -f ld.o 63 | ./as ld.s 64 | ./ld -o ld2 ld.o 65 | $(CMP) ld ld2 66 | $(RM) ld.o ld2 67 | 68 | testprog: as ld test1.s test2.s test3.s 69 | ./as test2.s 70 | ./as test3.s 71 | ./ld -r -o test2+3.o test2.o test3.o 72 | ./as test1.s 73 | ./ld -o testprog test1.o test2+3.o 74 | $(RM) test1.o test2.o test3.o test2+3.o 75 | 76 | check-sep: testprog 77 | ./testprog 78 | $(RM) testprog 79 | 80 | .INTERMEDIATE: as.ts ld.ts as.o ld.o as1 ld1 81 | 82 | install: init as ld 83 | $(CP) as ld $(BINDIR) 84 | $(RM) -f $(BINDIR)/as0 $(BINDIR)/elfify 85 | 86 | clean: 87 | $(RM) -f as.ts ld.ts as.o ld.o as1 ld1 88 | $(RM) -f as2.o ld2.o ld as2 ld2 as ld 89 | $(RM) -f test1.o test2.o test3.o test2+3.o testprog 90 | 91 | world: 92 | set -e; for TARGET in clean init all check install; do \ 93 | $(MAKE) $$TARGET; \ 94 | done 95 | -------------------------------------------------------------------------------- /stage-3/README.md: -------------------------------------------------------------------------------- 1 | # Bootstrap: Stage 3 2 | 3 | One major limitation to the stage 2 assembler is that there are rather 4 | small fixed limits to many quantities: labels are limited to 11 5 | characters, and there must be no more than 256 of them; lines can be no 6 | more than 80 characters long. These limits exist because there is no 7 | heap allocation of memory, and fixed-sized arrays are declared on the 8 | stack. 
This puts a severe limit on the complexity of any program 9 | assembled using the stage 2 assembler. 10 | 11 | To overcome this limitation, in this stage we take the stage 2 assembler 12 | source, translate it into assembly language (from hexadecimal), and 13 | add dynamic memory management to the assembler with a standard `malloc()` 14 | and `realloc()` interface. This is used to remove the limit of 256 15 | labels. 16 | 17 | The present `malloc()` implementation is very inefficient because it punts 18 | all the work to the kernel with an `mmap(MAP_ANON)` syscall. This results 19 | in a new memory page being allocated for every block of memory 20 | allocated. For the present purpose this is acceptable, but it will 21 | rapidly cease to be as code becomes more complex. However, at present 22 | there is a strong disincentive to implementing a better `malloc()` 23 | implementation: there is no mechanism for code reuse. Bug fixes or 24 | improvements to the implementation will tend to get lost or incorrectly 25 | applied to the multiple copies scattered around the code. 26 | 27 | To allow code reuse this stage introduces separate assembly and linking 28 | steps allowing each executable to be formed from multiple object files. 29 | This means that some object files (for instance, those containing the 30 | `malloc()` implementation) can be linked into several different executables. 31 | To support this, the assembler (which now writes ELF directly, removing 32 | the need for the stage 1 `elfify` program) includes `.rel.text`, `.symtab` and 33 | `.strtab` sections in its output. The `R_386_PC32` relocation type is used 34 | for relocations between symbols in the `.text` section; the relocations 35 | are stored in the `.rel.text` section. 36 | 37 | Another difficulty with the stage 2 assembler was that the only place to 38 | store data was on the stack or in a register.
Variables like the input 39 | read buffer had to be placed on the stack and pointers to it passed 40 | around to all functions, with the result that the code was rarely 41 | refactored into separate functions. This is addressed by the stage-3 42 | assembler which supports a writable `.data` section. References in the 43 | `.text` section to objects in the `.data` section are handled by way of 44 | `R_386_32` relocations. 45 | 46 | Objects in the `.data` can be initialised with the address of other 47 | objects, e.g. by passing a symbol name to the argument of an `.int` 48 | directive. These are handled by `R_386_32` relocations which are 49 | stored in a new `.rel.data` section. 50 | 51 | The `.text` and `.data` directives are used to switch between sections, and 52 | several other new assembler directives are added. The complete list is 53 | now as follows 54 | 55 | ``` 56 | .text .data .global .globl .local .int .byte .long .hex 57 | .zero .align .string 58 | ``` 59 | 60 | The `.global` (or equivalently, `.globl`) and `.local` directives take a 61 | symbol name as their single argument. They specify the binding of 62 | that symbol. Global binding is currently the default (for compatibility 63 | with stage 2), though that will be changed in a later stage. 64 | 65 | The `.int` and `.byte` directives allow 32-bit and 8-bit integers to be 66 | included directly into the output; `.long` is a synonym for `.int`. 67 | Multiple integers, separated by commas, can be included as arguments. 68 | Unlike the existing `.hex` directive (unchanged from stage 2) which only 69 | accepts hexadecimal octets without prefixes, these support any form of 70 | literal. The `.zero` directive takes one argument and writes that number 71 | of zeros to the output. The `.align` directive also writes a number of 72 | zeros to the output, but the argument to `.align` is the alignment 73 | required. 
Thus `.align 16` outputs enough zero bytes to align the section 74 | to the next 16-byte boundary. 75 | 76 | The `.string` directive allows for strings in double quotes with 77 | a maximum length of 78 characters. They are automatically null 78 | terminated, and the following escapes are understood: 79 | 80 | ``` 81 | \n \t \" \\ \0 82 | ``` 83 | 84 | The use of symbols as address immediates without a `$` prefix (e.g. `ADDL 85 | foo, %eax`) is deprecated. In the stage-2 assembler, this stores the 86 | address of foo in %eax. To get this behaviour in the stage-3 assembler, 87 | add the $ prefix. We also now support the versions of the `MOV` 88 | instruction that transfer a symbol value (rather than the address) to or 89 | from the accumulator. Thus `MOVL foo, %eax` copies the symbol value (as 90 | is standard practice for that notation) and not the symbol address (as 91 | in stage 2). The following instructions are also added: 92 | 93 | ```asm 94 | MOVZX, SETcc 95 | ``` 96 | 97 | Support for the following AT&T aliases for Intel mnemonics has also 98 | been added: 99 | 100 | ``` 101 | CBTW, CLTD, CWTL, MOVZBL, MOVSBL 102 | ``` 103 | 104 | There is also very limited support for instructions with a SIB byte: 105 | just enough to allow `%esp` to be dereferenced in an r/m32, e.g. in 106 | `MOVL (%esp), %eax`. 107 | 108 | Indirect relative branches and calls are now supported, e.g. with 109 | `CALL *%eax`. These are needed for function pointers and computed jumps 110 | as used in a jump table for switch statements. 111 | 112 | Character literals are now allowed as immediates, enclosed in single 113 | quotes. (Note this is unlike the GNU assembler, where character 114 | literals begin with a single quote, but do not have a closing quote.) 115 | They can be preceded by a `$` which is optional (on the basis that no-one 116 | would choose to write an address in terms of its ASCII representation). 117 | The same escape characters are accepted as for strings.
Multicharacter 118 | literals are allowed where a 32-bit immediate is expected, and may 119 | contain up to four characters. Their layout is such as to make them 120 | useful for short text fragments: 'xyz' has the same layout as "xyz". (Note 121 | that this layout is the opposite design decision to that made in gcc. 122 | Neither the C standard nor the ABI provide any guidance on the matter.) 123 | 124 | The assembler requires its source file to be suffixed `.s` and 125 | automatically assigns the output file name by replacing the `.s` with a 126 | `.o` suffix. 127 | 128 | > Usage: `as test.s` 129 | 130 | The linker can take arbitrary input and output file names. The output 131 | file is specified with the `-o` option which must be first on the command 132 | line. 133 | 134 | > Usage: `ld [-r] -o output file1.o file2.o ...` 135 | 136 | If `-r` is specified, the linker partially links its input generating an 137 | object file as output. Without it, the output is an executable. At 138 | present, even when partial linking, there cannot be any undefined 139 | symbols in the output. 140 | 141 | 142 | TODO 143 | 144 | undefined symbols when partially linking (ld -r) 145 | ?? case insensitive mnemonics, registers & hex numbers 146 | -------------------------------------------------------------------------------- /stage-3/test1.s: -------------------------------------------------------------------------------- 1 | # stage-3/test1.s 2 | 3 | # Copyright (C) 2012, 2013 Richard Smith 4 | # All rights reserved.
5 | 6 | .data 7 | .local bar 8 | bar: 9 | .int 0x2A 10 | 11 | .text 12 | _start: 13 | MOVL %esp, %ebp 14 | 15 | # bar = square(bar) 16 | MOVL bar, %eax 17 | PUSH %eax 18 | CALL square 19 | POP %ecx 20 | MOVL %eax, bar 21 | 22 | MOVL bar, %eax 23 | MOVL %eax, %ecx 24 | MOVL foo, %eax 25 | CMPL %eax, %ecx 26 | SETNE %al 27 | MOVZBL %al, %eax 28 | MOVL %eax, foo 29 | 30 | # Call exit(foo) 31 | MOVL foo, %eax 32 | PUSH %eax 33 | 34 | # It should be safe to embed this literal in the middle of this function 35 | .data 36 | .string "\"Hello,\tworld\!\"" 37 | .byte 0xFF 38 | 39 | .text 40 | CALL exit 41 | HLT 42 | 43 | .data 44 | -------------------------------------------------------------------------------- /stage-3/test2.s: -------------------------------------------------------------------------------- 1 | # stage-3/test2.s 2 | 3 | # Copyright (C) 2012, 2013 Richard Smith 4 | # All rights reserved. 5 | 6 | .data 7 | .global bar 8 | bar: 9 | .int 0x09 10 | 11 | .text 12 | exit: 13 | PUSH %ebp 14 | MOVL %esp, %ebp 15 | PUSH %ebx 16 | 17 | MOVL 8(%ebp), %ebx 18 | MOVL $1, %eax # __NR_exit 19 | INT $0x80 20 | 21 | POP %ebx 22 | LEAVE 23 | RET 24 | 25 | .data 26 | .byte '!', '_' 27 | .zero 35 28 | .byte 0x22 29 | .int 'Fish' 30 | -------------------------------------------------------------------------------- /stage-3/test3.s: -------------------------------------------------------------------------------- 1 | # stage-3/test3.s 2 | 3 | # Copyright (C) 2012, 2013 Richard Smith 4 | # All rights reserved. 
5 | 6 | .text 7 | .global square 8 | square: 9 | PUSH %ebp 10 | MOVL %esp, %ebp 11 | MOVL 8(%ebp), %eax 12 | MOVL %eax, %ecx 13 | MULL %ecx 14 | LEAVE 15 | RET 16 | 17 | .data 18 | .globl foo 19 | foo: 20 | .int 1764 # == 0x2A * 0x2A 21 | 22 | -------------------------------------------------------------------------------- /stage-4/.gitignore: -------------------------------------------------------------------------------- 1 | .*.swp 2 | cc 3 | crt0.o 4 | libc.o 5 | -------------------------------------------------------------------------------- /stage-4/Makefile: -------------------------------------------------------------------------------- 1 | # stage-4/Makefile 2 | 3 | # Copyright (C) 2012, 2013, 2014, 2016, 2020 4 | # Richard Smith 5 | # All rights reserved. 6 | 7 | SHELL = /bin/sh 8 | 9 | RM = /bin/rm 10 | CP = /bin/cp 11 | LN_S = /bin/ln -sf 12 | MAKE = /usr/bin/make 13 | CMP = /usr/bin/cmp 14 | 15 | BINDIR = ../bin 16 | LIBDIR = ../lib 17 | PATH = $(BINDIR) 18 | 19 | 20 | all: init cc libc.o crt0.o 21 | 22 | init: 23 | @test -d $(LIBDIR) || $(MAKE) -C .. init 24 | @test -x $(BINDIR)/ld || $(MAKE) -C ../stage-3 install 25 | 26 | # Suppress the default rules 27 | .SUFFIXES: 28 | 29 | %.o: %.s 30 | as $< 31 | 32 | %.s: %.c cc0 33 | ./cc0 -S $< 34 | 35 | # LIB_OBJS contains a minimal C library written entirely in assembler. 36 | # LIB0_OBJS adds bootstrap .o files written in assembler that will be replaced. 37 | # LIB1_OBJS adds to and replaces the LIB0_OBJS with versions written in C. 38 | 39 | LIB_OBJS = string.o ctype.o unistd.o char.o imath.o 40 | LIB0_OBJS = $(LIB_OBJS) error.o stdio.o memory.o 41 | LIB1_OBJS = $(LIB_OBJS) exit.o output.o input.o malloc.o signal.o string2.o \ 42 | stdarg.o 43 | 44 | CC_OBJS = i386.o scanner.o symtab.o expr.o stmt.o main.o 45 | 46 | # libc0.o is the primitive libc, written solely in assembler. 47 | # cc0 is the compiler linked against the libc0.o.
48 | # libc.o is the complete stage-4 libc, with some C code compiled using cc0. 49 | # cc is the compiler relinked against the new libc.o. 50 | 51 | libc0.o: $(LIB0_OBJS) 52 | ld -r -o libc0.o $(LIB0_OBJS) 53 | 54 | cc0: libc0.o crt0.o $(CC_OBJS) 55 | ld -o cc0 libc0.o crt0.o $(CC_OBJS) 56 | 57 | 58 | libc.o: $(LIB1_OBJS) 59 | ld -r -o libc.o $(LIB1_OBJS) 60 | 61 | cc: libc.o crt0.o $(CC_OBJS) 62 | ld -o cc libc.o crt0.o $(CC_OBJS) 63 | 64 | .INTERMEDIATE: $(CC_OBJS) $(LIB1_OBJS) $(LIB0_OBJS) libc0.o cc0 65 | 66 | check: check-output check-input check-malloc 67 | 68 | check-output: cc output.c output.o 69 | $(LN_S) output.c output2.c 70 | ./cc -S output2.c 71 | as output2.s 72 | $(CMP) output2.o output.o 73 | $(RM) output2.c output2.o output2.s 74 | 75 | check-input: cc input.c input.o 76 | $(LN_S) input.c input2.c 77 | ./cc -S input2.c 78 | as input2.s 79 | $(CMP) input2.o input.o 80 | $(RM) input2.c input2.o input2.s 81 | 82 | check-malloc: cc malloc.c malloc.o 83 | $(LN_S) malloc.c malloc2.c 84 | ./cc -S malloc2.c 85 | as malloc2.s 86 | $(CMP) malloc2.o malloc.o 87 | $(RM) malloc2.c malloc2.o malloc2.s 88 | 89 | install: init cc libc.o crt0.o 90 | $(CP) cc $(BINDIR)/cc0 91 | $(CP) libc.o crt0.o $(LIBDIR) 92 | 93 | clean: 94 | $(RM) -f $(LIB0_OBJS) 95 | $(RM) -f $(LIB1_OBJS) output.s input.s malloc.s signal.s libc.o 96 | $(RM) -f $(CC_OBJS) cc0 cc crt0.o 97 | $(RM) -f output2.o output2.s input2.o input2.s malloc2.o malloc2.s 98 | 99 | world: 100 | set -e; for TARGET in clean init all check install; do \ 101 | $(MAKE) $$TARGET; \ 102 | done 103 | -------------------------------------------------------------------------------- /stage-4/README.txt: -------------------------------------------------------------------------------- 1 | BOOTSTRAP STAGE 4 2 | 3 | The main product of stage 4 is a compiler.
The original intention had 4 | been to implement a B compiler, perhaps with the caveat that the few 5 | constructs that changed syntactically between B and C would be 6 | implemented in the C way -- for example, += not =+, and 'extern' not 7 | 'extrn'. But it rapidly became obvious that the B memory model was 8 | not forwards compatible with the C memory model on machines with byte 9 | addressing. Implementing B on an x86 would require pointers to be 10 | represented as integer offsets into memory, and a pointer dereference, 11 | *ptr, would translate into a ModR/M + SIB instruction: (%ebx, %eax, 4) 12 | where %ebx is NULL and %eax is the pointer value. Taking the address 13 | of an automatic variable would be worse and involve an explicit bit- 14 | shift. 15 | 16 | Nevertheless, B's lack of a type system significantly simplifies the 17 | implementation, and this feature of B has been retained in the stage 4 18 | compiler. Our single type is a 32-bit integer which also serves as an 19 | address. Incrementing the value increments the underlying address by 20 | one, as with a char* in C. This means that, unlike in B, incrementing 21 | an address does not move to the next integer in an array: use ptr += 4 22 | for that. However, subscripting with [] works with 32-bit word offsets, 23 | so that ptr[1] is equivalent to *(ptr + 4). To treat a pointer as a 24 | string and get character-level access, we provide two functions rchar(s,n) 25 | and lchar(s,n,c) to get and set a character, respectively, at the given 26 | offset (rchar is B's char function, renamed). These are provided in char.s. When types are introduced in a 27 | subsequent stage, this behaviour can be preserved because subscripting 28 | an int (other than with a pointer) is not legal in C. 29 | 30 | For forwards compatibility, certain type constructs are allowed and 31 | completely ignored.
The (otherwise unsupported) int keyword may be 32 | placed immediately after auto, and the identifiers in an auto 33 | declaration can be preceded with one or more *. A list of parameter 34 | declarations may precede the opening brace of a function. 35 | 36 | Summary of differences from B: 37 | 38 | * Compound assignment operators are spelt OP= instead of =OP. 39 | * There are no relop assignment operators (e.g. =<, =>=, ===). 40 | * Definitions require '=' (i.e. 'i = 42' not 'i 42'). 41 | * Arrays require a size (i.e. 'auto a[1] = {0}' not 'auto a[] = {0}'). 42 | * Arrays with too many initialisers do not expand to accommodate them. 43 | * The '{' ... '}' around single-statement functions are required. 44 | * We support logical && and || complete with short circuiting. 45 | * We support the 'continue' keyword from C. 46 | * We support C's 'do' ... 'while' loop construct. 47 | * We allow 'static' on global variables and functions. 48 | * The return statement does not require brackets. 49 | * We don't allow backspace (character 0x7F) or dot (.) in identifiers. 50 | * The escape character in strings is \ not *, and there is no \e. 51 | * We don't support the switch statement, or therefore case labels. 52 | * We don't support goto and labeled statements. 53 | * Not a difference, but B does not support 'for' loops and nor do we. 54 | 55 | The stage 4 compiler is a simple affair, making a single pass over the 56 | input file and code generation is done straight out of the parser, 57 | without building an abstract syntax tree (AST) representation. This 58 | means that the code generated is very inefficient, and even very obvious 59 | optimisations are not made. As an example, all lvalue-to-rvalue 60 | conversions are done as separate statements, so to read a local auto 61 | variable, we generate LEA -offset(%ebp), %eax; MOVL (%eax), %eax 62 | instead of the more obvious MOVL -offset(%ebp), %eax.
63 | 64 | Usage: cc -S file.c 65 | 66 | The compiler is initially linked against a trivial I/O library that 67 | implements the basic C I/O functions in an unbuffered manner, doing one 68 | syscall per call to getchar() or putchar(). Similarly, malloc() is 69 | implemented as in stage 3, by sending each allocation request to the 70 | kernel as a mmap(MAP_ANON) call. The resultant compiler, cc0, is used 71 | to compile an improved set of I/O and memory-management functions that 72 | do buffering. These are linked together with ld -r into a proto-C- 73 | library, libc.o. There is also a trivial startup file, crt0.o, that 74 | implements _start() by calling exit(main()). We use these to relink 75 | the compiler against this to produce a significantly faster compiler. 76 | 77 | Linking a program is typically achieved with a command such as: 78 | 79 | ld -o prog libc.o crt0.o file1.o file2.o ... 80 | 81 | -------------------------------------------------------------------------------- /stage-4/char.s: -------------------------------------------------------------------------------- 1 | # char.s -- functions write/read a character to/from a string 2 | 3 | # Copyright (C) 2013 Richard Smith 4 | # All rights reserved. 5 | 6 | # The functions here are needed to work around the lack of a character type 7 | # in the B language. 8 | 9 | .text 10 | 11 | #### # Function: char rchar(char const* s, size_t n); 12 | # 13 | # B's char function, renamed to avoid forward compatibility problems 14 | # with C's keyword. Returns the byte S[N], zero padded in a word. 15 | .globl rchar 16 | rchar: 17 | PUSH %ebp 18 | MOVL %esp, %ebp 19 | 20 | XORL %eax, %eax 21 | MOVL 8(%ebp), %edx 22 | ADDL 12(%ebp), %edx 23 | MOVB (%edx), %al 24 | 25 | POP %ebp 26 | RET 27 | 28 | #### # Function: char lchar(char* s, size_t n, char c); 29 | # 30 | # B's lchar function. Sets S[N] = C, and returns C, zero padded in 31 | # a word. 
32 | .globl lchar 33 | lchar: 34 | PUSH %ebp 35 | MOVL %esp, %ebp 36 | 37 | XORL %eax, %eax 38 | MOVB 16(%ebp), %al 39 | MOVL 8(%ebp), %edx 40 | ADDL 12(%ebp), %edx 41 | MOVB %al, (%edx) 42 | 43 | POP %ebp 44 | RET 45 | 46 | -------------------------------------------------------------------------------- /stage-4/crt0.s: -------------------------------------------------------------------------------- 1 | # crt0.s 2 | 3 | # Copyright (C) 2012, 2013 Richard Smith 4 | # All rights reserved. 5 | 6 | #### # Function: void _start() 7 | # 8 | # The ELF entry point. 9 | _start: 10 | XORL %ebp, %ebp 11 | PUSH %ebp 12 | MOVL %esp, %ebp 13 | 14 | MOVL $__io_flush, %eax 15 | PUSH %eax 16 | CALL atexit 17 | POP %eax 18 | 19 | LEA 8(%ebp), %eax # argv 20 | PUSH %eax 21 | PUSH 4(%ebp) # argc 22 | CALL main 23 | PUSH %eax 24 | CALL exit 25 | HLT 26 | 27 | -------------------------------------------------------------------------------- /stage-4/ctype.s: -------------------------------------------------------------------------------- 1 | # ctype.s 2 | 3 | # Copyright (C) 2012, 2013 Richard Smith 4 | # All rights reserved. 5 | 6 | .data 7 | .local ctype_tbl 8 | ctype_tbl: 9 | 10 | # This table contains character bits for the 0x80 characters in ASCII: 11 | # 12 | # space = 0x1, print = 0x2, cntrl = 0x4, upper = 0x8, 13 | # lower = 0x10, alpha = 0x20, digit = 0x40, punct = 0x80, xdigit = 0x100 14 | # 15 | # The categories match those in [category.ctype] in C++ and enable 16 | # efficient implementation of the C standard library ctype functions. 17 | # NB. We would ideally use .short, but the stage 3 as doesn't support 18 | # either the directive or the necessary operations on 16-bit registers.
19 | 20 | .int 0x004, 0x004, 0x004, 0x004, 0x004, 0x004, 0x004, 0x004 # 0x00 21 | .int 0x004, 0x005, 0x005, 0x005, 0x005, 0x005, 0x004, 0x004 22 | .int 0x004, 0x004, 0x004, 0x004, 0x004, 0x004, 0x004, 0x004 # 0x10 23 | .int 0x004, 0x004, 0x004, 0x004, 0x004, 0x004, 0x004, 0x004 24 | .int 0x003, 0x082, 0x082, 0x082, 0x082, 0x082, 0x082, 0x082 # 0x20 25 | .int 0x082, 0x082, 0x082, 0x082, 0x082, 0x082, 0x082, 0x082 26 | .int 0x142, 0x142, 0x142, 0x142, 0x142, 0x142, 0x142, 0x142 # 0x30 27 | .int 0x142, 0x142, 0x082, 0x082, 0x082, 0x082, 0x082, 0x082 28 | .int 0x082, 0x12A, 0x12A, 0x12A, 0x12A, 0x12A, 0x12A, 0x02A # 0x40 29 | .int 0x02A, 0x02A, 0x02A, 0x02A, 0x02A, 0x02A, 0x02A, 0x02A 30 | .int 0x02A, 0x02A, 0x02A, 0x02A, 0x02A, 0x02A, 0x02A, 0x02A # 0x50 31 | .int 0x02A, 0x02A, 0x02A, 0x082, 0x082, 0x082, 0x082, 0x082 32 | .int 0x082, 0x132, 0x132, 0x132, 0x132, 0x132, 0x132, 0x032 # 0x60 33 | .int 0x032, 0x032, 0x032, 0x032, 0x032, 0x032, 0x032, 0x032 34 | .int 0x032, 0x032, 0x032, 0x032, 0x032, 0x032, 0x032, 0x032 # 0x70 35 | .int 0x032, 0x032, 0x032, 0x082, 0x082, 0x082, 0x082, 0x004 36 | 37 | 38 | .text 39 | .local getctype 40 | #### # Function: int getctype(int chr); 41 | # 42 | # A utility function to return the character class bits of CHR from ctype_tbl, or 0 if CHR is not in 0x00..0x7F. 43 | getctype: 44 | PUSH %ebp 45 | MOVL %esp, %ebp 46 | 47 | # Characters >= 0x80 (as unsigned, which catches EOF) return 0 48 | XORL %eax, %eax 49 | MOVL 8(%ebp), %edx 50 | CMPL $0x80, %edx 51 | JAE .L1 52 | 53 | # Find ctype_tbl[%edx] 54 | # Unfortunately, stage 3 as doesn't support MOVL (%eax,%edx,4), %eax. 55 | MOVB $2, %cl 56 | SHLL %edx # Shift left by %cl (2 bits): multiply %edx by 4, the size of an .int entry 57 | MOVL $ctype_tbl, %eax 58 | ADDL %edx, %eax 59 | MOVL (%eax), %eax 60 | 61 | .L1: 62 | POP %ebp 63 | RET 64 | 65 | 66 | 67 | #### # Function: int isspace(int c); 68 | # 69 | # Standard C library function to test for ASCII space characters.
70 | .globl isspace 71 | isspace: 72 | PUSH %ebp 73 | MOVL %esp, %ebp 74 | PUSH 8(%ebp) 75 | CALL getctype 76 | POP %ecx 77 | ANDL $0x1, %eax 78 | POP %ebp 79 | RET 80 | 81 | 82 | #### # Function: int isdigit(int c); 83 | # 84 | # Standard C library function to test for ASCII digits, 0-9. 85 | .globl isdigit 86 | isdigit: 87 | PUSH %ebp 88 | MOVL %esp, %ebp 89 | PUSH 8(%ebp) 90 | CALL getctype 91 | POP %ecx 92 | ANDL $0x40, %eax 93 | POP %ebp 94 | RET 95 | 96 | 97 | #### # Function: int isalpha(int c); 98 | # 99 | # Standard C library function to test for ASCII letter, a-z, A-Z. 100 | .globl isalpha 101 | isalpha: 102 | PUSH %ebp 103 | MOVL %esp, %ebp 104 | PUSH 8(%ebp) 105 | CALL getctype 106 | POP %ecx 107 | ANDL $0x20, %eax 108 | POP %ebp 109 | RET 110 | 111 | 112 | #### # Function: int isalnum(int c); 113 | # 114 | # Standard C library function to test for ASCII letters or digits. 115 | .globl isalnum 116 | isalnum: 117 | PUSH %ebp 118 | MOVL %esp, %ebp 119 | PUSH 8(%ebp) 120 | CALL getctype 121 | POP %ecx 122 | ANDL $0x60, %eax 123 | POP %ebp 124 | RET 125 | 126 | 127 | #### # Function: int ispunct(int c); 128 | # 129 | # Standard C library function to test for ASCII punctuation, 130 | # i.e. anything that is not a digit, letter, space or control. 131 | .globl ispunct 132 | ispunct: 133 | PUSH %ebp 134 | MOVL %esp, %ebp 135 | PUSH 8(%ebp) 136 | CALL getctype 137 | POP %ecx 138 | ANDL $0x80, %eax 139 | POP %ebp 140 | RET 141 | 142 | 143 | #### # Function: int isxdigit(int c); 144 | # 145 | # Standard C library function to test for hex ASCII digits, 0-9A-Fa-f. 
	#  Returns the masked class bit (0x100 or 0), not a normalised 0/1.
	.globl	isxdigit
isxdigit:
	PUSH	%ebp
	MOVL	%esp, %ebp
	PUSH	8(%ebp)
	CALL	getctype
	POP	%ecx			# discard the argument
	ANDL	$0x100, %eax		# keep only the "hex digit" class bit
	POP	%ebp
	RET


--------------------------------------------------------------------------------
/stage-4/error.s:
--------------------------------------------------------------------------------
# error.s  --  bootstrap code for error handling

# Copyright (C) 2012, 2013 Richard Smith
# All rights reserved.

####	#  Function:	void _error()
	#
	#  All library error handling is done here: calls exit(1).
	#  (Note we can JMP here instead of CALLing it, as we never RET.)
_error:
	MOVL	$1, %eax
	PUSH	%eax
	CALL	exit
	HLT				# not reached if exit() does not return


####	#  Function:	int atexit( void (*fn)(void) )
	#
	#  Dummy function that does nothing: FN is not recorded and %eax
	#  is returned unchanged.
atexit:
	RET


####	#  Function:	void exit(int status)
	#
	#  Terminate program execution with the given status by passing
	#  it straight to _exit.
	#  NOTE(review): this bootstrap version does not itself flush or
	#  close any streams before calling _exit.
exit:
	PUSH	%ebp
	MOVL	%esp, %ebp

	PUSH	8(%ebp)
	CALL	_exit
	HLT				# not reached if _exit() does not return
--------------------------------------------------------------------------------
/stage-4/exit.c:
--------------------------------------------------------------------------------
/* exit.c
 *
 * Copyright (C) 2013 Richard Smith
 * All rights reserved.
 */

/* A growable vector of registered handlers, held as three byte addresses:
 * [0] start of storage, [1] one past the last handler stored, and
 * [2] one past the end of the allocated storage. */
static atexit_vec[3] = { 0, 0, 0 };  /* start, end, end_store */

/* Register FN to be called by exit().  The vector starts with room for one
 * 4-byte entry and doubles in size whenever it is full.
 *
 * NOTE(review): the results of malloc() and realloc() are not checked.
 * NOTE(review): this returns 1, whereas ISO C atexit() returns zero on
 * success; confirm what callers expect before relying on the result. */
atexit(fn) {
    if ( !atexit_vec[0] ) {
        /* First registration: allocate storage for a single entry */
        auto sz = 1 * 4;  /* 4 == sizeof( void (*)() ) */
        auto p = malloc(sz);
        atexit_vec[0] = atexit_vec[1] = p;
        atexit_vec[2] = p + sz;
    }
    else {
        /* sz is the number of bytes currently in use */
        auto sz = atexit_vec[1] - atexit_vec[0];
        auto p = atexit_vec[0];
        if ( atexit_vec[1] == atexit_vec[2] ) {
            /* Full: double the storage and re-derive all three pointers,
             * because realloc() may have moved the block */
            p = realloc( atexit_vec[0], 2*sz );
            atexit_vec[0] = p;
            atexit_vec[1] = p + sz;
            atexit_vec[2] = p + 2*sz;
        }
    }

    *atexit_vec[1] = fn;
    atexit_vec[1] += 4;  /* sizeof( void (*)() ) */
    return 1;
}

/* Run the registered handlers, most recently registered first, and then
 * terminate the process with status CODE via _exit(). */
exit(code) {
    /* Call registered functions in reverse order */
    auto ptr = atexit_vec[1], start = atexit_vec[0];
    while ( ptr > start ) {
        ptr -= 4;  /* sizeof( void (*)() ) */
        (*ptr)();
    }

    /* We don't need to flush stdout and stderr, because there is an
     * atexit handler registered that does that. */

    _exit( code );
}
--------------------------------------------------------------------------------
/stage-4/imath.s:
--------------------------------------------------------------------------------
# imath.s  --  functions for integer maths

# Copyright (C) 2013 Richard Smith
# All rights reserved.

.text

####	#  Function:	int abs(int i);
	#
	#  The C library's abs() function.
	.globl	abs
abs:
	PUSH	%ebp
	MOVL	%esp, %ebp
	MOVL	8(%ebp), %eax

	#  Implementation without branching
	CDQ				# sign extend into %edx:%eax
	XORL	%edx, %eax
	SUBL	%edx, %eax

	POP	%ebp
	RET


####	#  Function:	unsigned __mul_add( unsigned* val,
	#                                   unsigned mul, unsigned add );
	#
	#  Calculate *val = *val * mul + add and return the carry bits
	#  (i.e. the high 32 bits).  This is used in strtoul().
31 | .globl __mul_add 32 | __mul_add: 33 | PUSH %ebp 34 | MOVL %esp, %ebp 35 | 36 | MOVL 8(%ebp), %ecx 37 | MOVL (%ecx), %eax 38 | MULL 12(%ebp) # %edx:%eax <= *val * mul (unsigned) 39 | 40 | XORL %ecx, %ecx # do first as it clears CF 41 | ADDL 16(%ebp), %eax # %eax += add; sets CF 42 | ADCL %ecx, %edx # %edx += CF (%ecx is zero) 43 | 44 | MOVL 8(%ebp), %ecx 45 | MOVL %eax, (%ecx) # Update *val 46 | MOVL %edx, %eax # Return carry bits from %edx 47 | 48 | POP %ebp 49 | RET 50 | 51 | 52 | #### # Function: void __add64( unsigned* hi1, unsigned* low1, 53 | # unsigned hi2, unsigned low2 ); 54 | # 55 | # Treat hi1:low1 and h12:low2 as 64-bit integers and add them. 56 | .globl __add64 57 | __add64: 58 | PUSH %ebp 59 | MOVL %esp, %ebp 60 | 61 | MOVL 12(%ebp), %ecx 62 | MOVL 20(%ebp), %eax 63 | ADDL %eax, (%ecx) # sets CF 64 | 65 | MOVL 8(%ebp), %ecx 66 | MOVL 16(%ebp), %eax 67 | ADCL %eax, (%ecx) # adds in CF 68 | 69 | POP %ebp 70 | RET 71 | 72 | #### # Function: void __sub64( unsigned* hi1, unsigned* low1, 73 | # unsigned hi2, unsigned low2 ); 74 | # 75 | # Treat hi1:low1 and h12:low2 as 64-bit integers and subtract the 76 | # second from the first.. 77 | .globl __sub64 78 | __sub64: 79 | PUSH %ebp 80 | MOVL %esp, %ebp 81 | 82 | MOVL 12(%ebp), %ecx 83 | MOVL 20(%ebp), %eax 84 | SUBL (%ecx), %eax # sets CF 85 | 86 | MOVL 8(%ebp), %ecx 87 | MOVL 16(%ebp), %eax 88 | SBBL (%ecx), %eax # subtracts CF too 89 | 90 | POP %ebp 91 | RET 92 | 93 | -------------------------------------------------------------------------------- /stage-4/main.s: -------------------------------------------------------------------------------- 1 | # main.s -- entry point 2 | 3 | # Copyright (C) 2012, 2013 Richard Smith 4 | # All rights reserved. 

.data
	.globl	frame_size
	#  Zeroed by func_decl at the start of each function definition.
frame_size:
	.int	0

####	#  Function:	void strg_class(char* name, bool is_static);
	#
	#  Emit a .globl or .local directive for NAME as appropriate for
	#  the storage class in IS_STATIC: ".local" when IS_STATIC is
	#  non-zero, ".globl" otherwise.
.data .LC2:
	.string	".globl\t%s\n"
.LC3:
	.string	".local\t%s\n"
.text
	.local	strg_class
strg_class:
	PUSH	%ebp
	MOVL	%esp, %ebp

	PUSH	8(%ebp)			# name, the %s argument for printf
	MOVL	12(%ebp), %eax
	TESTL	%eax, %eax
	JZ	.L9
	MOVL	$.LC3, %eax		# static: ".local"
	JMP	.L10
.L9:
	MOVL	$.LC2, %eax		# not static: ".globl"
.L10:
	PUSH	%eax
	CALL	printf

	LEAVE				# also discards the two printf arguments
	RET


####	#  Function:	int init_a_decl(int dim);
	#
	#    init-a-list ::= constant ( ',' constant )*
	#
	#    init-a-decl ::= ( '=' '{' init-a-list '}' )? ';'
	#
	#  Current token is '=' if an initialiser is present.
	#  Emits the initialisers as a single ".int" directive and
	#  returns the number of uninitialised elements.  Jumps to _error
	#  if there are more than DIM initialisers.
.data .LC5:
	.string	".int\t"
.text
	.local	init_a_decl
init_a_decl:
	PUSH	%ebp
	MOVL	%esp, %ebp
	PUSH	8(%ebp)			# -4(%ebp) local copy of dim

	MOVL	token, %eax
	CMPL	'=', %eax
	JNE	.L11			# no initialiser: expect ';' straight away
	CALL	next
	CMPL	'{', %eax
	JNE	_error

	MOVL	$.LC5, %eax
	PUSH	%eax
	CALL	putstr
	POP	%eax

.L12:	#  Loop over initialisers.
	#  Do we have too many?
	DECL	-4(%ebp)		# one fewer element left uninitialised
	CMPL	$0, -4(%ebp)
	JL	_error

	CALL	next
	CMPL	'num', %eax
	JE	.L13
	CMPL	'char', %eax
	JE	.L13
	CMPL	'id', %eax
	JE	.L13
	JMP	_error

.L13:	#  number, char or identifier: emit its text from VALUE
	MOVL	$value, %eax
	PUSH	%eax
	CALL	putstr
	POP	%eax

	CALL	next
	CMPL	',', %eax
	JNE	.L14

	PUSH	%eax			# echo the ',' between initialisers
	CALL	putchar
	POP	%eax
	JMP	.L12

.L14:
	CMPL	'}', %eax
	JNE	_error

	MOVL	'\n', %eax		# terminate the .int directive
	PUSH	%eax
	CALL	putchar
	POP	%eax

	CALL	next
.L11:
	CMPL	';', %eax
	JNE	_error
	CALL	next

	POP	%eax			# return the remaining count from -4(%ebp)
	POP	%ebp
	RET


####	#  Function:	void array_decl(char* name, bool is_static);
	#
	#    array-decl ::= ( 'static' )? name '[' number ']' init-a-decl
	#
	#  Process an array declaration for NAME.  Current token is '['.
	#  Any elements without an initialiser are emitted as a ".zero"
	#  directive of the corresponding byte size.

.data .LC4:
	.string	".data\n%s:\n"
.LC6:
	.string	".zero\t%d\n"
.text
	.local	array_decl
array_decl:
	PUSH	%ebp
	MOVL	%esp, %ebp

	#  Require an array size
	CALL	next
	CMPL	'num', %eax
	JNE	_error

	PUSH	%eax			# endptr slot
	MOVL	%esp, %ecx

	XORL	%eax, %eax
	PUSH	%eax			# guess base
	PUSH	%ecx			# &endptr
	MOVL	$value, %eax
	PUSH	%eax
	CALL	strtol
	ADDL	$12, %esp
	POP	%ecx			# endptr, as written by strtol
	PUSH	%eax			# store size: -4(%ebp), in elements

	#  %ecx contains the end ptr: require that it points at the
	#  terminating NUL, i.e. that the whole of VALUE was converted.
	MOVL	(%ecx), %ecx
	CMPB	$0, %cl
	JNE	_error

	MOVB	$2, %cl			# *= sizeof(int)
	SHLL	%eax
	PUSH	%eax			# byte size: probably not needed?
	XORL	%eax, %eax
	PUSH	%eax			# not an lvalue
	PUSH	%eax			# frame offset == 0 (i.e. for linker)
	PUSH	8(%ebp)			# name
	CALL	save_sym
	ADDL	$16, %esp

	CALL	next
	CMPL	']', %eax
	JNE	_error

	#  Emit the symbol name
	PUSH	8(%ebp)
	MOVL	$.LC4, %eax
	PUSH	%eax
	CALL	printf
	POP	%eax
	POP	%eax

	#  Do a series of .int decls for the initialisers.  The element
	#  count stored at -4(%ebp) is on top of the stack here, so it
	#  serves as init_a_decl's DIM argument.
	CALL	next
	CALL	init_a_decl

	MOVB	$2, %cl			# *= sizeof(int)
	SHLL	%eax
	TESTL	%eax, %eax
	JZ	.L15			# fully initialised: no padding needed

	PUSH	%eax
	MOVL	$.LC6, %eax
	PUSH	%eax
	CALL	printf
	POP	%eax
	POP	%eax

.L15:
	LEAVE
	RET


####	#  Function:	void int_decl(char* name, bool is_static);
	#
	#    constant ::= number | char | name
	#
	#    int_decl ::= ( 'static' )? name ( '=' constant )? ';'
	#
	#  Process an integer declaration for NAME.  The name has been read,
	#  and TOKEN advanced to the next token: either '=' or ';'
	#  Without an initialiser the object is emitted as ".int 0".
.data .LC1:
	.string	".data\n%s:\n\t.int %s\n"
.text
	.local	int_decl
int_decl:
	PUSH	%ebp
	MOVL	%esp, %ebp

	XORL	%eax, %eax
	PUSH	%eax			# external, so zero size on stack
	INCL	%eax
	PUSH	%eax			# objects are lvalues
	DECL	%eax
	PUSH	%eax			# frame_off == 0 for undefined
	PUSH	8(%ebp)			# name
	CALL	save_sym
	ADDL	$16, %esp

	MOVL	'0', %eax		# the string "0" built on the stack ...
	PUSH	%eax			# ... as the default initialiser text
	MOVL	token, %eax
	CMPL	'=', %eax
	JNE	.L3

	#  Read an initialiser
	CALL	next
	CMPL	'num', %eax
	JE	.L3a
	CMPL	'char', %eax
	JE	.L3a
	CMPL	'id', %eax
	JE	.L3a
	JMP	_error

.L3a:
	#  Because the next token is punctuation, value is not set.
	CALL	next
	MOVL	$value, %eax		# VALUE still holds the initialiser text
	PUSH	%eax
	JMP	.L4
.L3:
	PUSH	%esp			# address of the "0" string pushed earlier
.L4:
	PUSH	8(%ebp)
	MOVL	$.LC1, %eax
	PUSH	%eax
	CALL	printf
	POP	%eax
	POP	%eax
	POP	%eax

	MOVL	token, %eax
	CMPL	';', %eax
	JNE	_error
	CALL	next

	LEAVE
	RET


####	#  Function:	void func_decl(char* name, bool is_static);
	#
	#    func-params ::= name ( ',' name )*
	#
	#    func-head ::= ( 'static' )? name '(' func-params? ')'
	#
	#    func-decl ::= func-head param-decls block
	#
	#  Process a function declaration.  Current token is '('.
	#  Each parameter is entered in a new scope with size 4 and frame
	#  offsets 8, 12, 16, ... in order of appearance.
	.local	func_decl
func_decl:
	PUSH	%ebp
	MOVL	%esp, %ebp

	CALL	new_scope
	XORL	%eax, %eax
	MOVL	%eax, frame_size

	CALL	next
	CMPL	')', %eax
	JE	.L5			# empty parameter list

	#  Build the trailing save_sym arguments once; only the name is
	#  pushed and popped for each parameter.
	MOVL	$4, %ecx
	PUSH	%ecx			# all parameters have size 4	-4(%ebp)
	MOVL	$1, %ecx
	PUSH	%ecx			# parameters are lvalues	-8(%ebp)
	MOVL	$8, %ecx
	PUSH	%ecx			# frame_off			-12(%ebp)

.L5a:
	CMPL	'id', %eax
	JNE	_error

	MOVL	$value, %eax
	PUSH	%eax
	CALL	save_sym
	POP	%eax
	CALL	next

	ADDL	$4, -12(%ebp)		# frame offset of the next parameter
	CMPL	',', %eax
	JNE	.L5b
	CALL	next
	JMP	.L5a

.L5b:
	CMPL	')', %eax
	JNE	_error
	POP	%eax			# the other two slots are freed by LEAVE
.L5:
	PUSH	8(%ebp)
	CALL	prolog
	POP	%eax

	CALL	new_label		# For return
	PUSH	%eax
	XORL	%eax, %eax
	PUSH	%eax			# NOTE(review): these two zeros and the label
	PUSH	%eax			# look like block()'s arguments; confirm in stmt.s

	CALL	next
	CALL	param_decls
	CALL	block
	POP	%eax
	POP	%eax
	CALL	local_label		# the return label is now on top of the stack
	POP	%eax

	#  Don't call clear_stack because this scope only contains
	#  function parameters, and the caller cleans up the stack.
	CALL	end_scope
	CALL	epilog

	LEAVE
	RET

####	#  Function:	void ext_decl();
	#
	#  Process an external declaration.
	#
	#    ext-decl ::= func-decl | int-decl | array-decl
	#
	#  When called, TOKEN is the first token of the declaration.  An
	#  optional 'static' is consumed here, skip_type is then called,
	#  any '*' declarators are skipped, and the following token must be
	#  the identifier being declared.
	.local	ext_decl
ext_decl:
	PUSH	%ebp
	MOVL	%esp, %ebp

	SUBL	$16, %esp		# -16(%ebp) buffer

	#  Are we static?
	XORL	%eax, %eax
	PUSH	%eax			# bool is_static = false;  -20(%ebp)
	MOVL	token, %eax
	CMPL	'stat', %eax		# first four bytes of "static"
	JNE	.L8
	INCL	-20(%ebp)		# is_static = 1
	CALL	next
.L8:
	CALL	skip_type
.L8a:
	#  Skip any pointer declarators
	MOVL	token, %eax
	CMPL	'*', %eax
	JNE	.L8b
	CALL	next
	JMP	.L8a
.L8b:
	#  Check that we've read an identifier first
	CMPL	'id', %eax
	JNE	_error

	#  Store a copy of the name.  (We can't emit the label yet as we
	#  don't yet know which section it belongs in.)
	#  Names longer than 11 characters are rejected, although the
	#  buffer is 16 bytes.
	MOVL	$value, %eax
	PUSH	%eax			# src
	CALL	strlen
	CMPL	$11, %eax
	JG	_error
	LEA	-16(%ebp), %eax
	PUSH	%eax			# dest
	CALL	strcpy
	POP	%ecx
	POP	%eax
	PUSH	%ecx			# Pointer to name

	#  The stack now holds (name, is_static), which is the argument
	#  list of strg_class and of the three *_decl functions below.
	CALL	strg_class

	#  Get the next token and dispatch based on it.
	CALL	next
	CMPL	'(', %eax
	JE	.L6
	CMPL	'[', %eax
	JE	.L6a
	CALL	int_decl		# anything else: a plain integer object
	JMP	.L7
.L6:
	CALL	func_decl
	JMP	.L7
.L6a:
	CALL	array_decl
.L7:
	LEAVE
	RET

####	#  Function:	void program();
	#
	#    program ::= ext-decl*
	#
	#  Processes external declarations until TOKEN is -1 (end of
	#  input), writing a blank line after each one.
program:
	PUSH	%ebp
	MOVL	%esp, %ebp
.L1:
	MOVL	token, %eax
	CMPL	$-1, %eax
	JE	.L2

	CALL	ext_decl

	#  Add a blank line
	MOVL	'\n', %eax
	PUSH	%eax
	CALL	putchar
	POP	%eax
	JMP	.L1
.L2:
	POP	%ebp
	RET


####	#  Function:	int main(int argc, char** argv);
	#
	#  Accepts either "-S file.c" or "-S -o file.o file.c".  Standard
	#  input is reopened on the source file and standard output on the
	#  output file before the program is compiled.  Any usage or I/O
	#  error jumps to _error.
main:
	PUSH	%ebp
	MOVL	%esp, %ebp

	CMPL	$3, 8(%ebp)		# Require at least two arguments: -S file.c
	JL	_error

	#  Check that argv[1] == '-S'.  The two-character literal is
	#  pushed to form a NUL-terminated string on the stack, and its
	#  address (%esp) is passed to strcmp.
	MOVL	'-S', %eax
	PUSH	%eax
	PUSH	%esp
	MOVL	12(%ebp), %eax
	PUSH	4(%eax)
	CALL	strcmp
	ADDL	$12, %esp
	TESTL	%eax, %eax
	JNZ	_error

	#  Do we have -o file.o?
	MOVL	$8, %edx		# 8 == 4*argn
	CMPL	$3, 8(%ebp)		# If just two args (-S file.c)
	JE	.L16
	CMPL	$5, 8(%ebp)		# Otherwise four (-S -o file.o file.c)
	JNE	_error

	#  Check that argv[2] == '-o'
	MOVL	'-o', %eax
	PUSH	%eax
	PUSH	%esp
	MOVL	12(%ebp), %eax
	PUSH	8(%eax)
	CALL	strcmp
	ADDL	$12, %esp
	TESTL	%eax, %eax
	JNZ	_error

	MOVL	$16, %edx		# 16 == 4*argn

.L16:
	#  Use argv[argn] as a filename and reopen stdin as it; %edx holds
	#  4*argn, the byte offset of that entry within argv.
	MOVL	'r', %eax
	PUSH	%eax			# the mode string "r", built on the stack
	MOVL	%esp, %ecx
	MOVL	stdin, %eax
	PUSH	%eax			# stream
	PUSH	%ecx			# mode
	MOVL	12(%ebp), %eax
	ADDL	%edx, %eax
	PUSH	(%eax)			# filename (argv[argn])
	CALL	freopen
	ADDL	$16, %esp
	TESTL	%eax, %eax
	JZ	_error

	#  Do we have an explicit output filename?
	MOVL	12(%ebp), %eax
	MOVL	12(%eax), %edx		# argv[3]: used only when argc == 5
	CMPL	$5, 8(%ebp)
	JE	.L17

	#  Construct the output filename by rewriting the ".c" suffix of
	#  argv[2] to ".s" in place.
	MOVL	12(%ebp), %eax
	PUSH	8(%eax)
	CALL	strlen
	POP	%edx
	ADDL	%eax, %edx		# %edx points at the terminating NUL
	CMPB	'c', -1(%edx)
	JNE	_error
	CMPB	'.', -2(%edx)
	JNE	_error
	MOVB	's', -1(%edx)
	MOVL	12(%ebp), %eax
	MOVL	8(%eax), %edx

.L17:
	#  And reopen stdout as it.
	MOVL	'w', %eax
	PUSH	%eax			# the mode string "w", built on the stack
	MOVL	%esp, %ecx
	MOVL	stdout, %eax
	PUSH	%eax			# stream
	PUSH	%ecx			# mode
	PUSH	%edx			# filename
	CALL	freopen
	ADDL	$16, %esp
	TESTL	%eax, %eax
	JZ	_error

	CALL	init_symtab
	CALL	next			# prime TOKEN with the first token
	CALL	program
	XORL	%eax, %eax		# return 0
	LEAVE
	RET


####	#  Function:	void _error()
	#
	#  All error handling is done here.
	#  NB.  There is a duplicate (identical) definition in libc0.o
	#  (Note we can JMP here instead of CALLing it, as we never RET.)
_error:
	MOVL	$1, %eax
	PUSH	%eax
	CALL	exit
	HLT


--------------------------------------------------------------------------------
/stage-4/malloc.c:
--------------------------------------------------------------------------------
/* malloc.c
 *
 * Copyright (C) 2013, 2018 Richard Smith
 * All rights reserved.
 */


/* struct header {
 *     size_t size;
 *     bool is_free;
 *     struct header* next;
 *     struct header* prev;
 *     void* page_start;      // effectively a page id
 * }; */

/* In the code below a header is accessed as five 4-byte words: [0] size,
 * [1] is_free, [2] next, [3] prev and [4] page_start.  The header occupies
 * 20 bytes and the user data follows it immediately. */

/* Head of the doubly-linked list of all blocks */
static
__heap = 0;

/* This is a function pointer: void (*__memdbgfn)( int op, void *ptr )
 * If set, it is called just before malloc() returns with op == 1, and
 * just after free() is called with a non-zero pointer with op == 2. */
static
__memdbgfn = 0;

/* Install FN as the allocation debugging hook described above */
__dbg_alloc( fn ) {
    __memdbgfn = fn;
}

static
__find_blk( last, size ) {
    /* Look for a block of at least SIZE bytes in the list at LAST.
     * Returns the first free block that is big enough, or 0 if none. */
    while ( last && !( last[1] && last[0] >= size ) )
        last = last[2];
    return last;
}

static
__new_blk( size ) {
    /* Allocate a new block for at least SIZE bytes and prepend to __heap.
     * The mmap() arguments are the same prot (0x3) and flags (0x22) values
     * that memory.s documents as PROT_READ|PROT_WRITE and
     * MAP_ANON|MAP_PRIVATE.
     * NOTE(review): the result of mmap() is not checked before use. */
    auto blksz = size > 0x0FEC ? size : 0x0FEC;  /* 0x0FEC == 0x1000 - 20 */
    auto p = mmap(0, blksz + 20, 0x3, 0x22, -1, 0);
    p[0] = blksz;
    p[1] = 1;
    p[2] = __heap;
    p[3] = 0;
    p[4] = p;        /* a fresh mapping is its own page id */
    if (__heap) __heap[3] = p;
    __heap = p;
    return p;
}

static
__frag( blk, size ) {
    /* If the block BLK is significantly bigger than SIZE bytes, then fragment
     * it into a block of exactly SIZE bytes and a second block for the rest.
     * The second block is marked free, inherits the page id of BLK and is
     * linked in directly after it.
     * NOTE(review): SIZE is not rounded up, so a size that is not a multiple
     * of 4 places the second header at an unaligned address. */
    if ( blk[0] >= size + 20 + 4 ) {
        auto b2 = blk + size + 20;
        b2[0] = blk[0] - size - 20;
        b2[1] = 1;
        b2[2] = blk[2];
        b2[3] = blk;
        b2[4] = blk[4];
        if (b2[2]) b2[2][3] = b2;
        blk[0] = size;
        blk[2] = b2;
    }
}

static
__defrag2( blk, b2 ) {
    /* The block at BLK and the next block, which is at B2, are both empty
     * so coalesce them into a single block.  The merged block absorbs the
     * 20-byte header of B2 as well as its data. */
    blk[0] += b2[0] + 20;
    blk[2] = b2[2];
    if (blk[2]) blk[2][3] = blk;
}

static
__defrag( blk ) {
    /* See whether the block at BLK can be coalesced with either neighbour.
     * A neighbour qualifies only if it is free and has the same page id. */
    if ( blk[2] && blk[4] == blk[2][4] && blk[2][1] ) __defrag2( blk, blk[2] );
    if ( blk[3] && blk[4] == blk[3][4] && blk[3][1] ) __defrag2( blk[3], blk );
}

/* The C library malloc(): first fit over the block list, mapping a new
 * block when nothing suitable is free, then splitting off any excess. */
malloc( size ) {
    auto p = __find_blk( __heap, size );
    if (!p) p = __new_blk( size );
    __frag( p, size );
    p[1] = 0;
    auto ptr = p + 20;      /* user data follows the 20-byte header */
    if (__memdbgfn) __memdbgfn( 1, ptr );
    return ptr;
}

/* The C library free(): a null PTR is ignored; freeing a block that is
 * already marked free calls abort(). */
free( ptr ) {
    if (ptr) {
        if (__memdbgfn) __memdbgfn( 2, ptr );
        auto p = ptr - 20;
        if (p[1]) abort();  /* Double free */
        p[1] = 1;
        __defrag( p );
    }
}

/* The C library realloc(): grows in place when the following block is free,
 * has the same page id and makes the combined size large enough; otherwise
 * allocates a new block, copies the smaller of the old and new sizes, and
 * frees the old block. */
realloc( ptr, size ) {
    if ( !ptr )
        return malloc( size );

    auto h = ptr - 20;
    if ( h[2] && h[4] == h[2][4] && h[2][1] && h[0] + h[2][0] + 20 >= size ) {
        __defrag2( h, h[2] );
        __frag( h, size );
        return ptr;
    }
    else {
        auto p = malloc(size);
        auto cpsz = h[0] < size ? h[0] : size;
        memcpy( p, ptr, cpsz );
        free( ptr );
        return p;
    }
}
--------------------------------------------------------------------------------
/stage-4/memory.s:
--------------------------------------------------------------------------------
# memory.s  --  bootstrap code for memory handling

# Copyright (C) 2012, 2013 Richard Smith
# All rights reserved.

####	#  Function:	void* malloc(size_t sz)
	#  Crude dynamic memory allocation, by punting directly to kernel
malloc:
	PUSH	%ebp
	MOVL	%esp, %ebp
	PUSH	%ebx

	#  How many bytes do we need?
	MOVL	8(%ebp), %ecx		# sz
	ADDL	$0x4, %ecx		# header containing size

	#  Punt off to mmap(MAP_ANON).  Highly suboptimal, but simple to code.
	XORL	%eax, %eax		# 0 offset
	PUSH	%eax
	DECL	%eax
	PUSH	%eax			# fd -1 for MAP_ANON
	MOVL	$0x22, %eax		# MAP_ANON (0x20) | MAP_PRIVATE (0x2)
	PUSH	%eax
	MOVL	$0x3, %eax		# PROT_READ (0x1) | PROT_WRITE (0x2)
	PUSH	%eax
	PUSH	%ecx			# size
	XORL	%eax, %eax		# NULL
	PUSH	%eax
	CALL	mmap
	CMPL	$-1, %eax
	JE	_error
	MOVL	-24(%ebp), %ecx		# restore %ecx from the pushed size argument

	#  Write size into malloc header.  The stored size includes the
	#  4-byte header, and the returned pointer is just past it.
	MOVL	%ecx, (%eax)
	ADDL	$4, %eax

	#  Cleanup
	MOVL	-4(%ebp), %ebx
	LEAVE
	RET


####	#  Function:	void* realloc(void* ptr, size_t sz)
	#  Grows memory allocated by malloc, above, by asking the kernel to
	#  mremap the block; the block may move.
	#  NOTE(review): PTR is dereferenced unconditionally, so a null
	#  PTR is not supported here.
realloc:
	PUSH	%ebp
	MOVL	%esp, %ebp
	PUSH	%ebx
	PUSH	%esi

	#  Leave space for header (4 bytes)
	MOVL	8(%ebp), %ebx		# ptr
	SUBL	$4, %ebx		# start of the mapping (the header)
	MOVL	(%ebx), %ecx		# old size
	MOVL	12(%ebp), %edx		# size
	ADDL	$4, %edx
	PUSH	%edx			# keep the new size for the header

	#  Get kernel to mremap the block
	MOVL	$1, %esi		# 1 == MREMAP_MAYMOVE
	MOVL	$163, %eax		# 163 == __NR_mremap
	INT	$0x80
	CMPL	$-4096, %eax		# -4095 <= %eax < 0 for errno
	JA	_error			# unsigned comparison handles above

	#  Write header
	POP	%ecx
	MOVL	%ecx, (%eax)
	ADDL	$4, %eax

	#  Cleanup
	POP	%esi
	POP	%ebx
	POP	%ebp
	RET


####	#  Function:	void free(void* ptr)
	#  Releases memory allocated by malloc, above, by unmapping the
	#  whole block using the size recorded in its header.
	#  NOTE(review): PTR is dereferenced unconditionally, so a null
	#  PTR is not supported here.
free:
	PUSH	%ebp
	MOVL	%esp, %ebp
	PUSH	%ebx

	MOVL	8(%ebp), %ebx		# ptr
	SUBL	$4, %ebx		# start of the mapping (the header)
	MOVL	(%ebx), %ecx		# old size
	MOVL	$91, %eax		# 91 == __NR_munmap
	INT	$0x80
	CMPL	$-4096, %eax		# -4095 <= %eax < 0 for errno
	JA	_error			# unsigned comparison handles above

	#  Cleanup
	POP	%ebx
	POP	%ebp
	RET

--------------------------------------------------------------------------------
/stage-4/scanner.s:
--------------------------------------------------------------------------------
# scanner.s  --  code for tokenising the B input stream

# Copyright (C) 2012, 2013 Richard Smith
# All rights reserved.

.data

# We use TOKEN as an enum for the different token types.
.globl token
token:
	.int	0

# The VALUE buffer contains tokens as they are being read.
.globl value
value:
	.zero	80


.text

####	#  Function:	void skip_ccomm();
	#
	#  Skips over a C-style comment (the opening /* having been read
	#  already).  Reaching end of input inside the comment is an error.
	.local	skip_ccomm
skip_ccomm:
	PUSH	%ebp
	MOVL	%esp, %ebp

.L20:
	#  Scan forward to the next '*'.
	CALL	getchar
	CMPL	$-1, %eax
	JE	_error
	CMPB	'*', %al
	JNE	.L20

.L22:
	#  A '*' has just been read; a '/' now ends the comment.
	CALL	getchar
	CMPL	$-1, %eax
	JE	_error
	#  A further '*' may itself be the start of the terminator, so
	#  stay in this state instead of throwing it away.  Without this
	#  a comment closed by two stars and a slash is never terminated.
	CMPB	'*', %al
	JE	.L22
	CMPB	'/', %al
	JNE	.L20

	POP	%ebp
	RET


####	#  Function:	int skip_white();
	#
	#  Skips over any white space characters (including comments), and
	#  returns the next character (having ungot it).
	.local	skip_white
skip_white:
	PUSH	%ebp
	MOVL	%esp, %ebp

.L1:
	CALL	getchar
	PUSH	%eax
	CALL	isspace
	TESTL	%eax, %eax
	POP	%eax			# POP leaves the flags from TESTL intact
	JNZ	.L1

	#  Handle comments
	CMPB	'/', %al
	JNE	.L18
	PUSH	%eax			# keep the '/' while we look one ahead
	CALL	getchar
	CMPB	'*', %al
	JNE	.L19
	POP	%eax
	CALL	skip_ccomm
	JMP	.L1

.L19:
	#  Not a comment: put back the lookahead and then the '/' itself.
	PUSH	%eax
	CALL	ungetchar
	POP	%eax
	POP	%eax

.L18:
	PUSH	%eax
	CALL	ungetchar
	POP	%eax

	POP	%ebp
	RET


####	#  Function:	int isidchar1(int chr);
	#
	#  Test whether CHR can start an identifier.
93 | .local isidchar1 94 | isidchar1: 95 | PUSH %ebp 96 | MOVL %esp, %ebp 97 | 98 | MOVL 8(%ebp), %ecx 99 | MOVL $1, %eax 100 | CMPB '_', %cl 101 | JE .L2 102 | PUSH %ecx 103 | CALL isalpha 104 | .L2: 105 | LEAVE 106 | RET 107 | 108 | 109 | #### # Function: int isidchar(int chr); 110 | # 111 | # Test whether CHR can occur in an identifier, other than as 112 | # the first character. 113 | .local isidchar 114 | isidchar: 115 | PUSH %ebp 116 | MOVL %esp, %ebp 117 | 118 | MOVL 8(%ebp), %ecx 119 | MOVL $1, %eax 120 | CMPB '_', %cl 121 | JE .L3 122 | PUSH %ecx 123 | CALL isalnum 124 | .L3: 125 | LEAVE 126 | RET 127 | 128 | 129 | #### # Function: int ismopchar(int chr); 130 | # 131 | # Is CHR a character than can occur at the start of a multi-character 132 | # operator? 133 | .data 134 | .local mopchars 135 | mopchars: 136 | .string "+-*/<>&|!=%^" 137 | .text 138 | .local ismopchar 139 | ismopchar: 140 | PUSH %ebp 141 | MOVL %esp, %ebp 142 | 143 | PUSH 8(%ebp) 144 | MOVL $mopchars, %eax 145 | PUSH %eax 146 | CALL strchr 147 | 148 | LEAVE 149 | RET 150 | 151 | #### # Function: void get_multiop(); 152 | # 153 | # Reads a multi-character operator. 154 | .data 155 | .local mops2 156 | mops2: 157 | .int '++', '--', '<<', '>>', '<=', '>=', '==', '!=', '&&', '||' 158 | .int '*=', '%=', '/=', '+=', '-=', '&=', '|=', '^=' 159 | .int 0 # <-- end of table 160 | 161 | .text 162 | .local get_multiop 163 | get_multiop: 164 | PUSH %ebp 165 | MOVL %esp, %ebp 166 | 167 | CALL getchar 168 | MOVL %eax, token 169 | 170 | CALL getchar 171 | CMPL $-1, %eax 172 | JE .L13 173 | PUSH %eax 174 | MOVL token, %eax 175 | MOVL %eax, %ecx 176 | MOVB -4(%ebp), %ch # %ecx is now the two-char token 177 | 178 | MOVL $mops2, %eax 179 | MOVL %eax, %edx 180 | .L14: 181 | # Loop testing tokens 182 | CMPL %ecx, (%edx) 183 | JE .L15 184 | INCL %edx 185 | CMPL $0, (%edx) 186 | JNE .L14 187 | JMP .L17 188 | 189 | .L15: 190 | # Definitely got a two-character token. What about a third? 
	POP	%eax			# discard the saved second character
	MOVL	%ecx, %eax
	MOVL	%eax, token
	CMPL	'<<', %ecx
	JE	.L16
	CMPL	'>>', %ecx
	JE	.L16
	JMP	.L13
.L16:
	#  Handle <<= and >>=
	CALL	getchar
	CMPL	$-1, %eax
	JE	.L13
	PUSH	%eax
	CMPB	'=', %al
	JNE	.L17
	POP	%edx
	MOVB	$16, %cl
	SALL	%edx			# move the '=' up to bits 16-23
	MOVL	token, %eax
	ORL	%edx, %eax
	MOVL	%eax, token
	JMP	.L13
.L17:
	#  The character on top of the stack is not part of the operator.
	CALL	ungetchar
	POP	%eax
.L13:
	POP	%ebp
	RET

####	#  Function:	int get_word();
	#
	#  Reads an identifier or keyword into VALUE and sets TOKEN to 'id',
	#  which chk_keyword then replaces if the word is a keyword.  The
	#  character following the word is ungot.
	#  NOTE(review): %eax on return is whatever chk_keyword leaves in
	#  it, not the next byte; next() reloads TOKEN rather than using it.
	.local	get_word
get_word:
	PUSH	%ebp
	MOVL	%esp, %ebp
	PUSH	%edi

	#  Skip whitespace and test for an identifier
	CALL	skip_white
	PUSH	%eax
	CALL	isidchar1
	POP	%ecx
	TESTL	%eax, %eax
	JZ	_error

	MOVL	'id', %eax		# 'id' for identifier
	MOVL	%eax, token
	MOVL	$value, %eax
	MOVL	%eax, %edi		# string pointer
	DECL	%edi			# compensate for the INCL at the loop head

.L4:	#  Loop reading characters, and check for buffer overflow
	INCL	%edi
	MOVL	$value, %eax
	SUBL	%edi, %eax
	CMPL	$-79, %eax		# error once the offset into VALUE reaches 79
	JLE	_error

	CALL	getchar
	MOVB	%al, (%edi)
	PUSH	%eax
	CALL	isidchar
	TESTL	%eax, %eax
	POP	%eax			# POP leaves the flags from TESTL intact
	JNE	.L4

	#  Unget the last character
	PUSH	%eax
	CALL	ungetchar
	POP	%eax

	#  Write null terminator (over the character just ungot)
	XORB	%cl, %cl
	MOVB	%cl, (%edi)

	CALL	chk_keyword

	POP	%edi
	POP	%ebp
	RET


####	#  Function:	void chk_keyword();
	#
	#  Check whether VALUE contains a keyword, and if so, sets TOKEN
	#  accordingly.
	#  The token used for a keyword is the first four bytes of its
	#  name.  Each table entry is padded to 12 bytes.
.data
	.local	keywords
	.align	12
keywords:
	#  These are the supported keywords
	.string "auto"       .align 12
	.string "break"      .align 12
	.string "continue"   .align 12
	.string "do"         .align 12
	.string "else"       .align 12
	.string "extern"     .align 12
	.string "if"         .align 12
	.string "return"     .align 12
	.string "static"     .align 12
	.string "while"      .align 12

	#  These keywords are not supported, but are skipped in some contexts
	.string "char"       .align 12
	.string "int"        .align 12
	.string "struct"     .align 12
	.byte	0		# <-- the end of table marker

.text
	.local	chk_keyword
chk_keyword:
	PUSH	%ebp
	MOVL	%esp, %ebp
	PUSH	%edi
	PUSH	%esi

	MOVL	$value, %edi
	MOVL	$keywords, %esi
.L10:
	CMPB	$0, (%esi)		# reached the end of table marker?
	JE	.L12
	PUSH	%edi
	PUSH	%esi
	CALL	strcmp
	POP	%ecx
	POP	%ecx
	TESTL	%eax, %eax
	JZ	.L11

	ADDL	$12, %esi		# advance to the next 12-byte entry
	JMP	.L10
.L11:
	#  Found it.  Use the first dword of the name to put in TOKEN.
	MOVL	(%esi), %eax
	MOVL	%eax, token

.L12:
	POP	%esi
	POP	%edi
	POP	%ebp
	RET


####	#  Function:	int get_qlit();
	#
	#  Reads the textual representation of a character or string literal
	#  into VALUE, including the quotation marks, sets TOKEN to 'char' or
	#  'str' (as appropriate) and returns the next byte (having ungot it).
343 | .local get_qlit 344 | get_qlit: 345 | PUSH %ebp 346 | MOVL %esp, %ebp 347 | 348 | # Skip whitespace and test for the opening '\'' 349 | CALL skip_white 350 | PUSH %eax # -4(%ebp) is the quote character 351 | CMPB '\'', %al 352 | JE .L21a 353 | CMPB '\"', %al 354 | JE .L21b 355 | JNE _error 356 | 357 | .L21a: 358 | MOVL 'char', %eax # 'char' for character literal 359 | JMP .L21c 360 | .L21b: 361 | MOVL 'str', %eax # 'str' for character literal 362 | .L21c: 363 | MOVL %eax, token 364 | MOVL $value, %eax 365 | MOVL %eax, %edi # string pointer 366 | 367 | CALL getchar 368 | MOVB %al, (%edi) 369 | 370 | .L21: # Loop reading characters, and check for buffer overflow 371 | INCL %edi 372 | MOVL $value, %eax 373 | SUBL %edi, %eax 374 | CMPL $-78, %eax # 78 to allow for \', etc. 375 | JLE _error 376 | 377 | CALL getchar 378 | CMPL $-1, %eax 379 | JE _error 380 | MOVB %al, (%edi) 381 | CMPB -4(%ebp), %al 382 | JE .L21d 383 | CMPB '\\', %al 384 | JNE .L21 385 | 386 | # Read an escaped character 387 | INCL %edi 388 | CALL getchar 389 | CMPL $-1, %eax 390 | JE _error 391 | MOVB %al, (%edi) 392 | 393 | JMP .L21 394 | 395 | .L21d: 396 | # Write null terminator 397 | INCL %edi 398 | XORB %cl, %cl 399 | MOVB %cl, (%edi) 400 | 401 | # Peek another character 402 | CALL getchar 403 | PUSH %eax 404 | CALL ungetchar 405 | POP %eax 406 | 407 | LEAVE 408 | RET 409 | 410 | 411 | #### # Function: int get_number(); 412 | # 413 | # Reads the textual representation of a number into VALUE, 414 | # and returns the next byte (having ungot it). 
415 | .local get_number 416 | get_number: 417 | PUSH %ebp 418 | MOVL %esp, %ebp 419 | PUSH %edi 420 | 421 | MOVL 'num', %eax # 'num' for number 422 | MOVL %eax, token 423 | MOVL $value, %eax 424 | MOVL %eax, %edi # string pointer 425 | 426 | # Skip whitespace and test for an identifier 427 | CALL getchar 428 | MOVB %al, (%edi) 429 | PUSH %eax 430 | CALL isdigit 431 | POP %ecx 432 | TESTL %eax, %eax 433 | JZ _error 434 | CMPB '0', (%edi) 435 | JNE .L5 436 | 437 | # The first character was '0', so either octal, hex or zero. 438 | 439 | # Store that one byte, and look at the next one 440 | INCL %edi 441 | CALL getchar 442 | MOVB %al, (%edi) 443 | CMPB 'x', %al 444 | JE .L24 445 | PUSH %eax 446 | CALL isdigit 447 | POP %ecx 448 | TESTL %eax, %eax 449 | JNZ .L5 450 | 451 | # It must be a literal zero 452 | MOVL %ecx, %eax # restore character 453 | JMP .L23 454 | 455 | .L24: # It's a hex number 456 | INCL %edi 457 | MOVL $value, %eax 458 | SUBL %edi, %eax 459 | CMPL $-79, %eax 460 | JLE _error 461 | 462 | CALL getchar 463 | MOVB %al, (%edi) 464 | PUSH %eax 465 | CALL isxdigit 466 | TESTL %eax, %eax 467 | POP %eax 468 | JNZ .L24 469 | 470 | PUSH %eax 471 | CALL isalpha 472 | TESTL %eax, %eax 473 | JNZ _error 474 | POP %eax 475 | JMP .L23 476 | 477 | .L5: # It's a decimal or octal number -- we don't care which 478 | # Loop reading characters, and check for buffer overflow 479 | INCL %edi 480 | MOVL $value, %eax 481 | SUBL %edi, %eax 482 | CMPL $-79, %eax 483 | JLE _error 484 | 485 | CALL getchar 486 | MOVB %al, (%edi) 487 | PUSH %eax 488 | CALL isdigit 489 | TESTL %eax, %eax 490 | POP %eax 491 | JNZ .L5 492 | 493 | PUSH %eax 494 | CALL isalpha 495 | TESTL %eax, %eax 496 | JNZ _error 497 | POP %eax 498 | 499 | .L23: 500 | # Unget the last character 501 | PUSH %eax 502 | CALL ungetchar 503 | POP %eax 504 | 505 | # Write null terminator 506 | XORB %cl, %cl 507 | MOVB %cl, (%edi) 508 | 509 | POP %edi 510 | POP %ebp 511 | RET 512 | 513 | 514 | #### # Function: int next(); 515 | 
#
	#  Reads the next token, returning the token type (or -1 for EOF)
	#
	#  The first non-blank character selects the reader: identifier,
	#  number, multi-character operator or quoted literal.  Any other
	#  character is consumed and becomes the token itself.
next:
	PUSH	%ebp
	MOVL	%esp, %ebp

	# NOTE(review): presumably skip_white returns the next character
	# without consuming it, since the get_* readers fetch it again --
	# confirm against skip_white.
	CALL	skip_white
	CMPL	$-1, %eax
	JE	.L6a			# EOF: token = -1

	PUSH	%eax
	CALL	isidchar1
	POP	%ecx			# %ecx = the character
	TESTL	%eax, %eax
	JNZ	.L7

	PUSH	%ecx
	CALL	isdigit
	POP	%ecx
	TESTL	%eax, %eax
	JNZ	.L8

	PUSH	%ecx
	CALL	ismopchar
	POP	%ecx
	TESTL	%eax, %eax
	JNZ	.L8a

	CMPB	'\'', %cl
	JE	.L8b
	CMPB	'\"', %cl
	JE	.L8b

	# Single-character token: consume it and use it as the token type
	CALL	getchar
.L6a:
	MOVL	%eax, token
	JMP	.L6

.L7:
	CALL	get_word
	JMP	.L9
.L8:
	CALL	get_number
	JMP	.L9
.L8a:
	CALL	get_multiop
	JMP	.L9
.L8b:
	CALL	get_qlit
	JMP	.L9
.L9:
	# The get_* readers store the token type in `token'
	MOVL	token, %eax
.L6:
	POP	%ebp
	RET

--------------------------------------------------------------------------------
/stage-4/signal.c:
--------------------------------------------------------------------------------
/* signal.c -- signal handling functions
 *
 * Copyright (C) 2013 Richard Smith
 * All rights reserved.
 */

/* The C library raise(): sends signal SIG to the current process and
 * returns the result of kill(). */
raise( sig ) {
    return kill( getpid(), sig );
}

/* The C library abort(): raises SIGABRT, and if that returns, resets the
 * handler to the default and raises it again.  Does not return. */
abort() {
    raise(6);       /* SIGABRT == 6 */

    /* If we're still here, reinstate the default handler and retry. */
    signal(6, 0);   /* SIG_DFL == 0 */
    raise(6);

    /* This shouldn't be possible. */
    _exit(128 + 6);
}

--------------------------------------------------------------------------------
/stage-4/stdarg.c:
--------------------------------------------------------------------------------
/* stdarg.c
 *
 * Copyright (C) 2005, 2014 Richard Smith
 * All rights reserved.
*/

/* The intention is that the C library macro va_arg can be implemented as:
 *
 *   #define va_arg( ap, type ) \
 *       ( * (type*) __va_arg( &(ap), sizeof(type) ) )
 *
 * Returns the current value of *ap and then advances *ap by size rounded
 * up to a multiple of 4.
 */
__va_arg( ap, size ) {
    auto a = *ap;
    /* Round size to 4 byte alignment */
    size = (size + 3) & ~3;
    *ap += size;
    return a;
}

/* The intention is that the C library macro va_start can be implemented as:
 *
 *   #define va_start( ap, last ) \
 *       ( __va_start( &(ap), &(last), sizeof(last) ) )
 *
 * Points *ap at the last named argument and then steps over it.
 */
__va_start( ap, last, last_size ) {
    *ap = last;
    __va_arg( ap, last_size );
}

--------------------------------------------------------------------------------
/stage-4/stdio.s:
--------------------------------------------------------------------------------
# stdio.s -- bootstrap code for I/O

# Copyright (C) 2012, 2013 Richard Smith
# All rights reserved.

####	#  Function:	void putstr(char* str)
	#
	#  The B library putstr() function.  Writes STR to standard output.
	#  Unlike the C library puts(), no terminating '\n' is added
	#  automatically.  A short or failed write jumps to _error.
putstr:
	PUSH	%ebp
	MOVL	%esp, %ebp

	PUSH	8(%ebp)
	CALL	strlen
	POP	%ecx			# %ecx = str

	PUSH	%eax			# count = strlen(str)
	PUSH	%ecx			# buf = str
	MOVL	$1, %eax		# 1 == STDOUT_FILENO
	PUSH	%eax
	CALL	write
	POP	%ecx
	POP	%ecx
	POP	%ecx			# %ecx = count requested

	CMPL	%eax, %ecx		# did write() report the full count?
	JNE	_error

	POP	%ebp
	RET


####	#  Function:	void putchar(int chr)
	#
	#  The C standard library putchar() function.  Writes the one
	#  character in CHR to standard output.  The B library version
	#  should write multiple characters from CHR (up to four): this
	#  is prohibited by the C standard, and we do not currently do it.
putchar:
	PUSH	%ebp
	MOVL	%esp, %ebp

	MOVL	$1, %eax
	PUSH	%eax			# strlen
	LEA	8(%ebp), %eax		# the argument slot doubles as the buffer
	PUSH	%eax			# &chr
	MOVL	$1, %eax		# 1 == STDOUT_FILENO
	PUSH	%eax
	CALL	write
	POP	%ecx
	POP	%ecx
	POP	%ecx			# %ecx = 1, the count requested

	CMPL	%eax, %ecx		# anything but one byte written is an error
	JNE	_error

	POP	%ebp
	RET

####	#  Function:	void printf(char* fmt, ...);
	#
	#  A very light-weight version of printf, handling just the
	#  %%, %c, %d and %s format specifiers, with no widths or
	#  precisions.  The B library version also supports %o which we
	#  don't do yet.  Any other specifier, or a '%' at the very end
	#  of FMT, jumps to _error.
printf:
	PUSH	%ebp
	MOVL	%esp, %ebp
	PUSH	%ebx
	PUSH	%esi
	PUSH	%edi

	MOVL	8(%ebp), %esi		# %esi is ptr into fmt
	LEA	8(%ebp), %edi		# %edi is ptr to prev va_arg

.L2:	# Loop over the format string
	CMPB	$0, (%esi)
	JE	.L3
	MOVB	(%esi), %al
	CMPB	'%', %al
	JE	.L4

.L6:	# Write a raw character (only the low byte of %eax is output)
	PUSH	%eax
	CALL	putchar
	POP	%eax
.L5:
	INCL	%esi
	JMP	.L2

.L4:
	# We have a format specifier
	INCL	%esi
	CMPB	$0, (%esi)
	JE	_error
	MOVB	(%esi), %al
	CMPB	'%', %al
	JE	.L6			# write a literal '%'

	# Read the next vararg into %eax, putting the format char in %cl
	ADDL	$4, %edi		# read next va_arg
	MOVB	%al, %cl
	MOVL	(%edi), %eax

	# Test for 'c', 's' and 'd' format specifiers, otherwise fail
	CMPB	'c', %cl
	JE	.L6			# write the va_arg character
	CMPB	's', %cl
	JE	.L7
	CMPB	'd', %cl
	JE	.L8
	JMP	_error

.L7:	# Handle the %s format specifier
	PUSH	%eax
	CALL	putstr
	POP	%eax
	JMP	.L5

.L8:	# Handle the %d format specifier.  Special case 0
	TESTL	%eax, %eax
	JNZ	.L9
	MOVB	'0', %al
	JMP	.L6

.L9:	# Do we need a -ve sign?
CMPL	$0, %eax
	JG	.L10
	# Negative: print '-' and continue with the magnitude
	PUSH	%eax
	MOVB	'-', %cl
	PUSH	%ecx
	CALL	putchar
	POP	%ecx
	POP	%eax
	NEGL	%eax

.L10:	# Set up a temporary buffer, as we'll write from right to left
	MOVL	%esp, %ebx
	DECL	%ebx
	SUBL	$16, %esp		# 16 bytes of stack scratch space
	MOVB	$0, (%ebx)		# '\0' terminator
	DECL	%ebx
	MOVL	$10, %ecx

.L11:	# Emit one decimal digit per iteration, least significant first
	XORL	%edx, %edx
	IDIVL	%ecx			# acts on %edx:%eax
	ADDB	'0', %dl		# remainder is in %dl, conv. to char
	MOVB	%dl, (%ebx)
	TESTL	%eax, %eax
	JZ	.L12
	DECL	%ebx
	JMP	.L11

.L12:	# %ebx now points at the most significant digit
	PUSH	%ebx
	CALL	putstr
	POP	%ebx
	ADDL	$16, %esp
	JMP	.L5

.L3:	# Cleanup
	POP	%edi
	POP	%esi
	POP	%ebx
	POP	%ebp
	RET

####	#  Function:	int ungetchar(int c);
	#
	#  A version of the C standard library ungetc() that acts
	#  on standard input.  Up to four characters may be pushed
	#  back; a fifth jumps to _error.  Returns C (as an unsigned
	#  byte), or EOF unchanged if C is EOF.
.data
	.local unget_count
unget_count:
	.byte	0	# How many characters are in the unget slot?
	.local unget_data
unget_data:
	.int	0	# Up to four pushed-back characters, one per byte;
			# the most recently pushed is in the low byte.
.text
ungetchar:
	PUSH	%ebp
	MOVL	%esp, %ebp

	# If c == EOF, we should do nothing and return EOF.
	MOVL	8(%ebp), %eax
	CMPL	$-1, %eax
	JE	.L15

	# Have we space to write another character?
	MOVB	unget_count, %al
	CMPB	$4, %al
	JAE	_error
	INCB	%al
	MOVB	%al, unget_count

	# Write the character to the unget slot.
	# NOTE(review): SHLL with one operand presumably shifts by the
	# count in %cl (8 here) in this assembler's dialect -- confirm.
	MOVL	unget_data, %eax
	MOVB	$8, %cl
	SHLL	%eax
	MOVB	8(%ebp), %al
	MOVL	%eax, unget_data

	# And return the character
	XORL	%eax, %eax
	MOVB	8(%ebp), %al
.L15:
	POP	%ebp
	RET


####	#  Function:	int getchar(void);
	#
	#  The C standard library getchar() function.  Reads one character
	#  from standard input and returns it.
#  If end-of-file occurs, we
	#  return -1: in this respect it differs from the B library
	#  version which returns the ASCII EOT character (0x04, ^D).
	#  Characters pushed back with ungetchar() are returned first,
	#  most recent first.  A read error jumps to _error.
getchar:
	PUSH	%ebp
	MOVL	%esp, %ebp

	# Has a character been ungetc'd?
	MOVB	unget_count, %al
	CMPB	$0, %al
	JE	.L14
	DECB	%al
	MOVB	%al, unget_count

	# Read the character from the unget slot: take the low byte and
	# shift the remaining pushed-back characters down by 8 bits.
	MOVL	unget_data, %eax
	XORL	%edx, %edx
	MOVB	%al, %dl
	MOVB	$8, %cl
	SHRL	%eax
	MOVL	%eax, unget_data
	MOVL	%edx, %eax
	JMP	.L13

.L14:	# Read from OS
	XORL	%eax, %eax
	PUSH	%eax			# A 4-byte buffer: %esp points here
	MOVL	%esp, %ecx

	MOVL	$1, %eax
	PUSH	%eax			# strlen
	PUSH	%ecx			# ptr to buffer
	XORL	%eax, %eax
	PUSH	%eax			# 0 == STDIN_FILENO
	CALL	read
	MOVL	%eax, %edx		# %edx = bytes read (or -1)
	POP	%ecx
	POP	%ecx
	POP	%ecx			# strlen == 1
	POP	%eax			# character read

	CMPL	%edx, %ecx		# Successfully read one byte
	JE	.L13
	CMPL	$0, %edx		# Necessarily indicates end of file
	JNE	_error
	MOVL	$-1, %eax		# -1 == EOF
.L13:
	POP	%ebp
	RET


	# Nothing is buffered by this bootstrap stdio, so flushing is a no-op.
__io_flush:
	RET

	# NOTE(review): `.data:' and `.text:' below carry a trailing colon,
	# so they may be assembled as labels rather than section directives
	# (which would leave stdout/stdin in the current section) -- confirm
	# against the stage-3 assembler.
.data:
stdout:	.int 1	# These are nominally pointers and need to be distinguishable
stdin:	.int 1	# from the NULL pointer returned on error from e.g. freopen.


.text:
####	#  Function:	FILE* freopen( char const* filename, char const* mode,
	#                              FILE* stream );
	#
	#  A minimal freopen, just to get the main() function of the compiler
	#  to work.
freopen:
	# Only the first character of MODE is examined: 'w' redirects
	# file descriptor 1, anything else redirects file descriptor 0.
	# STREAM is not inspected; it is simply returned.  Any failure
	# in open() or dup2() jumps to _error.
	PUSH	%ebp
	MOVL	%esp, %ebp
	PUSH	%ebx

	MOVL	12(%ebp), %eax
	CMPB	'w', (%eax)
	JE	.L18

	# reopen stdin
	XORL	%eax, %eax		# 0 == O_RDONLY
	PUSH	%eax
	PUSH	8(%ebp)
	CALL	open
	ADDL	$8, %esp
	CMPL	$-1, %eax
	JE	_error

	XORL	%ecx, %ecx		# stdin
	JMP	.L19
.L18:
	# reopen stdout
	MOVL	$0644, %eax		# permissions
	PUSH	%eax
	MOVL	$0x241, %eax		# O_WRONLY=1|O_CREAT=0x40|O_TRUNC=0x200
	PUSH	%eax
	PUSH	8(%ebp)
	CALL	open
	ADDL	$12, %esp
	CMPL	$-1, %eax
	JE	_error

	XORL	%ecx, %ecx
	INCL	%ecx			# stdout

.L19:
	# Duplicate the freshly opened descriptor onto fd 0 or 1, then
	# close the original.  The descriptor is still on top of the stack
	# after dup2, so it serves as close()'s argument.
	PUSH	%ecx			# new_fd
	PUSH	%eax			# old_fd
	CALL	dup2
	CMPL	$-1, %eax
	JE	_error
	CALL	close
	POP	%eax
	POP	%eax

	MOVL	16(%ebp), %eax		# return STREAM

	POP	%ebx
	POP	%ebp
	RET


####	#  Function:	int strtol( char const* str, char const **endptr );
	#
	#  Convert STR to an integer.  This is really the standard atoi,
	#  rather than strtol.  As the whole file is discarded after
	#  the bootstrap cc0 is created, this is okay.
strtol:
	# Parses an optional run of '-' signs followed by decimal digits.
	# Returns the value in %eax and, if ENDPTR is not NULL, stores a
	# pointer to the first unparsed character in *ENDPTR.
	PUSH	%ebp
	MOVL	%esp, %ebp
	PUSH	%esi

	XORL	%eax, %eax		# value
	MOVL	8(%ebp), %esi		# ptr

	# Recurse to process a -ve sign.  ENDPTR is passed through so the
	# inner call stores the final end pointer; previously only one
	# argument was pushed, so the inner call wrote through whatever
	# happened to be on the stack above it.
	CMPB	'-', (%esi)
	JNE	.L16
	INCL	%esi
	PUSH	12(%ebp)		# endptr
	PUSH	%esi			# str + 1
	CALL	strtol
	ADDL	$8, %esp
	NEGL	%eax
	JMP	.L20			# *endptr was already set by the inner call
.L16:
	XORL	%ecx, %ecx
	MOVB	(%esi), %cl
	SUBL	'0', %ecx		# %ecx = digit value, if it is a digit
	CMPL	$9, %ecx
	JA	.L17			# unsigned, so every non-digit is > 9

	# value = value * 10 + digit
	PUSH	%ecx
	MOVL	$10, %ecx
	MULL	%ecx
	POP	%ecx
	ADDL	%ecx, %eax

	INCL	%esi
	JMP	.L16
.L17:
	# Store the end pointer, unless the caller passed NULL
	MOVL	12(%ebp), %ecx
	TESTL	%ecx, %ecx
	JZ	.L20
	MOVL	%esi, (%ecx)
.L20:
	POP	%esi
	POP	%ebp
	RET

--------------------------------------------------------------------------------
/stage-4/string.s:
--------------------------------------------------------------------------------
# string.s

# Copyright (C) 2012, 2013 Richard Smith
# All rights reserved.


####	#  Function:	size_t strlen(char* s);
	#
	#  Returns the number of bytes in S before the terminating NUL.
strlen:
	PUSH	%ebp
	MOVL	%esp, %ebp
	PUSH	%edi

	MOVL	8(%ebp), %edi
	XORL	%eax, %eax		# %al == 0 is the byte to search for
	XORL	%ecx, %ecx
	DECL	%ecx			# %ecx = 0xFFFFFFFF: effectively no limit
	REPNE SCASB
	SUBL	8(%ebp), %edi		# %edi stopped one past the NUL
	LEA	-1(%edi), %eax		# DEC %edi; MOVL %edi, %eax

	POP	%edi
	POP	%ebp
	RET


####	#  Function:	size_t strnlen(char* s, size_t maxlen);
	#
	#  Returns the length of S, or MAXLEN if there is no NUL in the
	#  first MAXLEN bytes.  Never examines more than MAXLEN bytes.
strnlen:
	PUSH	%ebp
	MOVL	%esp, %ebp
	PUSH	%edi

	MOVL	8(%ebp), %edi
	XORL	%eax, %eax		# %al == 0 is the byte to search for
	MOVL	12(%ebp), %ecx
	TESTL	%ecx, %ecx
	JZ	.L10			# maxlen == 0: return 0 without reading

	REPNE SCASB			# ZF is set iff a NUL was found
	JNE	.L11			# not found: %edi == s + maxlen
	DECL	%edi			# found: step back onto the NUL
.L11:
	SUBL	8(%ebp), %edi
	MOVL	%edi, %eax
.L10:
	POP	%edi
	POP	%ebp
	RET


####	#  Function:	int strcmp(char const* a, char const* b);
	#
	#  Returns 0 if A and B are equal, -1 if A sorts before B and
	#  1 if A sorts after B (bytes compared as unsigned).
strcmp:
	PUSH	%ebp
	MOVL	%esp, %ebp
	PUSH	%esi
	PUSH	%edi

	# Note the order is chosen so we set CF correctly.
MOVL	12(%ebp), %edi
	MOVL	8(%ebp), %esi
.L1:
	LODSB				# Loads (%esi) to %al
	SCASB				# Compares (%edi) to %al
	JNE	.L2
	CMPB	$0, %al
	JNE	.L1

	# They're equal
	XORL	%eax, %eax
	JMP	.L3
.L2:
	# SCAS internally does SUBL (%edi), %al, so if (%edi) > %al
	# (or b > a), the carry flag will be set.  SBB %eax, %eax
	# is a useful trick for setting %eax to -1 if CF.
	SBBL	%eax, %eax
	ORB	$1, %al			# 0 becomes 1; -1 stays -1
.L3:
	POP	%edi
	POP	%esi
	POP	%ebp
	RET


####	#  Function:	int strncmp(char const* a, char const* b, size_t n);
	#
	#  As strcmp(), but compares at most N bytes.  Returns 0 when
	#  N is zero.
strncmp:
	PUSH	%ebp
	MOVL	%esp, %ebp
	PUSH	%esi
	PUSH	%edi

	MOVL	16(%ebp), %ecx
	# With n == 0 nothing may be compared: without this test the loop
	# below compared the first byte anyway and then decremented the
	# count past zero, making the comparison effectively unbounded.
	TESTL	%ecx, %ecx
	JZ	.L1b

	# Note the order is chosen so we set CF correctly.
	MOVL	12(%ebp), %edi
	MOVL	8(%ebp), %esi
.L1a:
	LODSB				# Loads (%esi) to %al
	SCASB				# Compares (%edi) to %al
	JNE	.L2a
	CMPB	$0, %al
	JE	.L1b
	DECL	%ecx
	JNZ	.L1a
.L1b:
	# They're equal
	XORL	%eax, %eax
	JMP	.L3a
.L2a:
	# SCAS internally does SUBL (%edi), %al, so if (%edi) > %al
	# (or b > a), the carry flag will be set.  SBB %eax, %eax
	# is a useful trick for setting %eax to -1 if CF.
	SBBL	%eax, %eax
	ORB	$1, %al			# 0 becomes 1; -1 stays -1
.L3a:
	POP	%edi
	POP	%esi
	POP	%ebp
	RET


####	#  Function:	int strchr(char const* a, int c);
	#
	#  Returns a pointer to the first byte in A equal to the low
	#  byte of C, or 0 if there is none.  The terminating NUL is
	#  itself searchable.
strchr:
	PUSH	%ebp
	MOVL	%esp, %ebp
	PUSH	%esi

	MOVL	12(%ebp), %ecx
	MOVL	8(%ebp), %esi
.L4:
	LODSB
	CMPB	%cl, %al
	JE	.L5
	CMPB	$0, %al
	JNE	.L4

	# Not found
	XORL	%eax, %eax
	JMP	.L6

.L5:	# Found it.
# Note LODSB will have incremented %esi
	LEA	-1(%esi), %eax		# DEC %esi; MOVL %esi, %eax
.L6:
	POP	%esi
	POP	%ebp
	RET


####	#  Function:	char* strcpy(char* dest, char const* str);
	#
	#  Copies STR, including its terminating NUL, to DEST.
	#  Returns DEST.
strcpy:
	PUSH	%ebp
	MOVL	%esp, %ebp
	PUSH	%esi
	PUSH	%edi

	MOVL	12(%ebp), %esi
	MOVL	8(%ebp), %edi

.L7:	# We cannot use REP MOVSB because that does not check for a
	# terminating null character.
	LODSB
	STOSB
	CMPB	$0, %al			# stop after the NUL has been stored
	JNE	.L7

	MOVL	8(%ebp), %eax

	POP	%edi
	POP	%esi
	POP	%ebp
	RET


####	#  Function:	char* strncpy(char* dest, char const* str, size_t n);
	#
	#  Copies at most N bytes from STR to DEST, stopping after a
	#  NUL has been copied.  Returns DEST.  Unlike the C library
	#  version, the rest of DEST is not padded with NULs.
strncpy:
	PUSH	%ebp
	MOVL	%esp, %ebp
	PUSH	%esi
	PUSH	%edi

	MOVL	16(%ebp), %ecx
	TESTL	%ecx, %ecx		# n == 0: nothing to copy
	JZ	.L9

	MOVL	12(%ebp), %esi
	MOVL	8(%ebp), %edi

.L8:	# We cannot use REP MOVSB because that does not check for a
	# terminating null character.
LODSB
	STOSB
	DECL	%ecx			# count exhausted?
	JZ	.L9
	CMPB	$0, %al			# or was that the terminating NUL?
	JNE	.L8
.L9:
	MOVL	8(%ebp), %eax

	POP	%edi
	POP	%esi
	POP	%ebp
	RET


####	#  Function:	char* memcpy(char* dest, char const* str, size_t n);
	#
	#  Copies N bytes from STR to DEST and returns DEST.  The copy
	#  direction follows the direction flag, which memmove() uses
	#  to copy backwards.
memcpy:
	PUSH	%ebp
	MOVL	%esp, %ebp
	PUSH	%esi
	PUSH	%edi

	MOVL	16(%ebp), %ecx
	MOVL	12(%ebp), %esi
	MOVL	8(%ebp), %edi

	REP MOVSB

	MOVL	8(%ebp), %eax

	POP	%edi
	POP	%esi
	POP	%ebp
	RET


####	#  Function:	void* memset(void* s, int c, size_t n);
	#  Set N bytes of memory pointed to by S to the C byte.
	#  Returns S.
memset:
	PUSH	%ebp
	MOVL	%esp, %ebp
	PUSH	%edi

	MOVL	8(%ebp), %edi
	MOVB	12(%ebp), %al		# only the low byte of C is used
	MOVL	16(%ebp), %ecx
	REP STOSB
	MOVL	8(%ebp), %eax

	POP	%edi
	POP	%ebp
	RET


####	#  __asm_std() and __asm_cld() simply invoke those instructions.
	#  They're used by memmove() to invoke memcpy backwards
__asm_std:
	STD
	RET
__asm_cld:
	CLD
	RET

--------------------------------------------------------------------------------
/stage-4/string2.c:
--------------------------------------------------------------------------------
/* string2.c -- additional, higher-level string handling functions
 *
 * Copyright (C) 2013, 2021 Richard Smith
 * All rights reserved.
 */

/* The BSD extension strlcat(): appends src to dest without letting the
 * result (including its NUL) exceed n bytes.
 * NOTE(review): with n == 0 the terminator index below evaluates to
 * n - 1, i.e. one byte before dest -- confirm callers never pass 0. */
strlcat(dest, src, n) {
    auto l1 = strnlen(dest, n), l2 = strnlen(src, n-l1);
    strncpy( dest + l1, src, l2 );
    lchar(dest, l1 + l2 < n ?
l1 + l2 : n - 1, '\0'); 12 | return dest; 13 | } 14 | 15 | /* The C library strcat() */ 16 | strcat(dest, src) { 17 | auto l1 = strlen(dest), l2 = strlen(src); 18 | strcpy( dest + l1, src ); 19 | lchar(dest, l1 + l2, '\0'); 20 | return dest; 21 | } 22 | 23 | /* The C library strncat() */ 24 | strncat(dest, src, n) { 25 | auto l1 = strlen(dest), l2 = strlen(src); 26 | if (l2 > n) l2 = n; 27 | strcpy( dest + l1, src, l2 ); 28 | lchar(dest, l1 + l2, '\0'); 29 | return dest; 30 | } 31 | 32 | /* The C library memmove() */ 33 | memmove(dest, src, n) { 34 | /* If we're copying to earlier memory, or if the blocks do not overlap, 35 | * then a forwards copy, as done by memcpy, will be fine. */ 36 | if ( dest < src || dest > src + n ) return memcpy(dest, src, n); 37 | 38 | /* Otherwise we set the direction flag (DF), then call memcpy with the 39 | * end pointers (which will then copy backwards), and clear DF. 40 | * We do not clear DF in memcpy because the ABI requires DF always to 41 | * be cleared before library calls. 
*/
    __asm_std();
    memcpy(dest+n-1, src+n-1, n);   /* start at the last byte of each block */
    __asm_cld();
    return dest;
}

/* The C library strdup(): returns a malloc()ed copy of str.
 * NOTE(review): the result of malloc() is used unchecked -- confirm that
 * this library's malloc cannot return 0. */
strdup( str ) {
    auto l = strlen(str);
    auto str2 = malloc(l + 1);
    strcpy( str2, str );
    lchar( str2, l, 0 );
    return str2;
}

/* The C library strspn(): returns the length of the initial part of str
 * made up only of characters found in chars. */
strspn( str, chars ) {
    auto i = 0, c;
    while ( c = rchar(str, i) ) {
        if ( !strchr(chars, c) )
            break;
        ++i;
    }
    return i;
}


/* The C library strcspn(): returns the length of the initial part of str
 * containing no character found in chars. */
strcspn( str, chars ) {
    auto i = 0, c;
    while ( c = rchar(str, i) ) {
        if ( strchr(chars, c) )
            break;
        ++i;
    }
    return i;
}

--------------------------------------------------------------------------------
/stage-4/symtab.s:
--------------------------------------------------------------------------------
# symtab.s -- Code to manipulate the symbol table

# Copyright (C) 2012, 2013 Richard Smith
# All rights reserved.
.data

	.local st_start
st_start:
	.int	0		# first entry in the table
	.local st_end
st_end:
	.int	0		# one past the last entry in use
	.local st_endstore
st_endstore:
	.int	0		# one past the end of the allocated storage
	.local st_scope_id
st_scope_id:
	.int	0		# id of the innermost open scope

# struct entry { char sym[12]; int32_t frame_off; int32_t scope_id;
#                type_t lval; type_t size; };  -- sizeof(entry) == 28

.text

####	#  Function:	void init_symtab();
	#
	#  Initialise the symbol table with room for a single entry;
	#  grow_symtab() enlarges it on demand.
init_symtab:
	PUSH	%ebp
	MOVL	%esp, %ebp
#	MOVL	$1792, %ecx		# 64 * sizeof(entry)
	MOVL	$28, %ecx		# sizeof(entry)
	PUSH	%ecx
	CALL	malloc
	POP	%ecx			# %ecx = the size requested
	MOVL	%eax, st_start
	MOVL	%eax, st_end
	ADDL	%ecx, %eax
	MOVL	%eax, st_endstore
	POP	%ebp
	RET


####	#  Function:	void grow_symtab();
	#
	#  Double the size of the symbol table storage, rebasing
	#  st_end and st_endstore onto the reallocated block.
	.local grow_symtab
grow_symtab:
	PUSH	%ebp
	MOVL	%esp, %ebp

	MOVL	st_start, %eax
	MOVL	%eax, %edx
	MOVL	st_endstore, %eax
	SUBL	%edx, %eax		# current size in bytes
	MOVB	$1, %cl
	SHLL	%eax			# ... doubled

	PUSH	%eax			# new size
	PUSH	%edx			# current start ptr
	CALL	realloc

	# Store new pointers
	MOVL	%eax, st_start

	MOVL	%eax, %edx		# new ptr
	POP	%ecx			# old ptr
	MOVL	st_end, %eax
	SUBL	%ecx, %eax		# offset of st_end within the old block
	ADDL	%edx, %eax
	MOVL	%eax, st_end

	POP	%eax			# new size
	ADDL	%edx, %eax
	MOVL	%eax, st_endstore

	POP	%ebp
	RET


####	#  Function:	void save_sym( char const* name, int32_t frame_off,
	#                                  type_t lval, int32_t sym_size );
	#
	#  Save a local symbol.
save_sym:
	# Appends an entry for NAME at st_end, tagged with the current
	# scope id, growing the table first if it is full.
	# NOTE(review): NAME is copied with an unbounded strcpy into the
	# 12-byte sym field -- presumably the scanner limits identifier
	# length; confirm.
	PUSH	%ebp
	MOVL	%esp, %ebp

	PUSH	8(%ebp)			# src	-4(%ebp)
	MOVL	st_end, %eax
	PUSH	%eax			# dest	-8(%ebp)

	# Check that we're not about to overrun the symbol table,
	MOVL	st_endstore, %eax
	CMPL	%eax, -8(%ebp)
	JL	.L1
	CALL	grow_symtab
	MOVL	st_end, %eax		# the table may have moved
	MOVL	%eax, -8(%ebp)
.L1:
	CALL	strcpy
	POP	%edx			# %edx = the new entry
	MOVL	12(%ebp), %ecx
	MOVL	%ecx, 12(%edx)		# save value
	MOVL	st_scope_id, %eax
	MOVL	%eax, 16(%edx)		# scope id
	MOVL	16(%ebp), %ecx
	MOVL	%ecx, 20(%edx)		# lval flag
	MOVL	20(%ebp), %ecx
	MOVL	%ecx, 24(%edx)		# symbol size
	ADDL	$28, %edx		# sizeof(entry)
	MOVL	%edx, %eax
	MOVL	%eax, st_end

	LEAVE				# also discards the src still on the stack
	RET


####	#  Function:	void new_scope();
	#
	#  Called on parsing '{' or similar to start a new nested scope.
new_scope:
	PUSH	%ebp
	MOVL	%esp, %ebp

	MOVL	st_scope_id, %eax
	INCL	%eax
	MOVL	%eax, st_scope_id

	POP	%ebp
	RET


####	#  Function:	int end_scope();
	#
	#  Called on parsing '}' or similar to remove symbols from the table.
	#  Returns number of bytes that need removing from the stack.
end_scope:
	PUSH	%ebp
	MOVL	%esp, %ebp

	MOVL	st_end, %eax
	MOVL	%eax, %ecx		# %ecx end
	MOVL	st_start, %eax
	SUBL	$28, %eax		# sizeof(entry)
	MOVL	%eax, %edx		# %edx ptr
.L4:
	# Zero %eax in case we jump to the end where %eax is the scope size
	XORL	%eax, %eax

	ADDL	$28, %edx		# sizeof(entry)
	CMPL	%ecx, %edx
	JGE	.L5

	# Skip entries belonging to enclosing scopes
	MOVL	st_scope_id, %eax
	CMPL	%eax, 16(%edx)
	JL	.L4

	# The symbol table is sorted by scope id, so as soon as we find
	# one symbol in the current scope, all later ones must be too.
# First, shrink the table
	MOVL	%edx, %eax
	MOVL	%eax, st_end

	# Then iterate over the remainder adding up the scope size
	MOVL	24(%edx), %eax		# %eax is now scope size
.L7:
	ADDL	$28, %edx		# sizeof(entry)
	CMPL	%ecx, %edx
	JGE	.L5
	ADDL	24(%edx), %eax
	JMP	.L7
.L5:
	# Leave the scope: decrement st_scope_id, preserving the result
	PUSH	%eax			# store frame size
	MOVL	$st_scope_id, %eax
	DECL	(%eax)
	POP	%eax

	POP	%ebp
	RET


####	#  Function:	int lookup_sym(char const* name, int* off);
	#
	#  Return the lvalue flag for the symbol NAME, or 0 if it is
	#  not defined (as we assume an undefined symbol is an external
	#  function, which is not an lvalue).
	#  Also set *OFF to the frame offset stored for the symbol, or
	#  0 if it is not defined (as 0 is not a valid offset because
	#  0(%ebp) is the calling frame's base pointer.)
	#
	#  NOTE(review): the table is searched from st_start, so the
	#  earliest entry with a matching name is returned -- confirm
	#  this is the intended result if a name is redeclared in an
	#  inner scope.
lookup_sym:
	PUSH	%ebp
	MOVL	%esp, %ebp
	PUSH	%edi
	PUSH	%esi

	PUSH	8(%ebp)			# name: stays on the stack for strcmp
	MOVL	st_start, %eax
	MOVL	%eax, %edi
	MOVL	st_end, %eax
	MOVL	%eax, %esi
	SUBL	$28, %edi		# sizeof(entry)
.L2:
	ADDL	$28, %edi		# sizeof(entry)
	XORL	%eax, %eax
	CMPL	%esi, %edi
	JGE	.L3
	PUSH	%edi			# the entry begins with its name
	CALL	strcmp
	POP	%ecx
	TESTL	%eax, %eax
	JNZ	.L2
	MOVL	20(%edi), %eax		# return lv flag
	MOVL	12(%edi), %edx		# frame offset
	JMP	.L6
.L3:
	# Symbol not found -- we assume it's an external function
	# which is not an lvalue (so return 0)
	XORL	%eax, %eax
	XORL	%edx, %edx		# use 0 frame offset for error
.L6:
	# write *off
	MOVL	12(%ebp), %ecx
	MOVL	%edx, (%ecx)

	POP	%ecx			# discard name
	POP	%esi
	POP	%edi
	POP	%ebp
	RET

--------------------------------------------------------------------------------
/stage-4/unistd.s:
--------------------------------------------------------------------------------
# unistd.s -- Linux syscalls

#
# Copyright (C) 2013, 2014 Richard Smith
# All rights reserved.

# Each wrapper below loads its arguments into registers, raises INT $0x80
# and then checks the result: a value in the range -4095..-1 is negated and
# stored in errno, and -1 is returned instead.

.data
####	#  Variable:	int errno;
.globl errno
errno:
	.int	0


.text

####	#  Function:	ssize_t write(int fd, const void *buf, size_t count);
	#
.globl write
write:
	PUSH	%ebp
	MOVL	%esp, %ebp
	PUSH	%ebx

	MOVL	16(%ebp), %edx		# count
	MOVL	12(%ebp), %ecx		# buf
	MOVL	8(%ebp), %ebx		# fd
	MOVL	$4, %eax		# 4 == __NR_write
	INT	$0x80
	CMPL	$-4096, %eax		# -4095 <= %eax < 0 for errno
	JNA	.L1

	NEGL	%eax
	MOVL	%eax, errno
	XORL	%eax, %eax
	DECL	%eax			# return -1
.L1:
	POP	%ebx
	POP	%ebp
	RET


####	#  Function:	ssize_t read(int fd, void *buf, size_t count);
	#
.globl read
read:
	PUSH	%ebp
	MOVL	%esp, %ebp
	PUSH	%ebx

	MOVL	16(%ebp), %edx		# count
	MOVL	12(%ebp), %ecx		# buf
	MOVL	8(%ebp), %ebx		# fd
	MOVL	$3, %eax		# 3 == __NR_read
	INT	$0x80
	CMPL	$-4096, %eax		# -4095 <= %eax < 0 for errno
	JNA	.L2

	NEGL	%eax
	MOVL	%eax, errno
	XORL	%eax, %eax
	DECL	%eax			# return -1
.L2:
	POP	%ebx
	POP	%ebp
	RET


####	#  Function:	void _exit(int status)
	#
	#  Terminate program execution with given status.
.globl _exit
_exit:
	PUSH	%ebp
	MOVL	%esp, %ebp
	MOVL	8(%ebp), %ebx		# status
	MOVL	$1, %eax		# 1 == __NR_exit
	INT	$0x80
	HLT				# not reached if the syscall succeeds


####	#  Function:	int open(char const* filename, int flags, int mode);
	#
.globl open
open:
	PUSH	%ebp
	MOVL	%esp, %ebp
	PUSH	%ebx

	MOVL	16(%ebp), %edx		# mode
	MOVL	12(%ebp), %ecx		# flags
	MOVL	8(%ebp), %ebx		# filename
	MOVL	$5, %eax		# 5 == __NR_open
	INT	$0x80
	CMPL	$-4096, %eax		# -4095 <= %eax < 0 for errno
	JNA	.L3

	NEGL	%eax
	MOVL	%eax, errno
	XORL	%eax, %eax
	DECL	%eax			# return -1
.L3:
	POP	%ebx
	POP	%ebp
	RET


####	#  Function:	int close(int fd);
	#
.globl close
close:
	PUSH	%ebp
	MOVL	%esp, %ebp
	PUSH	%ebx

	MOVL	8(%ebp), %ebx		# fd
	MOVL	$6, %eax		# 6 == __NR_close
	INT	$0x80
	CMPL	$-4096, %eax		# -4095 <= %eax < 0 for errno
	JNA	.L4

	NEGL	%eax
	MOVL	%eax, errno
	XORL	%eax, %eax
	DECL	%eax			# return -1
.L4:
	POP	%ebx
	POP	%ebp
	RET


####	#  Function:	int dup2(int oldfd, int newfd);
	#
.globl dup2
dup2:
	PUSH	%ebp
	MOVL	%esp, %ebp
	PUSH	%ebx

	MOVL	12(%ebp), %ecx		# newfd
	MOVL	8(%ebp), %ebx		# oldfd
	MOVL	$63, %eax		# 63 == __NR_dup2
	INT	$0x80
	CMPL	$-4096, %eax		# -4095 <= %eax < 0 for errno
	JNA	.L5

	NEGL	%eax
	MOVL	%eax, errno
	XORL	%eax, %eax
	DECL	%eax			# return -1
.L5:
	POP	%ebx
	POP	%ebp
	RET


####	#  Function:	void* mmap(void *addr, size_t length, int prot,
	#                          int flags, int fd, off_t offset);
	#
	#  The six arguments are not loaded into registers: the
	#  syscall is given a pointer to them as they lie on the stack.
.globl mmap
mmap:
	PUSH	%ebp
	MOVL	%esp, %ebp
	PUSH	%ebx

	LEA	8(%ebp), %ebx		# address of the argument block
	MOVL	$90, %eax		# 90 == __NR_mmap
	INT	$0x80
	CMPL	$-4096, %eax		# -4095 <= %eax < 0 for errno
	JNA	.L6

	NEGL	%eax
	MOVL	%eax, errno
	XORL	%eax, %eax
DECL	%eax			# return -1

.L6:
	POP	%ebx
	POP	%ebp
	RET


####	#  Function:	void (*signal(int signum, void (*handler)(int)))(int);
	#
.globl signal
signal:
	PUSH	%ebp
	MOVL	%esp, %ebp
	PUSH	%ebx

	MOVL	12(%ebp), %ecx		# handler
	MOVL	8(%ebp), %ebx		# signum
	MOVL	$48, %eax		# 48 == __NR_signal
	INT	$0x80
	CMPL	$-4096, %eax		# -4095 <= %eax < 0 for errno
	JNA	.L7

	NEGL	%eax
	MOVL	%eax, errno
	XORL	%eax, %eax
	DECL	%eax			# return -1
.L7:
	POP	%ebx
	POP	%ebp
	RET


####	#  Function:	int kill( pid_t pid, int sig );
	#
.globl kill
kill:
	PUSH	%ebp
	MOVL	%esp, %ebp
	PUSH	%ebx

	MOVL	12(%ebp), %ecx		# sig
	MOVL	8(%ebp), %ebx		# pid
	MOVL	$37, %eax		# 37 == __NR_kill
	INT	$0x80
	CMPL	$-4096, %eax		# -4095 <= %eax < 0 for errno
	JNA	.L8

	NEGL	%eax
	MOVL	%eax, errno
	XORL	%eax, %eax
	DECL	%eax			# return -1
.L8:
	POP	%ebx
	POP	%ebp
	RET


####	#  Function:	pid_t getpid();
	#
.globl getpid
getpid:
	PUSH	%ebp
	MOVL	%esp, %ebp
	MOVL	$20, %eax		# 20 == __NR_getpid
	INT	$0x80
	# NB the getpid syscall cannot fail.
POP	%ebp
	RET


####	#  Function:	int execve(char* filename, char* argv[], char* envp[]);
	#
.globl execve
execve:
	PUSH	%ebp
	MOVL	%esp, %ebp
	PUSH	%ebx

	MOVL	16(%ebp), %edx		# envp
	MOVL	12(%ebp), %ecx		# argv
	MOVL	8(%ebp), %ebx		# filename
	MOVL	$11, %eax		# 11 == __NR_execve
	INT	$0x80
	CMPL	$-4096, %eax		# -4095 <= %eax < 0 for errno
	JNA	.L9

	NEGL	%eax
	MOVL	%eax, errno
	XORL	%eax, %eax
	DECL	%eax			# return -1
.L9:
	POP	%ebx
	POP	%ebp
	RET


####	#  Function:	int fork();
	#
.globl fork
fork:
	PUSH	%ebp
	MOVL	%esp, %ebp

	MOVL	$2, %eax		# 2 == __NR_fork
	INT	$0x80
	CMPL	$-4096, %eax		# -4095 <= %eax < 0 for errno
	JNA	.L10

	NEGL	%eax
	MOVL	%eax, errno
	XORL	%eax, %eax
	DECL	%eax			# return -1
.L10:
	POP	%ebp
	RET


####	#  Function:	int waitpid(int pid, int* status, int options);
	#
	# The directive below used to read `.globl waitpd', naming a symbol
	# that is never defined.
.globl waitpid
waitpid:
	PUSH	%ebp
	MOVL	%esp, %ebp
	PUSH	%ebx

	MOVL	16(%ebp), %edx		# options
	MOVL	12(%ebp), %ecx		# status
	MOVL	8(%ebp), %ebx		# pid
	MOVL	$7, %eax		# 7 == __NR_waitpid
	INT	$0x80
	CMPL	$-4096, %eax		# -4095 <= %eax < 0 for errno
	JNA	.L11

	NEGL	%eax
	MOVL	%eax, errno
	XORL	%eax, %eax
	DECL	%eax			# return -1
.L11:
	POP	%ebx
	POP	%ebp
	RET


####	#  Function:	int unlink(char* filename);
	#
.globl unlink
unlink:
	PUSH	%ebp
	MOVL	%esp, %ebp
	PUSH	%ebx

	MOVL	8(%ebp), %ebx		# filename
	MOVL	$10, %eax		# 10 == __NR_unlink
	INT	$0x80
	CMPL	$-4096, %eax		# -4095 <= %eax < 0 for errno
	JNA	.L12

	NEGL	%eax
	MOVL	%eax, errno
	XORL	%eax, %eax
	DECL	%eax			# return -1
.L12:
	POP	%ebx
	POP	%ebp
	RET


####	#  Function:	time_t time(time_t *t);
	#
.globl time
time:
PUSH	%ebp
	MOVL	%esp, %ebp
	PUSH	%ebx

	MOVL	8(%ebp), %ebx		# t
	MOVL	$13, %eax		# 13 == __NR_time
	INT	$0x80
	CMPL	$-4096, %eax		# -4095 <= %eax < 0 for errno
	JNA	.L13

	NEGL	%eax
	MOVL	%eax, errno
	XORL	%eax, %eax
	DECL	%eax			# return -1
.L13:
	POP	%ebx
	POP	%ebp
	RET

--------------------------------------------------------------------------------
/stage-5/.gitignore:
--------------------------------------------------------------------------------
.*.swp
*.o
cc
ccx
cpp
cmp
ccx1

--------------------------------------------------------------------------------
/stage-5/Makefile:
--------------------------------------------------------------------------------
# stage-5/Makefile

# Copyright (C) 2013, 2014, 2015, 2020 Richard Smith
# All rights reserved.

SHELL = /bin/sh

RM = /bin/rm
CP = /bin/cp
LN_S = /bin/ln -sf
MAKE = /usr/bin/make

BINDIR = ../bin
LIBDIR = ../lib
INCDIR = ../include
# Tools invoked by bare name (as, ld, cc0) are looked up in $(BINDIR) only.
PATH = $(BINDIR)

all: init ccx cpp cc cmp

# Make sure the shared output directories and the stage-4 compiler exist.
init:
	@test -d $(INCDIR) || $(MAKE) -C .. init
	@test -x $(BINDIR)/cc0 || $(MAKE) -C ../stage-4 install

# cc0 is the compiler symlinked from stage 4.
# ccx1 is this stage's compiler (written in the cc0 language) compiled with cc0.
# ccx is nearly the same code, though with a replacement node.c, compiled
# by itself (by ccx1), and is therefore smaller.
# ccx2 is a test compiler produced by ccx; it should be binary identical to ccx.


# Suppress the default rules
.SUFFIXES:

%.o: %.s
	as $<

%0.s: %.c
	cc0 -S -o $@ $<

# The --compatibility=4 flag enables compatibility with the stage-4 cc.
41 | %1.s: %.c ccx1 42 | ./ccx1 --compatibility=4 -o $@ $< 43 | 44 | # We cannot remove --compatibility=4 on this build, because of the untyped 45 | # use of structs-as-arrays in the code. They should be coded as structs, 46 | # but that's not possible because it would be too difficult to add struct 47 | # support to the stage-4 cc. 48 | %2.s: %.i ccx 49 | ./ccx --compatibility=4 -o $@ $< 50 | 51 | %.i: %.c cpp 52 | ./cpp -Iinclude -o $@ $< 53 | 54 | %.s: %.c ccx 55 | ./ccx -o $@ $< 56 | 57 | # All these files get compiled with --compatibility=4 58 | CCX_OBJS = scanbase.o scanner.o symtab.o expr.o stmt.o type.o \ 59 | codegen.o i386.o main.o cli.o 60 | 61 | # We replace node.o with a new version written using structs (and which 62 | # therefore won't compile using the stage-4 cc), partly as a test of the 63 | # stage-4 cc. 64 | CCX0_OBJS = $(CCX_OBJS) node.o 65 | CCX1_OBJS = $(CCX_OBJS) nodenew.o 66 | 67 | ccx1: $(CCX0_OBJS:%.o=%0.o) 68 | ld -o $@ $(LIBDIR)/crt0.o $(CCX0_OBJS:%.o=%0.o) $(LIBDIR)/libc.o 69 | 70 | ccx: $(CCX1_OBJS:%.o=%1.o) 71 | ld -o $@ $(LIBDIR)/crt0.o $(CCX1_OBJS:%.o=%1.o) $(LIBDIR)/libc.o 72 | 73 | # These files need to compile without preprocessing, and are not given 74 | # --compatibility=4. Note that cli.o, scanbase.o and expr.o are also 75 | # used by the compiler and need also to compile with the stage 4 cc.
76 | CPP_OBJS = scanbase.o cpp.o nodenew.o macros.o pvector.o cli.o expr.o \ 77 | cpptype.o eval.o 78 | 79 | cpp: $(CPP_OBJS) 80 | ld -o $@ $(LIBDIR)/crt0.o $(CPP_OBJS) $(LIBDIR)/libc.o 81 | 82 | CC_OBJS = pvector.o timeconv.o cc.o cli.o 83 | 84 | cc: $(CC_OBJS:%.o=%2.o) 85 | ld -o $@ $(LIBDIR)/crt0.o $(CC_OBJS:%.o=%2.o) $(LIBDIR)/libc.o 86 | 87 | CC = ./cc --with-cpp=./cpp --with-ccx=./ccx -I./include 88 | 89 | cmp: cc cmp.c cli.c 90 | $(CC) -o cmp cmp.c cli.c 91 | 92 | install: init ccx cpp cc cmp 93 | $(CP) ccx cpp cc cmp $(BINDIR) 94 | $(RM) -f $(BINDIR)/cc0 95 | $(CP) -r include/* $(INCDIR) 96 | 97 | .INTERMEDIATE: $(CCX0_OBJS:%.o=%0.o) $(CCX1_OBJS:%.o=%1.o) \ 98 | $(CCX1_OBJS:%.o=%.s) $(CCX1_OBJS) $(CPP_OBJS) $(CC_OBJS) 99 | 100 | clean: 101 | $(RM) -f $(CCX0_OBJS:%.o=%0.s) $(CCX0_OBJS:%.o=%0.o) ccx1 102 | $(RM) -f $(CCX1_OBJS:%.o=%1.s) $(CCX1_OBJS:%.o=%1.o) ccx 103 | $(RM) -f $(CPP_OBJS:%.o=%.s) $(CPP_OBJS) cpp 104 | $(RM) -f $(CCX1_OBJS:%.o=%2.s) $(CCX1_OBJS:%.o=%2.i) 105 | $(RM) -f $(CC_OBJS:%.o=%.i) $(CC_OBJS:%.o=%2.s) $(CC_OBJS:%.o=%2.s) cc 106 | $(RM) -f $(CCX1_OBJS:%.o=%2.o) ccx2 $(CC_OBJS:%.o=%2.o) cc2 107 | $(RM) -f cmp.o cmp.s cmp.i cmp 108 | 109 | check-cmp: ccx2 cc2 cmp 110 | ./cmp ccx2 ccx 111 | ./cmp cc2 cc 112 | ! ./cmp -s cc2 ccx2 113 | $(RM) -f cc2 ccx2 114 | 115 | # Build ccx2 and cc2 with a single command. This is a good test of the 116 | # driver logic. 
117 | ccx2: cc $(CCX1_OBJS:%.o=%.c) 118 | $(CC) -o ccx2 --compatibility=4 $(CCX1_OBJS:%.o=%.c) 119 | 120 | cc2: cc ccx2 $(CC_OBJS:%.o=%.c) 121 | $(CC) -o cc2 --with-ccx=ccx2 $(CC_OBJS:%.o=%.c) 122 | 123 | check: check-cmp 124 | $(MAKE) -r -C cpp-tests $@ 125 | 126 | world: 127 | set -e; for TARGET in clean init all check install; do \ 128 | $(MAKE) $$TARGET; \ 129 | done 130 | -------------------------------------------------------------------------------- /stage-5/README.txt: -------------------------------------------------------------------------------- 1 | BOOTSTRAP STAGE 5 2 | 3 | Stage 5 reimplements the compiler from stage 4 in the B-like language 4 | of stage-4 compiler. This allows a significantly more advanced 5 | implementation, with the result that it supports most of syntax of K&R 6 | C (most notably, a type system) and is more efficient. 7 | 8 | New features in stage 5 compiler: 9 | - for loops 10 | - goto and labelled statements 11 | - switch, case labels and default (implemented inefficiently as a 12 | sequence of if-else statements) 13 | - a type system, including all of C's integer and character types, 14 | pointers, arrays and function pointers 15 | - structs 16 | - member access with the -> and . operators 17 | - sizeof operator 18 | - comma operator 19 | - type casts 20 | - C++-style comments 21 | - typedefs 22 | - #line directives (in the compiler proper) for improved diagnostics 23 | 24 | The compiler is named ccx. 25 | 26 | Usage: ccx [OPTIONS] FILENAME 27 | 28 | Options: 29 | --help Displays the help text 30 | -o FILENAME Specifies the output file name 31 | --compatibility=N Sets compatibility with the stage N tools 32 | 33 | If no -o option is specified, the input file has a .c or .i extension, 34 | the output is the same file name but with a .s extension. 35 | 36 | The --compatibility=4 flag enables compatibility with the stage 4 37 | compiler. 
This will permit arbitrary assignment to implicit int (but 38 | not any variable with a declared type). It will give an error when 39 | subscripting something other than an array of 4-byte objects; most 40 | commonly this triggers with character arrays which had to be manipulated 41 | with lchar and rchar in stage 4. An error is also given when doing 42 | pointer arithmetic on objects that are not single bytes. 43 | 44 | 45 | The new C-like compiler is used to implement a simple preprocessor, cpp, 46 | which is almost entirely compliant with the C90 standard. (The only 47 | known deviations from the standard are that it fails to handle 48 | white-space correctly in stringification, doesn't implement digraphs or 49 | trigraphs, or the ... punctuator, and will not concatenate string 50 | literals.) 51 | 52 | Usage: cpp [OPTIONS] FILENAME 53 | 54 | Options: 55 | --help Displays the help text 56 | -o FILENAME Specifies the output file name (default: stdout) 57 | -I DIRECTORY Appends a directory to the header search path 58 | -D NAME[=VAL] Pre-defines a macro, optionally with a value 59 | --include FILENAME Prefixes the specified file to the input 60 | -P Don't put #line directives in output 61 | 62 | 63 | Finally, a compiler driver called cc has been written to simplify the 64 | use of the four build tools (cpp, ccx, as and ld). 65 | 66 | Usage: cc [OPTIONS] FILES... 
67 | 68 | Options: 69 | --help Displays the help text 70 | -o FILENAME Specifies the output file name 71 | -E Halt after preprocessing, generating .i files 72 | -S Halt after compiling, generating .s files 73 | -c Halt after assembling, generating .o files 74 | -I DIRECTORY Appends a directory to the header search path 75 | -D NAME[=VAL] Pre-defines a macro, optionally with a value 76 | --compatibility=N Sets compatibility with the stage N tools 77 | --nostdlib Do not link against crt0.o and libc.o 78 | --with-cpp=PROGRAM Use the specified program as the preprocessor 79 | --with-ccx=PROGRAM Use the specified program as the compiler 80 | --with-as=PROGRAM Use the specified program as the assembler 81 | --with-ld=PROGRAM Use the specified program as the linker 82 | 83 | Input files are distinguished using their extensions. A .c file is 84 | assumed to be a C file that needs preprocessing; a .i file is assumed 85 | not to require preprocessing; a .s file is assumed to be in assembly; 86 | and a .o file is assumed to be an object file. 87 | 88 | The compiler driver instructs the preprocessor to search the include/ 89 | directory and prepend include=include/rbc_init.h (which currently only 90 | defines the version number in __RBC_INIT). The __DATE__ and __TIME__ 91 | macros are also defined by the driver and passed to the preprocessor via 92 | the command line. 93 | 94 | TODO: 95 | - Errors on duplicate declarations at global scope 96 | - Tentative definitions 97 | - Prototypes 98 | - Unions, bit fields (probably not in this stage?), floats 99 | - n1062 #scopes? 100 | - #pragma once? Or #once and #forget per p0538r0? 101 | - Use temporary file names in driver 102 | -------------------------------------------------------------------------------- /stage-5/cc.c: -------------------------------------------------------------------------------- 1 | /* cc.c -- the C compiler driver 2 | * 3 | * Copyright (C) 2013, 2014 Richard Smith 4 | * All rights reserved.
5 | */ 6 | 7 | /* The Makefile sticks --compatibility=4 on the command line. Remove it. */ 8 | #pragma RBC compatibility 5 9 | 10 | #include 11 | #include "pvector.h" 12 | 13 | static struct pvector *temps, *pp_args, *cc_args, *as_args, *ld_args; 14 | static int last_stage = 4; /* -E = 1, -S = 2, -c = 3, link = 4 */ 15 | static char* o_name = 0; /* The -o option, if any is given. */ 16 | static int nostdlib = 0; /* --nostdlib */ 17 | 18 | static 19 | usage() { 20 | cli_error("Usage: cc [-E | -S | -c] [-o output] [options] files...\n"); 21 | } 22 | 23 | extern char* opt_arg(); 24 | /* Parse the command line: -I and -D go to the preprocessor, -E/-S/-c select the last stage, -o names the output, --compatibility is passed to the compiler, and the --with-* options replace the backend program names. Anything not starting with '-' is skipped here. */ 25 | parse_args(argc, argv) 26 | int argc; 27 | char **argv; 28 | { 29 | int i = 1; 30 | 31 | while ( i < argc ) { 32 | char *arg = argv[i], *arg2; 33 | 34 | if ( arg2 = opt_arg( argv, argc, &i, "-I" ) ) { 35 | pvec_push( pp_args, "-I" ); 36 | pvec_push( pp_args, arg2 ); 37 | } 38 | 39 | else if ( arg2 = opt_arg( argv, argc, &i, "-D" ) ) { 40 | pvec_push( pp_args, "-D" ); 41 | pvec_push( pp_args, arg2 ); 42 | } 43 | 44 | else if ( strcmp( arg, "-E" ) == 0 ) { 45 | if ( last_stage != 4 ) 46 | cli_error("At most one of -E, -S and -c may be used\n"); 47 | last_stage = 1; ++i; 48 | } 49 | 50 | else if ( strcmp( arg, "-S" ) == 0 ) { 51 | if ( last_stage != 4 ) 52 | cli_error("At most one of -E, -S and -c may be used\n"); 53 | last_stage = 2; ++i; 54 | } 55 | 56 | else if ( strcmp( arg, "-c" ) == 0 ) { 57 | if ( last_stage != 4 ) 58 | cli_error("At most one of -E, -S and -c may be used\n"); 59 | last_stage = 3; ++i; 60 | } 61 | 62 | else if ( arg2 = opt_arg( argv, argc, &i, "-o" ) ) { 63 | if ( o_name ) cli_error( 64 | "Multiple output files specified: '%s' and '%s'\n", 65 | o_name, arg2 ); 66 | o_name = arg2; 67 | } 68 | 69 | else if ( arg2 = opt_arg( argv, argc, &i, "--compatibility" ) ) { 70 | pvec_push( cc_args, arg ); /* no ++i here: opt_arg() has already advanced i */ 71 | } 72 | 73 | else if ( strcmp( arg, "--nostdlib" ) == 0 ) { 74 | nostdlib = 1; ++i; 75 | } 76 | 77 | else if ( strcmp( arg, "--help" ) == 0 ) 78 | usage(); 79 | 80 | /*
These options override the default backend programs */ 81 | else if ( arg2 = opt_arg( argv, argc, &i, "--with-cpp" ) ) 82 | pp_args->start[0] = arg2; 83 | else if ( arg2 = opt_arg( argv, argc, &i, "--with-ccx" ) ) 84 | cc_args->start[0] = arg2; 85 | else if ( arg2 = opt_arg( argv, argc, &i, "--with-as" ) ) 86 | as_args->start[0] = arg2; 87 | else if ( arg2 = opt_arg( argv, argc, &i, "--with-ld" ) ) 88 | ld_args->start[0] = arg2; 89 | 90 | else if ( arg[0] == '-' ) 91 | cli_error("Unknown option: %s\n", arg); 92 | 93 | else ++i; 94 | } 95 | } 96 | 97 | /* Invoke the command in ARGS. Return 0 for success or 1 for failure. */ 98 | invoke(args) 99 | struct pvector* args; 100 | { 101 | int pid, status; 102 | if ( ( pid = fork() ) == 0 ) 103 | execve( args->start[0], args->start, 0 ); 104 | else if ( pid == -1 ) { 105 | extern stderr; 106 | fprintf(stderr, "cc: Unable to invoke %s", args->start[0]); 107 | exit(1); 108 | } 109 | 110 | waitpid( pid, &status, 0 ); 111 | 112 | /* WTERMSIG(status) || WEXITSTATUS(status) */ 113 | return (status & 0xff7f) ? 1 : 0; 114 | } 115 | 116 | preprocess(argc, argv) 117 | int argc; 118 | char **argv; 119 | { 120 | int i = 0, fail = 0; 121 | extern char* strdup(); 122 | 123 | while ( ++i < argc && !fail ) { 124 | char *arg = argv[i]; 125 | int l = strlen(arg); 126 | if ( arg[0] != '-' && l > 2 && arg[l-1] == 'c' && arg[l-2] == '.' 
) { 127 | char *oname; 128 | if ( o_name && last_stage == 1 ) oname = strdup(o_name); 129 | else { oname = strdup(arg); oname[l-1] = 'i'; } 130 | 131 | pvec_push( pp_args, "-o" ); 132 | pvec_push( pp_args, oname ); 133 | pvec_push( pp_args, arg ); 134 | fail = invoke( pp_args ); 135 | pvec_pop( pp_args ); 136 | pvec_pop( pp_args ); 137 | pvec_pop( pp_args ); 138 | 139 | if ( last_stage != 1 ) pvec_push( temps, oname ); 140 | else free( oname ); 141 | } 142 | } 143 | 144 | return fail; 145 | } 146 | 147 | compile(argc, argv) 148 | int argc; 149 | char **argv; 150 | { 151 | int i = 0, fail = 0; 152 | extern char* strdup(); 153 | 154 | while ( ++i < argc && !fail ) { 155 | char *arg = argv[i]; 156 | int l = strlen(arg); 157 | if ( arg[0] != '-' && l > 2 && (arg[l-1] == 'c' || arg[l-1] == 'i') 158 | && arg[l-2] == '.' ) { 159 | char *iname = strdup(arg); 160 | iname[l-1] = 'i'; 161 | 162 | if ( o_name && last_stage == 2 ) { 163 | pvec_push( cc_args, "-o" ); 164 | pvec_push( cc_args, o_name ); 165 | } 166 | 167 | pvec_push( cc_args, iname ); 168 | fail = invoke( cc_args ); 169 | pvec_pop( cc_args ); 170 | 171 | if ( o_name && last_stage == 2 ) { 172 | pvec_pop( cc_args ); 173 | pvec_pop( cc_args ); 174 | } 175 | else if ( last_stage != 2 ) { 176 | char *oname = strdup(arg); 177 | oname[l-1] = 's'; 178 | pvec_push( temps, oname ); 179 | } 180 | 181 | free( iname ); 182 | } 183 | } 184 | 185 | return fail; 186 | } 187 | 188 | assemble(argc, argv) 189 | int argc; 190 | char **argv; 191 | { 192 | int i = 0, fail = 0; 193 | extern char* strdup(); 194 | 195 | while ( ++i < argc && !fail ) { 196 | char *arg = argv[i]; 197 | int l = strlen(arg); 198 | if ( arg[0] != '-' && l > 2 && (arg[l-1] == 'c' || arg[l-1] == 'i' 199 | || arg[l-1] == 's') && arg[l-2] == '.' ) { 200 | char *iname = strdup(arg); 201 | iname[l-1] = 's'; 202 | 203 | if ( o_name && last_stage == 3 ) { 204 | /* The stage 3 assembler doesn't support -o, but we'll be 205 | * replacing that soon. 
*/ 206 | pvec_push( as_args, "-o" ); 207 | pvec_push( as_args, o_name ); 208 | } 209 | 210 | pvec_push( as_args, iname ); 211 | fail = invoke( as_args ); 212 | pvec_pop( as_args ); 213 | 214 | if ( o_name && last_stage == 3 ) { 215 | pvec_pop( as_args ); 216 | pvec_pop( as_args ); 217 | } 218 | else if ( last_stage != 3 ) { 219 | char *oname = strdup(arg); 220 | oname[l-1] = 'o'; 221 | pvec_push( temps, oname ); 222 | } 223 | 224 | free( iname ); 225 | } 226 | } 227 | 228 | return fail; 229 | } 230 | 231 | link(argc, argv) 232 | int argc; 233 | char **argv; 234 | { 235 | int i = 0, fail = 0; 236 | struct pvector* free_list = pvec_new(); 237 | char** f; 238 | extern char* strdup(); 239 | 240 | if ( o_name ) { 241 | pvec_push( ld_args, "-o" ); 242 | pvec_push( ld_args, o_name ); 243 | } 244 | /* Ideally the linker would default to producing a.out, but we may still 245 | * be using the stage-3 linker which is very primitive and does not. */ 246 | else { 247 | pvec_push( ld_args, "-o" ); 248 | pvec_push( ld_args, "a.out" ); 249 | } 250 | 251 | if ( !nostdlib ) 252 | pvec_push( ld_args, "../lib/crt0.o" ); 253 | 254 | while ( ++i < argc ) { 255 | char *arg = argv[i]; 256 | int l = strlen(arg); 257 | if ( arg[0] != '-' && l > 2 && (arg[l-1] == 'c' || arg[l-1] == 'i' 258 | || arg[l-1] == 's' || arg[l-1]== 'o') && arg[l-2] == '.' ) { 259 | char *iname = strdup(arg); 260 | iname[l-1] = 'o'; 261 | pvec_push( ld_args, iname ); 262 | pvec_push( free_list, iname ); 263 | } 264 | } 265 | 266 | if ( !nostdlib ) 267 | /* This should be -lc, but the stage 3 linker doesn't accept that. 
*/ 268 | pvec_push( ld_args, "../lib/libc.o" ); 269 | 270 | fail = invoke( ld_args ); 271 | for ( f = free_list->start; f != free_list->end; ++f ) free(*f); 272 | return fail; 273 | } 274 | 275 | static char *months[12] = { 276 | "Jan", "Feb", "Mar", "Apr", "May", "Jun", 277 | "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" 278 | }; 279 | 280 | main(argc, argv) 281 | int argc; 282 | char **argv; 283 | { 284 | char** t; 285 | int res; 286 | time_t now = time(NULL); 287 | struct tm *tm = gmtime(&now); 288 | char buf1[32], buf2[32]; 289 | 290 | temps = pvec_new(); 291 | pp_args = pvec_new(); pvec_push( pp_args, "../bin/cpp" ); 292 | cc_args = pvec_new(); pvec_push( cc_args, "../bin/ccx" ); 293 | as_args = pvec_new(); pvec_push( as_args, "../bin/as" ); 294 | ld_args = pvec_new(); pvec_push( ld_args, "../bin/ld" ); 295 | 296 | /* The purpose of rbc_init.h is so that neither this compiler driver, 297 | * nor the preprocessor, need to be updated when the compiler is updated 298 | * to include new functionality. */ 299 | pvec_push( pp_args, "-I../include" ); 300 | pvec_push( pp_args, "--include=rbc_init.h" ); 301 | 302 | /* Define these standard macros, as they need compiler support. 303 | * Other variables, such as __STDC__ probably belong in . 
*/ 304 | snprintf(buf1, 32, "-D__DATE__=\"%3s %2d %4d\"", 305 | months[tm->tm_mon], tm->tm_mday, tm->tm_year+1900); 306 | snprintf(buf2, 32, "-D__TIME__=\"%02d:%02d:%02d\"", 307 | tm->tm_hour, tm->tm_min, tm->tm_sec); 308 | pvec_push( pp_args, buf1 ); 309 | pvec_push( pp_args, buf2 ); 310 | 311 | parse_args( argc, argv ); 312 | res = preprocess(argc, argv); 313 | if (!res && last_stage > 1) res = compile(argc, argv); 314 | if (!res && last_stage > 2) res = assemble(argc, argv); 315 | if (!res && last_stage > 3) res = link(argc, argv); 316 | 317 | pvec_delete(pp_args); 318 | pvec_delete(cc_args); 319 | pvec_delete(ld_args); 320 | pvec_delete(as_args); 321 | 322 | for ( t = temps->start; t != temps->end; ++t ) { 323 | unlink(*t); 324 | free(*t); 325 | } 326 | pvec_delete(temps); 327 | 328 | return res; 329 | } 330 | -------------------------------------------------------------------------------- /stage-5/cli.c: -------------------------------------------------------------------------------- 1 | /* cli.c -- command line interface utils 2 | * 3 | * Copyright (C) 2013 Richard Smith 4 | * All rights reserved. 5 | */ 6 | 7 | cli_error(fmt) 8 | char *fmt; 9 | { 10 | extern stderr; 11 | vfprintf(stderr, fmt, &fmt); 12 | exit(1); 13 | } 14 | 15 | char* opt_arg(argv, argc, argnptr, argname) 16 | char **argv; 17 | int argc, *argnptr; 18 | char *argname; 19 | { 20 | auto char *arg = argv[*argnptr]; 21 | auto int arglen = strlen(argname); 22 | if ( strncmp( arg, argname, arglen ) == 0 ) { 23 | if ( rchar( arg, arglen ) == 0 ) { 24 | if ( ++*argnptr == argc ) 25 | cli_error("The %s option takes an argument\n", argname); 26 | arg = argv[*argnptr]; 27 | ++*argnptr; 28 | return arg; 29 | } 30 | /* Short arguments (e.g. -X) do not have an '=' before their values. */ 31 | else if ( arglen == 2 ) { 32 | arg += arglen; 33 | ++*argnptr; 34 | return arg; 35 | } 36 | /* Long arguments (e.g. --foo) need an '=' before their values. 
*/ 37 | else if ( rchar( arg, arglen ) == '=' ) { 38 | arg += arglen + 1; 39 | ++*argnptr; 40 | return arg; 41 | } 42 | } 43 | return 0; 44 | } 45 | 46 | 47 | -------------------------------------------------------------------------------- /stage-5/cmp.c: -------------------------------------------------------------------------------- 1 | /* cmp.c -- an implementation of the POSIX cmp(1) utility 2 | * 3 | * Copyright (C) 2015 Richard Smith 4 | * All rights reserved. 5 | */ 6 | 7 | /* The Makefile sticks --compatibility=4 on the command line. Remove it. */ 8 | #pragma RBC compatibility 5 9 | 10 | #include 11 | 12 | usage() { 13 | cli_error("Usage: cmp [-l|-s] file1 file2\n"); 14 | } 15 | 16 | main(argc, argv) 17 | int argc; 18 | char **argv; 19 | { 20 | char *na, *nb; 21 | FILE *a, *b; 22 | int opt_s = 0, opt_l = 0; 23 | int i = 1, bytes = 0, lines = 1, status = 0; 24 | 25 | if (argc < 3) usage(); 26 | 27 | /* The -s option suppresses output. */ 28 | if ( strcmp( argv[i], "-s" ) == 0 ) 29 | opt_s = 1, ++i; 30 | /* The -l option prints a byte-by-byte comparison in a whacky format. */ 31 | else if ( strcmp( argv[i], "-l" ) == 0 ) 32 | opt_l = 1, ++i; 33 | if ( argc - i != 2 ) usage(); /* exactly two file operands must remain, else argv[i] below may be NULL (e.g. "cmp -s file1") */ 34 | na = argv[i++]; 35 | if ( strcmp( na, "-" ) ) { 36 | a = fopen( na, "r" ); 37 | if (!a) cli_error( "cmp: unable to open file '%s'\n", na ); 38 | } 39 | else a = stdin; 40 | 41 | nb = argv[i++]; 42 | if ( strcmp( nb, "-" ) ) { 43 | b = fopen( nb, "r" ); 44 | if (!b) cli_error( "cmp: unable to open file '%s'\n", nb ); 45 | } 46 | else if ( a == stdin ) 47 | cli_error( "cmp: cannot read standard input twice\n" ); 48 | else b = stdin; 49 | 50 | if (i != argc) usage(); 51 | 52 | while (1) { 53 | int ca = fgetc(a), cb = fgetc(b); 54 | ++bytes; 55 | 56 | /* The format of these messages, and whether they go to stdout or 57 | * stderr, is prescribed by POSIX.
*/ 58 | if ( ca == EOF && cb == EOF ) 59 | break; 60 | else if ( ca == EOF || cb == EOF ) { 61 | if (!opt_s) { 62 | fflush(stdout); 63 | fprintf(stderr, "cmp: EOF on %s\n", ca == EOF ? na : nb); 64 | } 65 | status = 1; 66 | break; 67 | } 68 | else if ( ca != cb ) { 69 | status = 1; 70 | if (opt_l) 71 | printf( "%d %o %o\n", bytes, ca, cb ); 72 | else { 73 | if (!opt_s) 74 | printf( "%s %s differ: char %d, line %d\n", 75 | na, nb, bytes, lines ); 76 | break; 77 | } 78 | } 79 | else if ( ca == '\n' ) 80 | ++lines; 81 | } 82 | 83 | return status; 84 | } 85 | -------------------------------------------------------------------------------- /stage-5/cpp-tests/Makefile: -------------------------------------------------------------------------------- 1 | # stage-5/cpp-tests/Makefile 2 | 3 | # Copyright (C) 2015, 2016, 2018, 2020 Richard Smith 4 | # All rights reserved. 5 | 6 | SHELL = /bin/sh 7 | PATH = .. 8 | 9 | MAKE = /usr/bin/make 10 | 11 | all: check 12 | 13 | # We have a proper test suite for the preprocessor 14 | CPP_TESTS = empty nocpp obj builtin fn simple suppress directive hash \ 15 | rescan macros include includemacro 16 | 17 | check: $(CPP_TESTS:%=%.run) 18 | 19 | # Suppress the default rules 20 | .SUFFIXES: 21 | 22 | ../cpp ../cmp: 23 | $(MAKE) -C .. $(@:../%=%) 24 | 25 | # The exit status of a pipeline is the status of the rightmost command. 26 | # That means that if the preprocessor exits with non-zero status (e.g. 27 | # because it aborts or segfaults) after it has written everything we 28 | # expect, the error is ignored and the cmp succeeds. In bash we'd fix 29 | # this with set -o pipefail, but that doesn't exist in more primitive 30 | # shells. The 2>&1 ensures that the error message is diverted to the cmp 31 | # where it breaks the comparison forcing a test failure. 
32 | %.run: %.c %.i ../cpp ../cmp 33 | cpp -P $< 2>&1 | cmp - $(<:%.c=%.i) 34 | 35 | -------------------------------------------------------------------------------- /stage-5/cpp-tests/builtin.c: -------------------------------------------------------------------------------- 1 | /* This test checks that the two built-in macros work. */ 2 | set_location(__FILE__, __LINE__); 3 | -------------------------------------------------------------------------------- /stage-5/cpp-tests/builtin.i: -------------------------------------------------------------------------------- 1 | set_location ( "builtin.c" , 2 ) ; 2 | -------------------------------------------------------------------------------- /stage-5/cpp-tests/directive.c: -------------------------------------------------------------------------------- 1 | /* These are all null directives. */ 2 | # 3 | # 4 | # 5 | \ 6 | # 7 | /* Foo # include */ # 8 | 9 | /* Example from C99 6.10/4. 10 | * 11 | * "the sequence of preprocessing tokens on the second line is not a 12 | * preprocessing directive, because it does not begin with a # at the 13 | * start of translation phase 4, even though it will do so after the 14 | * macro EMPTY has been replaced." */ 15 | 16 | #define EMPTY 17 | EMPTY # include 18 | 19 | 20 | -------------------------------------------------------------------------------- /stage-5/cpp-tests/directive.i: -------------------------------------------------------------------------------- 1 | # include < missing . 
h > 2 | -------------------------------------------------------------------------------- /stage-5/cpp-tests/empty.c: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ras52/bootstrap/9bea23e118d61a63a1fa6de70181706e5aca9f3a/stage-5/cpp-tests/empty.c -------------------------------------------------------------------------------- /stage-5/cpp-tests/empty.i: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ras52/bootstrap/9bea23e118d61a63a1fa6de70181706e5aca9f3a/stage-5/cpp-tests/empty.i -------------------------------------------------------------------------------- /stage-5/cpp-tests/fn.c: -------------------------------------------------------------------------------- 1 | /* This test checks that a very simple function-like macro works. */ 2 | #define nop() 3 | 4 | nop() 5 | -------------------------------------------------------------------------------- /stage-5/cpp-tests/fn.i: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ras52/bootstrap/9bea23e118d61a63a1fa6de70181706e5aca9f3a/stage-5/cpp-tests/fn.i -------------------------------------------------------------------------------- /stage-5/cpp-tests/glue.c: -------------------------------------------------------------------------------- 1 | /* Example from C99 6.10.3.5/6 */ 2 | #define str(s) # s 3 | #define xstr(s) str(s) 4 | #define debug(s, t) printf("x" # s "= %d, x" # t "= %s", \ 5 | x ## s, x ## t) 6 | #define INCFILE(n) vers ## n 7 | #define glue(a, b) a ## b 8 | #define xglue(a, b) glue(a, b) 9 | #define HIGHLOW "hello" 10 | #define LOW LOW ", world" 11 | 12 | debug(1, 2); 13 | fputs(str(strncmp("abc\0d", "abc", '\4') // this goes away 14 | == 0) str(: @\n), s); 15 | include xstr(INCFILE(2).h) 16 | glue(HIGH, LOW); 17 | xglue(HIGH, LOW) 18 | 
-------------------------------------------------------------------------------- /stage-5/cpp-tests/hash.c: -------------------------------------------------------------------------------- 1 | /* Example from C99 6.10.3.3/4 */ 2 | #define hash_hash # ## # 3 | #define mkstr(a) # a 4 | #define in_between(a) mkstr(a) 5 | #define join(c, d) in_between(c hash_hash d) 6 | char p[] = join(x, y); 7 | -------------------------------------------------------------------------------- /stage-5/cpp-tests/hash.i: -------------------------------------------------------------------------------- 1 | char p [ ] = "x ## y" ; 2 | -------------------------------------------------------------------------------- /stage-5/cpp-tests/include.c: -------------------------------------------------------------------------------- 1 | #include "vers2.h" 2 | -------------------------------------------------------------------------------- /stage-5/cpp-tests/include.i: -------------------------------------------------------------------------------- 1 | const int i = 42 ; 2 | -------------------------------------------------------------------------------- /stage-5/cpp-tests/includemacro.c: -------------------------------------------------------------------------------- 1 | /* Example from C99 6.10.2/8 */ 2 | #define VERSION 2 3 | #if VERSION == 1 4 | #define INCFILE "vers1.h" 5 | #elif VERSION == 2 6 | #define INCFILE "vers2.h" 7 | #else 8 | #define INCFILE "versN.h" 9 | #endif 10 | #include INCFILE 11 | -------------------------------------------------------------------------------- /stage-5/cpp-tests/includemacro.i: -------------------------------------------------------------------------------- 1 | const int i = 42 ; 2 | -------------------------------------------------------------------------------- /stage-5/cpp-tests/macros.c: -------------------------------------------------------------------------------- 1 | /* Example from C99 6.10.3.5/5 */ 2 | #define x 3 3 | #define f(a) f(x * (a)) 4 | #undef 
x 5 | #define x 2 6 | #define g f 7 | #define z z[0] 8 | #define h g(~ 9 | #define m(a) a(w) 10 | #define w 0,1 11 | #define t(a) a 12 | #define p() int 13 | #define q(x) x 14 | #define r(x,y) x ## y 15 | #define str(x) # x 16 | 17 | f(y+1) 18 | + f(f(z)) % t(t(g)(0) + t)(1); 19 | g(x+(3,4)-w) | h 5) & m 20 | (f)^m(m); 21 | p() i[q()] = { q(1), r(2,3), r(4,), r(,5), r(,) }; 22 | char c[2][6] = { str(hello), str() }; 23 | -------------------------------------------------------------------------------- /stage-5/cpp-tests/macros.i: -------------------------------------------------------------------------------- 1 | f ( 2 * ( y + 1 ) ) + f ( 2 * ( f ( 2 * ( z [ 0 ] ) ) ) ) % f ( 2 * ( 0 ) ) + t ( 1 ) ; 2 | f ( 2 * ( 2 + ( 3 , 4 ) - 0 , 1 ) ) | f ( 2 * ( ~ 5 ) ) & 3 | f ( 2 * ( 0 , 1 ) ) ^ m ( 0 , 1 ) ; 4 | int i [ ] = { 1 , 23 , 4 , 5 , } ; 5 | char c [ 2 ] [ 6 ] = { "hello" , "" } ; 6 | -------------------------------------------------------------------------------- /stage-5/cpp-tests/nocpp.c: -------------------------------------------------------------------------------- 1 | /* The purpose of this test it to check for memory leaks etc. when the 2 | * preprocessor encounters identifiers that don't expand. */ 3 | extern inf f(); 4 | int i; 5 | -------------------------------------------------------------------------------- /stage-5/cpp-tests/nocpp.i: -------------------------------------------------------------------------------- 1 | extern inf f ( ) ; 2 | int i ; 3 | -------------------------------------------------------------------------------- /stage-5/cpp-tests/obj.c: -------------------------------------------------------------------------------- 1 | /* This test checks that a very simple object-like macro works. 
*/ 2 | #define NULL 0 3 | 4 | NULL 5 | -------------------------------------------------------------------------------- /stage-5/cpp-tests/obj.i: -------------------------------------------------------------------------------- 1 | 0 2 | -------------------------------------------------------------------------------- /stage-5/cpp-tests/rescan.c: -------------------------------------------------------------------------------- 1 | /* This is a slightly non-trivial test of rescanning the result of macro 2 | * expansion. For a long time this case was causing a memory leak. */ 3 | #define f(a) (0+a) 4 | #define g f 5 | #define t(a) a 6 | 7 | t(g); 8 | -------------------------------------------------------------------------------- /stage-5/cpp-tests/rescan.i: -------------------------------------------------------------------------------- 1 | f ; 2 | -------------------------------------------------------------------------------- /stage-5/cpp-tests/simple.c: -------------------------------------------------------------------------------- 1 | #if 1 2 | #define INT int 3 | #define DECLARE(type, name) type name; 4 | #endif 5 | 6 | DECLARE(INT, i) 7 | -------------------------------------------------------------------------------- /stage-5/cpp-tests/simple.i: -------------------------------------------------------------------------------- 1 | int i ; 2 | -------------------------------------------------------------------------------- /stage-5/cpp-tests/suppress.c: -------------------------------------------------------------------------------- 1 | /* This test checks that macros are not expanded if not followed by an 2 | * open parenthesis. 
*/ 3 | #define f() 1234 4 | 5 | (f)(); 6 | -------------------------------------------------------------------------------- /stage-5/cpp-tests/suppress.i: -------------------------------------------------------------------------------- 1 | ( f ) ( ) ; 2 | -------------------------------------------------------------------------------- /stage-5/cpp-tests/vers2.h: -------------------------------------------------------------------------------- 1 | /* Used by include.c */ 2 | const int i = 42; 3 | -------------------------------------------------------------------------------- /stage-5/cpptype.c: -------------------------------------------------------------------------------- 1 | /* cpptype.c -- stub code to avoid the preprocessor knowing about types 2 | * 3 | * Copyright (C) 2014 Richard Smith 4 | * All rights reserved. 5 | */ 6 | 7 | is_typedef( name ) { 8 | return 0; 9 | } 10 | 11 | lookup_type( node ) { 12 | return implct_int(); 13 | } 14 | 15 | is_declared( name ) { 16 | /* For the purpose of preprocessor expression evaluation, all names 17 | * can be used undeclared (c.f. C90 6.8.1, which says they're equal to 0). 18 | * This effectively means all identifiers are declared. */ 19 | return 1; 20 | } 21 | 22 | chk_subscr( node ) { 23 | /* The normal logic for disallowing [] in a integral constant expression 24 | * is based on the impossibility of their being arguments of any type 25 | * for which it would be valid. This logic is in the C version of 26 | * chk_subscr(). In the preprocessor, we give a blanket rejection. */ 27 | error("Subscript operator not permitted in constant expression"); 28 | } 29 | 30 | chk_member( node ) { 31 | /* As per chk_subscr(), the logic for C is rather round-about. */ 32 | error("Member access not permitted in constant expression"); 33 | } 34 | 35 | chk_addr( node ) { 36 | /* As per chk_subscr(), the logic for C is rather round-about. 
*/ 37 | error("Taking addresses not permitted in constant expression"); 38 | } 39 | 40 | chk_deref( node ) { 41 | /* As per chk_subscr(), the logic for C is rather round-about. */ 42 | error("Dereferencing not permitted in constant expression"); 43 | } 44 | 45 | chk_arg( node ) { 46 | error("Function calls not permitted in constant expression"); 47 | } 48 | 49 | /* Operations that are already disallowed by virtue of being in an ICE */ 50 | chk_incdec( node ) {} 51 | chk_call( node ) {} 52 | chk_assign( node ) {} 53 | chk_comma( node ) {} 54 | 55 | /* Operations that are always valid because the only types are int */ 56 | chk_mult( node ) {} 57 | chk_add( node ) {} 58 | chk_shift( node ) {} 59 | chk_cmp( node ) {} 60 | chk_bitop( node ) {} 61 | chk_int( node ) {} 62 | 63 | type_name( node ) { 64 | /* This gets called in sizeof expressions and in casts. The preprocessor 65 | * never sees a sizeof expression because it does not recognise keywords 66 | * and therefore sees sizeof(foo) as a function call. Nor does the 67 | * preprocessor ever see a cast because, for similar reasons, it never 68 | * recognises anything as a decl spec. Thus, in (int)foo, the (int) 69 | * is treated as a primary expression with int evaluating to 0, and 70 | * the 'foo' causes a parser error.
*/ 71 | int_error("Unexpected type name in preprocessor"); 72 | } 73 | 74 | prom_type( type ) { 75 | return type; 76 | } 77 | 78 | /* Static types */ 79 | static s_int = 0; 80 | 81 | init_stypes() { 82 | s_int = new_node('dclt', 3); 83 | set_op( s_int, 0, new_node('int', 0) ); 84 | } 85 | 86 | implct_int() { 87 | return s_int; 88 | } 89 | 90 | fini_stypes() { 91 | free_node(s_int); 92 | } 93 | 94 | size_t_type() { 95 | int_error("Use of size_t in preprocessor"); 96 | } 97 | 98 | type_size() { 99 | int_error("Size of type required in preprocessor"); 100 | } 101 | 102 | is_dclspec() { 103 | return 0; 104 | } 105 | -------------------------------------------------------------------------------- /stage-5/eval.c: -------------------------------------------------------------------------------- 1 | /* eval.c -- evaluate integral constant expressions 2 | * 3 | * Copyright (C) 2014 Richard Smith 4 | * All rights reserved. 5 | */ 6 | 7 | /* If only we had a preprocessor to avoid these duplications ... :-) */ 8 | struct node { 9 | int code; /* character code for the node, e.g. '+' or 'if'. */ 10 | int arity; /* the number of nodes in the ops[] array. */ 11 | struct node* type; /* Always NULL in the preprocessor. */ 12 | struct node* ops[4]; 13 | }; 14 | 15 | struct node *add_ref(); 16 | 17 | static 18 | struct node * 19 | new_num(val) { 20 | struct node *n = new_node('num', 0); 21 | n->type = add_ref( implct_int() ); 22 | n->ops[0] = (struct node*) val; 23 | return n; 24 | } 25 | 26 | struct node * 27 | eval(n) 28 | struct node *n; 29 | { 30 | int c = n->code, a = n->arity; 31 | /* The following are not supported: 32 | * id str unary-* unary-& ++ -- size () [] -> . 
(cast) , = @= 33 | */ 34 | 35 | if ( a == 0 ) { 36 | if ( c == 'num' ) return add_ref(n); 37 | else if ( c == 'chr' ) return new_num( parse_chr( node_str(n) ) ); 38 | else int_error("Unable to evaluate literal '%Mc'", c); 39 | } 40 | 41 | else if ( a == 1 ) { 42 | struct node *arg = eval( n->ops[0] ), *val; 43 | if ( c == '+' ) return arg; 44 | else if ( c == '-' ) val = new_num( -node_ival(arg) ); 45 | else if ( c == '~' ) val = new_num( ~node_ival(arg) ); 46 | else if ( c == '!' ) val = new_num( !node_ival(arg) ); 47 | else int_error("Unable to evaluate unary '%Mc'", c); 48 | free_node(arg); 49 | return val; 50 | } 51 | 52 | else if ( a == 2 ) { 53 | struct node *lhs = eval( n->ops[0] ), *rhs = eval( n->ops[1] ), *val; 54 | if ( c == ',' ) val = add_ref(rhs); 55 | else if ( c == '*' ) val = new_num( node_ival(lhs) * node_ival(rhs) ); 56 | else if ( c == '/' ) val = new_num( node_ival(lhs) / node_ival(rhs) ); 57 | else if ( c == '%' ) val = new_num( node_ival(lhs) % node_ival(rhs) ); 58 | else if ( c == '+' ) val = new_num( node_ival(lhs) + node_ival(rhs) ); 59 | else if ( c == '-' ) val = new_num( node_ival(lhs) - node_ival(rhs) ); 60 | else if ( c == '<<' ) val = new_num( node_ival(lhs) << node_ival(rhs) ); 61 | else if ( c == '>>' ) val = new_num( node_ival(lhs) >> node_ival(rhs) ); 62 | else if ( c == '<' ) val = new_num( node_ival(lhs) < node_ival(rhs) ); 63 | else if ( c == '>' ) val = new_num( node_ival(lhs) > node_ival(rhs) ); 64 | else if ( c == '<=' ) val = new_num( node_ival(lhs) <= node_ival(rhs) ); 65 | else if ( c == '>=' ) val = new_num( node_ival(lhs) >= node_ival(rhs) ); 66 | else if ( c == '==' ) val = new_num( node_ival(lhs) == node_ival(rhs) ); 67 | else if ( c == '!=' ) val = new_num( node_ival(lhs) != node_ival(rhs) ); 68 | else if ( c == '&' ) val = new_num( node_ival(lhs) & node_ival(rhs) ); 69 | else if ( c == '^' ) val = new_num( node_ival(lhs) ^ node_ival(rhs) ); 70 | else if ( c == '|' ) val = new_num( node_ival(lhs) | node_ival(rhs) ); 
71 | /* We don't care about whether these short circuit as it is not 72 | * possible to detect that. */ 73 | else if ( c == '&&' ) val = new_num( node_ival(lhs) && node_ival(rhs) ); 74 | else if ( c == '||' ) val = new_num( node_ival(lhs) || node_ival(rhs) ); 75 | else int_error("Unable to evaluate binry '%Mc'", c); 76 | free_node(rhs); free_node(lhs); 77 | return val; 78 | } 79 | 80 | else if ( c == '?:' ) { 81 | struct node *cond = eval( n->ops[0] ); 82 | int val = node_ival(cond); 83 | free_node(cond); 84 | return eval( n->ops[ val ? 1 : 2 ] ); 85 | } 86 | 87 | else int_error("Unable to evaluate unknown operator '%Mc'", c); 88 | } 89 | -------------------------------------------------------------------------------- /stage-5/i386.c: -------------------------------------------------------------------------------- 1 | /* i386.c -- i386 specific code 2 | * 3 | * Copyright (C) 2013 Richard Smith 4 | * All rights reserved. 5 | */ 6 | 7 | static 8 | sz_suffix(sz) { 9 | extern compat_flag; 10 | if (sz == 4) return 'L'; 11 | else if (sz == 2) return 'W'; 12 | else if (sz == 1) return 'B'; 13 | int_error("Unexpected size operand to instruction: %d", sz); 14 | } 15 | 16 | static 17 | sz_accum(sz) { 18 | if (sz == 4) return "%eax"; 19 | else if (sz == 2) return "%ax"; 20 | else if (sz == 1) return "%al"; 21 | int_error("Unexpected size register requested: %d", sz); 22 | } 23 | 24 | static 25 | sz_aux_reg(sz) { 26 | if (sz == 4) return "%ecx"; 27 | else if (sz == 2) return "%cx"; 28 | else if (sz == 1) return "%cl"; 29 | int_error("Unexpected size register requested: %d", sz); 30 | } 31 | 32 | static 33 | acc_to_aux(stream) { 34 | fputs("\tMOVL\t%eax, %ecx\n", stream); 35 | } 36 | 37 | load_num(stream, num, is_unsgn) { 38 | /* We want to print big unsigned numbers in hex to avoid an error from 39 | * the stage-3 assembler which detects signed overflow on decimals. */ 40 | fprintf(stream, is_unsgn && num & 0x80000000 41 | ? 
"\tMOVL\t$0x%X, %%eax\n" : "\tMOVL\t$%d, %%eax\n", num); 42 | } 43 | 44 | load_chr(stream, chr) { 45 | fprintf(stream, "\tMOVL\t$%s, %%eax\n", chr); 46 | } 47 | 48 | defn_str(stream, str, clabel) { 49 | fprintf(stream, ".data\n.LC%d:\n\t.string %s\n.text\n", clabel, str); 50 | } 51 | 52 | load_str(stream, clabel) { 53 | fprintf(stream, "\tMOVL\t$.LC%d, %%eax\n", clabel); 54 | } 55 | 56 | load_local(stream, offset, need_addr) { 57 | fprintf(stream, "\t%s\t%d(%%ebp), %%eax\n", 58 | need_addr ? "LEA" : "MOVL", offset); 59 | } 60 | 61 | load_symbol(stream, name, need_addr) { 62 | fprintf(stream, "\tMOVL\t%s%s, %%eax\n", need_addr ? "$" : "", name); 63 | } 64 | 65 | save_local(stream, offset, sz) { 66 | fprintf(stream, "\tMOV%c\t%s, %d(%%ebp)\n", 67 | sz_suffix(sz), sz_accum(sz), offset); 68 | } 69 | 70 | asm_push(stream) { 71 | fputs("\tPUSHL\t%eax\n", stream); 72 | } 73 | 74 | arith_neg(stream) { 75 | fputs("\tNEGL\t%eax\n", stream); 76 | } 77 | 78 | bit_not(stream) { 79 | fputs("\tNOTL\t%eax\n", stream); 80 | } 81 | 82 | logic_not(stream, sz) { 83 | auto accum = sz_accum(sz); 84 | fprintf(stream, "\tTEST%c\t%s, %s\n", sz_suffix(sz), accum, accum); 85 | fputs("\tSETZ\t%al\n\tMOVZBL\t%al, %eax\n", stream); 86 | } 87 | 88 | dereference(stream, sz, need_lval) { 89 | if (!need_lval) 90 | fprintf(stream, "\tMOV%c\t(%%eax), %s\n", sz_suffix(sz), 91 | sz_accum(sz)); 92 | } 93 | 94 | static 95 | do_inc(stream, reg, n) { 96 | if (n == 0) 97 | ; 98 | else if (n == 1 || n == -1) 99 | fprintf(stream, "\t%s%c\t(%s)\n", n == 1 ? "INC" : "DEC", 'L', reg); 100 | else 101 | fprintf(stream, "\t%s%c\t$%d, (%s)\n", 102 | n > 0 ? 
"ADD" : "SUB", 'L', abs(n), reg); 103 | } 104 | 105 | increment(stream, sz, n) { 106 | do_inc(stream, "%eax", n); 107 | dereference(stream, sz, 0); 108 | } 109 | 110 | postfix_inc(stream, sz, n) { 111 | acc_to_aux(stream); 112 | dereference(stream, sz, 0); 113 | do_inc(stream, "%ecx", n); 114 | } 115 | 116 | static 117 | start_binop(stream, is_assign, is_sym) { 118 | if (is_sym && !is_assign) fputs("\tPOPL\t%ecx\n", stream); 119 | else { 120 | acc_to_aux(stream); 121 | fputs("\tPOPL\t%eax\n", stream); 122 | } 123 | } 124 | 125 | pop_mult(stream, is_assign, is_unsgn) { 126 | fputs("\tPOPL\t%ecx\n", stream); 127 | fprintf(stream, "\t%s%c\t%s\n", is_unsgn ? "MUL" : "IMUL", 'L', 128 | is_assign ? "(%ecx)" : "%ecx"); 129 | if (is_assign) 130 | fputs("\tMOVL\t%eax, (%ecx)\n\tMOVL\t%ecx, %eax\n", stream); 131 | } 132 | 133 | /* Emit the division shared by pop_div() and pop_mod(): the divisor is moved from the accumulator to %ecx, then the quotient is left in %eax and the remainder in %edx. 134 | * Note that this *only* POPs the first argument if !is_assign. Otherwise doing so is the caller's responsibility. */ 135 | static 136 | common_div(stream, is_assign, is_unsgn) { 137 | acc_to_aux(stream); 138 | if (is_assign) 139 | fputs("\tMOVL\t(%esp), %eax\n\tMOVL\t(%eax), %eax\n", stream); 140 | else 141 | fputs("\tPOPL\t%eax\n", stream); 142 | fputs("\tXORL\t%edx, %edx\n", stream); /* NOTE(review): IDIV divides EDX:EAX, so for signed operands EDX should hold the sign extension of EAX (CDQ/CLTD), not zero -- confirm the assembler supports it before changing. */ 143 | fprintf(stream, "\t%s%c\t%%ecx\n", is_unsgn ? "DIV" : "IDIV", 'L'); 144 | } 145 | 146 | pop_div(stream, is_assign, is_unsgn) { 147 | common_div(stream, is_assign, is_unsgn); 148 | if (is_assign) 149 | fputs("\tPOPL\t%ecx\n\tMOVL\t%eax, (%ecx)\n\tMOVL\t%ecx, %eax\n", stream); 150 | } 151 | 152 | pop_mod(stream, is_assign, is_unsgn) { 153 | common_div(stream, is_assign, is_unsgn); 154 | if (is_assign) 155 | fputs("\tPOPL\t%eax\n\tMOVL\t%edx, (%eax)\n", stream); 156 | else 157 | fputs("\tMOVL\t%edx, %eax\n", stream); 158 | } 159 | 160 | static 161 | pop_shift(stream, mnemonic, is_assign) { 162 | start_binop(stream, is_assign, 0); 163 | fprintf(stream, "\t%s\t%s\n", mnemonic, is_assign ? 
"(%eax)" : "%eax"); 164 | } 165 | 166 | pop_lshift(stream, is_assign) { pop_shift(stream, "SALL", is_assign); } 167 | pop_rshift(stream, is_assign) { pop_shift(stream, "SARL", is_assign); } 168 | 169 | static 170 | pop_rel(stream, sz, cond) { 171 | fprintf( stream, "\tPOPL\t%%ecx\n\tCMP%c\t%s, %s\n", 172 | sz_suffix(sz), sz_accum(sz), sz_aux_reg(sz) ); 173 | fprintf( stream, "\tSET%s\t%%al\n\tMOVZBL\t%%al, %%eax\n", cond ); 174 | } 175 | 176 | pop_gt(stream, sz, is_unsgn) { pop_rel(stream, sz, is_unsgn ? "A" : "G" ); } 177 | pop_lt(stream, sz, is_unsgn) { pop_rel(stream, sz, is_unsgn ? "B" : "L" ); } 178 | pop_ge(stream, sz, is_unsgn) { pop_rel(stream, sz, is_unsgn ? "AE" : "GE"); } 179 | pop_le(stream, sz, is_unsgn) { pop_rel(stream, sz, is_unsgn ? "BE" : "LE"); } 180 | pop_eq(stream, sz) { pop_rel(stream, sz, "E"); } 181 | pop_ne(stream, sz) { pop_rel(stream, sz, "NE"); } 182 | 183 | static 184 | pop_binop(stream, mnemonic, is_assign, is_sym, sz) { 185 | start_binop(stream, is_assign, is_sym); 186 | fprintf(stream, "\t%s%c\t%s, %s\n", mnemonic, sz_suffix(sz), 187 | sz_aux_reg(sz), is_assign ? "(%eax)" : sz_accum(sz) ); 188 | } 189 | 190 | pop_add(stream, is_assign, sz) { 191 | pop_binop(stream, "ADD", is_assign, 1, sz); 192 | } 193 | pop_sub(stream, is_assign, sz) { 194 | pop_binop(stream, "SUB", is_assign, 0, sz); 195 | } 196 | pop_bitand(stream, is_assign, sz) { 197 | pop_binop(stream, "AND", is_assign, 1, sz); 198 | } 199 | pop_bitor(stream, is_assign, sz) { 200 | pop_binop(stream, "OR", is_assign, 1, sz); 201 | } 202 | pop_bitxor(stream, is_assign, sz) { 203 | pop_binop(stream, "XOR", is_assign, 1, sz); 204 | } 205 | 206 | static 207 | ilog2(i) { 208 | auto l = 0; 209 | while (i >>= 1) ++l; 210 | return l; 211 | } 212 | 213 | mem_access(stream, offset, need_addr) { 214 | fprintf(stream, "\t%s\t%d(%%eax), %%eax\n", 215 | need_addr ? 
"LEA" : "MOVL", offset); 216 | } 217 | 218 | scale_elt(stream, elt_size, dir) { 219 | if ( elt_size == 1 ) 220 | ; 221 | else if ( (elt_size & (elt_size-1)) == 0 ) 222 | /* If the size is a power of two, then use SHLL for speed */ 223 | fprintf(stream, "\tMOVB\t$%d, %%cl\n\t%sL\t%%eax\n", ilog2(elt_size), 224 | dir > 0 ? "SHL" : "SHR"); 225 | else { 226 | if (dir < 0) fputs("\tXORL\t%edx, %edx\n", stream); 227 | fprintf(stream, "\tMOVL\t$%d, %%ecx\n\t%sL\t%%ecx\n", elt_size, 228 | dir > 0 ? "MUL" : "DIV"); 229 | } 230 | } 231 | 232 | pop_assign(stream, sz) { 233 | start_binop(stream, 1, 0); 234 | fprintf(stream, "\tMOV%c\t%s, (%%eax)\n", 235 | sz_suffix(sz), sz_aux_reg(sz)); 236 | } 237 | 238 | load_zero(stream) { 239 | fputs("\tXORL\t%eax, %eax\n", stream); 240 | } 241 | 242 | alloc_stack(stream, sz) { 243 | if (sz) 244 | fprintf(stream, "\tSUBL\t$%d, %%esp\n", sz); 245 | } 246 | 247 | clear_stack(stream, sz) { 248 | if (sz) 249 | fprintf(stream, "\tADDL\t$%d, %%esp\n", sz); 250 | } 251 | 252 | asm_call(stream, fn_name, cleanup_sz) { 253 | fprintf(stream, "\tCALL\t%s\n", fn_name); 254 | clear_stack( stream, cleanup_sz ); 255 | } 256 | 257 | call_ptr(stream, cleanup_sz) { 258 | fputs("\tCALL\t*%eax\n", stream); 259 | clear_stack( stream, cleanup_sz ); 260 | } 261 | 262 | static 263 | cond_branch(stream, sz, mnemonic, label_num) { 264 | auto acc = sz_accum(sz); 265 | fprintf(stream, "\tTEST%c\t%s, %s\n\t%s\t.L%d\n", 266 | sz_suffix(sz), acc, acc, mnemonic, label_num); 267 | } 268 | 269 | branch_ifz(stream, sz, label_num) { 270 | cond_branch(stream, sz, "JZ", label_num); 271 | } 272 | 273 | branch_ifnz(stream, sz, label_num) { 274 | cond_branch(stream, sz, "JNZ", label_num); 275 | } 276 | 277 | branch(stream, label_num) { 278 | fprintf(stream, "\tJMP\t.L%d\n", label_num); 279 | } 280 | 281 | branch_eq_n(stream, n, label_num) { 282 | fprintf(stream, "\tCMPL\t$%d, %%eax\n\tJE\t.L%d\n", n, label_num); 283 | } 284 | 285 | emit_label(stream, label_num) { 286 | 
fprintf(stream, ".L%d:\n", label_num); 287 | } 288 | 289 | cast_bool(stream) { 290 | fputs("\tTESTL\t%eax, %eax\n\tSETNZ\t%al\n\tMOVZBL\t%al, %eax\n", stream); 291 | } 292 | 293 | globl_decl(stream, name) { 294 | fprintf(stream, ".globl\t%s\n", name); 295 | } 296 | 297 | local_decl(stream, name) { 298 | fprintf(stream, ".local\t%s\n", name); 299 | } 300 | 301 | prolog(stream, name, frame_sz) { 302 | fprintf(stream, ".text\n%s:\n\tPUSHL\t%%ebp\n\tMOVL\t%%esp, %%ebp\n", name); 303 | alloc_stack(stream, frame_sz); 304 | } 305 | 306 | epilog(stream, frame_sz) { 307 | clear_stack(stream, frame_sz); 308 | fputs("\tLEAVE\n\tRET\n\n", stream); 309 | } 310 | 311 | data_decl(stream, name) { 312 | fprintf(stream, ".data\n%s:\n", name); 313 | } 314 | 315 | int_decl_n(stream, num) { 316 | fprintf(stream, "\t.int\t%d\n", num); 317 | } 318 | 319 | int_decl_s(stream, str) { 320 | fprintf(stream, "\t.int\t%s\n", str); 321 | } 322 | 323 | int_decl_lc(stream, clabel) { 324 | fprintf(stream, "\t.int\t.LC%d\n", clabel); 325 | } 326 | 327 | zero_direct(stream, n) { 328 | fprintf(stream, "\t.zero\t%d\n", n); 329 | } 330 | 331 | promote(stream, is_unsgn, oldsz, newsz) { 332 | if ( oldsz < newsz ) 333 | fprintf( stream, "\tMOV%c%c%c\t%s, %s\n", is_unsgn ? 'Z' : 'S', 334 | sz_suffix(oldsz), sz_suffix(newsz), 335 | sz_accum(oldsz), sz_accum(newsz) ); 336 | } 337 | -------------------------------------------------------------------------------- /stage-5/include/bits/eof.h: -------------------------------------------------------------------------------- 1 | /* -- the EOF value for 2 | * 3 | * Copyright (C) 2005, 2013 Richard Smith 4 | * All rights reserved. 5 | */ 6 | 7 | #ifndef __RBC_BITS_EOF_INCLUDED 8 | #define __RBC_BITS_EOF_INCLUDED 9 | 10 | /* C90 7.9.1 The definition of the EOF macro. 11 | * 12 | * EOF "expands to a negative integral constant expression that is returned 13 | * by several functions to indicate end-of-file, that is, no more input from 14 | * a stream." 
15 | */ 16 | #define EOF (-1) 17 | 18 | #endif 19 | 20 | -------------------------------------------------------------------------------- /stage-5/include/bits/file.h: -------------------------------------------------------------------------------- 1 | /* <bits/file.h> -- define FILE 2 | * 3 | * Copyright (C) 2005, 2015 Richard Smith 4 | * All rights reserved. 5 | */ 6 | 7 | #ifndef __RBC_BITS_FILE_INCLUDED 8 | #define __RBC_BITS_FILE_INCLUDED 9 | 10 | /* C90 7.9.1 The definition of the FILE type. 11 | * 12 | * It can remain an incomplete type. */ 13 | typedef struct __stdio_file_t FILE; 14 | 15 | #endif 16 | -------------------------------------------------------------------------------- /stage-5/include/bits/file_access.h: -------------------------------------------------------------------------------- 1 | /* <bits/file_access.h> -- file access functions for <stdio.h> 2 | * 3 | * Copyright (C) 2005, 2013, 2015 Richard Smith 4 | * All rights reserved. 5 | */ 6 | 7 | #ifndef __RBC_BITS_FILE_ACCESS_INCLUDED 8 | #define __RBC_BITS_FILE_ACCESS_INCLUDED 9 | 10 | #include 11 | #include 12 | 13 | /* The values of these macros match those hardcoded into stage-4/output.c. */ 14 | 15 | /* Values suitable for the mode argument to setvbuf */ 16 | #define _IOFBF 1 /* Fully buffered */ 17 | #define _IOLBF 2 /* Line buffered */ 18 | #define _IONBF 3 /* Unbuffered */ 19 | 20 | /* The size of the buffer required as the buf argument to setbuf. 21 | * Note: This is not (necessarily) the default buffer size.
*/ 22 | #define BUFSIZ 4096 23 | 24 | #if 0 25 | /* C90 7.9.5.1: The fclose function */ 26 | int fclose( FILE* stream ); 27 | 28 | /* C90 7.9.5.2: The fflush function */ 29 | int fflush( FILE* stream ); 30 | 31 | /* C90 7.9.5.3: The fopen function */ 32 | FILE *fopen( char const* filename, char const* mode ); 33 | 34 | /* C90 7.9.5.4: The freopen function */ 35 | FILE *freopen( char const *filename, char const *mode, FILE *stream ); 36 | 37 | /* C90 7.9.5.5: The setbuf function */ 38 | void setbuf( FILE *stream, char *buf ); 39 | 40 | /* C90 7.9.5.6: The setvbuf function */ 41 | int setvbuf( FILE *stream, char *buf, int mode, size_t size ); 42 | 43 | #else 44 | 45 | /* Temporary versions while we don't support prototypes. */ 46 | FILE *freopen(); 47 | FILE *fopen(); 48 | 49 | #endif 50 | 51 | #endif 52 | -------------------------------------------------------------------------------- /stage-5/include/bits/null.h: -------------------------------------------------------------------------------- 1 | /* -- the NULL macro is defined by multiple headers 2 | * 3 | * Copyright (C) 2005, 2013 Richard Smith 4 | * All rights reserved. 5 | */ 6 | 7 | #ifndef __RBC_BITS_NULL_INCLUDED 8 | #define __RBC_BITS_NULL_INCLUDED 9 | 10 | /* C90 7.1.6 Common definitions (part implementation) 11 | * 12 | * "An integral constant expression with the value 0, or such an 13 | * expression cast to type void *, is called a null pointer constant." 14 | * [C90 6.2.2.3] 15 | * 16 | * NULL "expands to an implementation-defined null pointer 17 | * constant." [C90 7.1.6] 18 | * 19 | * It is also defined in , see C90 7.9.1 20 | */ 21 | 22 | /* FIXME There is no void type in stage 5. 
*/ 23 | #define NULL 0 24 | 25 | #endif 26 | -------------------------------------------------------------------------------- /stage-5/include/bits/size_t.h: -------------------------------------------------------------------------------- 1 | /* <bits/size_t.h> -- the size_t type is defined by multiple headers 2 | * 3 | * Copyright (C) 2005, 2013 Richard Smith 4 | * All rights reserved. 5 | */ 6 | 7 | #ifndef __RBC_BITS_SIZE_T_INCLUDED 8 | #define __RBC_BITS_SIZE_T_INCLUDED 9 | 10 | /* C90 7.1.6 Common definitions <stddef.h> (part implementation) 11 | * 12 | * "size_t is the unsigned integral type of the result of the sizeof 13 | * operator." [C90 7.1.6] 14 | * 15 | * It is also defined in <stdio.h>, see C90 7.9.1 16 | */ 17 | typedef unsigned int size_t; 18 | 19 | #endif 20 | -------------------------------------------------------------------------------- /stage-5/include/bits/std_streams.h: -------------------------------------------------------------------------------- 1 | /* <bits/std_streams.h> -- define stdin, stdout & stderr 2 | * 3 | * Copyright (C) 2005, 2015 Richard Smith 4 | * All rights reserved. 5 | */ 6 | 7 | #ifndef __RBC_BITS_STD_STREAMS_INCLUDED 8 | #define __RBC_BITS_STD_STREAMS_INCLUDED 9 | 10 | #include 11 | 12 | /* C90 7.9.1 The definition of the standard stream macros. 13 | * 14 | * They "are expressions of type "pointer to FILE" that point to the FILE 15 | * objects associated, respectively, with the standard error, input, and 16 | * output streams." But we want ELF objects of the same name. 17 | */ 18 | extern FILE *stdin, *stdout, *stderr; 19 | 20 | #define stdin stdin 21 | #define stdout stdout 22 | #define stderr stderr 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /stage-5/include/bits/string.h: -------------------------------------------------------------------------------- 1 | /* <bits/string.h> -- string manipulation functions 2 | * 3 | * Copyright (C) 2005, 2021 Richard Smith 4 | * All rights reserved.
5 | */ 6 | 7 | #ifndef __RBC_BITS_STRING_INCLUDED 8 | #define __RBC_BITS_STRING_INCLUDED 9 | 10 | #if 0 11 | /* C90 7.11.2: Copying functions */ 12 | void* memcpy( void* dest, void const* src, size_t n ); 13 | void* memmove( void* dest, void const* src, size_t n ); 14 | int strcpy( char* dest, char const* str ); 15 | int strncpy( char* dest, char const* str, size_t n ); 16 | 17 | /* C90 7.11.3: Concatenation functions */ 18 | char* strcat( char* dest, char const* src ); 19 | char* strncat( char* dest, char const* src, size_t n ); 20 | 21 | /* C90 7.11.4: Comparison functions */ 22 | /* int memcmp( void const* s1, void const* s2, size_t n ); */ 23 | int strcmp( char const* s1, char const* s2 ); 24 | /* int strcoll( char const* s1, char const* s2 ); */ 25 | int strncmp( char const* s1, char const* s2, size_t n ); 26 | /* int strxfrm( char const* s1, char const* s2, size_t n ); */ 27 | 28 | /* C90 7.11.5: Search functions */ 29 | /* void* memchr( void const* s, int c, size_t n ); */ 30 | char* strchr( char const* s, int c ); 31 | size_t strcspn( char const* str, char const* chrs ); 32 | /* char* strpbrk( char const* str, char const* chrs ); */ 33 | /* char* strrchr( char const* str, int c ); */ 34 | size_t strspn( char const* str, char const* chrs ); 35 | /* size_t strstr( char const* str, char const* substr ); */ 36 | /* size_t strtok( char const* str, char const* chrs ); */ 37 | 38 | /* C90 7.11.6: Miscellaneous functions */ 39 | void* memset( void* s, int c, size_t n ); 40 | /* char* strerror( int errnum ); */ 41 | size_t strlen( char const* s ); 42 | 43 | /* POSIX.1-2001 extensions */ 44 | char *strdup( const char *str ); 45 | /* TODO: memccpy, strtok_r */ 46 | 47 | /* POSIX.1-2008 extensions */ 48 | /* TODO: stpcpy, stpncpy, strndup, strsignal */ 49 | size_t strnlen( char const* s, size_t n ); 50 | 51 | /* Random BSD extension */ 52 | char* strlcat( char* dest, char const* src, size_t n ); 53 | 54 | #else 55 | 56 | /* Temporary versions while we don't support 
prototypes. */ 57 | void* memcpy(); 58 | void* memmove(); 59 | char* strcat(); 60 | char* strncat(); 61 | char* strchr(); 62 | void* memset(); 63 | char* strdup(); 64 | char* strlcat(); 65 | 66 | #endif 67 | 68 | #endif 69 | -------------------------------------------------------------------------------- /stage-5/include/bits/struct_tm.h: -------------------------------------------------------------------------------- 1 | /* <bits/struct_tm.h> -- the definition of struct tm 2 | * 3 | * Copyright (C) 2014 Richard Smith 4 | * All rights reserved. 5 | */ 6 | 7 | #ifndef __RBC_BITS_STRUCT_TM_INCLUDED 8 | #define __RBC_BITS_STRUCT_TM_INCLUDED 9 | 10 | /* struct tm 11 | * 12 | * The C standard requires all the following fields to be present, but 13 | * imposes no order. There seems no advantage to not following the order 14 | * in the standard. Nor do we have a need for additional fields. 15 | */ 16 | struct tm { 17 | int tm_sec; 18 | int tm_min; 19 | int tm_hour; 20 | int tm_mday; 21 | int tm_mon; 22 | int tm_year; 23 | int tm_wday; 24 | int tm_yday; 25 | int tm_isdst; 26 | }; 27 | 28 | #endif 29 | -------------------------------------------------------------------------------- /stage-5/include/bits/time_t.h: -------------------------------------------------------------------------------- 1 | /* <bits/time_t.h> -- the definition of time_t 2 | * 3 | * Copyright (C) 2014 Richard Smith 4 | * All rights reserved. 5 | */ 6 | 7 | #ifndef __RBC_BITS_TIME_T_INCLUDED 8 | #define __RBC_BITS_TIME_T_INCLUDED 9 | 10 | /* time_t 11 | * 12 | * The C standard simply requires this to be arithmetic, and not necessarily 13 | * even signed. The Linux kernel ABI makes it a 32-bit signed type.
14 | */ 15 | typedef int time_t; 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /stage-5/include/errno.h: -------------------------------------------------------------------------------- 1 | /* -- standard C library header for error diagnostics 2 | * 3 | * Copyright (C) 2005, 2008, 2020 Richard Smith 4 | * All rights reserved. 5 | */ 6 | 7 | #ifndef __RBC_ERRNO_INCLUDED 8 | #define __RBC_ERRNO_INCLUDED 9 | 10 | /* C90 7.1.4 defines three macros: EDOM, ERANGE and errno. 11 | * 12 | * Other macro names beginning E[0-9A-Z] are reserved for use by the 13 | * implementation for use as errno values (though obviously EOF conflicts). 14 | * 15 | * See discussion in C N1338 which suggests requiring errno to be a macro. */ 16 | 17 | #define EDOM 33 18 | #define ERANGE 34 19 | 20 | extern int errno; 21 | 22 | #define errno errno 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /stage-5/include/rbc_init.h: -------------------------------------------------------------------------------- 1 | /* rbc_init.h -- definitions read at the start of every compilation 2 | * 3 | * Copyright (C) 2013, 2015 Richard Smith 4 | * All rights reserved. 5 | */ 6 | 7 | /* The primary purpose of this file was so that neither the compiler 8 | * driver, nor the preprocessor, need to be updated when the compiler 9 | * is updated to include new functionality. */ 10 | 11 | /* The current compiler version. Note version numbers are stages, 12 | * so 4 was the first version, and this is 5. */ 13 | #define __RBC_VERSION 5 14 | 15 | /* To allow headers in this stage to be used later, it's convenient to 16 | * allow some extra keywords to appear in them. 
*/ 17 | #define void int 18 | #define const 19 | -------------------------------------------------------------------------------- /stage-5/include/stdio.h: -------------------------------------------------------------------------------- 1 | /* -- standard C library header for input/output 2 | * 3 | * Copyright (C) 2005, 2008, 2015 Richard Smith 4 | * All rights reserved. 5 | */ 6 | 7 | #ifndef __RBC_STDIO_INCLUDED 8 | #define __RBC_STDIO_INCLUDED 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /stage-5/include/string.h: -------------------------------------------------------------------------------- 1 | /* -- standard C library header for strings 2 | * 3 | * Copyright (C) 2005, 2021 Richard Smith 4 | * All rights reserved. 5 | */ 6 | 7 | #ifndef __RBC_STRING_INCLUDED 8 | #define __RBC_STRING_INCLUDED 9 | 10 | #include 11 | #include 12 | #include 13 | 14 | #endif 15 | -------------------------------------------------------------------------------- /stage-5/include/time.h: -------------------------------------------------------------------------------- 1 | /* -- standard C library header for date and time handling 2 | * 3 | * Copyright (C) 2014 Richard Smith 4 | * All rights reserved. 5 | */ 6 | 7 | #ifndef __RBC_TIME_INCLUDED 8 | #define __RBC_TIME_INCLUDED 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | /* TODO: define clock_t and CLOCKS_PER_SEC */ 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /stage-5/main.c: -------------------------------------------------------------------------------- 1 | /* main.c -- code to parse command line and initialise the compiler 2 | * 3 | * Copyright (C) 2013 Richard Smith 4 | * All rights reserved. 5 | */ 6 | 7 | /* Is the --compatibility=4 flag given? 
*/ 8 | compat_flag = 0; 9 | 10 | static 11 | compile(output) { 12 | auto node; 13 | while ( peek_token() && (node = top_level()) ) { 14 | codegen( output, node ); 15 | free_node( node ); 16 | } 17 | } 18 | 19 | usage() { 20 | cli_error("Usage: ccx [--compatibility=N] [-o filename.s] filename.c\n"); 21 | } 22 | 23 | main(argc, argv) 24 | int argc; 25 | char **argv; 26 | { 27 | extern char* strdup(); 28 | extern struct FILE* fopen(); 29 | extern char* opt_arg(); 30 | 31 | auto char *filename = 0, *outname = 0; 32 | auto int l, i = 1, freeout = 0; 33 | auto struct FILE* file; 34 | 35 | while ( i < argc ) { 36 | auto char *arg = argv[i], *arg2; 37 | 38 | if ( arg2 = opt_arg( argv, argc, &i, "-o" ) ) { 39 | if ( outname ) cli_error( 40 | "Multiple output files specified: '%s' and '%s'\n", 41 | outname, arg2 ); 42 | outname = arg2; 43 | } 44 | 45 | else if ( strcmp( arg, "--help" ) == 0 ) 46 | usage(); 47 | 48 | else if ( arg2 = opt_arg( argv, argc, &i, "--compatibility" ) ) { 49 | if ( strcmp( arg2, "4" ) == 0 ) 50 | compat_flag = 1; 51 | else if ( strcmp( arg2, "5" ) != 0 ) 52 | cli_error("Compatibility with stage %s not supported\n", arg2); 53 | } 54 | 55 | else if ( rchar(argv[i], 0) == '-' ) 56 | cli_error("ccx: unknown option: %s\n", argv[i]); 57 | 58 | else { 59 | if ( filename ) cli_error( 60 | "ccx: multiple input files specified: '%s' and '%s'\n", 61 | filename, argv[i]); 62 | filename = argv[i]; 63 | ++i; 64 | } 65 | } 66 | 67 | if ( !filename ) 68 | cli_error("ccx: no input file specified\n"); 69 | 70 | init_stypes(); 71 | init_symtab(); 72 | init_scan(filename, 0); 73 | 74 | if (!outname) { 75 | /* We allow .c or .i filenames: .i is used for preprocessed source. A name shorter than two characters cannot carry an extension, and is rejected before indexing in front of the string. */ 76 | l = strlen(filename); 77 | if ( l < 2 || rchar( filename, l-1 ) != 'c' && rchar( filename, l-1 ) != 'i' 78 | || rchar( filename, l-2 ) != '.' 
) 79 | cli_error("ccx: input filename must have .c or .i extension\n"); 80 | 81 | outname = strdup( filename ); 82 | freeout = 1; 83 | lchar( outname, l-1, 's' ); 84 | } 85 | 86 | file = fopen( outname, "w" ); 87 | if (!file) cli_error( "ccx: unable to open file '%s'\n", outname ); 88 | if (freeout) free( outname ); 89 | 90 | compile( file ); 91 | 92 | fclose( file ); 93 | close_scan(); 94 | 95 | fini_symtab(); 96 | fini_stypes(); 97 | rc_done(); 98 | return 0; 99 | } 100 | -------------------------------------------------------------------------------- /stage-5/node.c: -------------------------------------------------------------------------------- 1 | /* node.c -- low-level code for manipulating AST nodes 2 | * 3 | * Copyright (C) 2013, 2014 Richard Smith 4 | * All rights reserved. 5 | */ 6 | 7 | static 8 | rc_count = 0; 9 | 10 | /* Allocate SZ bytes of memory, adding a reference-counted header. */ 11 | static 12 | rc_alloc(sz) { 13 | auto ptr = malloc(8 + sz); 14 | ++rc_count; 15 | ptr[0] = 1; /* the reference count */ 16 | ptr[1] = sz; /* the capacity */ 17 | return &ptr[2]; 18 | } 19 | 20 | /* Unconditionally unallocate PTR which is memory allocated by rc_alloc. */ 21 | static 22 | rc_free(ptr) { 23 | --rc_count; 24 | free( &ptr[-2] ); 25 | } 26 | 27 | /* Diagnostic routine to check that all nodes have been unallocated. */ 28 | rc_done() { 29 | if (rc_count) { 30 | extern stderr; 31 | fprintf(stderr, "Internal error: program leaked %d objects\n", 32 | rc_count); 33 | } 34 | } 35 | 36 | /* Wrapper around realloc to work with pointers returned by rc_alloc. */ 37 | static 38 | rc_realloc(old_ptr, sz) { 39 | auto new_ptr; 40 | old_ptr = &old_ptr[-2]; 41 | 42 | /* We cannot currently handle reallocating if there are multiple copies. 43 | * What should it do? If the address changes, we need to update all 44 | * the references, but we cannot do that. So we'd have to create a unique 45 | * clone first, but then it's not really shared. Best to prohibit it. 
*/ 46 | if ( *old_ptr > 1 ) 47 | abort(); 48 | 49 | new_ptr = realloc( old_ptr, sz + 8 ); 50 | new_ptr[1] = sz; 51 | return &new_ptr[2]; 52 | } 53 | 54 | /* Increment the reference count on a pointer */ 55 | add_ref(ptr) { 56 | ++ptr[-2]; 57 | return ptr; 58 | } 59 | 60 | /* Allocate a new node of type TYPE. */ 61 | new_node(type, arity) { 62 | /* struct node { int type; int nops; node* type; node* op[4]; } 63 | * 64 | * For binary operators, op[0] is the lhs and op[1] the rhs; for unary 65 | * prefix operators, only op[0] is used; and for unary postfix only 66 | * op[1] is used. 67 | * 68 | * The scanner never reads a ternary operator (because ?: has two separate 69 | * lexical elements), but we generate '?:' nodes in the expression parser 70 | * and want a uniform interface. Similarly, the 'for' node is a quaternary 71 | * "operator" (init, test, incr, stmt). */ 72 | auto n = rc_alloc(28); 73 | 74 | n[0] = type; n[1] = arity; 75 | 76 | /* The type and payload (operands) will get filled in by the parser */ 77 | memset( &n[2], 0, 20 ); 78 | 79 | return n; 80 | } 81 | 82 | /* Unallocate a node, NODE, created by new_node(). */ 83 | free_node(node) { 84 | auto i = 0; 85 | 86 | if (!node) return; 87 | /* Trap for double delete */ 88 | else if (node[-2] == 0) abort(); 89 | /* If the reference count doesn't drop to zero, do nothing. */ 90 | else if (--node[-2]) return; 91 | 92 | free_node( node[2] ); 93 | 94 | while ( i < node[1] ) 95 | free_node( node[ 3 + i++ ] ); 96 | 97 | rc_free(node); 98 | } 99 | 100 | /* If SIZE is equal to the capacity of NODE, then reallocate it with twice 101 | * capacity, and return the new node. */ 102 | static 103 | grow_node(node, size) { 104 | /* 12 is the size of the node before the payload. 
*/ 105 | 106 | if ( size + 12 == node[-1] ) { 107 | size *= 2; 108 | return rc_realloc( node, size + 12 ); 109 | } 110 | 111 | return node; 112 | } 113 | 114 | /* Append node N to the vector node V, growing the vector if necessary, 115 | * and returning the (possibly reallocated) vector. */ 116 | vnode_app( v, n ) { 117 | v = grow_node(v, v[1] * 4); 118 | v[ 3 + v[1]++ ] = n; 119 | return v; 120 | } 121 | 122 | /* Returns a pointer to the string payload of a node */ 123 | node_str(node) { 124 | return &node[3]; 125 | } 126 | 127 | /* Returns the node type. This only exists to abstract the difference 128 | * between t[0] (in stage-4) and t->code (in stage-5). */ 129 | node_code(node) { 130 | return node[0]; 131 | } 132 | 133 | node_arity(node) { 134 | return node[1]; 135 | } 136 | 137 | node_type(node) { 138 | return node[2]; 139 | } 140 | 141 | node_op(node, n) { 142 | return node[3+n]; 143 | } 144 | 145 | set_code(node, code) { 146 | node[0] = code; 147 | } 148 | 149 | set_arity(node, arity) { 150 | node[1] = arity; 151 | } 152 | 153 | set_type(node, type) { 154 | node[2] = type; 155 | } 156 | 157 | set_op(node, n, op) { 158 | node[3+n] = op; 159 | } 160 | 161 | /* Append character CHR to the payload of the node *NODE_PTR which is treated 162 | * as a string with current length *LEN_PTR. The value of *LEN_PTR is 163 | * incremented. The node may be reallocated. */ 164 | node_lchar( node_ptr, len_ptr, chr ) 165 | int *len_ptr; 166 | { 167 | auto node = *node_ptr; 168 | node = grow_node( node, *len_ptr ); 169 | lchar( node_str(node), (*len_ptr)++, chr ); 170 | *node_ptr = node; 171 | } 172 | 173 | /* Push-back facility doesn't really belong here, but having to keep 174 | * compatibility with the stage-5 cc without --compatibility=4 is 175 | * painful. 
176 | * 177 | * This is a struct pb_slot { struct node* token; struct pb_slot* next } *; */ 178 | static pb_stack = 0; 179 | 180 | pb_empty() { 181 | return !pb_stack; 182 | } 183 | 184 | pb_pop() { 185 | auto ret = 0; 186 | if ( pb_stack ) { 187 | auto old = pb_stack; 188 | ret = pb_stack[0]; 189 | pb_stack = pb_stack[1]; 190 | free(old); 191 | } 192 | return ret; 193 | } 194 | 195 | pb_push(token) { 196 | auto p = malloc(8); 197 | p[1] = pb_stack; 198 | pb_stack = p; 199 | pb_stack[0] = token; 200 | } 201 | 202 | /* Allocate a string node and set its payload to STR */ 203 | struct node* 204 | new_strnode(code, str) 205 | char* str; 206 | { 207 | auto sz = strlen(str) + 1; 208 | auto node = rc_alloc( 12 + sz ); 209 | set_code( node, code ); 210 | set_arity( node, 0 ); 211 | set_type( node, 0 ); 212 | strcpy( node_str(node), str, sz ); 213 | return node; 214 | } 215 | 216 | 217 | -------------------------------------------------------------------------------- /stage-5/nodenew.c: -------------------------------------------------------------------------------- 1 | /* nodenew.c -- node.c rewritten to use structs 2 | * 3 | * Copyright (C) 2013, 2014, 2015, 2016, 2018 4 | * Richard Smith 5 | * All rights reserved. 6 | */ 7 | 8 | /* This disables the errors on incompatibilities with stage-4. 9 | * This is safe because this file is never processed with the stage-4 10 | * compiler (and, indeed, wouldn't compile if were). */ 11 | #pragma RBC compatibility 5 12 | 13 | static 14 | rc_count = 0; 15 | 16 | static 17 | int (*debug_fn)() = 0; 18 | 19 | dbg_nodes(fn) 20 | int (*fn)(); 21 | { 22 | debug_fn = fn; 23 | } 24 | 25 | struct rc_node { 26 | int ref_count, capacity; 27 | }; 28 | 29 | /* Allocate SZ bytes of memory, adding a reference-counted header. 
*/ 30 | static 31 | rc_alloc(sz) { 32 | struct rc_node* ptr = malloc( sizeof(struct rc_node) + sz ); 33 | ++rc_count; 34 | ptr->ref_count = 1; 35 | ptr->capacity = sz; 36 | return ptr + 1; 37 | } 38 | 39 | /* Unconditionally unallocate PTR which is memory allocated by rc_alloc. */ 40 | static 41 | rc_free(ptr) { 42 | --rc_count; 43 | free( ptr - sizeof(struct rc_node) ); 44 | } 45 | 46 | /* Diagnostic routine to check that all nodes have been unallocated. */ 47 | rc_done() { 48 | if (rc_count) 49 | int_error("Internal error: program leaked %d objects\n", rc_count); 50 | } 51 | 52 | /* Wrapper around realloc to work with pointers returned by rc_alloc. */ 53 | static 54 | rc_realloc(ptr, sz) { 55 | struct rc_node *old_ptr, *new_ptr; 56 | 57 | if ( !ptr ) 58 | return rc_alloc(sz); 59 | 60 | old_ptr = (struct rc_node*)( (unsigned char*)ptr - sizeof(struct rc_node) ); 61 | 62 | /* We cannot currently handle reallocating if there are multiple copies. 63 | * What should it do? If the address changes, we need to update all 64 | * the references, but we cannot do that. So we'd have to create a unique 65 | * clone first, but then it's not really shared. Best to prohibit it. */ 66 | if ( old_ptr->ref_count != 1 ) 67 | int_error("Attempt to reallocate a shared ref-counted object"); 68 | 69 | new_ptr = (struct rc_node*) realloc( old_ptr, sizeof(struct rc_node) + sz ); 70 | new_ptr->ref_count = 1; 71 | new_ptr->capacity = sz; 72 | return new_ptr + 1; 73 | } 74 | 75 | /* Increment the reference count on a pointer */ 76 | add_ref(ptr) { 77 | struct rc_node* n = ptr - sizeof(struct rc_node); 78 | ++n->ref_count; 79 | return ptr; 80 | } 81 | 82 | struct node { 83 | int code; /* character code for the node, e.g. '+' or 'if'. */ 84 | int arity; /* the number of nodes in the ops[] array. */ 85 | struct node* type; /* a node representing the type of the node. 
*/ 86 | 87 | /* For binary operators, ops[0] is the lhs and ops[1] the rhs; for unary 88 | * prefix operators, only ops[0] is used; and for unary postfix only 89 | * ops[1] is used. 90 | * 91 | * The scanner never reads a ternary operator (because ?: has two separate 92 | * lexical elements), but we generate '?:' nodes in the expression parser 93 | * and want a uniform interface. Similarly, the 'for' node is a quaternary 94 | * "operator" (init, test, incr, stmt). 95 | * 96 | * Because of the way we store other data, but particularly strings, in 97 | * nodes, this is really a: 98 | * 99 | * union { 100 | * struct node* op[4]; 101 | * int ivals[4]; 102 | * char str[]; 103 | * }; 104 | * 105 | * The ARITY field is zero when the node contains a string so as to 106 | * prevent node_free from treating the string as a node pointer and 107 | * freeing it. There are some other instances where integers are 108 | * stored in the node. E.g. in a 'macp' node (used to represent the 109 | * occurrence of a macro parameter in the replacement list of a 110 | * function-like macro) has ARITY=0 and nothing in the OP[] array, 111 | * but uses IVAL[0] to represent the parameter number. */ 112 | struct node* ops[4]; 113 | }; 114 | 115 | /* Allocate a new node of code CODE, and arity ARITY. */ 116 | new_node(code, arity) { 117 | struct node* n = rc_alloc( sizeof(struct node) ); 118 | memset( n, 0, sizeof(struct node) ); 119 | 120 | /* The type and payload (operands) will get filled in by the parser, 121 | * but they were safely zeroed by the above call to memset(). */ 122 | n->code = code; 123 | n->arity = arity; 124 | 125 | if (debug_fn) debug_fn(n, "new_node"); 126 | 127 | return n; 128 | } 129 | 130 | /* Checks that NODE and its children still have non-zero a ref count. 
*/ 131 | check_node(node) 132 | struct node* node; 133 | { 134 | if (node) { 135 | struct rc_node* rc = (unsigned char*)node - sizeof(struct rc_node); 136 | int i; 137 | 138 | if ( rc->ref_count <= 0 || rc->ref_count > 1000 ) 139 | int_error( "Use of node type '%Mc' at 0x%x with %d ref-count\n", 140 | node->code, node, rc->ref_count ); 141 | 142 | for ( i = 0; i < node->arity; ++i ) 143 | check_node( node->ops[i] ); 144 | 145 | if (node->code & 0x80808080) 146 | int_error( "Invalid node code '%Mc' at 0x%x with %d ref-count\n", 147 | node->code, node, rc->ref_count ); 148 | } 149 | 150 | return node; 151 | } 152 | 153 | /* Unallocate a node, NODE, created by new_node(). */ 154 | free_node(node) 155 | struct node* node; 156 | { 157 | if (node) { 158 | struct rc_node* rc = (unsigned char*)node - sizeof(struct rc_node); 159 | 160 | /* Trap for double delete */ 161 | if ( rc->ref_count == 0 ) 162 | int_error( "Double delete of node type '%Mc' at 0x%x\n", 163 | node->code, node ); 164 | 165 | /* Only delete if the reference count drops to zero. */ 166 | if ( --rc->ref_count == 0 ) { 167 | /* This is the latest we can call debug_fn as we're about to free 168 | * the children, and the function will probably access them. */ 169 | if (debug_fn) debug_fn(node, "free_node"); 170 | 171 | int i; 172 | for ( i = 0; i < node->arity; ++i ) 173 | free_node( node->ops[i] ); 174 | 175 | free_node( node->type ); 176 | 177 | rc_free(node); 178 | } 179 | } 180 | } 181 | 182 | /* Expand, if necessary, the storage of NODE. SIZE is the current size 183 | * (in bytes) of the node, and EXTRA is the additional space required. 184 | * If SIZE + EXTRA is greater than the capacity (in bytes) of NODE, then 185 | * reallocate it with twice (or, if necessary, more) capacity, and return 186 | * the new node. It does not increment the arity of the node. More 187 | * friendly interfaces are provided by vnode_app() and node_lchar(). 
*/ 188 | static 189 | struct node* 190 | grow_node(node, size, extra) 191 | struct node* node; 192 | { 193 | struct rc_node* rc 194 | = node ? (unsigned char*)node - sizeof(struct rc_node) : 0; 195 | 196 | /* This is the size of the node before the ops[] payload. */ 197 | int overhead = sizeof(struct node) - sizeof(struct node *[4]); 198 | 199 | if ( !rc || size + extra + overhead > rc->capacity ) { 200 | struct node *new; 201 | size += (extra <= size ? size : extra); 202 | 203 | if (debug_fn && node) debug_fn(node, "grow_node [free]"); 204 | 205 | new = (struct node *)rc_realloc( node, size + overhead ); 206 | 207 | /* Initialise if it's a new node */ 208 | if (!node) { 209 | new->code = 0; 210 | new->arity = 0; 211 | new->type = 0; 212 | } 213 | 214 | if (debug_fn) debug_fn(new, "grow_node [realloced]"); 215 | 216 | return new; 217 | } 218 | 219 | return node; 220 | } 221 | 222 | /* Append node CHILD to node VEC, growing the vector if necessary, 223 | * and returning the (possibly reallocated) vector. */ 224 | struct node * 225 | vnode_app( vec, child ) 226 | struct node *vec, *child; 227 | { 228 | vec = grow_node( vec, vec->arity * sizeof(struct node*), 229 | sizeof(struct node*) ); 230 | vec->ops[ vec->arity++ ] = child; 231 | return vec; 232 | } 233 | 234 | /* Append nodes with index [FIRST, LAST) from SRC to vnode DEST, growing 235 | * the vector if necessary, and returning the (possibly reallocated) vector. 236 | * If LAST is -1, the whole source vector is copied. */ 237 | struct node * 238 | vnode_copy( dest, src, first, last ) 239 | struct node *dest, *src; 240 | { 241 | int i; 242 | if ( last < 0 ) last = src->arity; 243 | for ( i = first; i < last; ++i ) 244 | dest = vnode_app( dest, add_ref(src->ops[i]) ); 245 | return dest; 246 | } 247 | 248 | /* Prepend node CHILD to node VEC, growing the vector if necessary, 249 | * and returning the (possibly reallocated) vector. 
*/ 250 | struct node * 251 | vnode_prep( vec, child ) 252 | struct node *vec, *child; 253 | { 254 | vec = grow_node( vec, vec->arity * sizeof(struct node*), 255 | sizeof(struct node*) ); 256 | memmove( &vec->ops[1], &vec->ops[0], vec->arity * sizeof(struct node*) ); 257 | vec->arity++; 258 | vec->ops[0] = child; 259 | return vec; 260 | } 261 | 262 | /* Returns a pointer to the string payload of a node */ 263 | node_str(node) 264 | struct node* node; 265 | { 266 | return (char*) node->ops; 267 | } 268 | 269 | /* Returns the node type. This only exists to abstract the difference 270 | * between t[0] (in stage-4) and t->code (in stage-5). */ 271 | node_code(node) 272 | struct node* node; 273 | { 274 | return node->code; 275 | } 276 | 277 | node_arity(node) 278 | struct node* node; 279 | { 280 | return node->arity; 281 | } 282 | 283 | node_type(node) 284 | struct node* node; 285 | { 286 | return node->type; 287 | } 288 | 289 | node_op(node, n) 290 | struct node* node; 291 | { 292 | return node->ops[n]; 293 | } 294 | 295 | set_code(node, code) 296 | struct node* node; 297 | { 298 | node->code = code; 299 | } 300 | 301 | set_arity(node, arity) 302 | struct node* node; 303 | { 304 | node->arity = arity; 305 | } 306 | 307 | set_type(node, type) 308 | struct node *node, *type; 309 | { 310 | node->type = type; 311 | } 312 | 313 | set_op(node, n, op) 314 | struct node *node, *op; 315 | { 316 | node->ops[n] = op; 317 | } 318 | 319 | set_ival(node, val) 320 | struct node *node; 321 | { 322 | node->ops[0] = (struct node*) val; 323 | } 324 | 325 | node_ival(node) 326 | struct node *node; 327 | { 328 | return node->ops[0]; 329 | } 330 | 331 | /* Allocate a string node and set its payload to STR */ 332 | struct node* 333 | new_strnode(code, str) 334 | char* str; 335 | { 336 | int sz = strlen(str) + 1; 337 | struct node* node = grow_node( 0, 0, sz ); 338 | /* grow_node() has already zeroed code, arity and type */ 339 | node->code = code; 340 | strcpy( node_str(node), str, sz ); 341 
| if (debug_fn) debug_fn(node, "new_strnode"); 342 | return node; 343 | } 344 | 345 | 346 | /* Append character CHR to the payload of the node *NODE_PTR which is treated 347 | * as a string with current length *LEN_PTR. The value of *LEN_PTR is 348 | * incremented. The node may be reallocated. */ 349 | node_lchar( node_ptr, len_ptr, chr ) 350 | struct node** node_ptr; 351 | int *len_ptr; 352 | { 353 | struct node* node = grow_node( *node_ptr, *len_ptr, 1 ); 354 | char* buf = node_str(node); 355 | buf[ (*len_ptr)++ ] = chr; 356 | *node_ptr = node; 357 | 358 | if (debug_fn) debug_fn(node, "node_lchar"); 359 | } 360 | 361 | /* Append string STR to the payload of the node *NODE_PTR which is treated 362 | * as a string with current length *LEN_PTR. The value of *LEN_PTR is 363 | * incremented. The node may be reallocated. */ 364 | node_strcat( node_ptr, len_ptr, str, len ) 365 | struct node** node_ptr; 366 | int *len_ptr; 367 | char *str; 368 | { 369 | struct node* node = grow_node( *node_ptr, *len_ptr, len ); 370 | char* buf = node_str(node); 371 | strncpy( buf + *len_ptr, str, len ); 372 | *len_ptr += len; 373 | *node_ptr = node; 374 | } 375 | 376 | /* Push-back facility doesn't really belong here, but having to keep 377 | * compatibility with the stage-4 cc is tricky. 
*/ 378 | static struct pb_slot { 379 | struct node* node; 380 | struct pb_slot* next; 381 | } *pb_stack = 0; 382 | 383 | pb_empty() { 384 | return !pb_stack; 385 | } 386 | 387 | struct node* 388 | pb_pop() { 389 | struct node* ret = 0; 390 | if ( pb_stack ) { 391 | struct pb_slot* old = pb_stack; 392 | ret = pb_stack->node; 393 | pb_stack = pb_stack->next; 394 | free(old); 395 | } 396 | return ret; 397 | } 398 | 399 | pb_push(token) 400 | struct node* token; 401 | { 402 | struct pb_slot* p = malloc( sizeof(struct pb_slot) ); 403 | p->next = pb_stack; 404 | p->node = token; 405 | pb_stack = p; 406 | } 407 | 408 | 409 | 410 | -------------------------------------------------------------------------------- /stage-5/pvector.c: -------------------------------------------------------------------------------- 1 | /* pvector.c -- code to deal with vectors of pointers 2 | * 3 | * Copyright (C) 2013 Richard Smith 4 | * All rights reserved. 5 | */ 6 | 7 | /* The Makefile sticks --compatibility=4 on the command line. Remove it. */ 8 | #pragma RBC compatibility 5 9 | 10 | /* We would like to #include "pvector.h" here, but we don't so that it can 11 | * be used in the implementation of the preprocessor. Instead, repeat the 12 | * definition here. */ 13 | struct pvector { 14 | char **start, **end, **end_store; 15 | }; 16 | 17 | struct pvector* 18 | pvec_new() { 19 | struct pvector* v = (struct pvector*) malloc( sizeof(struct pvector) ); 20 | int cap = 8; 21 | v->start = v->end = (char**) malloc( sizeof(char*) * cap ); 22 | *v->end = 0; /* null termination */ 23 | v->end_store = v->start + cap; 24 | return v; 25 | } 26 | 27 | pvec_delete(v) 28 | struct pvector* v; 29 | { 30 | if (v) { 31 | free( v->start ); 32 | free( v ); 33 | } 34 | } 35 | 36 | pvec_push(v, elt) 37 | struct pvector* v; 38 | char* elt; 39 | { 40 | /* Overwrite the null termination: which means we're guaranteed to 41 | * have space at this point. 
*/ 42 | *v->end++ = elt; 43 | 44 | if (v->end == v->end_store) { 45 | /* We need to reallocate now to push the null terminator */ 46 | int cap = v->end - v->start; 47 | v->start = (char**) realloc( v->start, sizeof(char*) * 2*cap ); 48 | v->end = v->start + cap; 49 | v->end_store = v->start + 2*cap; 50 | } 51 | 52 | *v->end = 0; 53 | } 54 | 55 | char* 56 | pvec_pop(v) 57 | struct pvector* v; 58 | { 59 | char* last = *--v->end; 60 | *v->end = 0; 61 | return last; 62 | } 63 | -------------------------------------------------------------------------------- /stage-5/pvector.h: -------------------------------------------------------------------------------- 1 | /* pvector.h -- code to deal with vectors of pointers 2 | * 3 | * Copyright (C) 2013 Richard Smith 4 | * All rights reserved. 5 | */ 6 | 7 | #ifndef RBC5_PVECTOR_INCLUDED 8 | #define RBC5_PVECTOR_INCLUDED 9 | 10 | struct pvector { 11 | char **start, **end, **end_store; 12 | }; 13 | 14 | struct pvector* pvec_new(); 15 | pvec_delete(); 16 | pvec_push(); 17 | char* pvec_pop(); 18 | 19 | #endif 20 | -------------------------------------------------------------------------------- /stage-5/scanner.c: -------------------------------------------------------------------------------- 1 | /* scanner.c -- code for converting preprocessor tokens to C ones 2 | * This file is part of the compiler, ccx 3 | * 4 | * Copyright (C) 2013, 2014 Richard Smith 5 | * All rights reserved. 6 | */ 7 | 8 | 9 | /* Check whether the null-terminated string, NODE->str, is a keyword, and 10 | * if so set NODE->type to the keyword token (which is a multicharacter 11 | * literal containing at most the first four characters of the keyword, 12 | * e.g. 'whil' for "while"); otherwise set NODE->type = 'id' for an 13 | * identifier. Returns NODE. 
*/
chk_keyword(node)
{
    /* Argument is laid out as:
     *   struct node { int code; int arity; node* type; char str[]; }
     * so the string payload starts at word 3. */

    auto char *keywords[29] = {
        /* Complete list of keywords per K&R, minus 'entry', plus 'signed'
         * from C90.  C90 also adds 'const', 'enum', 'void', and 'volatile'.
         *
         * 'do' and 'if' have an extra NUL character to pad them to 4 bytes
         * for casting to an int (i.e. a multicharacter literal).
         *
         * TODO:  Not yet implemented: double, typedef, union.
         */
        "auto", "break", "case", "char", "continue", "default", "do\0",
        "double", "else", "extern", "float", "for", "goto", "if\0",
        "int", "long", "register", "return", "signed", "short", "sizeof",
        "static", "struct", "switch", "typedef", "union", "unsigned", "while",
        0
    };

    auto idx = 0;
    auto int* first_word;

    /* Linear search; the table is terminated by a null entry. */
    while ( keywords[idx] && strcmp(keywords[idx], &node[3]) != 0 )
        ++idx;

    /* Not a keyword: hand the node back untouched. */
    if ( !keywords[idx] )
        return node;

    /* Change the id node to an op node, using the first four bytes
     * of the keyword as the multicharacter node code. */
    first_word = keywords[idx];
    node[0] = *first_word;

    /* Zero the memory used by the string: it's now an node* array[4]. */
    memset( &node[3], 0, 16 );

    return node;
}

/* Create a node, NODE, which will be returned; read a number (oct / hex / dec)
 * starting with character C (which has already been checked by isdigit),
 * parse it into NODE->val, set NODE->type, and return NODE.
*/ 54 | get_number(stream, c, c2) { 55 | auto char *nptr; 56 | auto ppnode = get_ppnum(stream, c, c2); 57 | auto node = mk_number(ppnode); 58 | free_node(ppnode); 59 | return node; 60 | } 61 | 62 | /* Handle a #pragma directive */ 63 | prgm_direct(stream) { 64 | extern char* node_str(); 65 | extern struct node* get_word(); 66 | 67 | auto struct node* tok; 68 | auto char* str; 69 | auto int c = skip_hwhite(stream); 70 | 71 | /* The standard requires unrecognised #pragmas to be allowed, but 72 | * this is a bit silly. */ 73 | if ( !isidchar1(c) ) { 74 | warning("Unfamiliar form of #pragma directive"); 75 | /* A bare #pragma is a bit silly too, but the grammar allows it. */ 76 | if ( c == '\n' ) ungetc(c, stream); 77 | else pp_slurp(stream, 0, 0); 78 | return 0; 79 | } 80 | 81 | /* Get the pragma namespace */ 82 | tok = get_word(stream, c); 83 | str = node_str(tok); 84 | 85 | /* Our #pragmas all live in the RBC namespace (which stands for 86 | * Richard's Bootstrap Compiler). */ 87 | if ( strcmp( str, "RBC" ) != 0 ) { 88 | /* An unknown pragma: silently ignore it. 
*/ 89 | pp_slurp(stream, 0, 0); 90 | free_node(tok); 91 | return 0; 92 | } 93 | free_node(tok); 94 | 95 | c = skip_hwhite(stream); 96 | if ( !isidchar1(c) ) 97 | error("#pragma RBC requires a command argument"); 98 | tok = get_word(stream, c); 99 | str = node_str(tok); 100 | 101 | /* We only know about #pragma RBC compatibility */ 102 | if ( strcmp( str, "compatibility" ) == 0 ) { 103 | extern compat_flag; 104 | auto int n = pp_dir_num(stream); 105 | if ( n < 4 || n > 5 ) 106 | error("Compatibility with stage %d not supported", n); 107 | compat_flag = ( n == 4 ); 108 | } 109 | else { 110 | warning("Unhandled #pragma RBC %s", str); 111 | pp_slurp(stream, 0, 0); 112 | } 113 | 114 | end_ppdir(stream, "pragma RBC"); 115 | free_node(tok); 116 | 117 | /* The return is a null node*, and indicates that we have handled 118 | * (or ignored) the #pragma, and not to include it in the output 119 | * token stream produced by the scanner. */ 120 | return 0; 121 | } 122 | 123 | /* Hook for handling preprocessor directives other than #line and #pragma */ 124 | pp_direct(stream, str) { 125 | error("Unknown preprocessor directive: %s", str); 126 | } 127 | 128 | do_get_qlit(stream, c1, c2) { 129 | auto int l; 130 | auto tok = get_qlit(stream, c1, c2, &l); 131 | 132 | /* Character literals have type int in C. */ 133 | if (c1 == '\'') 134 | tok[2] = add_ref( implct_int() ); 135 | /* String literals have type char[N] */ 136 | else if (c1 == '\"') 137 | tok[2] = chr_array_t(l); 138 | else 139 | int_error("Unknown type of quoted string: %c...%c", c1, c2); 140 | return tok; 141 | } 142 | 143 | handle_eof() { 144 | return 0; 145 | } 146 | 147 | cpp_pragma() { 148 | return 0; 149 | } 150 | -------------------------------------------------------------------------------- /stage-5/timeconv.c: -------------------------------------------------------------------------------- 1 | /* cc.c -- functions to convert times 2 | * 3 | * Copyright (C) 2014 Richard Smith 4 | * All rights reserved. 
5 | */ 6 | 7 | /* The Makefile sticks --compatibility=4 on the command line. Remove it. */ 8 | #pragma RBC compatibility 5 9 | 10 | #include 11 | 12 | static 13 | struct tm buf; 14 | 15 | static 16 | leap_year(year) { 17 | if (year % 4) return 0; 18 | else if (year % 100) return 1; 19 | else if (year % 400) return 0; 20 | else return 1; 21 | } 22 | 23 | /* The stage-5 compiler doesn't support multi-dimensional arrays */ 24 | static int month_lens[26] = { 25 | 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31, 0, 26 | 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31, 0 27 | }; 28 | 29 | /* The C standard gmtime */ 30 | struct tm * 31 | gmtime(timep) 32 | int *timep; 33 | { 34 | int t = *timep; 35 | int ly = 0, year = 1970, month = 0, *lp; 36 | 37 | buf.tm_sec = t % 60; t /= 60; 38 | buf.tm_min = t % 60; t /= 60; 39 | buf.tm_hour = t % 24; t /= 24; 40 | 41 | /* The Unix epoch, 1 Jan 1970, was a Thurday, day 4. */ 42 | buf.tm_wday = (t - 4) % 7; 43 | 44 | while ( t < 0 ) 45 | --year, ly = leap_year(year), t += 365 + ly; 46 | while ( !ly && t == 365 || t > 365 ) 47 | ++year, t -= 365 + ly, ly = leap_year(year); 48 | 49 | buf.tm_year = year - 1900; 50 | buf.tm_yday = t; 51 | 52 | for ( lp = month_lens + ly*13; *lp && t >= *lp; month++, t -= *lp, ++lp ) 53 | ; 54 | buf.tm_mon = month; 55 | buf.tm_mday = t + 1; 56 | 57 | buf.tm_isdst = 0; 58 | 59 | return &buf; 60 | } 61 | 62 | 63 | --------------------------------------------------------------------------------