├── .gitignore ├── LICENSE ├── README.md ├── call.asm ├── cmd ├── assembler │ └── main.go ├── lexer │ └── main.go └── parser │ └── main.go ├── compiler └── compiler.go ├── elf └── elf.go ├── exit.asm ├── go.mod ├── hello.asm ├── instructions └── instructions.go ├── jmp.asm ├── lexer ├── lexer.go └── lexer_test.go ├── parser ├── ast.go ├── parser.go └── parser_test.go ├── test.asm └── token ├── token.go └── token_test.go /.gitignore: -------------------------------------------------------------------------------- 1 | a.out 2 | assembler 3 | cmd/lexer/lexer 4 | cmd/parser/parser 5 | cmd/assembler/assembler 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. 
Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 
55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. 
You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. 
But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. 
(This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. 
These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 
214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. 
If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | 294 | Copyright (C) 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 
305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 331 | 332 | , 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 
340 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![GoDoc](https://img.shields.io/static/v1?label=godoc&message=reference&color=blue)](https://pkg.go.dev/github.com/skx/assembler) 2 | [![Go Report Card](https://goreportcard.com/badge/github.com/skx/assembler)](https://goreportcard.com/report/github.com/skx/assembler) 3 | [![license](https://img.shields.io/github/license/skx/assembler.svg)](https://github.com/skx/assembler/blob/master/LICENSE) 4 | 5 | * [Assembler](#assembler) 6 | * [Limitations](#limitations) 7 | * [Installation](#installation) 8 | * [Example Usage](#example-usage) 9 | * [Internals](#internals) 10 | * [Adding New Instructions](#adding-new-instructions) 11 | * [Debugging Generated Binaries](#debugging-generated-binaries) 12 | * [Bugs?](#bugs) 13 | 14 | 15 | # Assembler 16 | 17 | This repository contains a VERY BASIC x86-64 assembler, which is capable of 18 | reading assembly-language input, and generating a statically linked ELF binary 19 | output. 20 | 21 | It is more a proof-of-concept than a useful assembler, but I hope to take it to the state where it can compile the kind of x86-64 assembly I produce in some of my other projects. 22 | 23 | Currently the assembler will generate a binary which looks like this: 24 | 25 | ``` 26 | $ file a.out 27 | a.out: ELF 64-bit LSB executable, x86-64, version 1 (SYSV) 28 | statically linked, no section header 29 | ``` 30 | 31 | Why? I've written a couple of toy projects that generate assembly language programs, then pass them through an assembler: 32 | 33 | * [brainfuck compiler](https://github.com/skx/bfcc/) 34 | * [math compiler](https://github.com/skx/math-compiler/) 35 | 36 | The code in this repository was born out of the process of experimenting with generating an ELF binary directly. A necessary learning-process. 
37 | 38 | 39 | 40 | ## Limitations 41 | 42 | We don't support anywhere near the complete instruction-set which an assembly language programmer would expect. Currently we support only things like this: 43 | 44 | * `add $REG, $REG` + `add $REG, $NUMBER` 45 | * Add a number, or the contents of another register, to a register. 46 | * `call $LABEL` 47 | * See [call.asm](call.asm) for an example. 48 | * `dec $REG` 49 | * Decrement the contents of the specified register. 50 | * We also support indirection, so the following work: 51 | * `dec byte ptr [$REG]` 52 | * `dec word ptr [$REG]` 53 | * `dec dword ptr [$REG]` 54 | * `dec qword ptr [$REG]` 55 | * `inc $REG` 56 | * Increment the contents of the specified register. 57 | * We also support indirection, so the following work: 58 | * `inc byte ptr [$REG]` 59 | * `inc word ptr [$REG]` 60 | * `inc dword ptr [$REG]` 61 | * `inc qword ptr [$REG]` 62 | * `jmp $LABEL`, `je $LABEL`, `jne $LABEL` 63 | * We support jumping instructions, but only with -128/+127 byte displacements 64 | * See [jmp.asm](jmp.asm) for a simple example. 65 | * `mov $REG, $NUMBER` 66 | * `mov $REG, $REG` 67 | * Move a number, or the contents of another register, into the specified register. 68 | * `nop` 69 | * Do nothing. 70 | * `push $NUMBER`, or `push $IDENTIFIER` 71 | * `ret` 72 | * Return from call. 73 | * **NOTE**: We don't actually support making calls, though that can be emulated via `push` - see [jmp.asm](jmp.asm) for an example. 74 | * `sub $REG, $REG` + `sub $REG, $NUMBER` 75 | * Subtract a number, or the contents of another register, from a register. 76 | * `xor $REG, $REG` 77 | * Set the given register to be zero. 78 | * `int $NUM` 79 | * Call the kernel. 80 | * Processor (flag) control instructions: 81 | * `clc`, `cld`, `cli`, `cmc`, `stc`, `std`, and `sti`. 
82 | 83 | Note that we really only support the following registers, you'll see that we only support the 64-bit registers (which means `rax` is supported but `eax`, `ax`, `ah`, and `al` are specifically __not__ supported): 84 | 85 | * `rax` 86 | * `rcx` 87 | * `rdx` 88 | * `rbx` 89 | * `rsp` 90 | * `rbp` 91 | * `rsi` 92 | * `rdi` 93 | 94 | There is _some_ support for the extended registers `r8`-`r15`, but this varies on a per-instruction basis and should not be relied upon. 95 | 96 | There is support for storing fixed-data within our program, and locating that. See [hello.asm](hello.asm) for an example of that. 97 | 98 | We also have some other (obvious) limitations: 99 | 100 | * There is notably no support for comparison instructions, and jumping instructions. 101 | * We _emulate_ (unconditional) jump instructions via "`push`" and "`ret`", see [jmp.asm](jmp.asm) for an example of that. 102 | * The entry-point is __always__ at the beginning of the source. 103 | * You can only reference data AFTER it has been declared. 104 | * These are added to the `data` section of the generated binary, but must be defined first. 105 | * See [hello.asm](hello.asm) for an example of that. 106 | 107 | 108 | 109 | ## Installation 110 | 111 | If you have this repository cloned locally you can build the assembler like so: 112 | 113 | cd cmd/assembler 114 | go build . 115 | go install . 116 | 117 | If you wish to fetch and install via your existing toolchain: 118 | 119 | go get -u github.com/skx/assembler/cmd/assembler 120 | 121 | You can repeat for the other commands if you wish: 122 | 123 | go get -u github.com/skx/assembler/cmd/lexer 124 | go get -u github.com/skx/assembler/cmd/parser 125 | 126 | Of course these binary-names are very generic, so perhaps better to work locally! 127 | 128 | 129 | ## Example Usage 130 | 131 | Build the assembler: 132 | 133 | $ cd cmd/assembler 134 | $ go build . 
135 | 136 | Compile the [sample program](test.asm), and execute it showing the return-code: 137 | 138 | $ cmd/assembler/assembler test.asm && ./a.out ; echo $? 139 | 9 140 | 141 | Or run the [hello.asm](hello.asm) example: 142 | 143 | $ cmd/assembler/assembler hello.asm && ./a.out 144 | Hello, world 145 | Goodbye, world 146 | 147 | You'll note that the `\n` character was correctly expanded into a newline. 148 | 149 | 150 | # Internals 151 | 152 | The core of our code consists of a small number of simple packages: 153 | 154 | * A simple tokenizer [lexer/lexer.go](lexer/lexer.go) 155 | * A simple parser [parser/parser.go](parser/parser.go) 156 | * This populates a simple internal-form/AST [parser/ast.go](parser/ast.go). 157 | * A simple compiler [compiler/compiler.go](compiler/compiler.go) 158 | * A simple elf-generator [elf/elf.go](elf/elf.go) 159 | * Taken from [vishen/go-x64-executable](https://github.com/vishen/go-x64-executable/). 160 | 161 | 162 | In addition to the package modules we also have a couple of binaries: 163 | 164 | * `cmd/lexer` 165 | * Show the output of lexing a program. 166 | * This is useful for debugging and development-purposes, it isn't expected to be useful to end-users. 167 | * `cmd/parser` 168 | * Show the output of parsing a program. 169 | * This is useful for debugging and development-purposes, it isn't expected to be useful to end-users. 170 | * `cmd/assembler` 171 | * Assemble a program, producing an executable binary. 172 | 173 | The commands located beneath `cmd` each operate the same way. They each take a single argument which is a file containing assembly-language instructions. 174 | 175 | For example here is how you'd build and test the parser: 176 | 177 | cd cmd/parser 178 | go build . 
179 | $ ./parser ../../test.asm 180 | &{{INSTRUCTION xor} [{REGISTER rax} {REGISTER rax}]} 181 | &{{INSTRUCTION inc} [{REGISTER rax}]} 182 | &{{INSTRUCTION mov} [{REGISTER rbx} {NUMBER 0x0000}]} 183 | &{{INSTRUCTION mov} [{REGISTER rcx} {NUMBER 0x0007}]} 184 | &{{INSTRUCTION add} [{REGISTER rbx} {REGISTER rcx}]} 185 | &{{INSTRUCTION mov} [{REGISTER rcx} {NUMBER 0x0002}]} 186 | &{{INSTRUCTION add} [{REGISTER rbx} {REGISTER rcx}]} 187 | &{{INSTRUCTION int} [{NUMBER 0x80}]} 188 | 189 | 190 | ## Adding New Instructions 191 | 192 | This is how you might add a new instruction to the assembler, for example you might add `jmp 0x00000` or some similar instruction: 193 | 194 | * Add a new entry for the instruction in [instructions/instructions.go](instructions/instructions.go) 195 | * i.e. Update `InstructionLengths` map to add the instruction. 196 | * This will be used by both the tokenization process, and the parser. 197 | * Generate the appropriate output in `compiler/compiler.go`, inside the function `compileInstruction`. 198 | * i.e. Emit the binary-code for the instruction. 199 | 200 | 201 | 202 | ## Debugging Generated Binaries 203 | 204 | Launch the binary under gdb: 205 | 206 | $ gdb ./a.out 207 | 208 | Start it: 209 | 210 | (gdb) starti 211 | Starting program: /home/skx/Repos/github.com/skx/assembler/a.out 212 | 213 | Program stopped. 214 | 0x00000000004000b0 in ?? () 215 | 216 | Disassemble: 217 | 218 | (gdb) x/5i $pc 219 | 220 | Or show string-contents at an address: 221 | 222 | (gdb) x/s 0x400000 223 | 224 | 225 | # Bugs? 226 | 227 | Feel free to report them; as this is more a proof of concept than a robust tool, they are to be expected. 228 | 229 | Specifically we're missing support for many instructions, but I hope the code generated for those that are present is correct. 
230 | 231 | 232 | Steve 233 | -------------------------------------------------------------------------------- /call.asm: -------------------------------------------------------------------------------- 1 | ;; 2 | ;; This file demonstrates using `call` to invoke subroutines. 3 | ;; 4 | ;; Here we have three subroutines of interest: 5 | ;; 6 | ;; print_string - prints a string with explicit address & size. 7 | ;; 8 | ;; print_asciiz_string - Prints a null-terminated string 9 | ;; 10 | ;; _exit - Exits the program 11 | ;; 12 | 13 | .hello DB "Hello, world\n\0" 14 | .message DB "This string has its size calculated dynamically!\n\0" 15 | .goodbye DB "Goodbye, world\n\0" 16 | 17 | ;; print a string, with a size 18 | mov rcx, hello 19 | mov rdx, 13 20 | call print_string 21 | 22 | ;; print a string with ZERO size calculation 23 | mov rcx, message 24 | call print_asciiz_string 25 | 26 | ;; print a string with ZERO size calculation 27 | ;; 28 | ;; BUT change the " " to "*" 29 | mov rdx, message 30 | call print_asciiz_string_with_stars 31 | 32 | 33 | ;; print a string with an explicit size 34 | mov rcx, goodbye 35 | mov rdx, 15 36 | call print_string 37 | 38 | ;; exit this script 39 | mov rbx, 2 40 | call _exit 41 | 42 | ;; Routine to print a string. 
43 | ;; 44 | ;; Assumes string address is in RCX 45 | ;; Assumes string length is in RDX 46 | ;; 47 | ;; Trashes: RAX, RBX, RCX, RDX 48 | :print_string 49 | mov rbx, 1 ;; output is STDOUT 50 | mov rax, 4 ;; sys_write 51 | int 0x80 ;; syscall 52 | 53 | ret 54 | 55 | ;; Routine to print a '0x00'-terminated string 56 | ;; 57 | ;; Assumes string address is in RCX 58 | :print_asciiz_string 59 | xor rdx, rdx ; zero the length 60 | push rcx ; save string 61 | :len_loop 62 | cmp byte ptr [rcx], 0x00 63 | je len_loop_over 64 | inc rdx 65 | inc rcx 66 | jmp len_loop 67 | :len_loop_over 68 | pop rcx ; restore string-pointer 69 | ; rdx has the message length 70 | call print_string ; call the print routine 71 | ret ; and return from here 72 | 73 | 74 | 75 | ;; Print a string, terminated by NULL, but change " " to "*" 76 | ;; 77 | ;; NOTE: This destroys the string in the process. 78 | :print_asciiz_string_with_stars 79 | push rdx 80 | :star_loop 81 | cmp byte ptr [rdx], 0x00 ; end of string? we're done 82 | je star_loop_over 83 | cmp byte ptr [rdx], 0x20 ; is this a space? 
84 | jne star_loop_cont ; if not continue 85 | mov byte ptr [rdx], 42 ; so replace with "*" 86 | :star_loop_cont 87 | inc rdx ; increase our pointer 88 | jmp star_loop ; loop again 89 | :star_loop_over 90 | pop rcx 91 | call print_asciiz_string 92 | ret 93 | 94 | 95 | ;; Exit 96 | ;; 97 | ;; Assumes RBX has exit-code 98 | :_exit 99 | mov rax, 1 ; SYS_exit 100 | int 0x80 ; syscall 101 | ret ; Never reached 102 | -------------------------------------------------------------------------------- /cmd/assembler/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "io/ioutil" 6 | "os" 7 | 8 | "github.com/skx/assembler/compiler" 9 | ) 10 | 11 | func main() { 12 | 13 | // 14 | // Ensure we have an argument 15 | // 16 | if len(os.Args) <= 1 { 17 | fmt.Printf("Usage: compiler input.asm\n") 18 | return 19 | } 20 | 21 | data, err := ioutil.ReadFile(os.Args[1]) 22 | if err != nil { 23 | fmt.Printf("error:%s\n", err.Error()) 24 | return 25 | } 26 | 27 | // Create the compiler 28 | c := compiler.New(string(data)) 29 | 30 | c.SetOutput("./a.out") 31 | 32 | err = c.Compile() 33 | if err != nil { 34 | fmt.Printf("Error:%s\n", err.Error()) 35 | return 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /cmd/lexer/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "io/ioutil" 6 | "os" 7 | 8 | "github.com/skx/assembler/lexer" 9 | "github.com/skx/assembler/token" 10 | ) 11 | 12 | func main() { 13 | // 14 | // Ensure we have an argument 15 | // 16 | if len(os.Args) <= 1 { 17 | fmt.Printf("Usage: lexer input.asm\n") 18 | return 19 | } 20 | 21 | data, err := ioutil.ReadFile(os.Args[1]) 22 | if err != nil { 23 | fmt.Printf("error:%s\n", err.Error()) 24 | return 25 | } 26 | 27 | l := lexer.New(string(data)) 28 | 29 | tok := l.NextToken() 30 | for tok.Type != token.EOF { 31 | 
fmt.Printf("%v\n", tok) 32 | tok = l.NextToken() 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /cmd/parser/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "io/ioutil" 6 | "os" 7 | 8 | "github.com/skx/assembler/parser" 9 | ) 10 | 11 | func main() { 12 | // 13 | // Ensure we have an argument 14 | // 15 | if len(os.Args) <= 1 { 16 | fmt.Printf("Usage: parser input.asm\n") 17 | return 18 | } 19 | 20 | data, err := ioutil.ReadFile(os.Args[1]) 21 | if err != nil { 22 | fmt.Printf("error:%s\n", err.Error()) 23 | return 24 | } 25 | 26 | p := parser.New(string(data)) 27 | 28 | stmt := p.Next() 29 | for stmt != nil { 30 | fmt.Printf("%v\n", stmt) 31 | 32 | stmt = p.Next() 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /compiler/compiler.go: -------------------------------------------------------------------------------- 1 | // Package compiler is the package which is actually responsible for reading 2 | // the user-program and generating the binary result. 3 | // 4 | // Internally this uses the parser, as you would expect 5 | package compiler 6 | 7 | import ( 8 | "encoding/binary" 9 | "fmt" 10 | "strconv" 11 | 12 | "github.com/skx/assembler/elf" 13 | "github.com/skx/assembler/parser" 14 | "github.com/skx/assembler/token" 15 | ) 16 | 17 | // Compiler holds our state 18 | type Compiler struct { 19 | 20 | // p holds the parser we use to generate AST 21 | p *parser.Parser 22 | 23 | // output holds the path to the binary we'll generate 24 | output string 25 | 26 | // code contains the code we generate 27 | code []byte 28 | 29 | // data is where we place constant-strings, etc. 30 | data []byte 31 | 32 | // map of "data-name" to "data-offset" 33 | dataOffsets map[string]int 34 | 35 | // patches we have to make, post-compilation. 
Don't ask 36 | patches map[int]int 37 | 38 | // labels and the corresponding offsets we've seen. 39 | labels map[string]int 40 | 41 | // offsets which contain jumps to labels 42 | labelTargets map[int]string 43 | 44 | // 8-bit offsets for relative label-jumps 45 | jmps map[int]string 46 | 47 | // 32-bit offsets for calls 48 | calls map[int]string 49 | } 50 | 51 | // New creates a new instance of the compiler 52 | func New(src string) *Compiler { 53 | 54 | c := &Compiler{p: parser.New(src), output: "a.out"} 55 | c.dataOffsets = make(map[string]int) 56 | c.patches = make(map[int]int) 57 | 58 | // mapping of "label -> XXX" 59 | c.labels = make(map[string]int) 60 | 61 | // fixups we need to make offset-of-code -> label 62 | c.labelTargets = make(map[int]string) 63 | 64 | // jump-fixups 65 | c.jmps = make(map[int]string) 66 | 67 | // call-fixups 68 | c.calls = make(map[int]string) 69 | 70 | return c 71 | } 72 | 73 | // SetOutput sets the path to the executable we create. 74 | // 75 | // If no output has been specified we default to `./a.out`. 76 | func (c *Compiler) SetOutput(path string) { 77 | c.output = path 78 | } 79 | 80 | // Compile walks over the parser-generated AST and assembles the source 81 | // program. 82 | // 83 | // Once the program has been completed an ELF executable will be produced 84 | func (c *Compiler) Compile() error { 85 | 86 | // 87 | // Walk over the parser-output 88 | // 89 | stmt := c.p.Next() 90 | for stmt != nil { 91 | 92 | switch stmt := stmt.(type) { 93 | 94 | case parser.Data: 95 | c.handleData(stmt) 96 | 97 | case parser.Error: 98 | return fmt.Errorf("error compiling - parser returned error %s", stmt.Value) 99 | 100 | case parser.Label: 101 | // So now we know the label with the given name 102 | // corresponds to the CURRENT position in the 103 | // generated binary-code. 
104 | // 105 | // If anything refers to this we'll have to patch 106 | // it up 107 | c.labels[stmt.Name] = len(c.code) 108 | 109 | case parser.Instruction: 110 | err := c.compileInstruction(stmt) 111 | if err != nil { 112 | return err 113 | } 114 | 115 | default: 116 | return fmt.Errorf("unhandled node-type %v", stmt) 117 | } 118 | 119 | stmt = c.p.Next() 120 | } 121 | 122 | // 123 | // Apply data-patches. 124 | // 125 | // This is horrid. 126 | // 127 | for o, v := range c.patches { 128 | 129 | // start of virtual sectoin 130 | // + offset 131 | // + len of code segment 132 | // + elf header 133 | // + 2 * program header 134 | // life is hard 135 | v = 0x400000 + v + len(c.code) + 0x40 + (2 * 0x38) 136 | buf := make([]byte, 4) 137 | binary.LittleEndian.PutUint32(buf, uint32(v)) 138 | 139 | for i, x := range buf { 140 | c.code[i+o] = x 141 | } 142 | } 143 | 144 | // 145 | // OK now we need to patch references to labels 146 | // 147 | for o, s := range c.labelTargets { 148 | 149 | offset := c.labels[s] 150 | 151 | offset = 0x400000 + offset + 0x40 + (2 * 0x38) 152 | 153 | // So we have a new offset. 
154 | 155 | buf := make([]byte, 4) 156 | binary.LittleEndian.PutUint32(buf, uint32(offset)) 157 | 158 | for i, x := range buf { 159 | c.code[i+o] = x 160 | } 161 | } 162 | 163 | // Patchup the jumps 164 | for o, s := range c.jmps { 165 | 166 | // the offset of the instruction to we should jump to 167 | offset := c.labels[s] 168 | 169 | // the offset of the position is a byte 170 | diff := uint(o - offset) 171 | 172 | c.code[o] = byte(0xff - byte(diff)) 173 | } 174 | 175 | // Patchup the calls 176 | for o, s := range c.calls { 177 | 178 | // the offset of the instruction to which we should call 179 | offset := c.labels[s] 180 | 181 | // the offset of the position is a byte 182 | diff := uint32(o - offset + 4) 183 | x := uint32(0xffffffff) - uint32(diff-1) 184 | 185 | buf := make([]byte, 4) 186 | binary.LittleEndian.PutUint32(buf, x) 187 | 188 | // overwrite the instruction 189 | for i, x := range buf { 190 | c.code[i+o] = x 191 | } 192 | } 193 | 194 | // 195 | // Write. The. Elf. Output. 196 | // 197 | e := elf.New() 198 | err := e.WriteContent(c.output, c.code, c.data) 199 | if err != nil { 200 | return fmt.Errorf("error writing elf: %s", err.Error()) 201 | } 202 | 203 | return nil 204 | 205 | } 206 | 207 | // handleData appends the data to the data-section of our binary, 208 | // and stores the offset appropriately 209 | func (c *Compiler) handleData(d parser.Data) { 210 | 211 | // Offset of the start of the data is the current 212 | // length of the existing data. 213 | offset := len(c.data) 214 | 215 | // Add 216 | c.data = append(c.data, d.Contents...) 217 | 218 | // Save 219 | c.dataOffsets[d.Name] = offset 220 | 221 | // TODO: Do we care about alignment? We might 222 | // in the future. 
223 | } 224 | 225 | // compileInstruction handles the instruction generation 226 | func (c *Compiler) compileInstruction(i parser.Instruction) error { 227 | 228 | switch i.Instruction { 229 | 230 | case "add": 231 | err := c.assembleADD(i) 232 | if err != nil { 233 | return err 234 | } 235 | return nil 236 | 237 | case "call": 238 | err := c.assembleCALL(i) 239 | if err != nil { 240 | return err 241 | } 242 | return nil 243 | 244 | case "clc": 245 | c.code = append(c.code, 0xf8) 246 | return nil 247 | 248 | case "cld": 249 | c.code = append(c.code, 0xfc) 250 | return nil 251 | 252 | case "cli": 253 | c.code = append(c.code, 0xfa) 254 | return nil 255 | 256 | case "cmp": 257 | err := c.assembleCMP(i) 258 | if err != nil { 259 | return err 260 | } 261 | return nil 262 | 263 | case "cmc": 264 | c.code = append(c.code, 0xf5) 265 | return nil 266 | 267 | case "dec": 268 | err := c.assembleDEC(i) 269 | if err != nil { 270 | return err 271 | } 272 | return nil 273 | 274 | case "inc": 275 | err := c.assembleINC(i) 276 | if err != nil { 277 | return err 278 | } 279 | return nil 280 | 281 | case "int": 282 | n, err := c.argToByte(i.Operands[0].Token) 283 | if err != nil { 284 | return err 285 | } 286 | c.code = append(c.code, 0xcd) 287 | c.code = append(c.code, n) 288 | return nil 289 | 290 | case "jmp", "jne", "je", "jz", "jnz": 291 | err := c.assembleJMP(i) 292 | if err != nil { 293 | return err 294 | } 295 | return nil 296 | 297 | case "mov": 298 | err := c.assembleMov(i, false) 299 | if err != nil { 300 | return err 301 | } 302 | return nil 303 | 304 | case "nop": 305 | c.code = append(c.code, 0x90) 306 | return nil 307 | 308 | case "pop": 309 | err := c.assemblePop(i) 310 | if err != nil { 311 | return err 312 | } 313 | return nil 314 | 315 | case "push": 316 | err := c.assemblePush(i) 317 | if err != nil { 318 | return err 319 | } 320 | return nil 321 | 322 | case "ret": 323 | c.code = append(c.code, 0xc3) 324 | return nil 325 | 326 | case "stc": 327 | c.code = 
append(c.code, 0xf9) 328 | return nil 329 | 330 | case "std": 331 | c.code = append(c.code, 0xfd) 332 | return nil 333 | 334 | case "sti": 335 | c.code = append(c.code, 0xfb) 336 | return nil 337 | 338 | case "sub": 339 | err := c.assembleSUB(i) 340 | if err != nil { 341 | return err 342 | } 343 | return nil 344 | case "xor": 345 | err := c.assembleXOR(i) 346 | if err != nil { 347 | return err 348 | } 349 | return nil 350 | } 351 | 352 | return fmt.Errorf("unknown instruction %v", i) 353 | } 354 | 355 | // return register number - used for `dec`, `inc`, and `mov`. 356 | func (c *Compiler) getreg(reg string) int { 357 | 358 | // registers 359 | registers := []string{ 360 | "rax", 361 | "rcx", 362 | "rdx", 363 | "rbx", 364 | "rsp", 365 | "rbp", 366 | "rsi", 367 | "rdi"} 368 | 369 | for i, name := range registers { 370 | if reg == name { 371 | return i 372 | } 373 | } 374 | 375 | panic(fmt.Sprintf("failed to lookup register: %s", reg)) 376 | } 377 | 378 | // get magic value for two-register operations (`add`, `sub`, `xor`). 
379 | func (c *Compiler) calcRM(dest string, src string) byte { 380 | 381 | // registers 382 | registers := []string{ 383 | "rax", 384 | "rcx", 385 | "rdx", 386 | "rbx", 387 | "rsp", 388 | "rbp", 389 | "rsi", 390 | "rdi"} 391 | 392 | dN := -1 393 | sN := -1 394 | 395 | for i, reg := range registers { 396 | if reg == dest { 397 | dN = i 398 | } 399 | if reg == src { 400 | sN = i 401 | 402 | } 403 | } 404 | 405 | if dN < 0 || sN < 0 { 406 | panic(fmt.Sprintf("failed to lookup registers: %s %s", src, dest)) 407 | } 408 | 409 | out := 0xc0 + (8 * sN) + dN 410 | if out > 255 { 411 | panic("calcRM received out of bounds value") 412 | } 413 | return byte(out) 414 | } 415 | 416 | // used by `int` 417 | func (c *Compiler) argToByte(t token.Token) (byte, error) { 418 | 419 | num, err := strconv.ParseInt(t.Literal, 0, 64) 420 | if err != nil { 421 | return 0, fmt.Errorf("unable to convert %s to number %s", t.Literal, err) 422 | } 423 | 424 | return byte(num), nil 425 | } 426 | 427 | // used by `mov` 428 | func (c *Compiler) argToByteArray(t token.Token) ([]byte, error) { 429 | 430 | // Store the result here 431 | buf := make([]byte, 4) 432 | 433 | num, err := strconv.ParseInt(t.Literal, 0, 64) 434 | if err != nil { 435 | return buf, fmt.Errorf("unable to convert %s to number for register %s", t.Literal, err) 436 | } 437 | 438 | binary.LittleEndian.PutUint32(buf, uint32(num)) 439 | return buf, nil 440 | } 441 | 442 | // assembleADD handles addition. 443 | func (c *Compiler) assembleADD(i parser.Instruction) error { 444 | 445 | // Two registers added? 446 | if i.Operands[0].Type == token.REGISTER && 447 | i.Operands[1].Type == token.REGISTER { 448 | c.code = append(c.code, []byte{0x48, 0x01}...) 449 | out := c.calcRM(i.Operands[0].Literal, i.Operands[1].Literal) 450 | c.code = append(c.code, out) 451 | return nil 452 | } 453 | 454 | // OK number added to a register? 
455 | if i.Operands[0].Type == token.REGISTER && 456 | i.Operands[1].Type == token.NUMBER { 457 | 458 | // Convert the integer to a four-byte/64-bit value 459 | n, err := c.argToByteArray(i.Operands[1].Token) 460 | if err != nil { 461 | return err 462 | } 463 | 464 | // Work out the register 465 | switch i.Operands[0].Literal { 466 | case "rax": 467 | c.code = append(c.code, []byte{0x48, 0x05}...) 468 | case "rbx": 469 | c.code = append(c.code, []byte{0x48, 0x81, 0xc3}...) 470 | case "rcx": 471 | c.code = append(c.code, []byte{0x48, 0x81, 0xc1}...) 472 | case "rdx": 473 | c.code = append(c.code, []byte{0x48, 0x81, 0xc2}...) 474 | default: 475 | return fmt.Errorf("add %s, number not implemented", i.Operands[0].Literal) 476 | } 477 | 478 | // Now append the value 479 | c.code = append(c.code, n...) 480 | return nil 481 | } 482 | 483 | return fmt.Errorf("unhandled ADD instruction %v", i) 484 | } 485 | 486 | // Handle a call instruction 487 | func (c *Compiler) assembleCALL(i parser.Instruction) error { 488 | 489 | if i.Operands[0].Type != token.IDENTIFIER { 490 | return fmt.Errorf("we only support CALL to labels at the moment") 491 | } 492 | 493 | // emit the call 494 | c.code = append(c.code, 0xe8) 495 | 496 | c.calls[len(c.code)] = i.Operands[0].Literal 497 | c.code = append(c.code, []byte{0x00, 0x00, 0x00, 0x00}...) 
498 | 499 | return nil 500 | } 501 | 502 | // Handle a comparison 503 | func (c *Compiler) assembleCMP(i parser.Instruction) error { 504 | 505 | // We're only handling indirection at the moment 506 | if i.Operands[0].Type != token.REGISTER && 507 | i.Operands[0].Indirection != true && 508 | i.Operands[1].Type != token.NUMBER { 509 | return fmt.Errorf("we only support CMP size ptr [reg],NUMBER at the moment") 510 | } 511 | 512 | // The number we're comparing 513 | n, err := strconv.ParseInt(i.Operands[1].Literal, 0, 64) 514 | if err != nil { 515 | return err 516 | } 517 | 518 | // Register number 519 | r := byte(c.getreg(i.Operands[0].Literal)) 520 | 521 | // things we add 522 | bytes := []byte{} 523 | 524 | switch i.Operands[0].Size { 525 | 526 | case 8: 527 | bytes = []byte{0x80, 0x38 + r, byte(n)} 528 | case 16: 529 | bytes = []byte{0x66, 0x83, 0x38 + r} 530 | 531 | buf := make([]byte, 2) 532 | binary.LittleEndian.PutUint16(buf, uint16(n)) 533 | bytes = append(bytes, buf...) 534 | 535 | case 32, 64: 536 | bytes = []byte{0x83, 0x38 + r} 537 | 538 | buf := make([]byte, 4) 539 | binary.LittleEndian.PutUint32(buf, uint32(n)) 540 | bytes = append(bytes, buf...) 541 | 542 | default: 543 | return fmt.Errorf("unknown size in instruction %v", i.Operands[0]) 544 | } 545 | 546 | c.code = append(c.code, bytes...) 547 | return nil 548 | } 549 | 550 | // assembleDEC handles dec rax, rbx, etc. 551 | func (c *Compiler) assembleDEC(i parser.Instruction) error { 552 | 553 | // Decrement the contents of a register 554 | if i.Operands[0].Indirection == false { 555 | // prefix 556 | c.code = append(c.code, []byte{0x48, 0xff}...) 557 | 558 | // register name 559 | reg := 0xc0 + c.getreg(i.Operands[0].Literal) 560 | c.code = append(c.code, byte(reg)) 561 | 562 | return nil 563 | } 564 | 565 | // indirect: byte 566 | if i.Operands[0].Size == 8 { 567 | // prefix 568 | c.code = append(c.code, []byte{0x67, 0xfe}...) 
569 | 570 | // register name 571 | reg := c.getreg(i.Operands[0].Literal) 572 | reg += 0x08 573 | c.code = append(c.code, byte(reg)) 574 | 575 | return nil 576 | } 577 | 578 | // indirect: word 579 | if i.Operands[0].Size == 16 { 580 | // prefix 581 | c.code = append(c.code, []byte{0x67, 0x66, 0xff}...) 582 | 583 | // register name 584 | reg := c.getreg(i.Operands[0].Literal) 585 | reg += 0x08 586 | c.code = append(c.code, byte(reg)) 587 | 588 | return nil 589 | } 590 | 591 | // indirect: double word 592 | if i.Operands[0].Size == 32 || i.Operands[0].Size == 64 { 593 | // prefix 594 | c.code = append(c.code, []byte{0x67, 0xff}...) 595 | 596 | // register name 597 | reg := c.getreg(i.Operands[0].Literal) 598 | reg += 0x08 599 | c.code = append(c.code, byte(reg)) 600 | 601 | return nil 602 | } 603 | 604 | return fmt.Errorf("unknown argument for DEC %v", i) 605 | } 606 | 607 | // assembleINC handles inc rax, rbx, etc. 608 | func (c *Compiler) assembleINC(i parser.Instruction) error { 609 | 610 | // Increment the contents of a register 611 | if i.Operands[0].Indirection == false { 612 | // prefix 613 | c.code = append(c.code, []byte{0x48, 0xff}...) 614 | 615 | // register name 616 | reg := 0xc0 + c.getreg(i.Operands[0].Literal) 617 | c.code = append(c.code, byte(reg)) 618 | 619 | return nil 620 | } 621 | 622 | // indirect: byte 623 | if i.Operands[0].Size == 8 { 624 | // prefix 625 | c.code = append(c.code, []byte{0x67, 0xfe}...) 626 | 627 | // register name 628 | reg := c.getreg(i.Operands[0].Literal) 629 | c.code = append(c.code, byte(reg)) 630 | 631 | return nil 632 | } 633 | 634 | // indirect: word 635 | if i.Operands[0].Size == 16 { 636 | // prefix 637 | c.code = append(c.code, []byte{0x67, 0x66, 0xff}...) 
638 | 639 | // register name 640 | reg := c.getreg(i.Operands[0].Literal) 641 | c.code = append(c.code, byte(reg)) 642 | 643 | return nil 644 | } 645 | 646 | // indirect: double word 647 | if i.Operands[0].Size == 32 || i.Operands[0].Size == 64 { 648 | // prefix 649 | c.code = append(c.code, []byte{0x67, 0xff}...) 650 | 651 | // register name 652 | reg := c.getreg(i.Operands[0].Literal) 653 | c.code = append(c.code, byte(reg)) 654 | 655 | return nil 656 | } 657 | 658 | return fmt.Errorf("unknown argument for INC %v", i) 659 | } 660 | 661 | // assembleJMP handles all the jump instructions 662 | // 663 | // NOTE We have to fixup the offsets here. 664 | func (c *Compiler) assembleJMP(i parser.Instruction) error { 665 | 666 | var byte byte 667 | 668 | switch i.Instruction { 669 | case "jmp": 670 | byte = 0xeb 671 | case "je", "jz": 672 | byte = 0x74 673 | case "jne", "jnz": 674 | byte = 0x75 675 | default: 676 | return fmt.Errorf("unknown jmp type") 677 | } 678 | 679 | // Ensure we're jumping to a label 680 | if i.Operands[0].Type != token.IDENTIFIER { 681 | return fmt.Errorf("we only support jumps to labels at the moment") 682 | } 683 | 684 | // emit the instruction and make a note of the fixup to make 685 | c.code = append(c.code, byte) 686 | c.jmps[len(c.code)] = i.Operands[0].Literal 687 | c.code = append(c.code, 0x00) // empty displacement 688 | 689 | return nil 690 | } 691 | 692 | func (c *Compiler) assembleMov(i parser.Instruction, label bool) error { 693 | 694 | // 695 | // Are we moving a register to another register? 696 | // 697 | // No indirection 698 | // 699 | if i.Operands[0].Type == token.REGISTER && 700 | i.Operands[0].Indirection == false && 701 | i.Operands[1].Type == token.REGISTER && 702 | i.Operands[1].Indirection == false { 703 | 704 | c.code = append(c.code, []byte{0x48, 0x89}...) 
705 | out := c.calcRM(i.Operands[0].Literal, i.Operands[1].Literal) 706 | c.code = append(c.code, out) 707 | return nil 708 | 709 | } 710 | 711 | // 712 | // Are we moving a number to a register ? 713 | // 714 | if i.Operands[0].Type == token.REGISTER && 715 | i.Operands[0].Indirection == false && 716 | i.Operands[1].Type == token.NUMBER { 717 | 718 | // prefix 719 | c.code = append(c.code, []byte{0x48, 0xc7}...) 720 | 721 | // register name 722 | reg := 0xc0 + c.getreg(i.Operands[0].Literal) 723 | c.code = append(c.code, byte(reg)) 724 | 725 | // value 726 | n, err := c.argToByteArray(i.Operands[1].Token) 727 | if err != nil { 728 | return err 729 | } 730 | 731 | // hack 732 | if label { 733 | c.patches[len(c.code)], _ = strconv.Atoi(i.Operands[1].Literal) 734 | } 735 | c.code = append(c.code, n...) 736 | return nil 737 | } 738 | 739 | // mov $reg, $id 740 | if i.Operands[0].Type == token.REGISTER && 741 | i.Operands[0].Indirection == false && 742 | i.Operands[1].Type == token.IDENTIFIER { 743 | 744 | // 745 | // Lookup the identifier, and if we can find it 746 | // then we will treat it as a constant 747 | // 748 | name := i.Operands[1].Literal 749 | val, ok := c.dataOffsets[name] 750 | if ok { 751 | 752 | i.Operands[1].Type = token.NUMBER 753 | i.Operands[1].Literal = fmt.Sprintf("%d", val) 754 | return c.assembleMov(i, true) 755 | } 756 | return fmt.Errorf("reference to unknown label/data: %v", i.Operands[1]) 757 | } 758 | 759 | // Storing a value in an address 760 | if i.Operands[0].Type == token.REGISTER && 761 | i.Operands[0].Indirection && 762 | i.Operands[1].Type == token.NUMBER { 763 | 764 | // The number we're setting 765 | n, err := strconv.ParseInt(i.Operands[1].Literal, 0, 64) 766 | if err != nil { 767 | return err 768 | } 769 | 770 | // Register number 771 | r := byte(c.getreg(i.Operands[0].Literal)) 772 | 773 | // things we add 774 | bytes := []byte{} 775 | 776 | switch i.Operands[0].Size { 777 | 778 | case 8: 779 | bytes = []byte{0xc6, r, byte(n)} 
780 | case 16: 781 | bytes = []byte{0x66, 0xc7, byte(r)} 782 | 783 | buf := make([]byte, 2) 784 | binary.LittleEndian.PutUint16(buf, uint16(n)) 785 | bytes = append(bytes, buf...) 786 | 787 | case 32, 64: 788 | bytes = []byte{0xc7, r} 789 | 790 | buf := make([]byte, 4) 791 | binary.LittleEndian.PutUint32(buf, uint32(n)) 792 | bytes = append(bytes, buf...) 793 | 794 | default: 795 | return fmt.Errorf("unknown size in instruction %v", i.Operands[0]) 796 | } 797 | 798 | c.code = append(c.code, bytes...) 799 | 800 | return nil 801 | } 802 | 803 | // 804 | // HACK 805 | // 806 | // mov rax, [REG] 807 | if i.Operands[0].Type == token.REGISTER && 808 | i.Operands[0].Literal == "rax" && 809 | i.Operands[1].Type == token.REGISTER && 810 | i.Operands[1].Indirection { 811 | 812 | // Register number 813 | r := byte(c.getreg(i.Operands[0].Literal)) 814 | c.code = append(c.code, []byte{0x8a, r}...) 815 | return nil 816 | } 817 | 818 | return fmt.Errorf("unknown MOV instruction: %v", i) 819 | 820 | } 821 | 822 | // assemblePop would compile "pop offset", and "push 0x1234" 823 | func (c *Compiler) assemblePop(i parser.Instruction) error { 824 | 825 | // known pop-types 826 | table := make(map[string][]byte) 827 | table["rax"] = []byte{0x58} 828 | table["rbx"] = []byte{0x5b} 829 | table["rcx"] = []byte{0x59} 830 | table["rdx"] = []byte{0x5a} 831 | table["rbp"] = []byte{0x5d} 832 | table["rsp"] = []byte{0x5c} 833 | table["rsi"] = []byte{0x5e} 834 | table["rdi"] = []byte{0x5f} 835 | table["r8"] = []byte{0x41, 0x58} 836 | table["r9"] = []byte{0x41, 0x59} 837 | table["r10"] = []byte{0x41, 0x5a} 838 | table["r11"] = []byte{0x41, 0x5b} 839 | table["r12"] = []byte{0x41, 0x5c} 840 | table["r13"] = []byte{0x41, 0x5d} 841 | table["r14"] = []byte{0x41, 0x5e} 842 | table["r15"] = []byte{0x41, 0x5f} 843 | 844 | // Is this "pop rax|rbx..|rdx", or something in the table? 
845 | if i.Operands[0].Type == token.REGISTER { 846 | bytes, ok := table[i.Operands[0].Literal] 847 | if ok { 848 | c.code = append(c.code, bytes...) 849 | return nil 850 | } 851 | return fmt.Errorf("unknown register in 'pop'") 852 | } 853 | 854 | return fmt.Errorf("unknown pop-type: %v", i) 855 | 856 | } 857 | 858 | // assemblePush would compile "push offset", and "push 0x1234" 859 | func (c *Compiler) assemblePush(i parser.Instruction) error { 860 | 861 | // Is this a number? Just output it 862 | if i.Operands[0].Type == token.NUMBER { 863 | n, err := c.argToByteArray(i.Operands[1].Token) 864 | if err != nil { 865 | return err 866 | } 867 | c.code = append(c.code, 0x68) 868 | c.code = append(c.code, n...) 869 | return nil 870 | } 871 | 872 | // Is this a label? 873 | if i.Operands[0].Type == token.IDENTIFIER { 874 | 875 | c.code = append(c.code, 0x68) 876 | 877 | c.labelTargets[len(c.code)] = i.Operands[0].Literal 878 | 879 | c.code = append(c.code, []byte{0x0, 0x0, 0x0, 0x0}...) 880 | return nil 881 | } 882 | 883 | // is this a register? 884 | table := make(map[string][]byte) 885 | table["rax"] = []byte{0x50} 886 | table["rcx"] = []byte{0x51} 887 | table["rdx"] = []byte{0x52} 888 | table["rbx"] = []byte{0x53} 889 | table["rsp"] = []byte{0x54} 890 | table["rbp"] = []byte{0x55} 891 | table["rsi"] = []byte{0x56} 892 | table["rdi"] = []byte{0x57} 893 | table["r8"] = []byte{0x41, 0x50} 894 | table["r9"] = []byte{0x41, 0x51} 895 | table["r10"] = []byte{0x41, 0x52} 896 | table["r11"] = []byte{0x41, 0x53} 897 | table["r12"] = []byte{0x41, 0x54} 898 | table["r13"] = []byte{0x41, 0x55} 899 | table["r14"] = []byte{0x41, 0x56} 900 | table["r15"] = []byte{0x41, 0x57} 901 | 902 | // Is this "push rax|rbx..|rdx", or something in the table? 903 | if i.Operands[0].Type == token.REGISTER { 904 | bytes, ok := table[i.Operands[0].Literal] 905 | if ok { 906 | c.code = append(c.code, bytes...) 
907 | return nil 908 | } 909 | return fmt.Errorf("unknown register in 'push'") 910 | } 911 | 912 | return fmt.Errorf("unknown push-type: %v", i) 913 | } 914 | 915 | // assembleSUB handles subtraction. 916 | func (c *Compiler) assembleSUB(i parser.Instruction) error { 917 | 918 | // Two registers subtracted? 919 | if i.Operands[0].Type == token.REGISTER && 920 | i.Operands[1].Type == token.REGISTER { 921 | c.code = append(c.code, []byte{0x48, 0x29}...) 922 | out := c.calcRM(i.Operands[0].Literal, i.Operands[1].Literal) 923 | c.code = append(c.code, out) 924 | return nil 925 | } 926 | 927 | // OK number subtracted from a register? 928 | if i.Operands[0].Type == token.REGISTER && 929 | i.Operands[1].Type == token.NUMBER { 930 | 931 | // Convert the integer to a four-byte/64-bit value 932 | n, err := c.argToByteArray(i.Operands[1].Token) 933 | if err != nil { 934 | return err 935 | } 936 | 937 | // Work out the register 938 | switch i.Operands[0].Literal { 939 | case "rax": 940 | c.code = append(c.code, []byte{0x48, 0x2d}...) 941 | case "rbx": 942 | c.code = append(c.code, []byte{0x48, 0x81, 0xeb}...) 943 | case "rcx": 944 | c.code = append(c.code, []byte{0x48, 0x81, 0xe9}...) 945 | case "rdx": 946 | c.code = append(c.code, []byte{0x48, 0x81, 0xea}...) 947 | default: 948 | return fmt.Errorf("SUB %s, number not implemented", i.Operands[0].Literal) 949 | } 950 | 951 | // Now append the value 952 | c.code = append(c.code, n...) 953 | return nil 954 | } 955 | 956 | return fmt.Errorf("unhandled SUB instruction %v", i) 957 | } 958 | 959 | // assembleXOR handles xor rax, rbx, etc. 960 | func (c *Compiler) assembleXOR(i parser.Instruction) error { 961 | 962 | // Two registers xor'd? 963 | if i.Operands[0].Type == token.REGISTER && 964 | i.Operands[1].Type == token.REGISTER { 965 | c.code = append(c.code, []byte{0x48, 0x31}...) 
const (
	// virtualStartAddress is the address at which the file is loaded.
	virtualStartAddress uint64 = 0x400000

	// dataVirtualStartAddress is the base used for the address of the
	// data segment.
	dataVirtualStartAddress uint64 = 0x600000

	// alignment is the alignment recorded for both segments.
	alignment uint64 = 0x200000
)

// Builder accumulates the bytes of the file we're producing.
type Builder struct {
	o []byte
}

// WriteBytes appends the given bytes to the output.
func (b *Builder) WriteBytes(bs ...byte) {
	b.o = append(b.o, bs...)
}

// WriteValue appends `size` bytes holding `value` in little-endian order.
//
// If size is less than eight only the low-order bytes of the value are
// written, and if it is greater than eight the value is padded with
// zero-bytes.  A size of zero, or less, writes nothing.
func (b *Builder) WriteValue(size int, value uint64) {
	if size <= 0 {
		return
	}

	// Encode the complete value, and then take as much as was asked
	// for; encoding directly into a buffer smaller than eight bytes
	// would panic.
	var full [8]byte
	binary.LittleEndian.PutUint64(full[:], value)

	buf := make([]byte, size)
	copy(buf, full[:])
	b.WriteBytes(buf...)
}

// Elf creates minimal, statically-linked, 64-bit ELF executables.
type Elf struct {
}

// New creates a new Elf object.
func New() *Elf {
	return &Elf{}
}

// WriteContent builds an executable from the given text and data
// sections, and writes it to the specified path with mode 0755.
func (e *Elf) WriteContent(path string, textSection, dataSection []byte) error {
	return ioutil.WriteFile(path, e.buildELF(textSection, dataSection), 0755)
}

// buildELF returns the bytes of a complete executable.
//
// The layout is: the ELF header (0x40 bytes), two program headers (0x38
// bytes each), the text section, and finally the data section.
func (e *Elf) buildELF(textSection, dataSection []byte) []byte {
	textSize := uint64(len(textSection))

	// Size of ELF header + 2 * size of a program header.
	textOffset := uint64(0x40 + (2 * 0x38))

	var o Builder

	// Build ELF Header
	o.WriteBytes(0x7f, 0x45, 0x4c, 0x46) // ELF magic value

	o.WriteBytes(0x02) // 64-bit executable
	o.WriteBytes(0x01) // Little endian
	o.WriteBytes(0x01) // ELF version
	o.WriteBytes(0x00) // Target OS ABI
	o.WriteBytes(0x00) // Further specify ABI version

	o.WriteBytes(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00) // Unused bytes

	o.WriteBytes(0x02, 0x00)             // Executable type
	o.WriteBytes(0x3e, 0x00)             // x86-64 target architecture
	o.WriteBytes(0x01, 0x00, 0x00, 0x00) // ELF version

	// Entry point: the first byte of the text section.
	//
	// 64-bit virtual offsets always start at 0x400000?? https://stackoverflow.com/questions/38549972/why-elf-executables-have-a-fixed-load-address
	// This seems to be a convention set in the x86_64 system-v abi: https://refspecs.linuxfoundation.org/elf/x86_64-SysV-psABI.pdf P26
	o.WriteValue(8, virtualStartAddress+textOffset)

	o.WriteBytes(0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00) // Offset from file to program header
	o.WriteBytes(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00) // Start of section header table
	o.WriteBytes(0x00, 0x00, 0x00, 0x00)                         // Flags
	o.WriteBytes(0x40, 0x00)                                     // Size of this header
	o.WriteBytes(0x38, 0x00)                                     // Size of a program header table entry - This should always be the same for 64-bit
	o.WriteBytes(0x02, 0x00)                                     // Number of program headers: text and data for now
	o.WriteBytes(0x00, 0x00)                                     // Size of section header, which we aren't using
	o.WriteBytes(0x00, 0x00)                                     // Number of entries section header
	o.WriteBytes(0x00, 0x00)                                     // Index of section header table entry

	// Build Program Header
	// Text Segment
	//
	// NOTE(review): this segment starts at file-offset zero, so it
	// covers the headers too, yet its size is only that of the text
	// section - confirm that is intended.
	o.WriteBytes(0x01, 0x00, 0x00, 0x00) // PT_LOAD, loadable segment. Both data and text segment use this.
	o.WriteBytes(0x07, 0x00, 0x00, 0x00) // Flags: 0x1 execute, 0x2 write, 0x4 read
	o.WriteValue(8, 0)                   // Offset from the beginning of the file.
	o.WriteValue(8, virtualStartAddress) // Virtual address.
	o.WriteValue(8, virtualStartAddress) // Physical address, irrelevant on linux.
	o.WriteValue(8, textSize)            // Number of bytes in file image of segment, must be larger than or equal to the size of payload in segment. Should be zero for bss data.
	o.WriteValue(8, textSize)            // Number of bytes in memory image of segment, is not always same size as file image.
	o.WriteValue(8, alignment)

	dataSize := uint64(len(dataSection))
	dataOffset := uint64(textOffset + textSize)
	dataVirtualAddress := dataVirtualStartAddress + dataOffset

	// Build Program Header
	// Data Segment
	o.WriteBytes(0x01, 0x00, 0x00, 0x00) // PT_LOAD, loadable segment. Both data and text segment use this.
	o.WriteBytes(0x07, 0x00, 0x00, 0x00) // Flags: 0x1 execute, 0x2 write, 0x4 read
	o.WriteValue(8, dataOffset)          // Offset address.
	o.WriteValue(8, dataVirtualAddress)  // Virtual address.
	o.WriteValue(8, dataVirtualAddress)  // Physical address.
	o.WriteValue(8, dataSize)            // Number of bytes in file image.
	o.WriteValue(8, dataSize)            // Number of bytes in memory image.
	o.WriteValue(8, alignment)

	// Output the text segment
	o.WriteBytes(textSection...)
	// Output the data segment
	o.WriteBytes(dataSection...)
	return o.o
}
2 | nop ; Nothing happens 3 | mov rbx,31 ; first syscall argument: exit code 4 | mov rax,1 ; system call number (sys_exit) 5 | int 0x80 ; call kernel 6 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/skx/assembler 2 | 3 | go 1.14 4 | -------------------------------------------------------------------------------- /hello.asm: -------------------------------------------------------------------------------- 1 | ;; Output some text to the console. 2 | ;; 3 | ;; This example demonstrates using sys_write, and sys_exit 4 | ;; 5 | ;; For less duplication see the code in `call.asm`. 6 | ;; 7 | 8 | .hello DB "Hello, world\n" 9 | .goodbye DB "Goodbye, world\n" 10 | 11 | mov rdx, 13 ;; write this many characters 12 | mov rcx, hello ;; starting at the string 13 | mov rbx, 1 ;; write to STDOUT 14 | mov rax, 4 ;; sys_write 15 | int 0x80 ;; syscall 16 | 17 | mov rdx, 15 ;; write this many characters 18 | mov rcx, goodbye ;; starting at the string 19 | mov rax, 4 ;; sys_write 20 | mov rbx, 1 ;; write to STDOUT 21 | int 0x80 ;; syscall 22 | 23 | xor rbx, rbx ;; exit-code is 0 24 | mov rax, 0x01 ;; sys_exit 25 | int 0x80 ;; syscall 26 | -------------------------------------------------------------------------------- /instructions/instructions.go: -------------------------------------------------------------------------------- 1 | // Package instructions contains the common instruction-definitions 2 | // for the instructions that we understand. 3 | // 4 | // These are abstracted here, so that you don't need to touch 5 | // the parser/lexer to add new instructions. 6 | // 7 | // Just add the instructions here, and update the compiler to emit the 8 | // appropriate code.
9 | package instructions 10 | 11 | var ( 12 | // InstructionLengths is a map that returns the number of operands 13 | // the given assembly-language operation will accept. 14 | // 15 | // For example a `nop` argument requires zero arguments so the 16 | // entry for that will be `0`. 17 | InstructionLengths map[string]int 18 | 19 | // Instructions is automatically generated from the InstructionLengths 20 | // map, and contains the known instruction-types we can lex, parse, and 21 | // compile. 22 | Instructions []string 23 | ) 24 | 25 | func init() { 26 | 27 | // Setup our instruction-lengths 28 | InstructionLengths = make(map[string]int) 29 | 30 | InstructionLengths["add"] = 2 31 | InstructionLengths["cmp"] = 2 32 | InstructionLengths["dec"] = 1 33 | InstructionLengths["inc"] = 1 34 | InstructionLengths["int"] = 1 35 | InstructionLengths["mov"] = 2 36 | InstructionLengths["nop"] = 0 37 | InstructionLengths["pop"] = 1 38 | InstructionLengths["push"] = 1 39 | InstructionLengths["ret"] = 0 40 | InstructionLengths["sub"] = 2 41 | InstructionLengths["xor"] = 2 42 | 43 | // call 44 | InstructionLengths["call"] = 1 45 | 46 | // jump 47 | InstructionLengths["je"] = 1 48 | InstructionLengths["jmp"] = 1 49 | InstructionLengths["jne"] = 1 50 | InstructionLengths["jnz"] = 1 51 | InstructionLengths["jz"] = 1 52 | 53 | // Processor control instructions 54 | InstructionLengths["clc"] = 0 55 | InstructionLengths["cld"] = 0 56 | InstructionLengths["cli"] = 0 57 | InstructionLengths["cmc"] = 0 58 | InstructionLengths["stc"] = 0 59 | InstructionLengths["std"] = 0 60 | InstructionLengths["sti"] = 0 61 | 62 | // Now record the known-instructions 63 | for k := range InstructionLengths { 64 | Instructions = append(Instructions, k) 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /jmp.asm: -------------------------------------------------------------------------------- 1 | ;; This is an example of control-flow. 
2 | ;; 3 | 4 | 5 | :start 6 | jmp foo 7 | 8 | :bar 9 | nop ; Nothing happens 10 | mov rbx,33 ; first syscall argument: exit code 11 | mov rax,1 ; system call number (sys_exit) 12 | int 0x80 ; call kernel 13 | 14 | :foo 15 | jmp bar 16 | -------------------------------------------------------------------------------- /lexer/lexer.go: -------------------------------------------------------------------------------- 1 | // Package lexer contains our lexer. 2 | package lexer 3 | 4 | import ( 5 | "errors" 6 | "fmt" 7 | "unicode" 8 | 9 | "github.com/skx/assembler/token" 10 | ) 11 | 12 | // Lexer holds our object-state. 13 | type Lexer struct { 14 | // The current character position 15 | position int 16 | 17 | // The next character position 18 | readPosition int 19 | 20 | // The current character 21 | ch rune 22 | 23 | // A rune slice of our input string 24 | characters []rune 25 | } 26 | 27 | // New creates a Lexer instance from the given string 28 | func New(input string) *Lexer { 29 | 30 | // Line counting starts at one. 31 | l := &Lexer{characters: []rune(input)} 32 | l.readChar() 33 | return l 34 | } 35 | 36 | // read forward one character. 37 | func (l *Lexer) readChar() { 38 | if l.readPosition >= len(l.characters) { 39 | l.ch = rune(0) 40 | } else { 41 | l.ch = l.characters[l.readPosition] 42 | } 43 | l.position = l.readPosition 44 | l.readPosition++ 45 | 46 | } 47 | 48 | // NextToken reads and returns the next token, skipping any intervening 49 | // white space, and swallowing any comments, in the process. 
50 | func (l *Lexer) NextToken() token.Token { 51 | var tok token.Token 52 | l.skipWhitespace() 53 | 54 | // skip single-line comments 55 | if l.ch == rune(';') || l.ch == rune('#') { 56 | l.skipComment() 57 | return (l.NextToken()) 58 | } 59 | 60 | switch l.ch { 61 | 62 | case rune(0): 63 | tok.Literal = "" 64 | tok.Type = token.EOF 65 | 66 | case rune(':'): 67 | label, err := l.readLabel() 68 | if err != nil { 69 | tok.Literal = err.Error() 70 | tok.Type = token.ILLEGAL 71 | } else { 72 | tok = token.Token{Type: token.LABEL, Literal: label} 73 | } 74 | 75 | case rune('.'): 76 | label, err := l.readLabel() 77 | if err != nil { 78 | tok.Literal = err.Error() 79 | tok.Type = token.ILLEGAL 80 | } else { 81 | tok = token.Token{Type: token.DATA, Literal: label} 82 | } 83 | 84 | case rune(','): 85 | tok = token.Token{Type: token.COMMA, Literal: ","} 86 | 87 | case rune('['): 88 | tok = token.Token{Type: token.LSQUARE, Literal: "["} 89 | 90 | case rune(']'): 91 | 92 | l.readChar() 93 | return (l.NextToken()) 94 | 95 | case rune('"'): 96 | str, err := l.readString('"') 97 | if err == nil { 98 | tok.Literal = str 99 | tok.Type = token.STRING 100 | } else { 101 | tok.Literal = err.Error() 102 | tok.Type = token.ILLEGAL 103 | } 104 | 105 | default: 106 | // Number? 107 | if isDigit(l.ch) { 108 | tok := l.readDecimal() 109 | return tok 110 | } 111 | 112 | // Instruction/Register 113 | tok.Literal = l.readIdentifier() 114 | if len(tok.Literal) > 0 { 115 | tok.Type = token.LookupIdentifier(tok.Literal) 116 | return tok 117 | } 118 | 119 | // Not an instruction/register (+LABEL) 120 | tok.Type = token.IDENTIFIER 121 | return tok 122 | 123 | } 124 | 125 | l.readChar() 126 | 127 | return tok 128 | } 129 | 130 | // readIdentifier is designed to read an identifier (name of variable, 131 | // function, etc). 
132 | func (l *Lexer) readIdentifier() string { 133 | 134 | id := "" 135 | 136 | for isIdentifier(l.ch) { 137 | id += string(l.ch) 138 | l.readChar() 139 | } 140 | return id 141 | } 142 | 143 | // skip over any white space. 144 | func (l *Lexer) skipWhitespace() { 145 | for isWhitespace(l.ch) { 146 | l.readChar() 147 | } 148 | } 149 | 150 | // skip a comment (until the end of the line). 151 | func (l *Lexer) skipComment() { 152 | for l.ch != '\n' && l.ch != rune(0) { 153 | l.readChar() 154 | } 155 | l.skipWhitespace() 156 | } 157 | 158 | // read a number. We only care about numerical digits here, floats will 159 | // be handled elsewhere. 160 | func (l *Lexer) readNumber() string { 161 | 162 | id := "" 163 | 164 | for isDigit(l.ch) || l.ch == rune('x') { 165 | id += string(l.ch) 166 | l.readChar() 167 | } 168 | return id 169 | } 170 | 171 | // read a decimal number, either int or floating-point. 172 | func (l *Lexer) readDecimal() token.Token { 173 | 174 | // 175 | // Read an integer-number. 176 | // 177 | integer := l.readNumber() 178 | 179 | // 180 | // Just an integer. 181 | // 182 | return token.Token{Type: token.NUMBER, Literal: integer} 183 | } 184 | 185 | // read a string, deliminated by the given character. 186 | func (l *Lexer) readString(delim rune) (string, error) { 187 | out := "" 188 | 189 | for { 190 | l.readChar() 191 | 192 | if l.ch == rune(0) { 193 | return "", fmt.Errorf("unterminated string") 194 | } 195 | if l.ch == delim { 196 | break 197 | } 198 | // 199 | // Handle \n, \r, \t, \", etc. 200 | // 201 | if l.ch == '\\' { 202 | 203 | // Line ending with "\" + newline 204 | if l.peekChar() == '\n' { 205 | // consume the newline. 
206 | l.readChar() 207 | continue 208 | } 209 | 210 | l.readChar() 211 | 212 | if l.ch == rune(0) { 213 | return "", errors.New("unterminated string") 214 | } 215 | if l.ch == rune('n') { 216 | l.ch = '\n' 217 | } 218 | if l.ch == rune('0') { 219 | l.ch = rune(0) 220 | } 221 | if l.ch == rune('r') { 222 | l.ch = '\r' 223 | } 224 | if l.ch == rune('t') { 225 | l.ch = '\t' 226 | } 227 | if l.ch == rune('"') { 228 | l.ch = '"' 229 | } 230 | if l.ch == rune('\\') { 231 | l.ch = '\\' 232 | } 233 | } 234 | out = out + string(l.ch) 235 | } 236 | 237 | return out, nil 238 | } 239 | 240 | // read a label 241 | func (l *Lexer) readLabel() (string, error) { 242 | out := "" 243 | 244 | for { 245 | l.readChar() 246 | 247 | if l.ch == rune(0) { 248 | if len(out) > 1 { 249 | return out, nil 250 | } 251 | return "", fmt.Errorf("unterminated label") 252 | } 253 | if isWhitespace(l.ch) { 254 | return out, nil 255 | } 256 | out = out + string(l.ch) 257 | } 258 | } 259 | 260 | // determinate ch is identifier or not. Identifiers may be alphanumeric, 261 | // but they must start with a letter. Here that works because we are only 262 | // called if the first character is alphabetical. 
263 | func isIdentifier(ch rune) bool { 264 | if unicode.IsLetter(ch) || unicode.IsDigit(ch) || ch == '$' || ch == '_' || ch == '-' { 265 | return true 266 | } 267 | return false 268 | } 269 | 270 | // is white space 271 | func isWhitespace(ch rune) bool { 272 | return ch == rune(' ') || ch == rune('\t') || ch == rune('\n') || ch == rune('\r') 273 | } 274 | 275 | // is Digit 276 | func isDigit(ch rune) bool { 277 | return rune('0') <= ch && ch <= rune('9') 278 | } 279 | 280 | // peek character 281 | func (l *Lexer) peekChar() rune { 282 | if l.readPosition >= len(l.characters) { 283 | return rune(0) 284 | } 285 | return l.characters[l.readPosition] 286 | } 287 | -------------------------------------------------------------------------------- /lexer/lexer_test.go: -------------------------------------------------------------------------------- 1 | package lexer 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/skx/assembler/token" 7 | ) 8 | 9 | func TestComment(t *testing.T) { 10 | 11 | n := New(`; This is a comment 12 | # So is this`) 13 | 14 | tok := n.NextToken() 15 | if tok.Type != token.EOF { 16 | t.Errorf("expected end of file") 17 | } 18 | } 19 | 20 | func TestData(t *testing.T) { 21 | 22 | input := `.foo 23 | .` 24 | 25 | tests := []struct { 26 | expectedType token.Type 27 | expectedLiteral string 28 | }{ 29 | {token.DATA, "foo"}, 30 | {token.ILLEGAL, "unterminated label"}, 31 | {token.EOF, ""}, 32 | } 33 | 34 | l := New(input) 35 | for i, tt := range tests { 36 | tok := l.NextToken() 37 | if tok.Type != tt.expectedType { 38 | t.Fatalf("tests[%d] - tokentype wrong, expected=%q, got=%q", i, tt.expectedType, tok.Type) 39 | } 40 | if tok.Literal != tt.expectedLiteral { 41 | t.Fatalf("tests[%d] - Literal wrong, expected=%q, got=%q", i, tt.expectedLiteral, tok.Literal) 42 | } 43 | } 44 | 45 | } 46 | 47 | func TestMov(t *testing.T) { 48 | 49 | input := ` 50 | ;; Two move instructions 51 | mov rax, rcx 52 | mov rbx, 33 53 | ` 54 | 55 | tests := []struct { 56 | 
expectedType token.Type 57 | expectedLiteral string 58 | }{ 59 | {token.INSTRUCTION, "mov"}, 60 | {token.REGISTER, "rax"}, 61 | {token.COMMA, ","}, 62 | {token.REGISTER, "rcx"}, 63 | 64 | {token.INSTRUCTION, "mov"}, 65 | {token.REGISTER, "rbx"}, 66 | {token.COMMA, ","}, 67 | {token.NUMBER, "33"}, 68 | {token.EOF, ""}, 69 | } 70 | 71 | l := New(input) 72 | for i, tt := range tests { 73 | tok := l.NextToken() 74 | if tok.Type != tt.expectedType { 75 | t.Fatalf("tests[%d] - tokentype wrong, expected=%q, got=%q", i, tt.expectedType, tok.Type) 76 | } 77 | if tok.Literal != tt.expectedLiteral { 78 | t.Fatalf("tests[%d] - Literal wrong, expected=%q, got=%q", i, tt.expectedLiteral, tok.Literal) 79 | } 80 | } 81 | 82 | } 83 | 84 | func TestLabel(t *testing.T) { 85 | 86 | input := ` 87 | :name 88 | :` 89 | 90 | tests := []struct { 91 | expectedType token.Type 92 | expectedLiteral string 93 | }{ 94 | {token.LABEL, "name"}, 95 | {token.ILLEGAL, "unterminated label"}, 96 | 97 | {token.EOF, ""}, 98 | } 99 | 100 | l := New(input) 101 | for i, tt := range tests { 102 | tok := l.NextToken() 103 | if tok.Type != tt.expectedType { 104 | t.Fatalf("tests[%d] - tokentype wrong, expected=%q, got=%q", i, tt.expectedType, tok.Type) 105 | } 106 | if tok.Literal != tt.expectedLiteral { 107 | t.Fatalf("tests[%d] - Literal wrong, expected=%q, got=%q", i, tt.expectedLiteral, tok.Literal) 108 | } 109 | } 110 | } 111 | 112 | func TestString(t *testing.T) { 113 | 114 | input := ` 115 | .foo DB "Steve\r\n\t\"\\" 116 | .test DB "steve\ 117 | kemp" 118 | .bar DB "Open\` 119 | 120 | tests := []struct { 121 | expectedType token.Type 122 | expectedLiteral string 123 | }{ 124 | {token.DATA, "foo"}, 125 | {token.DB, "DB"}, 126 | {token.STRING, "Steve\r\n\t\"\\"}, 127 | 128 | {token.DATA, "test"}, 129 | {token.DB, "DB"}, 130 | {token.STRING, "steve kemp"}, 131 | 132 | {token.DATA, "bar"}, 133 | {token.DB, "DB"}, 134 | {token.ILLEGAL, "unterminated string"}, 135 | 136 | {token.EOF, ""}, 137 | } 138 | 139 | 
l := New(input) 140 | for i, tt := range tests { 141 | tok := l.NextToken() 142 | if tok.Type != tt.expectedType { 143 | t.Fatalf("tests[%d] - tokentype wrong, expected=%q, got=%q", i, tt.expectedType, tok.Type) 144 | } 145 | if tok.Literal != tt.expectedLiteral { 146 | t.Fatalf("tests[%d] - Literal wrong, expected=%q, got=%q", i, tt.expectedLiteral, tok.Literal) 147 | } 148 | } 149 | 150 | } 151 | 152 | func TestBrackets(t *testing.T) { 153 | 154 | // Note "[" is emitted as you expect, but "]" is swallowed. 155 | input := `mov eax, [eax]` 156 | 157 | tests := []struct { 158 | expectedType token.Type 159 | expectedLiteral string 160 | }{ 161 | {token.INSTRUCTION, "mov"}, 162 | {token.IDENTIFIER, "eax"}, 163 | {token.COMMA, ","}, 164 | {token.LSQUARE, "["}, 165 | {token.IDENTIFIER, "eax"}, 166 | {token.EOF, ""}, 167 | } 168 | 169 | l := New(input) 170 | for i, tt := range tests { 171 | tok := l.NextToken() 172 | if tok.Type != tt.expectedType { 173 | t.Fatalf("tests[%d] - tokentype wrong, expected=%q, got=%q", i, tt.expectedType, tok.Type) 174 | } 175 | if tok.Literal != tt.expectedLiteral { 176 | t.Fatalf("tests[%d] - Literal wrong, expected=%q, got=%q", i, tt.expectedLiteral, tok.Literal) 177 | } 178 | } 179 | 180 | } 181 | -------------------------------------------------------------------------------- /parser/ast.go: -------------------------------------------------------------------------------- 1 | package parser 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/skx/assembler/token" 7 | ) 8 | 9 | // Node is something we return from our parser. 10 | type Node interface { 11 | // Output this as a readable string 12 | String() string 13 | } 14 | 15 | // Error contains an error-message 16 | type Error struct { 17 | Node 18 | Value string 19 | } 20 | 21 | // String outputs this Error structure as a string. 
22 | func (e Error) String() string { 23 | return fmt.Sprintf("", e.Value) 24 | } 25 | 26 | // Data holds a data-statement, which might look like either of these: 27 | // 28 | // .foo DB "Steve" 29 | // .bar DB 0x030, 0x40, 0x90 30 | // 31 | type Data struct { 32 | Node 33 | 34 | // Name is the name of the data-section 35 | Name string 36 | 37 | // Contents holds the string/byte data for the reference 38 | Contents []byte 39 | } 40 | 41 | // String outputs this Data structure as a string. 42 | func (d Data) String() string { 43 | return fmt.Sprintf("", d.Name, d.Contents) 44 | } 45 | 46 | // Operand is used to hold the operand for an instruction. 47 | // 48 | // Some instructions have zero operands (e.g. `nop`), others have 49 | // one (e.g. `inc rax`), and finally we have several which take two 50 | // operands (e.g. `mov rax, rbx`). 51 | // 52 | type Operand struct { 53 | // Token contains our parent token. 54 | token.Token 55 | 56 | // If we're operating upon memory-addresses we need to be 57 | // able to understand the size of the thing we're operating 58 | // upon. 59 | // 60 | // For example `inc byte ptr [rax]` will increment a byte, 61 | // or 8 bits. We have different define sizes available to us: 62 | // 63 | // byte -> 8 bits. 64 | // word -> 16 bits. 65 | // dword -> 32 bites. 66 | // qword -> 64 bites. 67 | Size int 68 | 69 | // Is indirection used? 70 | // 71 | // i.e. `rax` has no indirection, but `[rax]` does. 72 | Indirection bool 73 | } 74 | 75 | // Instruction holds a parsed instruction. 76 | // 77 | // For example "mov rax, rax". 78 | // 79 | type Instruction struct { 80 | Node 81 | 82 | // Instruction holds the instruction we've found, as a string. 83 | Instruction string 84 | 85 | // Operands holds the operands for this instruction. 86 | // 87 | // Operands will include numbers, registers, and indrected registers. 
88 | Operands []Operand 89 | } 90 | 91 | // String outputs this Error structure as a string 92 | func (d Instruction) String() string { 93 | return fmt.Sprintf("", d.Instruction, d.Operands) 94 | } 95 | 96 | // Label holds a label, as seen when it is defined. 97 | // 98 | // For example ":foo" will define a label with name "foo". 99 | type Label struct { 100 | Node 101 | 102 | // Name has the name of the instruction 103 | Name string 104 | } 105 | 106 | // String outputs this Label structure as a string. 107 | func (l Label) String() string { 108 | return fmt.Sprintf("", l.Name) 109 | } 110 | -------------------------------------------------------------------------------- /parser/parser.go: -------------------------------------------------------------------------------- 1 | // Package parser consumes tokens from the lexer, and generates the AST 2 | // which is then walked to generate binary code. 3 | package parser 4 | 5 | import ( 6 | "fmt" 7 | "strconv" 8 | 9 | "github.com/skx/assembler/instructions" 10 | "github.com/skx/assembler/lexer" 11 | "github.com/skx/assembler/token" 12 | ) 13 | 14 | // Parser holds our state. 15 | type Parser struct { 16 | // program holds our lexed program, as a series of tokens. 17 | program []token.Token 18 | 19 | // position holds our current offset within the program 20 | // above. 21 | position int 22 | } 23 | 24 | // New creates a new Parser, which will parse the specified 25 | // input program into a series of tokens, and then allow it 26 | // to be parsed. 27 | func New(input string) *Parser { 28 | 29 | // Create our parser 30 | p := &Parser{} 31 | 32 | // Create the lexer object. 
33 | l := lexer.New(input) 34 | 35 | // Parse our program into a series of tokens 36 | tok := l.NextToken() 37 | for tok.Type != token.EOF { 38 | p.program = append(p.program, tok) 39 | tok = l.NextToken() 40 | } 41 | 42 | // Now we have a parser complete with a series of tokens 43 | return p 44 | 45 | } 46 | 47 | // Next returns the stream of parsed "things" from the input source program. 48 | // 49 | // The things we return include: 50 | // 51 | // * Instructions. 52 | // * Label definitions. 53 | // * Data references. 54 | // 55 | // There might be more things in the future. 56 | func (p *Parser) Next() Node { 57 | 58 | // Loop until we've exhausted our input. 59 | for p.position < len(p.program) { 60 | 61 | // The token we're operating upon 62 | tok := p.program[p.position] 63 | 64 | switch tok.Type { 65 | 66 | case token.DATA: 67 | return p.parseData() 68 | 69 | case token.INSTRUCTION: 70 | return p.parseInstruction() 71 | 72 | case token.LABEL: 73 | return p.parseLabel() 74 | 75 | case token.RSQUARE: 76 | p.position++ 77 | 78 | default: 79 | fmt.Printf("Unhandled thing - definite bug: %v\n", tok) 80 | } 81 | } 82 | 83 | return nil 84 | } 85 | 86 | // parseData handles input of the form: 87 | // 88 | // .NAME DB "String content here" 89 | // 90 | // TODO: 91 | // 92 | // .NAME DB 0x01, 0x02, 0x03 ... 93 | func (p *Parser) parseData() Node { 94 | 95 | // create the data-structure, with the name. 
96 | d := Data{Name: p.program[p.position].Literal} 97 | 98 | // skip the DATA 99 | p.position++ 100 | 101 | // ensure we're not out of the program 102 | if p.position >= len(p.program) { 103 | return Error{Value: "Unexpected EOF parsing data"} 104 | } 105 | 106 | // Next token should be DB 107 | db := p.program[p.position] 108 | if db.Type != token.DB { 109 | return Error{Value: fmt.Sprintf("expected DB, got %v", db)} 110 | } 111 | 112 | // move forward 113 | p.position++ 114 | if p.position >= len(p.program) { 115 | return Error{Value: "Unexpected EOF parsing data"} 116 | } 117 | 118 | // 119 | // We support: 120 | // .foo DB "String" 121 | // 122 | // Or 123 | // .foo DB 0x03, 0x4... 124 | // 125 | // If the next token is a string handle that. 126 | cur := p.program[p.position] 127 | if cur.Type == token.STRING { 128 | // bump past the string 129 | p.position++ 130 | 131 | d.Contents = []byte(cur.Literal) 132 | return d 133 | } 134 | 135 | // If the type isn't a number that's an error 136 | if cur.Type != token.NUMBER { 137 | return Error{Value: fmt.Sprintf("expected string|number-array, got %v", cur)} 138 | } 139 | 140 | // OK so we've got number 141 | for cur.Type == token.NUMBER { 142 | 143 | // Parse it 144 | num, err := strconv.ParseInt(cur.Literal, 0, 64) 145 | if err != nil { 146 | return Error{Value: fmt.Sprintf("failed to convert '%s' to number:%s", cur.Literal, err)} 147 | } 148 | 149 | // Add to the array 150 | d.Contents = append(d.Contents, byte(num)) 151 | 152 | // skip past the number 153 | p.position++ 154 | 155 | // end of program? 156 | if p.position >= len(p.program) { 157 | break 158 | } 159 | 160 | // if the next token is not a comma then we're done 161 | if p.program[p.position].Type != token.COMMA { 162 | break 163 | } 164 | 165 | // Otherwise skip over the comma 166 | p.position++ 167 | 168 | // end of program? 
169 | if p.position >= len(p.program) { 170 | break 171 | } 172 | 173 | cur = p.program[p.position] 174 | } 175 | 176 | return d 177 | } 178 | 179 | // parseInstruction is our workhorse 180 | // 181 | // We either return an `Instruction` or an `Error` 182 | // 183 | func (p *Parser) parseInstruction() Node { 184 | 185 | // Get the current instruction 186 | tok := p.program[p.position] 187 | 188 | // Find out how many arguments it has 189 | count, ok := instructions.InstructionLengths[tok.Literal] 190 | 191 | // If that failed then it is an unknown instruction, probably 192 | if !ok { 193 | return Error{Value: fmt.Sprintf("unknown instructoin %v", tok)} 194 | } 195 | 196 | // No args? Just return the instruction and bump the position 197 | if count == 0 { 198 | p.position++ 199 | return Instruction{Instruction: tok.Literal} 200 | } 201 | 202 | if count == 1 { 203 | args, err := p.TakeOneArgument() 204 | if err != nil { 205 | return Error{Value: err.Error()} 206 | 207 | } 208 | 209 | return Instruction{Instruction: tok.Literal, Operands: args} 210 | } 211 | if count == 2 { 212 | 213 | args, err := p.TakeTwoArguments() 214 | if err != nil { 215 | return Error{Value: err.Error()} 216 | 217 | } 218 | return Instruction{Instruction: tok.Literal, Operands: args} 219 | } 220 | 221 | return Error{Value: fmt.Sprintf("unhandled argument-count for token %v", tok)} 222 | } 223 | 224 | // parseLabel handles input of the form: 225 | // 226 | // :foo 227 | func (p *Parser) parseLabel() Node { 228 | 229 | // create the label-structure, with the name. 230 | l := Label{Name: p.program[p.position].Literal} 231 | 232 | // skip the label itself 233 | p.position++ 234 | 235 | return l 236 | } 237 | 238 | // TakeTwoArguments handles fetching two arguments for an instruction. 
239 | // 240 | // Arguments may be register-names, numbers, or label-values 241 | func (p *Parser) TakeTwoArguments() ([]Operand, error) { 242 | 243 | var toks []Operand 244 | 245 | // Get the first argument 246 | one, err := p.getOperand() 247 | if err != nil { 248 | return toks, err 249 | } 250 | toks = append(toks, one) 251 | 252 | // see if we have a comma 253 | c := p.program[p.position] 254 | if c.Type != token.COMMA { 255 | return toks, fmt.Errorf("expected ',', got %v", c) 256 | } 257 | 258 | // Get the second argument 259 | two, err := p.getOperand() 260 | if err != nil { 261 | return toks, err 262 | } 263 | toks = append(toks, two) 264 | 265 | return toks, nil 266 | } 267 | 268 | // TakeOneArgument reads the argument for a single-arg instruction. 269 | // 270 | // Arguments may be a register-name, number, or a label-value. 271 | func (p *Parser) TakeOneArgument() ([]Operand, error) { 272 | 273 | var toks []Operand 274 | 275 | // Get the argument 276 | one, err := p.getOperand() 277 | 278 | if err != nil { 279 | return toks, err 280 | } 281 | toks = append(toks, one) 282 | 283 | return toks, nil 284 | } 285 | 286 | func (p *Parser) getOperand() (Operand, error) { 287 | 288 | var op Operand 289 | 290 | // Skip over the instruction, because we want the arg 291 | p.position++ 292 | if p.position >= len(p.program) { 293 | return op, fmt.Errorf("unexpected EOF") 294 | } 295 | 296 | // Get the argument 297 | thing := p.program[p.position] 298 | 299 | if thing.Type == token.REGISTER || 300 | thing.Type == token.NUMBER { 301 | op.Token = thing 302 | p.position++ 303 | return op, nil 304 | } 305 | 306 | // Could be "identifer", could be "byte|word|qword ptr" 307 | if thing.Literal != "byte" && 308 | thing.Literal != "word" && 309 | thing.Literal != "dword" && 310 | thing.Literal != "qword" { 311 | op.Token = thing 312 | p.position++ 313 | return op, nil 314 | } 315 | 316 | // OK indirection. 
probably 317 | if thing.Literal == "byte" { 318 | op.Size = 8 319 | } 320 | if thing.Literal == "word" { 321 | op.Size = 16 322 | } 323 | if thing.Literal == "dword" { 324 | op.Size = 32 325 | } 326 | if thing.Literal == "qword" { 327 | op.Size = 64 328 | } 329 | 330 | // So the next token must be "ptr" 331 | p.position++ 332 | if p.position >= len(p.program) { 333 | return op, fmt.Errorf("unexpected EOF #2") 334 | } 335 | 336 | // Get the next arg 337 | next := p.program[p.position] 338 | if next.Type != token.IDENTIFIER || next.Literal != "ptr" { 339 | return op, fmt.Errorf("expected ptr after %s", thing.Literal) 340 | } 341 | p.position++ 342 | 343 | if p.program[p.position].Type == token.LSQUARE { 344 | op.Indirection = true 345 | 346 | // skip the [ 347 | p.position++ 348 | 349 | // get the register + skip it 350 | op.Token = p.program[p.position] 351 | p.position++ 352 | 353 | } else { 354 | p.position++ 355 | op.Token = p.program[p.position] 356 | p.position++ 357 | } 358 | return op, nil 359 | 360 | } 361 | -------------------------------------------------------------------------------- /parser/parser_test.go: -------------------------------------------------------------------------------- 1 | package parser 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestComment(t *testing.T) { 8 | 9 | p := New(";; This is a test") 10 | 11 | out := p.Next() 12 | if out != nil { 13 | t.Fatalf("Failed to skip comment") 14 | } 15 | } 16 | 17 | func TestData(t *testing.T) { 18 | 19 | type TestCase struct { 20 | Input string 21 | Data []byte 22 | } 23 | 24 | tests := []TestCase{ 25 | TestCase{Input: ".data DB \"Steve\"", 26 | Data: []byte{83, 116, 101, 118, 101}, 27 | }, 28 | TestCase{Input: ".foo DB 1\n.bar DB 3,3", 29 | Data: []byte{1}, 30 | }, 31 | TestCase{Input: ".foo DB 32, 44", 32 | Data: []byte{32, 44}, 33 | }, 34 | TestCase{Input: ".foo DB 32, ", 35 | Data: []byte{32}, 36 | }, 37 | } 38 | 39 | // For each test 40 | for _, test := range tests { 41 | 42 | // Parse 
43 | p := New(test.Input) 44 | 45 | // We expect a single data-statement 46 | out := p.Next() 47 | if out == nil { 48 | t.Fatalf("nil result from pasing %s", test.Input) 49 | } 50 | 51 | // Cast to the right value 52 | d, ok := out.(Data) 53 | if !ok { 54 | t.Fatalf("didn't get an Data structure: %v", out) 55 | } 56 | 57 | // Length matches? 58 | if len(d.Contents) != len(test.Data) { 59 | t.Fatalf("data length didn't match expectation") 60 | } 61 | 62 | // Content matches? 63 | for i, x := range d.Contents { 64 | if test.Data[i] != x { 65 | t.Fatalf("data mismatch at offset %d", i) 66 | } 67 | } 68 | } 69 | } 70 | 71 | func TestMove(t *testing.T) { 72 | 73 | p := New("mov rax, rbx") 74 | 75 | out := p.Next() 76 | 77 | outI, ok := out.(Instruction) 78 | if !ok { 79 | t.Fatalf("didn't get an instruction structure") 80 | } 81 | 82 | if len(outI.Operands) != 2 { 83 | t.Fatalf("mov - wrong arg count") 84 | } 85 | if outI.Operands[0].Literal != "rax" { 86 | t.Fatalf("mov - wrong first arg") 87 | } 88 | if outI.Operands[1].Literal != "rbx" { 89 | t.Fatalf("mov - wrong second arg") 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /test.asm: -------------------------------------------------------------------------------- 1 | ;; 2 | ;; So this is a simple example assembly program 3 | ;; 4 | ;; It calls `int 0x80` with rax set to 0x01 5 | ;; 6 | ;; mov eax,1 ; system call number (sys_exit) 7 | ;; mov ebx, 0xNN ; return code 8 | ;; int 0x80 ; syscall 9 | ;; 10 | 11 | 12 | nop ; Comment goes here too, if we like. 13 | 14 | ;; 15 | ;; mov eax, 0x01 works 16 | ;; 17 | ;; However we can test our assembly by setting the register to zero, 18 | ;; then incrementing it. 19 | ;; 20 | xor rax, rax 21 | inc rax 22 | 23 | ;; 24 | ;; The exit-coe will be stored in rbx. 
25 | ;; 26 | ;; We could set `mov rbx, 0x42`, however it is another test of our handling 27 | ;; to allow some maths to be carried out 28 | ;; 29 | mov rbx, 0x0000 30 | mov rcx, 0x0007 31 | add rbx, rcx 32 | 33 | mov rcx, 0x0002 34 | add rbx, rcx 35 | 36 | ;; 37 | ;; So we've said : 38 | ;; 39 | ;; rbx = 0 40 | ;; rbx += 7 41 | ;; rbx += 2 42 | ;; 43 | ;; -> rbx thus contains 9. 44 | ;; 45 | ;; Now call the kernel. 46 | ;; 47 | int 0x80 48 | -------------------------------------------------------------------------------- /token/token.go: -------------------------------------------------------------------------------- 1 | // Package token contains identifiers for the various things 2 | // we find in our source-scripts. 3 | // 4 | // Our lexer will convert an input-script into a series of tokens, 5 | // which will then be further-processed. 6 | package token 7 | 8 | import "github.com/skx/assembler/instructions" 9 | 10 | // Type is a string 11 | type Type string 12 | 13 | // Token struct represent the lexer token 14 | type Token struct { 15 | 16 | // Type contains the type of the token. 17 | Type Type 18 | 19 | // Literal contains the literal text of the token. 
20 | Literal string 21 | } 22 | 23 | // Our known token-types 24 | const ( 25 | // Basic things 26 | COMMA = "," 27 | LSQUARE = "[" 28 | RSQUARE = "]" 29 | EOF = "EOF" 30 | LABEL = "LABEL" 31 | DATA = "DATA" 32 | REGISTER = "REGISTER" 33 | INSTRUCTION = "INSTRUCTION" 34 | IDENTIFIER = "IDENTIFIER" 35 | 36 | // Data statement 37 | DB = "DB" 38 | 39 | // Number as operand 40 | NUMBER = "NUMBER" 41 | 42 | // String for DB 43 | STRING = "STRING" 44 | 45 | // Something we couldn't handle 46 | ILLEGAL = "ILLEGAL" 47 | ) 48 | 49 | // known things we can handle 50 | var known = map[string]Type{ 51 | "DB": DB, 52 | "db": DB, 53 | 54 | // Things we parse as registers 55 | "rax": REGISTER, 56 | "rbx": REGISTER, 57 | "rcx": REGISTER, 58 | "rdx": REGISTER, 59 | "rbp": REGISTER, 60 | "rsp": REGISTER, 61 | "rsi": REGISTER, 62 | "rdi": REGISTER, 63 | "r8": REGISTER, 64 | "r9": REGISTER, 65 | "r10": REGISTER, 66 | "r11": REGISTER, 67 | "r12": REGISTER, 68 | "r13": REGISTER, 69 | "r14": REGISTER, 70 | "r15": REGISTER, 71 | } 72 | 73 | // LookupIdentifier used to determinate whether identifier is keyword nor not 74 | func LookupIdentifier(identifier string) Type { 75 | 76 | // Is this an instruction 77 | for _, ins := range instructions.Instructions { 78 | if identifier == ins { 79 | return INSTRUCTION 80 | } 81 | } 82 | 83 | if tok, ok := known[identifier]; ok { 84 | return tok 85 | } 86 | return IDENTIFIER 87 | } 88 | -------------------------------------------------------------------------------- /token/token_test.go: -------------------------------------------------------------------------------- 1 | package token 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | // Test looking up values succeeds, then fails 8 | func TestLookup(t *testing.T) { 9 | 10 | for key, val := range known { 11 | 12 | // Obviously this will pass. 
13 | if LookupIdentifier(key) != val { 14 | t.Errorf("Lookup of %s failed", key) 15 | } 16 | 17 | // Once the keywords are "doubled" they'll no longer 18 | // match - so we find them as identifiers. 19 | if LookupIdentifier(key+key) != IDENTIFIER { 20 | t.Errorf("Lookup of %s failed", key) 21 | } 22 | } 23 | } 24 | --------------------------------------------------------------------------------