├── .gitignore
├── LICENSE
├── README.md
├── call.asm
├── cmd
    ├── assembler
    │   └── main.go
    ├── lexer
    │   └── main.go
    └── parser
    │   └── main.go
├── compiler
    └── compiler.go
├── elf
    └── elf.go
├── exit.asm
├── go.mod
├── hello.asm
├── instructions
    └── instructions.go
├── jmp.asm
├── lexer
    ├── lexer.go
    └── lexer_test.go
├── parser
    ├── ast.go
    ├── parser.go
    └── parser_test.go
├── test.asm
└── token
    ├── token.go
    └── token_test.go


/.gitignore:
--------------------------------------------------------------------------------
1 | a.out
2 | assembler
3 | cmd/lexer/lexer
4 | cmd/parser/parser
5 | cmd/assembler/assembler
6 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                     GNU GENERAL PUBLIC LICENSE
  2 |                        Version 2, June 1991
  3 | 
  4 |  Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
  5 |  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  6 |  Everyone is permitted to copy and distribute verbatim copies
  7 |  of this license document, but changing it is not allowed.
  8 | 
  9 |                             Preamble
 10 | 
 11 |   The licenses for most software are designed to take away your
 12 | freedom to share and change it.  By contrast, the GNU General Public
 13 | License is intended to guarantee your freedom to share and change free
 14 | software--to make sure the software is free for all its users.  This
 15 | General Public License applies to most of the Free Software
 16 | Foundation's software and to any other program whose authors commit to
 17 | using it.  (Some other Free Software Foundation software is covered by
 18 | the GNU Lesser General Public License instead.)  You can apply it to
 19 | your programs, too.
 20 | 
 21 |   When we speak of free software, we are referring to freedom, not
 22 | price.  Our General Public Licenses are designed to make sure that you
 23 | have the freedom to distribute copies of free software (and charge for
 24 | this service if you wish), that you receive source code or can get it
 25 | if you want it, that you can change the software or use pieces of it
 26 | in new free programs; and that you know you can do these things.
 27 | 
 28 |   To protect your rights, we need to make restrictions that forbid
 29 | anyone to deny you these rights or to ask you to surrender the rights.
 30 | These restrictions translate to certain responsibilities for you if you
 31 | distribute copies of the software, or if you modify it.
 32 | 
 33 |   For example, if you distribute copies of such a program, whether
 34 | gratis or for a fee, you must give the recipients all the rights that
 35 | you have.  You must make sure that they, too, receive or can get the
 36 | source code.  And you must show them these terms so they know their
 37 | rights.
 38 | 
 39 |   We protect your rights with two steps: (1) copyright the software, and
 40 | (2) offer you this license which gives you legal permission to copy,
 41 | distribute and/or modify the software.
 42 | 
 43 |   Also, for each author's protection and ours, we want to make certain
 44 | that everyone understands that there is no warranty for this free
 45 | software.  If the software is modified by someone else and passed on, we
 46 | want its recipients to know that what they have is not the original, so
 47 | that any problems introduced by others will not reflect on the original
 48 | authors' reputations.
 49 | 
 50 |   Finally, any free program is threatened constantly by software
 51 | patents.  We wish to avoid the danger that redistributors of a free
 52 | program will individually obtain patent licenses, in effect making the
 53 | program proprietary.  To prevent this, we have made it clear that any
 54 | patent must be licensed for everyone's free use or not licensed at all.
 55 | 
 56 |   The precise terms and conditions for copying, distribution and
 57 | modification follow.
 58 | 
 59 |                     GNU GENERAL PUBLIC LICENSE
 60 |    TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
 61 | 
 62 |   0. This License applies to any program or other work which contains
 63 | a notice placed by the copyright holder saying it may be distributed
 64 | under the terms of this General Public License.  The "Program", below,
 65 | refers to any such program or work, and a "work based on the Program"
 66 | means either the Program or any derivative work under copyright law:
 67 | that is to say, a work containing the Program or a portion of it,
 68 | either verbatim or with modifications and/or translated into another
 69 | language.  (Hereinafter, translation is included without limitation in
 70 | the term "modification".)  Each licensee is addressed as "you".
 71 | 
 72 | Activities other than copying, distribution and modification are not
 73 | covered by this License; they are outside its scope.  The act of
 74 | running the Program is not restricted, and the output from the Program
 75 | is covered only if its contents constitute a work based on the
 76 | Program (independent of having been made by running the Program).
 77 | Whether that is true depends on what the Program does.
 78 | 
 79 |   1. You may copy and distribute verbatim copies of the Program's
 80 | source code as you receive it, in any medium, provided that you
 81 | conspicuously and appropriately publish on each copy an appropriate
 82 | copyright notice and disclaimer of warranty; keep intact all the
 83 | notices that refer to this License and to the absence of any warranty;
 84 | and give any other recipients of the Program a copy of this License
 85 | along with the Program.
 86 | 
 87 | You may charge a fee for the physical act of transferring a copy, and
 88 | you may at your option offer warranty protection in exchange for a fee.
 89 | 
 90 |   2. You may modify your copy or copies of the Program or any portion
 91 | of it, thus forming a work based on the Program, and copy and
 92 | distribute such modifications or work under the terms of Section 1
 93 | above, provided that you also meet all of these conditions:
 94 | 
 95 |     a) You must cause the modified files to carry prominent notices
 96 |     stating that you changed the files and the date of any change.
 97 | 
 98 |     b) You must cause any work that you distribute or publish, that in
 99 |     whole or in part contains or is derived from the Program or any
100 |     part thereof, to be licensed as a whole at no charge to all third
101 |     parties under the terms of this License.
102 | 
103 |     c) If the modified program normally reads commands interactively
104 |     when run, you must cause it, when started running for such
105 |     interactive use in the most ordinary way, to print or display an
106 |     announcement including an appropriate copyright notice and a
107 |     notice that there is no warranty (or else, saying that you provide
108 |     a warranty) and that users may redistribute the program under
109 |     these conditions, and telling the user how to view a copy of this
110 |     License.  (Exception: if the Program itself is interactive but
111 |     does not normally print such an announcement, your work based on
112 |     the Program is not required to print an announcement.)
113 | 
114 | These requirements apply to the modified work as a whole.  If
115 | identifiable sections of that work are not derived from the Program,
116 | and can be reasonably considered independent and separate works in
117 | themselves, then this License, and its terms, do not apply to those
118 | sections when you distribute them as separate works.  But when you
119 | distribute the same sections as part of a whole which is a work based
120 | on the Program, the distribution of the whole must be on the terms of
121 | this License, whose permissions for other licensees extend to the
122 | entire whole, and thus to each and every part regardless of who wrote it.
123 | 
124 | Thus, it is not the intent of this section to claim rights or contest
125 | your rights to work written entirely by you; rather, the intent is to
126 | exercise the right to control the distribution of derivative or
127 | collective works based on the Program.
128 | 
129 | In addition, mere aggregation of another work not based on the Program
130 | with the Program (or with a work based on the Program) on a volume of
131 | a storage or distribution medium does not bring the other work under
132 | the scope of this License.
133 | 
134 |   3. You may copy and distribute the Program (or a work based on it,
135 | under Section 2) in object code or executable form under the terms of
136 | Sections 1 and 2 above provided that you also do one of the following:
137 | 
138 |     a) Accompany it with the complete corresponding machine-readable
139 |     source code, which must be distributed under the terms of Sections
140 |     1 and 2 above on a medium customarily used for software interchange; or,
141 | 
142 |     b) Accompany it with a written offer, valid for at least three
143 |     years, to give any third party, for a charge no more than your
144 |     cost of physically performing source distribution, a complete
145 |     machine-readable copy of the corresponding source code, to be
146 |     distributed under the terms of Sections 1 and 2 above on a medium
147 |     customarily used for software interchange; or,
148 | 
149 |     c) Accompany it with the information you received as to the offer
150 |     to distribute corresponding source code.  (This alternative is
151 |     allowed only for noncommercial distribution and only if you
152 |     received the program in object code or executable form with such
153 |     an offer, in accord with Subsection b above.)
154 | 
155 | The source code for a work means the preferred form of the work for
156 | making modifications to it.  For an executable work, complete source
157 | code means all the source code for all modules it contains, plus any
158 | associated interface definition files, plus the scripts used to
159 | control compilation and installation of the executable.  However, as a
160 | special exception, the source code distributed need not include
161 | anything that is normally distributed (in either source or binary
162 | form) with the major components (compiler, kernel, and so on) of the
163 | operating system on which the executable runs, unless that component
164 | itself accompanies the executable.
165 | 
166 | If distribution of executable or object code is made by offering
167 | access to copy from a designated place, then offering equivalent
168 | access to copy the source code from the same place counts as
169 | distribution of the source code, even though third parties are not
170 | compelled to copy the source along with the object code.
171 | 
172 |   4. You may not copy, modify, sublicense, or distribute the Program
173 | except as expressly provided under this License.  Any attempt
174 | otherwise to copy, modify, sublicense or distribute the Program is
175 | void, and will automatically terminate your rights under this License.
176 | However, parties who have received copies, or rights, from you under
177 | this License will not have their licenses terminated so long as such
178 | parties remain in full compliance.
179 | 
180 |   5. You are not required to accept this License, since you have not
181 | signed it.  However, nothing else grants you permission to modify or
182 | distribute the Program or its derivative works.  These actions are
183 | prohibited by law if you do not accept this License.  Therefore, by
184 | modifying or distributing the Program (or any work based on the
185 | Program), you indicate your acceptance of this License to do so, and
186 | all its terms and conditions for copying, distributing or modifying
187 | the Program or works based on it.
188 | 
189 |   6. Each time you redistribute the Program (or any work based on the
190 | Program), the recipient automatically receives a license from the
191 | original licensor to copy, distribute or modify the Program subject to
192 | these terms and conditions.  You may not impose any further
193 | restrictions on the recipients' exercise of the rights granted herein.
194 | You are not responsible for enforcing compliance by third parties to
195 | this License.
196 | 
197 |   7. If, as a consequence of a court judgment or allegation of patent
198 | infringement or for any other reason (not limited to patent issues),
199 | conditions are imposed on you (whether by court order, agreement or
200 | otherwise) that contradict the conditions of this License, they do not
201 | excuse you from the conditions of this License.  If you cannot
202 | distribute so as to satisfy simultaneously your obligations under this
203 | License and any other pertinent obligations, then as a consequence you
204 | may not distribute the Program at all.  For example, if a patent
205 | license would not permit royalty-free redistribution of the Program by
206 | all those who receive copies directly or indirectly through you, then
207 | the only way you could satisfy both it and this License would be to
208 | refrain entirely from distribution of the Program.
209 | 
210 | If any portion of this section is held invalid or unenforceable under
211 | any particular circumstance, the balance of the section is intended to
212 | apply and the section as a whole is intended to apply in other
213 | circumstances.
214 | 
215 | It is not the purpose of this section to induce you to infringe any
216 | patents or other property right claims or to contest validity of any
217 | such claims; this section has the sole purpose of protecting the
218 | integrity of the free software distribution system, which is
219 | implemented by public license practices.  Many people have made
220 | generous contributions to the wide range of software distributed
221 | through that system in reliance on consistent application of that
222 | system; it is up to the author/donor to decide if he or she is willing
223 | to distribute software through any other system and a licensee cannot
224 | impose that choice.
225 | 
226 | This section is intended to make thoroughly clear what is believed to
227 | be a consequence of the rest of this License.
228 | 
229 |   8. If the distribution and/or use of the Program is restricted in
230 | certain countries either by patents or by copyrighted interfaces, the
231 | original copyright holder who places the Program under this License
232 | may add an explicit geographical distribution limitation excluding
233 | those countries, so that distribution is permitted only in or among
234 | countries not thus excluded.  In such case, this License incorporates
235 | the limitation as if written in the body of this License.
236 | 
237 |   9. The Free Software Foundation may publish revised and/or new versions
238 | of the General Public License from time to time.  Such new versions will
239 | be similar in spirit to the present version, but may differ in detail to
240 | address new problems or concerns.
241 | 
242 | Each version is given a distinguishing version number.  If the Program
243 | specifies a version number of this License which applies to it and "any
244 | later version", you have the option of following the terms and conditions
245 | either of that version or of any later version published by the Free
246 | Software Foundation.  If the Program does not specify a version number of
247 | this License, you may choose any version ever published by the Free Software
248 | Foundation.
249 | 
250 |   10. If you wish to incorporate parts of the Program into other free
251 | programs whose distribution conditions are different, write to the author
252 | to ask for permission.  For software which is copyrighted by the Free
253 | Software Foundation, write to the Free Software Foundation; we sometimes
254 | make exceptions for this.  Our decision will be guided by the two goals
255 | of preserving the free status of all derivatives of our free software and
256 | of promoting the sharing and reuse of software generally.
257 | 
258 |                             NO WARRANTY
259 | 
260 |   11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
268 | REPAIR OR CORRECTION.
269 | 
270 |   12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
278 | POSSIBILITY OF SUCH DAMAGES.
279 | 
280 |                      END OF TERMS AND CONDITIONS
281 | 
282 |             How to Apply These Terms to Your New Programs
283 | 
284 |   If you develop a new program, and you want it to be of the greatest
285 | possible use to the public, the best way to achieve this is to make it
286 | free software which everyone can redistribute and change under these terms.
287 | 
288 |   To do so, attach the following notices to the program.  It is safest
289 | to attach them to the start of each source file to most effectively
290 | convey the exclusion of warranty; and each file should have at least
291 | the "copyright" line and a pointer to where the full notice is found.
292 | 
293 |     <one line to give the program's name and a brief idea of what it does.>
294 |     Copyright (C) <year>  <name of author>
295 | 
296 |     This program is free software; you can redistribute it and/or modify
297 |     it under the terms of the GNU General Public License as published by
298 |     the Free Software Foundation; either version 2 of the License, or
299 |     (at your option) any later version.
300 | 
301 |     This program is distributed in the hope that it will be useful,
302 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
303 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
304 |     GNU General Public License for more details.
305 | 
306 |     You should have received a copy of the GNU General Public License along
307 |     with this program; if not, write to the Free Software Foundation, Inc.,
308 |     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
309 | 
310 | Also add information on how to contact you by electronic and paper mail.
311 | 
312 | If the program is interactive, make it output a short notice like this
313 | when it starts in an interactive mode:
314 | 
315 |     Gnomovision version 69, Copyright (C) year name of author
316 |     Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
317 |     This is free software, and you are welcome to redistribute it
318 |     under certain conditions; type `show c' for details.
319 | 
320 | The hypothetical commands `show w' and `show c' should show the appropriate
321 | parts of the General Public License.  Of course, the commands you use may
322 | be called something other than `show w' and `show c'; they could even be
323 | mouse-clicks or menu items--whatever suits your program.
324 | 
325 | You should also get your employer (if you work as a programmer) or your
326 | school, if any, to sign a "copyright disclaimer" for the program, if
327 | necessary.  Here is a sample; alter the names:
328 | 
329 |   Yoyodyne, Inc., hereby disclaims all copyright interest in the program
330 |   `Gnomovision' (which makes passes at compilers) written by James Hacker.
331 | 
332 |   <signature of Ty Coon>, 1 April 1989
333 |   Ty Coon, President of Vice
334 | 
335 | This General Public License does not permit incorporating your program into
336 | proprietary programs.  If your program is a subroutine library, you may
337 | consider it more useful to permit linking proprietary applications with the
338 | library.  If this is what you want to do, use the GNU Lesser General
339 | Public License instead of this License.
340 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | [![GoDoc](https://img.shields.io/static/v1?label=godoc&message=reference&color=blue)](https://pkg.go.dev/github.com/skx/assembler)
  2 | [![Go Report Card](https://goreportcard.com/badge/github.com/skx/assembler)](https://goreportcard.com/report/github.com/skx/assembler)
  3 | [![license](https://img.shields.io/github/license/skx/assembler.svg)](https://github.com/skx/assembler/blob/master/LICENSE)
  4 | 
  5 | * [Assembler](#assembler)
  6 |   * [Limitations](#limitations)
  7 |   * [Installation](#installation)
  8 |   * [Example Usage](#example-usage)
  9 | * [Internals](#internals)
 10 |   * [Adding New Instructions](#adding-new-instructions)
 11 |   * [Debugging Generated Binaries](#debugging-generated-binaries)
 12 | * [Bugs?](#bugs)
 13 | 
 14 | 
 15 | # Assembler
 16 | 
 17 | This repository contains a VERY BASIC x86-64 assembler, which is capable of
 18 | reading assembly-language input, and generating a statically linked ELF binary
 19 | output.
 20 | 
 21 | It is more a proof-of-concept than a useful assembler, but I hope to take it to the state where it can compile the kind of x86-64 assembly I produce in some of my other projects.
 22 | 
 23 | Currently the assembler will generate a binary which looks like this:
 24 | 
 25 | ```
 26 | $ file a.out
 27 | a.out: ELF 64-bit LSB executable, x86-64, version 1 (SYSV)
 28 |        statically linked, no section header
 29 | ```
 30 | 
 31 | Why?  I've written a couple of toy projects that generate assembly language programs, then pass them through an assembler:
 32 | 
 33 | * [brainfuck compiler](https://github.com/skx/bfcc/)
 34 | * [math compiler](https://github.com/skx/math-compiler/)
 35 | 
 36 | The code in this repository was born out of the process of experimenting with generating an ELF binary directly.  A necessary learning-process.
 37 | 
 38 | 
 39 | 
 40 | ## Limitations
 41 | 
 42 | We don't support anywhere near the complete instruction-set which an assembly language programmer would expect.  Currently we support only things like this:
 43 | 
 44 | * `add $REG, $REG` + `add $REG, $NUMBER`
 45 |   * Add a number, or the contents of another register, to a register.
 46 | * `call $LABEL`
 47 |   * See [call.asm](call.asm) for an example.
 48 | * `dec $REG`
 49 |   * Decrement the contents of the specified register.
 50 |   * We also support indirection, so the following work:
 51 |     * `dec byte ptr [$REG]`
 52 |     * `dec word ptr [$REG]`
 53 |     * `dec dword ptr [$REG]`
 54 |     * `dec qword ptr [$REG]`
 55 | * `inc $REG`
 56 |   * Increment the contents of the specified register.
 57 |   * We also support indirection, so the following work:
 58 |     * `inc byte ptr [$REG]`
 59 |     * `inc word ptr [$REG]`
 60 |     * `inc dword ptr [$REG]`
 61 |     * `inc qword ptr [$REG]`
 62 | * `jmp $LABEL`, `je $LABEL`, `jne $LABEL`
 63 |   * We support jumping instructions, but only with -128/+127 byte displacements
 64 |   * See [jmp.asm](jmp.asm) for a simple example.
 65 | * `mov $REG, $NUMBER`
 66 | * `mov $REG, $REG`
 67 |   * Move a number, or the contents of another register, into the specified register.
 68 | * `nop`
 69 |   * Do nothing.
 70 | * `push $NUMBER`, or `push $IDENTIFIER`
 71 | * `ret`
 72 |   * Return from call.
 73 |   * **NOTE**: Calls are made with `call` (see [call.asm](call.asm)); they can also be emulated via `push` - see [jmp.asm](jmp.asm) for an example.
 74 | * `sub $REG, $REG` + `sub $REG, $NUMBER`
 75 |   * Subtract a number, or the contents of another register, from a register.
 76 | * `xor $REG, $REG`
 77 |   * Set the given register to be zero.
 78 | * `int $NUM`
 79 |   * Call the kernel.
 80 | * Processor (flag) control instructions:
 81 |   * `clc`, `cld`, `cli`, `cmc`, `stc`, `std`, and `sti`.
 82 | 
 83 | Note that we really only support the following registers. You'll see that we only support the 64-bit registers (which means `rax` is supported but `eax`, `ax`, `ah`, and `al` are specifically __not__ supported):
 84 | 
 85 | * `rax`
 86 | * `rcx`
 87 | * `rdx`
 88 | * `rbx`
 89 | * `rsp`
 90 | * `rbp`
 91 | * `rsi`
 92 | * `rdi`
 93 | 
 94 | There is _some_ support for the extended registers `r8`-`r15`, but this varies on a per-instruction basis and should not be relied upon.
 95 | 
 96 | There is support for storing fixed-data within our program, and locating that.  See [hello.asm](hello.asm) for an example of that.
 97 | 
 98 | We also have some other (obvious) limitations:
 99 | 
100 | * There is notably no support for comparison instructions, and jumping instructions.
101 |   * We _emulate_ (unconditional) jump instructions via "`push`" and "`ret`", see [jmp.asm](jmp.asm) for an example of that.
102 | * The entry-point is __always__ at the beginning of the source.
103 | * You can only reference data AFTER it has been declared.
104 |   * These are added to the `data` section of the generated binary, but must be defined first.
105 |   * See [hello.asm](hello.asm) for an example of that.
106 | 
107 | 
108 | 
109 | ## Installation
110 | 
111 | If you have this repository cloned locally you can build the assembler like so:
112 | 
113 |     cd cmd/assembler
114 |     go build .
115 |     go install .
116 | 
117 | If you wish to fetch and install via your existing toolchain:
118 | 
119 |     go get -u github.com/skx/assembler/cmd/assembler
120 | 
121 | You can repeat for the other commands if you wish:
122 | 
123 |     go get -u github.com/skx/assembler/cmd/lexer
124 |     go get -u github.com/skx/assembler/cmd/parser
125 | 
126 | Of course these binary-names are very generic, so perhaps better to work locally!
127 | 
128 | 
129 | ## Example Usage
130 | 
131 | Build the assembler:
132 | 
133 |      $ cd cmd/assembler
134 |      $ go build .
135 | 
136 | Compile the [sample program](test.asm), and execute it showing the return-code:
137 | 
138 |      $ cmd/assembler/assembler test.asm && ./a.out ; echo $?
139 |      9
140 | 
141 | Or run the [hello.asm](hello.asm) example:
142 | 
143 |      $ cmd/assembler/assembler hello.asm && ./a.out
144 |      Hello, world
145 |      Goodbye, world
146 | 
147 | You'll note that the `\n` character was correctly expanded into a newline.
148 | 
149 | 
150 | # Internals
151 | 
152 | The core of our code consists of a small number of simple packages:
153 | 
154 | * A simple tokenizer [lexer/lexer.go](lexer/lexer.go)
155 | * A simple parser [parser/parser.go](parser/parser.go)
156 |   * This populates a simple internal-form/AST [parser/ast.go](parser/ast.go).
157 | * A simple compiler [compiler/compiler.go](compiler/compiler.go)
158 | * A simple elf-generator [elf/elf.go](elf/elf.go)
159 |   * Taken from [vishen/go-x64-executable](https://github.com/vishen/go-x64-executable/).
160 | 
161 | 
162 | In addition to the package modules we also have a couple of binaries:
163 | 
164 | * `cmd/lexer`
165 |   * Show the output of lexing a program.
166 |   * This is useful for debugging and development-purposes, it isn't expected to be useful to end-users.
167 | * `cmd/parser`
168 |   * Show the output of parsing a program.
169 |   * This is useful for debugging and development-purposes, it isn't expected to be useful to end-users.
170 | * `cmd/assembler`
171 |   * Assemble a program, producing an executable binary.
172 | 
173 | These commands located beneath `cmd` each operate the same way.  They each take a single argument which is a file containing assembly-language instructions.
174 | 
175 | For example here is how you'd build and test the parser:
176 | 
177 |     cd cmd/parser
178 |     go build .
179 |     $ ./parser ../../test.asm
180 |     &{{INSTRUCTION xor} [{REGISTER rax} {REGISTER rax}]}
181 |     &{{INSTRUCTION inc} [{REGISTER rax}]}
182 |     &{{INSTRUCTION mov} [{REGISTER rbx} {NUMBER 0x0000}]}
183 |     &{{INSTRUCTION mov} [{REGISTER rcx} {NUMBER 0x0007}]}
184 |     &{{INSTRUCTION add} [{REGISTER rbx} {REGISTER rcx}]}
185 |     &{{INSTRUCTION mov} [{REGISTER rcx} {NUMBER 0x0002}]}
186 |     &{{INSTRUCTION add} [{REGISTER rbx} {REGISTER rcx}]}
187 |     &{{INSTRUCTION int} [{NUMBER 0x80}]}
188 | 
189 | 
190 | ## Adding New Instructions
191 | 
192 | This is how you might add a new instruction to the assembler, for example you might add `jmp 0x00000` or some similar instruction:
193 | 
194 | * Add a new entry for the instruction in [instructions/instructions.go](instructions/instructions.go)
195 |   * i.e. Update `InstructionLengths` map to add the instruction.
196 |   * This will be used by both the tokenization process, and the parser.
197 | * Generate the appropriate output in `compiler/compiler.go`, inside the function `compileInstruction`.
198 |   * i.e. Emit the binary-code for the instruction.
199 | 
200 | 
201 | 
202 | ## Debugging Generated Binaries
203 | 
204 | Launch the binary under gdb:
205 | 
206 |     $ gdb ./a.out
207 | 
208 | Start it:
209 | 
210 |     (gdb) starti
211 |     Starting program: /home/skx/Repos/github.com/skx/assembler/a.out
212 | 
213 |     Program stopped.
214 |     0x00000000004000b0 in ?? ()
215 | 
216 | Disassemble:
217 | 
218 |     (gdb)  x/5i $pc
219 | 
220 | Or show string-contents at an address:
221 | 
222 |     (gdb) x/s 0x400000
223 | 
224 | 
225 | # Bugs?
226 | 
227 | Feel free to report any bugs you find; as this is more a proof of concept than a robust tool, they are to be expected.
228 | 
229 | Specifically we're missing support for many instructions, but I hope the code generated for those that are present is correct.
230 | 
231 | 
232 | Steve
233 | 


--------------------------------------------------------------------------------
/call.asm:
--------------------------------------------------------------------------------
  1 |         ;;
  2 |         ;; This file demonstrates using `call` to invoke subroutines.
  3 |         ;;
  4 |         ;; Here we have three subroutines of interest:
  5 |         ;;
  6 |         ;;  print_string - prints a string with explicit address & size.
  7 |         ;;
  8 |         ;;  print_asciiz_string - Prints a null-terminated string
  9 |         ;;
 10 |         ;;  _exit - Exits the program
 11 |         ;;
 12 | 
 13 | .hello   DB "Hello, world\n\0"
 14 | .message DB "This string has its size calculated dynamically!\n\0"
 15 | .goodbye DB "Goodbye, world\n\0"
 16 | 
 17 |         ;; print a string, passing its address (RCX) and its size (RDX)
 18 |         mov rcx, hello
 19 |         mov rdx, 13             ; 13 bytes: the text plus newline, excluding the trailing NUL
 20 |         call print_string
 21 | 
 22 |         ;; print a zero-terminated string; its size is calculated at run-time
 23 |         mov rcx, message
 24 |         call print_asciiz_string
 25 | 
 26 |         ;; print the same zero-terminated string, size calculated at run-time
 27 |         ;;
 28 |         ;; BUT change the " " to "*"
 29 |         mov rdx, message        ; NOTE: this routine takes the address in RDX, not RCX
 30 |         call print_asciiz_string_with_stars
 31 | 
 32 | 
 33 |         ;; print a string with an explicit size
 34 |         mov rcx, goodbye
 35 |         mov rdx, 15             ; 15 bytes: the text plus newline, excluding the trailing NUL
 36 |         call print_string
 37 | 
 38 |         ;; exit this script
 39 |         mov rbx, 2              ; presumably the exit status handed to _exit - TODO confirm
 40 |         call _exit
 41 | 
 42 |         ;; Routine to print a string.
 43 |         ;;
 44 |         ;; Assumes string address is in RCX
 45 |         ;; Assumes string length is in RDX
 46 |         ;;
 47 |         ;; Trashes: RAX, RBX, RCX, RDX
 48 | :print_string
 49 |         mov rbx, 1         ;; output is STDOUT
 50 |         mov rax, 4         ;; sys_write
 51 |         int 0x80           ;; syscall
 52 | 
 53 |         ret
 54 | 
 55 |         ;; Routine to print a '0x00'-terminated string
 56 |         ;;
 57 |         ;; Assumes string address is in RCX
 58 | :print_asciiz_string
 59 |         xor rdx, rdx            ; zero the length
 60 |         push rcx                ; save string
 61 | :len_loop
 62 |         cmp byte ptr [rcx], 0x00
 63 |         je len_loop_over
 64 |         inc rdx
 65 |         inc rcx
 66 |         jmp len_loop
 67 | :len_loop_over
 68 |         pop rcx                 ; restore string-pointer
 69 |                                 ; rdx has the length of the message
 70 |         call print_string       ; call the print routine
 71 |         ret                     ; and return from here
 72 | 
 73 | 
 74 | 
 75 |         ;; Print a string, terminated by NULL, but change " " to "*"
 76 |         ;;
 77 |         ;; NOTE: This destroys the string in the process.
 78 | :print_asciiz_string_with_stars
 79 |         push rdx
 80 | :star_loop
 81 |         cmp byte ptr [rdx], 0x00   ; end of string? we're done
 82 |         je star_loop_over
 83 |         cmp byte ptr [rdx], 0x20   ; is this a space?
 84 |         jne star_loop_cont         ; if not continue
 85 |         mov byte ptr [rdx], 42     ; so replace with "*"
 86 | :star_loop_cont
 87 |         inc rdx                    ; increase our pointer
 88 |         jmp star_loop              ; loop again
 89 | :star_loop_over
 90 |         pop rcx
 91 |         call print_asciiz_string
 92 |         ret
 93 | 
 94 | 
 95 |         ;; Exit
 96 |         ;;
 97 |         ;; Assumes RBX has exit-code
 98 | :_exit
 99 |         mov rax, 1      ; SYS_exit
100 |         int 0x80        ; syscall
101 |         ret             ; Never reached
102 | 


--------------------------------------------------------------------------------
/cmd/assembler/main.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"io/ioutil"
 6 | 	"os"
 7 | 
 8 | 	"github.com/skx/assembler/compiler"
 9 | )
10 | 
11 | func main() {
12 | 
13 | 	//
14 | 	// Ensure we have an argument
15 | 	//
16 | 	if len(os.Args) <= 1 {
17 | 		fmt.Printf("Usage: compiler input.asm\n")
18 | 		return
19 | 	}
20 | 
21 | 	data, err := ioutil.ReadFile(os.Args[1])
22 | 	if err != nil {
23 | 		fmt.Printf("error:%s\n", err.Error())
24 | 		return
25 | 	}
26 | 
27 | 	// Create the compiler
28 | 	c := compiler.New(string(data))
29 | 
30 | 	c.SetOutput("./a.out")
31 | 
32 | 	err = c.Compile()
33 | 	if err != nil {
34 | 		fmt.Printf("Error:%s\n", err.Error())
35 | 		return
36 | 	}
37 | }
38 | 


--------------------------------------------------------------------------------
/cmd/lexer/main.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"io/ioutil"
 6 | 	"os"
 7 | 
 8 | 	"github.com/skx/assembler/lexer"
 9 | 	"github.com/skx/assembler/token"
10 | )
11 | 
12 | func main() {
13 | 	//
14 | 	// Ensure we have an argument
15 | 	//
16 | 	if len(os.Args) <= 1 {
17 | 		fmt.Printf("Usage: lexer input.asm\n")
18 | 		return
19 | 	}
20 | 
21 | 	data, err := ioutil.ReadFile(os.Args[1])
22 | 	if err != nil {
23 | 		fmt.Printf("error:%s\n", err.Error())
24 | 		return
25 | 	}
26 | 
27 | 	l := lexer.New(string(data))
28 | 
29 | 	tok := l.NextToken()
30 | 	for tok.Type != token.EOF {
31 | 		fmt.Printf("%v\n", tok)
32 | 		tok = l.NextToken()
33 | 	}
34 | }
35 | 


--------------------------------------------------------------------------------
/cmd/parser/main.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"io/ioutil"
 6 | 	"os"
 7 | 
 8 | 	"github.com/skx/assembler/parser"
 9 | )
10 | 
11 | func main() {
12 | 	//
13 | 	// Ensure we have an argument
14 | 	//
15 | 	if len(os.Args) <= 1 {
16 | 		fmt.Printf("Usage: parser input.asm\n")
17 | 		return
18 | 	}
19 | 
20 | 	data, err := ioutil.ReadFile(os.Args[1])
21 | 	if err != nil {
22 | 		fmt.Printf("error:%s\n", err.Error())
23 | 		return
24 | 	}
25 | 
26 | 	p := parser.New(string(data))
27 | 
28 | 	stmt := p.Next()
29 | 	for stmt != nil {
30 | 		fmt.Printf("%v\n", stmt)
31 | 
32 | 		stmt = p.Next()
33 | 	}
34 | }
35 | 


--------------------------------------------------------------------------------
/compiler/compiler.go:
--------------------------------------------------------------------------------
  1 | // Package compiler is the package which is actually responsible for reading
  2 | // the user-program and generating the binary result.
  3 | //
  4 | // Internally this uses the parser, as you would expect
  5 | package compiler
  6 | 
  7 | import (
  8 | 	"encoding/binary"
  9 | 	"fmt"
 10 | 	"strconv"
 11 | 
 12 | 	"github.com/skx/assembler/elf"
 13 | 	"github.com/skx/assembler/parser"
 14 | 	"github.com/skx/assembler/token"
 15 | )
 16 | 
// Compiler holds the state accumulated while assembling one program:
// the generated code and data, plus the fix-ups which must be applied
// once the position of every label is known.
type Compiler struct {

	// p holds the parser we use to generate AST
	p *parser.Parser

	// output holds the path to the binary we'll generate
	output string

	// code contains the machine-code we generate
	code []byte

	// data is where we place constant-strings, etc.
	data []byte

	// dataOffsets maps the name of a data-item to its offset inside `data`.
	dataOffsets map[string]int

	// patches maps an offset inside `code` (the start of a four-byte
	// immediate) to the data-offset it refers to.  The immediate is
	// rewritten with the final address once compilation is complete.
	patches map[int]int

	// labels maps a label-name to the offset inside `code` at which
	// the label was seen.
	labels map[string]int

	// labelTargets maps an offset inside `code` (the start of a
	// four-byte immediate) to the name of the label whose absolute
	// address must be written there.
	labelTargets map[int]string

	// jmps maps the offset of an 8-bit relative displacement inside
	// `code` to the name of the label being jumped to.
	jmps map[int]string

	// calls maps the offset of a 32-bit relative displacement inside
	// `code` to the name of the label being called.
	calls map[int]string
}
 50 | 
 51 | // New creates a new instance of the compiler
 52 | func New(src string) *Compiler {
 53 | 
 54 | 	c := &Compiler{p: parser.New(src), output: "a.out"}
 55 | 	c.dataOffsets = make(map[string]int)
 56 | 	c.patches = make(map[int]int)
 57 | 
 58 | 	// mapping of "label -> XXX"
 59 | 	c.labels = make(map[string]int)
 60 | 
 61 | 	// fixups we need to make offset-of-code -> label
 62 | 	c.labelTargets = make(map[int]string)
 63 | 
 64 | 	// jump-fixups
 65 | 	c.jmps = make(map[int]string)
 66 | 
 67 | 	// call-fixups
 68 | 	c.calls = make(map[int]string)
 69 | 
 70 | 	return c
 71 | }
 72 | 
// SetOutput sets the path to the executable we create.
//
// If SetOutput is never called the path chosen by New (`a.out`) is used.
func (c *Compiler) SetOutput(path string) {
	c.output = path
}
 79 | 
 80 | // Compile walks over the parser-generated AST and assembles the source
 81 | // program.
 82 | //
 83 | // Once the program has been completed an ELF executable will be produced
 84 | func (c *Compiler) Compile() error {
 85 | 
 86 | 	//
 87 | 	// Walk over the parser-output
 88 | 	//
 89 | 	stmt := c.p.Next()
 90 | 	for stmt != nil {
 91 | 
 92 | 		switch stmt := stmt.(type) {
 93 | 
 94 | 		case parser.Data:
 95 | 			c.handleData(stmt)
 96 | 
 97 | 		case parser.Error:
 98 | 			return fmt.Errorf("error compiling - parser returned error %s", stmt.Value)
 99 | 
100 | 		case parser.Label:
101 | 			// So now we know the label with the given name
102 | 			// corresponds to the CURRENT position in the
103 | 			// generated binary-code.
104 | 			//
105 | 			// If anything refers to this we'll have to patch
106 | 			// it up
107 | 			c.labels[stmt.Name] = len(c.code)
108 | 
109 | 		case parser.Instruction:
110 | 			err := c.compileInstruction(stmt)
111 | 			if err != nil {
112 | 				return err
113 | 			}
114 | 
115 | 		default:
116 | 			return fmt.Errorf("unhandled node-type %v", stmt)
117 | 		}
118 | 
119 | 		stmt = c.p.Next()
120 | 	}
121 | 
122 | 	//
123 | 	// Apply data-patches.
124 | 	//
125 | 	// This is horrid.
126 | 	//
127 | 	for o, v := range c.patches {
128 | 
129 | 		// start of virtual sectoin
130 | 		//  + offset
131 | 		//  + len of code segment
132 | 		//  + elf header
133 | 		//  + 2 * program header
134 | 		// life is hard
135 | 		v = 0x400000 + v + len(c.code) + 0x40 + (2 * 0x38)
136 | 		buf := make([]byte, 4)
137 | 		binary.LittleEndian.PutUint32(buf, uint32(v))
138 | 
139 | 		for i, x := range buf {
140 | 			c.code[i+o] = x
141 | 		}
142 | 	}
143 | 
144 | 	//
145 | 	// OK now we need to patch references to labels
146 | 	//
147 | 	for o, s := range c.labelTargets {
148 | 
149 | 		offset := c.labels[s]
150 | 
151 | 		offset = 0x400000 + offset + 0x40 + (2 * 0x38)
152 | 
153 | 		// So we have a new offset.
154 | 
155 | 		buf := make([]byte, 4)
156 | 		binary.LittleEndian.PutUint32(buf, uint32(offset))
157 | 
158 | 		for i, x := range buf {
159 | 			c.code[i+o] = x
160 | 		}
161 | 	}
162 | 
163 | 	// Patchup the jumps
164 | 	for o, s := range c.jmps {
165 | 
166 | 		// the offset of the instruction to we should jump to
167 | 		offset := c.labels[s]
168 | 
169 | 		// the offset of the position is a byte
170 | 		diff := uint(o - offset)
171 | 
172 | 		c.code[o] = byte(0xff - byte(diff))
173 | 	}
174 | 
175 | 	// Patchup the calls
176 | 	for o, s := range c.calls {
177 | 
178 | 		// the offset of the instruction to which we should call
179 | 		offset := c.labels[s]
180 | 
181 | 		// the offset of the position is a byte
182 | 		diff := uint32(o - offset + 4)
183 | 		x := uint32(0xffffffff) - uint32(diff-1)
184 | 
185 | 		buf := make([]byte, 4)
186 | 		binary.LittleEndian.PutUint32(buf, x)
187 | 
188 | 		// overwrite the instruction
189 | 		for i, x := range buf {
190 | 			c.code[i+o] = x
191 | 		}
192 | 	}
193 | 
194 | 	//
195 | 	// Write.  The.  Elf.  Output.
196 | 	//
197 | 	e := elf.New()
198 | 	err := e.WriteContent(c.output, c.code, c.data)
199 | 	if err != nil {
200 | 		return fmt.Errorf("error writing elf: %s", err.Error())
201 | 	}
202 | 
203 | 	return nil
204 | 
205 | }
206 | 
207 | // handleData appends the data to the data-section of our binary,
208 | // and stores the offset appropriately
209 | func (c *Compiler) handleData(d parser.Data) {
210 | 
211 | 	// Offset of the start of the data is the current
212 | 	// length of the existing data.
213 | 	offset := len(c.data)
214 | 
215 | 	// Add
216 | 	c.data = append(c.data, d.Contents...)
217 | 
218 | 	// Save
219 | 	c.dataOffsets[d.Name] = offset
220 | 
221 | 	// TODO: Do we care about alignment?  We might
222 | 	// in the future.
223 | }
224 | 
225 | // compileInstruction handles the instruction generation
226 | func (c *Compiler) compileInstruction(i parser.Instruction) error {
227 | 
228 | 	switch i.Instruction {
229 | 
230 | 	case "add":
231 | 		err := c.assembleADD(i)
232 | 		if err != nil {
233 | 			return err
234 | 		}
235 | 		return nil
236 | 
237 | 	case "call":
238 | 		err := c.assembleCALL(i)
239 | 		if err != nil {
240 | 			return err
241 | 		}
242 | 		return nil
243 | 
244 | 	case "clc":
245 | 		c.code = append(c.code, 0xf8)
246 | 		return nil
247 | 
248 | 	case "cld":
249 | 		c.code = append(c.code, 0xfc)
250 | 		return nil
251 | 
252 | 	case "cli":
253 | 		c.code = append(c.code, 0xfa)
254 | 		return nil
255 | 
256 | 	case "cmp":
257 | 		err := c.assembleCMP(i)
258 | 		if err != nil {
259 | 			return err
260 | 		}
261 | 		return nil
262 | 
263 | 	case "cmc":
264 | 		c.code = append(c.code, 0xf5)
265 | 		return nil
266 | 
267 | 	case "dec":
268 | 		err := c.assembleDEC(i)
269 | 		if err != nil {
270 | 			return err
271 | 		}
272 | 		return nil
273 | 
274 | 	case "inc":
275 | 		err := c.assembleINC(i)
276 | 		if err != nil {
277 | 			return err
278 | 		}
279 | 		return nil
280 | 
281 | 	case "int":
282 | 		n, err := c.argToByte(i.Operands[0].Token)
283 | 		if err != nil {
284 | 			return err
285 | 		}
286 | 		c.code = append(c.code, 0xcd)
287 | 		c.code = append(c.code, n)
288 | 		return nil
289 | 
290 | 	case "jmp", "jne", "je", "jz", "jnz":
291 | 		err := c.assembleJMP(i)
292 | 		if err != nil {
293 | 			return err
294 | 		}
295 | 		return nil
296 | 
297 | 	case "mov":
298 | 		err := c.assembleMov(i, false)
299 | 		if err != nil {
300 | 			return err
301 | 		}
302 | 		return nil
303 | 
304 | 	case "nop":
305 | 		c.code = append(c.code, 0x90)
306 | 		return nil
307 | 
308 | 	case "pop":
309 | 		err := c.assemblePop(i)
310 | 		if err != nil {
311 | 			return err
312 | 		}
313 | 		return nil
314 | 
315 | 	case "push":
316 | 		err := c.assemblePush(i)
317 | 		if err != nil {
318 | 			return err
319 | 		}
320 | 		return nil
321 | 
322 | 	case "ret":
323 | 		c.code = append(c.code, 0xc3)
324 | 		return nil
325 | 
326 | 	case "stc":
327 | 		c.code = append(c.code, 0xf9)
328 | 		return nil
329 | 
330 | 	case "std":
331 | 		c.code = append(c.code, 0xfd)
332 | 		return nil
333 | 
334 | 	case "sti":
335 | 		c.code = append(c.code, 0xfb)
336 | 		return nil
337 | 
338 | 	case "sub":
339 | 		err := c.assembleSUB(i)
340 | 		if err != nil {
341 | 			return err
342 | 		}
343 | 		return nil
344 | 	case "xor":
345 | 		err := c.assembleXOR(i)
346 | 		if err != nil {
347 | 			return err
348 | 		}
349 | 		return nil
350 | 	}
351 | 
352 | 	return fmt.Errorf("unknown instruction %v", i)
353 | }
354 | 
355 | // return register number - used for `dec`, `inc`, and `mov`.
356 | func (c *Compiler) getreg(reg string) int {
357 | 
358 | 	// registers
359 | 	registers := []string{
360 | 		"rax",
361 | 		"rcx",
362 | 		"rdx",
363 | 		"rbx",
364 | 		"rsp",
365 | 		"rbp",
366 | 		"rsi",
367 | 		"rdi"}
368 | 
369 | 	for i, name := range registers {
370 | 		if reg == name {
371 | 			return i
372 | 		}
373 | 	}
374 | 
375 | 	panic(fmt.Sprintf("failed to lookup register: %s", reg))
376 | }
377 | 
378 | // get magic value for two-register operations (`add`, `sub`, `xor`).
379 | func (c *Compiler) calcRM(dest string, src string) byte {
380 | 
381 | 	// registers
382 | 	registers := []string{
383 | 		"rax",
384 | 		"rcx",
385 | 		"rdx",
386 | 		"rbx",
387 | 		"rsp",
388 | 		"rbp",
389 | 		"rsi",
390 | 		"rdi"}
391 | 
392 | 	dN := -1
393 | 	sN := -1
394 | 
395 | 	for i, reg := range registers {
396 | 		if reg == dest {
397 | 			dN = i
398 | 		}
399 | 		if reg == src {
400 | 			sN = i
401 | 
402 | 		}
403 | 	}
404 | 
405 | 	if dN < 0 || sN < 0 {
406 | 		panic(fmt.Sprintf("failed to lookup registers: %s %s", src, dest))
407 | 	}
408 | 
409 | 	out := 0xc0 + (8 * sN) + dN
410 | 	if out > 255 {
411 | 		panic("calcRM received out of bounds value")
412 | 	}
413 | 	return byte(out)
414 | }
415 | 
416 | // used by `int`
417 | func (c *Compiler) argToByte(t token.Token) (byte, error) {
418 | 
419 | 	num, err := strconv.ParseInt(t.Literal, 0, 64)
420 | 	if err != nil {
421 | 		return 0, fmt.Errorf("unable to convert %s to number %s", t.Literal, err)
422 | 	}
423 | 
424 | 	return byte(num), nil
425 | }
426 | 
427 | // used by `mov`
428 | func (c *Compiler) argToByteArray(t token.Token) ([]byte, error) {
429 | 
430 | 	// Store the result here
431 | 	buf := make([]byte, 4)
432 | 
433 | 	num, err := strconv.ParseInt(t.Literal, 0, 64)
434 | 	if err != nil {
435 | 		return buf, fmt.Errorf("unable to convert %s to number for register %s", t.Literal, err)
436 | 	}
437 | 
438 | 	binary.LittleEndian.PutUint32(buf, uint32(num))
439 | 	return buf, nil
440 | }
441 | 
442 | // assembleADD handles addition.
443 | func (c *Compiler) assembleADD(i parser.Instruction) error {
444 | 
445 | 	// Two registers added?
446 | 	if i.Operands[0].Type == token.REGISTER &&
447 | 		i.Operands[1].Type == token.REGISTER {
448 | 		c.code = append(c.code, []byte{0x48, 0x01}...)
449 | 		out := c.calcRM(i.Operands[0].Literal, i.Operands[1].Literal)
450 | 		c.code = append(c.code, out)
451 | 		return nil
452 | 	}
453 | 
454 | 	// OK number added to a register?
455 | 	if i.Operands[0].Type == token.REGISTER &&
456 | 		i.Operands[1].Type == token.NUMBER {
457 | 
458 | 		// Convert the integer to a four-byte/64-bit value
459 | 		n, err := c.argToByteArray(i.Operands[1].Token)
460 | 		if err != nil {
461 | 			return err
462 | 		}
463 | 
464 | 		// Work out the register
465 | 		switch i.Operands[0].Literal {
466 | 		case "rax":
467 | 			c.code = append(c.code, []byte{0x48, 0x05}...)
468 | 		case "rbx":
469 | 			c.code = append(c.code, []byte{0x48, 0x81, 0xc3}...)
470 | 		case "rcx":
471 | 			c.code = append(c.code, []byte{0x48, 0x81, 0xc1}...)
472 | 		case "rdx":
473 | 			c.code = append(c.code, []byte{0x48, 0x81, 0xc2}...)
474 | 		default:
475 | 			return fmt.Errorf("add %s, number not implemented", i.Operands[0].Literal)
476 | 		}
477 | 
478 | 		// Now append the value
479 | 		c.code = append(c.code, n...)
480 | 		return nil
481 | 	}
482 | 
483 | 	return fmt.Errorf("unhandled ADD instruction %v", i)
484 | }
485 | 
486 | // Handle a call instruction
487 | func (c *Compiler) assembleCALL(i parser.Instruction) error {
488 | 
489 | 	if i.Operands[0].Type != token.IDENTIFIER {
490 | 		return fmt.Errorf("we only support CALL to labels at the moment")
491 | 	}
492 | 
493 | 	// emit the call
494 | 	c.code = append(c.code, 0xe8)
495 | 
496 | 	c.calls[len(c.code)] = i.Operands[0].Literal
497 | 	c.code = append(c.code, []byte{0x00, 0x00, 0x00, 0x00}...)
498 | 
499 | 	return nil
500 | }
501 | 
502 | // Handle a comparison
503 | func (c *Compiler) assembleCMP(i parser.Instruction) error {
504 | 
505 | 	// We're only handling indirection at the moment
506 | 	if i.Operands[0].Type != token.REGISTER &&
507 | 		i.Operands[0].Indirection != true &&
508 | 		i.Operands[1].Type != token.NUMBER {
509 | 		return fmt.Errorf("we only support CMP size ptr [reg],NUMBER at the moment")
510 | 	}
511 | 
512 | 	// The number we're comparing
513 | 	n, err := strconv.ParseInt(i.Operands[1].Literal, 0, 64)
514 | 	if err != nil {
515 | 		return err
516 | 	}
517 | 
518 | 	// Register number
519 | 	r := byte(c.getreg(i.Operands[0].Literal))
520 | 
521 | 	// things we add
522 | 	bytes := []byte{}
523 | 
524 | 	switch i.Operands[0].Size {
525 | 
526 | 	case 8:
527 | 		bytes = []byte{0x80, 0x38 + r, byte(n)}
528 | 	case 16:
529 | 		bytes = []byte{0x66, 0x83, 0x38 + r}
530 | 
531 | 		buf := make([]byte, 2)
532 | 		binary.LittleEndian.PutUint16(buf, uint16(n))
533 | 		bytes = append(bytes, buf...)
534 | 
535 | 	case 32, 64:
536 | 		bytes = []byte{0x83, 0x38 + r}
537 | 
538 | 		buf := make([]byte, 4)
539 | 		binary.LittleEndian.PutUint32(buf, uint32(n))
540 | 		bytes = append(bytes, buf...)
541 | 
542 | 	default:
543 | 		return fmt.Errorf("unknown size in instruction %v", i.Operands[0])
544 | 	}
545 | 
546 | 	c.code = append(c.code, bytes...)
547 | 	return nil
548 | }
549 | 
550 | // assembleDEC handles dec rax, rbx, etc.
551 | func (c *Compiler) assembleDEC(i parser.Instruction) error {
552 | 
553 | 	// Decrement the contents of a register
554 | 	if i.Operands[0].Indirection == false {
555 | 		// prefix
556 | 		c.code = append(c.code, []byte{0x48, 0xff}...)
557 | 
558 | 		// register name
559 | 		reg := 0xc0 + c.getreg(i.Operands[0].Literal)
560 | 		c.code = append(c.code, byte(reg))
561 | 
562 | 		return nil
563 | 	}
564 | 
565 | 	// indirect: byte
566 | 	if i.Operands[0].Size == 8 {
567 | 		// prefix
568 | 		c.code = append(c.code, []byte{0x67, 0xfe}...)
569 | 
570 | 		// register name
571 | 		reg := c.getreg(i.Operands[0].Literal)
572 | 		reg += 0x08
573 | 		c.code = append(c.code, byte(reg))
574 | 
575 | 		return nil
576 | 	}
577 | 
578 | 	// indirect: word
579 | 	if i.Operands[0].Size == 16 {
580 | 		// prefix
581 | 		c.code = append(c.code, []byte{0x67, 0x66, 0xff}...)
582 | 
583 | 		// register name
584 | 		reg := c.getreg(i.Operands[0].Literal)
585 | 		reg += 0x08
586 | 		c.code = append(c.code, byte(reg))
587 | 
588 | 		return nil
589 | 	}
590 | 
591 | 	// indirect: double word
592 | 	if i.Operands[0].Size == 32 || i.Operands[0].Size == 64 {
593 | 		// prefix
594 | 		c.code = append(c.code, []byte{0x67, 0xff}...)
595 | 
596 | 		// register name
597 | 		reg := c.getreg(i.Operands[0].Literal)
598 | 		reg += 0x08
599 | 		c.code = append(c.code, byte(reg))
600 | 
601 | 		return nil
602 | 	}
603 | 
604 | 	return fmt.Errorf("unknown argument for DEC %v", i)
605 | }
606 | 
607 | // assembleINC handles inc rax, rbx, etc.
608 | func (c *Compiler) assembleINC(i parser.Instruction) error {
609 | 
610 | 	// Increment the contents of a register
611 | 	if i.Operands[0].Indirection == false {
612 | 		// prefix
613 | 		c.code = append(c.code, []byte{0x48, 0xff}...)
614 | 
615 | 		// register name
616 | 		reg := 0xc0 + c.getreg(i.Operands[0].Literal)
617 | 		c.code = append(c.code, byte(reg))
618 | 
619 | 		return nil
620 | 	}
621 | 
622 | 	// indirect: byte
623 | 	if i.Operands[0].Size == 8 {
624 | 		// prefix
625 | 		c.code = append(c.code, []byte{0x67, 0xfe}...)
626 | 
627 | 		// register name
628 | 		reg := c.getreg(i.Operands[0].Literal)
629 | 		c.code = append(c.code, byte(reg))
630 | 
631 | 		return nil
632 | 	}
633 | 
634 | 	// indirect: word
635 | 	if i.Operands[0].Size == 16 {
636 | 		// prefix
637 | 		c.code = append(c.code, []byte{0x67, 0x66, 0xff}...)
638 | 
639 | 		// register name
640 | 		reg := c.getreg(i.Operands[0].Literal)
641 | 		c.code = append(c.code, byte(reg))
642 | 
643 | 		return nil
644 | 	}
645 | 
646 | 	// indirect: double word
647 | 	if i.Operands[0].Size == 32 || i.Operands[0].Size == 64 {
648 | 		// prefix
649 | 		c.code = append(c.code, []byte{0x67, 0xff}...)
650 | 
651 | 		// register name
652 | 		reg := c.getreg(i.Operands[0].Literal)
653 | 		c.code = append(c.code, byte(reg))
654 | 
655 | 		return nil
656 | 	}
657 | 
658 | 	return fmt.Errorf("unknown argument for INC %v", i)
659 | }
660 | 
661 | // assembleJMP handles all the jump instructions
662 | //
663 | // NOTE We have to fixup the offsets here.
664 | func (c *Compiler) assembleJMP(i parser.Instruction) error {
665 | 
666 | 	var byte byte
667 | 
668 | 	switch i.Instruction {
669 | 	case "jmp":
670 | 		byte = 0xeb
671 | 	case "je", "jz":
672 | 		byte = 0x74
673 | 	case "jne", "jnz":
674 | 		byte = 0x75
675 | 	default:
676 | 		return fmt.Errorf("unknown jmp type")
677 | 	}
678 | 
679 | 	// Ensure we're jumping to a label
680 | 	if i.Operands[0].Type != token.IDENTIFIER {
681 | 		return fmt.Errorf("we only support jumps to labels at the moment")
682 | 	}
683 | 
684 | 	// emit the instruction and make a note of the fixup to make
685 | 	c.code = append(c.code, byte)
686 | 	c.jmps[len(c.code)] = i.Operands[0].Literal
687 | 	c.code = append(c.code, 0x00) // empty displacement
688 | 
689 | 	return nil
690 | }
691 | 
692 | func (c *Compiler) assembleMov(i parser.Instruction, label bool) error {
693 | 
694 | 	//
695 | 	// Are we moving a register to another register?
696 | 	//
697 | 	// No indirection
698 | 	//
699 | 	if i.Operands[0].Type == token.REGISTER &&
700 | 		i.Operands[0].Indirection == false &&
701 | 		i.Operands[1].Type == token.REGISTER &&
702 | 		i.Operands[1].Indirection == false {
703 | 
704 | 		c.code = append(c.code, []byte{0x48, 0x89}...)
705 | 		out := c.calcRM(i.Operands[0].Literal, i.Operands[1].Literal)
706 | 		c.code = append(c.code, out)
707 | 		return nil
708 | 
709 | 	}
710 | 
711 | 	//
712 | 	// Are we moving a number to a register ?
713 | 	//
714 | 	if i.Operands[0].Type == token.REGISTER &&
715 | 		i.Operands[0].Indirection == false &&
716 | 		i.Operands[1].Type == token.NUMBER {
717 | 
718 | 		// prefix
719 | 		c.code = append(c.code, []byte{0x48, 0xc7}...)
720 | 
721 | 		// register name
722 | 		reg := 0xc0 + c.getreg(i.Operands[0].Literal)
723 | 		c.code = append(c.code, byte(reg))
724 | 
725 | 		// value
726 | 		n, err := c.argToByteArray(i.Operands[1].Token)
727 | 		if err != nil {
728 | 			return err
729 | 		}
730 | 
731 | 		// hack
732 | 		if label {
733 | 			c.patches[len(c.code)], _ = strconv.Atoi(i.Operands[1].Literal)
734 | 		}
735 | 		c.code = append(c.code, n...)
736 | 		return nil
737 | 	}
738 | 
739 | 	// mov $reg, $id
740 | 	if i.Operands[0].Type == token.REGISTER &&
741 | 		i.Operands[0].Indirection == false &&
742 | 		i.Operands[1].Type == token.IDENTIFIER {
743 | 
744 | 		//
745 | 		// Lookup the identifier, and if we can find it
746 | 		// then we will treat it as a constant
747 | 		//
748 | 		name := i.Operands[1].Literal
749 | 		val, ok := c.dataOffsets[name]
750 | 		if ok {
751 | 
752 | 			i.Operands[1].Type = token.NUMBER
753 | 			i.Operands[1].Literal = fmt.Sprintf("%d", val)
754 | 			return c.assembleMov(i, true)
755 | 		}
756 | 		return fmt.Errorf("reference to unknown label/data: %v", i.Operands[1])
757 | 	}
758 | 
759 | 	// Storing a value in an address
760 | 	if i.Operands[0].Type == token.REGISTER &&
761 | 		i.Operands[0].Indirection &&
762 | 		i.Operands[1].Type == token.NUMBER {
763 | 
764 | 		// The number we're setting
765 | 		n, err := strconv.ParseInt(i.Operands[1].Literal, 0, 64)
766 | 		if err != nil {
767 | 			return err
768 | 		}
769 | 
770 | 		// Register number
771 | 		r := byte(c.getreg(i.Operands[0].Literal))
772 | 
773 | 		// things we add
774 | 		bytes := []byte{}
775 | 
776 | 		switch i.Operands[0].Size {
777 | 
778 | 		case 8:
779 | 			bytes = []byte{0xc6, r, byte(n)}
780 | 		case 16:
781 | 			bytes = []byte{0x66, 0xc7, byte(r)}
782 | 
783 | 			buf := make([]byte, 2)
784 | 			binary.LittleEndian.PutUint16(buf, uint16(n))
785 | 			bytes = append(bytes, buf...)
786 | 
787 | 		case 32, 64:
788 | 			bytes = []byte{0xc7, r}
789 | 
790 | 			buf := make([]byte, 4)
791 | 			binary.LittleEndian.PutUint32(buf, uint32(n))
792 | 			bytes = append(bytes, buf...)
793 | 
794 | 		default:
795 | 			return fmt.Errorf("unknown size in instruction %v", i.Operands[0])
796 | 		}
797 | 
798 | 		c.code = append(c.code, bytes...)
799 | 
800 | 		return nil
801 | 	}
802 | 
803 | 	//
804 | 	// HACK
805 | 	//
806 | 	// mov rax, [REG]
807 | 	if i.Operands[0].Type == token.REGISTER &&
808 | 		i.Operands[0].Literal == "rax" &&
809 | 		i.Operands[1].Type == token.REGISTER &&
810 | 		i.Operands[1].Indirection {
811 | 
812 | 		// Register number
813 | 		r := byte(c.getreg(i.Operands[0].Literal))
814 | 		c.code = append(c.code, []byte{0x8a, r}...)
815 | 		return nil
816 | 	}
817 | 
818 | 	return fmt.Errorf("unknown MOV instruction: %v", i)
819 | 
820 | }
821 | 
822 | // assemblePop would compile "pop offset", and "push 0x1234"
823 | func (c *Compiler) assemblePop(i parser.Instruction) error {
824 | 
825 | 	// known pop-types
826 | 	table := make(map[string][]byte)
827 | 	table["rax"] = []byte{0x58}
828 | 	table["rbx"] = []byte{0x5b}
829 | 	table["rcx"] = []byte{0x59}
830 | 	table["rdx"] = []byte{0x5a}
831 | 	table["rbp"] = []byte{0x5d}
832 | 	table["rsp"] = []byte{0x5c}
833 | 	table["rsi"] = []byte{0x5e}
834 | 	table["rdi"] = []byte{0x5f}
835 | 	table["r8"] = []byte{0x41, 0x58}
836 | 	table["r9"] = []byte{0x41, 0x59}
837 | 	table["r10"] = []byte{0x41, 0x5a}
838 | 	table["r11"] = []byte{0x41, 0x5b}
839 | 	table["r12"] = []byte{0x41, 0x5c}
840 | 	table["r13"] = []byte{0x41, 0x5d}
841 | 	table["r14"] = []byte{0x41, 0x5e}
842 | 	table["r15"] = []byte{0x41, 0x5f}
843 | 
844 | 	// Is this "pop rax|rbx..|rdx", or something in the table?
845 | 	if i.Operands[0].Type == token.REGISTER {
846 | 		bytes, ok := table[i.Operands[0].Literal]
847 | 		if ok {
848 | 			c.code = append(c.code, bytes...)
849 | 			return nil
850 | 		}
851 | 		return fmt.Errorf("unknown register in 'pop'")
852 | 	}
853 | 
854 | 	return fmt.Errorf("unknown pop-type: %v", i)
855 | 
856 | }
857 | 
858 | // assemblePush would compile "push offset", and "push 0x1234"
859 | func (c *Compiler) assemblePush(i parser.Instruction) error {
860 | 
861 | 	// Is this a number?  Just output it
862 | 	if i.Operands[0].Type == token.NUMBER {
863 | 		n, err := c.argToByteArray(i.Operands[1].Token)
864 | 		if err != nil {
865 | 			return err
866 | 		}
867 | 		c.code = append(c.code, 0x68)
868 | 		c.code = append(c.code, n...)
869 | 		return nil
870 | 	}
871 | 
872 | 	// Is this a label?
873 | 	if i.Operands[0].Type == token.IDENTIFIER {
874 | 
875 | 		c.code = append(c.code, 0x68)
876 | 
877 | 		c.labelTargets[len(c.code)] = i.Operands[0].Literal
878 | 
879 | 		c.code = append(c.code, []byte{0x0, 0x0, 0x0, 0x0}...)
880 | 		return nil
881 | 	}
882 | 
883 | 	// is this a register?
884 | 	table := make(map[string][]byte)
885 | 	table["rax"] = []byte{0x50}
886 | 	table["rcx"] = []byte{0x51}
887 | 	table["rdx"] = []byte{0x52}
888 | 	table["rbx"] = []byte{0x53}
889 | 	table["rsp"] = []byte{0x54}
890 | 	table["rbp"] = []byte{0x55}
891 | 	table["rsi"] = []byte{0x56}
892 | 	table["rdi"] = []byte{0x57}
893 | 	table["r8"] = []byte{0x41, 0x50}
894 | 	table["r9"] = []byte{0x41, 0x51}
895 | 	table["r10"] = []byte{0x41, 0x52}
896 | 	table["r11"] = []byte{0x41, 0x53}
897 | 	table["r12"] = []byte{0x41, 0x54}
898 | 	table["r13"] = []byte{0x41, 0x55}
899 | 	table["r14"] = []byte{0x41, 0x56}
900 | 	table["r15"] = []byte{0x41, 0x57}
901 | 
902 | 	// Is this "push rax|rbx..|rdx", or something in the table?
903 | 	if i.Operands[0].Type == token.REGISTER {
904 | 		bytes, ok := table[i.Operands[0].Literal]
905 | 		if ok {
906 | 			c.code = append(c.code, bytes...)
907 | 			return nil
908 | 		}
909 | 		return fmt.Errorf("unknown register in 'push'")
910 | 	}
911 | 
912 | 	return fmt.Errorf("unknown push-type: %v", i)
913 | }
914 | 
915 | // assembleSUB handles subtraction.
916 | func (c *Compiler) assembleSUB(i parser.Instruction) error {
917 | 
918 | 	// Two registers subtracted?
919 | 	if i.Operands[0].Type == token.REGISTER &&
920 | 		i.Operands[1].Type == token.REGISTER {
921 | 		c.code = append(c.code, []byte{0x48, 0x29}...)
922 | 		out := c.calcRM(i.Operands[0].Literal, i.Operands[1].Literal)
923 | 		c.code = append(c.code, out)
924 | 		return nil
925 | 	}
926 | 
927 | 	// OK number subtracted from a register?
928 | 	if i.Operands[0].Type == token.REGISTER &&
929 | 		i.Operands[1].Type == token.NUMBER {
930 | 
931 | 		// Convert the integer to a four-byte/64-bit value
932 | 		n, err := c.argToByteArray(i.Operands[1].Token)
933 | 		if err != nil {
934 | 			return err
935 | 		}
936 | 
937 | 		// Work out the register
938 | 		switch i.Operands[0].Literal {
939 | 		case "rax":
940 | 			c.code = append(c.code, []byte{0x48, 0x2d}...)
941 | 		case "rbx":
942 | 			c.code = append(c.code, []byte{0x48, 0x81, 0xeb}...)
943 | 		case "rcx":
944 | 			c.code = append(c.code, []byte{0x48, 0x81, 0xe9}...)
945 | 		case "rdx":
946 | 			c.code = append(c.code, []byte{0x48, 0x81, 0xea}...)
947 | 		default:
948 | 			return fmt.Errorf("SUB %s, number not implemented", i.Operands[0].Literal)
949 | 		}
950 | 
951 | 		// Now append the value
952 | 		c.code = append(c.code, n...)
953 | 		return nil
954 | 	}
955 | 
956 | 	return fmt.Errorf("unhandled SUB instruction %v", i)
957 | }
958 | 
959 | // assembleXOR handles xor rax, rbx, etc.
960 | func (c *Compiler) assembleXOR(i parser.Instruction) error {
961 | 
962 | 	// Two registers xor'd?
963 | 	if i.Operands[0].Type == token.REGISTER &&
964 | 		i.Operands[1].Type == token.REGISTER {
965 | 		c.code = append(c.code, []byte{0x48, 0x31}...)
966 | 		out := c.calcRM(i.Operands[0].Literal, i.Operands[1].Literal)
967 | 		c.code = append(c.code, out)
968 | 		return nil
969 | 	}
970 | 
971 | 	return fmt.Errorf("unknown argument for XOR %v", i)
972 | }
973 | 


--------------------------------------------------------------------------------
/elf/elf.go:
--------------------------------------------------------------------------------
  1 | package elf
  2 | 
  3 | import (
  4 | 	"encoding/binary"
  5 | 	"io/ioutil"
  6 | )
  7 | 
// Layout constants for the generated executable.
//
// virtualStartAddress is the base virtual address; the address written
// into the header by buildELF is this base plus the size of the headers.
//
// NOTE(review): dataVirtualStartAddress and alignment are presumably the
// base address of the data and the segment alignment - their use is not
// visible here, confirm against buildELF.
const (
	virtualStartAddress     uint64 = 0x400000
	dataVirtualStartAddress uint64 = 0x600000
	alignment               uint64 = 0x200000
)
 13 | 
 14 | type Builder struct {
 15 | 	o []byte
 16 | }
 17 | 
 18 | func (b *Builder) WriteBytes(bs ...byte) {
 19 | 	b.o = append(b.o, bs...)
 20 | }
 21 | 
 22 | func (b *Builder) WriteValue(size int, value uint64) {
 23 | 	buf := make([]byte, size)
 24 | 	binary.LittleEndian.PutUint64(buf, value)
 25 | 	b.WriteBytes(buf...)
 26 | }
 27 | 
 28 | type Elf struct {
 29 | }
 30 | 
 31 | func New() *Elf {
 32 | 	return &Elf{}
 33 | }
 34 | 
 35 | func (e *Elf) WriteContent(path string, textSection, dataSection []byte) error {
 36 | 
 37 | 	data := e.buildELF(textSection, dataSection)
 38 | 	if err := ioutil.WriteFile(path, data, 0755); err != nil {
 39 | 		return err
 40 | 	}
 41 | 
 42 | 	return nil
 43 | }
 44 | 
 45 | func (e *Elf) buildELF(textSection, dataSection []byte) []byte {
 46 | 	textSize := uint64(len(textSection))
 47 | 	// Size of ELF header + 2 * size program header?
 48 | 	textOffset := uint64(0x40 + (2 * 0x38))
 49 | 
 50 | 	var o Builder
 51 | 
 52 | 	// Build ELF Header
 53 | 	o.WriteBytes(0x7f, 0x45, 0x4c, 0x46) // ELF magic value
 54 | 
 55 | 	o.WriteBytes(0x02) // 64-bit executable
 56 | 	o.WriteBytes(0x01) // Little endian
 57 | 	o.WriteBytes(0x01) // ELF version
 58 | 	o.WriteBytes(0x00) // Target OS ABI
 59 | 	o.WriteBytes(0x00) // Further specify ABI version
 60 | 
 61 | 	o.WriteBytes(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00) // Unused bytes
 62 | 
 63 | 	o.WriteBytes(0x02, 0x00)             // Executable type
 64 | 	o.WriteBytes(0x3e, 0x00)             // x86-64 target architecture
 65 | 	o.WriteBytes(0x01, 0x00, 0x00, 0x00) // ELF version
 66 | 
 67 | 	// 64-bit virtual offsets always start at 0x400000?? https://stackoverflow.com/questions/38549972/why-elf-executables-have-a-fixed-load-address
 68 | 	// This seems to be a convention set in the x86_64 system-v abi: https://refspecs.linuxfoundation.org/elf/x86_64-SysV-psABI.pdf P26
 69 | 	o.WriteValue(8, virtualStartAddress+textOffset)
 70 | 
 71 | 	o.WriteBytes(0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00) // Offset from file to program header
 72 | 	o.WriteBytes(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00) // Start of section header table
 73 | 	o.WriteBytes(0x00, 0x00, 0x00, 0x00)                         // Flags
 74 | 	o.WriteBytes(0x40, 0x00)                                     // Size of this header
 75 | 	o.WriteBytes(0x38, 0x00)                                     // Size of a program header table entry - This should always be the same for 64-bit
 76 | 	o.WriteBytes(0x02, 0x00)                                     // Length of sections: data and text for now
 77 | 	o.WriteBytes(0x00, 0x00)                                     // Size of section header, which we aren't using
 78 | 	o.WriteBytes(0x00, 0x00)                                     // Number of entries section header
 79 | 	o.WriteBytes(0x00, 0x00)                                     // Index of section header table entry
 80 | 
 81 | 	// Build Program Header
 82 | 	// Text Segment
 83 | 	o.WriteBytes(0x01, 0x00, 0x00, 0x00) // PT_LOAD, loadable segment. Both data and text segment use this.
 84 | 	o.WriteBytes(0x07, 0x00, 0x00, 0x00) // Flags: 0x4 executable, 0x2 write, 0x1 read
 85 | 	o.WriteValue(8, 0)                   // textOffset)          // Offset from the beginning of the file. These values depend on how big the header and segment sizes are.
 86 | 	o.WriteValue(8, virtualStartAddress)
 87 | 	o.WriteValue(8, virtualStartAddress) // Physical address, irrelavnt on linux.
 88 | 	o.WriteValue(8, textSize)            // Number of bytes in file image of segment, must be larger than or equal to the size of payload in segment. Should be zero for bss data.
 89 | 	o.WriteValue(8, textSize)            // Number of bytes in memory image of segment, is not always same size as file image.
 90 | 	o.WriteValue(8, alignment)
 91 | 
 92 | 	dataSize := uint64(len(dataSection))
 93 | 	dataOffset := uint64(textOffset + textSize)
 94 | 	dataVirtualAddress := dataVirtualStartAddress + dataOffset
 95 | 
 96 | 	// Build Program Header
 97 | 	// Data Segment
 98 | 	o.WriteBytes(0x01, 0x00, 0x00, 0x00) // PT_LOAD, loadable segment. Both data and text segment use this.
 99 | 	o.WriteBytes(0x07, 0x00, 0x00, 0x00) // Flags: 0x4 executable, 0x2 write, 0x1 read
100 | 	o.WriteValue(8, dataOffset)          // Offset address.
101 | 	o.WriteValue(8, dataVirtualAddress)  // Virtual address.
102 | 	o.WriteValue(8, dataVirtualAddress)  // Physical address.
103 | 	o.WriteValue(8, dataSize)            // Number of bytes in file image.
104 | 	o.WriteValue(8, dataSize)            // Number of bytes in memory image.
105 | 	o.WriteValue(8, alignment)
106 | 
107 | 	// Output the text segment
108 | 	o.WriteBytes(textSection...)
109 | 	// Output the data segment
110 | 	o.WriteBytes(dataSection...)
111 | 	return o.o
112 | }
113 | 


--------------------------------------------------------------------------------
/exit.asm:
--------------------------------------------------------------------------------
1 | ;; Basic exit-code example.
2 |         nop                     ; Nothing happens
3 |         mov rbx,31              ; first syscall argument: exit code
4 |         mov rax,1               ; system call number (sys_exit)
5 |         int 0x80                ; call kernel
6 | 


--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/skx/assembler
2 | 
3 | go 1.14
4 | 


--------------------------------------------------------------------------------
/hello.asm:
--------------------------------------------------------------------------------
 1 |         ;; Output some text to the console.
 2 |         ;;
 3 |         ;; This example demonstrates using sys_write, and sys_exit
 4 |         ;;
 5 |         ;; For less duplication see the code in `call.asm`.
 6 |         ;;
 7 | 
 8 | .hello   DB "Hello, world\n"
 9 | .goodbye DB "Goodbye, world\n"
10 | 
11 |         mov rdx, 13        ;; write this many characters
12 |         mov rcx, hello     ;; starting at the string
13 |         mov rbx, 1         ;; write to STDOUT
14 |         mov rax, 4         ;; sys_write
15 |         int 0x80           ;; syscall
16 | 
17 |         mov rdx, 15        ;; write this many characters
18 |         mov rcx, goodbye   ;; starting at the string
19 |         mov rax, 4         ;; sys_write
20 |         mov rbx, 1         ;; write to STDOUT
21 |         int 0x80           ;; syscall
22 | 
23 |         xor rbx, rbx       ;; exit-code is 0
24 |         mov rax, 0x01      ;; sys_exit
25 |         int 0x80           ;; syscall
26 | 


--------------------------------------------------------------------------------
/instructions/instructions.go:
--------------------------------------------------------------------------------
 1 | // Package instructions contains the common instruction-definitions
 2 | // for the instructions that we understand.
 3 | //
 4 | // These are abstracted here, so that you don't need to touch
 5 | // the parser/lexer to add new instructions.
 6 | //
 7 | // Just add the instructions here, and update the compiler to emit the
 8 | // appropriate code.
 9 | package instructions
10 | 
var (
	// InstructionLengths is a map that returns the number of operands
	// the given assembly-language operation will accept.
	//
	// For example a `nop` instruction requires zero arguments so the
	// entry for that will be `0`.
	InstructionLengths map[string]int

	// Instructions is automatically generated alongside the
	// InstructionLengths map, and contains the known instruction-types
	// we can lex, parse, and compile, in alphabetical order.
	Instructions []string
)

// init populates InstructionLengths and Instructions.
//
// Both are filled from a single alphabetically-sorted table, rather than
// by ranging over the map, so that the order of Instructions is the same
// on every run.
func init() {

	// Every instruction we know about, with its operand-count.
	//
	// Keep this sorted by name when adding new entries.
	known := []struct {
		name     string
		operands int
	}{
		{"add", 2},
		{"call", 1},
		{"clc", 0},
		{"cld", 0},
		{"cli", 0},
		{"cmc", 0},
		{"cmp", 2},
		{"dec", 1},
		{"inc", 1},
		{"int", 1},
		{"je", 1},
		{"jmp", 1},
		{"jne", 1},
		{"jnz", 1},
		{"jz", 1},
		{"mov", 2},
		{"nop", 0},
		{"pop", 1},
		{"push", 1},
		{"ret", 0},
		{"stc", 0},
		{"std", 0},
		{"sti", 0},
		{"sub", 2},
		{"xor", 2},
	}

	// Setup our instruction-lengths, and record the known-instructions.
	InstructionLengths = make(map[string]int, len(known))
	for _, entry := range known {
		InstructionLengths[entry.name] = entry.operands
		Instructions = append(Instructions, entry.name)
	}
}
67 | 


--------------------------------------------------------------------------------
/jmp.asm:
--------------------------------------------------------------------------------
 1 | ;; This is an example of control-flow.
 2 | ;;
 3 | 
 4 | 
 5 | :start
 6 |         jmp foo
 7 | 
 8 | :bar
 9 |         nop          ; Nothing happens
10 |         mov rbx,33   ; first syscall argument: exit code
11 |         mov rax,1    ; system call number (sys_exit)
12 |         int 0x80     ; call kernel
13 | 
14 | :foo
15 |         jmp bar
16 | 


--------------------------------------------------------------------------------
/lexer/lexer.go:
--------------------------------------------------------------------------------
  1 | // Package lexer contains our lexer.
  2 | package lexer
  3 | 
  4 | import (
  5 | 	"errors"
  6 | 	"fmt"
  7 | 	"unicode"
  8 | 
  9 | 	"github.com/skx/assembler/token"
 10 | )
 11 | 
 12 | // Lexer holds our object-state.
 13 | type Lexer struct {
 14 | 	// The current character position
 15 | 	position int
 16 | 
 17 | 	// The next character position
 18 | 	readPosition int
 19 | 
 20 | 	// The current character
 21 | 	ch rune
 22 | 
 23 | 	// A rune slice of our input string
 24 | 	characters []rune
 25 | }
 26 | 
 27 | // New creates a Lexer instance from the given string
 28 | func New(input string) *Lexer {
 29 | 
 30 | 	// Line counting starts at one.
 31 | 	l := &Lexer{characters: []rune(input)}
 32 | 	l.readChar()
 33 | 	return l
 34 | }
 35 | 
 36 | // read forward one character.
 37 | func (l *Lexer) readChar() {
 38 | 	if l.readPosition >= len(l.characters) {
 39 | 		l.ch = rune(0)
 40 | 	} else {
 41 | 		l.ch = l.characters[l.readPosition]
 42 | 	}
 43 | 	l.position = l.readPosition
 44 | 	l.readPosition++
 45 | 
 46 | }
 47 | 
 48 | // NextToken reads and returns the next token, skipping any intervening
 49 | // white space, and swallowing any comments, in the process.
 50 | func (l *Lexer) NextToken() token.Token {
 51 | 	var tok token.Token
 52 | 	l.skipWhitespace()
 53 | 
 54 | 	// skip single-line comments
 55 | 	if l.ch == rune(';') || l.ch == rune('#') {
 56 | 		l.skipComment()
 57 | 		return (l.NextToken())
 58 | 	}
 59 | 
 60 | 	switch l.ch {
 61 | 
 62 | 	case rune(0):
 63 | 		tok.Literal = ""
 64 | 		tok.Type = token.EOF
 65 | 
 66 | 	case rune(':'):
 67 | 		label, err := l.readLabel()
 68 | 		if err != nil {
 69 | 			tok.Literal = err.Error()
 70 | 			tok.Type = token.ILLEGAL
 71 | 		} else {
 72 | 			tok = token.Token{Type: token.LABEL, Literal: label}
 73 | 		}
 74 | 
 75 | 	case rune('.'):
 76 | 		label, err := l.readLabel()
 77 | 		if err != nil {
 78 | 			tok.Literal = err.Error()
 79 | 			tok.Type = token.ILLEGAL
 80 | 		} else {
 81 | 			tok = token.Token{Type: token.DATA, Literal: label}
 82 | 		}
 83 | 
 84 | 	case rune(','):
 85 | 		tok = token.Token{Type: token.COMMA, Literal: ","}
 86 | 
 87 | 	case rune('['):
 88 | 		tok = token.Token{Type: token.LSQUARE, Literal: "["}
 89 | 
 90 | 	case rune(']'):
 91 | 
 92 | 		l.readChar()
 93 | 		return (l.NextToken())
 94 | 
 95 | 	case rune('"'):
 96 | 		str, err := l.readString('"')
 97 | 		if err == nil {
 98 | 			tok.Literal = str
 99 | 			tok.Type = token.STRING
100 | 		} else {
101 | 			tok.Literal = err.Error()
102 | 			tok.Type = token.ILLEGAL
103 | 		}
104 | 
105 | 	default:
106 | 		// Number?
107 | 		if isDigit(l.ch) {
108 | 			tok := l.readDecimal()
109 | 			return tok
110 | 		}
111 | 
112 | 		// Instruction/Register
113 | 		tok.Literal = l.readIdentifier()
114 | 		if len(tok.Literal) > 0 {
115 | 			tok.Type = token.LookupIdentifier(tok.Literal)
116 | 			return tok
117 | 		}
118 | 
119 | 		// Not an instruction/register (+LABEL)
120 | 		tok.Type = token.IDENTIFIER
121 | 		return tok
122 | 
123 | 	}
124 | 
125 | 	l.readChar()
126 | 
127 | 	return tok
128 | }
129 | 
130 | // readIdentifier is designed to read an identifier (name of variable,
131 | // function, etc).
132 | func (l *Lexer) readIdentifier() string {
133 | 
134 | 	id := ""
135 | 
136 | 	for isIdentifier(l.ch) {
137 | 		id += string(l.ch)
138 | 		l.readChar()
139 | 	}
140 | 	return id
141 | }
142 | 
143 | // skip over any white space.
144 | func (l *Lexer) skipWhitespace() {
145 | 	for isWhitespace(l.ch) {
146 | 		l.readChar()
147 | 	}
148 | }
149 | 
150 | // skip a comment (until the end of the line).
151 | func (l *Lexer) skipComment() {
152 | 	for l.ch != '\n' && l.ch != rune(0) {
153 | 		l.readChar()
154 | 	}
155 | 	l.skipWhitespace()
156 | }
157 | 
158 | // read a number.  We only care about numerical digits here, floats will
159 | // be handled elsewhere.
160 | func (l *Lexer) readNumber() string {
161 | 
162 | 	id := ""
163 | 
164 | 	for isDigit(l.ch) || l.ch == rune('x') {
165 | 		id += string(l.ch)
166 | 		l.readChar()
167 | 	}
168 | 	return id
169 | }
170 | 
171 | // read a decimal number, either int or floating-point.
172 | func (l *Lexer) readDecimal() token.Token {
173 | 
174 | 	//
175 | 	// Read an integer-number.
176 | 	//
177 | 	integer := l.readNumber()
178 | 
179 | 	//
180 | 	// Just an integer.
181 | 	//
182 | 	return token.Token{Type: token.NUMBER, Literal: integer}
183 | }
184 | 
185 | // read a string, deliminated by the given character.
186 | func (l *Lexer) readString(delim rune) (string, error) {
187 | 	out := ""
188 | 
189 | 	for {
190 | 		l.readChar()
191 | 
192 | 		if l.ch == rune(0) {
193 | 			return "", fmt.Errorf("unterminated string")
194 | 		}
195 | 		if l.ch == delim {
196 | 			break
197 | 		}
198 | 		//
199 | 		// Handle \n, \r, \t, \", etc.
200 | 		//
201 | 		if l.ch == '\\' {
202 | 
203 | 			// Line ending with "\" + newline
204 | 			if l.peekChar() == '\n' {
205 | 				// consume the newline.
206 | 				l.readChar()
207 | 				continue
208 | 			}
209 | 
210 | 			l.readChar()
211 | 
212 | 			if l.ch == rune(0) {
213 | 				return "", errors.New("unterminated string")
214 | 			}
215 | 			if l.ch == rune('n') {
216 | 				l.ch = '\n'
217 | 			}
218 | 			if l.ch == rune('0') {
219 | 				l.ch = rune(0)
220 | 			}
221 | 			if l.ch == rune('r') {
222 | 				l.ch = '\r'
223 | 			}
224 | 			if l.ch == rune('t') {
225 | 				l.ch = '\t'
226 | 			}
227 | 			if l.ch == rune('"') {
228 | 				l.ch = '"'
229 | 			}
230 | 			if l.ch == rune('\\') {
231 | 				l.ch = '\\'
232 | 			}
233 | 		}
234 | 		out = out + string(l.ch)
235 | 	}
236 | 
237 | 	return out, nil
238 | }
239 | 
240 | // read a label
241 | func (l *Lexer) readLabel() (string, error) {
242 | 	out := ""
243 | 
244 | 	for {
245 | 		l.readChar()
246 | 
247 | 		if l.ch == rune(0) {
248 | 			if len(out) > 1 {
249 | 				return out, nil
250 | 			}
251 | 			return "", fmt.Errorf("unterminated label")
252 | 		}
253 | 		if isWhitespace(l.ch) {
254 | 			return out, nil
255 | 		}
256 | 		out = out + string(l.ch)
257 | 	}
258 | }
259 | 
// isIdentifier reports whether the given character may appear within
// an identifier.
//
// Letters and digits are permitted, as are `$`, `_` and `-`.
func isIdentifier(ch rune) bool {
	switch ch {
	case '$', '_', '-':
		return true
	}
	return unicode.IsLetter(ch) || unicode.IsDigit(ch)
}
269 | 
// isWhitespace reports whether the given character is a space, tab,
// newline or carriage-return.
func isWhitespace(ch rune) bool {
	switch ch {
	case ' ', '\t', '\n', '\r':
		return true
	default:
		return false
	}
}
274 | 
// isDigit reports whether the given character is one of the ASCII
// digits, `0` to `9`.
func isDigit(ch rune) bool {
	return ch >= '0' && ch <= '9'
}
279 | 
280 | // peek character
281 | func (l *Lexer) peekChar() rune {
282 | 	if l.readPosition >= len(l.characters) {
283 | 		return rune(0)
284 | 	}
285 | 	return l.characters[l.readPosition]
286 | }
287 | 


--------------------------------------------------------------------------------
/lexer/lexer_test.go:
--------------------------------------------------------------------------------
  1 | package lexer
  2 | 
  3 | import (
  4 | 	"testing"
  5 | 
  6 | 	"github.com/skx/assembler/token"
  7 | )
  8 | 
  9 | func TestComment(t *testing.T) {
 10 | 
 11 | 	n := New(`; This is a comment
 12 | # So is this`)
 13 | 
 14 | 	tok := n.NextToken()
 15 | 	if tok.Type != token.EOF {
 16 | 		t.Errorf("expected end of file")
 17 | 	}
 18 | }
 19 | 
 20 | func TestData(t *testing.T) {
 21 | 
 22 | 	input := `.foo
 23 | .`
 24 | 
 25 | 	tests := []struct {
 26 | 		expectedType    token.Type
 27 | 		expectedLiteral string
 28 | 	}{
 29 | 		{token.DATA, "foo"},
 30 | 		{token.ILLEGAL, "unterminated label"},
 31 | 		{token.EOF, ""},
 32 | 	}
 33 | 
 34 | 	l := New(input)
 35 | 	for i, tt := range tests {
 36 | 		tok := l.NextToken()
 37 | 		if tok.Type != tt.expectedType {
 38 | 			t.Fatalf("tests[%d] - tokentype wrong, expected=%q, got=%q", i, tt.expectedType, tok.Type)
 39 | 		}
 40 | 		if tok.Literal != tt.expectedLiteral {
 41 | 			t.Fatalf("tests[%d] - Literal wrong, expected=%q, got=%q", i, tt.expectedLiteral, tok.Literal)
 42 | 		}
 43 | 	}
 44 | 
 45 | }
 46 | 
 47 | func TestMov(t *testing.T) {
 48 | 
 49 | 	input := `
 50 | ;; Two move instructions
 51 | mov rax, rcx
 52 | mov rbx, 33
 53 | `
 54 | 
 55 | 	tests := []struct {
 56 | 		expectedType    token.Type
 57 | 		expectedLiteral string
 58 | 	}{
 59 | 		{token.INSTRUCTION, "mov"},
 60 | 		{token.REGISTER, "rax"},
 61 | 		{token.COMMA, ","},
 62 | 		{token.REGISTER, "rcx"},
 63 | 
 64 | 		{token.INSTRUCTION, "mov"},
 65 | 		{token.REGISTER, "rbx"},
 66 | 		{token.COMMA, ","},
 67 | 		{token.NUMBER, "33"},
 68 | 		{token.EOF, ""},
 69 | 	}
 70 | 
 71 | 	l := New(input)
 72 | 	for i, tt := range tests {
 73 | 		tok := l.NextToken()
 74 | 		if tok.Type != tt.expectedType {
 75 | 			t.Fatalf("tests[%d] - tokentype wrong, expected=%q, got=%q", i, tt.expectedType, tok.Type)
 76 | 		}
 77 | 		if tok.Literal != tt.expectedLiteral {
 78 | 			t.Fatalf("tests[%d] - Literal wrong, expected=%q, got=%q", i, tt.expectedLiteral, tok.Literal)
 79 | 		}
 80 | 	}
 81 | 
 82 | }
 83 | 
 84 | func TestLabel(t *testing.T) {
 85 | 
 86 | 	input := `
 87 | :name
 88 | :`
 89 | 
 90 | 	tests := []struct {
 91 | 		expectedType    token.Type
 92 | 		expectedLiteral string
 93 | 	}{
 94 | 		{token.LABEL, "name"},
 95 | 		{token.ILLEGAL, "unterminated label"},
 96 | 
 97 | 		{token.EOF, ""},
 98 | 	}
 99 | 
100 | 	l := New(input)
101 | 	for i, tt := range tests {
102 | 		tok := l.NextToken()
103 | 		if tok.Type != tt.expectedType {
104 | 			t.Fatalf("tests[%d] - tokentype wrong, expected=%q, got=%q", i, tt.expectedType, tok.Type)
105 | 		}
106 | 		if tok.Literal != tt.expectedLiteral {
107 | 			t.Fatalf("tests[%d] - Literal wrong, expected=%q, got=%q", i, tt.expectedLiteral, tok.Literal)
108 | 		}
109 | 	}
110 | }
111 | 
112 | func TestString(t *testing.T) {
113 | 
114 | 	input := `
115 | .foo DB "Steve\r\n\t\"\\"
116 | .test DB "steve\
117 |  kemp"
118 | .bar DB "Open\`
119 | 
120 | 	tests := []struct {
121 | 		expectedType    token.Type
122 | 		expectedLiteral string
123 | 	}{
124 | 		{token.DATA, "foo"},
125 | 		{token.DB, "DB"},
126 | 		{token.STRING, "Steve\r\n\t\"\\"},
127 | 
128 | 		{token.DATA, "test"},
129 | 		{token.DB, "DB"},
130 | 		{token.STRING, "steve kemp"},
131 | 
132 | 		{token.DATA, "bar"},
133 | 		{token.DB, "DB"},
134 | 		{token.ILLEGAL, "unterminated string"},
135 | 
136 | 		{token.EOF, ""},
137 | 	}
138 | 
139 | 	l := New(input)
140 | 	for i, tt := range tests {
141 | 		tok := l.NextToken()
142 | 		if tok.Type != tt.expectedType {
143 | 			t.Fatalf("tests[%d] - tokentype wrong, expected=%q, got=%q", i, tt.expectedType, tok.Type)
144 | 		}
145 | 		if tok.Literal != tt.expectedLiteral {
146 | 			t.Fatalf("tests[%d] - Literal wrong, expected=%q, got=%q", i, tt.expectedLiteral, tok.Literal)
147 | 		}
148 | 	}
149 | 
150 | }
151 | 
152 | func TestBrackets(t *testing.T) {
153 | 
154 | 	// Note "[" is emitted as you expect, but "]" is swallowed.
155 | 	input := `mov eax, [eax]`
156 | 
157 | 	tests := []struct {
158 | 		expectedType    token.Type
159 | 		expectedLiteral string
160 | 	}{
161 | 		{token.INSTRUCTION, "mov"},
162 | 		{token.IDENTIFIER, "eax"},
163 | 		{token.COMMA, ","},
164 | 		{token.LSQUARE, "["},
165 | 		{token.IDENTIFIER, "eax"},
166 | 		{token.EOF, ""},
167 | 	}
168 | 
169 | 	l := New(input)
170 | 	for i, tt := range tests {
171 | 		tok := l.NextToken()
172 | 		if tok.Type != tt.expectedType {
173 | 			t.Fatalf("tests[%d] - tokentype wrong, expected=%q, got=%q", i, tt.expectedType, tok.Type)
174 | 		}
175 | 		if tok.Literal != tt.expectedLiteral {
176 | 			t.Fatalf("tests[%d] - Literal wrong, expected=%q, got=%q", i, tt.expectedLiteral, tok.Literal)
177 | 		}
178 | 	}
179 | 
180 | }
181 | 


--------------------------------------------------------------------------------
/parser/ast.go:
--------------------------------------------------------------------------------
  1 | package parser
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 
  6 | 	"github.com/skx/assembler/token"
  7 | )
  8 | 
  9 | // Node is something we return from our parser.
 10 | type Node interface {
 11 | 	// Output this as a readable string
 12 | 	String() string
 13 | }
 14 | 
 15 | // Error contains an error-message
 16 | type Error struct {
 17 | 	Node
 18 | 	Value string
 19 | }
 20 | 
 21 | // String outputs this Error structure as a string.
 22 | func (e Error) String() string {
 23 | 	return fmt.Sprintf("<ERROR:%s>", e.Value)
 24 | }
 25 | 
 26 | // Data holds a data-statement, which might look like either of these:
 27 | //
 28 | //   .foo DB "Steve"
 29 | //   .bar DB 0x030, 0x40, 0x90
 30 | //
 31 | type Data struct {
 32 | 	Node
 33 | 
 34 | 	// Name is the name of the data-section
 35 | 	Name string
 36 | 
 37 | 	// Contents holds the string/byte data for the reference
 38 | 	Contents []byte
 39 | }
 40 | 
 41 | // String outputs this Data structure as a string.
 42 | func (d Data) String() string {
 43 | 	return fmt.Sprintf("<DATA: name:%s data:%v>", d.Name, d.Contents)
 44 | }
 45 | 
// Operand is used to hold the operand for an instruction.
//
// Some instructions have zero operands (e.g. `nop`), others have
// one (e.g. `inc rax`), and finally we have several which take two
// operands (e.g. `mov rax, rbx`).
//
type Operand struct {
	// Token contains our parent token.
	token.Token

	// If we're operating upon memory-addresses we need to be
	// able to understand the size of the thing we're operating
	// upon.
	//
	// For example `inc byte ptr [rax]` will increment a byte,
	// or 8 bits.  We have different defined sizes available to us:
	//
	//   byte -> 8 bits.
	//   word -> 16 bits.
	//  dword -> 32 bits.
	//  qword -> 64 bits.
	//
	// Size is left at zero when no size was given.
	Size int

	// Is indirection used?
	//
	// i.e. `rax` has no indirection, but `[rax]` does.
	Indirection bool
}
 74 | 
 75 | // Instruction holds a parsed instruction.
 76 | //
 77 | // For example "mov rax, rax".
 78 | //
 79 | type Instruction struct {
 80 | 	Node
 81 | 
 82 | 	// Instruction holds the instruction we've found, as a string.
 83 | 	Instruction string
 84 | 
 85 | 	// Operands holds the operands for this instruction.
 86 | 	//
 87 | 	// Operands will include numbers, registers, and indrected registers.
 88 | 	Operands []Operand
 89 | }
 90 | 
 91 | // String outputs this Error structure as a string
 92 | func (d Instruction) String() string {
 93 | 	return fmt.Sprintf("<INSTRUCTION: %s args:%v>", d.Instruction, d.Operands)
 94 | }
 95 | 
 96 | // Label holds a label, as seen when it is defined.
 97 | //
 98 | // For example ":foo" will define a label with name "foo".
 99 | type Label struct {
100 | 	Node
101 | 
102 | 	// Name has the name of the instruction
103 | 	Name string
104 | }
105 | 
106 | // String outputs this Label structure as a string.
107 | func (l Label) String() string {
108 | 	return fmt.Sprintf("<LABEL: %s>", l.Name)
109 | }
110 | 


--------------------------------------------------------------------------------
/parser/parser.go:
--------------------------------------------------------------------------------
  1 | // Package parser consumes tokens from the lexer, and generates the AST
  2 | // which is then walked to generate binary code.
  3 | package parser
  4 | 
  5 | import (
  6 | 	"fmt"
  7 | 	"strconv"
  8 | 
  9 | 	"github.com/skx/assembler/instructions"
 10 | 	"github.com/skx/assembler/lexer"
 11 | 	"github.com/skx/assembler/token"
 12 | )
 13 | 
 14 | // Parser holds our state.
 15 | type Parser struct {
 16 | 	// program holds our lexed program, as a series of tokens.
 17 | 	program []token.Token
 18 | 
 19 | 	// position holds our current offset within the program
 20 | 	// above.
 21 | 	position int
 22 | }
 23 | 
 24 | // New creates a new Parser, which will parse the specified
 25 | // input program into a series of tokens, and then allow it
 26 | // to be parsed.
 27 | func New(input string) *Parser {
 28 | 
 29 | 	// Create our parser
 30 | 	p := &Parser{}
 31 | 
 32 | 	// Create the lexer object.
 33 | 	l := lexer.New(input)
 34 | 
 35 | 	// Parse our program into a series of tokens
 36 | 	tok := l.NextToken()
 37 | 	for tok.Type != token.EOF {
 38 | 		p.program = append(p.program, tok)
 39 | 		tok = l.NextToken()
 40 | 	}
 41 | 
 42 | 	// Now we have a parser complete with a series of tokens
 43 | 	return p
 44 | 
 45 | }
 46 | 
 47 | // Next returns the stream of parsed "things" from the input source program.
 48 | //
 49 | // The things we return include:
 50 | //
 51 | //  * Instructions.
 52 | //  * Label definitions.
 53 | //  * Data references.
 54 | //
 55 | // There might be more things in the future.
 56 | func (p *Parser) Next() Node {
 57 | 
 58 | 	// Loop until we've exhausted our input.
 59 | 	for p.position < len(p.program) {
 60 | 
 61 | 		// The token we're operating upon
 62 | 		tok := p.program[p.position]
 63 | 
 64 | 		switch tok.Type {
 65 | 
 66 | 		case token.DATA:
 67 | 			return p.parseData()
 68 | 
 69 | 		case token.INSTRUCTION:
 70 | 			return p.parseInstruction()
 71 | 
 72 | 		case token.LABEL:
 73 | 			return p.parseLabel()
 74 | 
 75 | 		case token.RSQUARE:
 76 | 			p.position++
 77 | 
 78 | 		default:
 79 | 			fmt.Printf("Unhandled thing - definite bug: %v\n", tok)
 80 | 		}
 81 | 	}
 82 | 
 83 | 	return nil
 84 | }
 85 | 
 86 | // parseData handles input of the form:
 87 | //
 88 | //  .NAME DB "String content here"
 89 | //
 90 | // TODO:
 91 | //
 92 | //  .NAME DB 0x01, 0x02, 0x03 ...
 93 | func (p *Parser) parseData() Node {
 94 | 
 95 | 	// create the data-structure, with the name.
 96 | 	d := Data{Name: p.program[p.position].Literal}
 97 | 
 98 | 	// skip the DATA
 99 | 	p.position++
100 | 
101 | 	// ensure we're not out of the program
102 | 	if p.position >= len(p.program) {
103 | 		return Error{Value: "Unexpected EOF parsing data"}
104 | 	}
105 | 
106 | 	// Next token should be DB
107 | 	db := p.program[p.position]
108 | 	if db.Type != token.DB {
109 | 		return Error{Value: fmt.Sprintf("expected DB, got %v", db)}
110 | 	}
111 | 
112 | 	// move forward
113 | 	p.position++
114 | 	if p.position >= len(p.program) {
115 | 		return Error{Value: "Unexpected EOF parsing data"}
116 | 	}
117 | 
118 | 	//
119 | 	// We support:
120 | 	//   .foo DB "String"
121 | 	//
122 | 	// Or
123 | 	//   .foo DB 0x03, 0x4...
124 | 	//
125 | 	// If the next token is a string handle that.
126 | 	cur := p.program[p.position]
127 | 	if cur.Type == token.STRING {
128 | 		// bump past the string
129 | 		p.position++
130 | 
131 | 		d.Contents = []byte(cur.Literal)
132 | 		return d
133 | 	}
134 | 
135 | 	// If the type isn't a number that's an error
136 | 	if cur.Type != token.NUMBER {
137 | 		return Error{Value: fmt.Sprintf("expected string|number-array, got %v", cur)}
138 | 	}
139 | 
140 | 	// OK so we've got number
141 | 	for cur.Type == token.NUMBER {
142 | 
143 | 		// Parse it
144 | 		num, err := strconv.ParseInt(cur.Literal, 0, 64)
145 | 		if err != nil {
146 | 			return Error{Value: fmt.Sprintf("failed to convert '%s' to number:%s", cur.Literal, err)}
147 | 		}
148 | 
149 | 		// Add to the array
150 | 		d.Contents = append(d.Contents, byte(num))
151 | 
152 | 		// skip past the number
153 | 		p.position++
154 | 
155 | 		// end of program?
156 | 		if p.position >= len(p.program) {
157 | 			break
158 | 		}
159 | 
160 | 		// if the next token is not a comma then we're done
161 | 		if p.program[p.position].Type != token.COMMA {
162 | 			break
163 | 		}
164 | 
165 | 		// Otherwise skip over the comma
166 | 		p.position++
167 | 
168 | 		// end of program?
169 | 		if p.position >= len(p.program) {
170 | 			break
171 | 		}
172 | 
173 | 		cur = p.program[p.position]
174 | 	}
175 | 
176 | 	return d
177 | }
178 | 
179 | // parseInstruction is our workhorse
180 | //
181 | // We either return an `Instruction` or an `Error`
182 | //
183 | func (p *Parser) parseInstruction() Node {
184 | 
185 | 	// Get the current instruction
186 | 	tok := p.program[p.position]
187 | 
188 | 	// Find out how many arguments it has
189 | 	count, ok := instructions.InstructionLengths[tok.Literal]
190 | 
191 | 	// If that failed then it is an unknown instruction, probably
192 | 	if !ok {
193 | 		return Error{Value: fmt.Sprintf("unknown instructoin %v", tok)}
194 | 	}
195 | 
196 | 	// No args?  Just return the instruction and bump the position
197 | 	if count == 0 {
198 | 		p.position++
199 | 		return Instruction{Instruction: tok.Literal}
200 | 	}
201 | 
202 | 	if count == 1 {
203 | 		args, err := p.TakeOneArgument()
204 | 		if err != nil {
205 | 			return Error{Value: err.Error()}
206 | 
207 | 		}
208 | 
209 | 		return Instruction{Instruction: tok.Literal, Operands: args}
210 | 	}
211 | 	if count == 2 {
212 | 
213 | 		args, err := p.TakeTwoArguments()
214 | 		if err != nil {
215 | 			return Error{Value: err.Error()}
216 | 
217 | 		}
218 | 		return Instruction{Instruction: tok.Literal, Operands: args}
219 | 	}
220 | 
221 | 	return Error{Value: fmt.Sprintf("unhandled argument-count for token %v", tok)}
222 | }
223 | 
224 | // parseLabel handles input of the form:
225 | //
226 | //  :foo
227 | func (p *Parser) parseLabel() Node {
228 | 
229 | 	// create the label-structure, with the name.
230 | 	l := Label{Name: p.program[p.position].Literal}
231 | 
232 | 	// skip the label itself
233 | 	p.position++
234 | 
235 | 	return l
236 | }
237 | 
238 | // TakeTwoArguments handles fetching two arguments for an instruction.
239 | //
240 | // Arguments may be register-names, numbers, or label-values
241 | func (p *Parser) TakeTwoArguments() ([]Operand, error) {
242 | 
243 | 	var toks []Operand
244 | 
245 | 	// Get the first argument
246 | 	one, err := p.getOperand()
247 | 	if err != nil {
248 | 		return toks, err
249 | 	}
250 | 	toks = append(toks, one)
251 | 
252 | 	// see if we have a comma
253 | 	c := p.program[p.position]
254 | 	if c.Type != token.COMMA {
255 | 		return toks, fmt.Errorf("expected ',', got %v", c)
256 | 	}
257 | 
258 | 	// Get the second argument
259 | 	two, err := p.getOperand()
260 | 	if err != nil {
261 | 		return toks, err
262 | 	}
263 | 	toks = append(toks, two)
264 | 
265 | 	return toks, nil
266 | }
267 | 
268 | // TakeOneArgument reads the argument for a single-arg instruction.
269 | //
270 | // Arguments may be a register-name, number, or a label-value.
271 | func (p *Parser) TakeOneArgument() ([]Operand, error) {
272 | 
273 | 	var toks []Operand
274 | 
275 | 	// Get the argument
276 | 	one, err := p.getOperand()
277 | 
278 | 	if err != nil {
279 | 		return toks, err
280 | 	}
281 | 	toks = append(toks, one)
282 | 
283 | 	return toks, nil
284 | }
285 | 
286 | func (p *Parser) getOperand() (Operand, error) {
287 | 
288 | 	var op Operand
289 | 
290 | 	// Skip over the instruction, because we want the arg
291 | 	p.position++
292 | 	if p.position >= len(p.program) {
293 | 		return op, fmt.Errorf("unexpected EOF")
294 | 	}
295 | 
296 | 	// Get the argument
297 | 	thing := p.program[p.position]
298 | 
299 | 	if thing.Type == token.REGISTER ||
300 | 		thing.Type == token.NUMBER {
301 | 		op.Token = thing
302 | 		p.position++
303 | 		return op, nil
304 | 	}
305 | 
306 | 	// Could be "identifer", could be "byte|word|qword ptr"
307 | 	if thing.Literal != "byte" &&
308 | 		thing.Literal != "word" &&
309 | 		thing.Literal != "dword" &&
310 | 		thing.Literal != "qword" {
311 | 		op.Token = thing
312 | 		p.position++
313 | 		return op, nil
314 | 	}
315 | 
316 | 	// OK indirection.  probably
317 | 	if thing.Literal == "byte" {
318 | 		op.Size = 8
319 | 	}
320 | 	if thing.Literal == "word" {
321 | 		op.Size = 16
322 | 	}
323 | 	if thing.Literal == "dword" {
324 | 		op.Size = 32
325 | 	}
326 | 	if thing.Literal == "qword" {
327 | 		op.Size = 64
328 | 	}
329 | 
330 | 	// So the next token must be "ptr"
331 | 	p.position++
332 | 	if p.position >= len(p.program) {
333 | 		return op, fmt.Errorf("unexpected EOF #2")
334 | 	}
335 | 
336 | 	// Get the next arg
337 | 	next := p.program[p.position]
338 | 	if next.Type != token.IDENTIFIER || next.Literal != "ptr" {
339 | 		return op, fmt.Errorf("expected ptr after %s", thing.Literal)
340 | 	}
341 | 	p.position++
342 | 
343 | 	if p.program[p.position].Type == token.LSQUARE {
344 | 		op.Indirection = true
345 | 
346 | 		// skip the [
347 | 		p.position++
348 | 
349 | 		// get the register + skip it
350 | 		op.Token = p.program[p.position]
351 | 		p.position++
352 | 
353 | 	} else {
354 | 		p.position++
355 | 		op.Token = p.program[p.position]
356 | 		p.position++
357 | 	}
358 | 	return op, nil
359 | 
360 | }
361 | 


--------------------------------------------------------------------------------
/parser/parser_test.go:
--------------------------------------------------------------------------------
 1 | package parser
 2 | 
 3 | import (
 4 | 	"testing"
 5 | )
 6 | 
 7 | func TestComment(t *testing.T) {
 8 | 
 9 | 	p := New(";; This is a test")
10 | 
11 | 	out := p.Next()
12 | 	if out != nil {
13 | 		t.Fatalf("Failed to skip comment")
14 | 	}
15 | }
16 | 
17 | func TestData(t *testing.T) {
18 | 
19 | 	type TestCase struct {
20 | 		Input string
21 | 		Data  []byte
22 | 	}
23 | 
24 | 	tests := []TestCase{
25 | 		TestCase{Input: ".data DB \"Steve\"",
26 | 			Data: []byte{83, 116, 101, 118, 101},
27 | 		},
28 | 		TestCase{Input: ".foo DB 1\n.bar DB 3,3",
29 | 			Data: []byte{1},
30 | 		},
31 | 		TestCase{Input: ".foo DB 32, 44",
32 | 			Data: []byte{32, 44},
33 | 		},
34 | 		TestCase{Input: ".foo DB 32, ",
35 | 			Data: []byte{32},
36 | 		},
37 | 	}
38 | 
39 | 	// For each test
40 | 	for _, test := range tests {
41 | 
42 | 		// Parse
43 | 		p := New(test.Input)
44 | 
45 | 		// We expect a single data-statement
46 | 		out := p.Next()
47 | 		if out == nil {
48 | 			t.Fatalf("nil result from pasing %s", test.Input)
49 | 		}
50 | 
51 | 		// Cast to the right value
52 | 		d, ok := out.(Data)
53 | 		if !ok {
54 | 			t.Fatalf("didn't get an Data structure: %v", out)
55 | 		}
56 | 
57 | 		// Length matches?
58 | 		if len(d.Contents) != len(test.Data) {
59 | 			t.Fatalf("data length didn't match expectation")
60 | 		}
61 | 
62 | 		// Content matches?
63 | 		for i, x := range d.Contents {
64 | 			if test.Data[i] != x {
65 | 				t.Fatalf("data mismatch at offset %d", i)
66 | 			}
67 | 		}
68 | 	}
69 | }
70 | 
71 | func TestMove(t *testing.T) {
72 | 
73 | 	p := New("mov rax, rbx")
74 | 
75 | 	out := p.Next()
76 | 
77 | 	outI, ok := out.(Instruction)
78 | 	if !ok {
79 | 		t.Fatalf("didn't get an instruction structure")
80 | 	}
81 | 
82 | 	if len(outI.Operands) != 2 {
83 | 		t.Fatalf("mov - wrong arg count")
84 | 	}
85 | 	if outI.Operands[0].Literal != "rax" {
86 | 		t.Fatalf("mov - wrong first arg")
87 | 	}
88 | 	if outI.Operands[1].Literal != "rbx" {
89 | 		t.Fatalf("mov - wrong second arg")
90 | 	}
91 | }
92 | 


--------------------------------------------------------------------------------
/test.asm:
--------------------------------------------------------------------------------
 1 | ;;
 2 | ;;  So this is a simple example assembly program
 3 | ;;
 4 | ;;  It calls `int 0x80` with rax set to 0x01
 5 | ;;
 6 | ;;       mov eax,1        ; system call number (sys_exit)
 7 | ;;       mov ebx, 0xNN    ; return code
 8 | ;;       int 0x80         ; syscall
 9 | ;;
10 | 
11 | 
12 |         nop                     ; Comment goes here too, if we like.
13 | 
14 | ;;
15 | ;;  mov eax, 0x01 works
16 | ;;
17 | ;;  However we can test our assembly by setting the register to zero,
18 | ;; then incrementing it.
19 | ;;
20 |         xor rax, rax
21 |         inc rax
22 | 
23 | ;;
 24 | ;; The exit-code will be stored in rbx.
25 | ;;
26 | ;; We could set `mov rbx, 0x42`, however it is another test  of our handling
27 | ;; to allow some maths to be carried out
28 | ;;
29 | 	mov rbx, 0x0000
30 | 	mov rcx, 0x0007
31 | 	add rbx, rcx
32 | 
33 | 	mov rcx, 0x0002
34 | 	add rbx, rcx
35 | 
36 | ;;
37 | ;; So we've said :
38 | ;;
39 | ;;    rbx = 0
40 | ;;    rbx += 7
41 | ;;    rbx += 2
42 | ;;
43 | ;; -> rbx thus contains 9.
44 | ;;
45 | ;; Now call the kernel.
46 | ;;
47 |         int 0x80
48 | 


--------------------------------------------------------------------------------
/token/token.go:
--------------------------------------------------------------------------------
 1 | // Package token contains identifiers for the various things
 2 | // we find in our source-scripts.
 3 | //
 4 | // Our lexer will convert an input-script into a series of tokens,
 5 | // which will then be further-processed.
 6 | package token
 7 | 
 8 | import "github.com/skx/assembler/instructions"
 9 | 
// Type is the kind of a token.  It is a string, and the values used for
// it are the constants declared in this package (COMMA, REGISTER,
// NUMBER, and so on).
type Type string
12 | 
// Token represents a single lexer token: the kind of thing it is,
// together with its literal text.
type Token struct {

	// Type contains the type of the token.
	Type Type

	// Literal contains the literal text of the token.
	Literal string
}
22 | 
// Our known token-types.
//
// These are untyped string constants, so they can be used wherever a
// Type is expected as well as wherever a plain string is.
const (
	// Basic things.
	//
	// REGISTER, INSTRUCTION and IDENTIFIER are among the values which
	// LookupIdentifier returns.
	COMMA       = ","
	LSQUARE     = "["
	RSQUARE     = "]"
	EOF         = "EOF"
	LABEL       = "LABEL"
	DATA        = "DATA"
	REGISTER    = "REGISTER"
	INSTRUCTION = "INSTRUCTION"
	IDENTIFIER  = "IDENTIFIER"

	// Data statement
	DB = "DB"

	// Number as operand
	NUMBER = "NUMBER"

	// String for DB
	STRING = "STRING"

	// Something we couldn't handle
	ILLEGAL = "ILLEGAL"
)
48 | 
// known maps the literal text of the words we give special treatment to
// onto their token-type.
//
// It holds the DB keyword, in upper- and lower-case, plus the names of
// the registers we recognise.  Instructions are not listed here, as
// LookupIdentifier finds those via instructions.Instructions instead.
var known = map[string]Type{
	"DB": DB,
	"db": DB,

	// Things we parse as registers
	"rax": REGISTER,
	"rbx": REGISTER,
	"rcx": REGISTER,
	"rdx": REGISTER,
	"rbp": REGISTER,
	"rsp": REGISTER,
	"rsi": REGISTER,
	"rdi": REGISTER,
	"r8":  REGISTER,
	"r9":  REGISTER,
	"r10": REGISTER,
	"r11": REGISTER,
	"r12": REGISTER,
	"r13": REGISTER,
	"r14": REGISTER,
	"r15": REGISTER,
}
72 | 
73 | // LookupIdentifier used to determinate whether identifier is keyword nor not
74 | func LookupIdentifier(identifier string) Type {
75 | 
76 | 	// Is this an instruction
77 | 	for _, ins := range instructions.Instructions {
78 | 		if identifier == ins {
79 | 			return INSTRUCTION
80 | 		}
81 | 	}
82 | 
83 | 	if tok, ok := known[identifier]; ok {
84 | 		return tok
85 | 	}
86 | 	return IDENTIFIER
87 | }
88 | 


--------------------------------------------------------------------------------
/token/token_test.go:
--------------------------------------------------------------------------------
 1 | package token
 2 | 
 3 | import (
 4 | 	"testing"
 5 | )
 6 | 
 7 | // Test looking up values succeeds, then fails
 8 | func TestLookup(t *testing.T) {
 9 | 
10 | 	for key, val := range known {
11 | 
12 | 		// Obviously this will pass.
13 | 		if LookupIdentifier(key) != val {
14 | 			t.Errorf("Lookup of %s failed", key)
15 | 		}
16 | 
17 | 		// Once the keywords are "doubled" they'll no longer
18 | 		// match - so we find them as identifiers.
19 | 		if LookupIdentifier(key+key) != IDENTIFIER {
20 | 			t.Errorf("Lookup of %s failed", key)
21 | 		}
22 | 	}
23 | }
24 | 


--------------------------------------------------------------------------------