├── .gitignore
├── LICENSE
├── README.md
├── build.zig
├── src
    ├── data.zig
    ├── main.zig
    └── tokenize.zig
└── test
    ├── aarch64v8-linux
        └── hello.s
    └── x86_64-linux
        └── hello.s


/.gitignore:
--------------------------------------------------------------------------------
1 | zig-cache
2 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (Expat)
 2 | 
 3 | Copyright (c) 2019 Andrew Kelley
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # zasm
 2 | 
 3 | Multi-target assembler, disassembler, and linker.
 4 | 
 5 | This is my experimental playground for a non-LLVM Zig backend.
 6 | 
 7 | ## Status
 8 | 
 9 | This project has only just begun. There is nothing to see here yet.
10 | 
11 | ## Planned Targets
12 | 
13 | ### Architectures
14 | 
15 | All of them.
16 | 
17 | No matter how insignificant the architecture, provided it has reached some kind
18 | of stable release that includes a specification, it is in scope.
19 | 
20 | ### Object File Formats
21 | 
22 |  * ELF
23 |  * COFF
24 |  * MACH-O
25 |  * WebAssembly
26 | 
27 | ### Executable File Formats
28 | 
29 |  * ELF
30 |  * PE (Portable Executable) (Windows)
31 |  * WebAssembly
32 | 
33 | ### Debug Info Formats
34 | 
35 |  * DWARF
36 |  * PDB
37 | 
38 | ## Roadmap
39 | 
40 |  * Hello world aarch64 assembly
41 |  * Hello world x86_64 assembly split across 2 files
42 |  * Hello world i386 assembly
43 |  * Tests
44 |  * Symbol table
45 |  * DWARF Debug Info
46 |  * Support more instructions
47 |  * Build objects
48 |  * Link objects
49 |  * Incremental linking
50 | 


--------------------------------------------------------------------------------
/build.zig:
--------------------------------------------------------------------------------
 1 | const Builder = @import("std").build.Builder;
 2 | 
 3 | pub fn build(b: *Builder) void {
 4 |     const mode = b.standardReleaseOptions();
 5 |     const exe = b.addExecutable("zasm", "src/main.zig");
 6 |     exe.setBuildMode(mode);
 7 |     exe.install();
 8 | 
 9 |     const test_step = b.addTest("src/main.zig");
10 |     const test_cmd = b.step("test", "Run the tests");
11 |     test_cmd.dependOn(&test_step.step);
12 | }
13 | 


--------------------------------------------------------------------------------
/src/data.zig:
--------------------------------------------------------------------------------
 1 | pub const Instruction = struct {
 2 |     /// Primary opcode
 3 |     po: u8,
 4 |     prefix: ?u8 = null,
 5 |     suffix: ?u8 = null,
 6 |     name: []const u8,
 7 |     args: []const Arg,
 8 |     size: u64,
 9 | };
10 | 
11 | pub const Register = enum {
12 |     eax,
13 |     edi,
14 |     edx,
15 |     esi,
16 | };
17 | 
18 | pub const Arg = union(enum) {
19 |     register: Register,
20 |     immediate,
21 | };
22 | 
23 | pub const instructions = [_]Instruction{
24 |     .{
25 |         .name = "mov",
26 |         .po = 0xb8,
27 |         .args = &[_]Arg{
28 |             .{ .register = .eax },
29 |             .immediate,
30 |         },
31 |         .size = 5,
32 |     },
33 |     .{
34 |         .name = "mov",
35 |         .po = 0xbf,
36 |         .args = &[_]Arg{
37 |             .{ .register = .edi },
38 |             .immediate,
39 |         },
40 |         .size = 5,
41 |     },
42 |     .{
43 |         .name = "mov",
44 |         .po = 0xbe,
45 |         .args = &[_]Arg{
46 |             .{ .register = .esi },
47 |             .immediate,
48 |         },
49 |         .size = 5,
50 |     },
51 |     .{
52 |         .name = "mov",
53 |         .po = 0xba,
54 |         .args = &[_]Arg{
55 |             .{ .register = .edx },
56 |             .immediate,
57 |         },
58 |         .size = 5,
59 |     },
60 |     .{
61 |         .name = "syscall",
62 |         .po = 0x05,
63 |         .prefix = 0x0f,
64 |         .args = &[_]Arg{},
65 |         .size = 2,
66 |     },
67 |     .{
68 |         .name = "xor",
69 |         .po = 0x31,
70 |         .suffix = 0xff,
71 |         .args = &[_]Arg{
72 |             .{ .register = .edi },
73 |             .{ .register = .edi },
74 |         },
75 |         .size = 2,
76 |     },
77 | };
78 | 


--------------------------------------------------------------------------------
/src/main.zig:
--------------------------------------------------------------------------------
   1 | const std = @import("std");
   2 | const Tokenizer = @import("tokenize.zig").Tokenizer;
   3 | const Token = @import("tokenize.zig").Token;
   4 | const mem = std.mem;
   5 | const fs = std.fs;
   6 | const process = std.process;
   7 | const math = std.math;
   8 | const data = @import("data.zig");
   9 | const parseStringLiteral = std.zig.parseStringLiteral;
  10 | const elf = std.elf;
  11 | const assert = std.debug.assert;
  12 | 
  13 | const Cmd = enum {
  14 |     exe,
  15 |     obj,
  16 |     dis,
  17 |     targets,
  18 |     tokenize,
  19 | };
  20 | 
  21 | const Assembly = struct {
  22 |     allocator: *mem.Allocator,
  23 |     input_files: []const []const u8,
  24 |     asm_files: []AsmFile,
  25 |     target: std.Target,
  26 |     errors: std.ArrayList(Error),
  27 |     entry_addr: ?u64 = null,
  28 |     output_file: ?[]const u8,
  29 |     file_offset: u64,
  30 |     next_map_addr: u64 = 0x10000,
  31 |     sections: SectionTable,
  32 | 
  33 |     const SectionTable = std.StringHashMap(*Section);
  34 | 
  35 |     pub const SourceInfo = struct {
  36 |         token: Token,
  37 |         source: []const u8,
  38 |         file_name: []const u8,
  39 |     };
  40 | 
  41 |     const Error = union(enum) {
  42 |         unexpected_token: struct {
  43 |             source_info: SourceInfo,
  44 |         },
  45 |         unrecognized_directive: struct {
  46 |             source_info: SourceInfo,
  47 |         },
  48 |         unrecognized_instruction: struct {
  49 |             source_info: SourceInfo,
  50 |         },
  51 |         symbol_outside_section: struct {
  52 |             source_info: SourceInfo,
  53 |         },
  54 |         duplicate_symbol: struct {
  55 |             source_info: SourceInfo,
  56 |             other_symbol: Token,
  57 |         },
  58 |         bad_integer_literal: struct {
  59 |             source_info: SourceInfo,
  60 |         },
  61 |         instr_outside_symbol: struct {
  62 |             source_info: SourceInfo,
  63 |         },
  64 |         bad_string_literal: struct {
  65 |             source_info: SourceInfo,
  66 |             bad_index: usize,
  67 |         },
  68 |         bad_section_flag: struct {
  69 |             source_info: SourceInfo,
  70 |             bad_index: usize,
  71 |             bad_byte: u8,
  72 |         },
  73 |         too_many_args: struct {
  74 |             source_info: SourceInfo,
  75 |         },
  76 |         unknown_symbol: struct {
  77 |             source_info: SourceInfo,
  78 |         },
  79 | 
  80 |         fn printToStream(stream: var, comptime message: []const u8, source_info: SourceInfo, args: ...) !void {
  81 |             const loc = tokenLocation(source_info.source, source_info.token);
  82 |             try stream.print(
  83 |                 "{}:{}:{}: " ++ message,
  84 |                 source_info.file_name,
  85 |                 loc.line + 1,
  86 |                 loc.column + 1,
  87 |                 args,
  88 |             );
  89 |         }
  90 | 
  91 |         fn render(self: Error, stream: var) !void {
  92 |             switch (self) {
  93 |                 .unexpected_token => |info| {
  94 |                     try printToStream(
  95 |                         stream,
  96 |                         "error: unexpected token: {}\n",
  97 |                         info.source_info,
  98 |                         @tagName(info.source_info.token.id),
  99 |                     );
 100 |                 },
 101 |                 .unrecognized_directive => |info| {
 102 |                     const si = info.source_info;
 103 |                     try printToStream(
 104 |                         stream,
 105 |                         "error: unrecognized directive: {}\n",
 106 |                         si,
 107 |                         si.source[si.token.start..si.token.end],
 108 |                     );
 109 |                 },
 110 |                 .unrecognized_instruction => |info| {
 111 |                     const si = info.source_info;
 112 |                     try printToStream(
 113 |                         stream,
 114 |                         "error: instruction name or parameters do not match asm database: {}\n",
 115 |                         si,
 116 |                         si.source[si.token.start..si.token.end],
 117 |                     );
 118 |                 },
 119 |                 .symbol_outside_section => |info| {
 120 |                     const si = info.source_info;
 121 |                     try printToStream(
 122 |                         stream,
 123 |                         "error: symbol outside section: {}\n",
 124 |                         si,
 125 |                         si.source[si.token.start..si.token.end],
 126 |                     );
 127 |                 },
 128 |                 .bad_integer_literal => |info| {
 129 |                     const si = info.source_info;
 130 |                     try printToStream(
 131 |                         stream,
 132 |                         "error: invalid integer literal: {}\n",
 133 |                         si,
 134 |                         si.source[si.token.start..si.token.end],
 135 |                     );
 136 |                 },
 137 |                 .bad_string_literal => |info| {
 138 |                     const loc = tokenLocation(info.source_info.source, info.source_info.token);
 139 |                     try stream.print(
 140 |                         "{}:{}:{}: error: invalid byte in string literal\n",
 141 |                         info.source_info.file_name,
 142 |                         loc.line + 1,
 143 |                         loc.column + 1 + info.bad_index,
 144 |                     );
 145 |                 },
 146 |                 .bad_section_flag => |info| {
 147 |                     const loc = tokenLocation(info.source_info.source, info.source_info.token);
 148 |                     try stream.print(
 149 |                         "{}:{}:{}: error: invalid section flag: '{c}'\n",
 150 |                         info.source_info.file_name,
 151 |                         loc.line + 1,
 152 |                         loc.column + 1 + info.bad_index,
 153 |                         info.bad_byte,
 154 |                     );
 155 |                 },
 156 |                 .instr_outside_symbol => |info| {
 157 |                     const si = info.source_info;
 158 |                     try printToStream(
 159 |                         stream,
 160 |                         "error: instruction outside symbol\n",
 161 |                         si,
 162 |                     );
 163 |                 },
 164 |                 .duplicate_symbol => |info| {
 165 |                     const si = info.source_info;
 166 |                     const other_loc = tokenLocation(si.source, info.other_symbol);
 167 |                     try printToStream(
 168 |                         stream,
 169 |                         "error: duplicate symbol: {}\n" ++
 170 |                             "{}:{}:{}: note: original definition. \n",
 171 |                         si,
 172 |                         si.source[si.token.start..si.token.end],
 173 | 
 174 |                         si.file_name,
 175 |                         other_loc.line + 1,
 176 |                         other_loc.column + 1,
 177 |                     );
 178 |                 },
 179 |                 .too_many_args => |info| {
 180 |                     const si = info.source_info;
 181 |                     try printToStream(
 182 |                         stream,
 183 |                         "error: too many args\n",
 184 |                         si,
 185 |                     );
 186 |                 },
 187 |                 .unknown_symbol => |info| {
 188 |                     const si = info.source_info;
 189 |                     try printToStream(
 190 |                         stream,
 191 |                         "error: unknown symbol\n",
 192 |                         si,
 193 |                     );
 194 |                 },
 195 |             }
 196 |         }
 197 |     };
 198 | };
 199 | 
 200 | const Section = struct {
 201 |     name: []const u8,
 202 |     layout: std.ArrayList(*Symbol),
 203 |     alignment: u32,
 204 |     file_offset: u64,
 205 |     file_size: u64,
 206 |     virt_addr: u64,
 207 |     mem_size: u64,
 208 |     flags: u32,
 209 | };
 210 | 
 211 | const Symbol = struct {
 212 |     /// `undefined` until a second pass when addresses are calculated.
 213 |     addr: u64,
 214 | 
 215 |     /// Starts at 0. Increments with instructions being added.
 216 |     size: u64,
 217 | 
 218 |     source_token: Token,
 219 |     name: []const u8,
 220 |     section: *Section,
 221 | 
 222 |     ops: std.ArrayList(PseudoOp),
 223 | 
 224 |     source_file: *AsmFile,
 225 | };
 226 | 
 227 | const Instruction = struct {
 228 |     props: *const data.Instruction,
 229 |     args: []Arg,
 230 | };
 231 | 
 232 | const PseudoOp = union(enum) {
 233 |     instruction: Instruction,
 234 |     data: []const u8,
 235 | };
 236 | 
 237 | const Location = struct {
 238 |     line: usize,
 239 |     column: usize,
 240 |     line_start: usize,
 241 |     line_end: usize,
 242 | };
 243 | 
 244 | fn tokenLocation(source: []const u8, token: Token) Location {
 245 |     const start_index = 0;
 246 |     var loc = Location{
 247 |         .line = 0,
 248 |         .column = 0,
 249 |         .line_start = start_index,
 250 |         .line_end = source.len,
 251 |     };
 252 |     const token_start = token.start;
 253 |     for (source[start_index..]) |c, i| {
 254 |         if (i + start_index == token_start) {
 255 |             loc.line_end = i + start_index;
 256 |             while (loc.line_end < source.len and source[loc.line_end] != '\n') : (loc.line_end += 1) {}
 257 |             return loc;
 258 |         }
 259 |         if (c == '\n') {
 260 |             loc.line += 1;
 261 |             loc.column = 0;
 262 |             loc.line_start = i + 1;
 263 |         } else {
 264 |             loc.column += 1;
 265 |         }
 266 |     }
 267 |     return loc;
 268 | }
 269 | 
 270 | pub fn main() anyerror!void {
 271 |     var arena = std.heap.ArenaAllocator.init(std.heap.direct_allocator);
 272 |     defer arena.deinit();
 273 |     const allocator = &arena.allocator;
 274 | 
 275 |     var assembly = Assembly{
 276 |         .allocator = allocator,
 277 |         .target = .Native,
 278 |         .input_files = undefined,
 279 |         .errors = std.ArrayList(Assembly.Error).init(allocator),
 280 |         .output_file = null,
 281 |         .sections = Assembly.SectionTable.init(allocator),
 282 |         .asm_files = undefined,
 283 |         .file_offset = 0,
 284 |     };
 285 |     var input_files = std.ArrayList([]const u8).init(allocator);
 286 |     var maybe_cmd: ?Cmd = null;
 287 |     var debug_errors = false;
 288 | 
 289 |     const args = try process.argsAlloc(allocator);
 290 |     var arg_i: usize = 1;
 291 |     while (arg_i < args.len) : (arg_i += 1) {
 292 |         const full_arg = args[arg_i];
 293 |         if (mem.startsWith(u8, full_arg, "-")) {
 294 |             const arg = full_arg[1..];
 295 |             if (mem.eql(u8, arg, "help")) {
 296 |                 try dumpUsage(std.io.getStdOut());
 297 |                 return;
 298 |             } else if (mem.eql(u8, arg, "debug-errors")) {
 299 |                 debug_errors = true;
 300 |             } else {
 301 |                 arg_i += 1;
 302 |                 if (arg_i >= args.len) {
 303 |                     std.debug.warn("Expected another parameter after '{}'\n", full_arg);
 304 |                     dumpStdErrUsageAndExit();
 305 |                 } else if (mem.eql(u8, arg, "target")) {
 306 |                     assembly.target = try std.Target.parse(args[arg_i]);
 307 |                 } else if (mem.eql(u8, arg, "o")) {
 308 |                     assembly.output_file = args[arg_i];
 309 |                 } else {
 310 |                     std.debug.warn("Invalid parameter: {}\n", full_arg);
 311 |                     dumpStdErrUsageAndExit();
 312 |                 }
 313 |             }
 314 |         } else if (maybe_cmd == null) {
 315 |             inline for (std.meta.fields(Cmd)) |field| {
 316 |                 if (mem.eql(u8, full_arg, field.name)) {
 317 |                     maybe_cmd = @field(Cmd, field.name);
 318 |                     break;
 319 |                 }
 320 |             } else {
 321 |                 std.debug.warn("Invalid command: {}\n", full_arg);
 322 |                 dumpStdErrUsageAndExit();
 323 |             }
 324 |         } else {
 325 |             try input_files.append(full_arg);
 326 |         }
 327 |     }
 328 | 
 329 |     const cmd = maybe_cmd orelse {
 330 |         std.debug.warn("Expected a command parameter\n");
 331 |         dumpStdErrUsageAndExit();
 332 |     };
 333 | 
 334 |     switch (cmd) {
 335 |         .targets => {
 336 |             try std.io.getStdOut().write(
 337 |                 \\x86_64-linux
 338 |                 \\
 339 |             );
 340 |             return;
 341 |         },
 342 |         .exe => {
 343 |             assembly.input_files = input_files.toSliceConst();
 344 |             assembleExecutable(&assembly) catch |err| switch (err) {
 345 |                 error.ParseFailure => {
 346 |                     const stream = &std.io.getStdErr().outStream().stream;
 347 |                     for (assembly.errors.toSliceConst()) |asm_err| {
 348 |                         try asm_err.render(stream);
 349 |                     }
 350 |                     if (debug_errors) {
 351 |                         return err;
 352 |                     } else {
 353 |                         process.exit(1);
 354 |                     }
 355 |                 },
 356 |                 else => |e| return e,
 357 |             };
 358 |         },
 359 |         .obj => {
 360 |             std.debug.warn("object files not yet implemented\n");
 361 |             process.exit(1);
 362 |         },
 363 |         .dis => {
 364 |             std.debug.warn("disassembly not yet implemented\n");
 365 |             process.exit(1);
 366 |         },
 367 |         .tokenize => {
 368 |             const stdout = &std.io.getStdOut().outStream().stream;
 369 |             const cwd = fs.cwd();
 370 |             for (input_files.toSliceConst()) |input_file| {
 371 |                 const source = try cwd.readFileAlloc(allocator, input_file, math.maxInt(usize));
 372 |                 var tokenizer = Tokenizer.init(source);
 373 |                 while (true) {
 374 |                     const token = tokenizer.next();
 375 |                     if (token.id == .eof) break;
 376 |                     try stdout.print("{}: {}\n", @tagName(token.id), source[token.start..token.end]);
 377 |                 }
 378 |             }
 379 |         },
 380 |     }
 381 | }
 382 | 
 383 | const Arg = union(enum) {
 384 |     register: data.Register,
 385 |     immediate: u64,
 386 |     symbol_ref: Token,
 387 | };
 388 | 
 389 | const AsmFile = struct {
 390 |     source: []const u8,
 391 |     file_name: []const u8,
 392 |     tokenizer: Tokenizer,
 393 |     assembly: *Assembly,
 394 |     current_section: ?*Section = null,
 395 |     current_symbol: ?*Symbol = null,
 396 |     globals: GlobalSet,
 397 |     put_back_buffer: [1]Token,
 398 |     put_back_count: u1,
 399 |     symbols: SymbolTable,
 400 | 
 401 |     const SymbolTable = std.StringHashMap(*Symbol);
 402 |     const GlobalSet = std.StringHashMap(void);
 403 | 
 404 |     fn tokenSlice(asm_file: AsmFile, token: Token) []const u8 {
 405 |         return asm_file.source[token.start..token.end];
 406 |     }
 407 | 
 408 |     fn findOrCreateSection(self: *AsmFile, name: []const u8, flags: u32) !*Section {
 409 |         const gop = try self.assembly.sections.getOrPut(name);
 410 |         if (gop.found_existing) {
 411 |             // TODO deal with flags conflicts
 412 |             return gop.kv.value;
 413 |         }
 414 |         const section = try self.assembly.sections.allocator.create(Section);
 415 |         section.* = Section{
 416 |             .name = name,
 417 |             .layout = std.ArrayList(*Symbol).init(self.assembly.sections.allocator),
 418 |             .alignment = 0x1000,
 419 |             .file_offset = undefined,
 420 |             .file_size = 0,
 421 |             .virt_addr = undefined,
 422 |             .mem_size = 0,
 423 |             .flags = flags,
 424 |         };
 425 |         gop.kv.value = section;
 426 |         return section;
 427 |     }
 428 | 
 429 |     fn setCurrentSection(self: *AsmFile, name: []const u8, flags: u32) !void {
 430 |         const section = try self.findOrCreateSection(name, flags);
 431 |         self.current_section = section;
 432 |     }
 433 | 
 434 |     fn setCurrentSectionFlagsTok(self: *AsmFile, name: []const u8, flags_tok: Token) !void {
 435 |         var flags: u32 = 0;
 436 |         const flags_str = blk: {
 437 |             const tok_slice = self.tokenSlice(flags_tok);
 438 |             // skip over the double quotes
 439 |             break :blk tok_slice[1 .. tok_slice.len - 1];
 440 |         };
 441 |         for (flags_str) |b, offset| switch (b) {
 442 |             'a', 'r' => flags |= elf.PF_R,
 443 |             'w' => flags |= elf.PF_W,
 444 |             'x' => flags |= elf.PF_X,
 445 |             else => {
 446 |                 try self.assembly.errors.append(.{
 447 |                     .bad_section_flag = .{
 448 |                         .source_info = newSourceInfo(self, flags_tok),
 449 |                         .bad_index = offset,
 450 |                         .bad_byte = b,
 451 |                     },
 452 |                 });
 453 |                 return error.ParseFailure;
 454 |             },
 455 |         };
 456 | 
 457 |         return self.setCurrentSection(name, flags);
 458 |     }
 459 | 
 460 |     fn beginSymbol(self: *AsmFile, source_token: Token, name: []const u8) !void {
 461 |         const current_section = self.current_section orelse {
 462 |             try self.assembly.errors.append(.{
 463 |                 .symbol_outside_section = .{ .source_info = newSourceInfo(self, source_token) },
 464 |             });
 465 |             return error.ParseFailure;
 466 |         };
 467 |         const symbol = try self.symbols.allocator.create(Symbol);
 468 |         symbol.* = Symbol{
 469 |             .addr = undefined,
 470 |             .size = 0,
 471 |             .source_token = source_token,
 472 |             .name = name,
 473 |             .section = current_section,
 474 |             .ops = std.ArrayList(PseudoOp).init(self.assembly.allocator),
 475 |             .source_file = self,
 476 |         };
 477 |         if (try self.symbols.put(name, symbol)) |existing_entry| {
 478 |             try self.assembly.errors.append(.{
 479 |                 .duplicate_symbol = .{
 480 |                     .source_info = newSourceInfo(self, source_token),
 481 |                     .other_symbol = existing_entry.value.source_token,
 482 |                 },
 483 |             });
 484 |             return error.ParseFailure;
 485 |         }
 486 |         try current_section.layout.append(symbol);
 487 |         self.current_symbol = symbol;
 488 |     }
 489 | 
 490 |     fn addGlobal(self: *AsmFile, name: []const u8) !void {
 491 |         _ = try self.globals.put(name, {});
 492 |     }
 493 | 
 494 |     fn nextToken(self: *AsmFile) Token {
 495 |         if (self.put_back_count == 0) {
 496 |             return self.tokenizer.next();
 497 |         } else {
 498 |             self.put_back_count -= 1;
 499 |             return self.put_back_buffer[self.put_back_count];
 500 |         }
 501 |     }
 502 | 
 503 |     fn eatToken(self: *AsmFile, id: Token.Id) ?Token {
 504 |         const token = self.nextToken();
 505 |         if (token.id == id) return token;
 506 |         self.putBackToken(token);
 507 |         return null;
 508 |     }
 509 | 
 510 |     fn putBackToken(self: *AsmFile, token: Token) void {
 511 |         self.put_back_buffer[self.put_back_count] = token;
 512 |         self.put_back_count += 1;
 513 |     }
 514 | 
 515 |     fn expectToken(asm_file: *AsmFile, id: Token.Id) !Token {
 516 |         const token = asm_file.nextToken();
 517 |         if (token.id != id) {
 518 |             try asm_file.assembly.errors.append(.{
 519 |                 .unexpected_token = .{ .source_info = newSourceInfo(asm_file, token) },
 520 |             });
 521 |             return error.ParseFailure;
 522 |         }
 523 |         return token;
 524 |     }
 525 | 
 526 |     fn getCurrentSymbol(asm_file: *AsmFile, source_token: Token) !*Symbol {
 527 |         return asm_file.current_symbol orelse {
 528 |             try asm_file.assembly.errors.append(.{
 529 |                 .instr_outside_symbol = .{ .source_info = newSourceInfo(asm_file, source_token) },
 530 |             });
 531 |             return error.ParseFailure;
 532 |         };
 533 |     }
 534 | };
 535 | 
 536 | fn newSourceInfo(asm_file: *AsmFile, tok: Token) Assembly.SourceInfo {
 537 |     return .{
 538 |         .token = tok,
 539 |         .source = asm_file.source,
 540 |         .file_name = asm_file.file_name,
 541 |     };
 542 | }
 543 | 
 544 | fn assembleExecutable(assembly: *Assembly) !void {
 545 |     const cwd = fs.cwd();
 546 | 
 547 |     assembly.asm_files = try assembly.allocator.alloc(AsmFile, assembly.input_files.len);
 548 | 
 549 |     for (assembly.input_files) |input_file, input_file_index| {
 550 |         const asm_file = &assembly.asm_files[input_file_index];
 551 |         asm_file.* = .{
 552 |             .assembly = assembly,
 553 |             .file_name = input_file,
 554 |             .globals = AsmFile.GlobalSet.init(assembly.allocator),
 555 |             .symbols = AsmFile.SymbolTable.init(assembly.allocator),
 556 |             .source = try cwd.readFileAlloc(assembly.allocator, input_file, math.maxInt(usize)),
 557 |             .tokenizer = undefined,
 558 |             .put_back_buffer = undefined,
 559 |             .put_back_count = 0,
 560 |         };
 561 |         asm_file.tokenizer = Tokenizer.init(asm_file.source);
 562 |         while (true) {
 563 |             const token = asm_file.nextToken();
 564 |             switch (token.id) {
 565 |                 .line_break => continue,
 566 |                 .eof => break,
 567 |                 .period => {
 568 |                     const dir_ident = try asm_file.expectToken(.identifier);
 569 |                     const dir_name = asm_file.tokenSlice(dir_ident);
 570 |                     if (mem.eql(u8, dir_name, "text")) {
 571 |                         try asm_file.setCurrentSection("text", elf.PF_R | elf.PF_X);
 572 |                         _ = try asm_file.expectToken(.line_break);
 573 |                     } else if (mem.eql(u8, dir_name, "globl")) {
 574 |                         while (true) {
 575 |                             const ident = try asm_file.expectToken(.identifier);
 576 |                             try asm_file.addGlobal(asm_file.tokenSlice(ident));
 577 |                             if (asm_file.eatToken(.comma)) |_| continue else break;
 578 |                         }
 579 |                         _ = try asm_file.expectToken(.line_break);
 580 |                     } else if (mem.eql(u8, dir_name, "section")) {
 581 |                         _ = try asm_file.expectToken(.period);
 582 |                         const sect_name_token = try asm_file.expectToken(.identifier);
 583 |                         const sect_name = asm_file.tokenSlice(sect_name_token);
 584 |                         _ = try asm_file.expectToken(.comma);
 585 |                         const flags_tok = try asm_file.expectToken(.string_literal);
 586 |                         try asm_file.setCurrentSectionFlagsTok(sect_name, flags_tok);
 587 |                         _ = try asm_file.expectToken(.line_break);
 588 |                     } else if (mem.eql(u8, dir_name, "ascii")) {
 589 |                         const current_symbol = try asm_file.getCurrentSymbol(dir_ident);
 590 | 
 591 |                         const str_lit_tok = try asm_file.expectToken(.string_literal);
 592 |                         const str_lit = asm_file.tokenSlice(str_lit_tok);
 593 |                         var bad_index: usize = undefined;
 594 |                         const bytes = parseStringLiteral(
 595 |                             assembly.allocator,
 596 |                             str_lit,
 597 |                             &bad_index,
 598 |                         ) catch |err| switch (err) {
 599 |                             error.InvalidCharacter => {
 600 |                                 try assembly.errors.append(.{
 601 |                                     .bad_string_literal = .{
 602 |                                         .source_info = newSourceInfo(asm_file, str_lit_tok),
 603 |                                         .bad_index = bad_index,
 604 |                                     },
 605 |                                 });
 606 |                                 return error.ParseFailure;
 607 |                             },
 608 |                             error.OutOfMemory => |e| return e,
 609 |                         };
 610 | 
 611 |                         try current_symbol.ops.append(.{ .data = bytes });
 612 |                         current_symbol.size += bytes.len;
 613 |                         _ = try asm_file.expectToken(.line_break);
 614 |                     } else {
 615 |                         try assembly.errors.append(.{
 616 |                             .unrecognized_directive = .{ .source_info = newSourceInfo(asm_file, dir_ident) },
 617 |                         });
 618 |                         return error.ParseFailure;
 619 |                     }
 620 |                 },
 621 |                 .identifier => {
 622 |                     if (asm_file.eatToken(.colon)) |_| {
 623 |                         const symbol_name = asm_file.tokenSlice(token);
 624 |                         try asm_file.beginSymbol(token, symbol_name);
 625 |                     } else {
 626 |                         var arg_toks: [2]Token = undefined;
 627 |                         var arg_count: usize = 0;
 628 |                         var last_comma_tok: Token = undefined;
 629 |                         while (arg_count < arg_toks.len) {
 630 |                             const tok = asm_file.nextToken();
 631 |                             if (tok.id == .line_break) break;
 632 | 
 633 |                             arg_toks[arg_count] = tok;
 634 |                             arg_count += 1;
 635 | 
 636 |                             if (asm_file.eatToken(.comma)) |comma_tok| {
 637 |                                 last_comma_tok = comma_tok;
 638 |                                 continue;
 639 |                             } else {
 640 |                                 break;
 641 |                             }
 642 |                         } else {
 643 |                             try assembly.errors.append(.{
 644 |                                 .too_many_args = .{ .source_info = newSourceInfo(asm_file, last_comma_tok) },
 645 |                             });
 646 |                             return error.ParseFailure;
 647 |                         }
 648 | 
 649 |                         const wanted_instr_name = asm_file.tokenSlice(token);
 650 |                         const inst = outer: for (data.instructions) |*inst| {
 651 |                             if (!mem.eql(u8, inst.name, wanted_instr_name)) continue;
 652 |                             if (inst.args.len != arg_count) continue;
 653 |                             for (inst.args) |inst_arg, i| switch (inst_arg) {
 654 |                                 .register => |reg| {
 655 |                                     if (arg_toks[i].id != .identifier) continue :outer;
 656 |                                     const src_reg_name = asm_file.tokenSlice(arg_toks[i]);
 657 |                                     // TODO validate register name
 658 |                                     if (!mem.eql(u8, src_reg_name, @tagName(reg))) continue :outer;
 659 |                                 },
 660 |                                 .immediate => switch (arg_toks[i].id) {
 661 |                                     .integer_literal, .char_literal, .identifier => {},
 662 |                                     else => continue :outer,
 663 |                                 },
 664 |                             };
 665 |                             break :outer inst;
 666 |                         } else {
 667 |                             try assembly.errors.append(.{
 668 |                                 .unrecognized_instruction = .{ .source_info = newSourceInfo(asm_file, token) },
 669 |                             });
 670 |                             return error.ParseFailure;
 671 |                         };
 672 |                         const current_symbol = try asm_file.getCurrentSymbol(token);
 673 |                         var args = std.ArrayList(Arg).init(assembly.allocator);
 674 |                         for (arg_toks[0..arg_count]) |arg_token| {
 675 |                             const arg_text = asm_file.tokenSlice(arg_token);
 676 |                             var arg: Arg = undefined;
 677 |                             switch (arg_token.id) {
 678 |                                 .integer_literal => {
 679 |                                     var text: []const u8 = undefined;
 680 |                                     var base: u8 = undefined;
 681 |                                     if (mem.startsWith(u8, arg_text, "0x")) {
 682 |                                         base = 16;
 683 |                                         text = arg_text[2..];
 684 |                                     } else if (mem.startsWith(u8, arg_text, "0b")) {
 685 |                                         base = 2;
 686 |                                         text = arg_text[2..];
 687 |                                     } else if (mem.startsWith(u8, arg_text, "0o")) {
 688 |                                         base = 8;
 689 |                                         text = arg_text[2..];
 690 |                                     } else {
 691 |                                         base = 10;
 692 |                                         text = arg_text;
 693 |                                     }
 694 |                                     const imm = std.fmt.parseUnsigned(u64, text, base) catch |err| {
 695 |                                         try asm_file.assembly.errors.append(.{
 696 |                                             .bad_integer_literal = .{ .source_info = newSourceInfo(asm_file, arg_token) },
 697 |                                         });
 698 |                                         return error.ParseFailure;
 699 |                                     };
 700 |                                     arg = Arg{ .immediate = imm };
 701 |                                 },
 702 |                                 .identifier => {
 703 |                                     inline for (std.meta.fields(data.Register)) |field| {
 704 |                                         if (mem.eql(u8, arg_text, field.name)) {
 705 |                                             const reg = @field(data.Register, field.name);
 706 |                                             arg = Arg{ .register = reg };
 707 |                                             break;
 708 |                                         }
 709 |                                     } else {
 710 |                                         arg = Arg{ .symbol_ref = arg_token };
 711 |                                     }
 712 |                                 },
 713 |                                 else => {
 714 |                                     try asm_file.assembly.errors.append(.{
 715 |                                         .unexpected_token = .{ .source_info = newSourceInfo(asm_file, arg_token) },
 716 |                                     });
 717 |                                     return error.ParseFailure;
 718 |                                 },
 719 |                             }
 720 |                             try args.append(arg);
 721 |                         }
 722 | 
 723 |                         try current_symbol.ops.append(.{
 724 |                             .instruction = .{
 725 |                                 .props = inst,
 726 |                                 .args = args.toSliceConst(),
 727 |                             },
 728 |                         });
 729 |                         current_symbol.size += inst.size;
 730 |                     }
 731 |                 },
 732 |                 else => {
 733 |                     try assembly.errors.append(.{
 734 |                         .unexpected_token = .{ .source_info = newSourceInfo(asm_file, token) },
 735 |                     });
 736 |                     return error.ParseFailure;
 737 |                 },
 738 |             }
 739 |         }
 740 |     }
 741 | 
 742 |     const output_file = assembly.output_file orelse blk: {
 743 |         const basename = fs.path.basename(assembly.input_files[0]);
 744 |         const dot_index = mem.lastIndexOfScalar(u8, basename, '.') orelse basename.len;
 745 |         break :blk basename[0..dot_index];
 746 |     };
 747 |     const file = try std.fs.cwd().createFile(output_file, .{
 748 |         .mode = 0o755,
 749 |         .truncate = false,
 750 |     });
 751 |     defer file.close();
 752 | 
 753 |     const ptr_width: PtrWidth = switch (assembly.target.getArchPtrBitWidth()) {
 754 |         32 => ._32,
 755 |         64 => ._64,
 756 |         else => return error.UnsupportedArchitecture,
 757 |     };
 758 |     const ehdr_size: u64 = switch (ptr_width) {
 759 |         ._32 => @sizeOf(elf.Elf32_Ehdr),
 760 |         ._64 => @sizeOf(elf.Elf64_Ehdr),
 761 |     };
 762 |     const phdr_size: u64 = switch (ptr_width) {
 763 |         ._32 => @sizeOf(elf.Elf32_Phdr),
 764 |         ._64 => @sizeOf(elf.Elf64_Phdr),
 765 |     };
 766 |     const section_names = [_][]const u8{ "rodata", "text" };
 767 |     assembly.file_offset = ehdr_size + phdr_size * section_names.len;
 768 | 
 769 |     for (section_names) |section_name| {
 770 |         const section = (assembly.sections.get(section_name) orelse break).value;
 771 | 
 772 |         assembly.file_offset = mem.alignForward(assembly.file_offset, section.alignment);
 773 |         assembly.next_map_addr = mem.alignForward(assembly.next_map_addr, section.alignment);
 774 | 
 775 |         section.file_offset = assembly.file_offset;
 776 |         section.virt_addr = assembly.next_map_addr;
 777 | 
 778 |         try file.seekTo(assembly.file_offset);
 779 |         const prev_file_offset = assembly.file_offset;
 780 |         const prev_map_addr = assembly.next_map_addr;
 781 |         try writeSection(assembly, section, file, ptr_width);
 782 |         section.file_size = assembly.file_offset - prev_file_offset;
 783 |         section.mem_size = assembly.next_map_addr - prev_map_addr;
 784 |     }
 785 | 
 786 |     try file.seekTo(0);
 787 |     try writeElfHeader(assembly, file, ptr_width, &section_names);
 788 | }
 789 | 
 790 | fn writeSection(assembly: *Assembly, section: *Section, file: fs.File, ptr_width: PtrWidth) !void {
 791 |     const endian = assembly.target.getArch().endian();
 792 | 
 793 |     for (section.layout.toSliceConst()) |symbol| {
 794 |         symbol.addr = assembly.next_map_addr;
 795 | 
 796 |         for (symbol.ops.toSliceConst()) |pseudo_op| {
 797 |             switch (pseudo_op) {
 798 |                 .instruction => |inst| {
 799 |                     var buf: [8]u8 = undefined;
 800 |                     var index: usize = 0;
 801 |                     if (inst.props.prefix) |prefix| {
 802 |                         buf[index] = prefix;
 803 |                         index += 1;
 804 |                     }
 805 |                     buf[index] = inst.props.po;
 806 |                     index += 1;
 807 | 
 808 |                     for (inst.args) |arg| switch (arg) {
 809 |                         .register => {},
 810 |                         .immediate => |x| {
 811 |                             mem.writeInt(u32, @ptrCast(*[4]u8, &buf[index]), @intCast(u32, x), endian);
 812 |                             index += 4;
 813 |                         },
 814 |                         .symbol_ref => |other_sym_tok| {
 815 |                             const other_sym_name = symbol.source_file.tokenSlice(other_sym_tok);
 816 |                             const other_symbol = symbol.source_file.symbols.getValue(other_sym_name) orelse {
 817 |                                 try assembly.errors.append(.{
 818 |                                     .unknown_symbol = .{
 819 |                                         .source_info = newSourceInfo(
 820 |                                             symbol.source_file,
 821 |                                             other_sym_tok,
 822 |                                         ),
 823 |                                     },
 824 |                                 });
 825 |                                 return error.ParseFailure;
 826 |                             };
 827 |                             mem.writeInt(u32, @ptrCast(*[4]u8, &buf[index]), @intCast(u32, other_symbol.addr), endian);
 828 |                             index += 4;
 829 |                         },
 830 |                     };
 831 | 
 832 |                     if (inst.props.suffix) |suffix| {
 833 |                         buf[index] = suffix;
 834 |                         index += 1;
 835 |                     }
 836 | 
 837 |                     try file.write(buf[0..index]);
 838 |                     assembly.next_map_addr += index;
 839 |                     assembly.file_offset += index;
 840 |                 },
 841 |                 .data => |slice| {
 842 |                     try file.write(slice);
 843 |                     assembly.next_map_addr += slice.len;
 844 |                     assembly.file_offset += slice.len;
 845 |                 },
 846 |             }
 847 |         }
 848 |         if (mem.eql(u8, symbol.name, "_start")) {
 849 |             if (assembly.entry_addr) |prev_addr| {
 850 |                 @panic("TODO emit error for _start already defined");
 851 |             } else {
 852 |                 assembly.entry_addr = symbol.addr;
 853 |             }
 854 |         }
 855 |     }
 856 | }
 857 | 
 858 | const PtrWidth = enum {
 859 |     _32,
 860 |     _64,
 861 | };
 862 | 
 863 | fn writeElfHeader(
 864 |     assembly: *Assembly,
 865 |     /// Expected to be already seeked to position 0 in the file.
 866 |     file: fs.File,
 867 |     ptr_width: PtrWidth,
 868 |     section_names: []const []const u8,
 869 | ) !void {
 870 |     const endian = assembly.target.getArch().endian();
 871 |     var hdr_buf: [math.max(@sizeOf(elf.Elf64_Ehdr), @sizeOf(elf.Elf64_Phdr))]u8 = undefined;
 872 |     var index: usize = 0;
 873 | 
 874 |     mem.copy(u8, hdr_buf[index..], "\x7fELF");
 875 |     index += 4;
 876 | 
 877 |     hdr_buf[index] = switch (ptr_width) {
 878 |         ._32 => 1,
 879 |         ._64 => 2,
 880 |     };
 881 |     index += 1;
 882 | 
 883 |     hdr_buf[index] = switch (endian) {
 884 |         .Little => 1,
 885 |         .Big => 2,
 886 |     };
 887 |     index += 1;
 888 | 
 889 |     hdr_buf[index] = 1; // ELF version
 890 |     index += 1;
 891 | 
 892 |     // OS ABI, often set to 0 regardless of target platform
 893 |     // ABI Version, possibly used by glibc but not by static executables
 894 |     // padding
 895 |     mem.set(u8, hdr_buf[index..][0..9], 0);
 896 |     index += 9;
 897 | 
 898 |     assert(index == 16);
 899 | 
 900 |     // TODO: https://github.com/ziglang/zig/issues/863 makes this (and all following) @ptrCast unnecessary
 901 |     mem.writeInt(u16, @ptrCast(*[2]u8, &hdr_buf[index]), @enumToInt(elf.ET.EXEC), endian);
 902 |     index += 2;
 903 | 
 904 |     const machine = assembly.target.getArch().toElfMachine();
 905 |     mem.writeInt(u16, @ptrCast(*[2]u8, &hdr_buf[index]), @enumToInt(machine), endian);
 906 |     index += 2;
 907 | 
 908 |     // ELF Version, again
 909 |     mem.writeInt(u32, @ptrCast(*[4]u8, &hdr_buf[index]), 1, endian);
 910 |     index += 4;
 911 | 
 912 |     switch (ptr_width) {
 913 |         ._32 => {
 914 |             // e_entry
 915 |             mem.writeInt(u32, @ptrCast(*[4]u8, &hdr_buf[index]), @intCast(u32, assembly.entry_addr.?), endian);
 916 |             index += 4;
 917 | 
 918 |             // e_phoff
 919 |             mem.writeInt(u32, @ptrCast(*[4]u8, &hdr_buf[index]), @sizeOf(elf.Elf32_Ehdr), endian);
 920 |             index += 4;
 921 | 
 922 |             // e_shoff
 923 |             mem.writeInt(u32, @ptrCast(*[4]u8, &hdr_buf[index]), 0, endian);
 924 |             index += 4;
 925 |         },
 926 |         ._64 => {
 927 |             // e_entry
 928 |             mem.writeInt(u64, @ptrCast(*[8]u8, &hdr_buf[index]), assembly.entry_addr.?, endian);
 929 |             index += 8;
 930 | 
 931 |             // e_phoff
 932 |             mem.writeInt(u64, @ptrCast(*[8]u8, &hdr_buf[index]), @sizeOf(elf.Elf64_Ehdr), endian);
 933 |             index += 8;
 934 | 
 935 |             // e_shoff
 936 |             mem.writeInt(u64, @ptrCast(*[8]u8, &hdr_buf[index]), 0, endian);
 937 |             index += 8;
 938 |         },
 939 |     }
 940 | 
 941 |     const e_flags = 0;
 942 |     mem.writeInt(u32, @ptrCast(*[4]u8, &hdr_buf[index]), e_flags, endian);
 943 |     index += 4;
 944 | 
 945 |     const e_ehsize: u16 = switch (ptr_width) {
 946 |         ._32 => @sizeOf(elf.Elf32_Ehdr),
 947 |         ._64 => @sizeOf(elf.Elf64_Ehdr),
 948 |     };
 949 |     mem.writeInt(u16, @ptrCast(*[2]u8, &hdr_buf[index]), e_ehsize, endian);
 950 |     index += 2;
 951 | 
 952 |     const e_phentsize: u16 = switch (ptr_width) {
 953 |         ._32 => @sizeOf(elf.Elf32_Phdr),
 954 |         ._64 => @sizeOf(elf.Elf64_Phdr),
 955 |     };
 956 |     mem.writeInt(u16, @ptrCast(*[2]u8, &hdr_buf[index]), e_phentsize, endian);
 957 |     index += 2;
 958 | 
 959 |     const e_phnum = @intCast(u16, section_names.len);
 960 |     mem.writeInt(u16, @ptrCast(*[2]u8, &hdr_buf[index]), e_phnum, endian);
 961 |     index += 2;
 962 | 
 963 |     const e_shentsize: u16 = switch (ptr_width) {
 964 |         ._32 => @sizeOf(elf.Elf32_Shdr),
 965 |         ._64 => @sizeOf(elf.Elf64_Shdr),
 966 |     };
 967 |     mem.writeInt(u16, @ptrCast(*[2]u8, &hdr_buf[index]), e_shentsize, endian);
 968 |     index += 2;
 969 | 
 970 |     const e_shnum = 0;
 971 |     mem.writeInt(u16, @ptrCast(*[2]u8, &hdr_buf[index]), e_shnum, endian);
 972 |     index += 2;
 973 | 
 974 |     const e_shstrndx = 0;
 975 |     mem.writeInt(u16, @ptrCast(*[2]u8, &hdr_buf[index]), e_shstrndx, endian);
 976 |     index += 2;
 977 | 
 978 |     assert(index == e_ehsize);
 979 | 
 980 |     try file.write(hdr_buf[0..index]);
 981 | 
 982 |     // Program headers
 983 |     for (section_names) |section_name| {
 984 |         const section = (assembly.sections.get(section_name) orelse break).value;
 985 | 
 986 |         index = 0;
 987 | 
 988 |         const p_type = elf.PT_LOAD;
 989 |         mem.writeInt(u32, @ptrCast(*[4]u8, &hdr_buf[index]), p_type, endian);
 990 |         index += 4;
 991 | 
 992 |         switch (ptr_width) {
 993 |             ._32 => @panic("TODO"),
 994 |             ._64 => {
 995 |                 mem.writeInt(u32, @ptrCast(*[4]u8, &hdr_buf[index]), section.flags, endian);
 996 |                 index += 4;
 997 | 
 998 |                 mem.writeInt(u64, @ptrCast(*[8]u8, &hdr_buf[index]), section.file_offset, endian);
 999 |                 index += 8;
1000 | 
1001 |                 mem.writeInt(u64, @ptrCast(*[8]u8, &hdr_buf[index]), section.virt_addr, endian);
1002 |                 index += 8;
1003 | 
1004 |                 mem.writeInt(u64, @ptrCast(*[8]u8, &hdr_buf[index]), section.virt_addr, endian);
1005 |                 index += 8;
1006 | 
1007 |                 mem.writeInt(u64, @ptrCast(*[8]u8, &hdr_buf[index]), section.file_size, endian);
1008 |                 index += 8;
1009 | 
1010 |                 mem.writeInt(u64, @ptrCast(*[8]u8, &hdr_buf[index]), section.mem_size, endian);
1011 |                 index += 8;
1012 | 
1013 |                 mem.writeInt(u64, @ptrCast(*[8]u8, &hdr_buf[index]), section.alignment, endian);
1014 |                 index += 8;
1015 |             },
1016 |             else => unreachable,
1017 |         }
1018 | 
1019 |         assert(index == e_phentsize);
1020 |         try file.write(hdr_buf[0..index]);
1021 |     }
1022 | }
1023 | 
1024 | fn dumpStdErrUsageAndExit() noreturn {
1025 |     dumpUsage(std.io.getStdErr()) catch {};
1026 |     process.exit(1);
1027 | }
1028 | 
1029 | fn dumpUsage(file: fs.File) !void {
1030 |     try file.write(
1031 |         \\Usage: zasm [command] [options]
1032 |         \\
1033 |         \\Commands:
1034 |         \\  exe [files]          create an executable file
1035 |         \\  obj [files]          create an object file
1036 |         \\  dis [file]           disassemble a binary into source
1037 |         \\  targets              list the supported targets to stdout
1038 |         \\  tokenize [file]      (debug) tokenize the input files
1039 |         \\
1040 |         \\Options:
1041 |         \\  -o [file]            override output file name
1042 |         \\  -help                dump this help text to stdout
1043 |         \\  -target [arch]-[os]  specify the target for positional arguments
1044 |         \\  -debug-errors        (debug) show stack trace on error
1045 |         \\
1046 |     );
1047 | }
1048 | 
1049 | test "" {
1050 |     _ = Token;
1051 |     _ = Tokenizer;
1052 | }
1053 | 


--------------------------------------------------------------------------------
/src/tokenize.zig:
--------------------------------------------------------------------------------
  1 | const std = @import("std");
  2 | const mem = std.mem;
  3 | 
  4 | pub const Token = struct {
  5 |     id: Id,
  6 |     start: usize,
  7 |     end: usize,
  8 | 
  9 |     pub const Id = enum {
 10 |         invalid,
 11 |         identifier,
 12 |         string_literal,
 13 |         integer_literal,
 14 |         float_literal,
 15 |         char_literal,
 16 |         colon,
 17 |         comma,
 18 |         line_comment,
 19 |         line_break,
 20 |         period,
 21 |         slash,
 22 |         eof,
 23 |     };
 24 | };
 25 | 
 26 | pub const Tokenizer = struct {
 27 |     buffer: []const u8,
 28 |     index: usize,
 29 |     pending_invalid_token: ?Token,
 30 | 
 31 |     pub fn init(buffer: []const u8) Tokenizer {
 32 |         // Skip the UTF-8 BOM if present
 33 |         return Tokenizer{
 34 |             .buffer = buffer,
 35 |             .index = if (mem.startsWith(u8, buffer, "\xEF\xBB\xBF")) 3 else 0,
 36 |             .pending_invalid_token = null,
 37 |         };
 38 |     }
 39 | 
 40 |     const State = enum {
 41 |         start,
 42 |         char_literal,
 43 |         char_literal_backslash,
 44 |         char_literal_end,
 45 |         char_literal_hex_escape,
 46 |         char_literal_unicode,
 47 |         char_literal_unicode_escape,
 48 |         char_literal_unicode_escape_saw_u,
 49 |         char_literal_unicode_invalid,
 50 |         float_exponent_number,
 51 |         float_exponent_number_hex,
 52 |         float_exponent_unsigned,
 53 |         float_exponent_unsigned_hex,
 54 |         float_fraction,
 55 |         float_fraction_hex,
 56 |         identifier,
 57 |         integer_literal,
 58 |         integer_literal_with_radix,
 59 |         integer_literal_with_radix_hex,
 60 |         line_comment,
 61 |         number_dot,
 62 |         number_dot_hex,
 63 |         slash,
 64 |         string_literal,
 65 |         string_literal_backslash,
 66 |         zero,
 67 |         line_break,
 68 |     };
 69 | 
 70 |     pub fn next(self: *Tokenizer) Token {
 71 |         if (self.pending_invalid_token) |token| {
 72 |             self.pending_invalid_token = null;
 73 |             return token;
 74 |         }
 75 |         const start_index = self.index;
 76 |         var state: State = .start;
 77 |         var result = Token{
 78 |             .id = .eof,
 79 |             .start = self.index,
 80 |             .end = undefined,
 81 |         };
 82 |         var seen_escape_digits: usize = undefined;
 83 |         var remaining_code_units: usize = undefined;
 84 |         while (self.index < self.buffer.len) : (self.index += 1) {
 85 |             const c = self.buffer[self.index];
 86 |             switch (state) {
 87 |                 .start => switch (c) {
 88 |                     ' ', '\t', '\r' => {
 89 |                         result.start = self.index + 1;
 90 |                     },
 91 |                     '\n' => {
 92 |                         result.id = .line_break;
 93 |                         state = .line_break;
 94 |                     },
 95 |                     '"' => {
 96 |                         state = .string_literal;
 97 |                         result.id = .string_literal;
 98 |                     },
 99 |                     '\'' => {
100 |                         state = .char_literal;
101 |                     },
102 |                     'a'...'z', 'A'...'Z', '_' => {
103 |                         state = .identifier;
104 |                         result.id = .identifier;
105 |                     },
106 |                     ',' => {
107 |                         result.id = .comma;
108 |                         self.index += 1;
109 |                         break;
110 |                     },
111 |                     ':' => {
112 |                         result.id = .colon;
113 |                         self.index += 1;
114 |                         break;
115 |                     },
116 |                     '.' => {
117 |                         result.id = .period;
118 |                         self.index += 1;
119 |                         break;
120 |                     },
121 |                     '/' => {
122 |                         state = .slash;
123 |                     },
124 |                     '0' => {
125 |                         state = .zero;
126 |                         result.id = .integer_literal;
127 |                     },
128 |                     '1'...'9' => {
129 |                         state = .integer_literal;
130 |                         result.id = .integer_literal;
131 |                     },
132 |                     else => {
133 |                         result.id = .invalid;
134 |                         self.index += 1;
135 |                         break;
136 |                     },
137 |                 },
138 | 
139 |                 .line_break => switch (c) {
140 |                     '\n', '\r' => {},
141 |                     else => break,
142 |                 },
143 | 
144 |                 .identifier => switch (c) {
145 |                     'a'...'z', 'A'...'Z', '_', '0'...'9' => {},
146 |                     else => break,
147 |                 },
148 |                 .string_literal => switch (c) {
149 |                     '\\' => {
150 |                         state = .string_literal_backslash;
151 |                     },
152 |                     '"' => {
153 |                         self.index += 1;
154 |                         break;
155 |                     },
156 |                     '\n', '\r' => break, // Look for this error later.
157 |                     else => self.checkLiteralCharacter(),
158 |                 },
159 | 
160 |                 .string_literal_backslash => switch (c) {
161 |                     '\n', '\r' => break, // Look for this error later.
162 |                     else => {
163 |                         state = .string_literal;
164 |                     },
165 |                 },
166 | 
167 |                 .char_literal => switch (c) {
168 |                     '\\' => {
169 |                         state = .char_literal_backslash;
170 |                     },
171 |                     '\'', 0x80...0xbf, 0xf8...0xff => {
172 |                         result.id = .invalid;
173 |                         break;
174 |                     },
175 |                     0xc0...0xdf => { // 110xxxxx
176 |                         remaining_code_units = 1;
177 |                         state = .char_literal_unicode;
178 |                     },
179 |                     0xe0...0xef => { // 1110xxxx
180 |                         remaining_code_units = 2;
181 |                         state = .char_literal_unicode;
182 |                     },
183 |                     0xf0...0xf7 => { // 11110xxx
184 |                         remaining_code_units = 3;
185 |                         state = .char_literal_unicode;
186 |                     },
187 |                     else => {
188 |                         state = .char_literal_end;
189 |                     },
190 |                 },
191 | 
192 |                 .char_literal_backslash => switch (c) {
193 |                     '\n' => {
194 |                         result.id = .invalid;
195 |                         break;
196 |                     },
197 |                     'x' => {
198 |                         state = .char_literal_hex_escape;
199 |                         seen_escape_digits = 0;
200 |                     },
201 |                     'u' => {
202 |                         state = .char_literal_unicode_escape_saw_u;
203 |                     },
204 |                     else => {
205 |                         state = .char_literal_end;
206 |                     },
207 |                 },
208 | 
209 |                 .char_literal_hex_escape => switch (c) {
210 |                     '0'...'9', 'a'...'f', 'A'...'F' => {
211 |                         seen_escape_digits += 1;
212 |                         if (seen_escape_digits == 2) {
213 |                             state = .char_literal_end;
214 |                         }
215 |                     },
216 |                     else => {
217 |                         result.id = .invalid;
218 |                         break;
219 |                     },
220 |                 },
221 | 
222 |                 .char_literal_unicode_escape_saw_u => switch (c) {
223 |                     '{' => {
224 |                         state = .char_literal_unicode_escape;
225 |                         seen_escape_digits = 0;
226 |                     },
227 |                     else => {
228 |                         result.id = .invalid;
229 |                         state = .char_literal_unicode_invalid;
230 |                     },
231 |                 },
232 | 
233 |                 .char_literal_unicode_escape => switch (c) {
234 |                     '0'...'9', 'a'...'f', 'A'...'F' => {
235 |                         seen_escape_digits += 1;
236 |                     },
237 |                     '}' => {
238 |                         if (seen_escape_digits == 0) {
239 |                             result.id = .invalid;
240 |                             state = .char_literal_unicode_invalid;
241 |                         } else {
242 |                             state = .char_literal_end;
243 |                         }
244 |                     },
245 |                     else => {
246 |                         result.id = .invalid;
247 |                         state = .char_literal_unicode_invalid;
248 |                     },
249 |                 },
250 | 
251 |                 .char_literal_unicode_invalid => switch (c) {
252 |                     // Keep consuming characters until an obvious stopping point.
253 |                     // This consolidates e.g. `u{0ab1Q}` into a single invalid token
254 |                     // instead of creating the tokens `u{0ab1`, `Q`, `}`
255 |                     '0'...'9', 'a'...'z', 'A'...'Z', '}' => {},
256 |                     else => break,
257 |                 },
258 | 
259 |                 .char_literal_end => switch (c) {
260 |                     '\'' => {
261 |                         result.id = .char_literal;
262 |                         self.index += 1;
263 |                         break;
264 |                     },
265 |                     else => {
266 |                         result.id = .invalid;
267 |                         break;
268 |                     },
269 |                 },
270 | 
271 |                 .char_literal_unicode => switch (c) {
272 |                     0x80...0xbf => {
273 |                         remaining_code_units -= 1;
274 |                         if (remaining_code_units == 0) {
275 |                             state = .char_literal_end;
276 |                         }
277 |                     },
278 |                     else => {
279 |                         result.id = .invalid;
280 |                         break;
281 |                     },
282 |                 },
283 | 
284 |                 .slash => switch (c) {
285 |                     '/' => {
286 |                         state = .line_comment;
287 |                         result.id = .line_comment;
288 |                     },
289 |                     else => {
290 |                         result.id = .slash;
291 |                         break;
292 |                     },
293 |                 },
294 |                 .line_comment => switch (c) {
295 |                     '\n' => break,
296 |                     else => self.checkLiteralCharacter(),
297 |                 },
298 |                 .zero => switch (c) {
299 |                     'b', 'o' => {
300 |                         state = .integer_literal_with_radix;
301 |                     },
302 |                     'x' => {
303 |                         state = .integer_literal_with_radix_hex;
304 |                     },
305 |                     else => {
306 |                         // reinterpret as a normal number
307 |                         self.index -= 1;
308 |                         state = .integer_literal;
309 |                     },
310 |                 },
311 |                 .integer_literal => switch (c) {
312 |                     '.' => {
313 |                         state = .number_dot;
314 |                     },
315 |                     'p', 'P', 'e', 'E' => {
316 |                         state = .float_exponent_unsigned;
317 |                     },
318 |                     '0'...'9' => {},
319 |                     else => break,
320 |                 },
321 |                 .integer_literal_with_radix => switch (c) {
322 |                     '.' => {
323 |                         state = .number_dot;
324 |                     },
325 |                     '0'...'9' => {},
326 |                     else => break,
327 |                 },
328 |                 .integer_literal_with_radix_hex => switch (c) {
329 |                     '.' => {
330 |                         state = .number_dot_hex;
331 |                     },
332 |                     'p', 'P' => {
333 |                         state = .float_exponent_unsigned_hex;
334 |                     },
335 |                     '0'...'9', 'a'...'f', 'A'...'F' => {},
336 |                     else => break,
337 |                 },
338 |                 .number_dot => switch (c) {
339 |                     '.' => {
340 |                         self.index -= 1;
341 |                         state = .start;
342 |                         break;
343 |                     },
344 |                     else => {
345 |                         self.index -= 1;
346 |                         result.id = .float_literal;
347 |                         state = .float_fraction;
348 |                     },
349 |                 },
350 |                 .number_dot_hex => switch (c) {
351 |                     '.' => {
352 |                         self.index -= 1;
353 |                         state = .start;
354 |                         break;
355 |                     },
356 |                     else => {
357 |                         self.index -= 1;
358 |                         result.id = .float_literal;
359 |                         state = .float_fraction_hex;
360 |                     },
361 |                 },
362 |                 .float_fraction => switch (c) {
363 |                     'e', 'E' => {
364 |                         state = .float_exponent_unsigned;
365 |                     },
366 |                     '0'...'9' => {},
367 |                     else => break,
368 |                 },
369 |                 .float_fraction_hex => switch (c) {
370 |                     'p', 'P' => {
371 |                         state = .float_exponent_unsigned_hex;
372 |                     },
373 |                     '0'...'9', 'a'...'f', 'A'...'F' => {},
374 |                     else => break,
375 |                 },
376 |                 .float_exponent_unsigned => switch (c) {
377 |                     '+', '-' => {
378 |                         state = .float_exponent_number;
379 |                     },
380 |                     else => {
381 |                         // reinterpret as a normal exponent number
382 |                         self.index -= 1;
383 |                         state = .float_exponent_number;
384 |                     },
385 |                 },
386 |                 .float_exponent_unsigned_hex => switch (c) {
387 |                     '+', '-' => {
388 |                         state = .float_exponent_number_hex;
389 |                     },
390 |                     else => {
391 |                         // reinterpret as a normal exponent number
392 |                         self.index -= 1;
393 |                         state = .float_exponent_number_hex;
394 |                     },
395 |                 },
396 |                 .float_exponent_number => switch (c) {
397 |                     '0'...'9' => {},
398 |                     else => break,
399 |                 },
400 |                 .float_exponent_number_hex => switch (c) {
401 |                     '0'...'9', 'a'...'f', 'A'...'F' => {},
402 |                     else => break,
403 |                 },
404 |             }
405 |         } else if (self.index == self.buffer.len) {
406 |             switch (state) {
407 |                 .start,
408 |                 .integer_literal,
409 |                 .integer_literal_with_radix,
410 |                 .integer_literal_with_radix_hex,
411 |                 .float_fraction,
412 |                 .float_fraction_hex,
413 |                 .float_exponent_number,
414 |                 .float_exponent_number_hex,
415 |                 .line_break,
416 |                 .string_literal, // find this error later
417 |                 => {},
418 | 
419 |                 .identifier => {},
420 | 
421 |                 .line_comment => {
422 |                     result.id = Token.Id.line_comment;
423 |                 },
424 | 
425 |                 .number_dot,
426 |                 .number_dot_hex,
427 |                 .float_exponent_unsigned,
428 |                 .float_exponent_unsigned_hex,
429 |                 .char_literal,
430 |                 .char_literal_backslash,
431 |                 .char_literal_hex_escape,
432 |                 .char_literal_unicode_escape_saw_u,
433 |                 .char_literal_unicode_escape,
434 |                 .char_literal_unicode_invalid,
435 |                 .char_literal_end,
436 |                 .char_literal_unicode,
437 |                 .string_literal_backslash,
438 |                 => {
439 |                     result.id = .invalid;
440 |                 },
441 | 
442 |                 .slash => {
443 |                     result.id = .slash;
444 |                 },
445 |                 .zero => {
446 |                     result.id = .integer_literal;
447 |                 },
448 |             }
449 |         }
450 | 
451 |         if (result.id == .eof) {
452 |             if (self.pending_invalid_token) |token| {
453 |                 self.pending_invalid_token = null;
454 |                 return token;
455 |             }
456 |         }
457 | 
458 |         result.end = self.index;
459 |         return result;
460 |     }
461 | 
462 |     fn checkLiteralCharacter(self: *Tokenizer) void {
463 |         if (self.pending_invalid_token != null) return;
464 |         const invalid_length = self.getInvalidCharacterLength();
465 |         if (invalid_length == 0) return;
466 |         self.pending_invalid_token = Token{
467 |             .id = .invalid,
468 |             .start = self.index,
469 |             .end = self.index + invalid_length,
470 |         };
471 |     }
472 | 
473 |     fn getInvalidCharacterLength(self: *Tokenizer) u3 {
474 |         const c0 = self.buffer[self.index];
475 |         if (c0 < 0x80) {
476 |             if (c0 < 0x20 or c0 == 0x7f) {
477 |                 // ascii control codes are never allowed
478 |                 // (note that \n was checked before we got here)
479 |                 return 1;
480 |             }
481 |             // looks fine to me.
482 |             return 0;
483 |         } else {
484 |             // check utf8-encoded character.
485 |             const length = std.unicode.utf8ByteSequenceLength(c0) catch return 1;
486 |             if (self.index + length > self.buffer.len) {
487 |                 return @intCast(u3, self.buffer.len - self.index);
488 |             }
489 |             const bytes = self.buffer[self.index .. self.index + length];
490 |             switch (length) {
491 |                 2 => {
492 |                     const value = std.unicode.utf8Decode2(bytes) catch return length;
493 |                     if (value == 0x85) return length; // U+0085 (NEL)
494 |                 },
495 |                 3 => {
496 |                     const value = std.unicode.utf8Decode3(bytes) catch return length;
497 |                     if (value == 0x2028) return length; // U+2028 (LS)
498 |                     if (value == 0x2029) return length; // U+2029 (PS)
499 |                 },
500 |                 4 => {
501 |                     _ = std.unicode.utf8Decode4(bytes) catch return length;
502 |                 },
503 |                 else => unreachable,
504 |             }
505 |             self.index += length - 1;
506 |             return 0;
507 |         }
508 |     }
509 | };
510 | 
511 | test "tokenizer - char literal with hex escape" {
512 |     testTokenize(
513 |         \\'\x1b'
514 |     , [_]Token.Id{.char_literal});
515 |     testTokenize(
516 |         \\'\x1'
517 |     , [_]Token.Id{ .invalid, .invalid });
518 | }
519 | 
520 | test "tokenizer - char literal with unicode escapes" {
521 |     // Valid unicode escapes
522 |     testTokenize(
523 |         \\'\u{3}'
524 |     , [_]Token.Id{.char_literal});
525 |     testTokenize(
526 |         \\'\u{01}'
527 |     , [_]Token.Id{.char_literal});
528 |     testTokenize(
529 |         \\'\u{2a}'
530 |     , [_]Token.Id{.char_literal});
531 |     testTokenize(
532 |         \\'\u{3f9}'
533 |     , [_]Token.Id{.char_literal});
534 |     testTokenize(
535 |         \\'\u{6E09aBc1523}'
536 |     , [_]Token.Id{.char_literal});
537 |     testTokenize(
538 |         \\"\u{440}"
539 |     , [_]Token.Id{.string_literal});
540 | 
541 |     // Invalid unicode escapes
542 |     testTokenize(
543 |         \\'\u'
544 |     , [_]Token.Id{.invalid});
545 |     testTokenize(
546 |         \\'\u{{'
547 |     , [_]Token.Id{ .invalid, .invalid });
548 |     testTokenize(
549 |         \\'\u{}'
550 |     , [_]Token.Id{ .invalid, .invalid });
551 |     testTokenize(
552 |         \\'\u{s}'
553 |     , [_]Token.Id{ .invalid, .invalid });
554 |     testTokenize(
555 |         \\'\u{2z}'
556 |     , [_]Token.Id{ .invalid, .invalid });
557 |     testTokenize(
558 |         \\'\u{4a'
559 |     , [_]Token.Id{.invalid});
560 | 
561 |     // Test old-style unicode literals
562 |     testTokenize(
563 |         \\'\u0333'
564 |     , [_]Token.Id{ .invalid, .invalid });
565 |     testTokenize(
566 |         \\'\U0333'
567 |     , [_]Token.Id{ .invalid, .integer_literal, .invalid });
568 | }
569 | 
570 | test "tokenizer - char literal with unicode code point" {
571 |     testTokenize(
572 |         \\'💩'
573 |     , [_]Token.Id{.char_literal});
574 | }
575 | 
576 | test "tokenizer - line comment followed by identifier" {
577 |     testTokenize(
578 |         \\    Unexpected,
579 |         \\    // another
580 |         \\    Another,
581 |     , [_]Token.Id{
582 |         .identifier,
583 |         .comma,
584 |         .line_comment,
585 |         .identifier,
586 |         .comma,
587 |     });
588 | }
589 | 
590 | test "tokenizer - UTF-8 BOM is recognized and skipped" {
591 |     testTokenize("\xEF\xBB\xBFa.\n", [_]Token.Id{
592 |         .identifier,
593 |         .period,
594 |     });
595 | }
596 | 
597 | fn testTokenize(source: []const u8, expected_tokens: []const Token.Id) void {
598 |     var tokenizer = Tokenizer.init(source);
599 |     for (expected_tokens) |expected_token_id| {
600 |         const token = tokenizer.next();
601 |         if (token.id != expected_token_id) {
602 |             std.debug.panic("expected {}, found {}\n", @tagName(expected_token_id), @tagName(token.id));
603 |         }
604 |     }
605 |     const last_token = tokenizer.next();
606 |     std.testing.expect(last_token.id == .eof);
607 | }
608 | 


--------------------------------------------------------------------------------
/test/aarch64v8-linux/hello.s:
--------------------------------------------------------------------------------
 1 | 	.text
 2 | 	.globl	_start
 3 | _start:
 4 | 	adrp	x1, msg
 5 | 	mov	w0, 1
 6 | 	add	x1, x1, 288
 7 | 	mov	w8, 64
 8 | 	mov	w2, 14
 9 | 	svc	0
10 | 	mov	x0, xzr
11 | 	mov	w8, 93
12 | 	svc	0
13 | 
14 | 	.section	.rodata,"a"
15 | msg:
16 | 	.ascii	"Hello, world!\n"
17 | 	.size	msg, 14
18 | 


--------------------------------------------------------------------------------
/test/x86_64-linux/hello.s:
--------------------------------------------------------------------------------
 1 | 	.text
 2 | 	.globl	_start
 3 | _start:
 4 | 	mov	eax, 0x1
 5 | 	mov	edi, 0x1
 6 | 	mov	esi, msg
 7 | 	mov	edx, 14
 8 | 	syscall
 9 | 	mov	eax, 60
10 | 	xor	edi, edi
11 | 	syscall
12 | 	.section	.rodata,"a"
13 | msg:
14 | 	.ascii	"Hello, world!\n"
15 | 


--------------------------------------------------------------------------------