├── .gitignore ├── LICENSE ├── README.md ├── build.zig ├── src ├── data.zig ├── main.zig └── tokenize.zig └── test ├── aarch64v8-linux └── hello.s └── x86_64-linux └── hello.s /.gitignore: -------------------------------------------------------------------------------- 1 | zig-cache 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (Expat) 2 | 3 | Copyright (c) 2019 Andrew Kelley 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # zasm 2 | 3 | Multi-target assembler, disassembler, and linker. 4 | 5 | This is my experimental playground for a non-LLVM Zig backend. 6 | 7 | ## Status 8 | 9 | This project has only just begun. There is nothing to see here yet. 10 | 11 | ## Planned Targets 12 | 13 | ### Architectures 14 | 15 | All of them. 16 | 17 | No matter how insignificant the architecture, provided it has reached some kind 18 | of stable release that includes a specification, it is in scope. 19 | 20 | ### Object File Formats 21 | 22 | * ELF 23 | * COFF 24 | * MACH-O 25 | * WebAssembly 26 | 27 | ### Executable File Formats 28 | 29 | * ELF 30 | * PE (Portable Executable) (Windows) 31 | * WebAssembly 32 | 33 | ### Debug Info Formats 34 | 35 | * DWARF 36 | * PDB 37 | 38 | ## Roadmap 39 | 40 | * Hello world aarch64 assembly 41 | * Hello world x86_64 assembly split across 2 files 42 | * Hello world i386 assembly 43 | * Tests 44 | * Symbol table 45 | * DWARF Debug Info 46 | * Support more instructions 47 | * Build objects 48 | * Link objects 49 | * Incremental linking 50 | -------------------------------------------------------------------------------- /build.zig: -------------------------------------------------------------------------------- 1 | const Builder = @import("std").build.Builder; 2 | 3 | pub fn build(b: *Builder) void { 4 | const mode = b.standardReleaseOptions(); 5 | const exe = b.addExecutable("zasm", "src/main.zig"); 6 | exe.setBuildMode(mode); 7 | exe.install(); 8 | 9 | const test_step = b.addTest("src/main.zig"); 10 | const test_cmd = b.step("test", "Run the tests"); 11 | test_cmd.dependOn(&test_step.step); 12 | } 13 | -------------------------------------------------------------------------------- /src/data.zig: -------------------------------------------------------------------------------- 1 | pub const Instruction = struct { 2 | /// Primary opcode 3 | po: u8, 4 | prefix: ?u8 = null, 5 | suffix: ?u8 = null, 6 | name: []const u8, 7 | args: []const Arg, 8 | size: u64, 9 | }; 10 | 11 | pub const Register = enum { 12 | eax, 13 | edi, 14 | edx, 15 | esi, 16 | }; 17 | 18 | pub const Arg = union(enum) { 19 | register: Register, 20 | immediate, 21 | }; 22 | 23 | pub const instructions = [_]Instruction{ 24 | .{ 25 | .name = "mov", 26 | .po = 0xb8, 27 | .args = &[_]Arg{ 28 | .{ .register = .eax }, 29 | .immediate, 30 | }, 31 | .size = 5, 32 | }, 33 | .{ 34 | .name = "mov", 35 | .po = 0xbf, 36 | .args = &[_]Arg{ 37 | .{ .register = .edi }, 38 | .immediate, 39 | }, 40 | .size = 5, 41 | }, 42 | .{ 43 | .name = "mov", 44 | .po = 0xbe, 45 | .args = &[_]Arg{ 46 | .{ .register = .esi }, 47 | .immediate, 48 | }, 49 | .size = 5, 50 | }, 51 | .{ 52 | .name = "mov", 53 | .po = 0xba, 54 | .args = &[_]Arg{ 55 | .{ .register = .edx }, 56 | .immediate, 57 | }, 58 | .size = 5, 59 | }, 60 | .{ 61 | .name = "syscall", 62 | .po = 0x05, 63 | .prefix = 0x0f, 64 | .args = &[_]Arg{}, 65 | .size = 2, 66 | }, 67 | .{ 68 | .name = "xor", 69 | .po = 0x31, 70 | .suffix = 0xff, 71 | .args = &[_]Arg{ 72 | .{ .register = .edi }, 73 | .{ .register = .edi }, 74 | }, 75 | .size = 2, 76 | }, 77 | }; 78 | -------------------------------------------------------------------------------- /src/main.zig: -------------------------------------------------------------------------------- 1 | const std = @import("std"); 2 | const Tokenizer = @import("tokenize.zig").Tokenizer; 3 | const Token = @import("tokenize.zig").Token; 4 | const mem = std.mem; 5 | const fs = std.fs; 6 | const process = std.process; 7 | const math = std.math; 8 | const data = @import("data.zig"); 9 | const parseStringLiteral = std.zig.parseStringLiteral; 10 | const elf = std.elf; 11 | const assert = std.debug.assert; 12 | 13 | const Cmd = enum { 14 | exe, 15 | obj, 16 | dis, 17 | targets, 18 | tokenize, 19 | }; 20 | 21 | const Assembly = struct { 22 | allocator: *mem.Allocator, 23 | input_files: []const []const u8, 24 | asm_files: []AsmFile, 25 | target: std.Target, 26 | errors: std.ArrayList(Error), 27 | entry_addr: ?u64 = null, 28 | output_file: ?[]const u8, 29 | file_offset: u64, 30 | next_map_addr: u64 = 0x10000, 31 | sections: SectionTable, 32 | 33 | const SectionTable = std.StringHashMap(*Section); 34 | 35 | pub const SourceInfo = struct { 36 | token: Token, 37 | source: []const u8, 38 | file_name: []const u8, 39 | }; 40 | 41 | const Error = union(enum) { 42 | unexpected_token: struct { 43 | source_info: SourceInfo, 44 | }, 45 | unrecognized_directive: struct { 46 | source_info: SourceInfo, 47 | }, 48 | unrecognized_instruction: struct { 49 | source_info: SourceInfo, 50 | }, 51 | symbol_outside_section: struct { 52 | source_info: SourceInfo, 53 | }, 54 | duplicate_symbol: struct { 55 | source_info: SourceInfo, 56 | other_symbol: Token, 57 | }, 58 | bad_integer_literal: struct { 59 | source_info: SourceInfo, 60 | }, 61 | instr_outside_symbol: struct { 62 | source_info: SourceInfo, 63 | }, 64 | bad_string_literal: struct { 65 | source_info: SourceInfo, 66 | bad_index: usize, 67 | }, 68 | bad_section_flag: struct { 69 | source_info: SourceInfo, 70 | bad_index: usize, 71 | bad_byte: u8, 72 | }, 73 | too_many_args: struct { 74 | source_info: SourceInfo, 75 | }, 76 | unknown_symbol: struct { 77 | source_info: SourceInfo, 78 | }, 79 | 80 | fn printToStream(stream: var, comptime message: []const u8, source_info: SourceInfo, args: ...) !void { 81 | const loc = tokenLocation(source_info.source, source_info.token); 82 | try stream.print( 83 | "{}:{}:{}: " ++ message, 84 | source_info.file_name, 85 | loc.line + 1, 86 | loc.column + 1, 87 | args, 88 | ); 89 | } 90 | 91 | fn render(self: Error, stream: var) !void { 92 | switch (self) { 93 | .unexpected_token => |info| { 94 | try printToStream( 95 | stream, 96 | "error: unexpected token: {}\n", 97 | info.source_info, 98 | @tagName(info.source_info.token.id), 99 | ); 100 | }, 101 | .unrecognized_directive => |info| { 102 | const si = info.source_info; 103 | try printToStream( 104 | stream, 105 | "error: unrecognized directive: {}\n", 106 | si, 107 | si.source[si.token.start..si.token.end], 108 | ); 109 | }, 110 | .unrecognized_instruction => |info| { 111 | const si = info.source_info; 112 | try printToStream( 113 | stream, 114 | "error: instruction name or parameters do not match asm database: {}\n", 115 | si, 116 | si.source[si.token.start..si.token.end], 117 | ); 118 | }, 119 | .symbol_outside_section => |info| { 120 | const si = info.source_info; 121 | try printToStream( 122 | stream, 123 | "error: symbol outside section: {}\n", 124 | si, 125 | si.source[si.token.start..si.token.end], 126 | ); 127 | }, 128 | .bad_integer_literal => |info| { 129 | const si = info.source_info; 130 | try printToStream( 131 | stream, 132 | "error: invalid integer literal: {}\n", 133 | si, 134 | si.source[si.token.start..si.token.end], 135 | ); 136 | }, 137 | .bad_string_literal => |info| { 138 | const loc = tokenLocation(info.source_info.source, info.source_info.token); 139 | try stream.print( 140 | "{}:{}:{}: error: invalid byte in string literal\n", 141 | info.source_info.file_name, 142 | loc.line + 1, 143 | loc.column + 1 + info.bad_index, 144 | ); 145 | }, 146 | .bad_section_flag => |info| { 147 | const loc = tokenLocation(info.source_info.source, info.source_info.token); 148 | try stream.print( 149 | "{}:{}:{}: error: invalid section flag: '{c}'\n", 150 | info.source_info.file_name, 151 | loc.line + 1, 152 | loc.column + 1 + info.bad_index, 153 | info.bad_byte, 154 | ); 155 | }, 156 | .instr_outside_symbol => |info| { 157 | const si = info.source_info; 158 | try printToStream( 159 | stream, 160 | "error: instruction outside symbol\n", 161 | si, 162 | ); 163 | }, 164 | .duplicate_symbol => |info| { 165 | const si = info.source_info; 166 | const other_loc = tokenLocation(si.source, info.other_symbol); 167 | try printToStream( 168 | stream, 169 | "error: duplicate symbol: {}\n" ++ 170 | "{}:{}:{}: note: original definition. \n", 171 | si, 172 | si.source[si.token.start..si.token.end], 173 | 174 | si.file_name, 175 | other_loc.line + 1, 176 | other_loc.column + 1, 177 | ); 178 | }, 179 | .too_many_args => |info| { 180 | const si = info.source_info; 181 | try printToStream( 182 | stream, 183 | "error: too many args\n", 184 | si, 185 | ); 186 | }, 187 | .unknown_symbol => |info| { 188 | const si = info.source_info; 189 | try printToStream( 190 | stream, 191 | "error: unknown symbol\n", 192 | si, 193 | ); 194 | }, 195 | } 196 | } 197 | }; 198 | }; 199 | 200 | const Section = struct { 201 | name: []const u8, 202 | layout: std.ArrayList(*Symbol), 203 | alignment: u32, 204 | file_offset: u64, 205 | file_size: u64, 206 | virt_addr: u64, 207 | mem_size: u64, 208 | flags: u32, 209 | }; 210 | 211 | const Symbol = struct { 212 | /// `undefined` until a second pass when addresses are calculated. 213 | addr: u64, 214 | 215 | /// Starts at 0. Increments with instructions being added. 216 | size: u64, 217 | 218 | source_token: Token, 219 | name: []const u8, 220 | section: *Section, 221 | 222 | ops: std.ArrayList(PseudoOp), 223 | 224 | source_file: *AsmFile, 225 | }; 226 | 227 | const Instruction = struct { 228 | props: *const data.Instruction, 229 | args: []Arg, 230 | }; 231 | 232 | const PseudoOp = union(enum) { 233 | instruction: Instruction, 234 | data: []const u8, 235 | }; 236 | 237 | const Location = struct { 238 | line: usize, 239 | column: usize, 240 | line_start: usize, 241 | line_end: usize, 242 | }; 243 | 244 | fn tokenLocation(source: []const u8, token: Token) Location { 245 | const start_index = 0; 246 | var loc = Location{ 247 | .line = 0, 248 | .column = 0, 249 | .line_start = start_index, 250 | .line_end = source.len, 251 | }; 252 | const token_start = token.start; 253 | for (source[start_index..]) |c, i| { 254 | if (i + start_index == token_start) { 255 | loc.line_end = i + start_index; 256 | while (loc.line_end < source.len and source[loc.line_end] != '\n') : (loc.line_end += 1) {} 257 | return loc; 258 | } 259 | if (c == '\n') { 260 | loc.line += 1; 261 | loc.column = 0; 262 | loc.line_start = i + 1; 263 | } else { 264 | loc.column += 1; 265 | } 266 | } 267 | return loc; 268 | } 269 | 270 | pub fn main() anyerror!void { 271 | var arena = std.heap.ArenaAllocator.init(std.heap.direct_allocator); 272 | defer arena.deinit(); 273 | const allocator = &arena.allocator; 274 | 275 | var assembly = Assembly{ 276 | .allocator = allocator, 277 | .target = .Native, 278 | .input_files = undefined, 279 | .errors = std.ArrayList(Assembly.Error).init(allocator), 280 | .output_file = null, 281 | .sections = Assembly.SectionTable.init(allocator), 282 | .asm_files = undefined, 283 | .file_offset = 0, 284 | }; 285 | var input_files = std.ArrayList([]const u8).init(allocator); 286 | var maybe_cmd: ?Cmd = null; 287 | var debug_errors = false; 288 | 289 | const args = try process.argsAlloc(allocator); 290 | var arg_i: usize = 1; 291 | while (arg_i < args.len) : (arg_i += 1) { 292 | const full_arg = args[arg_i]; 293 | if (mem.startsWith(u8, full_arg, "-")) { 294 | const arg = full_arg[1..]; 295 | if (mem.eql(u8, arg, "help")) { 296 | try dumpUsage(std.io.getStdOut()); 297 | return; 298 | } else if (mem.eql(u8, arg, "debug-errors")) { 299 | debug_errors = true; 300 | } else { 301 | arg_i += 1; 302 | if (arg_i >= args.len) { 303 | std.debug.warn("Expected another parameter after '{}'\n", full_arg); 304 | dumpStdErrUsageAndExit(); 305 | } else if (mem.eql(u8, arg, "target")) { 306 | assembly.target = try std.Target.parse(args[arg_i]); 307 | } else if (mem.eql(u8, arg, "o")) { 308 | assembly.output_file = args[arg_i]; 309 | } else { 310 | std.debug.warn("Invalid parameter: {}\n", full_arg); 311 | dumpStdErrUsageAndExit(); 312 | } 313 | } 314 | } else if (maybe_cmd == null) { 315 | inline for (std.meta.fields(Cmd)) |field| { 316 | if (mem.eql(u8, full_arg, field.name)) { 317 | maybe_cmd = @field(Cmd, field.name); 318 | break; 319 | } 320 | } else { 321 | std.debug.warn("Invalid command: {}\n", full_arg); 322 | dumpStdErrUsageAndExit(); 323 | } 324 | } else { 325 | try input_files.append(full_arg); 326 | } 327 | } 328 | 329 | const cmd = maybe_cmd orelse { 330 | std.debug.warn("Expected a command parameter\n"); 331 | dumpStdErrUsageAndExit(); 332 | }; 333 | 334 | switch (cmd) { 335 | .targets => { 336 | try std.io.getStdOut().write( 337 | \\x86_64-linux 338 | \\ 339 | ); 340 | return; 341 | }, 342 | .exe => { 343 | assembly.input_files = input_files.toSliceConst(); 344 | assembleExecutable(&assembly) catch |err| switch (err) { 345 | error.ParseFailure => { 346 | const stream = &std.io.getStdErr().outStream().stream; 347 | for (assembly.errors.toSliceConst()) |asm_err| { 348 | try asm_err.render(stream); 349 | } 350 | if (debug_errors) { 351 | return err; 352 | } else { 353 | process.exit(1); 354 | } 355 | }, 356 | else => |e| return e, 357 | }; 358 | }, 359 | .obj => { 360 | std.debug.warn("object files not yet implemented\n"); 361 | process.exit(1); 362 | }, 363 | .dis => { 364 | std.debug.warn("disassembly not yet implemented\n"); 365 | process.exit(1); 366 | }, 367 | .tokenize => { 368 | const stdout = &std.io.getStdOut().outStream().stream; 369 | const cwd = fs.cwd(); 370 | for (input_files.toSliceConst()) |input_file| { 371 | const source = try cwd.readFileAlloc(allocator, input_file, math.maxInt(usize)); 372 | var tokenizer = Tokenizer.init(source); 373 | while (true) { 374 | const token = tokenizer.next(); 375 | if (token.id == .eof) break; 376 | try stdout.print("{}: {}\n", @tagName(token.id), source[token.start..token.end]); 377 | } 378 | } 379 | }, 380 | } 381 | } 382 | 383 | const Arg = union(enum) { 384 | register: data.Register, 385 | immediate: u64, 386 | symbol_ref: Token, 387 | }; 388 | 389 | const AsmFile = struct { 390 | source: []const u8, 391 | file_name: []const u8, 392 | tokenizer: Tokenizer, 393 | assembly: *Assembly, 394 | current_section: ?*Section = null, 395 | current_symbol: ?*Symbol = null, 396 | globals: GlobalSet, 397 | put_back_buffer: [1]Token, 398 | put_back_count: u1, 399 | symbols: SymbolTable, 400 | 401 | const SymbolTable = std.StringHashMap(*Symbol); 402 | const GlobalSet = std.StringHashMap(void); 403 | 404 | fn tokenSlice(asm_file: AsmFile, token: Token) []const u8 { 405 | return asm_file.source[token.start..token.end]; 406 | } 407 | 408 | fn findOrCreateSection(self: *AsmFile, name: []const u8, flags: u32) !*Section { 409 | const gop = try self.assembly.sections.getOrPut(name); 410 | if (gop.found_existing) { 411 | // TODO deal with flags conflicts 412 | return gop.kv.value; 413 | } 414 | const section = try self.assembly.sections.allocator.create(Section); 415 | section.* = Section{ 416 | .name = name, 417 | .layout = std.ArrayList(*Symbol).init(self.assembly.sections.allocator), 418 | .alignment = 0x1000, 419 | .file_offset = undefined, 420 | .file_size = 0, 421 | .virt_addr = undefined, 422 | .mem_size = 0, 423 | .flags = flags, 424 | }; 425 | gop.kv.value = section; 426 | return section; 427 | } 428 | 429 | fn setCurrentSection(self: *AsmFile, name: []const u8, flags: u32) !void { 430 | const section = try self.findOrCreateSection(name, flags); 431 | self.current_section = section; 432 | } 433 | 434 | fn setCurrentSectionFlagsTok(self: *AsmFile, name: []const u8, flags_tok: Token) !void { 435 | var flags: u32 = 0; 436 | const flags_str = blk: { 437 | const tok_slice = self.tokenSlice(flags_tok); 438 | // skip over the double quotes 439 | break :blk tok_slice[1 .. tok_slice.len - 1]; 440 | }; 441 | for (flags_str) |b, offset| switch (b) { 442 | 'a', 'r' => flags |= elf.PF_R, 443 | 'w' => flags |= elf.PF_W, 444 | 'x' => flags |= elf.PF_X, 445 | else => { 446 | try self.assembly.errors.append(.{ 447 | .bad_section_flag = .{ 448 | .source_info = newSourceInfo(self, flags_tok), 449 | .bad_index = offset, 450 | .bad_byte = b, 451 | }, 452 | }); 453 | return error.ParseFailure; 454 | }, 455 | }; 456 | 457 | return self.setCurrentSection(name, flags); 458 | } 459 | 460 | fn beginSymbol(self: *AsmFile, source_token: Token, name: []const u8) !void { 461 | const current_section = self.current_section orelse { 462 | try self.assembly.errors.append(.{ 463 | .symbol_outside_section = .{ .source_info = newSourceInfo(self, source_token) }, 464 | }); 465 | return error.ParseFailure; 466 | }; 467 | const symbol = try self.symbols.allocator.create(Symbol); 468 | symbol.* = Symbol{ 469 | .addr = undefined, 470 | .size = 0, 471 | .source_token = source_token, 472 | .name = name, 473 | .section = current_section, 474 | .ops = std.ArrayList(PseudoOp).init(self.assembly.allocator), 475 | .source_file = self, 476 | }; 477 | if (try self.symbols.put(name, symbol)) |existing_entry| { 478 | try self.assembly.errors.append(.{ 479 | .duplicate_symbol = .{ 480 | .source_info = newSourceInfo(self, source_token), 481 | .other_symbol = existing_entry.value.source_token, 482 | }, 483 | }); 484 | return error.ParseFailure; 485 | } 486 | try current_section.layout.append(symbol); 487 | self.current_symbol = symbol; 488 | } 489 | 490 | fn addGlobal(self: *AsmFile, name: []const u8) !void { 491 | _ = try self.globals.put(name, {}); 492 | } 493 | 494 | fn nextToken(self: *AsmFile) Token { 495 | if (self.put_back_count == 0) { 496 | return self.tokenizer.next(); 497 | } else { 498 | self.put_back_count -= 1; 499 | return self.put_back_buffer[self.put_back_count]; 500 | } 501 | } 502 | 503 | fn eatToken(self: *AsmFile, id: Token.Id) ?Token { 504 | const token = self.nextToken(); 505 | if (token.id == id) return token; 506 | self.putBackToken(token); 507 | return null; 508 | } 509 | 510 | fn putBackToken(self: *AsmFile, token: Token) void { 511 | self.put_back_buffer[self.put_back_count] = token; 512 | self.put_back_count += 1; 513 | } 514 | 515 | fn expectToken(asm_file: *AsmFile, id: Token.Id) !Token { 516 | const token = asm_file.nextToken(); 517 | if (token.id != id) { 518 | try asm_file.assembly.errors.append(.{ 519 | .unexpected_token = .{ .source_info = newSourceInfo(asm_file, token) }, 520 | }); 521 | return error.ParseFailure; 522 | } 523 | return token; 524 | } 525 | 526 | fn getCurrentSymbol(asm_file: *AsmFile, source_token: Token) !*Symbol { 527 | return asm_file.current_symbol orelse { 528 | try asm_file.assembly.errors.append(.{ 529 | .instr_outside_symbol = .{ .source_info = newSourceInfo(asm_file, source_token) }, 530 | }); 531 | return error.ParseFailure; 532 | }; 533 | } 534 | }; 535 | 536 | fn newSourceInfo(asm_file: *AsmFile, tok: Token) Assembly.SourceInfo { 537 | return .{ 538 | .token = tok, 539 | .source = asm_file.source, 540 | .file_name = asm_file.file_name, 541 | }; 542 | } 543 | 544 | fn assembleExecutable(assembly: *Assembly) !void { 545 | const cwd = fs.cwd(); 546 | 547 | assembly.asm_files = try assembly.allocator.alloc(AsmFile, assembly.input_files.len); 548 | 549 | for (assembly.input_files) |input_file, input_file_index| { 550 | const asm_file = &assembly.asm_files[input_file_index]; 551 | asm_file.* = .{ 552 | .assembly = assembly, 553 | .file_name = input_file, 554 | .globals = AsmFile.GlobalSet.init(assembly.allocator), 555 | .symbols = AsmFile.SymbolTable.init(assembly.allocator), 556 | .source = try cwd.readFileAlloc(assembly.allocator, input_file, math.maxInt(usize)), 557 | .tokenizer = undefined, 558 | .put_back_buffer = undefined, 559 | .put_back_count = 0, 560 | }; 561 | asm_file.tokenizer = Tokenizer.init(asm_file.source); 562 | while (true) { 563 | const token = asm_file.nextToken(); 564 | switch (token.id) { 565 | .line_break => continue, 566 | .eof => break, 567 | .period => { 568 | const dir_ident = try asm_file.expectToken(.identifier); 569 | const dir_name = asm_file.tokenSlice(dir_ident); 570 | if (mem.eql(u8, dir_name, "text")) { 571 | try asm_file.setCurrentSection("text", elf.PF_R | elf.PF_X); 572 | _ = try asm_file.expectToken(.line_break); 573 | } else if (mem.eql(u8, dir_name, "globl")) { 574 | while (true) { 575 | const ident = try asm_file.expectToken(.identifier); 576 | try asm_file.addGlobal(asm_file.tokenSlice(ident)); 577 | if (asm_file.eatToken(.comma)) |_| continue else break; 578 | } 579 | _ = try asm_file.expectToken(.line_break); 580 | } else if (mem.eql(u8, dir_name, "section")) { 581 | _ = try asm_file.expectToken(.period); 582 | const sect_name_token = try asm_file.expectToken(.identifier); 583 | const sect_name = asm_file.tokenSlice(sect_name_token); 584 | _ = try asm_file.expectToken(.comma); 585 | const flags_tok = try asm_file.expectToken(.string_literal); 586 | try asm_file.setCurrentSectionFlagsTok(sect_name, flags_tok); 587 | _ = try asm_file.expectToken(.line_break); 588 | } else if (mem.eql(u8, dir_name, "ascii")) { 589 | const current_symbol = try asm_file.getCurrentSymbol(dir_ident); 590 | 591 | const str_lit_tok = try asm_file.expectToken(.string_literal); 592 | const str_lit = asm_file.tokenSlice(str_lit_tok); 593 | var bad_index: usize = undefined; 594 | const bytes = parseStringLiteral( 595 | assembly.allocator, 596 | str_lit, 597 | &bad_index, 598 | ) catch |err| switch (err) { 599 | error.InvalidCharacter => { 600 | try assembly.errors.append(.{ 601 | .bad_string_literal = .{ 602 | .source_info = newSourceInfo(asm_file, str_lit_tok), 603 | .bad_index = bad_index, 604 | }, 605 | }); 606 | return error.ParseFailure; 607 | }, 608 | error.OutOfMemory => |e| return e, 609 | }; 610 | 611 | try current_symbol.ops.append(.{ .data = bytes }); 612 | current_symbol.size += bytes.len; 613 | _ = try asm_file.expectToken(.line_break); 614 | } else { 615 | try assembly.errors.append(.{ 616 | .unrecognized_directive = .{ .source_info = newSourceInfo(asm_file, dir_ident) }, 617 | }); 618 | return error.ParseFailure; 619 | } 620 | }, 621 | .identifier => { 622 | if (asm_file.eatToken(.colon)) |_| { 623 | const symbol_name = asm_file.tokenSlice(token); 624 | try asm_file.beginSymbol(token, symbol_name); 625 | } else { 626 | var arg_toks: [2]Token = undefined; 627 | var arg_count: usize = 0; 628 | var last_comma_tok: Token = undefined; 629 | while (arg_count < arg_toks.len) { 630 | const tok = asm_file.nextToken(); 631 | if (tok.id == .line_break) break; 632 | 633 | arg_toks[arg_count] = tok; 634 | arg_count += 1; 635 | 636 | if (asm_file.eatToken(.comma)) |comma_tok| { 637 | last_comma_tok = comma_tok; 638 | continue; 639 | } else { 640 | break; 641 | } 642 | } else { 643 | try assembly.errors.append(.{ 644 | .too_many_args = .{ .source_info = newSourceInfo(asm_file, last_comma_tok) }, 645 | }); 646 | return error.ParseFailure; 647 | } 648 | 649 | const wanted_instr_name = asm_file.tokenSlice(token); 650 | const inst = outer: for (data.instructions) |*inst| { 651 | if (!mem.eql(u8, inst.name, wanted_instr_name)) continue; 652 | if (inst.args.len != arg_count) continue; 653 | for (inst.args) |inst_arg, i| switch (inst_arg) { 654 | .register => |reg| { 655 | if (arg_toks[i].id != .identifier) continue :outer; 656 | const src_reg_name = asm_file.tokenSlice(arg_toks[i]); 657 | // TODO validate register name 658 | if (!mem.eql(u8, src_reg_name, @tagName(reg))) continue :outer; 659 | }, 660 | .immediate => switch (arg_toks[i].id) { 661 | .integer_literal, .char_literal, .identifier => {}, 662 | else => continue :outer, 663 | }, 664 | }; 665 | break :outer inst; 666 | } else { 667 | try assembly.errors.append(.{ 668 | .unrecognized_instruction = .{ .source_info = newSourceInfo(asm_file, token) }, 669 | }); 670 | return error.ParseFailure; 671 | }; 672 | const current_symbol = try asm_file.getCurrentSymbol(token); 673 | var args = std.ArrayList(Arg).init(assembly.allocator); 674 | for (arg_toks[0..arg_count]) |arg_token| { 675 | const arg_text = asm_file.tokenSlice(arg_token); 676 | var arg: Arg = undefined; 677 | switch (arg_token.id) { 678 | .integer_literal => { 679 | var text: []const u8 = undefined; 680 | var base: u8 = undefined; 681 | if (mem.startsWith(u8, arg_text, "0x")) { 682 | base = 16; 683 | text = arg_text[2..]; 684 | } else if (mem.startsWith(u8, arg_text, "0b")) { 685 | base = 2; 686 | text = arg_text[2..]; 687 | } else if (mem.startsWith(u8, arg_text, "0o")) { 688 | base = 8; 689 | text = arg_text[2..]; 690 | } else { 691 | base = 10; 692 | text = arg_text; 693 | } 694 | const imm = std.fmt.parseUnsigned(u64, text, base) catch |err| { 695 | try asm_file.assembly.errors.append(.{ 696 | .bad_integer_literal = .{ .source_info = newSourceInfo(asm_file, arg_token) }, 697 | }); 698 | return error.ParseFailure; 699 | }; 700 | arg = Arg{ .immediate = imm }; 701 | }, 702 | .identifier => { 703 | inline for (std.meta.fields(data.Register)) |field| { 704 | if (mem.eql(u8, arg_text, field.name)) { 705 | const reg = @field(data.Register, field.name); 706 | arg = Arg{ .register = reg }; 707 | break; 708 | } 709 | } else { 710 | arg = Arg{ .symbol_ref = arg_token }; 711 | } 712 | }, 713 | else => { 714 | try asm_file.assembly.errors.append(.{ 715 | .unexpected_token = .{ .source_info = newSourceInfo(asm_file, arg_token) }, 716 | }); 717 | return error.ParseFailure; 718 | }, 719 | } 720 | try args.append(arg); 721 | } 722 | 723 | try current_symbol.ops.append(.{ 724 | .instruction = .{ 725 | .props = inst, 726 | .args = args.toSliceConst(), 727 | }, 728 | }); 729 | current_symbol.size += inst.size; 730 | } 731 | }, 732 | else => { 733 | try assembly.errors.append(.{ 734 | .unexpected_token = .{ .source_info = newSourceInfo(asm_file, token) }, 735 | }); 736 | return error.ParseFailure; 737 | }, 738 | } 739 | } 740 | } 741 | 742 | const output_file = assembly.output_file orelse blk: { 743 | const basename = fs.path.basename(assembly.input_files[0]); 744 | const dot_index = mem.lastIndexOfScalar(u8, basename, '.') orelse basename.len; 745 | break :blk basename[0..dot_index]; 746 | }; 747 | const file = try std.fs.cwd().createFile(output_file, .{ 748 | .mode = 0o755, 749 | .truncate = false, 750 | }); 751 | defer file.close(); 752 | 753 | const ptr_width: PtrWidth = switch (assembly.target.getArchPtrBitWidth()) { 754 | 32 => ._32, 755 | 64 => ._64, 756 | else => return error.UnsupportedArchitecture, 757 | }; 758 | const ehdr_size: u64 = switch (ptr_width) { 759 | ._32 => @sizeOf(elf.Elf32_Ehdr), 760 | ._64 => @sizeOf(elf.Elf64_Ehdr), 761 | }; 762 | const phdr_size: u64 = switch (ptr_width) { 763 | ._32 => @sizeOf(elf.Elf32_Phdr), 764 | ._64 => @sizeOf(elf.Elf64_Phdr), 765 | }; 766 | const section_names = [_][]const u8{ "rodata", "text" }; 767 | assembly.file_offset = ehdr_size + phdr_size * section_names.len; 768 | 769 | for (section_names) |section_name| { 770 | const section = (assembly.sections.get(section_name) orelse break).value; 771 | 772 | assembly.file_offset = mem.alignForward(assembly.file_offset, section.alignment); 773 | assembly.next_map_addr = mem.alignForward(assembly.next_map_addr, section.alignment); 774 | 775 | section.file_offset = assembly.file_offset; 776 | section.virt_addr = assembly.next_map_addr; 777 | 778 | try file.seekTo(assembly.file_offset); 779 | const prev_file_offset = assembly.file_offset; 780 | const prev_map_addr = assembly.next_map_addr; 781 | try writeSection(assembly, section, file, ptr_width); 782 | section.file_size = assembly.file_offset - prev_file_offset; 783 | section.mem_size = assembly.next_map_addr - prev_map_addr; 784 | } 785 | 786 | try file.seekTo(0); 787 | try writeElfHeader(assembly, file, ptr_width, §ion_names); 788 | } 789 | 790 | fn writeSection(assembly: *Assembly, section: *Section, file: fs.File, ptr_width: PtrWidth) !void { 791 | const endian = assembly.target.getArch().endian(); 792 | 793 | for (section.layout.toSliceConst()) |symbol| { 794 | symbol.addr = assembly.next_map_addr; 795 | 796 | for (symbol.ops.toSliceConst()) |pseudo_op| { 797 | switch (pseudo_op) { 798 | .instruction => |inst| { 799 | var buf: [8]u8 = undefined; 800 | var index: usize = 0; 801 | if (inst.props.prefix) |prefix| { 802 | buf[index] = prefix; 803 | index += 1; 804 | } 805 | buf[index] = inst.props.po; 806 | index += 1; 807 | 808 | for (inst.args) |arg| switch (arg) { 809 | .register => {}, 810 | .immediate => |x| { 811 | mem.writeInt(u32, @ptrCast(*[4]u8, &buf[index]), @intCast(u32, x), endian); 812 | index += 4; 813 | }, 814 | .symbol_ref => |other_sym_tok| { 815 | const other_sym_name = symbol.source_file.tokenSlice(other_sym_tok); 816 | const other_symbol = symbol.source_file.symbols.getValue(other_sym_name) orelse { 817 | try assembly.errors.append(.{ 818 | .unknown_symbol = .{ 819 | .source_info = newSourceInfo( 820 | symbol.source_file, 821 | other_sym_tok, 822 | ), 823 | }, 824 | }); 825 | return error.ParseFailure; 826 | }; 827 | mem.writeInt(u32, @ptrCast(*[4]u8, &buf[index]), @intCast(u32, other_symbol.addr), endian); 828 | index += 4; 829 | }, 830 | }; 831 | 832 | if (inst.props.suffix) |suffix| { 833 | buf[index] = suffix; 834 | index += 1; 835 | } 836 | 837 | try file.write(buf[0..index]); 838 | assembly.next_map_addr += index; 839 | assembly.file_offset += index; 840 | }, 841 | .data => |slice| { 842 | try file.write(slice); 843 | assembly.next_map_addr += slice.len; 844 | assembly.file_offset += slice.len; 845 | }, 846 | } 847 | } 848 | if (mem.eql(u8, symbol.name, "_start")) { 849 | if (assembly.entry_addr) |prev_addr| { 850 | @panic("TODO emit error for _start already defined"); 851 | } else { 852 | assembly.entry_addr = symbol.addr; 853 | } 854 | } 855 | } 856 | } 857 | 858 | const PtrWidth = enum { 859 | _32, 860 | _64, 861 | }; 862 | 863 | fn writeElfHeader( 864 | assembly: *Assembly, 865 | /// Expected to be already seeked to position 0 in the file. 866 | file: fs.File, 867 | ptr_width: PtrWidth, 868 | section_names: []const []const u8, 869 | ) !void { 870 | const endian = assembly.target.getArch().endian(); 871 | var hdr_buf: [math.max(@sizeOf(elf.Elf64_Ehdr), @sizeOf(elf.Elf64_Phdr))]u8 = undefined; 872 | var index: usize = 0; 873 | 874 | mem.copy(u8, hdr_buf[index..], "\x7fELF"); 875 | index += 4; 876 | 877 | hdr_buf[index] = switch (ptr_width) { 878 | ._32 => 1, 879 | ._64 => 2, 880 | }; 881 | index += 1; 882 | 883 | hdr_buf[index] = switch (endian) { 884 | .Little => 1, 885 | .Big => 2, 886 | }; 887 | index += 1; 888 | 889 | hdr_buf[index] = 1; // ELF version 890 | index += 1; 891 | 892 | // OS ABI, often set to 0 regardless of target platform 893 | // ABI Version, possibly used by glibc but not by static executables 894 | // padding 895 | mem.set(u8, hdr_buf[index..][0..9], 0); 896 | index += 9; 897 | 898 | assert(index == 16); 899 | 900 | // TODO: https://github.com/ziglang/zig/issues/863 makes this (and all following) @ptrCast unnecessary 901 | mem.writeInt(u16, @ptrCast(*[2]u8, &hdr_buf[index]), @enumToInt(elf.ET.EXEC), endian); 902 | index += 2; 903 | 904 | const machine = assembly.target.getArch().toElfMachine(); 905 | mem.writeInt(u16, @ptrCast(*[2]u8, &hdr_buf[index]), @enumToInt(machine), endian); 906 | index += 2; 907 | 908 | // ELF Version, again 909 | mem.writeInt(u32, @ptrCast(*[4]u8, &hdr_buf[index]), 1, endian); 910 | index += 4; 911 | 912 | switch (ptr_width) { 913 | ._32 => { 914 | // e_entry 915 | mem.writeInt(u32, @ptrCast(*[4]u8, &hdr_buf[index]), @intCast(u32, assembly.entry_addr.?), endian); 916 | index += 4; 917 | 918 | // e_phoff 919 | mem.writeInt(u32, @ptrCast(*[4]u8, &hdr_buf[index]), @sizeOf(elf.Elf32_Ehdr), endian); 920 | index += 4; 921 | 922 | // e_shoff 923 | mem.writeInt(u32, @ptrCast(*[4]u8, &hdr_buf[index]), 0, endian); 924 | index += 4; 925 | }, 926 | ._64 => { 927 | // e_entry 928 | mem.writeInt(u64, @ptrCast(*[8]u8, &hdr_buf[index]), assembly.entry_addr.?, endian); 929 | index += 8; 930 | 931 | // e_phoff 932 | mem.writeInt(u64, @ptrCast(*[8]u8, &hdr_buf[index]), @sizeOf(elf.Elf64_Ehdr), endian); 933 | index += 8; 934 | 935 | // e_shoff 936 | mem.writeInt(u64, @ptrCast(*[8]u8, &hdr_buf[index]), 0, endian); 937 | index += 8; 938 | }, 939 | } 940 | 941 | const e_flags = 0; 942 | mem.writeInt(u32, @ptrCast(*[4]u8, &hdr_buf[index]), e_flags, endian); 943 | index += 4; 944 | 945 | const e_ehsize: u16 = switch (ptr_width) { 946 | ._32 => @sizeOf(elf.Elf32_Ehdr), 947 | ._64 => @sizeOf(elf.Elf64_Ehdr), 948 | }; 949 | mem.writeInt(u16, @ptrCast(*[2]u8, &hdr_buf[index]), e_ehsize, endian); 950 | index += 2; 951 | 952 | const e_phentsize: u16 = switch (ptr_width) { 953 | ._32 => @sizeOf(elf.Elf32_Phdr), 954 | ._64 => @sizeOf(elf.Elf64_Phdr), 955 | }; 956 | mem.writeInt(u16, @ptrCast(*[2]u8, &hdr_buf[index]), e_phentsize, endian); 957 | index += 2; 958 | 959 | const e_phnum = @intCast(u16, section_names.len); 960 | mem.writeInt(u16, @ptrCast(*[2]u8, &hdr_buf[index]), e_phnum, endian); 961 | index += 2; 962 | 963 | const e_shentsize: u16 = switch (ptr_width) { 964 | ._32 => @sizeOf(elf.Elf32_Shdr), 965 | ._64 => @sizeOf(elf.Elf64_Shdr), 966 | }; 967 | mem.writeInt(u16, @ptrCast(*[2]u8, &hdr_buf[index]), e_shentsize, endian); 968 | index += 2; 969 | 970 | const e_shnum = 0; 971 | mem.writeInt(u16, @ptrCast(*[2]u8, &hdr_buf[index]), e_shnum, endian); 972 | index += 2; 973 | 974 | const e_shstrndx = 0; 975 | mem.writeInt(u16, @ptrCast(*[2]u8, &hdr_buf[index]), e_shstrndx, endian); 976 | index += 2; 977 | 978 | assert(index == e_ehsize); 979 | 980 | try file.write(hdr_buf[0..index]); 981 | 982 | // Program headers 983 | for (section_names) |section_name| { 984 | const section = (assembly.sections.get(section_name) orelse break).value; 985 | 986 | index = 0; 987 | 988 | const p_type = elf.PT_LOAD; 989 | mem.writeInt(u32, @ptrCast(*[4]u8, &hdr_buf[index]), p_type, endian); 990 | index += 4; 991 | 992 | switch (ptr_width) { 993 | ._32 => @panic("TODO"), 994 | ._64 => { 995 | mem.writeInt(u32, @ptrCast(*[4]u8, &hdr_buf[index]), section.flags, endian); 996 | index += 4; 997 | 998 | mem.writeInt(u64, @ptrCast(*[8]u8, &hdr_buf[index]), section.file_offset, endian); 999 | index += 8; 1000 | 1001 | mem.writeInt(u64, @ptrCast(*[8]u8, &hdr_buf[index]), section.virt_addr, endian); 1002 | index += 8; 1003 | 1004 | mem.writeInt(u64, @ptrCast(*[8]u8, &hdr_buf[index]), section.virt_addr, endian); 1005 | index += 8; 1006 | 1007 | mem.writeInt(u64, @ptrCast(*[8]u8, &hdr_buf[index]), section.file_size, endian); 1008 | index += 8; 1009 | 1010 | mem.writeInt(u64, @ptrCast(*[8]u8, &hdr_buf[index]), section.mem_size, endian); 1011 | index += 8; 1012 | 1013 | mem.writeInt(u64, @ptrCast(*[8]u8, &hdr_buf[index]), section.alignment, endian); 1014 | index += 8; 1015 | }, 1016 | else => unreachable, 1017 | } 1018 | 1019 | assert(index == e_phentsize); 1020 | try file.write(hdr_buf[0..index]); 1021 | } 1022 | } 1023 | 1024 | fn dumpStdErrUsageAndExit() noreturn { 1025 | dumpUsage(std.io.getStdErr()) catch {}; 1026 | process.exit(1); 1027 | } 1028 | 1029 | fn dumpUsage(file: fs.File) !void { 1030 | try file.write( 1031 | \\Usage: zasm [command] [options] 1032 | \\ 1033 | \\Commands: 1034 | \\ exe [files] create an executable file 1035 | \\ obj [files] create an object file 1036 | \\ dis [file] disassemble a binary into source 1037 | \\ targets list the supported targets to stdout 1038 | \\ tokenize [file] (debug) tokenize the input files 1039 | \\ 1040 | \\Options: 1041 | \\ -o [file] override output file name 1042 | \\ -help dump this help text to stdout 1043 | \\ -target [arch]-[os] specify the target for positional arguments 1044 | \\ -debug-errors (debug) show stack trace on error 1045 | \\ 1046 | ); 1047 | } 1048 | 1049 | test "" { 1050 | _ = Token; 1051 | _ = Tokenizer; 1052 | } 1053 | -------------------------------------------------------------------------------- /src/tokenize.zig: -------------------------------------------------------------------------------- 1 | const std = @import("std"); 2 | const mem = std.mem; 3 | 4 | pub const Token = struct { 5 | id: Id, 6 | start: usize, 7 | end: usize, 8 | 9 | pub const Id = enum { 10 | invalid, 11 | identifier, 12 | string_literal, 13 | integer_literal, 14 | float_literal, 15 | char_literal, 16 | colon, 17 | comma, 18 | line_comment, 19 | line_break, 20 | period, 21 | slash, 22 | eof, 23 | }; 24 | }; 25 | 26 | pub const Tokenizer = struct { 27 | buffer: []const u8, 28 | index: usize, 29 | pending_invalid_token: ?Token, 30 | 31 | pub fn init(buffer: []const u8) Tokenizer { 32 | // Skip the UTF-8 BOM if present 33 | return Tokenizer{ 34 | .buffer = buffer, 35 | .index = if (mem.startsWith(u8, buffer, "\xEF\xBB\xBF")) 3 else 0, 36 | .pending_invalid_token = null, 37 | }; 38 | } 39 | 40 | const State = enum { 41 | start, 42 | char_literal, 43 | char_literal_backslash, 44 | char_literal_end, 45 | char_literal_hex_escape, 46 | char_literal_unicode, 47 | char_literal_unicode_escape, 48 | char_literal_unicode_escape_saw_u, 49 | char_literal_unicode_invalid, 50 | float_exponent_number, 51 | float_exponent_number_hex, 52 | float_exponent_unsigned, 53 | float_exponent_unsigned_hex, 54 | float_fraction, 55 | float_fraction_hex, 56 | identifier, 57 | integer_literal, 58 | integer_literal_with_radix, 59 | integer_literal_with_radix_hex, 60 | line_comment, 61 | number_dot, 62 | number_dot_hex, 63 | slash, 64 | string_literal, 65 | string_literal_backslash, 66 | zero, 67 | line_break, 68 | }; 69 | 70 | pub fn next(self: *Tokenizer) Token { 71 | if (self.pending_invalid_token) |token| { 72 | self.pending_invalid_token = null; 73 | return token; 74 | } 75 | const start_index = self.index; 76 | var state: State = .start; 77 | var result = Token{ 78 | .id = .eof, 79 | .start = self.index, 80 | .end = undefined, 81 | }; 82 | var seen_escape_digits: usize = undefined; 83 | var remaining_code_units: usize = undefined; 84 | while (self.index < self.buffer.len) : (self.index += 1) { 85 | const c = self.buffer[self.index]; 86 | switch (state) { 87 | .start => switch (c) { 88 | ' ', '\t', '\r' => { 89 | result.start = self.index + 1; 90 | }, 91 | '\n' => { 92 | result.id = .line_break; 93 | state = .line_break; 94 | }, 95 | '"' => { 96 | state = .string_literal; 97 | result.id = .string_literal; 98 | }, 99 | '\'' => { 100 | state = .char_literal; 101 | }, 102 | 'a'...'z', 'A'...'Z', '_' => { 103 | state = .identifier; 104 | result.id = .identifier; 105 | }, 106 | ',' => { 107 | result.id = .comma; 108 | self.index += 1; 109 | break; 110 | }, 111 | ':' => { 112 | result.id = .colon; 113 | self.index += 1; 114 | break; 115 | }, 116 | '.' => { 117 | result.id = .period; 118 | self.index += 1; 119 | break; 120 | }, 121 | '/' => { 122 | state = .slash; 123 | }, 124 | '0' => { 125 | state = .zero; 126 | result.id = .integer_literal; 127 | }, 128 | '1'...'9' => { 129 | state = .integer_literal; 130 | result.id = .integer_literal; 131 | }, 132 | else => { 133 | result.id = .invalid; 134 | self.index += 1; 135 | break; 136 | }, 137 | }, 138 | 139 | .line_break => switch (c) { 140 | '\n', '\r' => {}, 141 | else => break, 142 | }, 143 | 144 | .identifier => switch (c) { 145 | 'a'...'z', 'A'...'Z', '_', '0'...'9' => {}, 146 | else => break, 147 | }, 148 | .string_literal => switch (c) { 149 | '\\' => { 150 | state = .string_literal_backslash; 151 | }, 152 | '"' => { 153 | self.index += 1; 154 | break; 155 | }, 156 | '\n', '\r' => break, // Look for this error later. 157 | else => self.checkLiteralCharacter(), 158 | }, 159 | 160 | .string_literal_backslash => switch (c) { 161 | '\n', '\r' => break, // Look for this error later. 162 | else => { 163 | state = .string_literal; 164 | }, 165 | }, 166 | 167 | .char_literal => switch (c) { 168 | '\\' => { 169 | state = .char_literal_backslash; 170 | }, 171 | '\'', 0x80...0xbf, 0xf8...0xff => { 172 | result.id = .invalid; 173 | break; 174 | }, 175 | 0xc0...0xdf => { // 110xxxxx 176 | remaining_code_units = 1; 177 | state = .char_literal_unicode; 178 | }, 179 | 0xe0...0xef => { // 1110xxxx 180 | remaining_code_units = 2; 181 | state = .char_literal_unicode; 182 | }, 183 | 0xf0...0xf7 => { // 11110xxx 184 | remaining_code_units = 3; 185 | state = .char_literal_unicode; 186 | }, 187 | else => { 188 | state = .char_literal_end; 189 | }, 190 | }, 191 | 192 | .char_literal_backslash => switch (c) { 193 | '\n' => { 194 | result.id = .invalid; 195 | break; 196 | }, 197 | 'x' => { 198 | state = .char_literal_hex_escape; 199 | seen_escape_digits = 0; 200 | }, 201 | 'u' => { 202 | state = .char_literal_unicode_escape_saw_u; 203 | }, 204 | else => { 205 | state = .char_literal_end; 206 | }, 207 | }, 208 | 209 | .char_literal_hex_escape => switch (c) { 210 | '0'...'9', 'a'...'f', 'A'...'F' => { 211 | seen_escape_digits += 1; 212 | if (seen_escape_digits == 2) { 213 | state = .char_literal_end; 214 | } 215 | }, 216 | else => { 217 | result.id = .invalid; 218 | break; 219 | }, 220 | }, 221 | 222 | .char_literal_unicode_escape_saw_u => switch (c) { 223 | '{' => { 224 | state = .char_literal_unicode_escape; 225 | seen_escape_digits = 0; 226 | }, 227 | else => { 228 | result.id = .invalid; 229 | state = .char_literal_unicode_invalid; 230 | }, 231 | }, 232 | 233 | .char_literal_unicode_escape => switch (c) { 234 | '0'...'9', 'a'...'f', 'A'...'F' => { 235 | seen_escape_digits += 1; 236 | }, 237 | '}' => { 238 | if (seen_escape_digits == 0) { 239 | result.id = .invalid; 240 | state = .char_literal_unicode_invalid; 241 | } else { 242 | state = .char_literal_end; 243 | } 244 | }, 245 | else => { 246 | result.id = .invalid; 247 | state = .char_literal_unicode_invalid; 248 | }, 249 | }, 250 | 251 | .char_literal_unicode_invalid => switch (c) { 252 | // Keep consuming characters until an obvious stopping point. 253 | // This consolidates e.g. `u{0ab1Q}` into a single invalid token 254 | // instead of creating the tokens `u{0ab1`, `Q`, `}` 255 | '0'...'9', 'a'...'z', 'A'...'Z', '}' => {}, 256 | else => break, 257 | }, 258 | 259 | .char_literal_end => switch (c) { 260 | '\'' => { 261 | result.id = .char_literal; 262 | self.index += 1; 263 | break; 264 | }, 265 | else => { 266 | result.id = .invalid; 267 | break; 268 | }, 269 | }, 270 | 271 | .char_literal_unicode => switch (c) { 272 | 0x80...0xbf => { 273 | remaining_code_units -= 1; 274 | if (remaining_code_units == 0) { 275 | state = .char_literal_end; 276 | } 277 | }, 278 | else => { 279 | result.id = .invalid; 280 | break; 281 | }, 282 | }, 283 | 284 | .slash => switch (c) { 285 | '/' => { 286 | state = .line_comment; 287 | result.id = .line_comment; 288 | }, 289 | else => { 290 | result.id = .slash; 291 | break; 292 | }, 293 | }, 294 | .line_comment => switch (c) { 295 | '\n' => break, 296 | else => self.checkLiteralCharacter(), 297 | }, 298 | .zero => switch (c) { 299 | 'b', 'o' => { 300 | state = .integer_literal_with_radix; 301 | }, 302 | 'x' => { 303 | state = .integer_literal_with_radix_hex; 304 | }, 305 | else => { 306 | // reinterpret as a normal number 307 | self.index -= 1; 308 | state = .integer_literal; 309 | }, 310 | }, 311 | .integer_literal => switch (c) { 312 | '.' => { 313 | state = .number_dot; 314 | }, 315 | 'p', 'P', 'e', 'E' => { 316 | state = .float_exponent_unsigned; 317 | }, 318 | '0'...'9' => {}, 319 | else => break, 320 | }, 321 | .integer_literal_with_radix => switch (c) { 322 | '.' => { 323 | state = .number_dot; 324 | }, 325 | '0'...'9' => {}, 326 | else => break, 327 | }, 328 | .integer_literal_with_radix_hex => switch (c) { 329 | '.' => { 330 | state = .number_dot_hex; 331 | }, 332 | 'p', 'P' => { 333 | state = .float_exponent_unsigned_hex; 334 | }, 335 | '0'...'9', 'a'...'f', 'A'...'F' => {}, 336 | else => break, 337 | }, 338 | .number_dot => switch (c) { 339 | '.' => { 340 | self.index -= 1; 341 | state = .start; 342 | break; 343 | }, 344 | else => { 345 | self.index -= 1; 346 | result.id = .float_literal; 347 | state = .float_fraction; 348 | }, 349 | }, 350 | .number_dot_hex => switch (c) { 351 | '.' => { 352 | self.index -= 1; 353 | state = .start; 354 | break; 355 | }, 356 | else => { 357 | self.index -= 1; 358 | result.id = .float_literal; 359 | state = .float_fraction_hex; 360 | }, 361 | }, 362 | .float_fraction => switch (c) { 363 | 'e', 'E' => { 364 | state = .float_exponent_unsigned; 365 | }, 366 | '0'...'9' => {}, 367 | else => break, 368 | }, 369 | .float_fraction_hex => switch (c) { 370 | 'p', 'P' => { 371 | state = .float_exponent_unsigned_hex; 372 | }, 373 | '0'...'9', 'a'...'f', 'A'...'F' => {}, 374 | else => break, 375 | }, 376 | .float_exponent_unsigned => switch (c) { 377 | '+', '-' => { 378 | state = .float_exponent_number; 379 | }, 380 | else => { 381 | // reinterpret as a normal exponent number 382 | self.index -= 1; 383 | state = .float_exponent_number; 384 | }, 385 | }, 386 | .float_exponent_unsigned_hex => switch (c) { 387 | '+', '-' => { 388 | state = .float_exponent_number_hex; 389 | }, 390 | else => { 391 | // reinterpret as a normal exponent number 392 | self.index -= 1; 393 | state = .float_exponent_number_hex; 394 | }, 395 | }, 396 | .float_exponent_number => switch (c) { 397 | '0'...'9' => {}, 398 | else => break, 399 | }, 400 | .float_exponent_number_hex => switch (c) { 401 | '0'...'9', 'a'...'f', 'A'...'F' => {}, 402 | else => break, 403 | }, 404 | } 405 | } else if (self.index == self.buffer.len) { 406 | switch (state) { 407 | .start, 408 | .integer_literal, 409 | .integer_literal_with_radix, 410 | .integer_literal_with_radix_hex, 411 | .float_fraction, 412 | .float_fraction_hex, 413 | .float_exponent_number, 414 | .float_exponent_number_hex, 415 | .line_break, 416 | .string_literal, // find this error later 417 | => {}, 418 | 419 | .identifier => {}, 420 | 421 | .line_comment => { 422 | result.id = Token.Id.line_comment; 423 | }, 424 | 425 | .number_dot, 426 | .number_dot_hex, 427 | .float_exponent_unsigned, 428 | .float_exponent_unsigned_hex, 429 | .char_literal, 430 | .char_literal_backslash, 431 | .char_literal_hex_escape, 432 | .char_literal_unicode_escape_saw_u, 433 | .char_literal_unicode_escape, 434 | .char_literal_unicode_invalid, 435 | .char_literal_end, 436 | .char_literal_unicode, 437 | .string_literal_backslash, 438 | => { 439 | result.id = .invalid; 440 | }, 441 | 442 | .slash => { 443 | result.id = .slash; 444 | }, 445 | .zero => { 446 | result.id = .integer_literal; 447 | }, 448 | } 449 | } 450 | 451 | if (result.id == .eof) { 452 | if (self.pending_invalid_token) |token| { 453 | self.pending_invalid_token = null; 454 | return token; 455 | } 456 | } 457 | 458 | result.end = self.index; 459 | return result; 460 | } 461 | 462 | fn checkLiteralCharacter(self: *Tokenizer) void { 463 | if (self.pending_invalid_token != null) return; 464 | const invalid_length = self.getInvalidCharacterLength(); 465 | if (invalid_length == 0) return; 466 | self.pending_invalid_token = Token{ 467 | .id = .invalid, 468 | .start = self.index, 469 | .end = self.index + invalid_length, 470 | }; 471 | } 472 | 473 | fn getInvalidCharacterLength(self: *Tokenizer) u3 { 474 | const c0 = self.buffer[self.index]; 475 | if (c0 < 0x80) { 476 | if (c0 < 0x20 or c0 == 0x7f) { 477 | // ascii control codes are never allowed 478 | // (note that \n was checked before we got here) 479 | return 1; 480 | } 481 | // looks fine to me. 482 | return 0; 483 | } else { 484 | // check utf8-encoded character. 485 | const length = std.unicode.utf8ByteSequenceLength(c0) catch return 1; 486 | if (self.index + length > self.buffer.len) { 487 | return @intCast(u3, self.buffer.len - self.index); 488 | } 489 | const bytes = self.buffer[self.index .. self.index + length]; 490 | switch (length) { 491 | 2 => { 492 | const value = std.unicode.utf8Decode2(bytes) catch return length; 493 | if (value == 0x85) return length; // U+0085 (NEL) 494 | }, 495 | 3 => { 496 | const value = std.unicode.utf8Decode3(bytes) catch return length; 497 | if (value == 0x2028) return length; // U+2028 (LS) 498 | if (value == 0x2029) return length; // U+2029 (PS) 499 | }, 500 | 4 => { 501 | _ = std.unicode.utf8Decode4(bytes) catch return length; 502 | }, 503 | else => unreachable, 504 | } 505 | self.index += length - 1; 506 | return 0; 507 | } 508 | } 509 | }; 510 | 511 | test "tokenizer - char literal with hex escape" { 512 | testTokenize( 513 | \\'\x1b' 514 | , [_]Token.Id{.char_literal}); 515 | testTokenize( 516 | \\'\x1' 517 | , [_]Token.Id{ .invalid, .invalid }); 518 | } 519 | 520 | test "tokenizer - char literal with unicode escapes" { 521 | // Valid unicode escapes 522 | testTokenize( 523 | \\'\u{3}' 524 | , [_]Token.Id{.char_literal}); 525 | testTokenize( 526 | \\'\u{01}' 527 | , [_]Token.Id{.char_literal}); 528 | testTokenize( 529 | \\'\u{2a}' 530 | , [_]Token.Id{.char_literal}); 531 | testTokenize( 532 | \\'\u{3f9}' 533 | , [_]Token.Id{.char_literal}); 534 | testTokenize( 535 | \\'\u{6E09aBc1523}' 536 | , [_]Token.Id{.char_literal}); 537 | testTokenize( 538 | \\"\u{440}" 539 | , [_]Token.Id{.string_literal}); 540 | 541 | // Invalid unicode escapes 542 | testTokenize( 543 | \\'\u' 544 | , [_]Token.Id{.invalid}); 545 | testTokenize( 546 | \\'\u{{' 547 | , [_]Token.Id{ .invalid, .invalid }); 548 | testTokenize( 549 | \\'\u{}' 550 | , [_]Token.Id{ .invalid, .invalid }); 551 | testTokenize( 552 | \\'\u{s}' 553 | , [_]Token.Id{ .invalid, .invalid }); 554 | testTokenize( 555 | \\'\u{2z}' 556 | , [_]Token.Id{ .invalid, .invalid }); 557 | testTokenize( 558 | \\'\u{4a' 559 | , [_]Token.Id{.invalid}); 560 | 561 | // Test old-style unicode literals 562 | testTokenize( 563 | \\'\u0333' 564 | , [_]Token.Id{ .invalid, .invalid }); 565 | testTokenize( 566 | \\'\U0333' 567 | , [_]Token.Id{ .invalid, .integer_literal, .invalid }); 568 | } 569 | 570 | test "tokenizer - char literal with unicode code point" { 571 | testTokenize( 572 | \\'💩' 573 | , [_]Token.Id{.char_literal}); 574 | } 575 | 576 | test "tokenizer - line comment followed by identifier" { 577 | testTokenize( 578 | \\ Unexpected, 579 | \\ // another 580 | \\ Another, 581 | , [_]Token.Id{ 582 | .identifier, 583 | .comma, 584 | .line_comment, 585 | .identifier, 586 | .comma, 587 | }); 588 | } 589 | 590 | test "tokenizer - UTF-8 BOM is recognized and skipped" { 591 | testTokenize("\xEF\xBB\xBFa.\n", [_]Token.Id{ 592 | .identifier, 593 | .period, 594 | }); 595 | } 596 | 597 | fn testTokenize(source: []const u8, expected_tokens: []const Token.Id) void { 598 | var tokenizer = Tokenizer.init(source); 599 | for (expected_tokens) |expected_token_id| { 600 | const token = tokenizer.next(); 601 | if (token.id != expected_token_id) { 602 | std.debug.panic("expected {}, found {}\n", @tagName(expected_token_id), @tagName(token.id)); 603 | } 604 | } 605 | const last_token = tokenizer.next(); 606 | std.testing.expect(last_token.id == .eof); 607 | } 608 | -------------------------------------------------------------------------------- /test/aarch64v8-linux/hello.s: -------------------------------------------------------------------------------- 1 | .text 2 | .globl _start 3 | _start: 4 | adrp x1, msg 5 | mov w0, 1 6 | add x1, x1, 288 7 | mov w8, 64 8 | mov w2, 14 9 | svc 0 10 | mov x0, xzr 11 | mov w8, 93 12 | svc 0 13 | 14 | .section .rodata,"a" 15 | msg: 16 | .ascii "Hello, world!\n" 17 | .size msg, 14 18 | -------------------------------------------------------------------------------- /test/x86_64-linux/hello.s: -------------------------------------------------------------------------------- 1 | .text 2 | .globl _start 3 | _start: 4 | mov eax, 0x1 5 | mov edi, 0x1 6 | mov esi, msg 7 | mov edx, 14 8 | syscall 9 | mov eax, 60 10 | xor edi, edi 11 | syscall 12 | .section .rodata,"a" 13 | msg: 14 | .ascii "Hello, world!\n" 15 | --------------------------------------------------------------------------------