├── README.md ├── hardcoded.c ├── jitproto.c ├── mandeljit.c ├── micro-asm.h ├── simple.c └── simple.s /README.md: -------------------------------------------------------------------------------- 1 | # How to write a JIT compiler 2 | First up, you probably don't want to. JIT, or more accurately "dynamic code 3 | generation," is typically not the most effective way to optimize a project, and 4 | common techniques end up trading away a lot of portability and require fairly 5 | detailed knowledge about processor-level optimization. 6 | 7 | That said, though, writing a JIT compiler is a lot of fun and a great way to 8 | learn stuff. The first thing to do is to write an interpreter. 9 | 10 | **NOTE:** If you don't have a solid grasp of UNIX system-level programming, you 11 | might want to read about [how to write a 12 | shell](https://github.com/spencertipping/shell-tutorial), which covers a lot of 13 | the fundamentals. 14 | 15 | ## MandelASM 16 | GPUs are fine for machine learning, but serious fractal enthusiasts design 17 | their own processors to generate Mandelbrot sets. And the first step in 18 | processor design, of course, is to write an emulator for it. Our emulator will 19 | interpret the machine code we want to run and emit an image to stdout. 20 | 21 | To keep it simple, our processor has four complex-valued registers called `a`, 22 | `b`, `c`, and `d`, and it supports three in-place operations: 23 | 24 | - `=ab`: assign register `a` to register `b` 25 | - `+ab`: add register `a` to register `b` 26 | - `*ab`: multiply register `b` by register `a` 27 | 28 | For each pixel, the interpreter will zero all of the registers and then set `a` 29 | to the current pixel's coordinates. It then iterates the machine code for up to 30 | 256 iterations waiting for register `b` to "overflow" (i.e. for its complex 31 | absolute value to exceed 2). That means that the code for a standard Mandelbrot 32 | set is `*bb+ab`. 
33 | 34 | ### Simple interpreter 35 | The first thing to do is write up a bare-bones interpreter in C. It would be 36 | simpler to use `complex.h` here, but I'm going to write it in terms of 37 | individual numbers because the JIT compiler will end up generating the longhand 38 | logic. In production code we'd include bounds-checks and stuff, but I'm 39 | omitting those here for simplicity. 40 | 41 | ```c 42 | // simple.c 43 | #include <stdio.h> 44 | #include <stdlib.h> 45 | 46 | #define sqr(x) ((x) * (x)) 47 | 48 | typedef struct { double r; double i; } complex; 49 | 50 | void interpret(complex *registers, char const *code) { 51 | complex *src, *dst; 52 | double r, i; 53 | for (; *code; code += 3) { 54 | dst = &registers[code[2] - 'a']; 55 | src = &registers[code[1] - 'a']; 56 | switch (*code) { 57 | case '=': 58 | dst->r = src->r; 59 | dst->i = src->i; 60 | break; 61 | case '+': 62 | dst->r += src->r; 63 | dst->i += src->i; 64 | break; 65 | case '*': 66 | r = dst->r * src->r - dst->i * src->i; 67 | i = dst->r * src->i + dst->i * src->r; 68 | dst->r = r; 69 | dst->i = i; 70 | break; 71 | default: 72 | fprintf(stderr, "undefined instruction %s (ASCII %x)\n", code, *code); 73 | exit(1); 74 | } 75 | } 76 | } 77 | 78 | int main(int argc, char **argv) { 79 | complex registers[4]; 80 | int i, x, y; 81 | char line[1600]; 82 | printf("P5\n%d %d\n%d\n", 1600, 900, 255); 83 | for (y = 0; y < 900; ++y) { 84 | for (x = 0; x < 1600; ++x) { 85 | registers[0].r = 2 * 1.6 * (x / 1600.0 - 0.5); 86 | registers[0].i = 2 * 0.9 * (y / 900.0 - 0.5); 87 | for (i = 1; i < 4; ++i) registers[i].r = registers[i].i = 0; 88 | for (i = 0; i < 256 && sqr(registers[1].r) + sqr(registers[1].i) < 4; ++i) 89 | interpret(registers, argv[1]); 90 | line[x] = i; 91 | } 92 | fwrite(line, 1, sizeof(line), stdout); 93 | } 94 | return 0; 95 | } 96 | ``` 97 | 98 | Now we can see the results by using `display` from ImageMagick 99 | (`apt-get install imagemagick`), or by saving to a file: 100 | 101 | ```sh 102 | $ gcc simple.c -o 
simple 103 | $ ./simple *bb+ab | display - # imagemagick version 104 | $ ./simple *bb+ab > output.pgm # save a grayscale PGM image 105 | $ time ./simple *bb+ab > /dev/null # quick benchmark 106 | real 0m2.369s 107 | user 0m2.364s 108 | sys 0m0.000s 109 | $ 110 | ``` 111 | 112 | ![image](http://spencertipping.com/mandelbrot-output.png) 113 | 114 | ### Performance analysis 115 | **In the real world, JIT is absolutely the wrong move for this problem.** 116 | 117 | Array languages like APL, Matlab, and to a large extent Perl, Python, etc., 118 | manage to achieve reasonable performance by having interpreter operations that 119 | apply over a large number of data elements at a time. We've got exactly that 120 | situation here: in the real world it's a lot more practical to vectorize the 121 | operations to apply simultaneously to a screen-worth of data at a time -- then 122 | we'd have nice options like offloading stuff to a GPU, etc. 123 | 124 | However, since the point here is to compile stuff, on we go. 125 | 126 | JIT can basically eliminate the interpreter overhead, which we can easily model 127 | here by replacing `interpret()` with a hard-coded Mandelbrot calculation. This 128 | will provide an upper bound on realistic JIT performance, since we're unlikely 129 | to optimize as well as `gcc` does. 
130 | 131 | ```c 132 | // hardcoded.c 133 | #include <stdio.h> 134 | #include <stdlib.h> 135 | 136 | #define sqr(x) ((x) * (x)) 137 | 138 | typedef struct { double r; double i; } complex; 139 | 140 | void interpret(complex *registers, char const *code) { 141 | complex *a = &registers[0]; 142 | complex *b = &registers[1]; 143 | double r, i; 144 | r = b->r * b->r - b->i * b->i; 145 | i = b->r * b->i + b->i * b->r; 146 | b->r = r; 147 | b->i = i; 148 | b->r += a->r; 149 | b->i += a->i; 150 | } 151 | 152 | int main(int argc, char **argv) { 153 | complex registers[4]; 154 | int i, x, y; 155 | char line[1600]; 156 | printf("P5\n%d %d\n%d\n", 1600, 900, 255); 157 | for (y = 0; y < 900; ++y) { 158 | for (x = 0; x < 1600; ++x) { 159 | registers[0].r = 2 * 1.6 * (x / 1600.0 - 0.5); 160 | registers[0].i = 2 * 0.9 * (y / 900.0 - 0.5); 161 | for (i = 1; i < 4; ++i) registers[i].r = registers[i].i = 0; 162 | for (i = 0; i < 256 && sqr(registers[1].r) + sqr(registers[1].i) < 4; ++i) 163 | interpret(registers, argv[1]); 164 | line[x] = i; 165 | } 166 | fwrite(line, 1, sizeof(line), stdout); 167 | } 168 | return 0; 169 | } 170 | ``` 171 | 172 | This version runs about twice as fast as the simple interpreter: 173 | 174 | ```sh 175 | $ gcc hardcoded.c -o hardcoded 176 | $ time ./hardcoded *bb+ab > /dev/null 177 | real 0m1.329s 178 | user 0m1.328s 179 | sys 0m0.000s 180 | $ 181 | ``` 182 | 183 | ### JIT design and the x86-64 calling convention 184 | The basic strategy is to replace `interpret(registers, code)` with a function 185 | `compile(code)` that returns a pointer to a function whose signature is this: 186 | `void compiled(complex *registers)`. The memory for the function needs to be allocated 187 | using `mmap` so we can set permission for the processor to execute it. 
188 | 189 | The easiest way to start with something like this is probably to emit the 190 | assembly for `simple.c` to see how it works: 191 | 192 | ```sh 193 | $ gcc -S simple.c 194 | ``` 195 | 196 | Edited/annotated highlights from the assembly `simple.s`, which is much more 197 | complicated than what we'll end up generating: 198 | 199 | ```s 200 | interpret: 201 | // The stack contains local variables referenced to the "base pointer" 202 | // stored in hardware register %rbp. Here's the layout: 203 | // 204 | // double i = -8(%rbp) 205 | // double r = -16(%rbp) 206 | // src = -24(%rbp) 207 | // dst = -32(%rbp) 208 | // registers = -40(%rbp) <- comes in as an argument in %rdi 209 | // code = -48(%rbp) <- comes in as an argument in %rsi 210 | 211 | pushq %rbp 212 | movq %rsp, %rbp // standard x86-64 function header 213 | subq $48, %rsp // allocate space for six local vars 214 | movq %rdi, -40(%rbp) // registers arg -> local var 215 | movq %rsi, -48(%rbp) // code arg -> local var 216 | jmp for_loop_condition // commence loopage 217 | ``` 218 | 219 | Before getting to the rest, I wanted to call out the `%rsi` and `%rdi` stuff 220 | and explain a bit about how calls work on x86-64. `%rsi` and `%rdi` seem 221 | arbitrary, which they are to some extent -- C obeys a platform-specific calling 222 | convention that specifies how arguments get passed in. On x86-64 (System V ABI, e.g. Linux), up to six 223 | integer or pointer arguments come in as registers; after that they get pushed onto the stack. If 224 | you're returning a value, it goes into `%rax`. 225 | 226 | The return address is automatically pushed onto the stack by `call` 227 | instructions like `e8 <32-bit relative>`. So internally, `call` is the same as 228 | `push ADDRESS; jmp <call-target>; ADDRESS: ...`. `ret` is the same as `pop %rip`, 229 | except that you can't pop into `%rip`. This means that at function entry the return address is 230 | always the topmost value on the stack. 
231 | 232 | Part of the calling convention also requires callees to save a couple of 233 | registers and use `%rbp` to be a copy of `%rsp` at function-call-time, but our 234 | JIT can mostly ignore this stuff because it doesn't call back into C. 235 | 236 | ```s 237 | for_loop_body: 238 | // (a bunch of stuff to set up *src and *dst) 239 | 240 | cmpl $43, %eax // case '+' 241 | je add_branch 242 | cmpl $61, %eax // case '=' 243 | je assign_branch 244 | cmpl $42, %eax // case '*' 245 | je mult_branch 246 | jmp switch_default // default 247 | 248 | assign_branch: 249 | // the "bunch of stuff" above calculated *src and *dst, which are 250 | // stored in -24(%rbp) and -32(%rbp). 251 | movq -24(%rbp), %rax // %rax = src 252 | movsd (%rax), %xmm0 // %xmm0 = src.r 253 | movq -32(%rbp), %rax // %rax = dst 254 | movsd %xmm0, (%rax) // dst.r = %xmm0 255 | 256 | movq -24(%rbp), %rax // %rax = src 257 | movsd 8(%rax), %xmm0 // %xmm0 = src.i 258 | movq -32(%rbp), %rax // %rax = dst 259 | movsd %xmm0, 8(%rax) // dst.i = %xmm0 260 | 261 | jmp for_loop_step 262 | 263 | add_branch: 264 | movq -32(%rbp), %rax // %rax = dst 265 | movsd (%rax), %xmm1 // %xmm1 = dst.r 266 | movq -24(%rbp), %rax // %rax = src 267 | movsd (%rax), %xmm0 // %xmm0 = src.r 268 | addsd %xmm1, %xmm0 // %xmm0 += %xmm1 269 | movq -32(%rbp), %rax // %rax = dst 270 | movsd %xmm0, (%rax) // dst.r = %xmm0 271 | 272 | movq -32(%rbp), %rax // same thing for src.i and dst.i 273 | movsd 8(%rax), %xmm1 274 | movq -24(%rbp), %rax 275 | movsd 8(%rax), %xmm0 276 | addsd %xmm1, %xmm0 277 | movq -32(%rbp), %rax 278 | movsd %xmm0, 8(%rax) 279 | 280 | jmp for_loop_step 281 | 282 | mult_branch: 283 | movq -32(%rbp), %rax 284 | movsd (%rax), %xmm1 285 | movq -24(%rbp), %rax 286 | movsd (%rax), %xmm0 287 | mulsd %xmm1, %xmm0 288 | movq -32(%rbp), %rax 289 | movsd 8(%rax), %xmm2 290 | movq -24(%rbp), %rax 291 | movsd 8(%rax), %xmm1 292 | mulsd %xmm2, %xmm1 293 | subsd %xmm1, %xmm0 294 | movsd %xmm0, -16(%rbp) // double r = 
src.r*dst.r - src.i*dst.i 295 | 296 | movq -32(%rbp), %rax 297 | movsd (%rax), %xmm1 298 | movq -24(%rbp), %rax 299 | movsd 8(%rax), %xmm0 300 | mulsd %xmm0, %xmm1 301 | movq -32(%rbp), %rax 302 | movsd 8(%rax), %xmm2 303 | movq -24(%rbp), %rax 304 | movsd (%rax), %xmm0 305 | mulsd %xmm2, %xmm0 306 | addsd %xmm1, %xmm0 307 | movsd %xmm0, -8(%rbp) // double i = src.r*dst.i + src.i*dst.r 308 | 309 | movq -32(%rbp), %rax 310 | movsd -16(%rbp), %xmm0 311 | movsd %xmm0, (%rax) // dst.r = r 312 | movq -32(%rbp), %rax 313 | movsd -8(%rbp), %xmm0 314 | movsd %xmm0, 8(%rax) // dst.i = i 315 | jmp for_loop_step 316 | 317 | for_loop_step: 318 | addq $3, -48(%rbp) 319 | 320 | for_loop_condition: 321 | movq -48(%rbp), %rax // %rax = code (the pointer) 322 | movzbl (%rax), %eax // %eax = *code (move one byte) 323 | testb %al, %al // is %eax 0? 324 | jne for_loop_body // if no, then continue 325 | 326 | leave // otherwise rewind stack 327 | ret // pop and jmp 328 | ``` 329 | 330 | #### Compilation strategy 331 | Most of the above is register-shuffling fluff that we can get rid of. We're 332 | compiling the code up front, which means all of our register addresses are 333 | known quantities and we won't need any unknown indirection at runtime. So all 334 | of the shuffling into and out of `%rax` can be replaced by a much simpler move 335 | directly to or from `N(%rdi)` -- since `%rdi` is the argument that points to 336 | the first register's real component. 337 | 338 | If you haven't already, at this point I'd recommend downloading the [Intel 339 | software developer's 340 | manual](https://software.intel.com/en-us/articles/intel-sdm), of which volume 2 341 | describes the semantics and machine code representation of every instruction. 342 | 343 | **NOTE:** GCC uses AT&T assembly syntax, whereas the Intel manuals use Intel 344 | assembly syntax. 
An important difference is that AT&T reverses the arguments: 345 | `mov %rax, %rbx` (AT&T syntax) assigns to `%rbx`, whereas `mov rax, rbx` (Intel 346 | syntax) assigns to `rax`. All of my code examples use AT&T, and none of this 347 | will matter once we're working with machine code. 348 | 349 | ##### Example: the Mandelbrot function `*bb+ab` 350 | ```s 351 | // Step 1: multiply register B by itself 352 | movsd 16(%rdi), %xmm0 // %xmm0 = b.r 353 | movsd 24(%rdi), %xmm1 // %xmm1 = b.i 354 | movsd 16(%rdi), %xmm2 // %xmm2 = b.r 355 | movsd 24(%rdi), %xmm3 // %xmm3 = b.i 356 | movsd %xmm0, %xmm4 // %xmm4 = b.r 357 | mulsd %xmm2, %xmm4 // %xmm4 = b.r*b.r 358 | movsd %xmm1, %xmm5 // %xmm5 = b.i 359 | mulsd %xmm3, %xmm5 // %xmm5 = b.i*b.i 360 | subsd %xmm5, %xmm4 // %xmm4 = b.r*b.r - b.i*b.i 361 | movsd %xmm4, 16(%rdi) // b.r = %xmm4 362 | 363 | mulsd %xmm0, %xmm3 // %xmm3 = b.r*b.i 364 | mulsd %xmm1, %xmm2 // %xmm2 = b.i*b.r 365 | addsd %xmm3, %xmm2 // %xmm2 = b.r*b.i + b.i*b.r 366 | movsd %xmm2, 24(%rdi) // b.i = %xmm2 367 | 368 | // Step 2: add register A to register B 369 | movpd (%rdi), %xmm0 // %xmm0 = (a.r, a.i) 370 | addpd 16(%rdi), %xmm0 // %xmm0 += (b.r, b.i) 371 | movpd %xmm0, 16(%rdi) // (b.r, b.i) = %xmm0 372 | ``` 373 | 374 | The multiplication code isn't optimized for the squaring-a-register use case; 375 | instead, I left it fully general so we can use it as a template when we start 376 | generating machine code. 377 | 378 | ### JIT mechanics 379 | Before we compile a real language, let's just get a basic code generator 380 | working. 381 | 382 | ```c 383 | // jitproto.c 384 | #include <stdio.h> 385 | #include <stdlib.h> 386 | #include <sys/mman.h> 387 | 388 | typedef long(*fn)(long); 389 | 390 | fn compile_identity(void) { 391 | // Allocate some memory and set its permissions correctly. In particular, we 392 | // need PROT_EXEC (which isn't normally enabled for data memory, e.g. from 393 | // malloc()), which tells the processor it's ok to execute it as machine 394 | // code. 
395 | char *memory = mmap(NULL, // address 396 | 4096, // size 397 | PROT_READ | PROT_WRITE | PROT_EXEC, 398 | MAP_PRIVATE | MAP_ANONYMOUS, 399 | -1, // fd (not used here) 400 | 0); // offset (not used here) 401 | if (memory == MAP_FAILED) { 402 | perror("failed to allocate memory"); 403 | exit(1); 404 | } 405 | 406 | int i = 0; 407 | 408 | // mov %rdi, %rax 409 | memory[i++] = 0x48; // REX.W prefix 410 | memory[i++] = 0x8b; // MOV opcode, register/register 411 | memory[i++] = 0xc7; // MOD/RM byte for %rdi -> %rax 412 | 413 | // ret 414 | memory[i++] = 0xc3; // RET opcode 415 | 416 | return (fn) memory; 417 | } 418 | 419 | int main() { 420 | fn f = compile_identity(); 421 | int i; 422 | for (i = 0; i < 10; ++i) 423 | printf("f(%d) = %ld\n", i, (*f)(i)); 424 | munmap(f, 4096); 425 | return 0; 426 | } 427 | ``` 428 | 429 | This does what we expect: we've just produced an identity function. 430 | 431 | ```sh 432 | $ gcc jitproto.c -o jitproto 433 | $ ./jitproto 434 | f(0) = 0 435 | f(1) = 1 436 | f(2) = 2 437 | f(3) = 3 438 | f(4) = 4 439 | f(5) = 5 440 | f(6) = 6 441 | f(7) = 7 442 | f(8) = 8 443 | f(9) = 9 444 | ``` 445 | 446 | **TODO:** explanation about userspace page mapping/permissions, and how ELF 447 | instructions tie into this (maybe also explain stuff like the FD table while 448 | we're at it) 449 | 450 | #### Generating MandelASM machine code 451 | This is where we start to get some serious mileage out of the Intel manuals. 
We 452 | need encodings for the following instructions: 453 | 454 | - `f2 0f 11`: `movsd reg -> memory` 455 | - `f2 0f 10`: `movsd memory -> reg` 456 | - `f2 0f 59`: `mulsd reg -> reg` 457 | - `f2 0f 58`: `addsd reg -> reg` 458 | - `f2 0f 5c`: `subsd reg -> reg` 459 | - `66 0f 11`: `movpd reg -> memory` (technically `movupd` for unaligned move) 460 | - `66 0f 10`: `movpd memory -> reg` 461 | - `66 0f 58`: `addpd memory -> reg` 462 | 463 | ##### The gnarly bits: how operands are specified 464 | Chapter 2 of the Intel manual volume 2 contains a roundabout, confusing 465 | description of operand encoding, so I'll try to sum up the basics here. 466 | (**TODO**) 467 | 468 | For the operators above, we've got two ModR/M configurations: 469 | 470 | - `movsd reg <-> X(%rdi)`: mod = 01, r/m = 111, disp8 = X 471 | - `addsd reg -> reg`: mod = 11 472 | 473 | At the byte level, they're written like this: 474 | 475 | ``` 476 | movsd %xmm0, 16(%rdi) # f2 0f 11 47 10 477 | # modr/m = b01 000 111 = 47 478 | # disp = 16 = 10 479 | 480 | addsd %xmm3, %xmm4 # f2 0f 58 e3 481 | # modr/m = b11 100 011 = e3 482 | ``` 483 | 484 | ##### A simple micro-assembler 485 | ```h 486 | // micro-asm.h 487 | #include 488 | typedef struct { 489 | char *dest; 490 | } microasm; 491 | 492 | // this makes it more obvious what we're doing later on 493 | #define xmm(n) (n) 494 | 495 | void asm_write(microasm *a, int n, ...) 
{ 496 | va_list bytes; 497 | int i; 498 | va_start(bytes, n); 499 | for (i = 0; i < n; ++i) *(a->dest++) = (char) va_arg(bytes, int); 500 | va_end(bytes); 501 | } 502 | 503 | void movsd_reg_memory(microasm *a, char reg, char disp) 504 | { asm_write(a, 5, 0xf2, 0x0f, 0x11, 0x47 | reg << 3, disp); } 505 | 506 | void movsd_memory_reg(microasm *a, char disp, char reg) 507 | { asm_write(a, 5, 0xf2, 0x0f, 0x10, 0x47 | reg << 3, disp); } 508 | 509 | void movsd_reg_reg(microasm *a, char src, char dst) 510 | { asm_write(a, 4, 0xf2, 0x0f, 0x11, 0xc0 | src << 3 | dst); } 511 | 512 | void mulsd(microasm *a, char src, char dst) 513 | { asm_write(a, 4, 0xf2, 0x0f, 0x59, 0xc0 | dst << 3 | src); } 514 | 515 | void addsd(microasm *a, char src, char dst) 516 | { asm_write(a, 4, 0xf2, 0x0f, 0x58, 0xc0 | dst << 3 | src); } 517 | 518 | void subsd(microasm *a, char src, char dst) 519 | { asm_write(a, 4, 0xf2, 0x0f, 0x5c, 0xc0 | dst << 3 | src); } 520 | 521 | void movpd_reg_memory(microasm *a, char reg, char disp) 522 | { asm_write(a, 5, 0x66, 0x0f, 0x11, 0x47 | reg << 3, disp); } 523 | 524 | void movpd_memory_reg(microasm *a, char disp, char reg) 525 | { asm_write(a, 5, 0x66, 0x0f, 0x10, 0x47 | reg << 3, disp); } 526 | 527 | void addpd_memory_reg(microasm *a, char disp, char reg) 528 | { asm_write(a, 5, 0x66, 0x0f, 0x58, 0x47 | reg << 3, disp); } 529 | ``` 530 | 531 | ##### Putting it all together 532 | Now that we can write assembly-level stuff, we can take the structure from the 533 | prototype JIT compiler and modify it to compile MandelASM. 
534 | 535 | ```c 536 | // mandeljit.c 537 | #include 538 | #include 539 | #include 540 | 541 | #include "micro-asm.h" 542 | 543 | #define sqr(x) ((x) * (x)) 544 | 545 | typedef struct { double r; double i; } complex; 546 | typedef void(*compiled)(complex*); 547 | 548 | #define offsetof(type, field) ((unsigned long) &(((type *) 0)->field)) 549 | 550 | compiled compile(char *code) { 551 | char *memory = mmap(NULL, 4096, PROT_READ | PROT_WRITE | PROT_EXEC, 552 | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 553 | microasm a = { .dest = memory }; 554 | char src_dsp, dst_dsp; 555 | char const r = offsetof(complex, r); 556 | char const i = offsetof(complex, i); 557 | 558 | for (; *code; code += 3) { 559 | src_dsp = sizeof(complex) * (code[1] - 'a'); 560 | dst_dsp = sizeof(complex) * (code[2] - 'a'); 561 | switch (*code) { 562 | case '=': 563 | movpd_memory_reg(&a, src_dsp, xmm(0)); 564 | movpd_reg_memory(&a, xmm(0), dst_dsp); 565 | break; 566 | 567 | case '+': 568 | movpd_memory_reg(&a, src_dsp, xmm(0)); 569 | addpd_memory_reg(&a, dst_dsp, xmm(0)); 570 | movpd_reg_memory(&a, xmm(0), dst_dsp); 571 | break; 572 | 573 | case '*': 574 | movsd_memory_reg(&a, src_dsp + r, xmm(0)); 575 | movsd_memory_reg(&a, src_dsp + i, xmm(1)); 576 | movsd_memory_reg(&a, dst_dsp + r, xmm(2)); 577 | movsd_memory_reg(&a, dst_dsp + i, xmm(3)); 578 | movsd_reg_reg (&a, xmm(0), xmm(4)); 579 | mulsd (&a, xmm(2), xmm(4)); 580 | movsd_reg_reg (&a, xmm(1), xmm(5)); 581 | mulsd (&a, xmm(3), xmm(5)); 582 | subsd (&a, xmm(5), xmm(4)); 583 | movsd_reg_memory(&a, xmm(4), dst_dsp + r); 584 | 585 | mulsd (&a, xmm(0), xmm(3)); 586 | mulsd (&a, xmm(1), xmm(2)); 587 | addsd (&a, xmm(3), xmm(2)); 588 | movsd_reg_memory(&a, xmm(2), dst_dsp + i); 589 | break; 590 | 591 | default: 592 | fprintf(stderr, "undefined instruction %s (ASCII %x)\n", code, *code); 593 | exit(1); 594 | } 595 | } 596 | 597 | // Return to caller (important! 
otherwise we'll segfault) 598 | asm_write(&a, 1, 0xc3); 599 | 600 | return (compiled) memory; 601 | } 602 | 603 | int main(int argc, char **argv) { 604 | compiled fn = compile(argv[1]); 605 | complex registers[4]; 606 | int i, x, y; 607 | char line[1600]; 608 | printf("P5\n%d %d\n%d\n", 1600, 900, 255); 609 | for (y = 0; y < 900; ++y) { 610 | for (x = 0; x < 1600; ++x) { 611 | registers[0].r = 2 * 1.6 * (x / 1600.0 - 0.5); 612 | registers[0].i = 2 * 0.9 * (y / 900.0 - 0.5); 613 | for (i = 1; i < 4; ++i) registers[i].r = registers[i].i = 0; 614 | for (i = 0; i < 256 && sqr(registers[1].r) + sqr(registers[1].i) < 4; ++i) 615 | (*fn)(registers); 616 | line[x] = i; 617 | } 618 | fwrite(line, 1, sizeof(line), stdout); 619 | } 620 | return 0; 621 | } 622 | ``` 623 | 624 | Now let's benchmark the interpreted and JIT-compiled versions: 625 | 626 | ```sh 627 | $ gcc mandeljit.c -o mandeljit 628 | $ time ./simple *bb+ab > /dev/null 629 | real 0m2.348s 630 | user 0m2.344s 631 | sys 0m0.000s 632 | $ time ./mandeljit *bb+ab > /dev/null 633 | real 0m1.462s 634 | user 0m1.460s 635 | sys 0m0.000s 636 | ``` 637 | 638 | Very close to the limit performance of the hardcoded version. And, of course, 639 | the JIT-compiled result is identical to the interpreted one: 640 | 641 | ```sh 642 | $ ./simple *bb+ab | md5sum 643 | 12a1013d55ee17998390809ffd671dbc - 644 | $ ./mandeljit *bb+ab | md5sum 645 | 12a1013d55ee17998390809ffd671dbc - 646 | ``` 647 | 648 | ## Further reading 649 | ### Debugging JIT compilers 650 | First, you need a good scotch; this one should work. 651 | 652 | ![image](https://cdn1.masterofmalt.com/whiskies/p-2813/laphroaig-quarter-cask-whisky.jpg?ss=2.0) 653 | 654 | Once you've got that set up, `gdb` can probably be scripted to do what you 655 | need. 
I've [used it somewhat 656 | successfully](https://github.com/spencertipping/canard/blob/circular/bin/canard.debug.gdb) 657 | to debug a bunch of hand-written self-modifying machine code with no debugging 658 | symbols -- the limitations of the approach ended up being whiskey-related 659 | rather than any deficiency of GDB itself. 660 | 661 | I've also had some luck using [radare2](http://www.radare.org/r/) to figure out 662 | when I was generating bogus instructions. 663 | 664 | Offline disassemblers like NASM and YASM won't help you. 665 | 666 | ### Low-level 667 | - The Intel guides cover a lot of stuff we didn't end up using here: addressing 668 | modes, instructions, etc. If you're serious about writing JIT compilers, it's 669 | worth an in-depth read. 670 | 671 | - [Agner Fog's guides to processor-level 672 | optimization](http://www.agner.org/optimize/): an insanely detailed tour 673 | through processor internals, instruction parsing pipelines, and pretty much 674 | every variant of every processor in existence. 
675 | 676 | - [The V8 source 677 | code](https://github.com/v8/v8/blob/master/src/codegen/x64/assembler-x64.h): how JIT 678 | assemblers are actually written 679 | 680 | - [The JVM source 681 | code](https://github.com/openjdk/jdk/tree/master/src/hotspot/) 682 | 683 | - [Jonesforth](http://git.annexia.org/?p=jonesforth.git;a=blob;f=jonesforth.S;h=45e6e854a5d2a4c3f26af264dfce56379d401425;hb=HEAD): 684 | a well-documented example of low-level code generation and interpreter 685 | structure (sort of a JIT alternative) 686 | 687 | - [Canard machine 688 | code](https://github.com/spencertipping/canard/blob/circular/bin/canard.md#introduction): 689 | similar to jonesforth, but uses machine code for its data structures 690 | -------------------------------------------------------------------------------- /hardcoded.c: -------------------------------------------------------------------------------- 1 | // hardcoded.c 2 | #include <stdio.h> 3 | #include <stdlib.h> 4 | 5 | #define sqr(x) ((x) * (x)) 6 | 7 | typedef struct { double r; double i; } complex; 8 | 9 | void interpret(complex *registers, char const *code) { // hard-coded b = b*b + a; `code` is accepted but never read 10 | complex *a = &registers[0]; 11 | complex *b = &registers[1]; 12 | double r, i; 13 | r = b->r * b->r - b->i * b->i; 14 | i = b->r * b->i + b->i * b->r; 15 | b->r = r; 16 | b->i = i; 17 | b->r += a->r; 18 | b->i += a->i; 19 | } 20 | 21 | int main(int argc, char **argv) { 22 | complex registers[4]; 23 | int i, x, y; 24 | char line[1600]; 25 | printf("P5\n%d %d\n%d\n", 1600, 900, 255); 26 | for (y = 0; y < 900; ++y) { 27 | for (x = 0; x < 1600; ++x) { 28 | registers[0].r = 2 * 1.6 * (x / 1600.0 - 0.5); 29 | registers[0].i = 2 * 0.9 * (y / 900.0 - 0.5); 30 | for (i = 1; i < 4; ++i) registers[i].r = registers[i].i = 0; 31 | for (i = 0; i < 256 && sqr(registers[1].r) + sqr(registers[1].i) < 4; ++i) 32 | interpret(registers, argv[1]); 33 | line[x] = i; 34 | } 35 | fwrite(line, 1, sizeof(line), stdout); 36 | } 37 | return 0; 38 | } 39 | 
-------------------------------------------------------------------------------- /jitproto.c: -------------------------------------------------------------------------------- 1 | // jitproto.c 2 | #include <stdio.h> 3 | #include <stdlib.h> 4 | #include <sys/mman.h> 5 | 6 | typedef long(*fn)(long); 7 | 8 | fn compile_identity(void) { 9 | // Allocate some memory and set its permissions correctly. In particular, we 10 | // need PROT_EXEC (which isn't normally enabled for data memory, e.g. from 11 | // malloc()), which tells the processor it's ok to execute it as machine 12 | // code. 13 | char *memory = mmap(NULL, // address 14 | 4096, // size 15 | PROT_READ | PROT_WRITE | PROT_EXEC, 16 | MAP_PRIVATE | MAP_ANONYMOUS, 17 | -1, // fd (not used here) 18 | 0); // offset (not used here) 19 | if (memory == MAP_FAILED) { // mmap signals failure with MAP_FAILED ((void *) -1), never NULL 20 | perror("failed to allocate memory"); 21 | exit(1); 22 | } 23 | 24 | int i = 0; 25 | 26 | // mov %rdi, %rax 27 | memory[i++] = 0x48; // REX.W prefix 28 | memory[i++] = 0x8b; // MOV opcode, register/register 29 | memory[i++] = 0xc7; // MOD/RM byte for %rdi -> %rax 30 | 31 | // ret 32 | memory[i++] = 0xc3; // RET opcode 33 | 34 | return (fn) memory; 35 | } 36 | 37 | int main() { 38 | fn f = compile_identity(); 39 | int i; 40 | for (i = 0; i < 10; ++i) 41 | printf("f(%d) = %ld\n", i, (*f)(i)); 42 | munmap(f, 4096); 43 | return 0; 44 | } 45 | -------------------------------------------------------------------------------- /mandeljit.c: -------------------------------------------------------------------------------- 1 | // mandeljit.c 2 | #include <stdio.h> 3 | #include <stdlib.h> 4 | #include <sys/mman.h> 5 | 6 | #include "micro-asm.h" 7 | 8 | #define sqr(x) ((x) * (x)) 9 | 10 | typedef struct { double r; double i; } complex; 11 | typedef void(*compiled)(complex*); 12 | 13 | #define offsetof(type, field) ((unsigned long) &(((type *) 0)->field)) 14 | 15 | compiled compile(char *code) { 16 | char *memory = mmap(NULL, 4096, PROT_READ | PROT_WRITE | PROT_EXEC, 17 | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 18 | microasm a = { .dest = memory }; 19 
| char src_dsp, dst_dsp; 20 | char const r = offsetof(complex, r); 21 | char const i = offsetof(complex, i); 22 | 23 | for (; *code; code += 3) { 24 | src_dsp = sizeof(complex) * (code[1] - 'a'); 25 | dst_dsp = sizeof(complex) * (code[2] - 'a'); 26 | switch (*code) { 27 | case '=': 28 | movpd_memory_reg(&a, src_dsp, xmm(0)); 29 | movpd_reg_memory(&a, xmm(0), dst_dsp); 30 | break; 31 | 32 | case '+': 33 | movpd_memory_reg(&a, src_dsp, xmm(0)); 34 | addpd_memory_reg(&a, dst_dsp, xmm(0)); 35 | movpd_reg_memory(&a, xmm(0), dst_dsp); 36 | break; 37 | 38 | case '*': 39 | movsd_memory_reg(&a, src_dsp + r, xmm(0)); 40 | movsd_memory_reg(&a, src_dsp + i, xmm(1)); 41 | movsd_memory_reg(&a, dst_dsp + r, xmm(2)); 42 | movsd_memory_reg(&a, dst_dsp + i, xmm(3)); 43 | movsd_reg_reg (&a, xmm(0), xmm(4)); 44 | mulsd (&a, xmm(2), xmm(4)); 45 | movsd_reg_reg (&a, xmm(1), xmm(5)); 46 | mulsd (&a, xmm(3), xmm(5)); 47 | subsd (&a, xmm(5), xmm(4)); 48 | movsd_reg_memory(&a, xmm(4), dst_dsp + r); 49 | 50 | mulsd (&a, xmm(0), xmm(3)); 51 | mulsd (&a, xmm(1), xmm(2)); 52 | addsd (&a, xmm(3), xmm(2)); 53 | movsd_reg_memory(&a, xmm(2), dst_dsp + i); 54 | break; 55 | 56 | default: 57 | fprintf(stderr, "undefined instruction %s (ASCII %x)\n", code, *code); 58 | exit(1); 59 | } 60 | } 61 | 62 | // Return to caller (important!) 
63 | asm_write(&a, 1, 0xc3); 64 | 65 | return (compiled) memory; 66 | } 67 | 68 | int main(int argc, char **argv) { 69 | compiled fn = compile(argv[1]); 70 | complex registers[4]; 71 | int i, x, y; 72 | char line[1600]; 73 | printf("P5\n%d %d\n%d\n", 1600, 900, 255); 74 | for (y = 0; y < 900; ++y) { 75 | for (x = 0; x < 1600; ++x) { 76 | registers[0].r = 2 * 1.6 * (x / 1600.0 - 0.5); 77 | registers[0].i = 2 * 0.9 * (y / 900.0 - 0.5); 78 | for (i = 1; i < 4; ++i) registers[i].r = registers[i].i = 0; 79 | for (i = 0; i < 256 && sqr(registers[1].r) + sqr(registers[1].i) < 4; ++i) 80 | (*fn)(registers); 81 | line[x] = i; 82 | } 83 | fwrite(line, 1, sizeof(line), stdout); 84 | } 85 | return 0; 86 | } 87 | -------------------------------------------------------------------------------- /micro-asm.h: -------------------------------------------------------------------------------- 1 | // micro-asm.h 2 | #include 3 | typedef struct { 4 | char *dest; 5 | } microasm; 6 | 7 | // this makes it more obvious what we're doing later on 8 | #define xmm(n) (n) 9 | 10 | void asm_write(microasm *a, int n, ...) 
{ 11 | va_list bytes; 12 | int i; 13 | va_start(bytes, n); 14 | for (i = 0; i < n; ++i) *(a->dest++) = (char) va_arg(bytes, int); 15 | va_end(bytes); 16 | } 17 | 18 | void movpd_reg_memory(microasm *a, char reg, char disp) { asm_write(a, 5, 0x66, 0x0f, 0x11, 0x47 | reg << 3, disp); } 19 | void movpd_memory_reg(microasm *a, char disp, char reg) { asm_write(a, 5, 0x66, 0x0f, 0x10, 0x47 | reg << 3, disp); } 20 | void addpd_memory_reg(microasm *a, char disp, char reg) { asm_write(a, 5, 0x66, 0x0f, 0x58, 0x47 | reg << 3, disp); } 21 | void movsd_reg_memory(microasm *a, char reg, char disp) { asm_write(a, 5, 0xf2, 0x0f, 0x11, 0x47 | reg << 3, disp); } 22 | void movsd_memory_reg(microasm *a, char disp, char reg) { asm_write(a, 5, 0xf2, 0x0f, 0x10, 0x47 | reg << 3, disp); } 23 | void movsd_reg_reg (microasm *a, char src, char dst) { asm_write(a, 4, 0xf2, 0x0f, 0x11, 0xc0 | src << 3 | dst); } 24 | void mulsd (microasm *a, char src, char dst) { asm_write(a, 4, 0xf2, 0x0f, 0x59, 0xc0 | dst << 3 | src); } 25 | void addsd (microasm *a, char src, char dst) { asm_write(a, 4, 0xf2, 0x0f, 0x58, 0xc0 | dst << 3 | src); } 26 | void subsd (microasm *a, char src, char dst) { asm_write(a, 4, 0xf2, 0x0f, 0x5c, 0xc0 | dst << 3 | src); } 27 | -------------------------------------------------------------------------------- /simple.c: -------------------------------------------------------------------------------- 1 | // simple.c 2 | #include 3 | #include 4 | 5 | #define sqr(x) ((x) * (x)) 6 | 7 | typedef struct { double r; double i; } complex; 8 | 9 | void interpret(complex *registers, char const *code) { 10 | complex *src, *dst; 11 | double r, i; 12 | for (; *code; code += 3) { 13 | dst = ®isters[code[2] - 'a']; 14 | src = ®isters[code[1] - 'a']; 15 | switch (*code) { 16 | case '=': 17 | dst->r = src->r; 18 | dst->i = src->i; 19 | break; 20 | case '+': 21 | dst->r += src->r; 22 | dst->i += src->i; 23 | break; 24 | case '*': 25 | r = dst->r * src->r - dst->i * src->i; 26 | i = dst->r * 
src->i + dst->i * src->r; 27 | dst->r = r; 28 | dst->i = i; 29 | break; 30 | default: 31 | fprintf(stderr, "undefined instruction %s (ASCII %x)\n", code, *code); 32 | exit(1); 33 | } 34 | } 35 | } 36 | 37 | int main(int argc, char **argv) { 38 | complex registers[4]; 39 | int i, x, y; 40 | char line[1600]; 41 | printf("P5\n%d %d\n%d\n", 1600, 900, 255); 42 | for (y = 0; y < 900; ++y) { 43 | for (x = 0; x < 1600; ++x) { 44 | registers[0].r = 2 * 1.6 * (x / 1600.0 - 0.5); 45 | registers[0].i = 2 * 0.9 * (y / 900.0 - 0.5); 46 | for (i = 1; i < 4; ++i) registers[i].r = registers[i].i = 0; 47 | for (i = 0; i < 256 && sqr(registers[1].r) + sqr(registers[1].i) < 4; ++i) 48 | interpret(registers, argv[1]); 49 | line[x] = i; 50 | } 51 | fwrite(line, 1, sizeof(line), stdout); 52 | } 53 | return 0; 54 | } 55 | -------------------------------------------------------------------------------- /simple.s: -------------------------------------------------------------------------------- 1 | .file "simple.c" 2 | .section .rodata 3 | .align 8 4 | .LC0: 5 | .string "undefined instruction %s (ASCII %x)\n" 6 | .text 7 | .globl interpret 8 | .type interpret, @function 9 | interpret: 10 | .LFB2: 11 | .cfi_startproc 12 | pushq %rbp 13 | .cfi_def_cfa_offset 16 14 | .cfi_offset 6, -16 15 | movq %rsp, %rbp 16 | .cfi_def_cfa_register 6 17 | subq $48, %rsp 18 | movq %rdi, -40(%rbp) 19 | movq %rsi, -48(%rbp) 20 | jmp .L2 21 | .L8: 22 | movq -48(%rbp), %rax 23 | addq $2, %rax 24 | movzbl (%rax), %eax 25 | movsbq %al, %rax 26 | salq $4, %rax 27 | leaq -1552(%rax), %rdx 28 | movq -40(%rbp), %rax 29 | addq %rdx, %rax 30 | movq %rax, -32(%rbp) 31 | movq -48(%rbp), %rax 32 | addq $1, %rax 33 | movzbl (%rax), %eax 34 | movsbq %al, %rax 35 | salq $4, %rax 36 | leaq -1552(%rax), %rdx 37 | movq -40(%rbp), %rax 38 | addq %rdx, %rax 39 | movq %rax, -24(%rbp) 40 | movq -48(%rbp), %rax 41 | movzbl (%rax), %eax 42 | movsbl %al, %eax 43 | cmpl $43, %eax 44 | je .L4 45 | cmpl $61, %eax 46 | je .L5 47 | cmpl $42, 
%eax 48 | je .L6 49 | jmp .L9 50 | .L5: 51 | movq -24(%rbp), %rax 52 | movsd (%rax), %xmm0 53 | movq -32(%rbp), %rax 54 | movsd %xmm0, (%rax) 55 | movq -24(%rbp), %rax 56 | movsd 8(%rax), %xmm0 57 | movq -32(%rbp), %rax 58 | movsd %xmm0, 8(%rax) 59 | jmp .L7 60 | .L4: 61 | movq -32(%rbp), %rax 62 | movsd (%rax), %xmm1 63 | movq -24(%rbp), %rax 64 | movsd (%rax), %xmm0 65 | addsd %xmm1, %xmm0 66 | movq -32(%rbp), %rax 67 | movsd %xmm0, (%rax) 68 | movq -32(%rbp), %rax 69 | movsd 8(%rax), %xmm1 70 | movq -24(%rbp), %rax 71 | movsd 8(%rax), %xmm0 72 | addsd %xmm1, %xmm0 73 | movq -32(%rbp), %rax 74 | movsd %xmm0, 8(%rax) 75 | jmp .L7 76 | .L6: 77 | movq -32(%rbp), %rax 78 | movsd (%rax), %xmm1 79 | movq -24(%rbp), %rax 80 | movsd (%rax), %xmm0 81 | mulsd %xmm1, %xmm0 82 | movq -32(%rbp), %rax 83 | movsd 8(%rax), %xmm2 84 | movq -24(%rbp), %rax 85 | movsd 8(%rax), %xmm1 86 | mulsd %xmm2, %xmm1 87 | subsd %xmm1, %xmm0 88 | movsd %xmm0, -16(%rbp) 89 | movq -32(%rbp), %rax 90 | movsd (%rax), %xmm1 91 | movq -24(%rbp), %rax 92 | movsd 8(%rax), %xmm0 93 | mulsd %xmm0, %xmm1 94 | movq -32(%rbp), %rax 95 | movsd 8(%rax), %xmm2 96 | movq -24(%rbp), %rax 97 | movsd (%rax), %xmm0 98 | mulsd %xmm2, %xmm0 99 | addsd %xmm1, %xmm0 100 | movsd %xmm0, -8(%rbp) 101 | movq -32(%rbp), %rax 102 | movsd -16(%rbp), %xmm0 103 | movsd %xmm0, (%rax) 104 | movq -32(%rbp), %rax 105 | movsd -8(%rbp), %xmm0 106 | movsd %xmm0, 8(%rax) 107 | jmp .L7 108 | .L9: 109 | movq -48(%rbp), %rax 110 | movzbl (%rax), %eax 111 | movsbl %al, %ecx 112 | movq stderr(%rip), %rax 113 | movq -48(%rbp), %rdx 114 | movl $.LC0, %esi 115 | movq %rax, %rdi 116 | movl $0, %eax 117 | call fprintf 118 | movl $1, %edi 119 | call exit 120 | .L7: 121 | addq $3, -48(%rbp) 122 | .L2: 123 | movq -48(%rbp), %rax 124 | movzbl (%rax), %eax 125 | testb %al, %al 126 | jne .L8 127 | nop 128 | leave 129 | .cfi_def_cfa 7, 8 130 | ret 131 | .cfi_endproc 132 | .LFE2: 133 | .size interpret, .-interpret 134 | .section .rodata 135 | .LC1: 136 
| .string "P5\n%d %d\n%d\n" 137 | .text 138 | .globl main 139 | .type main, @function 140 | main: 141 | .LFB3: 142 | .cfi_startproc 143 | pushq %rbp 144 | .cfi_def_cfa_offset 16 145 | .cfi_offset 6, -16 146 | movq %rsp, %rbp 147 | .cfi_def_cfa_register 6 148 | subq $1712, %rsp 149 | movl %edi, -1700(%rbp) 150 | movq %rsi, -1712(%rbp) 151 | movq %fs:40, %rax 152 | movq %rax, -8(%rbp) 153 | xorl %eax, %eax 154 | movl $255, %ecx 155 | movl $900, %edx 156 | movl $1600, %esi 157 | movl $.LC1, %edi 158 | movl $0, %eax 159 | call printf 160 | movl $0, -1684(%rbp) 161 | jmp .L11 162 | .L19: 163 | movl $0, -1688(%rbp) 164 | jmp .L12 165 | .L18: 166 | pxor %xmm0, %xmm0 167 | cvtsi2sd -1688(%rbp), %xmm0 168 | movsd .LC2(%rip), %xmm1 169 | divsd %xmm1, %xmm0 170 | movsd .LC3(%rip), %xmm1 171 | subsd %xmm1, %xmm0 172 | movsd .LC4(%rip), %xmm1 173 | mulsd %xmm1, %xmm0 174 | movsd %xmm0, -1680(%rbp) 175 | pxor %xmm0, %xmm0 176 | cvtsi2sd -1684(%rbp), %xmm0 177 | movsd .LC5(%rip), %xmm1 178 | divsd %xmm1, %xmm0 179 | movsd .LC3(%rip), %xmm1 180 | subsd %xmm1, %xmm0 181 | movsd .LC6(%rip), %xmm1 182 | mulsd %xmm1, %xmm0 183 | movsd %xmm0, -1672(%rbp) 184 | movl $1, -1692(%rbp) 185 | jmp .L13 186 | .L14: 187 | movl -1692(%rbp), %eax 188 | cltq 189 | salq $4, %rax 190 | addq %rbp, %rax 191 | subq $1672, %rax 192 | pxor %xmm0, %xmm0 193 | movsd %xmm0, (%rax) 194 | movl -1692(%rbp), %eax 195 | cltq 196 | salq $4, %rax 197 | addq %rbp, %rax 198 | subq $1672, %rax 199 | movsd (%rax), %xmm0 200 | movl -1692(%rbp), %eax 201 | cltq 202 | salq $4, %rax 203 | addq %rbp, %rax 204 | subq $1680, %rax 205 | movsd %xmm0, (%rax) 206 | addl $1, -1692(%rbp) 207 | .L13: 208 | cmpl $3, -1692(%rbp) 209 | jle .L14 210 | movl $0, -1692(%rbp) 211 | jmp .L15 212 | .L17: 213 | movq -1712(%rbp), %rax 214 | addq $8, %rax 215 | movq (%rax), %rdx 216 | leaq -1680(%rbp), %rax 217 | movq %rdx, %rsi 218 | movq %rax, %rdi 219 | call interpret 220 | addl $1, -1692(%rbp) 221 | .L15: 222 | cmpl $255, -1692(%rbp) 223 | 
jg .L16 224 | movsd -1664(%rbp), %xmm1 225 | movsd -1664(%rbp), %xmm0 226 | mulsd %xmm0, %xmm1 227 | movsd -1656(%rbp), %xmm2 228 | movsd -1656(%rbp), %xmm0 229 | mulsd %xmm2, %xmm0 230 | addsd %xmm1, %xmm0 231 | movsd .LC8(%rip), %xmm1 232 | ucomisd %xmm0, %xmm1 233 | ja .L17 234 | .L16: 235 | movl -1692(%rbp), %eax 236 | movl %eax, %edx 237 | movl -1688(%rbp), %eax 238 | cltq 239 | movb %dl, -1616(%rbp,%rax) 240 | addl $1, -1688(%rbp) 241 | .L12: 242 | cmpl $1599, -1688(%rbp) 243 | jle .L18 244 | movq stdout(%rip), %rdx 245 | leaq -1616(%rbp), %rax 246 | movq %rdx, %rcx 247 | movl $1600, %edx 248 | movl $1, %esi 249 | movq %rax, %rdi 250 | call fwrite 251 | addl $1, -1684(%rbp) 252 | .L11: 253 | cmpl $899, -1684(%rbp) 254 | jle .L19 255 | movl $0, %eax 256 | movq -8(%rbp), %rcx 257 | xorq %fs:40, %rcx 258 | je .L21 259 | call __stack_chk_fail 260 | .L21: 261 | leave 262 | .cfi_def_cfa 7, 8 263 | ret 264 | .cfi_endproc 265 | .LFE3: 266 | .size main, .-main 267 | .section .rodata 268 | .align 8 269 | .LC2: 270 | .long 0 271 | .long 1083768832 272 | .align 8 273 | .LC3: 274 | .long 0 275 | .long 1071644672 276 | .align 8 277 | .LC4: 278 | .long 2576980378 279 | .long 1074370969 280 | .align 8 281 | .LC5: 282 | .long 0 283 | .long 1082925056 284 | .align 8 285 | .LC6: 286 | .long 3435973837 287 | .long 1073532108 288 | .align 8 289 | .LC8: 290 | .long 0 291 | .long 1074790400 292 | .ident "GCC: (Ubuntu 5.4.0-6ubuntu1~16.04.4) 5.4.0 20160609" 293 | .section .note.GNU-stack,"",@progbits 294 | --------------------------------------------------------------------------------