├── lib
    ├── jit
    │   ├── version.rb
    │   ├── compiler.rb
    │   └── assembler.rb
    └── jit.rb
├── test
    ├── none.rb
    ├── plus.rb
    ├── minus.rb
    ├── local.rb
    ├── lt.rb
    ├── send.rb
    ├── fib.rb
    ├── branch.rb
    └── jit
    │   └── compiler_test.rb
├── bin
    ├── docker
    ├── setup
    ├── ruby
    ├── console
    └── bench
├── .gitignore
├── Gemfile
├── benchmark.yml
├── Rakefile
├── Dockerfile
├── jit.gemspec
├── LICENSE.txt
└── README.md


/lib/jit/version.rb:
--------------------------------------------------------------------------------
1 | module JIT
2 |   VERSION = '0.1.0'
3 | end
4 | 


--------------------------------------------------------------------------------
/test/none.rb:
--------------------------------------------------------------------------------
1 | def none
2 |   nil
3 | end
4 | 
5 | none
6 | none
7 | p none
8 | 


--------------------------------------------------------------------------------
/test/plus.rb:
--------------------------------------------------------------------------------
1 | def plus
2 |   1 + 2
3 | end
4 | 
5 | plus
6 | plus
7 | p plus
8 | 


--------------------------------------------------------------------------------
/test/minus.rb:
--------------------------------------------------------------------------------
1 | def minus
2 |   3 - 1
3 | end
4 | 
5 | minus
6 | minus
7 | p minus
8 | 


--------------------------------------------------------------------------------
/test/local.rb:
--------------------------------------------------------------------------------
1 | def local(n)
2 |   n
3 | end
4 | 
5 | local(1)
6 | local(1)
7 | p local(2)
8 | 


--------------------------------------------------------------------------------
/test/lt.rb:
--------------------------------------------------------------------------------
1 | def lt(n)
2 |   n < 2
3 | end
4 | 
5 | lt(1)
6 | lt(1)
7 | p lt(1)
8 | p lt(2)
9 | 


--------------------------------------------------------------------------------
/bin/docker:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | exec docker run --rm -it -v "$(pwd):/app" k0kubun/rjit bash
3 | 


--------------------------------------------------------------------------------
/test/send.rb:
--------------------------------------------------------------------------------
 1 | def foo(a)
 2 |   1 + a
 3 | end
 4 | 
 5 | def bar
 6 |   foo(1)
 7 | end
 8 | 
 9 | bar
10 | bar
11 | p bar
12 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | /.bundle/
 2 | /.yardoc
 3 | /_yardoc/
 4 | /coverage/
 5 | /doc/
 6 | /pkg/
 7 | /spec/reports/
 8 | /tmp/
 9 | /Gemfile.lock
10 | 


--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
1 | source 'https://rubygems.org'
2 | 
3 | # Specify your gem's dependencies in jit.gemspec
4 | gemspec
5 | 
6 | gem 'minitest'
7 | gem 'rake'
8 | 


--------------------------------------------------------------------------------
/test/fib.rb:
--------------------------------------------------------------------------------
 1 | def fib(n)
 2 |   if n < 2
 3 |     return n
 4 |   end
 5 |   return fib(n-1) + fib(n-2)
 6 | end
 7 | 
 8 | fib(2)
 9 | p fib(32)
10 | 


--------------------------------------------------------------------------------
/bin/setup:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -euo pipefail
3 | IFS=$'\n\t'
4 | set -vx
5 | 
6 | bundle install
7 | 
8 | # Do any other automated setup that you need to do here
9 | 


--------------------------------------------------------------------------------
/test/branch.rb:
--------------------------------------------------------------------------------
 1 | def branch(flag)
 2 |   if flag
 3 |     1
 4 |   else
 5 |     0
 6 |   end
 7 | end
 8 | 
 9 | branch(true)
10 | branch(true)
11 | p branch(true)
12 | p branch(false)
13 | 


--------------------------------------------------------------------------------
/bin/ruby:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | repo_root="$(cd "$(dirname "$0")"; cd ..; pwd)"
3 | ruby="${RJIT_RUBY:-"ruby"}"
4 | exec "$ruby" "-r${repo_root}/lib/jit.rb" --rjit=pause --rjit-call-threshold=3 "$@"
5 | 


--------------------------------------------------------------------------------
/benchmark.yml:
--------------------------------------------------------------------------------
 1 | prelude: |
 2 |   def fib(n)
 3 |     if n < 2
 4 |       return n
 5 |     end
 6 |     return fib(n-1) + fib(n-2)
 7 |   end
 8 | 
 9 |   fib(2)
10 |   fib(2)
11 | benchmark: fib(32)
12 | loop_count: 15
13 | 


--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
 1 | require 'bundler/gem_tasks'
 2 | require 'rake/testtask'
 3 | 
 4 | Rake::TestTask.new(:test) do |t|
 5 |   t.libs << 'lib' << 'test'
 6 |   t.test_files = %w[test/jit/*_test.rb]
 7 |   t.verbose = true
 8 | end
 9 | 
10 | task default: :test
11 | 


--------------------------------------------------------------------------------
/bin/console:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env ruby
 2 | 
 3 | require 'bundler/setup'
 4 | require 'jit'
 5 | 
 6 | # You can add fixtures and/or initialization code here to make experimenting
 7 | # with your gem easier. You can also use a different console, if you like.
 8 | 
 9 | require 'irb'
10 | IRB.start(__FILE__)
11 | 


--------------------------------------------------------------------------------
/lib/jit.rb:
--------------------------------------------------------------------------------
 1 | require_relative 'jit/version'
 2 | require_relative 'jit/compiler'
 3 | 
 4 | return unless RubyVM::RJIT.enabled?
 5 | 
 6 | # Replace RJIT with JIT::Compiler: the prepended #compile ignores its second argument and reuses one JIT::Compiler
 7 | RubyVM::RJIT::Compiler.prepend(Module.new {
 8 |   def compile(iseq, _)
 9 |     @compiler ||= JIT::Compiler.new
10 |     @compiler.compile(iseq)
11 |   end
12 | })
13 | 
14 | # Enable JIT compilation (paused by --rjit=pause)
15 | RubyVM::RJIT.resume
16 | 


--------------------------------------------------------------------------------
/bin/bench:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | repo_root="$(cd "$(dirname "$0")"; cd ..; pwd)"
 3 | ruby="$(which ruby)"
 4 | 
 5 | if ! which benchmark-driver > /dev/null; then
 6 |   gem install benchmark_driver
 7 | fi
 8 | 
 9 | benchmark-driver "${repo_root}/benchmark.yml" \
10 |   -e "no-jit::${ruby}" \
11 |   -e "rjit::${ruby} --rjit-call-threshold=3" \
12 |   -e "yjit::${ruby} --yjit-call-threshold=3" \
13 |   -e "ruby-jit::${ruby} --rjit=pause -r${repo_root}/lib/jit.rb --rjit-call-threshold=3"
14 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM ubuntu:22.04
 2 | 
 3 | RUN apt-get update && apt-get install -y \
 4 |   autoconf bison patch build-essential rustc libssl-dev libyaml-dev libreadline6-dev \
 5 |   zlib1g-dev libgmp-dev libncurses5-dev libffi-dev libgdbm6 libgdbm-dev libdb-dev uuid-dev \
 6 |   ruby git libcapstone-dev \
 7 |   && rm -rf /var/lib/apt/lists/*
 8 | 
 9 | ENV RUBY_REVISION=f2c367734f847a7277f09c583a0476086313fdc9
10 | RUN git clone --depth=1 https://github.com/ruby/ruby /ruby && cd /ruby && \
11 |   git fetch origin $RUBY_REVISION && git reset --hard $RUBY_REVISION && \
12 |   ./autogen.sh && \
13 |   ./configure --disable-install-doc --prefix=/usr/local --enable-yjit --enable-rjit=disasm && \
14 |   make -j8 && make install && apt-get remove -y ruby && rm -rf /ruby
15 | 
16 | RUN mkdir /app
17 | WORKDIR /app
18 | 


--------------------------------------------------------------------------------
/jit.gemspec:
--------------------------------------------------------------------------------
 1 | require_relative 'lib/jit/version'
 2 | 
 3 | Gem::Specification.new do |spec|
 4 |   spec.name = 'jit'
 5 |   spec.version = JIT::VERSION
 6 |   spec.authors = ['Takashi Kokubun']
 7 |   spec.email = ['takashikkbn@gmail.com']
 8 | 
 9 |   spec.summary = 'Ruby JIT Challenge'
10 |   spec.description = 'Ruby JIT Challenge'
11 |   spec.homepage = 'https://github.com/k0kubun/ruby-jit-challenge'
12 |   spec.required_ruby_version = '>= 3.3.0.dev'
13 | 
14 |   spec.files = Dir.chdir(__dir__) do
15 |     `git ls-files -z`.split("\x0").reject do |f|
16 |       (f == __FILE__) || f.match(%r{\A(?:(?:bin|test|spec|features)/|\.(?:git|circleci)|appveyor)})
17 |     end
18 |   end
19 |   spec.bindir = 'exe'
20 |   spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
21 |   spec.require_paths = ['lib']
22 | end
23 | 


--------------------------------------------------------------------------------
/test/jit/compiler_test.rb:
--------------------------------------------------------------------------------
 1 | require 'minitest/autorun'
 2 | require 'open3'
 3 | 
 4 | class JITCompilerTest < Minitest::Test
 5 |   REPO_ROOT = File.expand_path('../..', __dir__)
 6 | 
 7 |   def test_none
 8 |     assert_jit('test/none.rb', 'nil')
 9 |   end
10 | 
11 |   def test_plus
12 |     assert_jit('test/plus.rb', '3')
13 |   end
14 | 
15 |   def test_minus
16 |     assert_jit('test/minus.rb', '2')
17 |   end
18 | 
19 |   def test_local
20 |     assert_jit('test/local.rb', '2')
21 |   end
22 | 
23 |   def test_lt
24 |     assert_jit('test/lt.rb', "true\nfalse")
25 |   end
26 | 
27 |   def test_branch
28 |     assert_jit('test/branch.rb', "1\n0")
29 |   end
30 | 
31 |   def test_send
32 |     assert_jit('test/send.rb', '2')
33 |   end
34 | 
35 |   def test_fib
36 |     assert_jit('test/fib.rb', '2178309')
37 |   end
38 | 
39 |   private
40 | 
41 |   def assert_jit(path, expected)
42 |     stdout, stderr, status = with_unbundled_env do
43 |       Open3.capture3(
44 |         RbConfig.ruby, "-r#{REPO_ROOT}/lib/jit.rb", '--rjit=pause',
45 |         '--rjit-call-threshold=3', File.expand_path(path, REPO_ROOT)
46 |       )
47 |     end
48 |     assert_equal 0, status.exitstatus,
49 |       "stdout:\n```\n#{stdout}```\n\nstderr:\n```\n#{stderr}```"
50 |     assert_equal '', stderr
51 |     assert_equal "#{expected}\n", stdout
52 |   end
53 | 
54 |   def with_unbundled_env(&block)
55 |     if defined?(Bundler)
56 |       Bundler.with_unbundled_env { block.call }
57 |     else
58 |       block.call
59 |     end
60 |   end
61 | end
62 | 


--------------------------------------------------------------------------------
/lib/jit/compiler.rb:
--------------------------------------------------------------------------------
 1 | require_relative 'assembler'
 2 | 
 3 | module JIT
 4 |   class Compiler
 5 |     # Utilities to call C functions and interact with the Ruby VM.
 6 |     # See: https://github.com/ruby/ruby/blob/master/rjit_c.rb
 7 |     C = RubyVM::RJIT::C
 8 | 
 9 |     # Metadata for each YARV instruction.
10 |     INSNS = RubyVM::RJIT::INSNS
11 | 
12 |     # Size of the JIT buffer in bytes (1024 * 1024 = 1MiB)
13 |     JIT_BUF_SIZE = 1024 * 1024
14 | 
15 |     # Initialize a JIT buffer. Called only once.
16 |     def initialize
17 |       # Allocate JIT_BUF_SIZE (1MiB) of memory. This returns the memory address.
18 |       @jit_buf = C.mmap(JIT_BUF_SIZE)
19 |       # The number of bytes that have been written to @jit_buf.
20 |       @jit_pos = 0
21 |     end
22 | 
23 |     # Compile a method. Called after --rjit-call-threshold calls.
24 |     def compile(iseq)
25 |       # Write machine code to this assembler.
26 |       asm = Assembler.new
27 | 
28 |       # Iterate over each YARV instruction.
29 |       insn_index = 0
30 |       while insn_index < iseq.body.iseq_size
31 |         insn = INSNS.fetch(C.rb_vm_insn_decode(iseq.body.iseq_encoded[insn_index]))
32 |         case insn.name
33 |         in :nop
34 |           # none
35 |         end
36 |         insn_index += insn.len
37 |       end
38 | 
39 |       # Write machine code into memory and use it as a JIT function.
40 |       iseq.body.jit_func = write(asm)
41 |     rescue Exception => e
42 |       abort e.full_message
43 |     end
44 | 
45 |     private
46 | 
47 |     # Write bytes in a given assembler into @jit_buf.
48 |     # @param asm [JIT::Assembler]
49 |     def write(asm)
50 |       jit_addr = @jit_buf + @jit_pos
51 | 
52 |       # Append machine code to the JIT buffer
53 |       C.mprotect_write(@jit_buf, JIT_BUF_SIZE) # make @jit_buf writable
54 |       @jit_pos += asm.assemble(jit_addr)
55 |       C.mprotect_exec(@jit_buf, JIT_BUF_SIZE) # make @jit_buf executable
56 | 
57 |       # Dump disassembly if --rjit-dump-disasm
58 |       if C.rjit_opts.dump_disasm
59 |         C.dump_disasm(jit_addr, @jit_buf + @jit_pos).each do |address, mnemonic, op_str|
60 |           puts "  0x#{format("%x", address)}: #{mnemonic} #{op_str}"
61 |         end
62 |         puts
63 |       end
64 | 
65 |       jit_addr
66 |     end
67 |   end
68 | end
69 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | Ruby is copyrighted free software by Yukihiro Matsumoto <matz@netlab.jp>.
 2 | You can redistribute it and/or modify it under either the terms of the
 3 | 2-clause BSDL (see the file BSDL), or the conditions below:
 4 | 
 5 | 1. You may make and give away verbatim copies of the source form of the
 6 |    software without restriction, provided that you duplicate all of the
 7 |    original copyright notices and associated disclaimers.
 8 | 
 9 | 2. You may modify your copy of the software in any way, provided that
10 |    you do at least ONE of the following:
11 | 
12 |    a. place your modifications in the Public Domain or otherwise
13 |       make them Freely Available, such as by posting said
14 |       modifications to Usenet or an equivalent medium, or by allowing
15 |       the author to include your modifications in the software.
16 | 
17 |    b. use the modified software only within your corporation or
18 |       organization.
19 | 
20 |    c. give non-standard binaries non-standard names, with
21 |       instructions on where to get the original software distribution.
22 | 
23 |    d. make other distribution arrangements with the author.
24 | 
25 | 3. You may distribute the software in object code or binary form,
26 |    provided that you do at least ONE of the following:
27 | 
28 |    a. distribute the binaries and library files of the software,
29 |       together with instructions (in the manual page or equivalent)
30 |       on where to get the original distribution.
31 | 
32 |    b. accompany the distribution with the machine-readable source of
33 |       the software.
34 | 
35 |    c. give non-standard binaries non-standard names, with
36 |       instructions on where to get the original software distribution.
37 | 
38 |    d. make other distribution arrangements with the author.
39 | 
40 | 4. You may modify and include the part of the software into any other
41 |    software (possibly commercial).  But some files in the distribution
42 |    are not written by the author, so that they are not under these terms.
43 | 
44 |    For the list of those files and their copying conditions, see the
45 |    file LEGAL.
46 | 
47 | 5. The scripts and library files supplied as input to or produced as
48 |    output from the software do not automatically fall under the
49 |    copyright of the software, but belong to whomever generated them,
50 |    and may be sold commercially, and may be aggregated with this
51 |    software.
52 | 
53 | 6. THIS SOFTWARE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR
54 |    IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
55 |    WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
56 |    PURPOSE.
57 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Ruby JIT Challenge
  2 | 
  3 | Supplemental material to [Ruby JIT Hacking Guide](https://rubykaigi.org/2023/presentations/k0kubun.html) for RubyKaigi 2023
  4 | 
  5 | ## Introduction
  6 | 
  7 | This is a small tutorial to write a JIT compiler in Ruby.
  8 | We don't expect any prior experience in compilers or assembly languages.
   9 | It's supposed to take only several minutes if you read all the hints, but it will be challenging if you don't.
 10 | 
 11 | You'll write a JIT that can compile a Fibonacci benchmark.
 12 | With relaxed implementation requirements, you'll hopefully create a JIT faster than existing Ruby JITs with ease.
 13 | 
 14 | The goal of this repository is to make you feel comfortable using and/or contributing to Ruby JIT.
 15 | More importantly, enjoy writing a compiler in Ruby.
 16 | 
 17 | ## Setup
 18 | 
 19 | This repository assumes an `x86_64-linux` environment.
 20 | It also requires a Ruby master build to leverage RJIT's interface to integrate a custom JIT.
 21 | 
 22 | It's recommended to use the following Docker container environment.
 23 | There's also [bin/docker](./bin/docker) as a shorthand.
 24 | 
 25 | ```bash
 26 | $ docker run -it -v "$(pwd):/app" k0kubun/rjit bash
 27 | ```
 28 | 
 29 | See [Dockerfile](./Dockerfile) if you want to prepare the same environment locally.
 30 | 
 31 | ## Testing
 32 | 
 33 | You'll build a JIT in multiple steps.
 34 | Test scripts in `test/*.rb` will help you test them one by one.
 35 | You can run them with your JIT enabled with [bin/ruby](./bin/ruby).
 36 | 
 37 | ```
 38 | bin/ruby test/none.rb
 39 | ```
 40 | 
 41 | You can also dump compiled code with `bin/ruby --rjit-dump-disasm test/none.rb`.
 42 | 
 43 | For your convenience, `rake test` ([test/jit/compiler\_test.rb](./test/jit/compiler_test.rb))
 44 | runs all test scripts with your JIT enabled.
 45 | 
 46 | ## 1. Compile nil
 47 | 
 48 | First, we'll compile the following simple method that just returns nil.
 49 | 
 50 | ```rb
 51 | def none
 52 |   nil
 53 | end
 54 | ```
 55 | 
 56 | ### --dump=insns
 57 | 
 58 | In CRuby, each Ruby method is internally compiled into an "Instruction Sequence", also known as ISeq.
 59 | The CRuby interpreter executes Ruby code by looping over instructions in this sequence.
 60 | 
 61 | Typically, a CRuby JIT takes an ISeq as input to the JIT compiler and outputs machine code
 62 | that works in the same way as the ISeq. In this exercise, it's the only input you'll need to take care of.
 63 | 
  64 | You can dump the ISeqs in a file with the `ruby --dump=insns` option.
 65 | Let's have a look at the ISeq of `none` method.
 66 | 
 67 | ```
 68 | $ ruby --dump=insns test/none.rb
 69 | ...
 70 | == disasm: #<ISeq:none@test/none.rb:1 (1,0)-(3,3)>
 71 | 0000 putnil                                                           (   1)[Ca]
 72 | 0001 leave                                                            (   3)[Re]
 73 | ```
 74 | 
 75 | This means that `none` consists of two instructions: `putnil` and `leave`.
 76 | 
 77 | `putnil` instruction puts nil on the "stack" of the Ruby interpreter. Imagine `stack = []; stack << nil`.
 78 | 
 79 | `leave` instruction is like `return`. It pops the stack top value and uses it as a return value of the method.
 80 | Imagine `return stack.pop`.
 81 | 
 82 | NOTE: Click ▼ to open hints.
 83 | 
 84 | <details>
 85 | <summary>Assembler</summary>
 86 | 
 87 | ### Assembler
 88 | 
 89 | [lib/jit/assembler.rb](./lib/jit/assembler.rb) has an x86\_64 assembler that was copied from RJIT and then simplified.
 90 | Feel free to remove it and write it from scratch, but this tutorial will not cover how to encode x86\_64 instructions.
 91 | 
 92 | Here's example code using `Assembler`.
 93 | 
 94 | ```rb
 95 | asm = Assembler.new
 96 | asm.mov(:rax, [:rsi, 8])
 97 | asm.add(:rax, 2)
 98 | write(asm)
 99 | ```
100 | 
101 | This writes the following machine code into memory.
102 | 
103 | ```asm
104 | mov rax, [rsi + 8]
105 | add rax, 2
106 | ```
107 | 
108 | `rax` and `rsi` are registers.
109 | `[rsi + 8]` is memory access based off of a register, which reads memory 8 bytes after the address in `rsi`.
110 | `2` is an immediate value.
111 | 
112 | See [lib/jit/assembler.rb](./lib/jit/assembler.rb) for what kind of input it can handle.
113 | 
114 | </details>
115 | <details>
116 | <summary>Instructions</summary>
117 | 
118 | ### Instructions
119 | 
120 | There are various x86\_64 instructions.
121 | However, it's enough to use only the following instructions to pass tests in this tutorial.
122 | 
123 | For `test/none.rb`, only `mov`, `add`, and `ret` are necessary.
124 | 
125 | | Instruction | Description                                 | Example      | Effect     |
126 | |:------------|:--------------------------------------------|:-------------|:-----------|
127 | | mov         | Assign a value.                             | `mov rax, 1` | `rax = 1`  |
128 | | add         | Add a value.                                | `add rax, 1` | `rax += 1` |
129 | | sub         | Subtract a value.                           | `sub rax, 1` | `rax -= 1` |
130 | | cmp         | Compare values. Use it with cmovl.          | `cmp rdi, rsi`   | `rdi < rsi` |
131 | | cmovl       | Assign a value if left < right.             | `cmovl rax, rcx` | `rax = rcx if rdi < rsi` |
132 | | test        | Compare values. Use it with jz.             | `test rax, 1` | `rax & 1` |
133 | | jz          | Jump if left and right have no common bits. | `jz 0x1234` | `goto 0x1234 if rax & 1 == 0` |
134 | | jmp         | Jump to an address.                         | `jmp 0x1234` | `goto 0x1234` |
135 | | call        | Call a function.                            | `call 0x1234` | `func()` |
136 | | ret         | Return a value.                             | `ret` | `return rax` |
137 | 
138 | </details>
139 | <details>
140 | <summary>Registers</summary>
141 | 
142 | ### Registers
143 | 
144 | Registers are like variables in machine code.
145 | You're free to use registers in whatever way, but a [reference implementation](https://github.com/k0kubun/ruby-jit-challenge/blob/k0kubun/lib/jit/compiler.rb)
146 | used only the following registers.
147 | 
148 | | Register | Purpose |
149 | |:---------|:--------|
150 | | rdi      | `ec` (execution context) is set when a JIT function is called. It represents a Ruby thread. Used when you push/pop a stack frame. |
151 | | rsi      | `cfp` (control frame pointer) is set when a JIT function is called. It represents a stack frame. Used when you fetch a local variable or a receiver. |
 152 | | rax      | A JIT function return value to be set before `ret` instruction. It can also be used as a "scratch register" to hold temporary values. |
153 | | r8       | A general-purpose register. The reference implementation used this for the 1st slot of the Ruby VM stack, `stack[0]`. |
154 | | r9       | A general-purpose register. The reference implementation used this for the 2nd slot of the Ruby VM stack, `stack[1]`. |
155 | | r10      | A general-purpose register. The reference implementation used this for the 3rd slot of the Ruby VM stack, `stack[2]`. |
156 | | r11      | A general-purpose register. The reference implementation used this for the 4th slot of the Ruby VM stack, `stack[3]`. |
157 | 
158 | </details>
159 | <details>
160 | <summary>Compiling putnil</summary>
161 | 
162 | ### Compiling putnil
163 | 
164 | Open [lib/jit/compiler.rb](./lib/jit/compiler.rb) and add a case for `putnil`.
165 | 
166 | ```diff
167 |        # Iterate over each YARV instruction.
168 |        insn_index = 0
169 |        while insn_index < iseq.body.iseq_size
170 |          insn = INSNS.fetch(C.rb_vm_insn_decode(iseq.body.iseq_encoded[insn_index]))
171 |          case insn.name
172 |          in :nop
173 |            # none
174 | +        in :putnil
175 | +          # ...
176 |          end
177 |          insn_index += insn.len
178 |        end
179 | ```
180 | 
181 | Let's push `nil` onto the stack.
182 | In the scope of this tutorial, it's enough to use a random register as a replacement for a stack slot.
183 | 
 184 | If you decide to use `r8` for `stack[0]`, you could write the code as follows, for example.
185 | 
186 | ```diff
187 | +      STACK = [:r8]
188 | 
189 |        # Iterate over each YARV instruction.
190 |        insn_index = 0
191 | +      stack_size = 0
192 |        while insn_index < iseq.body.iseq_size
193 |          insn = INSNS.fetch(C.rb_vm_insn_decode(iseq.body.iseq_encoded[insn_index]))
194 |          case insn.name
195 |          in :nop
196 |            # none
197 |          in :putnil
198 | +          asm.mov(STACK[stack_size], C.to_value(nil))
199 | +          stack_size += 1
200 |          end
201 |          insn_index += insn.len
202 |        end
203 | ```
204 | 
205 | `C` is a module with useful helpers to write a JIT.
206 | `C.to_value` converts any Ruby object into its representation in the C language (and machine code).
207 | 
208 | `C.to_value(nil)` is 4, so this does `asm.mov(:r8, 4)`, which means `stack[0] = nil`.
209 | This value in `r8` should be then handled by subsequent instructions like `leave`.
210 | 
211 | </details>
212 | <details>
213 | <summary>Compiling leave</summary>
214 | 
215 | ### Compiling leave
216 | 
217 | `leave` instruction needs to do two things.
218 | 
219 | 1. Pop a stack frame
220 | 2. Return a value
221 | 
222 | A JIT function is called after a corresponding stack frame is pushed.
223 | However, the Ruby VM is not responsible for popping the stack frame after calling the JIT function.
224 | So a JIT function needs to pop it on `leave` instruction.
225 | 
226 | A stack frame `cfp` is in `rsi`. The interpreter reads `ec->cfp` to fetch the current stack frame and `ec` is in `rdi`.
227 | Therefore, you can generate code to pop a stack frame as follows.
228 | 
229 | ```diff
230 |        STACK = [:r8]
231 | +      EC = :rdi
232 | +      CFP = :rsi
233 | 
234 |        # Iterate over each YARV instruction.
235 |        insn_index = 0
236 |        stack_size = 0
237 |        while insn_index < iseq.body.iseq_size
238 |          insn = INSNS.fetch(C.rb_vm_insn_decode(iseq.body.iseq_encoded[insn_index]))
239 |          case insn.name
240 |          in :nop
241 |            # none
242 |          in :putnil
243 |            asm.mov(STACK[stack_size], C.to_value(nil))
244 |            stack_size += 1
245 | +        in :leave
246 | +          asm.add(CFP, C.rb_control_frame_t.size)
247 | +          asm.mov([EC, C.rb_execution_context_t.offsetof(:cfp)], CFP)
248 |          end
249 |          insn_index += insn.len
250 |        end
251 | ```
252 | 
253 | The `cfp` grows downward; `cfp -= 1` pushes a frame, and `cfp += 1` pops a frame.
254 | Here, we want to pop a frame, so we do `cfp += 1`.
255 | When we increment a pointer, `1` actually means the size of what it points to.
256 | `cfp` is called `rb_control_frame_t` in the Ruby VM, and you can get its size by `C.rb_control_frame_t.size`.
257 | 
258 | To set that to `ec->cfp`, you need to get a memory address based off of `ec`.
259 | The offset of `ec->cfp` relative to the head of `ec` is in `C.rb_execution_context_t.offsetof(:cfp)`.
260 | So you can use `[EC, C.rb_execution_context_t.offsetof(:cfp)]` to get `ec->cfp`.
261 | 
262 | Finally, we'll return a value from the JIT function.
 263 | You should move the stack-top value into `rax` and then emit a `ret` instruction.
264 | 
265 | ```diff
266 |        # Iterate over each YARV instruction.
267 |        insn_index = 0
268 |        stack_size = 0
269 |        while insn_index < iseq.body.iseq_size
270 |          insn = INSNS.fetch(C.rb_vm_insn_decode(iseq.body.iseq_encoded[insn_index]))
271 |          case insn.name
272 |          in :nop
273 |            # none
274 |          in :putnil
275 |            asm.mov(STACK[stack_size], C.to_value(nil))
276 |            stack_size += 1
277 |          in :leave
278 |            asm.add(CFP, C.rb_control_frame_t.size)
279 |            asm.mov([EC, C.rb_execution_context_t.offsetof(:cfp)], CFP)
280 | +          asm.mov(:rax, STACK[stack_size - 1])
281 | +          asm.ret
282 |          end
283 |          insn_index += insn.len
284 |        end
285 | ```
286 | 
287 | Now you should be able to execute `test/none.rb`. Test it as follows.
288 | 
289 | ```
290 | $ bin/ruby --rjit-dump-disasm test/none.rb
291 |   0x564e87d2c000: mov r8, 4
292 |   0x564e87d2c007: add rsi, 0x40
293 |   0x564e87d2c00b: mov qword ptr [rdi + 0x10], rsi
294 |   0x564e87d2c00f: mov rax, r8
295 |   0x564e87d2c012: ret
296 | 
297 | nil
298 | ```
299 | 
300 | `rake test` should pass one test that runs `test/none.rb`.
301 | 
302 | Also try changing what you're giving to `C.to_value` in `putnil` to double-check
303 | the interpreter is calling the JIT function you generated.
304 | 
305 | </details>
306 | 
307 | ## 2. Compile 1 + 2
308 | 
309 | Next, we'll compile something more interesting: `Integer#+`.
310 | 
311 | ```rb
312 | def plus
313 |   1 + 2
314 | end
315 | ```
316 | 
317 | ### --dump=insns
318 | 
319 | ```
320 | $ ruby --dump=insns test/plus.rb
321 | ...
322 | == disasm: #<ISeq:plus@test/plus.rb:1 (1,0)-(3,3)>
323 | 0000 putobject_INT2FIX_1_                                             (   2)[LiCa]
324 | 0001 putobject                              2
325 | 0003 opt_plus                               <calldata!mid:+, argc:1, ARGS_SIMPLE>[CcCr]
326 | 0005 leave                                                            (   3)[Re]
327 | ```
328 | 
329 | `plus` has four instructions: `putobject_INT2FIX_1_`, `putobject`, `opt_plus`, and `leave`.
330 | 
331 | `putobject_INT2FIX_1_` is "operand unification" of `putobject 1`.
332 | `putnil` and `leave` didn't take any arguments, but `putobject` does.
333 | We call an argument of instructions an operand.
334 | At `0001`, there's `putobject` instruction, and its operand `2` is at `0002` before `opt_plus` at `0003`.
335 | At `0000`, there's `putobject_INT2FIX_1_` instruction, and its operand `INT2FIX(1)` is unified with `putobject`,
336 | so it doesn't take an operand, which makes the ISeq shorter.
337 | 
338 | `putobject` (and `putobject_INT2FIX_1_`) pushes an operand to the stack.
339 | Both instructions and operands are in `iseq.body.iseq_encoded`.
340 | To get an operand for `0001 putobject` which is at `0002`, you need to look at `iseq.body.iseq_encoded[2]`.
341 | So that works like `stack << iseq.body.iseq_encoded[2]`.
342 | 
343 | `opt_plus` pops two objects from the stack, calls `#+`, and pushes the result onto the stack.
344 | So it's `stack << stack.pop + stack.pop`.
345 | 
346 | <details>
347 | <summary>Compiling putobject</summary>
348 | 
349 | ### Compiling putobject
350 | 
351 | For `putobject_INT2FIX_1_`, you need to hard-code the operand as `1`.
352 | Instead of `INT2FIX(1)` that is used in C, you can use `C.to_value(1)` instead.
353 | So it can be:
354 | 
355 | ```rb
356 | STACK = [:r8, :r9]
357 | 
358 | in :putobject_INT2FIX_1_
359 |   asm.mov(STACK[stack_size], C.to_value(1))
360 |   stack_size += 1
361 | ```
362 | 
363 | For `putobject`, you need to get an operand from `iseq.body.iseq_encoded` as explained above.
364 | You could write:
365 | 
366 | ```rb
367 | in :putobject
368 |   operand = iseq.body.iseq_encoded[insn_index + 1]
369 |   asm.mov(STACK[stack_size], operand)
370 |   stack_size += 1
371 | ```
372 | 
373 | </details>
374 | 
375 | <details>
376 | <summary>Compiling opt_plus</summary>
377 | 
378 | ### Compiling opt\_plus
379 | 
380 | `opt_plus` is capable of handling any `#+` methods, but specifically optimizes a few methods such as `Integer#+`.
381 | In this tutorial, we're going to handle only `Integer`s. It's okay to assume operands are all `Integer`s.
382 | 
383 | In CRuby, a small-enough `Integer` is expressed as `(num << 1) + 1`.
384 | So an `Integer` object `1` is expressed as `(1 << 1) + 1`, which is `3`.
385 | 
386 | You'll take `(num1 << 1) + 1` and `(num2 << 1) + 1` as operands.
387 | If you just add them, the result will be `((num1 + num2) << 1) + 2`.
388 | The actual representation for `num1 + num2` is `((num1 + num2) << 1) + 1`,
389 | so you'll need to subtract 1 from the sum.
390 | 
391 | Here's an example implementation.
392 | 
393 | ```rb
394 | in :opt_plus
395 |   recv = STACK[stack_size - 2]
396 |   obj = STACK[stack_size - 1]
397 | 
398 |   asm.add(recv, obj)
399 |   asm.sub(recv, 1)
400 | 
401 |   stack_size -= 1
402 | ```
403 | 
404 | Test those instructions with `bin/ruby --rjit-dump-disasm test/plus.rb`.
405 | 
406 | </details>
407 | 
408 | ## 3. Compile fibonacci
409 | 
410 | Finally, we'll have a look at the benchmark target, Fibonacci.
411 | 
412 | ```rb
413 | def fib(n)
414 |   if n < 2
415 |     return n
416 |   end
417 |   return fib(n-1) + fib(n-2)
418 | end
419 | ```
420 | 
421 | ### --dump=insns
422 | 
423 | ```
424 | $ ruby --dump=insns test/fib.rb
425 | ...
426 | == disasm: #<ISeq:fib@test/fib.rb:1 (1,0)-(6,3)>
427 | local table (size: 1, argc: 1 [opts: 0, rest: -1, post: 0, block: -1, kw: -1@-1, kwrest: -1])
428 | [ 1] n@0<Arg>
429 | 0000 getlocal_WC_0                          n@0                       (   2)[LiCa]
430 | 0002 putobject                              2
431 | 0004 opt_lt                                 <calldata!mid:<, argc:1, ARGS_SIMPLE>[CcCr]
432 | 0006 branchunless                           11
433 | 0008 getlocal_WC_0                          n@0                       (   3)[Li]
434 | 0010 leave                                  [Re]
435 | 0011 putself                                                          (   5)[Li]
436 | 0012 getlocal_WC_0                          n@0
437 | 0014 putobject_INT2FIX_1_
438 | 0015 opt_minus                              <calldata!mid:-, argc:1, ARGS_SIMPLE>[CcCr]
439 | 0017 opt_send_without_block                 <calldata!mid:fib, argc:1, FCALL|ARGS_SIMPLE>
440 | 0019 putself
441 | 0020 getlocal_WC_0                          n@0
442 | 0022 putobject                              2
443 | 0024 opt_minus                              <calldata!mid:-, argc:1, ARGS_SIMPLE>[CcCr]
444 | 0026 opt_send_without_block                 <calldata!mid:fib, argc:1, FCALL|ARGS_SIMPLE>
445 | 0028 opt_plus                               <calldata!mid:+, argc:1, ARGS_SIMPLE>[CcCr]
446 | 0030 leave                                                            (   6)[Re]
447 | ```
448 | 
449 | `fib` has many more instructions.
450 | 
451 | `opt_minus` and `opt_lt` are like `opt_plus` except that they perform `#-` and `#<`, respectively.
452 | 
453 | `getlocal_WC_0` is operand unification of `getlocal *, 0` where `WC` stands for a wildcard.
454 | It pushes a local variable onto the stack.
455 | 
456 | `branchunless` jumps to a destination specified by an operand when a stack-top value is
457 | false or nil.
458 | 
459 | `putself` pushes a receiver onto the stack.
460 | 
461 | `opt_send_without_block` calls a method with a receiver and arguments on the stack.
462 | 
463 | <details>
464 | <summary>Compiling opt_minus</summary>
465 | 
466 | ### Compiling opt\_minus
467 | 
468 | Remember `opt_plus`.
469 | You'll take `(num1 << 1) + 1` and `(num2 << 1) + 1` as operands.
470 | If you subtract the second from the first, the result will be `((num1 - num2) << 1)`.
471 | But the actual representation for `num1 - num2` is `((num1 - num2) << 1) + 1`.
472 | So you'll need to add 1 to it.
473 | 
474 | Here's an example implementation.
475 | 
476 | ```rb
477 | STACK = [:r8, :r9, :r10, :r11]
478 | 
479 | in :opt_minus
480 |   recv = STACK[stack_size - 2]
481 |   obj = STACK[stack_size - 1]
482 | 
483 |   asm.sub(recv, obj)
484 |   asm.add(recv, 1)
485 | 
486 |   stack_size -= 1
487 | ```
488 | 
489 | Test the instruction with `bin/ruby --rjit-dump-disasm test/minus.rb`.
490 | 
491 | </details>
492 | 
493 | <details>
494 | <summary>Compiling getlocal</summary>
495 | 
496 | ### Compiling getlocal
497 | 
498 | `getlocal_WC_0` means `getlocal *, 0`. The `*` part is an operand that holds the index of the local variable relative to an "environment pointer" (EP).
499 | The `0` part is a "level", which shows how many levels of EPs you need to go deeper to get a local variable.
500 | This is needed when a local variable environment is nested, e.g. a block inside a method.
501 | Since it's `0` this time, you will not need to worry about digging EPs. You'll need to get the EP of the current "control frame" (`cfp`).
502 | 
503 | `cfp` is in `rsi` and you can get the offset to `cfp->ep` from `C.rb_control_frame_t.offsetof(:ep)`.
504 | So `[:rsi, C.rb_control_frame_t.offsetof(:ep)]` can be used to get an EP.
505 | 
506 | Once you get an EP, you need to find a local variable. The index is an operand, which can be fetched with `iseq.body.iseq_encoded[insn_index + 1]`.
507 | The index is a positive number but local variables actually live "below" the EP. So you have to negate the index.
508 | Besides, indexes are counted in units of `VALUE`, the C type that represents a Ruby object. So the byte offset of a local variable from an EP is
509 | `-iseq.body.iseq_encoded[insn_index + 1] * C.VALUE.size`.
510 | 
511 | All in all, an example implementation looks like this.
512 | 
513 | ```rb
514 | in :getlocal_WC_0
515 |   # Get EP
516 |   asm.mov(:rax, [CFP, C.rb_control_frame_t.offsetof(:ep)])
517 | 
518 |   # Load the local variable
519 |   idx = iseq.body.iseq_encoded[insn_index + 1]
520 |   asm.mov(STACK[stack_size], [:rax, -idx * C.VALUE.size])
521 | 
522 |   stack_size += 1
523 | ```
524 | 
525 | Test the instruction with `bin/ruby --rjit-dump-disasm test/local.rb`.
526 | 
527 | </details>
528 | 
529 | <details>
530 | <summary>Compiling opt_lt</summary>
531 | 
532 | ### Compiling opt\_lt
533 | 
534 | Again, assume operands are `Integer`s.
535 | Comparing `(num1 << 1) + 1` and `(num2 << 1) + 1` would return the same result as comparing `num1` and `num2`.
536 | You'll use a `cmp` instruction that compares them.
537 | 
538 | Once you compare the values, you'll need to generate code that conditionally returns something.
539 | `Integer#<` returns `true` or `false`.
540 | There's a family of instructions that conditionally set a value based on a prior `cmp` (or `test`).
541 | To conditionally set a value if `num1 < num2` holds based on the previous `cmp`,
542 | you can use `cmovl` (conditionally move if less).
543 | 
544 | An example implementation is as follows.
545 | 
546 | ```rb
547 | in :opt_lt
548 |   recv = STACK[stack_size - 2]
549 |   obj = STACK[stack_size - 1]
550 | 
551 |   asm.cmp(recv, obj)
552 |   asm.mov(recv, C.to_value(false))
553 |   asm.mov(:rax, C.to_value(true))
554 |   asm.cmovl(recv, :rax)
555 | 
556 |   stack_size -= 1
557 | ```
558 | 
559 | Test the instruction with `bin/ruby --rjit-dump-disasm test/lt.rb`.
560 | 
561 | </details>
562 | 
563 | <details>
564 | <summary>Compiling putself</summary>
565 | 
566 | ### Compiling putself
567 | 
568 | The `fib` method is called without an explicit receiver. In that case, Ruby implicitly uses the receiver of the current frame (`cfp`).
569 | `cfp` is in `rsi`, and the offset to `cfp->self` (receiver) is available as `C.rb_control_frame_t.offsetof(:self)`.
570 | So `[:rsi, C.rb_control_frame_t.offsetof(:self)]` can be used to fetch the receiver.
571 | 
572 | An example implementation looks like this.
573 | 
574 | ```rb
575 | in :putself
576 |   asm.mov(STACK[stack_size], [CFP, C.rb_control_frame_t.offsetof(:self)])
577 |   stack_size += 1
578 | ```
579 | 
580 | </details>
581 | 
582 | <details>
583 | <summary>Compiling opt_send_without_block</summary>
584 | 
585 | ### Compiling opt\_send\_without\_block
586 | 
587 | Congratulations on making it to this stage. You've accomplished a lot already.
588 | I hope you've enjoyed your journey.
589 | We're going to tackle a couple of instructions that may be the most challenging part in this tutorial.
590 | If you get lost, consider just copying the code that is shown later and playing with it.
591 | 
592 | `opt_send_without_block` supports various method calls.
593 | However, in this tutorial, it's okay to assume any method call is a Ruby method call.
594 | 
595 | As long as you use `--rjit-call-threshold=3` (compile methods that have been called three times),
596 | the cache of all `opt_send_without_block` instructions is "warmed up" in all test scripts.
597 | It means that the cache has a reference to an ISeq. For simplicity in this tutorial,
598 | assume that it's not gonna change and you won't need to invalidate it.
599 | 
600 | `opt_send_without_block` takes a "call data" operand, which is a pair of "call info" and "call cache".
601 | A call data object can be instantiated with `cd = C.rb_call_data.new(iseq.body.iseq_encoded[insn_index + 1])`.
602 | 
603 | A call info is in `cd.ci`, which has information like the number of arguments.
604 | `ci` has a packed data structure which cannot be accessed like a normal struct.
605 | So you need to get the number of arguments using a special helper, `C.vm_ci_argc(ci)`.
606 | 
607 | A call cache has a reference to an ISeq. `cd.cc.cme_.def.body.iseq.iseqptr` has a callee ISeq.
608 | For better performance, we want to compile everything and directly jump to an already-compiled address.
609 | You can call `compile(callee_iseq)` if `callee_iseq.body.jit_func` is still `0` (NULL in C).
610 | 
611 | Once a callee function becomes ready, we need to prepare for calling a method.
612 | Since our `getlocal` implementation gets a local variable on the stack relative to an EP,
613 | we have to set arguments to the stack, which are local variables to the callee.
614 | 
615 | The VM stack looks like this when you call a method.
616 | 
617 | ```
618 | | locals | cme | block_handler | frame type (callee EP) | stack bottom (callee SP) |
619 | ```
620 | 
621 | For locals, we want to put arguments. There's a "stack pointer" in `SP` which points to
622 | a free stack slot above the stack top. You could write values to it and keep bumping the SP until you finish writing all arguments.
623 | Once it's done, SP needs to be bumped three more times to accommodate a "cme" (callable method entry), a block handler, and a frame type.
624 | You don't need to use them in this tutorial. Just bump SP by 3 to get a callee SP. EP is one slot below that.
625 | 
626 | Set those `sp` and `ep` fields to a callee `cfp` after bumping `cfp`.
627 | Remember what you did for the `leave` instruction; pushing a frame means subtracting `C.rb_control_frame_t.size` from `cfp`.
628 | Since `putself` refers to it, you may set `cfp->self` as well, using `C.rb_control_frame_t.offsetof(:self)`. 
629 | Note, however, that we don't actually use the receiver in `cfp` for method dispatch. You may just skip it.
630 | 
631 | Before and after calling a callee function, you have to save and restore registers you're using for the stack
632 | so that the callee function can use them.
633 | We've used `r8`, `r9`, `r10`, and `r11` as `STACK`. You can use `push` instruction to push a register to the machine stack,
634 | and then use `pop` instruction in the reverse order to restore a register from the machine stack.
635 | 
636 | An example implementation looks like this.
637 | 
638 | ```rb
639 | in :opt_send_without_block
640 |   # Compile the callee ISEQ
641 |   cd = C.rb_call_data.new(iseq.body.iseq_encoded[insn_index + 1])
642 |   callee_iseq = cd.cc.cme_.def.body.iseq.iseqptr
643 |   if callee_iseq.body.jit_func == 0
644 |     compile(callee_iseq)
645 |   end
646 | 
647 |   # Get SP
648 |   asm.mov(:rax, [CFP, C.rb_control_frame_t.offsetof(:sp)])
649 |   # Spill arguments
650 |   C.vm_ci_argc(cd.ci).times do |i|
651 |     asm.mov([:rax, C.VALUE.size * i], STACK[stack_size - C.vm_ci_argc(cd.ci) + i])
652 |   end
653 | 
654 |   # Push cfp: ec->cfp = cfp - 1
655 |   asm.sub(CFP, C.rb_control_frame_t.size)
656 |   asm.mov([EC, C.rb_execution_context_t.offsetof(:cfp)], CFP)
657 |   # Set SP
658 |   asm.add(:rax, C.VALUE.size * (C.vm_ci_argc(cd.ci) + 3))
659 |   asm.mov([CFP, C.rb_control_frame_t.offsetof(:sp)], :rax)
660 |   # Set EP
661 |   asm.sub(:rax, C.VALUE.size)
662 |   asm.mov([CFP, C.rb_control_frame_t.offsetof(:ep)], :rax)
663 |   # Set receiver
664 |   asm.mov(:rax, STACK[stack_size - C.vm_ci_argc(cd.ci) - 1])
665 |   asm.mov([CFP, C.rb_control_frame_t.offsetof(:self)], :rax)
666 | 
667 |   # Save stack registers
668 |   STACK.each do |reg|
669 |     asm.push(reg)
670 |   end
671 | 
672 |   # Call the JIT func
673 |   asm.call(callee_iseq.body.jit_func)
674 | 
675 |   # Pop stack registers
676 |   STACK.reverse_each do |reg|
677 |     asm.pop(reg)
678 |   end
679 | 
680 |   # Set a return value
681 |   asm.mov(STACK[stack_size - C.vm_ci_argc(cd.ci) - 1], :rax)
682 | 
683 |   stack_size -= C.vm_ci_argc(cd.ci)
684 | ```
685 | 
686 | Test the instruction with `bin/ruby --rjit-dump-disasm test/send.rb`.
687 | 
688 | This code has some optimization opportunities when you need to support only `fib`.
689 | In fact, my [reference implementation](https://github.com/k0kubun/ruby-jit-challenge/blob/k0kubun/lib/jit/compiler.rb)
690 | is already a bit faster than that. It could be even faster, for example, if you use registers for local variables.
691 | 
692 | </details>
693 | 
694 | <details>
695 | <summary>Compiling branchunless</summary>
696 | 
697 | ### Compiling branchunless
698 | 
699 | It's almost there. This will be the last instruction you'll compile to run `fib`.
700 | This is probably the most interesting and challenging part of this tutorial.
701 | 
702 | Supporting this instruction requires a major refactoring of the boilerplate code.
703 | That's because the previous test scripts ran instructions from top to bottom, whereas
704 | now you need to jump to different instruction indexes based on runtime values.
705 | 
706 | Jump support is not the only difficulty; there's also complexity in the dependencies between blocks.
707 | Let's have a look at `ruby --dump=insns test/branch.rb`.
708 | 
709 | ```
710 | == disasm: #<ISeq:branch@test/branch.rb:1 (1,0)-(7,3)>
711 | local table (size: 1, argc: 1 [opts: 0, rest: -1, post: 0, block: -1, kw: -1@-1, kwrest: -1])
712 | [ 1] flag@0<Arg>
713 | 0000 getlocal_WC_0                          flag@0                    (   2)[LiCa]
714 | 0002 branchunless                           6
715 | 
716 | 0004 putobject_INT2FIX_1_                                             (   3)[Li]
717 | 0005 leave                                                            (   7)[Re]
718 | 
719 | 0006 putobject_INT2FIX_0_                                             (   5)[Li]
720 | 0007 leave                                                            (   7)[Re]
721 | ```
722 | 
723 | I inserted newlines into the actual output to indicate "basic block" boundaries.
724 | There are three blocks: the first block from `0000`, the second block from `0004`, and the third block from `0006`.
725 | 
726 | Let's say you start by compiling the first block. You'll need to generate code that jumps to the second block or the third block.
727 | However, the second block and the third block have not been compiled yet, so their addresses are unknown. You cannot simply compile from top to bottom as before.
728 | 
729 | Then why not compile the second and third blocks first, and then compile the first block?
730 | Sure, it works for this example. But what if the second block calls the first block?
731 | It's a circular dependency. And it's exactly what `fib` does.
732 | So you have to design the compiler in a way that it supports circular dependencies.
733 | 
734 | One suggested solution is to write out dummy addresses first, and then rewrite them after all blocks are compiled.
735 | Rewriting a past address requires you to figure out the address that `Assembler` used.
736 | The `Assembler` in the boilerplate doesn't have such an interface, so you have to define it yourself.
737 | 
738 | For example, you could add this kind of interface.
739 | 
740 | ```diff
741 | --- a/lib/jit/assembler.rb
742 | +++ b/lib/jit/assembler.rb
743 | @@ -50,6 +50,7 @@ module JIT
744 |      end
745 | 
746 |      def assemble(addr)
747 | +      set_start_addrs(addr)
748 |        resolve_rel32(addr)
749 |        resolve_labels
750 | 
751 | @@ -905,6 +876,12 @@ module JIT
752 |        @labels[label] = @bytes.size
753 |      end
754 | 
755 | +    # Mark the starting addresses of a branch
756 | +    def branch(branch)
757 | +      @branches[@bytes.size] << branch
758 | +      yield
759 | +    end
760 | +
761 |      private
762 | 
763 |      def insn(prefix: 0, opcode:, rd: nil, mod_rm: nil, disp: nil, imm: nil)
764 | @@ -1010,6 +987,14 @@ module JIT
765 |        [Rel32.new(addr), Rel32Pad, Rel32Pad, Rel32Pad]
766 |      end
767 | 
768 | +    def set_start_addrs(write_addr)
769 | +      (@bytes.size + 1).times do |index|
770 | +        @branches.fetch(index, []).each do |branch|
771 | +          branch.start_addr = write_addr + index
772 | +        end
773 | +      end
774 | +    end
775 | ```
776 | 
777 | Then any object you give to `#branch` will get its `start_addr` assigned when the code is assembled.
778 | If the object also has a Proc to re-compile a branch, you can just buffer those objects
779 | and call their Procs later.
780 | 
781 | To simplify the problem, you could split an ISeq into basic blocks, and just compile
782 | each block as before. Here's an example logic that works for the test scripts in this tutorial.
783 | 
784 | ```rb
785 | # Get a list of basic blocks in a method
786 | def split_blocks(iseq, insn_index: 0, stack_size: 0, split_indexes: [])
787 |   return [] if split_indexes.include?(insn_index)
788 |   split_indexes << insn_index
789 | 
790 |   block = { start_index: insn_index, end_index: nil, stack_size: }
791 |   blocks = [block]
792 | 
793 |   while insn_index < iseq.body.iseq_size
794 |     insn = INSNS.fetch(C.rb_vm_insn_decode(iseq.body.iseq_encoded[insn_index]))
795 |     case insn.name
796 |     when :branchunless
797 |       block[:end_index] = insn_index
798 |       stack_size += sp_inc(iseq, insn_index)
799 |       next_index = insn_index + insn.len
800 |       blocks += split_blocks(iseq, insn_index: next_index, stack_size:, split_indexes:)
801 |       blocks += split_blocks(iseq, insn_index: next_index + iseq.body.iseq_encoded[insn_index + 1], stack_size:, split_indexes:)
802 |       break
803 |     when :leave
804 |       block[:end_index] = insn_index
805 |       break
806 |     else
807 |       stack_size += sp_inc(iseq, insn_index)
808 |       insn_index += insn.len
809 |     end
810 |   end
811 | 
812 |   blocks
813 | end
814 | 
815 | # Get a stack size increase for a YARV instruction.
816 | def sp_inc(iseq, insn_index)
817 |   insn = INSNS.fetch(C.rb_vm_insn_decode(iseq.body.iseq_encoded[insn_index]))
818 |   case insn.name
819 |   in :opt_plus | :opt_minus | :opt_lt | :leave | :branchunless
820 |     -1
821 |   in :nop
822 |     0
823 |   in :putnil | :putobject_INT2FIX_0_ | :putobject_INT2FIX_1_ | :putobject | :putself | :getlocal_WC_0
824 |     1
825 |   in :opt_send_without_block
826 |     cd = C.rb_call_data.new(iseq.body.iseq_encoded[insn_index + 1])
827 |     -C.vm_ci_argc(cd.ci)
828 |   end
829 | end
830 | ```
831 | 
832 | Each block is represented as a Hash that has `start_index`, `end_index`, and an initial `stack_size`.
833 | The first block's first address should be set to `iseq.body.jit_func`.
834 | 
835 | Finally, let's compile `branchunless`. With `blocks` made by `split_blocks` and `branches = []`, an example implementation
836 | looks like this.
837 | 
838 | ```rb
839 | Branch = Struct.new(:start_addr, :compile)
840 | 
841 | in :branchunless
842 |   next_index = insn_index + insn.len
843 |   next_block = blocks.find { |block| block[:start_index] == next_index }
844 | 
845 |   jump_index = next_index + iseq.body.iseq_encoded[insn_index + 1]
846 |   jump_block = blocks.find { |block| block[:start_index] == jump_index }
847 | 
848 |   # This `test` sets ZF only for Qnil and Qfalse, which lets jz jump.
849 |   asm.test(STACK[stack_size - 1], ~C.to_value(nil))
850 | 
851 |   branch = Branch.new
852 |   branch.compile = proc do |asm|
853 |     dummy_addr = @jit_buf + JIT_BUF_SIZE
854 |     asm.jz(jump_block.fetch(:start_addr, dummy_addr))
855 |     asm.jmp(next_block.fetch(:start_addr, dummy_addr))
856 |   end
857 |   asm.branch(branch) do
858 |     branch.compile.call(asm)
859 |   end
860 |   branches << branch
861 | ```
862 | 
863 | The `branches` are then re-compiled with:
864 | 
865 | ```rb
866 | branches.each do |branch|
867 |   with_addr(branch[:start_addr]) do
868 |     asm = Assembler.new
869 |     branch.compile.call(asm)
870 |     write(asm)
871 |   end
872 | end
873 | ```
874 | 
875 | ```rb
876 | def with_addr(addr)
877 |   jit_pos = @jit_pos
878 |   @jit_pos = addr - @jit_buf
879 |   yield
880 | ensure
881 |   @jit_pos = jit_pos
882 | end
883 | ```
884 | 
885 | That's all. Test it with `bin/ruby --rjit-dump-disasm test/branch.rb`.
886 | If everything is done correctly, `bin/ruby test/fib.rb` should also work.
887 | 
888 | </details>
889 | 
890 | ## 4. Benchmark
891 | 
892 | Let's measure the performance.
893 | [bin/bench](./bin/bench) allows you to compare your JIT (ruby-jit) and other CRuby JITs.
894 | 
895 | ```
896 | $ bin/bench
897 | Calculating -------------------------------------
898 |                          no-jit        rjit        yjit    ruby-jit
899 |              fib(32)      5.250      19.481      32.841      58.145 i/s
900 | 
901 | Comparison:
902 |                           fib(32)
903 |             ruby-jit:        58.1 i/s
904 |                 yjit:        32.8 i/s - 1.77x  slower
905 |                 rjit:        19.5 i/s - 2.98x  slower
906 |               no-jit:         5.2 i/s - 11.08x  slower
907 | ```
908 | 


--------------------------------------------------------------------------------
/lib/jit/assembler.rb:
--------------------------------------------------------------------------------
   1 | # frozen_string_literal: true
   2 | module JIT
   3 |   # 8-bit memory access
   4 |   class BytePtr < Data.define(:reg, :disp); end
   5 | 
   6 |   # 32-bit memory access
   7 |   class DwordPtr < Data.define(:reg, :disp); end
   8 | 
   9 |   # 64-bit memory access
  10 |   QwordPtr = Array
  11 | 
  12 |   # SystemV x64 calling convention
  13 |   C_ARGS = [:rdi, :rsi, :rdx, :rcx, :r8, :r9]
  14 |   C_RET  = :rax
  15 | 
  16 |   # https://cdrdv2.intel.com/v1/dl/getContent/671110
  17 |   # Mostly an x86_64 assembler, but this also has some stuff that is useful for any architecture.
  18 |   class Assembler
  19 |     # rel8 jumps are made with labels
  20 |     class Label < Data.define(:id, :name); end
  21 | 
  22 |     # rel32 is inserted as [Rel32, Rel32Pad..] and converted on #resolve_rel32
  23 |     class Rel32 < Data.define(:addr); end
  24 |     Rel32Pad = Object.new
  25 | 
  26 |     # A set of ModR/M values encoded on #insn
  27 |     class ModRM < Data.define(:mod, :reg, :rm); end
  28 |     Mod00 = 0b00 # Mod 00: [reg]
  29 |     Mod01 = 0b01 # Mod 01: [reg]+disp8
  30 |     Mod10 = 0b10 # Mod 10: [reg]+disp32
  31 |     Mod11 = 0b11 # Mod 11: reg
  32 | 
  33 |     # REX =   0100WR0B
  34 |     REX_B = 0b01000001
  35 |     REX_R = 0b01000100
  36 |     REX_W = 0b01001000
  37 | 
  38 |     # Operand matchers
  39 |     R32   = -> (op) { op.is_a?(Symbol) && r32?(op) }
  40 |     R64   = -> (op) { op.is_a?(Symbol) && r64?(op) }
  41 |     IMM8  = -> (op) { op.is_a?(Integer) && imm8?(op) }
  42 |     IMM32 = -> (op) { op.is_a?(Integer) && imm32?(op) }
  43 |     IMM64 = -> (op) { op.is_a?(Integer) && imm64?(op) }
  44 | 
  45 |     def initialize # Start with an empty instruction buffer
  46 |       @bytes = [] # buffered output; #assemble resolves it, writes it, then clears it
  47 |       @labels = {}
  48 |       @label_id = 0
  49 |       @branches = Hash.new { |h, k| h[k] = [] } # missing keys default to a fresh Array stored under that key
  50 |     end
  51 | 
  52 |     def assemble(addr) # Resolve rel32 placeholders and labels, call write_bytes(addr), and return the buffer size
  53 |       resolve_rel32(addr)
  54 |       resolve_labels
  55 | 
  56 |       write_bytes(addr)
  57 | 
  58 |       @bytes.size # return value; evaluated before the ensure clause empties @bytes
  59 |     ensure
  60 |       @bytes.clear # runs even when an error is raised above
  61 |     end
  62 | 
  63 |     def size # Number of entries currently buffered in @bytes
  64 |       @bytes.size
  65 |     end
  66 | 
  67 |     #
  68 |     # Instructions
  69 |     #
  70 | 
  71 |     # ADD: dst = dst + src (dst: [r64] or r64; src: imm8, imm32, or r64 as matched below)
  72 |     def add(dst, src)
  73 |       case [dst, src] # no else branch: any other operand combination raises NoMatchingPatternError
  74 |       # ADD r/m64, imm8 (Mod 00: [reg])
  75 |       in [QwordPtr[R64 => dst_reg], IMM8 => src_imm]
  76 |         # REX.W + 83 /0 ib
  77 |         # MI: Operand 1: ModRM:r/m (r, w), Operand 2: imm8/16/32
  78 |         insn(
  79 |           prefix: REX_W,
  80 |           opcode: 0x83,
  81 |           mod_rm: ModRM[mod: Mod00, reg: 0, rm: dst_reg],
  82 |           imm: imm8(src_imm),
  83 |         )
  84 |       # ADD r/m64, imm8 (Mod 11: reg)
  85 |       in [R64 => dst_reg, IMM8 => src_imm]
  86 |         # REX.W + 83 /0 ib
  87 |         # MI: Operand 1: ModRM:r/m (r, w), Operand 2: imm8/16/32
  88 |         insn(
  89 |           prefix: REX_W,
  90 |           opcode: 0x83,
  91 |           mod_rm: ModRM[mod: Mod11, reg: 0, rm: dst_reg],
  92 |           imm: imm8(src_imm),
  93 |         )
  94 |       # ADD r/m64, imm32 (Mod 11: reg); only reached when the IMM8 pattern above did not match
  95 |       in [R64 => dst_reg, IMM32 => src_imm]
  96 |         # REX.W + 81 /0 id
  97 |         # MI: Operand 1: ModRM:r/m (r, w), Operand 2: imm8/16/32
  98 |         insn(
  99 |           prefix: REX_W,
 100 |           opcode: 0x81,
 101 |           mod_rm: ModRM[mod: Mod11, reg: 0, rm: dst_reg],
 102 |           imm: imm32(src_imm),
 103 |         )
 104 |       # ADD r/m64, r64 (Mod 11: reg)
 105 |       in [R64 => dst_reg, R64 => src_reg]
 106 |         # REX.W + 01 /r
 107 |         # MR: Operand 1: ModRM:r/m (r, w), Operand 2: ModRM:reg (r)
 108 |         insn(
 109 |           prefix: REX_W,
 110 |           opcode: 0x01,
 111 |           mod_rm: ModRM[mod: Mod11, reg: src_reg, rm: dst_reg],
 112 |         )
 113 |       end
 114 |     end
 115 | 
 116 |     # AND: dst = dst & src (dst: r64; src: imm8, imm32, or [r64 + disp8] as matched below)
 117 |     def and(dst, src)
 118 |       case [dst, src] # no else branch: any other operand combination raises NoMatchingPatternError
 119 |       # AND r/m64, imm8 (Mod 11: reg)
 120 |       in [R64 => dst_reg, IMM8 => src_imm]
 121 |         # REX.W + 83 /4 ib
 122 |         # MI: Operand 1: ModRM:r/m (r, w), Operand 2: imm8/16/32
 123 |         insn(
 124 |           prefix: REX_W,
 125 |           opcode: 0x83,
 126 |           mod_rm: ModRM[mod: Mod11, reg: 4, rm: dst_reg],
 127 |           imm: imm8(src_imm),
 128 |         )
 129 |       # AND r/m64, imm32 (Mod 11: reg); only reached when the IMM8 pattern above did not match
 130 |       in [R64 => dst_reg, IMM32 => src_imm]
 131 |         # REX.W + 81 /4 id
 132 |         # MI: Operand 1: ModRM:r/m (r, w), Operand 2: imm8/16/32
 133 |         insn(
 134 |           prefix: REX_W,
 135 |           opcode: 0x81,
 136 |           mod_rm: ModRM[mod: Mod11, reg: 4, rm: dst_reg],
 137 |           imm: imm32(src_imm),
 138 |         )
 139 |       # AND r64, r/m64 (Mod 01: [reg]+disp8)
 140 |       in [R64 => dst_reg, QwordPtr[R64 => src_reg, IMM8 => src_disp]]
 141 |         # REX.W + 23 /r
 142 |         # RM: Operand 1: ModRM:reg (r, w), Operand 2: ModRM:r/m (r)
 143 |         insn(
 144 |           prefix: REX_W,
 145 |           opcode: 0x23,
 146 |           mod_rm: ModRM[mod: Mod01, reg: dst_reg, rm: src_reg],
 147 |           disp: imm8(src_disp),
 148 |         )
 149 |       end
 150 |     end
 151 | 
 152 |     # CALL: dst() (dst: an Integer address, emitted as rel32, or an r64 holding the target)
 153 |     def call(dst)
 154 |       case dst # no else branch: any other operand raises NoMatchingPatternError
 155 |       # CALL rel32
 156 |       in Integer => dst_addr
 157 |         # E8 cd
 158 |         # D: Operand 1: Offset
 159 |         insn(opcode: 0xe8, imm: rel32(dst_addr))
 160 |       # CALL r/m64 (Mod 11: reg)
 161 |       in R64 => dst_reg
 162 |         # FF /2
 163 |         # M: Operand 1: ModRM:r/m (r)
 164 |         insn(
 165 |           opcode: 0xff,
 166 |           mod_rm: ModRM[mod: Mod11, reg: 2, rm: dst_reg],
 167 |         )
 168 |       end
 169 |     end
 170 | 
 171 |     # CMOVE: dst = src if left == right (left/right as given to the preceding cmp)
 172 |     def cmove(dst, src)
 173 |       case [dst, src] # only the register-to-register form is supported; anything else raises NoMatchingPatternError
 174 |       # CMOVE r64, r/m64 (Mod 11: reg)
 175 |       in [R64 => dst_reg, R64 => src_reg]
 176 |         # REX.W + 0F 44 /r
 177 |         # RM: Operand 1: ModRM:reg (r, w), Operand 2: ModRM:r/m (r)
 178 |         insn(
 179 |           prefix: REX_W,
 180 |           opcode: [0x0f, 0x44],
 181 |           mod_rm: ModRM[mod: Mod11, reg: dst_reg, rm: src_reg],
 182 |         )
 183 |       end
 184 |     end
 185 | 
 186 |     # CMOVG: dst = src if left > right (left/right as given to the preceding cmp)
 187 |     def cmovg(dst, src)
 188 |       case [dst, src] # only the register-to-register form is supported; anything else raises NoMatchingPatternError
 189 |       # CMOVG r64, r/m64 (Mod 11: reg)
 190 |       in [R64 => dst_reg, R64 => src_reg]
 191 |         # REX.W + 0F 4F /r
 192 |         # RM: Operand 1: ModRM:reg (r, w), Operand 2: ModRM:r/m (r)
 193 |         insn(
 194 |           prefix: REX_W,
 195 |           opcode: [0x0f, 0x4f],
 196 |           mod_rm: ModRM[mod: Mod11, reg: dst_reg, rm: src_reg],
 197 |         )
 198 |       end
 199 |     end
 200 | 
 201 |     # CMOVGE: dst = src if left >= right (left/right as given to the preceding cmp)
 202 |     def cmovge(dst, src)
 203 |       case [dst, src] # only the register-to-register form is supported; anything else raises NoMatchingPatternError
 204 |       # CMOVGE r64, r/m64 (Mod 11: reg)
 205 |       in [R64 => dst_reg, R64 => src_reg]
 206 |         # REX.W + 0F 4D /r
 207 |         # RM: Operand 1: ModRM:reg (r, w), Operand 2: ModRM:r/m (r)
 208 |         insn(
 209 |           prefix: REX_W,
 210 |           opcode: [0x0f, 0x4d],
 211 |           mod_rm: ModRM[mod: Mod11, reg: dst_reg, rm: src_reg],
 212 |         )
 213 |       end
 214 |     end
 215 | 
 216 |     # CMOVL: dst = src if left < right (left/right as given to the preceding cmp)
 217 |     def cmovl(dst, src)
 218 |       case [dst, src] # only the register-to-register form is supported; anything else raises NoMatchingPatternError
 219 |       # CMOVL r64, r/m64 (Mod 11: reg)
 220 |       in [R64 => dst_reg, R64 => src_reg]
 221 |         # REX.W + 0F 4C /r
 222 |         # RM: Operand 1: ModRM:reg (r, w), Operand 2: ModRM:r/m (r)
 223 |         insn(
 224 |           prefix: REX_W,
 225 |           opcode: [0x0f, 0x4c],
 226 |           mod_rm: ModRM[mod: Mod11, reg: dst_reg, rm: src_reg],
 227 |         )
 228 |       end
 229 |     end
 230 | 
 231 |     # CMOVLE: dst = src if left <= right (left/right as given to the preceding cmp)
 232 |     def cmovle(dst, src)
 233 |       case [dst, src] # only the register-to-register form is supported; anything else raises NoMatchingPatternError
 234 |       # CMOVLE r64, r/m64 (Mod 11: reg)
 235 |       in [R64 => dst_reg, R64 => src_reg]
 236 |         # REX.W + 0F 4E /r
 237 |         # RM: Operand 1: ModRM:reg (r, w), Operand 2: ModRM:r/m (r)
 238 |         insn(
 239 |           prefix: REX_W,
 240 |           opcode: [0x0f, 0x4e],
 241 |           mod_rm: ModRM[mod: Mod11, reg: dst_reg, rm: src_reg],
 242 |         )
 243 |       end
 244 |     end
 245 | 
 246 |     # CMOVNE: dst = src if left != right (left/right as given to the preceding cmp)
 247 |     def cmovne(dst, src)
 248 |       case [dst, src] # only the register-to-register form is supported; anything else raises NoMatchingPatternError
 249 |       # CMOVNE r64, r/m64 (Mod 11: reg)
 250 |       in [R64 => dst_reg, R64 => src_reg]
 251 |         # REX.W + 0F 45 /r
 252 |         # RM: Operand 1: ModRM:reg (r, w), Operand 2: ModRM:r/m (r)
 253 |         insn(
 254 |           prefix: REX_W,
 255 |           opcode: [0x0f, 0x45],
 256 |           mod_rm: ModRM[mod: Mod11, reg: dst_reg, rm: src_reg],
 257 |         )
 258 |       end
 259 |     end
 260 | 
 261 |     # CMOVNZ: dst = src if left != 0 (emits the same 0F 45 encoding as #cmovne)
 262 |     def cmovnz(dst, src)
 263 |       case [dst, src] # only the register-to-register form is supported; anything else raises NoMatchingPatternError
 264 |       # CMOVNZ r64, r/m64 (Mod 11: reg)
 265 |       in [R64 => dst_reg, R64 => src_reg]
 266 |         # REX.W + 0F 45 /r
 267 |         # RM: Operand 1: ModRM:reg (r, w), Operand 2: ModRM:r/m (r)
 268 |         insn(
 269 |           prefix: REX_W,
 270 |           opcode: [0x0f, 0x45],
 271 |           mod_rm: ModRM[mod: Mod11, reg: dst_reg, rm: src_reg],
 272 |         )
 273 |       end
 274 |     end
 275 | 
 276 |     # CMOVZ: dst = src if left == 0 (src: r64 or [r64 + disp8]; the r64 form emits the same 0F 44 encoding as #cmove)
 277 |     def cmovz(dst, src)
 278 |       case [dst, src] # no else branch: any other operand combination raises NoMatchingPatternError
 279 |       # CMOVZ r64, r/m64 (Mod 11: reg)
 280 |       in [R64 => dst_reg, R64 => src_reg]
 281 |         # REX.W + 0F 44 /r
 282 |         # RM: Operand 1: ModRM:reg (r, w), Operand 2: ModRM:r/m (r)
 283 |         insn(
 284 |           prefix: REX_W,
 285 |           opcode: [0x0f, 0x44],
 286 |           mod_rm: ModRM[mod: Mod11, reg: dst_reg, rm: src_reg],
 287 |         )
 288 |       # CMOVZ r64, r/m64 (Mod 01: [reg]+disp8)
 289 |       in [R64 => dst_reg, QwordPtr[R64 => src_reg, IMM8 => src_disp]]
 290 |         # REX.W + 0F 44 /r
 291 |         # RM: Operand 1: ModRM:reg (r, w), Operand 2: ModRM:r/m (r)
 292 |         insn(
 293 |           prefix: REX_W,
 294 |           opcode: [0x0f, 0x44],
 295 |           mod_rm: ModRM[mod: Mod01, reg: dst_reg, rm: src_reg],
 296 |           disp: imm8(src_disp),
 297 |         )
 298 |       end
 299 |     end
 300 | 
 301 |     # CMP: Compare left and right
 302 |     def cmp(left, right)
 303 |       case [left, right]
 304 |       # CMP r/m8, imm8 (Mod 01: [reg]+disp8)
 305 |       in [BytePtr[R64 => left_reg, IMM8 => left_disp], IMM8 => right_imm]
 306 |         # 80 /7 ib
 307 |         # MI: Operand 1: ModRM:r/m (r), Operand 2: imm8/16/32
 308 |         insn(
 309 |           opcode: 0x80,
 310 |           mod_rm: ModRM[mod: Mod01, reg: 7, rm: left_reg],
 311 |           disp: left_disp,
 312 |           imm: imm8(right_imm),
 313 |         )
 314 |       # CMP r/m32, imm32 (Mod 01: [reg]+disp8)
 315 |       in [DwordPtr[R64 => left_reg, IMM8 => left_disp], IMM32 => right_imm]
 316 |         # 81 /7 id
 317 |         # MI: Operand 1: ModRM:r/m (r), Operand 2: imm8/16/32
 318 |         insn(
 319 |           opcode: 0x81,
 320 |           mod_rm: ModRM[mod: Mod01, reg: 7, rm: left_reg],
 321 |           disp: left_disp,
 322 |           imm: imm32(right_imm),
 323 |         )
 324 |       # CMP r/m64, imm8 (Mod 01: [reg]+disp8)
 325 |       in [QwordPtr[R64 => left_reg, IMM8 => left_disp], IMM8 => right_imm]
 326 |         # REX.W + 83 /7 ib
 327 |         # MI: Operand 1: ModRM:r/m (r), Operand 2: imm8/16/32
 328 |         insn(
 329 |           prefix: REX_W,
 330 |           opcode: 0x83,
 331 |           mod_rm: ModRM[mod: Mod01, reg: 7, rm: left_reg],
 332 |           disp: left_disp,
 333 |           imm: imm8(right_imm),
 334 |         )
 335 |       # CMP r/m64, imm8 (Mod 10: [reg]+disp32)
 336 |       in [QwordPtr[R64 => left_reg, IMM32 => left_disp], IMM8 => right_imm]
 337 |         # REX.W + 83 /7 ib
 338 |         # MI: Operand 1: ModRM:r/m (r), Operand 2: imm8/16/32
 339 |         insn(
 340 |           prefix: REX_W,
 341 |           opcode: 0x83,
 342 |           mod_rm: ModRM[mod: Mod10, reg: 7, rm: left_reg],
 343 |           disp: imm32(left_disp),
 344 |           imm: imm8(right_imm),
 345 |         )
 346 |       # CMP r/m64, imm8 (Mod 11: reg)
 347 |       in [R64 => left_reg, IMM8 => right_imm]
 348 |         # REX.W + 83 /7 ib
 349 |         # MI: Operand 1: ModRM:r/m (r), Operand 2: imm8/16/32
 350 |         insn(
 351 |           prefix: REX_W,
 352 |           opcode: 0x83,
 353 |           mod_rm: ModRM[mod: Mod11, reg: 7, rm: left_reg],
 354 |           imm: imm8(right_imm),
 355 |         )
 356 |       # CMP r/m64, imm32 (Mod 11: reg)
 357 |       in [R64 => left_reg, IMM32 => right_imm]
 358 |         # REX.W + 81 /7 id
 359 |         # MI: Operand 1: ModRM:r/m (r), Operand 2: imm8/16/32
 360 |         insn(
 361 |           prefix: REX_W,
 362 |           opcode: 0x81,
 363 |           mod_rm: ModRM[mod: Mod11, reg: 7, rm: left_reg],
 364 |           imm: imm32(right_imm),
 365 |         )
 366 |       # CMP r/m64, r64 (Mod 01: [reg]+disp8)
 367 |       in [QwordPtr[R64 => left_reg, IMM8 => left_disp], R64 => right_reg]
 368 |         # REX.W + 39 /r
 369 |         # MR: Operand 1: ModRM:r/m (r), Operand 2: ModRM:reg (r)
 370 |         insn(
 371 |           prefix: REX_W,
 372 |           opcode: 0x39,
 373 |           mod_rm: ModRM[mod: Mod01, reg: right_reg, rm: left_reg],
 374 |           disp: left_disp,
 375 |         )
 376 |       # CMP r/m64, r64 (Mod 10: [reg]+disp32)
 377 |       in [QwordPtr[R64 => left_reg, IMM32 => left_disp], R64 => right_reg]
 378 |         # REX.W + 39 /r
 379 |         # MR: Operand 1: ModRM:r/m (r), Operand 2: ModRM:reg (r)
 380 |         insn(
 381 |           prefix: REX_W,
 382 |           opcode: 0x39,
 383 |           mod_rm: ModRM[mod: Mod10, reg: right_reg, rm: left_reg],
 384 |           disp: imm32(left_disp),
 385 |         )
 386 |       # CMP r/m64, r64 (Mod 11: reg)
 387 |       in [R64 => left_reg, R64 => right_reg]
 388 |         # REX.W + 39 /r
 389 |         # MR: Operand 1: ModRM:r/m (r), Operand 2: ModRM:reg (r)
 390 |         insn(
 391 |           prefix: REX_W,
 392 |           opcode: 0x39,
 393 |           mod_rm: ModRM[mod: Mod11, reg: right_reg, rm: left_reg],
 394 |         )
 395 |       end
 396 |     end
 397 | 
 398 |     # JBE: Jump to dst if left >= right
 399 |     def jbe(dst)
 400 |       case dst
 401 |       # JBE rel8
 402 |       in Label => dst_label
 403 |         # 76 cb
 404 |         insn(opcode: 0x76, imm: dst_label)
 405 |       # JBE rel32
 406 |       in Integer => dst_addr
 407 |         # 0F 86 cd
 408 |         insn(opcode: [0x0f, 0x86], imm: rel32(dst_addr))
 409 |       end
 410 |     end
 411 | 
 412 |     # JE: Jump to dst if left == right
 413 |     def je(dst)
 414 |       case dst
 415 |       # JE rel8
 416 |       in Label => dst_label
 417 |         # 74 cb
 418 |         insn(opcode: 0x74, imm: dst_label)
 419 |       # JE rel32
 420 |       in Integer => dst_addr
 421 |         # 0F 84 cd
 422 |         insn(opcode: [0x0f, 0x84], imm: rel32(dst_addr))
 423 |       end
 424 |     end
 425 | 
 426 |     # JE: Jump to dst if left == right
 427 |     def jl(dst)
 428 |       case dst
 429 |       # JL rel32
 430 |       in Integer => dst_addr
 431 |         # 0F 8C cd
 432 |         insn(opcode: [0x0f, 0x8c], imm: rel32(dst_addr))
 433 |       end
 434 |     end
 435 | 
 436 |     # JMP: Jump to dst
 437 |     def jmp(dst)
 438 |       case dst
 439 |       # JZ rel8
 440 |       in Label => dst_label
 441 |         # EB cb
 442 |         insn(opcode: 0xeb, imm: dst_label)
 443 |       # JMP rel32
 444 |       in Integer => dst_addr
 445 |         # E9 cd
 446 |         insn(opcode: 0xe9, imm: rel32(dst_addr))
 447 |       # JMP r/m64 (Mod 01: [reg]+disp8)
 448 |       in QwordPtr[R64 => dst_reg, IMM8 => dst_disp]
 449 |         # FF /4
 450 |         insn(opcode: 0xff, mod_rm: ModRM[mod: Mod01, reg: 4, rm: dst_reg], disp: dst_disp)
 451 |       # JMP r/m64 (Mod 11: reg)
 452 |       in R64 => dst_reg
 453 |         # FF /4
 454 |         insn(opcode: 0xff, mod_rm: ModRM[mod: Mod11, reg: 4, rm: dst_reg])
 455 |       end
 456 |     end
 457 | 
 458 |     # JNE: Jump to dst if left != right
 459 |     def jne(dst)
 460 |       case dst
 461 |       # JNE rel8
 462 |       in Label => dst_label
 463 |         # 75 cb
 464 |         insn(opcode: 0x75, imm: dst_label)
 465 |       # JNE rel32
 466 |       in Integer => dst_addr
 467 |         # 0F 85 cd
 468 |         insn(opcode: [0x0f, 0x85], imm: rel32(dst_addr))
 469 |       end
 470 |     end
 471 | 
 472 |     # JNZ: Jump to dst if left != 0
 473 |     def jnz(dst)
 474 |       case dst
 475 |       # JE rel8
 476 |       in Label => dst_label
 477 |         # 75 cb
 478 |         insn(opcode: 0x75, imm: dst_label)
 479 |       # JNZ rel32
 480 |       in Integer => dst_addr
 481 |         # 0F 85 cd
 482 |         insn(opcode: [0x0f, 0x85], imm: rel32(dst_addr))
 483 |       end
 484 |     end
 485 | 
 486 |     # JO: Jump to dst if overflow
 487 |     def jo(dst)
 488 |       case dst
 489 |       # JO rel32
 490 |       in Integer => dst_addr
 491 |         # 0F 80 cd
 492 |         insn(opcode: [0x0f, 0x80], imm: rel32(dst_addr))
 493 |       end
 494 |     end
 495 | 
 496 |     # JZ: Jump to dst if left == 0
 497 |     def jz(dst)
 498 |       case dst
 499 |       # JZ rel8
 500 |       in Label => dst_label
 501 |         # 74 cb
 502 |         insn(opcode: 0x74, imm: dst_label)
 503 |       # JZ rel32
 504 |       in Integer => dst_addr
 505 |         # 0F 84 cd
 506 |         insn(opcode: [0x0f, 0x84], imm: rel32(dst_addr))
 507 |       end
 508 |     end
 509 | 
 510 |     # LEA: dst = &src
 511 |     def lea(dst, src)
 512 |       case [dst, src]
 513 |       # LEA r64,m (Mod 01: [reg]+disp8)
 514 |       in [R64 => dst_reg, QwordPtr[R64 => src_reg, IMM8 => src_disp]]
 515 |         # REX.W + 8D /r
 516 |         # RM: Operand 1: ModRM:reg (w), Operand 2: ModRM:r/m (r)
 517 |         insn(
 518 |           prefix: REX_W,
 519 |           opcode: 0x8d,
 520 |           mod_rm: ModRM[mod: Mod01, reg: dst_reg, rm: src_reg],
 521 |           disp: imm8(src_disp),
 522 |         )
 523 |       # LEA r64,m (Mod 10: [reg]+disp32)
 524 |       in [R64 => dst_reg, QwordPtr[R64 => src_reg, IMM32 => src_disp]]
 525 |         # REX.W + 8D /r
 526 |         # RM: Operand 1: ModRM:reg (w), Operand 2: ModRM:r/m (r)
 527 |         insn(
 528 |           prefix: REX_W,
 529 |           opcode: 0x8d,
 530 |           mod_rm: ModRM[mod: Mod10, reg: dst_reg, rm: src_reg],
 531 |           disp: imm32(src_disp),
 532 |         )
 533 |       end
 534 |     end
 535 | 
 536 |     # MOV: dst = src
 537 |     def mov(dst, src)
 538 |       case dst
 539 |       in R32 => dst_reg
 540 |         case src
 541 |         # MOV r32 r/m32 (Mod 01: [reg]+disp8)
 542 |         in DwordPtr[R64 => src_reg, IMM8 => src_disp]
 543 |           # 8B /r
 544 |           # RM: Operand 1: ModRM:reg (w), Operand 2: ModRM:r/m (r)
 545 |           insn(
 546 |             opcode: 0x8b,
 547 |             mod_rm: ModRM[mod: Mod01, reg: dst_reg, rm: src_reg],
 548 |             disp: src_disp,
 549 |           )
 550 |         # MOV r32, imm32 (Mod 11: reg)
 551 |         in IMM32 => src_imm
 552 |           # B8+ rd id
 553 |           # OI: Operand 1: opcode + rd (w), Operand 2: imm8/16/32/64
 554 |           insn(
 555 |             opcode: 0xb8,
 556 |             rd: dst_reg,
 557 |             imm: imm32(src_imm),
 558 |           )
 559 |         end
 560 |       in R64 => dst_reg
 561 |         case src
 562 |         # MOV r64, r/m64 (Mod 00: [reg])
 563 |         in QwordPtr[R64 => src_reg]
 564 |           # REX.W + 8B /r
 565 |           # RM: Operand 1: ModRM:reg (w), Operand 2: ModRM:r/m (r)
 566 |           insn(
 567 |             prefix: REX_W,
 568 |             opcode: 0x8b,
 569 |             mod_rm: ModRM[mod: Mod00, reg: dst_reg, rm: src_reg],
 570 |           )
 571 |         # MOV r64, r/m64 (Mod 01: [reg]+disp8)
 572 |         in QwordPtr[R64 => src_reg, IMM8 => src_disp]
 573 |           # REX.W + 8B /r
 574 |           # RM: Operand 1: ModRM:reg (w), Operand 2: ModRM:r/m (r)
 575 |           insn(
 576 |             prefix: REX_W,
 577 |             opcode: 0x8b,
 578 |             mod_rm: ModRM[mod: Mod01, reg: dst_reg, rm: src_reg],
 579 |             disp: src_disp,
 580 |           )
 581 |         # MOV r64, r/m64 (Mod 10: [reg]+disp32)
 582 |         in QwordPtr[R64 => src_reg, IMM32 => src_disp]
 583 |           # REX.W + 8B /r
 584 |           # RM: Operand 1: ModRM:reg (w), Operand 2: ModRM:r/m (r)
 585 |           insn(
 586 |             prefix: REX_W,
 587 |             opcode: 0x8b,
 588 |             mod_rm: ModRM[mod: Mod10, reg: dst_reg, rm: src_reg],
 589 |             disp: imm32(src_disp),
 590 |           )
 591 |         # MOV r64, r/m64 (Mod 11: reg)
 592 |         in R64 => src_reg
 593 |           # REX.W + 8B /r
 594 |           # RM: Operand 1: ModRM:reg (w), Operand 2: ModRM:r/m (r)
 595 |           insn(
 596 |             prefix: REX_W,
 597 |             opcode: 0x8b,
 598 |             mod_rm: ModRM[mod: Mod11, reg: dst_reg, rm: src_reg],
 599 |           )
 600 |         # MOV r/m64, imm32 (Mod 11: reg)
 601 |         in IMM32 => src_imm
 602 |           # REX.W + C7 /0 id
 603 |           # MI: Operand 1: ModRM:r/m (w), Operand 2: imm8/16/32/64
 604 |           insn(
 605 |             prefix: REX_W,
 606 |             opcode: 0xc7,
 607 |             mod_rm: ModRM[mod: Mod11, reg: 0, rm: dst_reg],
 608 |             imm: imm32(src_imm),
 609 |           )
 610 |         # MOV r64, imm64
 611 |         in IMM64 => src_imm
 612 |           # REX.W + B8+ rd io
 613 |           # OI: Operand 1: opcode + rd (w), Operand 2: imm8/16/32/64
 614 |           insn(
 615 |             prefix: REX_W,
 616 |             opcode: 0xb8,
 617 |             rd: dst_reg,
 618 |             imm: imm64(src_imm),
 619 |           )
 620 |         end
 621 |       in DwordPtr[R64 => dst_reg, IMM8 => dst_disp]
 622 |         case src
 623 |         # MOV r/m32, imm32 (Mod 01: [reg]+disp8)
 624 |         in IMM32 => src_imm
 625 |           # C7 /0 id
 626 |           # MI: Operand 1: ModRM:r/m (w), Operand 2: imm8/16/32/64
 627 |           insn(
 628 |             opcode: 0xc7,
 629 |             mod_rm: ModRM[mod: Mod01, reg: 0, rm: dst_reg],
 630 |             disp: dst_disp,
 631 |             imm: imm32(src_imm),
 632 |           )
 633 |         end
 634 |       in QwordPtr[R64 => dst_reg]
 635 |         case src
 636 |         # MOV r/m64, imm32 (Mod 00: [reg])
 637 |         in IMM32 => src_imm
 638 |           # REX.W + C7 /0 id
 639 |           # MI: Operand 1: ModRM:r/m (w), Operand 2: imm8/16/32/64
 640 |           insn(
 641 |             prefix: REX_W,
 642 |             opcode: 0xc7,
 643 |             mod_rm: ModRM[mod: Mod00, reg: 0, rm: dst_reg],
 644 |             imm: imm32(src_imm),
 645 |           )
 646 |         # MOV r/m64, r64 (Mod 00: [reg])
 647 |         in R64 => src_reg
 648 |           # REX.W + 89 /r
 649 |           # MR: Operand 1: ModRM:r/m (w), Operand 2: ModRM:reg (r)
 650 |           insn(
 651 |             prefix: REX_W,
 652 |             opcode: 0x89,
 653 |             mod_rm: ModRM[mod: Mod00, reg: src_reg, rm: dst_reg],
 654 |           )
 655 |         end
 656 |       in QwordPtr[R64 => dst_reg, IMM8 => dst_disp]
 657 |         # Optimize encoding when disp is 0
 658 |         return mov([dst_reg], src) if dst_disp == 0
 659 | 
 660 |         case src
 661 |         # MOV r/m64, imm32 (Mod 01: [reg]+disp8)
 662 |         in IMM32 => src_imm
 663 |           # REX.W + C7 /0 id
 664 |           # MI: Operand 1: ModRM:r/m (w), Operand 2: imm8/16/32/64
 665 |           insn(
 666 |             prefix: REX_W,
 667 |             opcode: 0xc7,
 668 |             mod_rm: ModRM[mod: Mod01, reg: 0, rm: dst_reg],
 669 |             disp: dst_disp,
 670 |             imm: imm32(src_imm),
 671 |           )
 672 |         # MOV r/m64, r64 (Mod 01: [reg]+disp8)
 673 |         in R64 => src_reg
 674 |           # REX.W + 89 /r
 675 |           # MR: Operand 1: ModRM:r/m (w), Operand 2: ModRM:reg (r)
 676 |           insn(
 677 |             prefix: REX_W,
 678 |             opcode: 0x89,
 679 |             mod_rm: ModRM[mod: Mod01, reg: src_reg, rm: dst_reg],
 680 |             disp: dst_disp,
 681 |           )
 682 |         end
 683 |       in QwordPtr[R64 => dst_reg, IMM32 => dst_disp]
 684 |         case src
 685 |         # MOV r/m64, imm32 (Mod 10: [reg]+disp32)
 686 |         in IMM32 => src_imm
 687 |           # REX.W + C7 /0 id
 688 |           # MI: Operand 1: ModRM:r/m (w), Operand 2: imm8/16/32/64
 689 |           insn(
 690 |             prefix: REX_W,
 691 |             opcode: 0xc7,
 692 |             mod_rm: ModRM[mod: Mod10, reg: 0, rm: dst_reg],
 693 |             disp: imm32(dst_disp),
 694 |             imm: imm32(src_imm),
 695 |           )
 696 |         # MOV r/m64, r64 (Mod 10: [reg]+disp32)
 697 |         in R64 => src_reg
 698 |           # REX.W + 89 /r
 699 |           # MR: Operand 1: ModRM:r/m (w), Operand 2: ModRM:reg (r)
 700 |           insn(
 701 |             prefix: REX_W,
 702 |             opcode: 0x89,
 703 |             mod_rm: ModRM[mod: Mod10, reg: src_reg, rm: dst_reg],
 704 |             disp: imm32(dst_disp),
 705 |           )
 706 |         end
 707 |       end
 708 |     end
 709 | 
 710 |     # OR: dst = dst | src
 711 |     def or(dst, src)
 712 |       case [dst, src]
 713 |       # OR r/m64, imm8 (Mod 11: reg)
 714 |       in [R64 => dst_reg, IMM8 => src_imm]
 715 |         # REX.W + 83 /1 ib
 716 |         # MI: Operand 1: ModRM:r/m (r, w), Operand 2: imm8/16/32
 717 |         insn(
 718 |           prefix: REX_W,
 719 |           opcode: 0x83,
 720 |           mod_rm: ModRM[mod: Mod11, reg: 1, rm: dst_reg],
 721 |           imm: imm8(src_imm),
 722 |         )
 723 |       # OR r/m64, imm32 (Mod 11: reg)
 724 |       in [R64 => dst_reg, IMM32 => src_imm]
 725 |         # REX.W + 81 /1 id
 726 |         # MI: Operand 1: ModRM:r/m (r, w), Operand 2: imm8/16/32
 727 |         insn(
 728 |           prefix: REX_W,
 729 |           opcode: 0x81,
 730 |           mod_rm: ModRM[mod: Mod11, reg: 1, rm: dst_reg],
 731 |           imm: imm32(src_imm),
 732 |         )
 733 |       # OR r64, r/m64 (Mod 01: [reg]+disp8)
 734 |       in [R64 => dst_reg, QwordPtr[R64 => src_reg, IMM8 => src_disp]]
 735 |         # REX.W + 0B /r
 736 |         # RM: Operand 1: ModRM:reg (r, w), Operand 2: ModRM:r/m (r)
 737 |         insn(
 738 |           prefix: REX_W,
 739 |           opcode: 0x0b,
 740 |           mod_rm: ModRM[mod: Mod01, reg: dst_reg, rm: src_reg],
 741 |           disp: imm8(src_disp),
 742 |         )
 743 |       end
 744 |     end
 745 | 
 746 |     # PUSH: Push src onto the stack
 747 |     def push(src)
 748 |       case src
 749 |       # PUSH r64
 750 |       in R64 => src_reg
 751 |         # 50+rd
 752 |         # O: Operand 1: opcode + rd (r)
 753 |         insn(opcode: 0x50, rd: src_reg)
 754 |       end
 755 |     end
 756 | 
 757 |     # POP: Pop from the stack to dst
 758 |     def pop(dst)
 759 |       case dst
 760 |       # POP r64
 761 |       in R64 => dst_reg
 762 |         # 58+ rd
 763 |         # O: Operand 1: opcode + rd (r)
 764 |         insn(opcode: 0x58, rd: dst_reg)
 765 |       end
 766 |     end
 767 | 
 768 |     # RET: Return
 769 |     def ret
 770 |       # RET
 771 |       # Near return: A return to a procedure within the current code segment
 772 |       insn(opcode: 0xc3)
 773 |     end
 774 | 
 775 |     # SAR: dst = dst >> src
 776 |     def sar(dst, src)
 777 |       case [dst, src]
 778 |       in [R64 => dst_reg, IMM8 => src_imm]
 779 |         # REX.W + C1 /7 ib
 780 |         # MI: Operand 1: ModRM:r/m (r, w), Operand 2: imm8
 781 |         insn(
 782 |           prefix: REX_W,
 783 |           opcode: 0xc1,
 784 |           mod_rm: ModRM[mod: Mod11, reg: 7, rm: dst_reg],
 785 |           imm: imm8(src_imm),
 786 |         )
 787 |       end
 788 |     end
 789 | 
 790 |     # SUB: dst = dst - src
 791 |     def sub(dst, src)
 792 |       case [dst, src]
 793 |       # SUB r/m64, imm8 (Mod 11: reg)
 794 |       in [R64 => dst_reg, IMM8 => src_imm]
 795 |         # REX.W + 83 /5 ib
 796 |         # MI: Operand 1: ModRM:r/m (r, w), Operand 2: imm8/16/32
 797 |         insn(
 798 |           prefix: REX_W,
 799 |           opcode: 0x83,
 800 |           mod_rm: ModRM[mod: Mod11, reg: 5, rm: dst_reg],
 801 |           imm: imm8(src_imm),
 802 |         )
 803 |       # SUB r/m64, r64 (Mod 11: reg)
 804 |       in [R64 => dst_reg, R64 => src_reg]
 805 |         # REX.W + 29 /r
 806 |         # MR: Operand 1: ModRM:r/m (r, w), Operand 2: ModRM:reg (r)
 807 |         insn(
 808 |           prefix: REX_W,
 809 |           opcode: 0x29,
 810 |           mod_rm: ModRM[mod: Mod11, reg: src_reg, rm: dst_reg],
 811 |         )
 812 |       end
 813 |     end
 814 | 
 815 |     # TEST: Compare test and right
 816 |     def test(left, right)
 817 |       case [left, right]
 818 |       # TEST r/m8*, imm8 (Mod 01: [reg]+disp8)
 819 |       in [BytePtr[R64 => left_reg, IMM8 => left_disp], IMM8 => right_imm]
 820 |         # REX + F6 /0 ib
 821 |         # MI: Operand 1: ModRM:r/m (r), Operand 2: imm8/16/32
 822 |         insn(
 823 |           opcode: 0xf6,
 824 |           mod_rm: ModRM[mod: Mod01, reg: 0, rm: left_reg],
 825 |           disp: left_disp,
 826 |           imm: imm8(right_imm),
 827 |         )
 828 |       # TEST r/m64, imm32 (Mod 01: [reg]+disp8)
 829 |       in [QwordPtr[R64 => left_reg, IMM8 => left_disp], IMM32 => right_imm]
 830 |         # REX.W + F7 /0 id
 831 |         # MI: Operand 1: ModRM:r/m (r), Operand 2: imm8/16/32
 832 |         insn(
 833 |           prefix: REX_W,
 834 |           opcode: 0xf7,
 835 |           mod_rm: ModRM[mod: Mod01, reg: 0, rm: left_reg],
 836 |           disp: left_disp,
 837 |           imm: imm32(right_imm),
 838 |         )
 839 |       # TEST r/m64, imm32 (Mod 10: [reg]+disp32)
 840 |       in [QwordPtr[R64 => left_reg, IMM32 => left_disp], IMM32 => right_imm]
 841 |         # REX.W + F7 /0 id
 842 |         # MI: Operand 1: ModRM:r/m (r), Operand 2: imm8/16/32
 843 |         insn(
 844 |           prefix: REX_W,
 845 |           opcode: 0xf7,
 846 |           mod_rm: ModRM[mod: Mod10, reg: 0, rm: left_reg],
 847 |           disp: imm32(left_disp),
 848 |           imm: imm32(right_imm),
 849 |         )
 850 |       # TEST r/m64, imm32 (Mod 11: reg)
 851 |       in [R64 => left_reg, IMM32 => right_imm]
 852 |         # REX.W + F7 /0 id
 853 |         # MI: Operand 1: ModRM:r/m (r), Operand 2: imm8/16/32
 854 |         insn(
 855 |           prefix: REX_W,
 856 |           opcode: 0xf7,
 857 |           mod_rm: ModRM[mod: Mod11, reg: 0, rm: left_reg],
 858 |           imm: imm32(right_imm),
 859 |         )
 860 |       # TEST r/m32, r32 (Mod 11: reg)
 861 |       in [R32 => left_reg, R32 => right_reg]
 862 |         # 85 /r
 863 |         # MR: Operand 1: ModRM:r/m (r), Operand 2: ModRM:reg (r)
 864 |         insn(
 865 |           opcode: 0x85,
 866 |           mod_rm: ModRM[mod: Mod11, reg: right_reg, rm: left_reg],
 867 |         )
 868 |       # TEST r/m64, r64 (Mod 11: reg)
 869 |       in [R64 => left_reg, R64 => right_reg]
 870 |         # REX.W + 85 /r
 871 |         # MR: Operand 1: ModRM:r/m (r), Operand 2: ModRM:reg (r)
 872 |         insn(
 873 |           prefix: REX_W,
 874 |           opcode: 0x85,
 875 |           mod_rm: ModRM[mod: Mod11, reg: right_reg, rm: left_reg],
 876 |         )
 877 |       end
 878 |     end
 879 | 
 880 |     # XOR: dst = dst ^ src
 881 |     def xor(dst, src)
 882 |       case [dst, src]
 883 |       # XOR r/m64, r64 (Mod 11: reg)
 884 |       in [R64 => dst_reg, R64 => src_reg]
 885 |         # REX.W + 31 /r
 886 |         # MR: Operand 1: ModRM:r/m (r, w), Operand 2: ModRM:reg (r)
 887 |         insn(
 888 |           prefix: REX_W,
 889 |           opcode: 0x31,
 890 |           mod_rm: ModRM[mod: Mod11, reg: src_reg, rm: dst_reg],
 891 |         )
 892 |       end
 893 |     end
 894 | 
 895 |     #
 896 |     # Utilities
 897 |     #
 898 | 
 899 |     def new_label(name)
 900 |       Label.new(id: @label_id += 1, name:)
 901 |     end
 902 | 
 903 |     # @param [RubyVM::RJIT::Assembler::Label] label
 904 |     def write_label(label)
 905 |       @labels[label] = @bytes.size
 906 |     end
 907 | 
 908 |     private
 909 | 
 910 |     def insn(prefix: 0, opcode:, rd: nil, mod_rm: nil, disp: nil, imm: nil)
 911 |       # Determine prefix
 912 |       if rd
 913 |         prefix |= REX_B if extended_reg?(rd)
 914 |         opcode += reg_code(rd)
 915 |       end
 916 |       if mod_rm
 917 |         prefix |= REX_R if mod_rm.reg.is_a?(Symbol) && extended_reg?(mod_rm.reg)
 918 |         prefix |= REX_B if mod_rm.rm.is_a?(Symbol) && extended_reg?(mod_rm.rm)
 919 |       end
 920 | 
 921 |       # Encode insn
 922 |       if prefix > 0
 923 |         @bytes.push(prefix)
 924 |       end
 925 |       @bytes.push(*Array(opcode))
 926 |       if mod_rm
 927 |         mod_rm_byte = encode_mod_rm(
 928 |           mod: mod_rm.mod,
 929 |           reg: mod_rm.reg.is_a?(Symbol) ? reg_code(mod_rm.reg) : mod_rm.reg,
 930 |           rm: mod_rm.rm.is_a?(Symbol) ? reg_code(mod_rm.rm) : mod_rm.rm,
 931 |         )
 932 |         @bytes.push(mod_rm_byte)
 933 |       end
 934 |       if disp
 935 |         @bytes.push(*Array(disp))
 936 |       end
 937 |       if imm
 938 |         @bytes.push(*imm)
 939 |       end
 940 |     end
 941 | 
 942 |     def reg_code(reg)
 943 |       reg_code_extended(reg).first
 944 |     end
 945 | 
 946 |     # Table 2-2. 32-Bit Addressing Forms with the ModR/M Byte
 947 |     #
 948 |     #  7  6  5  4  3  2  1  0
 949 |     # +--+--+--+--+--+--+--+--+
 950 |     # | Mod | Reg/   | R/M    |
 951 |     # |     | Opcode |        |
 952 |     # +--+--+--+--+--+--+--+--+
 953 |     #
 954 |     # The r/m field can specify a register as an operand or it can be combined
 955 |     # with the mod field to encode an addressing mode.
 956 |     #
 957 |     # /0: R/M is 0 (not used)
 958 |     # /r: R/M is a register
 959 |     def encode_mod_rm(mod:, reg: 0, rm: 0)
 960 |       if mod > 0b11
 961 |         raise ArgumentError, "too large Mod: #{mod}"
 962 |       end
 963 |       if reg > 0b111
 964 |         raise ArgumentError, "too large Reg/Opcode: #{reg}"
 965 |       end
 966 |       if rm > 0b111
 967 |         raise ArgumentError, "too large R/M: #{rm}"
 968 |       end
 969 |       (mod << 6) + (reg << 3) + rm
 970 |     end
 971 | 
 972 |     # ib: 1 byte
 973 |     def imm8(imm)
 974 |       unless imm8?(imm)
 975 |         raise ArgumentError, "unexpected imm8: #{imm}"
 976 |       end
 977 |       [imm].pack('c').unpack('c*') # TODO: consider uimm
 978 |     end
 979 | 
 980 |     # id: 4 bytes
 981 |     def imm32(imm)
 982 |       unless imm32?(imm)
 983 |         raise ArgumentError, "unexpected imm32: #{imm}"
 984 |       end
 985 |       [imm].pack('l').unpack('c*') # TODO: consider uimm
 986 |     end
 987 | 
 988 |     # io: 8 bytes
 989 |     def imm64(imm)
 990 |       unless imm64?(imm)
 991 |         raise ArgumentError, "unexpected imm64: #{imm}"
 992 |       end
 993 |       imm_bytes(imm, 8)
 994 |     end
 995 | 
 996 |     def imm_bytes(imm, num_bytes)
 997 |       bytes = []
 998 |       bits = imm
 999 |       num_bytes.times do
1000 |         bytes << (bits & 0xff)
1001 |         bits >>= 8
1002 |       end
1003 |       if bits != 0
1004 |         raise ArgumentError, "unexpected imm with #{num_bytes} bytes: #{imm}"
1005 |       end
1006 |       bytes
1007 |     end
1008 | 
1009 |     def rel32(addr)
1010 |       [Rel32.new(addr), Rel32Pad, Rel32Pad, Rel32Pad]
1011 |     end
1012 | 
1013 |     def resolve_rel32(write_addr)
1014 |       @bytes.each_with_index do |byte, index|
1015 |         if byte.is_a?(Rel32)
1016 |           src_addr = write_addr + index + 4 # offset 4 bytes for rel32 itself
1017 |           dst_addr = byte.addr
1018 |           rel32 = dst_addr - src_addr
1019 |           raise "unexpected offset: #{rel32}" unless imm32?(rel32)
1020 |           imm32(rel32).each_with_index do |rel_byte, rel_index|
1021 |             @bytes[index + rel_index] = rel_byte
1022 |           end
1023 |         end
1024 |       end
1025 |     end
1026 | 
1027 |     def resolve_labels
1028 |       @bytes.each_with_index do |byte, index|
1029 |         if byte.is_a?(Label)
1030 |           src_index = index + 1 # offset 1 byte for rel8 itself
1031 |           dst_index = @labels.fetch(byte)
1032 |           rel8 = dst_index - src_index
1033 |           raise "unexpected offset: #{rel8}" unless imm8?(rel8)
1034 |           @bytes[index] = rel8
1035 |         end
1036 |       end
1037 |     end
1038 | 
1039 |     def write_bytes(addr)
1040 |       Fiddle::Pointer.new(addr)[0, @bytes.size] = @bytes.pack('c*')
1041 |     end
1042 |   end
1043 | 
1044 |   module OperandMatcher
1045 |     def imm8?(imm)
1046 |       (-0x80..0x7f).include?(imm)
1047 |     end
1048 | 
1049 |     def imm32?(imm)
1050 |       (-0x8000_0000..0x7fff_ffff).include?(imm) # TODO: consider uimm
1051 |     end
1052 | 
1053 |     def imm64?(imm)
1054 |       (-0x8000_0000_0000_0000..0xffff_ffff_ffff_ffff).include?(imm)
1055 |     end
1056 | 
1057 |     def r32?(reg)
1058 |       if extended_reg?(reg)
1059 |         reg.end_with?('d')
1060 |       else
1061 |         reg.start_with?('e')
1062 |       end
1063 |     end
1064 | 
1065 |     def r64?(reg)
1066 |       if extended_reg?(reg)
1067 |         reg.match?(/\Ar\d+\z/)
1068 |       else
1069 |         reg.start_with?('r')
1070 |       end
1071 |     end
1072 | 
1073 |     def extended_reg?(reg)
1074 |       reg_code_extended(reg).last
1075 |     end
1076 | 
1077 |     def reg_code_extended(reg)
1078 |       case reg
1079 |       # Not extended
1080 |       when :al, :ax, :eax, :rax then [0, false]
1081 |       when :cl, :cx, :ecx, :rcx then [1, false]
1082 |       when :dl, :dx, :edx, :rdx then [2, false]
1083 |       when :bl, :bx, :ebx, :rbx then [3, false]
1084 |       when :ah, :sp, :esp, :rsp then [4, false]
1085 |       when :ch, :bp, :ebp, :rbp then [5, false]
1086 |       when :dh, :si, :esi, :rsi then [6, false]
1087 |       when :bh, :di, :edi, :rdi then [7, false]
1088 |       # Extended
1089 |       when :r8b,  :r8w,  :r8d,  :r8  then [0, true]
1090 |       when :r9b,  :r9w,  :r9d,  :r9  then [1, true]
1091 |       when :r10b, :r10w, :r10d, :r10 then [2, true]
1092 |       when :r11b, :r11w, :r11d, :r11 then [3, true]
1093 |       when :r12b, :r12w, :r12d, :r12 then [4, true]
1094 |       when :r13b, :r13w, :r13d, :r13 then [5, true]
1095 |       when :r14b, :r14w, :r14d, :r14 then [6, true]
1096 |       when :r15b, :r15w, :r15d, :r15 then [7, true]
1097 |       else raise ArgumentError, "unexpected reg: #{reg.inspect}"
1098 |       end
1099 |     end
1100 |   end
1101 | 
1102 |   class Assembler
1103 |     include OperandMatcher
1104 |     extend OperandMatcher
1105 |   end
1106 | end
1107 | 


--------------------------------------------------------------------------------