├── .busted ├── .ci.sh ├── .luacheckrc ├── .travis.yml ├── LICENSE ├── Makefile ├── README.md ├── bpf.lua ├── bpf ├── builtins.lua ├── cdef.lua ├── elf.lua ├── ljbytecode.lua └── proto.lua ├── examples ├── kprobe-latency.lua ├── kprobe-write.lua ├── sock-parse-dns.lua ├── sock-parse-http.lua ├── sock-proto.lua ├── sock-protolen.lua ├── tracepoint-offcputime.lua ├── uprobe-readline-perf.lua ├── uprobe-readline.lua └── uprobe-tailkt.lua ├── rockspec └── bpf-scm-1.rockspec └── spec ├── compile_spec.lua ├── decoder_spec.lua └── elf_spec.lua /.busted: -------------------------------------------------------------------------------- 1 | -- Configuration for unit tests 2 | -- See: http://olivinelabs.com/busted/ 3 | return { 4 | default = { 5 | lpath = "./?.lua", 6 | ["auto-insulate"] = false, 7 | } 8 | } -------------------------------------------------------------------------------- /.ci.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | pkg="libelf-0.8.13" 3 | if [ ! -f $HOME/.local/lib/pkgconfig/libelf.pc ]; then 4 | curl -O http://www.mr511.de/software/${pkg}.tar.gz 5 | tar xvzf ${pkg}.tar.gz 6 | cd ${pkg} 7 | ./configure --prefix=$HOME/.local 8 | make 9 | make install 10 | fi 11 | -------------------------------------------------------------------------------- /.luacheckrc: -------------------------------------------------------------------------------- 1 | std = "luajit" 2 | ignore = { "211", "212", "411", "412", "421", "431", "542" } 3 | files["examples"] = { 4 | new_globals = { "pkt", "time", "xadd", "c" } 5 | } 6 | files["bpf/builtins.lua"] = { 7 | ignore = { "122" } 8 | } 9 | files["spec"] = { 10 | std = "+busted", 11 | new_globals = { "pkt", "time", "xadd", "c" } 12 | } -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | sudo: false 3 | 4 | env: 5 | matrix: 6 | - LUA="luajit @" 7 | - LUA="luajit 2.0" 8 | - LUA="luajit 2.1" 9 | global: 10 | - LD_LIBRARY_PATH="$HOME/.local/lib" 11 | 12 | branches: 13 | only: 14 | - master 15 | 16 | before_install: 17 | - pip install hererocks 18 | - hererocks ~/hererocks -r^ --$LUA 19 | - export PATH=$PATH:~/hererocks/bin 20 | - eval `luarocks path --bin` 21 | - luarocks install luacheck 22 | - luarocks install luacov-coveralls 23 | - luarocks install lua_cliargs 2.5-5 24 | - luarocks install busted 2.0.rc10-0 25 | - ./.ci.sh 26 | 27 | install: 28 | - luarocks install --only-deps rockspec/bpf-scm-1.rockspec 29 | 30 | script: 31 | - luacheck . 32 | - busted -c 33 | 34 | after_success: 35 | - luacov-coveralls -v 36 | 37 | notifications: 38 | email: 39 | on_success: change 40 | on_failure: change 41 | 42 | cache: 43 | directories: 44 | - $HOME/.cache/hererocks 45 | - $HOME/.local 46 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016, Marek Vavrusa 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer.
9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 15 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 18 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 20 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 21 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 22 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | LUA ?= luajit 2 | 3 | check: 4 | @echo "[*] static analysis" 5 | @luacheck --codes --formatter TAP . 6 | @echo "[*] unit tests" 7 | @busted --lua=$(LUA) -o TAP 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This repo has been merged to [iovisor/bcc](https://github.com/iovisor/bcc/tree/master/src/lua#luajit-bpf-compiler) and further development will happen there. 2 | -------------------------------------------------------------------------------- /bpf.lua: -------------------------------------------------------------------------------- 1 | -- Translate LuaJIT function into eBPF bytecode. 
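-- For orientation, a minimal usage sketch (hedged; it mirrors the examples/ directory,
-- e.g. sock-proto.lua, and assumes a Linux kernel with eBPF socket-filter support plus
-- a 'lo' interface; 'pkt' and 'xadd' are globals provided by this compiler's sandbox):
--
--   local bpf = require('bpf')
--   local map = bpf.map('hash', 256)
--   local prog = bpf(function (skb)      -- __call compiles the Lua function to eBPF
--      xadd(map[pkt.ip.proto], 1)        -- per-IP-protocol packet counter
--   end)
--   bpf.dump(prog)                       -- disassemble the generated instructions
--   bpf.socket('lo', prog)               -- attach as a socket filter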
2 | -- 3 | -- The code generation phase is currently one-pass and produces: 4 | -- * Compiled code in eBPF bytecode format (https://www.kernel.org/doc/Documentation/networking/filter.txt) 5 | -- * Variables with liveness analysis and other meta (spill information, compile-time value) 6 | -- 7 | -- The code generator optimises as much as possible in single pass: 8 | -- * Fold compile-time expressions and constant propagation 9 | -- * Basic control flow analysis with dead code elimination (based on compile-time expressions) 10 | -- * Single-pass optimistic register allocation 11 | -- 12 | -- The first pass doesn't have variable lifetime visibility yet, so it relies on rewriter for further 13 | -- optimisations such as: 14 | -- * Dead store elimination (first-pass doesn't know if/when the variable is going to be used) 15 | -- * Common sub-expression elimination (relies on DCE and liveness analysis) 16 | -- * Orphan JMP elimination (removing this in first pass would break previous JMP targets) 17 | -- * Better register allocation (needs to be recomputed after optimisations) 18 | 19 | local ffi = require('ffi') 20 | local bit = require('bit') 21 | local band = bit.band 22 | local S = require('syscall') 23 | local c, t = S.c, S.t 24 | local bytecode = require('bpf.ljbytecode') 25 | local cdef = require('bpf.cdef') 26 | local proto = require('bpf.proto') 27 | local builtins = require('bpf.builtins') 28 | 29 | -- Constants 30 | local ALWAYS, NEVER = -1, -2 31 | local BPF, CMD = ffi.typeof('struct bpf'), ffi.typeof('struct bpf_cmd') 32 | local HELPER = ffi.typeof('struct bpf_func_id') 33 | 34 | -- Symbolic table of constant expressions over numbers 35 | local const_expr = { 36 | ADD = function (a, b) return a + b end, 37 | SUB = function (a, b) return a - b end, 38 | DIV = function (a, b) return a / b end, 39 | MOD = function (a, b) return a % b end, 40 | JEQ = function (a, b) return a == b end, 41 | JNE = function (a, b) return a ~= b end, 42 | JGE = function (a, b) return a >= b end, 43 | JGT = function (a, b) return a > b end, 44 | } 45 | local const_width = { 46 | [1] = BPF.B, [2] = BPF.H, [4] = BPF.W, [8] = BPF.DW, 47 | } 48 | 49 | -- Built-ins that are strict only (never compile-time expandable) 50 | local builtins_strict = { 51 | [ffi.new] = true, 52 | [print] = true, 53 | } 54 | 55 | -- Return struct member size/type (requires LuaJIT 2.1+) 56 | -- I am ashamed that there's no easier way around it. 
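-- For illustration, a hypothetical struct (not part of this module) and what the walk
-- below computes; a sketch assuming LuaJIT 2.1+, where ffi.typeinfo is available:
--
--   ffi.cdef 'struct pair { uint32_t a, b; }'
--   local w, wtype = sizeofattr(ffi.typeof('struct pair'), 'b')
--   -- w == 4, wtype == uint32_t: the member size is derived from the offset delta
--   -- to the next sibling (or to the struct end for the last member), and the type
--   -- is guessed from that width via builtins.width_type().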
57 | local function sizeofattr(ct, name) 58 | if not ffi.typeinfo then error('LuaJIT 2.1+ is required for ffi.typeinfo') end 59 | local cinfo = ffi.typeinfo(ct) 60 | while true do 61 | cinfo = ffi.typeinfo(cinfo.sib) 62 | if not cinfo then return end 63 | if cinfo.name == name then break end 64 | end 65 | local size = math.max(1, ffi.typeinfo(cinfo.sib or ct).size - cinfo.size) 66 | -- Guess type name 67 | return size, builtins.width_type(size) 68 | end 69 | 70 | -- Return true if the constant part is a proxy 71 | local function is_proxy(x) 72 | return type(x) == 'table' and (x.__dissector or x.__map or x.__base) 73 | end 74 | 75 | -- Create compiler closure 76 | local function create_emitter(env, stackslots, params, param_types) 77 | 78 | local V = {} -- Variable tracking / register allocator 79 | local code = { -- Generated code 80 | pc = 0, bc_pc = 0, 81 | insn = ffi.new('struct bpf_insn[4096]'), 82 | fixup = {}, 83 | reachable = true, 84 | seen_cmp = nil, 85 | } 86 | local Vstate = {} -- Track variable layout at basic block exits 87 | 88 | -- Anything below this stack offset is free to use by caller 89 | -- @note: There is no tracking memory allocator, so the caller may 90 | -- lower it for persistent objects, but such memory will never 91 | -- be reclaimed and the caller is responsible for resetting stack 92 | -- top whenever the memory below is free to be reused 93 | local stack_top = (stackslots + 1) * ffi.sizeof('uint64_t') 94 | 95 | local function emit(op, dst, src, off, imm) 96 | local ins = code.insn[code.pc] 97 | ins.code = op 98 | ins.dst_reg = dst 99 | ins.src_reg = src 100 | ins.off = off 101 | ins.imm = imm 102 | code.pc = code.pc + 1 103 | end 104 | 105 | local function reg_spill(var) 106 | local vinfo = V[var] 107 | vinfo.spill = (var + 1) * ffi.sizeof('uint64_t') -- Index by (variable number) * (register width) 108 | emit(BPF.MEM + BPF.STX + BPF.DW, 10, vinfo.reg, -vinfo.spill, 0) 109 | vinfo.reg = nil 110 | end 111 | 112 | local function reg_fill(var, reg) 113 | local vinfo = V[var] 114 | assert(vinfo.spill, 'attempt to fill register with a VAR that isn\'t spilled') 115 | emit(BPF.MEM + BPF.LDX + BPF.DW, reg, 10, -vinfo.spill, 0) 116 | vinfo.reg = reg 117 | vinfo.spill = nil 118 | end 119 | 120 | -- Allocate a register (lazy simple allocator) 121 | local function reg_alloc(var, reg) 122 | -- Specific register requested, must spill/move existing variable 123 | if reg then 124 | for k,v in pairs(V) do -- Spill any variable that has this register 125 | if v.reg == reg and not v.shadow then 126 | reg_spill(k) 127 | break 128 | end 129 | end 130 | return reg 131 | end 132 | -- Find free or least recently used slot 133 | local last, last_seen, used = nil, 0xffff, 0 134 | for k,v in pairs(V) do 135 | if v.reg then 136 | if not v.live_to or v.live_to < last_seen then 137 | last, last_seen = k, v.live_to or last_seen 138 | end 139 | used = bit.bor(used, bit.lshift(1, v.reg)) 140 | end 141 | end 142 | -- Attempt to select a free register from R7-R9 (callee saved) 143 | local free = bit.bnot(used) 144 | if bit.band(free, 0x80) ~= 0 then reg = 7 145 | elseif bit.band(free,0x100) ~= 0 then reg = 8 146 | elseif bit.band(free,0x200) ~= 0 then reg = 9 147 | end 148 | -- Select another variable to be spilled 149 | if not reg then 150 | assert(last) 151 | reg = V[last].reg 152 | reg_spill(last) 153 | end 154 | assert(reg, 'VAR '..var..'fill/spill failed') 155 | return reg 156 | end 157 | 158 | -- Set new variable 159 | local function vset(var, reg, const, vtype) 160 | -- Must materialise 
all variables shadowing this variable slot, as it will be overwritten 161 | if V[var] and V[var].reg then 162 | for k, vinfo in pairs(V) do 163 | -- Shadowing variable MUST share the same type and attributes, 164 | -- but the register assignment may have changed 165 | if vinfo.shadow == var then 166 | vinfo.reg = V[var].reg 167 | vinfo.shadow = nil 168 | end 169 | end 170 | end 171 | -- Get precise type for CDATA or attempt to narrow numeric constant 172 | if not vtype and type(const) == 'cdata' then vtype = ffi.typeof(const) end 173 | V[var] = {reg=reg, const=const, type=vtype} 174 | end 175 | 176 | -- Materialize (or register) a variable in a register 177 | -- If the register is nil, then the a new register is assigned (if not already assigned) 178 | local function vreg(var, reg, reserve, vtype) 179 | local vinfo = V[var] 180 | assert(vinfo, 'VAR '..var..' not registered') 181 | vinfo.live_to = code.pc-1 182 | if (vinfo.reg and not reg) and not vinfo.shadow then return vinfo.reg end 183 | reg = reg_alloc(var, reg) 184 | -- Materialize variable shadow copy 185 | local src = vinfo 186 | while src.shadow do src = V[src.shadow] end 187 | if reserve then 188 | -- No load to register occurs 189 | elseif src.reg then 190 | emit(BPF.ALU64 + BPF.MOV + BPF.X, reg, src.reg, 0, 0) 191 | elseif src.spill then 192 | vinfo.spill = src.spill 193 | reg_fill(var, reg) 194 | elseif src.const then 195 | vtype = vtype or src.type 196 | if type(src.const) == 'table' and src.const.__base then 197 | -- Load pointer type 198 | emit(BPF.ALU64 + BPF.MOV + BPF.X, reg, 10, 0, 0) 199 | emit(BPF.ALU64 + BPF.ADD + BPF.K, reg, 0, 0, -src.const.__base) 200 | elseif type(src.const) == 'table' and src.const.__dissector then 201 | -- Load dissector offset (imm32), but keep the constant part (dissector proxy) 202 | emit(BPF.ALU64 + BPF.MOV + BPF.K, reg, 0, 0, src.const.off or 0) 203 | elseif vtype and ffi.sizeof(vtype) == 8 then 204 | -- IMM64 must be done in two instructions with imm64 = (lo(imm32), hi(imm32)) 205 | emit(BPF.LD + BPF.DW, reg, 0, 0, ffi.cast('uint32_t', src.const)) 206 | emit(0, 0, 0, 0, ffi.cast('uint32_t', bit.rshift(bit.rshift(src.const, 16), 16))) 207 | vinfo.const = nil -- The variable is live 208 | else 209 | emit(BPF.ALU64 + BPF.MOV + BPF.K, reg, 0, 0, src.const) 210 | vinfo.const = nil -- The variable is live 211 | end 212 | else assert(false, 'VAR '..var..' 
has neither register nor constant value') end 213 | vinfo.reg = reg 214 | vinfo.shadow = nil 215 | vinfo.live_from = code.pc-1 216 | vinfo.type = vtype or vinfo.type 217 | return reg 218 | end 219 | 220 | -- Copy variable 221 | local function vcopy(dst, src) 222 | if dst == src then return end 223 | V[dst] = {reg=V[src].reg, const=V[src].const, shadow=src, source=V[src].source, type=V[src].type} 224 | end 225 | 226 | -- Dereference variable of pointer type 227 | local function vderef(dst_reg, src_reg, vtype) 228 | -- Dereference map pointers for primitive types 229 | -- BPF doesn't allow pointer arithmetics, so use the entry value 230 | local w = ffi.sizeof(vtype) 231 | assert(const_width[w], 'NYI: sizeof('..tostring(vtype)..') not 1/2/4/8 bytes') 232 | if dst_reg ~= src_reg then 233 | emit(BPF.ALU64 + BPF.MOV + BPF.X, dst_reg, src_reg, 0, 0) -- dst = src 234 | end 235 | emit(BPF.JMP + BPF.JEQ + BPF.K, src_reg, 0, 1, 0) -- if (src != NULL) 236 | emit(BPF.MEM + BPF.LDX + const_width[w], dst_reg, src_reg, 0, 0) -- dst = *src; 237 | end 238 | 239 | -- Allocate a space for variable 240 | local function valloc(size, blank) 241 | local base = stack_top 242 | assert(stack_top + size < 512 * 1024, 'exceeded maximum stack size of 512kB') 243 | stack_top = stack_top + size 244 | -- Align to 8 byte boundary 245 | stack_top = math.ceil(stack_top/8)*8 246 | -- Current kernel version doesn't support ARG_PTR_TO_RAW_STACK 247 | -- so we always need to have memory initialized, remove this when supported 248 | if blank then 249 | if type(blank) == 'string' then 250 | local sp = 0 251 | while sp < size do 252 | -- TODO: no BPF_ST + BPF_DW instruction yet 253 | local as_u32 = ffi.new('uint32_t [1]') 254 | local sub = blank:sub(sp+1, sp+ffi.sizeof(as_u32)) 255 | ffi.copy(as_u32, sub, #sub) 256 | emit(BPF.MEM + BPF.ST + BPF.W, 10, 0, -(stack_top-sp), as_u32[0]) 257 | sp = sp + ffi.sizeof(as_u32) 258 | end 259 | elseif type(blank) == 'boolean' then 260 | reg_alloc(stackslots, 0) 261 | emit(BPF.ALU64 + BPF.MOV + BPF.K, 0, 0, 0, 0) 262 | for sp = base+8,stack_top,8 do 263 | emit(BPF.MEM + BPF.STX + BPF.DW, 10, 0, -sp, 0) 264 | end 265 | else error('NYI: will with unknown type '..type(blank)) end 266 | end 267 | return stack_top 268 | end 269 | 270 | -- Emit compensation code at the end of basic block to unify variable set layout on all block exits 271 | -- 1. we need to free registers by spilling 272 | -- 2. 
fill registers to match other exits from this BB 273 | local function bb_end(Vcomp) 274 | for i,v in pairs(V) do 275 | if Vcomp[i] and Vcomp[i].spill and not v.spill then 276 | reg_spill(i) 277 | end 278 | end 279 | for i,v in pairs(V) do 280 | if Vcomp[i] and Vcomp[i].reg and not v.reg then 281 | vreg(i, Vcomp[i].reg) 282 | end 283 | end 284 | end 285 | 286 | local function LD_ABS(dst, off, w) 287 | local dst_reg = vreg(dst, 0, true, builtins.width_type(w)) -- Reserve R0 288 | -- assert(w < 8, 'NYI: LD_ABS64 is not supported') -- IMM64 has two IMM32 insns fused together 289 | emit(BPF.LD + BPF.ABS + const_width[w], dst_reg, 0, 0, off) 290 | end 291 | 292 | local function LD_IND(dst, src, w, off) 293 | local src_reg = vreg(src) -- Must materialize first in case dst == src 294 | local dst_reg = vreg(dst, 0, true, builtins.width_type(w)) -- Reserve R0 295 | emit(BPF.LD + BPF.IND + const_width[w], dst_reg, src_reg, 0, off or 0) 296 | end 297 | 298 | local function LD_FIELD(a, d, w, imm) 299 | if imm then 300 | LD_ABS(a, imm, w) 301 | else 302 | LD_IND(a, d, w) 303 | end 304 | end 305 | 306 | -- @note: This is specific now as it expects registers reserved 307 | local function LD_IMM_X(dst_reg, src_type, imm, w) 308 | if w == 8 then -- IMM64 must be done in two instructions with imm64 = (lo(imm32), hi(imm32)) 309 | emit(BPF.LD + const_width[w], dst_reg, src_type, 0, ffi.cast('uint32_t', imm)) 310 | -- Must shift in two steps as bit.lshift supports [0..31] 311 | emit(0, 0, 0, 0, ffi.cast('uint32_t', bit.lshift(bit.lshift(imm, 16), 16))) 312 | else 313 | emit(BPF.LD + const_width[w], dst_reg, src_type, 0, imm) 314 | end 315 | end 316 | 317 | local function LOAD(dst, src, off, vtype) 318 | local base = V[src].const 319 | assert(base.__dissector, 'NYI: load() on variable that doesnt have dissector') 320 | -- Cast to different type if requested 321 | vtype = vtype or base.__dissector 322 | local w = ffi.sizeof(vtype) 323 | assert(w <= 4, 'NYI: load() supports 1/2/4 bytes at a time only') 324 | if base.off then -- Absolute address to payload 325 | LD_ABS(dst, off + base.off, w) 326 | else -- Indirect address to payload 327 | LD_IND(dst, src, w, off) 328 | end 329 | V[dst].type = vtype 330 | V[dst].const = nil -- Dissected value is not constant anymore 331 | end 332 | 333 | local function CMP_STR(a, b, op) 334 | assert(op == 'JEQ' or op == 'JNE', 'NYI: only equivallence stack/string only supports == or ~=') 335 | -- I have no better idea how to implement it than unrolled XOR loop, as we can fixup only one JMP 336 | -- So: X(a,b) = a[0] ^ b[0] | a[1] ^ b[1] | ... 
337 | -- EQ(a,b) <=> X == 0 338 | -- This could be optimised by placing early exits by rewriter in second phase for long strings 339 | local base, size = V[a].const.__base, math.min(#b, ffi.sizeof(V[a].type)) 340 | local acc, tmp = reg_alloc(stackslots, 0), reg_alloc(stackslots+1, 1) 341 | local sp = 0 342 | emit(BPF.ALU64 + BPF.MOV + BPF.K, acc, 0, 0, 0) 343 | while sp < size do 344 | -- Load string chunk as imm32 345 | local as_u32 = ffi.new('uint32_t [1]') 346 | local sub = b:sub(sp+1, sp+ffi.sizeof(as_u32)) 347 | ffi.copy(as_u32, sub, #sub) 348 | -- TODO: make this faster by interleaved load/compare steps with DW length 349 | emit(BPF.MEM + BPF.LDX + BPF.W, tmp, 10, -(base-sp), 0) 350 | emit(BPF.ALU64 + BPF.XOR + BPF.K, tmp, 0, 0, as_u32[0]) 351 | emit(BPF.ALU64 + BPF.OR + BPF.X, acc, tmp, 0, 0) 352 | sp = sp + ffi.sizeof(as_u32) 353 | end 354 | emit(BPF.JMP + BPF[op] + BPF.K, acc, 0, 0xffff, 0) 355 | code.seen_cmp = code.pc-1 356 | end 357 | 358 | local function CMP_REG(a, b, op) 359 | -- Fold compile-time expressions 360 | if V[a].const and V[b].const and not (is_proxy(V[a].const) or is_proxy(V[b].const)) then 361 | code.seen_cmp = const_expr[op](V[a].const, V[b].const) and ALWAYS or NEVER 362 | else 363 | -- Comparison against compile-time string or stack memory 364 | if V[b].const and type(V[b].const) == 'string' then 365 | return CMP_STR(a, V[b].const, op) 366 | end 367 | -- The 0xFFFF target here has no significance, it's just a placeholder for 368 | -- compiler to replace it's absolute offset to LJ bytecode insn with a relative 369 | -- offset in BPF program code, verifier will accept only programs with valid JMP targets 370 | local a_reg, b_reg = vreg(a), vreg(b) 371 | -- Migrate operands from R0-5 as it will be spilled in compensation code when JMP out of BB 372 | if a_reg == 0 then a_reg = vreg(a, 7) end 373 | emit(BPF.JMP + BPF[op] + BPF.X, a_reg, b_reg, 0xffff, 0) 374 | code.seen_cmp = code.pc-1 375 | end 376 | end 377 | 378 | local function CMP_IMM(a, b, op) 379 | if V[a].const and not is_proxy(V[a].const) then -- Fold compile-time expressions 380 | code.seen_cmp = const_expr[op](V[a].const, b) and ALWAYS or NEVER 381 | else 382 | -- Convert imm32 to number 383 | if type(b) == 'string' then 384 | if #b == 1 then b = b:byte() 385 | elseif cdef.isptr(V[a].type) then 386 | -- String comparison between stack/constant string 387 | return CMP_STR(a, b, op) 388 | elseif #b <= 4 then 389 | -- Convert to u32 with network byte order 390 | local imm = ffi.new('uint32_t[1]') 391 | ffi.copy(imm, b, #b) 392 | b = builtins.hton(imm[0]) 393 | else error('NYI: compare register with string, where #string > sizeof(u32)') end 394 | end 395 | -- The 0xFFFF target here has no significance, it's just a placeholder for 396 | -- compiler to replace it's absolute offset to LJ bytecode insn with a relative 397 | -- offset in BPF program code, verifier will accept only programs with valid JMP targets 398 | local reg = vreg(a) 399 | -- Migrate operands from R0-5 as it will be spilled in compensation code when JMP out of BB 400 | if reg == 0 then reg = vreg(a, 7) end 401 | emit(BPF.JMP + BPF[op] + BPF.K, reg, 0, 0xffff, b) 402 | code.seen_cmp = code.pc-1 403 | end 404 | end 405 | 406 | local function ALU_IMM(dst, a, b, op) 407 | -- Fold compile-time expressions 408 | if V[a].const and not is_proxy(V[a].const) then 409 | assert(type(V[a].const) == 'number', 'VAR '..a..' 
must be numeric') 410 | vset(dst, nil, const_expr[op](V[a].const, b)) 411 | -- Now we need to materialize dissected value at DST, and add it 412 | else 413 | vcopy(dst, a) 414 | local dst_reg = vreg(dst) 415 | if cdef.isptr(V[a].type) then 416 | vderef(dst_reg, dst_reg, V[a].const.__dissector) 417 | V[dst].type = V[a].const.__dissector 418 | else 419 | V[dst].type = V[a].type 420 | end 421 | emit(BPF.ALU64 + BPF[op] + BPF.K, dst_reg, 0, 0, b) 422 | end 423 | end 424 | 425 | local function ALU_REG(dst, a, b, op) 426 | -- Fold compile-time expressions 427 | if V[a].const and not (is_proxy(V[a].const) or is_proxy(V[b].const)) then 428 | assert(type(V[a].const) == 'number', 'VAR '..a..' must be numeric') 429 | assert(type(V[b].const) == 'number', 'VAR '..b..' must be numeric') 430 | if type(op) == 'string' then op = const_expr[op] end 431 | vcopy(dst, a) 432 | V[dst].const = op(V[a].const, V[b].const) 433 | else 434 | local src_reg = b and vreg(b) or 0 -- SRC is optional for unary operations 435 | if b and cdef.isptr(V[b].type) then 436 | -- We have to allocate a temporary register for dereferencing to preserve 437 | -- pointer in source variable that MUST NOT be altered 438 | reg_alloc(stackslots, 2) 439 | vderef(2, src_reg, V[b].const.__dissector) 440 | src_reg = 2 441 | end 442 | vcopy(dst, a) -- DST may alias B, so copy must occur after we materialize B 443 | local dst_reg = vreg(dst) 444 | if cdef.isptr(V[a].type) then 445 | vderef(dst_reg, dst_reg, V[a].const.__dissector) 446 | V[dst].type = V[a].const.__dissector 447 | end 448 | emit(BPF.ALU64 + BPF[op] + BPF.X, dst_reg, src_reg, 0, 0) 449 | V[stackslots].reg = nil -- Free temporary registers 450 | end 451 | end 452 | 453 | 454 | local function ALU_IMM_NV(dst, a, b, op) 455 | -- Do DST = IMM(a) op VAR(b) where we can't invert because 456 | -- the registers are u64 but immediates are u32, so complement 457 | -- arithmetics wouldn't work 458 | vset(stackslots+1, nil, a) 459 | ALU_REG(dst, stackslots+1, b, op) 460 | end 461 | 462 | local function BUILTIN(func, ...) 463 | local builtin_export = { 464 | -- Compiler primitives (work with variable slots, emit instructions) 465 | V=V, vreg=vreg, vset=vset, vcopy=vcopy, vderef=vderef, valloc=valloc, emit=emit, 466 | reg_alloc=reg_alloc, reg_spill=reg_spill, tmpvar=stackslots, const_width=const_width, 467 | -- Extensions and helpers (use with care) 468 | LD_IMM_X = LD_IMM_X, 469 | } 470 | func(builtin_export, ...) 
471 | end 472 | 473 | local function CALL(a, b, d) 474 | assert(b-1 <= 1, 'NYI: CALL with >1 return values') 475 | -- Perform either compile-time, helper, or builtin 476 | local func = V[a].const 477 | -- Gather all arguments and check if they're constant 478 | local args, const, nargs = {}, true, d - 1 479 | for i = a+1, a+d-1 do 480 | table.insert(args, V[i].const) 481 | if not V[i].const or is_proxy(V[i].const) then const = false end 482 | end 483 | local builtin = builtins[func] 484 | if not const or nargs == 0 then 485 | if builtin and type(builtin) == 'function' then 486 | args = {a} 487 | for i = a+1, a+nargs do table.insert(args, i) end 488 | BUILTIN(builtin, unpack(args)) 489 | elseif V[a+2] and V[a+2].const then -- var OP imm 490 | ALU_IMM(a, a+1, V[a+2].const, builtin) 491 | elseif nargs <= 2 then -- var OP var 492 | ALU_REG(a, a+1, V[a+2] and a+2, builtin) 493 | else 494 | error('NYI: CALL non-builtin with 3 or more arguments') 495 | end 496 | -- Call on dissector implies slice retrieval 497 | elseif type(func) == 'table' and func.__dissector then 498 | assert(nargs >= 2, 'NYI: .slice(a, b) must have at least two arguments') 499 | assert(V[a+1].const and V[a+2].const, 'NYI: slice() arguments must be constant') 500 | local off = V[a+1].const 501 | local vtype = builtins.width_type(V[a+2].const - off) 502 | LOAD(a, a, off, vtype) 503 | -- Strict builtins cannot be expanded on compile-time 504 | elseif builtins_strict[func] and builtin then 505 | args = {a} 506 | for i = a+1, a+nargs do table.insert(args, i) end 507 | BUILTIN(builtin, unpack(args)) 508 | -- Attempt compile-time call expansion (expects all argument compile-time known) 509 | else 510 | V[a].const = func(unpack(args)) 511 | end 512 | end 513 | 514 | local function MAP_INIT(map_var, key, imm) 515 | local map = V[map_var].const 516 | vreg(map_var, 1, true, ffi.typeof('uint64_t')) 517 | -- Reserve R1 and load ptr for process-local map fd 518 | LD_IMM_X(1, BPF.PSEUDO_MAP_FD, map.fd, ffi.sizeof(V[map_var].type)) 519 | V[map_var].reg = nil -- R1 will be invalidated after CALL, forget register allocation 520 | -- Reserve R2 and load R2 = key pointer 521 | local key_size = ffi.sizeof(map.key_type) 522 | local w = const_width[key_size] or BPF.DW 523 | local pod_type = const_width[key_size] 524 | local sp = stack_top + key_size -- Must use stack below spill slots 525 | -- Store immediate value on stack 526 | reg_alloc(stackslots, 2) -- Spill anything in R2 (unnamed tmp variable) 527 | local key_base = key and V[key].const 528 | imm = imm or key_base 529 | if imm and (not key or not is_proxy(key_base)) then 530 | assert(pod_type, 'NYI: map[const K], K width must be 1/2/4/8') 531 | emit(BPF.MEM + BPF.ST + w, 10, 0, -sp, imm) 532 | -- Key is in register, spill it 533 | elseif V[key].reg and pod_type then 534 | if cdef.isptr(V[key].type) then 535 | -- There is already pointer in register, dereference before spilling 536 | emit(BPF.MEM + BPF.LDX + w, 2, V[key].reg, 0, 0) 537 | emit(BPF.MEM + BPF.STX + w, 10, 2, -sp, 0) 538 | else -- Variable in register is POD, spill it on the stack 539 | emit(BPF.MEM + BPF.STX + w, 10, V[key].reg, -sp, 0) 540 | end 541 | -- Key is spilled from register to stack 542 | elseif V[key].spill then 543 | sp = V[key].spill 544 | -- Key is already on stack, write to base-relative address 545 | elseif key_base.__base then 546 | assert(key_size == ffi.sizeof(V[key].type), 'VAR '..key..' type incompatible with BPF map key type') 547 | sp = key_base.__base 548 | else 549 | error('VAR '..key..' 
is neither const-expr/register/stack/spilled') 550 | end 551 | -- If [FP+K] addressing, emit it 552 | if sp then 553 | emit(BPF.ALU64 + BPF.MOV + BPF.X, 2, 10, 0, 0) 554 | emit(BPF.ALU64 + BPF.ADD + BPF.K, 2, 0, 0, -sp) 555 | end 556 | end 557 | 558 | local function MAP_GET(dst, map_var, key, imm) 559 | local map = V[map_var].const 560 | MAP_INIT(map_var, key, imm) 561 | -- Flag as pointer type and associate dissector for map value type 562 | vreg(dst, 0, true, ffi.typeof('uint8_t *')) 563 | V[dst].const = {__dissector=map.val_type} 564 | emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.map_lookup_elem) 565 | V[stackslots].reg = nil -- Free temporary registers 566 | end 567 | 568 | local function MAP_DEL(map_var, key, key_imm) 569 | -- Set R0, R1 (map fd, preempt R0) 570 | reg_alloc(stackslots, 0) -- Spill anything in R0 (unnamed tmp variable) 571 | MAP_INIT(map_var, key, key_imm) 572 | emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.map_delete_elem) 573 | V[stackslots].reg = nil -- Free temporary registers 574 | end 575 | 576 | local function MAP_SET(map_var, key, key_imm, src) 577 | local map = V[map_var].const 578 | -- Delete when setting nil 579 | if V[src].type == ffi.typeof('void') then 580 | return MAP_DEL(map_var, key, key_imm) 581 | end 582 | -- Set R0, R1 (map fd, preempt R0) 583 | reg_alloc(stackslots, 0) -- Spill anything in R0 (unnamed tmp variable) 584 | MAP_INIT(map_var, key, key_imm) 585 | reg_alloc(stackslots, 4) -- Spill anything in R4 (unnamed tmp variable) 586 | emit(BPF.ALU64 + BPF.MOV + BPF.K, 4, 0, 0, 0) -- BPF_ANY, create new element or update existing 587 | -- Reserve R3 for value pointer 588 | local val_size = ffi.sizeof(map.val_type) 589 | local w = const_width[val_size] or BPF.DW 590 | local pod_type = const_width[val_size] 591 | -- Stack pointer must be aligned to both key/value size and have enough headroom for (key, value) 592 | local sp = stack_top + ffi.sizeof(map.key_type) + val_size 593 | sp = sp + (sp % val_size) 594 | local base = V[src].const 595 | if base and not is_proxy(base) then 596 | assert(pod_type, 'NYI: MAP[K] = imm V; V width must be 1/2/4/8') 597 | emit(BPF.MEM + BPF.ST + w, 10, 0, -sp, base) 598 | -- Value is in register, spill it 599 | elseif V[src].reg and pod_type then 600 | emit(BPF.MEM + BPF.STX + w, 10, V[src].reg, -sp, 0) 601 | -- We get a pointer to spilled register on stack 602 | elseif V[src].spill then 603 | -- If variable is a pointer, we can load it to R3 directly (save "LEA") 604 | if cdef.isptr(V[src].type) then 605 | reg_fill(src, 3) 606 | emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.map_update_elem) 607 | return 608 | else 609 | sp = V[src].spill 610 | end 611 | -- Value is already on stack, write to base-relative address 612 | elseif base.__base then 613 | assert(val_size == ffi.sizeof(V[src].type), 'VAR '..src..' type incompatible with BPF map value type') 614 | sp = base.__base 615 | -- Value is constant, materialize it on stack 616 | else 617 | error('VAR '..src..' is neither const-expr/register/stack/spilled') 618 | end 619 | reg_alloc(stackslots, 3) -- Spill anything in R3 (unnamed tmp variable) 620 | emit(BPF.ALU64 + BPF.MOV + BPF.X, 3, 10, 0, 0) 621 | emit(BPF.ALU64 + BPF.ADD + BPF.K, 3, 0, 0, -sp) 622 | emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.map_update_elem) 623 | V[stackslots].reg = nil -- Free temporary registers 624 | end 625 | 626 | -- Finally - this table translates LuaJIT bytecode into code emitter actions.
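-- For example, a chunk like 'local a = x + 1' decodes (roughly) to GGET followed by
-- ADDVN, and the emitter below is driven one opcode at a time through its __call
-- metamethod; operand values here are illustrative, not taken from a real decode:
--
--   local E = create_emitter(env, stackslots, params, param_types)
--   E('GGET', 0, 0, 'x', 0)    -- slot 0 = env['x'] (errors if 'x' is undefined)
--   E('ADDVN', 0, 0, 1, 0)     -- slot 0 = slot 0 + #1, folded if 'x' is compile-time
--   local code = E:compile()   -- emitted eBPF instructions plus fixup metadata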
627 | local BC = { 628 | -- Constants 629 | KNUM = function(a, _, c, _) -- KNUM 630 | vset(a, nil, c, ffi.typeof('int32_t')) -- TODO: only 32bit immediates are supported now 631 | end, 632 | KSHORT = function(a, _, _, d) -- KSHORT 633 | vset(a, nil, d, ffi.typeof('int16_t')) 634 | end, 635 | KPRI = function(a, _, _, d) -- KPRI 636 | -- KNIL is 0, must create a special type to identify it 637 | local vtype = (d < 1) and ffi.typeof('void') or ffi.typeof('uint8_t') 638 | vset(a, nil, (d < 2) and 0 or 1, vtype) 639 | end, 640 | KSTR = function(a, _, c, _) -- KSTR 641 | vset(a, nil, c, ffi.typeof('const char[?]')) 642 | end, 643 | MOV = function(a, _, _, d) -- MOV var, var 644 | vcopy(a, d) 645 | end, 646 | 647 | -- Comparison ops 648 | -- Note: comparisons are always followed by JMP opcode, that 649 | -- will fuse following JMP to JMP+CMP instruction in BPF 650 | -- Note: we're narrowed to integers, so operand/operator inversion is legit 651 | ISLT = function(a, _, _, d) return CMP_REG(d, a, 'JGE') end, -- (a < d) (inverted) 652 | ISGE = function(a, _, _, d) return CMP_REG(a, d, 'JGE') end, -- (a >= d) 653 | ISGT = function(a, _, _, d) return CMP_REG(a, d, 'JGT') end, -- (a > d) 654 | ISEQV = function(a, _, _, d) return CMP_REG(a, d, 'JEQ') end, -- (a == d) 655 | ISNEV = function(a, _, _, d) return CMP_REG(a, d, 'JNE') end, -- (a ~= d) 656 | ISEQS = function(a, _, c, _) return CMP_IMM(a, c, 'JEQ') end, -- (a == str(c)) 657 | ISNES = function(a, _, c, _) return CMP_IMM(a, c, 'JNE') end, -- (a ~= str(c)) 658 | ISEQN = function(a, _, c, _) return CMP_IMM(a, c, 'JEQ') end, -- (a == c) 659 | ISNEN = function(a, _, c, _) return CMP_IMM(a, c, 'JNE') end, -- (a ~= c) 660 | IST = function(_, _, _, d) return CMP_IMM(d, 0, 'JNE') end, -- (d) 661 | ISF = function(_, _, _, d) return CMP_IMM(d, 0, 'JEQ') end, -- (not d) 662 | ISEQP = function(a, _, c, _) return CMP_IMM(a, c, 'JEQ') end, -- ISEQP (a == c) 663 | -- Binary operations with RHS constants 664 | ADDVN = function(a, b, c, _) return ALU_IMM(a, b, c, 'ADD') end, 665 | SUBVN = function(a, b, c, _) return ALU_IMM(a, b, c, 'SUB') end, 666 | MULVN = function(a, b, c, _) return ALU_IMM(a, b, c, 'MUL') end, 667 | DIVVN = function(a, b, c, _) return ALU_IMM(a, b, c, 'DIV') end, 668 | MODVN = function(a, b, c, _) return ALU_IMM(a, b, c, 'MOD') end, 669 | -- Binary operations with LHS constants 670 | -- Cheat code: we're narrowed to integer arithmetic, so MUL+ADD are commutative 671 | ADDNV = function(a, b, c, _) return ALU_IMM(a, b, c, 'ADD') end, -- ADDNV 672 | MULNV = function(a, b, c, _) return ALU_IMM(a, b, c, 'MUL') end, -- MULNV 673 | SUBNV = function(a, b, c, _) return ALU_IMM_NV(a, c, b, 'SUB') end, -- SUBNV 674 | DIVNV = function(a, b, c, _) return ALU_IMM_NV(a, c, b, 'DIV') end, -- DIVNV 675 | -- Binary operations between registers 676 | ADDVV = function(a, b, _, d) return ALU_REG(a, b, d, 'ADD') end, 677 | SUBVV = function(a, b, _, d) return ALU_REG(a, b, d, 'SUB') end, 678 | MULVV = function(a, b, _, d) return ALU_REG(a, b, d, 'MUL') end, 679 | DIVVV = function(a, b, _, d) return ALU_REG(a, b, d, 'DIV') end, 680 | MODVV = function(a, b, _, d) return ALU_REG(a, b, d, 'MOD') end, 681 | -- Strings 682 | CAT = function(a, b, _, d) -- CAT A = B ~ D 683 | assert(V[b].const and V[d].const, 'NYI: CAT only works on compile-time expressions') 684 | assert(type(V[b].const) == 'string' and type(V[d].const) == 'string', 685 | 'NYI: CAT only works on compile-time strings') 686 | vset(a, nil, V[b].const .. 
V[d].const) 687 | end, 688 | -- Tables 689 | GGET = function (a, _, c, _) -- GGET (A = GLOBAL[c]) 690 | if env[c] ~= nil then 691 | vset(a, nil, env[c]) 692 | else error(string.format("undefined global '%s'", c)) end 693 | end, 694 | UGET = function (a, _, c, _) -- UGET (A = UPVALUE[c]) 695 | if env[c] ~= nil then 696 | vset(a, nil, env[c]) 697 | else error(string.format("undefined upvalue '%s'", c)) end 698 | end, 699 | TGETB = function (a, b, _, d) -- TGETB (A = B[D]) 700 | if a ~= b then vset(a) end 701 | local base = V[b].const 702 | if base.__map then -- BPF map read (constant) 703 | MAP_GET(a, b, nil, d) 704 | else 705 | LOAD(a, b, d, ffi.typeof('uint8_t')) 706 | end 707 | end, 708 | TSETB = function (a, b, _, d) -- TSETB (B[D] = A) 709 | if V[b].const.__map then -- BPF map read (constant) 710 | return MAP_SET(b, nil, d, a) -- D is literal 711 | elseif V[b].const and V[b].const and V[a].const then 712 | V[b].const[V[d].const] = V[a].const 713 | else error('NYI: B[D] = A, where B is not Lua table or BPF map') 714 | end 715 | end, 716 | TSETV = function (a, b, _, d) -- TSETV (B[D] = A) 717 | if V[b].const.__map then -- BPF map read (constant) 718 | return MAP_SET(b, d, nil, a) -- D is variable 719 | elseif V[b].const and V[d].const and V[a].const then 720 | V[b].const[V[d].const] = V[a].const 721 | else error('NYI: B[D] = A, where B is not Lua table or BPF map') 722 | end 723 | end, 724 | TSETS = function (a, b, c, _) -- TSETS (B[C] = A) 725 | assert(V[b] and V[b].const, 'NYI: B[D] where B is not Lua table or BPF map') 726 | local base = V[b].const 727 | if base.__dissector then 728 | local ofs,bpos,bsize = ffi.offsetof(base.__dissector, c) 729 | assert(not bpos, 'NYI: B[C] = A, where C is a bitfield') 730 | local w = sizeofattr(base.__dissector, c) 731 | -- TODO: support vectorized moves larger than register width 732 | assert(const_width[w], 'B[C] = A, sizeof(A) must be 1/2/4/8') 733 | local src_reg = vreg(a) 734 | -- If source is a pointer, we must dereference it first 735 | if cdef.isptr(V[a].type) then 736 | local tmp_reg = reg_alloc(stackslots, 1) -- Clone variable in tmp register 737 | emit(BPF.ALU64 + BPF.MOV + BPF.X, tmp_reg, src_reg, 0, 0) 738 | vderef(tmp_reg, tmp_reg, V[a].const.__dissector) 739 | src_reg = tmp_reg -- Materialize and dereference it 740 | -- Source is a value on stack, we must load it first 741 | elseif V[a].const and V[a].const.__base > 0 then 742 | emit(BPF.MEM + BPF.LDX + const_width[w], src_reg, 10, -V[a].const.__base, 0) 743 | V[a].type = V[a].const.__dissector 744 | V[a].const = nil -- Value is dereferenced 745 | end 746 | -- If the table is not on stack, it must be checked for NULL 747 | if not base.__base then 748 | emit(BPF.JMP + BPF.JEQ + BPF.K, V[b].reg, 0, 1, 0) -- if (map[x] != NULL) 749 | emit(BPF.MEM + BPF.STX + const_width[w], V[b].reg, src_reg, ofs, 0) 750 | else -- Table is already on stack, write to base-relative address 751 | emit(BPF.MEM + BPF.STX + const_width[w], 10, src_reg, -base.__base + ofs, 0) 752 | end 753 | elseif V[a].const then 754 | base[c] = V[a].const 755 | else error('NYI: B[C] = A, where B is not Lua table or BPF map') 756 | end 757 | end, 758 | TGETV = function (a, b, _, d) -- TGETV (A = B[D]) 759 | assert(V[b] and V[b].const, 'NYI: B[D] where B is not Lua table or BPF map') 760 | if a ~= b then vset(a) end 761 | if V[b].const.__map then -- BPF map read 762 | MAP_GET(a, b, d) 763 | elseif V[b].const == env.pkt then -- Raw packet, no offset 764 | LD_FIELD(a, d, 1, V[d].const) 765 | else V[a].const = V[b].const[V[d].const] 
end 766 | end, 767 | TGETS = function (a, b, c, _) -- TGETS (A = B[C]) 768 | assert(V[b] and V[b].const, 'NYI: B[C] where C is string and B not Lua table or BPF map') 769 | local base = V[b].const 770 | if type(base) == 'table' and base.__dissector then 771 | local ofs,bpos,bsize = ffi.offsetof(base.__dissector, c) 772 | -- Resolve table key using metatable 773 | if not ofs and type(base.__dissector[c]) == 'string' then 774 | c = base.__dissector[c] 775 | ofs,bpos,bsize = ffi.offsetof(base.__dissector, c) 776 | end 777 | if not ofs and proto[c] then -- Load new dissector on given offset 778 | BUILTIN(proto[c], a, b, c) 779 | else 780 | assert(ofs, tostring(base.__dissector)..'.'..c..' attribute not exists') 781 | if a ~= b then vset(a) end 782 | -- Dissected value is probably not constant anymore 783 | local new_const = nil 784 | -- Simple register load, get absolute offset or R-relative 785 | local w, atype = sizeofattr(base.__dissector, c) 786 | if base.__base == true then -- R-relative addressing 787 | local dst_reg = vreg(a, nil, true) 788 | assert(const_width[w], 'NYI: sizeof('..tostring(base.__dissector)..'.'..c..') not 1/2/4/8 bytes') 789 | emit(BPF.MEM + BPF.LDX + const_width[w], dst_reg, V[b].reg, ofs, 0) 790 | elseif not base.source and base.__base and base.__base > 0 then -- [FP+K] addressing 791 | if cdef.isptr(atype) then -- If the member is pointer type, update base pointer with offset 792 | new_const = {__base = base.__base-ofs} 793 | else 794 | local dst_reg = vreg(a, nil, true) 795 | emit(BPF.MEM + BPF.LDX + const_width[w], dst_reg, 10, -base.__base+ofs, 0) 796 | end 797 | elseif base.off then -- Absolute address to payload 798 | LD_ABS(a, ofs + base.off, w) 799 | elseif base.source == 'probe' then -- Indirect read using probe 800 | BUILTIN(builtins[builtins.probe_read], nil, a, b, atype, ofs) 801 | V[a].source = V[b].source -- Builtin handles everything 802 | return 803 | else -- Indirect address to payload 804 | LD_IND(a, b, w, ofs) 805 | end 806 | -- Bitfield, must be further narrowed with a bitmask/shift 807 | if bpos then 808 | local mask = 0 809 | for i=bpos+1,bpos+bsize do 810 | mask = bit.bor(mask, bit.lshift(1, w*8-i)) 811 | end 812 | emit(BPF.ALU64 + BPF.AND + BPF.K, vreg(a), 0, 0, mask) 813 | -- Free optimization: single-bit values need just boolean result 814 | if bsize > 1 then 815 | local shift = w*8-bsize-bpos 816 | if shift > 0 then 817 | emit(BPF.ALU64 + BPF.RSH + BPF.K, vreg(a), 0, 0, shift) 818 | end 819 | end 820 | end 821 | V[a].type = atype 822 | V[a].const = new_const 823 | V[a].source = V[b].source 824 | end 825 | else V[a].const = base[c] end 826 | end, 827 | -- Loops and branches 828 | CALLM = function (a, b, _, d) -- A = A(A+1, ..., A+D+MULTRES) 829 | -- NYI: Support single result only 830 | CALL(a, b, d+2) 831 | end, 832 | CALL = function (a, b, _, d) -- A = A(A+1, ..., A+D-1) 833 | CALL(a, b, d) 834 | end, 835 | JMP = function (a, _, c, d) -- JMP 836 | -- Discard unused slots after jump 837 | for i, _ in pairs(V) do 838 | if i >= a then V[i] = {} end 839 | end 840 | local val = code.fixup[c] or {} 841 | if code.seen_cmp and code.seen_cmp ~= ALWAYS then 842 | if code.seen_cmp ~= NEVER then -- Do not emit the jump or fixup 843 | -- Store previous CMP insn for reemitting after compensation code 844 | local jmpi = ffi.new('struct bpf_insn', code.insn[code.pc-1]) 845 | code.pc = code.pc - 1 846 | -- First branch point, emit compensation code 847 | local Vcomp = Vstate[c] 848 | if not Vcomp then 849 | for i,v in pairs(V) do 850 | if not v.reg and 
v.const and not is_proxy(v.const) then 851 | vreg(i, 0) -- Load to TMP register (not saved) 852 | end 853 | if v.reg and v.reg <= 5 then 854 | reg_spill(i) -- Spill caller-saved registers 855 | end 856 | end 857 | -- Record variable state 858 | Vstate[c] = V 859 | V = {} 860 | for i,v in pairs(Vstate[c]) do 861 | V[i] = {} 862 | for k,e in pairs(v) do 863 | V[i][k] = e 864 | end 865 | end 866 | -- Variable state already set, emit specific compensation code 867 | else bb_end(Vcomp) end 868 | -- Reemit CMP insn 869 | emit(jmpi.code, jmpi.dst_reg, jmpi.src_reg, jmpi.off, jmpi.imm) 870 | -- Fuse JMP into previous CMP opcode, mark JMP target for fixup 871 | -- as we don't knot the relative offset in generated code yet 872 | table.insert(val, code.pc-1) 873 | code.fixup[c] = val 874 | end 875 | code.seen_cmp = nil 876 | else 877 | emit(BPF.JMP + BPF.JEQ + BPF.X, 6, 6, 0xffff, 0) -- Always true 878 | table.insert(val, code.pc-1) -- Fixup JMP target 879 | code.reachable = false -- Code following the JMP is not reachable 880 | code.fixup[c] = val 881 | end 882 | end, 883 | RET1 = function (a, _, _, _) -- RET1 884 | if V[a].reg ~= 0 then vreg(a, 0) end 885 | emit(BPF.JMP + BPF.EXIT, 0, 0, 0, 0) 886 | -- Free optimisation: spilled variable will not be filled again 887 | for _,v in pairs(V) do if v.reg == 0 then v.reg = nil end end 888 | code.reachable = false 889 | end, 890 | RET0 = function (_, _, _, _) -- RET0 891 | emit(BPF.ALU64 + BPF.MOV + BPF.K, 0, 0, 0, 0) 892 | emit(BPF.JMP + BPF.EXIT, 0, 0, 0, 0) 893 | code.reachable = false 894 | end, 895 | compile = function () 896 | return code 897 | end 898 | } 899 | -- Always initialize R6 with R1 context 900 | emit(BPF.ALU64 + BPF.MOV + BPF.X, 6, 1, 0, 0) 901 | -- Register R6 as context variable (first argument) 902 | if params and params > 0 then 903 | vset(0, 6, param_types[1] or proto.skb) 904 | end 905 | -- Register tmpvars 906 | vset(stackslots) 907 | vset(stackslots+1) 908 | return setmetatable(BC, { 909 | __index = function (t, k, v) 910 | if type(k) == 'number' then 911 | local op_str = string.sub(require('jit.vmdef').bcnames, 6*k+1, 6*k+6) 912 | error(string.format("NYI: opcode '0x%02x' (%-04s)", k, op_str)) 913 | end 914 | end, 915 | __call = function (t, op, a, b, c, d) 916 | code.bc_pc = code.bc_pc + 1 917 | -- Exitting BB straight through, emit compensation code 918 | if Vstate[code.bc_pc] and code.reachable then 919 | bb_end(Vstate[code.bc_pc]) 920 | end 921 | -- Perform fixup of jump targets 922 | -- We need to do this because the number of consumed and emited 923 | -- bytecode instructions is different 924 | local fixup = code.fixup[code.bc_pc] 925 | if fixup ~= nil then 926 | -- Patch JMP source insn with relative offset 927 | for _,pc in ipairs(fixup) do 928 | code.insn[pc].off = code.pc - 1 - pc 929 | end 930 | code.fixup[code.bc_pc] = nil 931 | code.reachable = true 932 | end 933 | -- Execute 934 | if code.reachable then 935 | assert(t[op], string.format('NYI: instruction %s, parameters: %s,%s,%s,%s', op,a,b,c,d)) 936 | return t[op](a, b, c, d) 937 | end 938 | end, 939 | }) 940 | end 941 | 942 | -- Emitted code dump 943 | local function dump_mem(cls, ins) 944 | local mode = bit.band(ins.code, 0xe0) 945 | if mode == BPF.XADD then cls = 5 end -- The only mode 946 | local op_1 = {'LD', 'LDX', 'ST', 'STX', '', 'XADD'} 947 | local op_2 = {[0]='W', [8]='H', [16]='B', [24]='DW'} 948 | local name = op_1[cls+1] .. 
op_2[bit.band(ins.code, 0x18)] 949 | local off = tonumber(ffi.cast('int16_t', ins.off)) -- Reinterpret as signed 950 | local dst = cls < 2 and 'R'..ins.dst_reg or string.format('[R%d%+d]', ins.dst_reg, off) 951 | local src = cls % 2 == 0 and '#'..ins.imm or 'R'..ins.src_reg 952 | if cls == BPF.LDX then src = string.format('[R%d%+d]', ins.src_reg, off) end 953 | if mode == BPF.ABS then src = string.format('[%d]', ins.imm) end 954 | if mode == BPF.IND then src = string.format('[R%d%+d]', ins.src_reg, ins.imm) end 955 | return string.format('%s\t%s\t%s', name, dst, src) 956 | end 957 | local function dump_alu(cls, ins, pc) 958 | local alu = {'ADD', 'SUB', 'MUL', 'DIV', 'OR', 'AND', 'LSH', 'RSH', 'NEG', 'MOD', 'XOR', 'MOV', 'ARSH', 'END' } 959 | local jmp = {'JA', 'JEQ', 'JGT', 'JGE', 'JSET', 'JNE', 'JSGT', 'JSGE', 'CALL', 'EXIT'} 960 | local helper = {'unspec', 'map_lookup_elem', 'map_update_elem', 'map_delete_elem', 'probe_read', 'ktime_get_ns', 961 | 'trace_printk', 'get_prandom_u32', 'get_smp_processor_id', 'skb_store_bytes', 962 | 'l3_csum_replace', 'l4_csum_replace', 'tail_call', 'clone_redirect', 'get_current_pid_tgid', 963 | 'get_current_uid_gid', 'get_current_comm', 'get_cgroup_classid', 'skb_vlan_push', 'skb_vlan_pop', 964 | 'skb_get_tunnel_key', 'skb_set_tunnel_key', 'perf_event_read', 'redirect', 'get_route_realm', 965 | 'perf_event_output', 'skb_load_bytes'} 966 | local op = 0 967 | for i = 0,13 do if 0x10 * i == bit.band(ins.code, 0xf0) then op = i + 1 break end end 968 | local name = (cls == 5) and jmp[op] or alu[op] 969 | local src = (bit.band(ins.code, 0x08) == BPF.X) and 'R'..ins.src_reg or '#'..ins.imm 970 | local target = (cls == 5 and op < 9) and string.format('\t=> %04d', pc + ins.off + 1) or '' 971 | if cls == 5 and op == 9 then target = string.format('\t; %s', helper[ins.imm + 1] or tostring(ins.imm)) end 972 | return string.format('%s\t%s\t%s%s', name, 'R'..ins.dst_reg, src, target) 973 | end 974 | local function dump(code) 975 | if not code then return end 976 | print(string.format('-- BPF %s:0-%u', code.insn, code.pc)) 977 | local cls_map = { 978 | [0] = dump_mem, [1] = dump_mem, [2] = dump_mem, [3] = dump_mem, 979 | [4] = dump_alu, [5] = dump_alu, [7] = dump_alu, 980 | } 981 | for i = 0, code.pc - 1 do 982 | local ins = code.insn[i] 983 | local cls = bit.band(ins.code, 0x07) 984 | print(string.format('%04u\t%s', i, cls_map[cls](cls, ins, i))) 985 | end 986 | end 987 | 988 | local function compile(prog, params) 989 | -- Create code emitter sandbox, include caller locals 990 | local env = { pkt=proto.pkt, BPF=BPF } 991 | -- Include upvalues up to 4 nested scopes back 992 | -- the narrower scope overrides broader scope 993 | for k = 5, 2, -1 do 994 | local i = 1 995 | while true do 996 | local ok, n, v = pcall(debug.getlocal, k, i) 997 | if not ok or not n then break end 998 | env[n] = v 999 | i = i + 1 1000 | end 1001 | end 1002 | setmetatable(env, { 1003 | __index = function (_, k) 1004 | return proto[k] or builtins[k] or _G[k] 1005 | end 1006 | }) 1007 | -- Create code emitter and compile LuaJIT bytecode 1008 | if type(prog) == 'string' then prog = loadstring(prog) end 1009 | -- Create error handler to print traceback 1010 | local funci, pc = bytecode.funcinfo(prog), 0 1011 | local E = create_emitter(env, funci.stackslots, funci.params, params or {}) 1012 | local on_err = function (e) 1013 | local funci = bytecode.funcinfo(prog, pc) 1014 | local from, to = 0, 0 1015 | for _ = 1, funci.currentline do 1016 | from = to 1017 | to = string.find(funci.source, '\n', 
from+1, true) or 0 1018 | end 1019 | print(funci.loc..':'..string.sub(funci.source, from+1, to-1)) 1020 | print('error: '..e) 1021 | print(debug.traceback()) 1022 | end 1023 | for _,op,a,b,c,d in bytecode.decoder(prog) do 1024 | local ok, res, err = xpcall(E,on_err,op,a,b,c,d) 1025 | if not ok then 1026 | return nil 1027 | end 1028 | end 1029 | return E:compile() 1030 | end 1031 | 1032 | -- BPF map interface 1033 | local bpf_map_mt = { 1034 | __gc = function (map) S.close(map.fd) end, 1035 | __len = function(map) return map.max_entries end, 1036 | __index = function (map, k) 1037 | if type(k) == 'string' then 1038 | -- Return iterator 1039 | if k == 'pairs' then 1040 | return function(t, key) 1041 | -- Get next key 1042 | local next_key = ffi.new(ffi.typeof(t.key)) 1043 | local cur_key 1044 | if key then 1045 | cur_key = t.key 1046 | t.key[0] = key 1047 | else 1048 | cur_key = ffi.new(ffi.typeof(t.key)) 1049 | end 1050 | local ok, err = S.bpf_map_op(c.BPF_CMD.MAP_GET_NEXT_KEY, map.fd, cur_key, next_key) 1051 | if not ok then return nil end 1052 | -- Get next value 1053 | assert(S.bpf_map_op(c.BPF_CMD.MAP_LOOKUP_ELEM, map.fd, next_key, map.val)) 1054 | return next_key[0], map.val[0] 1055 | end, map, nil 1056 | -- Read for perf event map 1057 | elseif k == 'reader' then 1058 | return function (pmap, pid, cpu, event_type) 1059 | -- Caller must either specify PID or CPU 1060 | if not pid or pid < 0 then 1061 | assert((cpu and cpu >= 0), 'NYI: creating composed reader for all CPUs') 1062 | pid = -1 1063 | end 1064 | -- Create BPF output reader 1065 | local pe = t.perf_event_attr1() 1066 | pe[0].type = 'software' 1067 | pe[0].config = 'sw_bpf_output' 1068 | pe[0].sample_type = 'raw' 1069 | pe[0].sample_period = 1 1070 | pe[0].wakeup_events = 1 1071 | local reader, err = t.perf_reader(S.perf_event_open(pe, pid, cpu or -1)) 1072 | if not reader then return nil, tostring(err) end 1073 | -- Register event reader fd in BPF map 1074 | assert(cpu < pmap.max_entries, string.format('BPF map smaller than read CPU %d', cpu)) 1075 | pmap[cpu] = reader.fd 1076 | -- Open memory map and start reading 1077 | local ok, err = reader:start() 1078 | assert(ok, tostring(err)) 1079 | ok, err = reader:mmap() 1080 | assert(ok, tostring(err)) 1081 | return cdef.event_reader(reader, event_type) 1082 | end 1083 | -- Signalise this is a map type 1084 | end 1085 | return k == '__map' 1086 | end 1087 | -- Retrieve key 1088 | map.key[0] = k 1089 | local ok, err = S.bpf_map_op(c.BPF_CMD.MAP_LOOKUP_ELEM, map.fd, map.key, map.val) 1090 | if not ok then return nil, err end 1091 | return ffi.new(map.val_type, map.val[0]) 1092 | end, 1093 | __newindex = function (map, k, v) 1094 | map.key[0] = k 1095 | if v == nil then 1096 | return S.bpf_map_op(c.BPF_CMD.MAP_DELETE_ELEM, map.fd, map.key, nil) 1097 | end 1098 | map.val[0] = v 1099 | return S.bpf_map_op(c.BPF_CMD.MAP_UPDATE_ELEM, map.fd, map.key, map.val) 1100 | end, 1101 | } 1102 | 1103 | -- Linux tracing interface 1104 | local function trace_check_enabled(path) 1105 | path = path or '/sys/kernel/debug/tracing' 1106 | if S.statfs(path) then return true end 1107 | return nil, 'debugfs not accessible: "mount -t debugfs nodev /sys/kernel/debug"? missing sudo?'
1108 | end 1109 | 1110 | -- Tracepoint interface 1111 | local tracepoint_mt = { 1112 | __index = { 1113 | bpf = function (t, prog) 1114 | if type(prog) ~= 'table' then 1115 | -- Create protocol parser with source=probe 1116 | prog = compile(prog, {proto.type(t.type, {source='probe'})}) 1117 | end 1118 | -- Load the BPF program 1119 | local prog_fd, err, log = S.bpf_prog_load(S.c.BPF_PROG.TRACEPOINT, prog.insn, prog.pc) 1120 | assert(prog_fd, tostring(err)..': '..tostring(log)) 1121 | -- Open tracepoint and attach 1122 | t.reader:setbpf(prog_fd:getfd()) 1123 | table.insert(t.progs, prog_fd) 1124 | return prog_fd 1125 | end, 1126 | } 1127 | } 1128 | -- Open tracepoint 1129 | local function tracepoint_open(path, pid, cpu, group_fd) 1130 | -- Open tracepoint and compile tracepoint type 1131 | local tp = assert(S.perf_tracepoint('/sys/kernel/debug/tracing/events/'..path)) 1132 | local tp_type = assert(cdef.tracepoint_type(path)) 1133 | -- Open tracepoint reader and create interface 1134 | local reader = assert(S.perf_attach_tracepoint(tp, pid, cpu, group_fd)) 1135 | return setmetatable({tp=tp,type=tp_type,reader=reader,progs={}}, tracepoint_mt) 1136 | end 1137 | 1138 | local function trace_bpf(tp, ptype, pname, pdef, retprobe, prog, pid, cpu, group_fd) 1139 | -- Load BPF program 1140 | local prog_fd, err, log = S.bpf_prog_load(S.c.BPF_PROG.KPROBE, prog.insn, prog.pc) 1141 | assert(prog_fd, tostring(err)..': '..tostring(log)) 1142 | -- Open tracepoint and attach 1143 | local tp, err = S.perf_probe(ptype, pname, pdef, retprobe) 1144 | if not tp then 1145 | prog_fd:close() 1146 | return nil, tostring(err) 1147 | end 1148 | local reader, err = S.perf_attach_tracepoint(tp, pid, cpu, group_fd, {sample_type='raw, callchain'}) 1149 | if not reader then 1150 | prog_fd:close() 1151 | S.perf_probe(ptype, pname, false) 1152 | return nil, tostring(err) 1153 | end 1154 | local ok, err = reader:setbpf(prog_fd:getfd()) 1155 | if not ok then 1156 | prog_fd:close() 1157 | reader:close() 1158 | S.perf_probe(ptype, pname, false) 1159 | return nil, tostring(err)..' 
(kernel version should be at least 4.1)' 1160 | end 1161 | -- Create GC closure for reader to close BPF program 1162 | -- and detach probe in correct order 1163 | ffi.gc(reader, function () 1164 | prog_fd:close() 1165 | reader:close() 1166 | S.perf_probe(ptype, pname, false) 1167 | end) 1168 | return {reader=reader, prog=prog_fd, probe=pname, probe_type=ptype} 1169 | end 1170 | 1171 | -- Module interface 1172 | return setmetatable({ 1173 | new = create_emitter, 1174 | dump = dump, 1175 | prog = PROG, 1176 | maps = {}, 1177 | map = function (type, max_entries, key_ctype, val_ctype) 1178 | if not key_ctype then key_ctype = ffi.typeof('uint32_t') end 1179 | if not val_ctype then val_ctype = ffi.typeof('uint32_t') end 1180 | if not max_entries then max_entries = 4096 end 1181 | -- Special case for BPF_MAP_STACK_TRACE 1182 | if c.BPF_MAP[type] == c.BPF_MAP.STACK_TRACE then 1183 | key_ctype = ffi.typeof('int32_t') 1184 | val_ctype = ffi.typeof('struct bpf_stacktrace') 1185 | end 1186 | local fd, err = S.bpf_map_create(c.BPF_MAP[type], ffi.sizeof(key_ctype), ffi.sizeof(val_ctype), max_entries) 1187 | if not fd then return nil, tostring(err) end 1188 | local map = setmetatable({ 1189 | max_entries = max_entries, 1190 | key = ffi.new(ffi.typeof('$ [1]', key_ctype)), 1191 | val = ffi.new(ffi.typeof('$ [1]', val_ctype)), 1192 | map_type = c.BPF_MAP[type], 1193 | key_type = key_ctype, 1194 | val_type = val_ctype, 1195 | fd = fd:nogc():getfd(), 1196 | }, bpf_map_mt) 1197 | return map 1198 | end, 1199 | socket = function (sock, prog) 1200 | -- Expect socket type, if sock is string then assume it's 1201 | -- an interface name (e.g. 'lo'), if it's a number then typecast it as a socket 1202 | local ok, err 1203 | if type(sock) == 'string' then 1204 | local iface = assert(S.nl.getlink())[sock] 1205 | assert(iface, sock..' 
is not interface name') 1206 | sock, err = S.socket('packet', 'raw') 1207 | assert(sock, tostring(err)) 1208 | ok, err = sock:bind(S.t.sockaddr_ll({protocol='all', ifindex=iface.index})) 1209 | assert(ok, tostring(err)) 1210 | elseif type(sock) == 'number' then 1211 | sock = assert(S.t.socket(sock)) 1212 | end 1213 | -- Load program and attach it to socket 1214 | local prog_fd, err, log = S.bpf_prog_load(S.c.BPF_PROG.SOCKET_FILTER, prog.insn, prog.pc) 1215 | assert(prog_fd, tostring(err)..': '..tostring(log)) 1216 | assert(sock:setsockopt('socket', 'attach_bpf', prog_fd:getfd())) 1217 | return prog_fd, err 1218 | end, 1219 | tracepoint = function(tp, prog, pid, cpu, group_fd) 1220 | assert(trace_check_enabled()) 1221 | -- Return tracepoint instance if no program specified 1222 | -- this allows free specialisation of arg0 to tracepoint type 1223 | local probe = tracepoint_open(tp, pid, cpu, group_fd) 1224 | -- Load the BPF program 1225 | if prog then 1226 | probe:bpf(prog) 1227 | end 1228 | return probe 1229 | end, 1230 | kprobe = function(tp, prog, retprobe, pid, cpu, group_fd) 1231 | assert(trace_check_enabled()) 1232 | -- Open tracepoint and attach 1233 | local pname, pdef = tp:match('([^:]+):(.+)') 1234 | return trace_bpf(tp, 'kprobe', pname, pdef, retprobe, prog, pid, cpu, group_fd) 1235 | end, 1236 | uprobe = function(tp, prog, retprobe, pid, cpu, group_fd) 1237 | assert(trace_check_enabled()) 1238 | -- Translate symbol to address 1239 | local obj, sym_want = tp:match('([^:]+):(.+)') 1240 | if not S.statfs(obj) then return nil, S.t.error(S.c.E.NOENT) end 1241 | -- Resolve Elf object (no support for anything else) 1242 | local elf = require('bpf.elf').open(obj) 1243 | local sym = elf:resolve(sym_want) 1244 | if not sym then return nil, 'no such symbol' end 1245 | sym = sym.st_value - elf:loadaddr() 1246 | local sym_addr = string.format('%x%04x', tonumber(bit.rshift(sym, 32)), 1247 | tonumber(ffi.cast('uint32_t', sym))) 1248 | -- Convert it to expected uprobe format 1249 | local pname = string.format('%s_%s', obj:gsub('.*/', ''), sym_addr) 1250 | local pdef = obj..':0x'..sym_addr 1251 | return trace_bpf(tp, 'uprobe', pname, pdef, retprobe, prog, pid, cpu, group_fd) 1252 | end, 1253 | tracelog = function(path) 1254 | assert(trace_check_enabled()) 1255 | path = path or '/sys/kernel/debug/tracing/trace_pipe' 1256 | return io.open(path, 'r') 1257 | end, 1258 | ntoh = builtins.ntoh, hton = builtins.hton, 1259 | }, { 1260 | __call = function (t, prog) return compile(prog) end, 1261 | }) -------------------------------------------------------------------------------- /bpf/builtins.lua: -------------------------------------------------------------------------------- 1 | local ffi = require('ffi') 2 | local bit = require('bit') 3 | local cdef = require('bpf.cdef') 4 | 5 | local BPF, HELPER = ffi.typeof('struct bpf'), ffi.typeof('struct bpf_func_id') 6 | local const_width = { 7 | [1] = BPF.B, [2] = BPF.H, [4] = BPF.W, [8] = BPF.DW, 8 | } 9 | local const_width_type = { 10 | [1] = ffi.typeof('uint8_t'), [2] = ffi.typeof('uint16_t'), [4] = ffi.typeof('uint32_t'), [8] = ffi.typeof('uint64_t'), 11 | } 12 | 13 | -- Built-ins that will be translated into BPF instructions 14 | -- i.e. 
--------------------------------------------------------------------------------
/bpf/builtins.lua:
--------------------------------------------------------------------------------
1 | local ffi = require('ffi')
2 | local bit = require('bit')
3 | local cdef = require('bpf.cdef')
4 | 
5 | local BPF, HELPER = ffi.typeof('struct bpf'), ffi.typeof('struct bpf_func_id')
6 | local const_width = {
7 | 	[1] = BPF.B, [2] = BPF.H, [4] = BPF.W, [8] = BPF.DW,
8 | }
9 | local const_width_type = {
10 | 	[1] = ffi.typeof('uint8_t'), [2] = ffi.typeof('uint16_t'), [4] = ffi.typeof('uint32_t'), [8] = ffi.typeof('uint64_t'),
11 | }
12 | 
13 | -- Built-ins that will be translated into BPF instructions
14 | -- e.g. bit.bor(0xf0, 0x0f) becomes {'alu64, or, k', reg(0xf0), reg(0x0f), 0, 0}
15 | local builtins = {
16 | 	[bit.lshift] = 'LSH',
17 | 	[bit.rshift] = 'RSH',
18 | 	[bit.band] = 'AND',
19 | 	[bit.bnot] = 'NEG',
20 | 	[bit.bor] = 'OR',
21 | 	[bit.bxor] = 'XOR',
22 | 	[bit.arshift] = 'ARSH',
23 | 	-- Extensions and intrinsics
24 | }
25 | 
26 | local function width_type(w)
27 | 	-- Note: ffi.typeof doesn't accept '?' as template
28 | 	return const_width_type[w] or ffi.typeof(string.format('uint8_t [%d]', w))
29 | end
30 | builtins.width_type = width_type
31 | 
32 | -- Byte-order conversions for little endian
33 | local function ntoh(x, w)
34 | 	if w then x = ffi.cast(const_width_type[w/8], x) end
35 | 	return bit.bswap(x)
36 | end
37 | local function hton(x, w) return ntoh(x, w) end
38 | builtins.ntoh = ntoh
39 | builtins.hton = hton
40 | builtins[ntoh] = function (e, dst, a, w)
41 | 	-- This looks like trickery, but TO_LE means cpu_to_le() and we want exactly
42 | 	-- the opposite here, as network byte order is always big-endian
43 | 	w = w or ffi.sizeof(e.V[a].type)*8
44 | 	if w == 8 then return end -- NOOP
45 | 	assert(w <= 64, 'NYI: ntoh(a[, width]) - operand larger than register width')
46 | 	-- Allocate registers and execute
47 | 	e.vcopy(dst, a)
48 | 	e.emit(BPF.ALU + BPF.END + BPF.TO_BE, e.vreg(dst), 0, 0, w)
49 | end
50 | builtins[hton] = function (e, dst, a, w)
51 | 	w = w or ffi.sizeof(e.V[a].type)*8
52 | 	if w == 8 then return end -- NOOP
53 | 	assert(w <= 64, 'NYI: hton(a[, width]) - operand larger than register width')
54 | 	-- Allocate registers and execute
55 | 	e.vcopy(dst, a)
56 | 	e.emit(BPF.ALU + BPF.END + BPF.TO_LE, e.vreg(dst), 0, 0, w)
57 | end
58 | -- Byte-order conversions for big endian are no-ops
59 | if ffi.abi('be') then
60 | 	ntoh = function (x, w)
61 | 		return w and ffi.cast(const_width_type[w/8], x) or x
62 | 	end
63 | 	hton = ntoh
64 | 	builtins[ntoh] = function(a, b, w) return end
65 | 	builtins[hton] = function(a, b, w) return end
66 | end
67 | -- Other built-ins
68 | local function xadd(a, b) error('NYI') end
69 | builtins.xadd = xadd
70 | builtins[xadd] = function (e, dst, a, b, off)
71 | 	assert(e.V[a].const.__dissector, 'xadd(a, b) called on non-pointer')
72 | 	local w = ffi.sizeof(e.V[a].const.__dissector)
73 | 	assert(w == 4 or w == 8, 'NYI: xadd() - 1 and 2 byte atomic increments are not supported')
74 | 	-- Allocate registers and execute
75 | 	e.vcopy(dst, a)
76 | 	local src_reg = e.vreg(b)
77 | 	local dst_reg = e.vreg(dst)
78 | 	e.emit(BPF.JMP + BPF.JEQ + BPF.K, dst_reg, 0, 1, 0) -- if (dst != NULL)
79 | 	e.emit(BPF.XADD + BPF.STX + const_width[w], dst_reg, src_reg, off or 0, 0)
80 | end
81 | 
82 | local function probe_read() error('NYI') end
83 | builtins.probe_read = probe_read
84 | builtins[probe_read] = function (e, ret, dst, src, vtype, ofs)
85 | 	e.reg_alloc(e.tmpvar, 1)
86 | 	-- Load the stack pointer into R1; since loads are only supported into stack
87 | 	-- memory, we have to use already allocated stack memory, or create a new
88 | 	-- allocation and convert it to a pointer type
89 | 	e.emit(BPF.ALU64 + BPF.MOV + BPF.X, 1, 10, 0, 0)
90 | 	if not e.V[dst].const or not e.V[dst].const.__base then
91 | 		builtins[ffi.new](e, dst, vtype) -- Allocate stack memory
92 | 	end
93 | 	e.emit(BPF.ALU64 + BPF.ADD + BPF.K, 1, 0, 0, -e.V[dst].const.__base)
94 | 	-- Set stack memory maximum size bound
95 | 	e.reg_alloc(e.tmpvar, 2)
96 | 	if not vtype then
97 | 		vtype = cdef.typename(e.V[dst].type)
98 | 		-- Dereference pointer type to pointed type for size calculation
99 | 		if vtype:sub(-1) == '*' then vtype = vtype:sub(0, -2) end
100 | 	end
101 | 	local w = 
ffi.sizeof(vtype) 102 | e.emit(BPF.ALU64 + BPF.MOV + BPF.K, 2, 0, 0, w) 103 | -- Set source pointer 104 | if e.V[src].reg then 105 | e.reg_alloc(e.tmpvar, 3) -- Copy from original register 106 | e.emit(BPF.ALU64 + BPF.MOV + BPF.X, 3, e.V[src].reg, 0, 0) 107 | else 108 | local src_reg = e.vreg(src, 3) 109 | e.reg_spill(src) -- Spill to avoid overwriting 110 | end 111 | if ofs and ofs > 0 then 112 | e.emit(BPF.ALU64 + BPF.ADD + BPF.K, 3, 0, 0, ofs) 113 | end 114 | -- Call probe read helper 115 | ret = ret or e.tmpvar 116 | e.vset(ret) 117 | e.vreg(ret, 0, true, ffi.typeof('int32_t')) 118 | e.emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.probe_read) 119 | e.V[e.tmpvar].reg = nil -- Free temporary registers 120 | end 121 | builtins[ffi.cast] = function (e, dst, ct, x) 122 | assert(e.V[ct].const, 'ffi.cast(ctype, x) called with bad ctype') 123 | e.vcopy(dst, x) 124 | if not e.V[x].const then 125 | e.V[dst].type = ffi.typeof(e.V[ct].const) 126 | else 127 | e.V[dst].const.__dissector = ffi.typeof(e.V[ct].const) 128 | end 129 | -- Specific types also encode source of the data 130 | -- This is because BPF has different helpers for reading 131 | -- different data sources, so variables must track origins. 132 | -- struct pt_regs - source of the data is probe 133 | -- struct skb - source of the data is socket buffer 134 | -- struct X - source of the data is probe/tracepoint 135 | if ffi.typeof(e.V[ct].const) == ffi.typeof('struct pt_regs') then 136 | e.V[dst].source = 'probe' 137 | end 138 | end 139 | builtins[ffi.new] = function (e, dst, ct, x) 140 | if type(ct) == 'number' then 141 | ct = ffi.typeof(e.V[ct].const) -- Get ctype from variable 142 | end 143 | assert(not x, 'NYI: ffi.new(ctype, ...) - initializer is not supported') 144 | assert(not cdef.isptr(ct, true), 'NYI: ffi.new(ctype, ...) 
- ctype MUST NOT be a pointer')
145 | 	e.vset(dst, nil, ct)
146 | 	e.V[dst].const = {__base = e.valloc(ffi.sizeof(ct), true), __dissector = ct}
147 | end
148 | builtins[ffi.copy] = function (e, ret, dst, src)
149 | 	assert(cdef.isptr(e.V[dst].type), 'ffi.copy(dst, src) - dst MUST be a pointer type')
150 | 	assert(cdef.isptr(e.V[src].type), 'ffi.copy(dst, src) - src MUST be a pointer type')
151 | 	-- Specific types also encode source of the data
152 | 	-- struct pt_regs - source of the data is probe
153 | 	-- struct skb - source of the data is socket buffer
154 | 	if e.V[src].source == 'probe' then
155 | 		e.reg_alloc(e.tmpvar, 1)
156 | 		-- Load the stack pointer into R1; since loads are only supported into stack
157 | 		-- memory, we have to use either a spilled variable or an allocated stack memory offset
158 | 		e.emit(BPF.ALU64 + BPF.MOV + BPF.X, 1, 10, 0, 0)
159 | 		if e.V[dst].spill then
160 | 			e.emit(BPF.ALU64 + BPF.ADD + BPF.K, 1, 0, 0, -e.V[dst].spill)
161 | 		elseif e.V[dst].const.__base then
162 | 			e.emit(BPF.ALU64 + BPF.ADD + BPF.K, 1, 0, 0, -e.V[dst].const.__base)
163 | 		else error('ffi.copy(dst, src) - can\'t get stack offset of dst') end
164 | 		-- Set stack memory maximum size bound
165 | 		local dst_tname = cdef.typename(e.V[dst].type)
166 | 		if dst_tname:sub(-1) == '*' then dst_tname = dst_tname:sub(0, -2) end
167 | 		e.reg_alloc(e.tmpvar, 2)
168 | 		e.emit(BPF.ALU64 + BPF.MOV + BPF.K, 2, 0, 0, ffi.sizeof(dst_tname))
169 | 		-- Set source pointer
170 | 		if e.V[src].reg then
171 | 			e.reg_alloc(e.tmpvar, 3) -- Copy from original register
172 | 			e.emit(BPF.ALU64 + BPF.MOV + BPF.X, 3, e.V[src].reg, 0, 0)
173 | 		else
174 | 			local src_reg = e.vreg(src, 3)
175 | 			e.reg_spill(src) -- Spill to avoid overwriting
176 | 		end
177 | 		-- Call probe read helper
178 | 		e.vset(ret)
179 | 		e.vreg(ret, 0, true, ffi.typeof('int32_t'))
180 | 		e.emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.probe_read)
181 | 		e.V[e.tmpvar].reg = nil -- Free temporary registers
182 | 	elseif e.V[src].const and e.V[src].const.__map then
183 | 		error('NYI: ffi.copy(dst, src) - src is backed by BPF map')
184 | 	elseif e.V[src].const and e.V[src].const.__dissector then
185 | 		error('NYI: ffi.copy(dst, src) - src is backed by socket buffer')
186 | 	else
187 | 		-- TODO: identify cheap register move
188 | 		-- TODO: identify copy to/from stack
189 | 		error('NYI: ffi.copy(dst, src) - src is not backed by a BPF map, socket buffer, or probe')
190 | 	end
191 | end
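-- A minimal sketch of how `ffi.copy` is used from probe code (not part of the
-- original file; the register `parm1` and the buffer size are assumptions for
-- the example):
--
--   -- local req = ffi.cast('struct pt_regs', ptregs) -- marks source = 'probe'
--   -- local line = ffi.new('char [40]') -- stack allocation
--   -- ffi.copy(line, ffi.cast('char *', req.parm1)) -- compiles to probe_read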
192 | -- print(format, ...) builtin changes semantics from Lua print(...):
193 | -- the first parameter has to be a format string, and only a reduced set of
194 | -- conversion specifiers is allowed: %d %u %x %ld %lu %lx %lld %llu %llx %p %s
195 | builtins[print] = function (e, ret, fmt, a1, a2, a3)
196 | 	-- Load format string and length
197 | 	e.reg_alloc(e.V[e.tmpvar], 1)
198 | 	e.reg_alloc(e.V[e.tmpvar+1], 1)
199 | 	if type(e.V[fmt].const) == 'string' then
200 | 		local src = e.V[fmt].const
201 | 		local len = #src + 1
202 | 		local dst = e.valloc(len, src)
203 | 		-- TODO: this is materialize step
204 | 		e.V[fmt].const = {__base=dst}
205 | 		e.V[fmt].type = ffi.typeof('char ['..len..']')
206 | 	elseif e.V[fmt].const.__base then -- NOP
207 | 	else error('NYI: print(fmt, ...) - format variable is not literal/stack memory') end
208 | 	-- Prepare helper call
209 | 	e.emit(BPF.ALU64 + BPF.MOV + BPF.X, 1, 10, 0, 0)
210 | 	e.emit(BPF.ALU64 + BPF.ADD + BPF.K, 1, 0, 0, -e.V[fmt].const.__base)
211 | 	e.emit(BPF.ALU64 + BPF.MOV + BPF.K, 2, 0, 0, ffi.sizeof(e.V[fmt].type))
212 | 	if a1 then
213 | 		local args = {a1, a2, a3}
214 | 		assert(#args <= 3, 'print(fmt, ...) - maximum of 3 arguments supported')
215 | 		for i, arg in ipairs(args) do
216 | 			e.vcopy(e.tmpvar, arg) -- Copy variable
217 | 			e.vreg(e.tmpvar, 3+i-1) -- Materialize it in arg register
218 | 		end
219 | 	end
220 | 	-- Call helper
221 | 	e.vset(ret)
222 | 	e.vreg(ret, 0, true, ffi.typeof('int32_t')) -- Return is integer
223 | 	e.emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.trace_printk)
224 | 	e.V[e.tmpvar].reg = nil -- Free temporary registers
225 | end
226 | 
227 | -- Implements bpf_perf_event_output(ctx, map, flags, var, vlen) on perf event map
228 | local function perf_submit(e, dst, map_var, src)
229 | 	-- Set R2 = map fd (indirect load)
230 | 	local map = e.V[map_var].const
231 | 	e.vcopy(e.tmpvar, map_var)
232 | 	e.vreg(e.tmpvar, 2, true, ffi.typeof('uint64_t'))
233 | 	e.LD_IMM_X(2, BPF.PSEUDO_MAP_FD, map.fd, ffi.sizeof('uint64_t'))
234 | 	-- Set R1 = ctx
235 | 	e.reg_alloc(e.tmpvar, 1) -- Spill anything in R1 (unnamed tmp variable)
236 | 	e.emit(BPF.ALU64 + BPF.MOV + BPF.X, 1, 6, 0, 0) -- CTX is always in R6, copy
237 | 	-- Set R3 = flags
238 | 	e.vset(e.tmpvar, nil, 0) -- BPF_F_CURRENT_CPU
239 | 	e.vreg(e.tmpvar, 3, false, ffi.typeof('uint64_t'))
240 | 	-- Set R4 = pointer to src on stack
241 | 	assert(e.V[src].const.__base, 'NYI: submit(map, var) - variable is not on stack')
242 | 	e.emit(BPF.ALU64 + BPF.MOV + BPF.X, 4, 10, 0, 0)
243 | 	e.emit(BPF.ALU64 + BPF.ADD + BPF.K, 4, 0, 0, -e.V[src].const.__base)
244 | 	-- Set R5 = src length
245 | 	e.emit(BPF.ALU64 + BPF.MOV + BPF.K, 5, 0, 0, ffi.sizeof(e.V[src].type))
246 | 	-- Set R0 = ret and call
247 | 	e.vset(dst)
248 | 	e.vreg(dst, 0, true, ffi.typeof('int32_t')) -- Return is integer
249 | 	e.emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.perf_event_output)
250 | 	e.V[e.tmpvar].reg = nil -- Free temporary registers
251 | end
252 | 
253 | -- Implements bpf_get_stackid()
254 | local function stack_id(e, ret, map_var, key)
255 | 	-- Set R2 = map fd (indirect load)
256 | 	local map = e.V[map_var].const
257 | 	e.vcopy(e.tmpvar, map_var)
258 | 	e.vreg(e.tmpvar, 2, true, ffi.typeof('uint64_t'))
259 | 	e.LD_IMM_X(2, BPF.PSEUDO_MAP_FD, map.fd, ffi.sizeof('uint64_t'))
260 | 	-- Set R1 = ctx
261 | 	e.reg_alloc(e.tmpvar, 1) -- Spill anything in R1 (unnamed tmp variable)
262 | 	e.emit(BPF.ALU64 + BPF.MOV + BPF.X, 1, 6, 0, 0) -- CTX is always in R6, copy
263 | 	-- Load flags in R3 (immediate value or key)
264 | 	local imm = e.V[key].const
265 | 	assert(tonumber(imm), 'NYI: stack_id(map, var) - var must be a constant number')
266 | 	e.reg_alloc(e.tmpvar, 3) -- Spill anything in R3 (unnamed tmp variable)
267 | 	e.LD_IMM_X(3, 0, imm, 8)
268 | 	-- Return R0 as signed integer
269 | 	e.vset(ret)
270 | 	e.vreg(ret, 0, true, ffi.typeof('int32_t'))
271 | 	e.emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.get_stackid)
272 | 	e.V[e.tmpvar].reg = nil -- Free temporary registers
273 | end
274 | 
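-- A minimal sketch of how these helpers are called from probe code (not part
-- of the original file; the map size and flag follow
-- examples/tracepoint-offcputime.lua later in this tree):
--
--   -- local stack_traces = assert(bpf.map('stack_trace', 16))
--   -- key.stack_id = stack_id(stack_traces, BPF.F_FAST_STACK_CMP)
--
275 | -- table.insert(table, value) keeps semantics with the exception of BPF maps
276 | -- map `perf_event` -> submit inserted value
277 | builtins[table.insert] = function (e, dst, map_var, value)
278 | 	assert(e.V[map_var].const.__map, 'NYI: table.insert() supported only on BPF maps')
279 | 	return perf_submit(e, dst, map_var, value)
280 | end
281 | 
282 | -- bpf_get_current_comm(buffer) - write current process name to byte buffer
283 | local function comm() error('NYI') end
284 | builtins[comm] = function (e, ret, dst)
285 | 	-- Set R1 = buffer
286 | 	assert(e.V[dst].const.__base, 'NYI: comm(buffer) - buffer variable is not on stack')
287 | 	e.reg_alloc(e.tmpvar, 1) -- Spill
288 | 	e.emit(BPF.ALU64 + BPF.MOV + BPF.X, 1, 10, 0, 0)
289 | 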
	e.emit(BPF.ALU64 + BPF.ADD + BPF.K, 1, 0, 0, -e.V[dst].const.__base)
290 | 	-- Set R2 = length
291 | 	e.reg_alloc(e.tmpvar, 2) -- Spill
292 | 	e.emit(BPF.ALU64 + BPF.MOV + BPF.K, 2, 0, 0, ffi.sizeof(e.V[dst].type))
293 | 	-- Return is integer
294 | 	e.vset(ret)
295 | 	e.vreg(ret, 0, true, ffi.typeof('int32_t'))
296 | 	e.emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.get_current_comm)
297 | 	e.V[e.tmpvar].reg = nil -- Free temporary registers
298 | end
299 | 
300 | -- Math library built-ins
301 | math.log2 = function (x) error('NYI') end
302 | builtins[math.log2] = function (e, dst, x)
303 | 	-- Classic integer bit subdivision algorithm to find the position
304 | 	-- of the highest bit set, adapted for BPF bytecode-friendly operations.
305 | 	-- https://graphics.stanford.edu/~seander/bithacks.html
306 | 	-- r = 0
307 | 	local r = e.vreg(dst, nil, true)
308 | 	e.emit(BPF.ALU64 + BPF.MOV + BPF.K, r, 0, 0, 0)
309 | 	-- v = x
310 | 	e.vcopy(e.tmpvar, x)
311 | 	local v = e.vreg(e.tmpvar, 2)
312 | 	if cdef.isptr(e.V[x].const) then -- No pointer arithmetics, dereference
313 | 		e.vderef(v, v, ffi.typeof('uint64_t'))
314 | 	end
315 | 	-- Invert value to invert all tests, otherwise we would need and+jnz
316 | 	e.emit(BPF.ALU64 + BPF.NEG + BPF.K, v, 0, 0, 0) -- v = ~v
317 | 	-- Unrolled test cases, converted masking to arithmetic as we don't have "if !(a & b)"
318 | 	-- As we're testing inverted value, we have to use arithmetic shift to copy MSB
319 | 	for i=4,0,-1 do
320 | 		local k = bit.lshift(1, i)
321 | 		e.emit(BPF.JMP + BPF.JGT + BPF.K, v, 0, 2, bit.bnot(bit.lshift(1, k))) -- if !upper_half(x)
322 | 		e.emit(BPF.ALU64 + BPF.ARSH + BPF.K, v, 0, 0, k) -- v >>= k
323 | 		e.emit(BPF.ALU64 + BPF.OR + BPF.K, r, 0, 0, k) -- r |= k
324 | 	end
325 | 	-- No longer constant, cleanup tmpvars
326 | 	e.V[dst].const = nil
327 | 	e.V[e.tmpvar].reg = nil
328 | end
329 | builtins[math.log10] = function (e, dst, x)
330 | 	-- Compute log2(x) and transform
331 | 	builtins[math.log2](e, dst, x)
332 | 	-- Relationship: log10(v) = log2(v) / log2(10)
333 | 	local r = e.V[dst].reg
334 | 	e.emit(BPF.ALU64 + BPF.ADD + BPF.K, r, 0, 0, 1) -- Compensate round-down
335 | 	e.emit(BPF.ALU64 + BPF.MUL + BPF.K, r, 0, 0, 1233) -- 1/log2(10) = log10(2) ~ 1233>>12
336 | 	e.emit(BPF.ALU64 + BPF.RSH + BPF.K, r, 0, 0, 12)
337 | end
338 | builtins[math.log] = function (e, dst, x)
339 | 	-- Compute log2(x) and transform
340 | 	builtins[math.log2](e, dst, x)
341 | 	-- Relationship: ln(v) = log2(v) / log2(e)
342 | 	local r = e.V[dst].reg
343 | 	e.emit(BPF.ALU64 + BPF.ADD + BPF.K, r, 0, 0, 1) -- Compensate round-down
344 | 	e.emit(BPF.ALU64 + BPF.MUL + BPF.K, r, 0, 0, 2839) -- 1/log2(e) = ln(2) ~ 2839>>12
345 | 	e.emit(BPF.ALU64 + BPF.RSH + BPF.K, r, 0, 0, 12)
346 | end
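-- A minimal sketch of the intended use of the log builtins: cheap histogram
-- binning in probe code (not part of the original file; see
-- examples/kprobe-latency.lua later in this tree):
--
--   -- local delta = time() - map[req.parm1]
--   -- local index = 3 * math.log2(delta) -- ~ log10(delta) * 10
--
347 | 
348 | -- Call-type helpers
349 | local function call_helper(e, dst, h)
350 | 	e.vset(dst)
351 | 	local dst_reg = e.vreg(dst, 0, true)
352 | 	e.emit(BPF.JMP + BPF.CALL, 0, 0, 0, h)
353 | 	e.V[dst].const = nil -- Target is not a function anymore
354 | end
355 | local function cpu() error('NYI') end
356 | local function rand() error('NYI') end
357 | local function time() error('NYI') end
358 | local function pid_tgid() error('NYI') end
359 | local function uid_gid() error('NYI') end
360 | 
361 | -- Export helpers and builtin variants
362 | builtins.cpu, builtins.rand = cpu, rand
363 | builtins.time = time
364 | builtins.pid_tgid = pid_tgid
365 | builtins.uid_gid = uid_gid
366 | builtins.comm = comm
367 | builtins.perf_submit = perf_submit
368 | builtins.stack_id = stack_id
369 | builtins[cpu] = function (e, dst) return call_helper(e, dst, HELPER.get_smp_processor_id) end
370 | 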
builtins[rand] = function (e, dst) return call_helper(e, dst, HELPER.get_prandom_u32) end 371 | builtins[time] = function (e, dst) return call_helper(e, dst, HELPER.ktime_get_ns) end 372 | builtins[pid_tgid] = function (e, dst) return call_helper(e, dst, HELPER.get_current_pid_tgid) end 373 | builtins[uid_gid] = function (e, dst) return call_helper(e, dst, HELPER.get_current_uid_gid) end 374 | builtins[perf_submit] = function (e, dst, map, value) return perf_submit(e, dst, map, value) end 375 | builtins[stack_id] = function (e, dst, map, key) return stack_id(e, dst, map, key) end 376 | 377 | return builtins 378 | -------------------------------------------------------------------------------- /bpf/cdef.lua: -------------------------------------------------------------------------------- 1 | local ffi = require('ffi') 2 | local S = require('syscall') 3 | local M = {} 4 | 5 | ffi.cdef [[ 6 | struct bpf { 7 | /* Instruction classes */ 8 | static const int LD = 0x00; 9 | static const int LDX = 0x01; 10 | static const int ST = 0x02; 11 | static const int STX = 0x03; 12 | static const int ALU = 0x04; 13 | static const int JMP = 0x05; 14 | static const int ALU64 = 0x07; 15 | /* ld/ldx fields */ 16 | static const int W = 0x00; 17 | static const int H = 0x08; 18 | static const int B = 0x10; 19 | static const int ABS = 0x20; 20 | static const int IND = 0x40; 21 | static const int MEM = 0x60; 22 | static const int LEN = 0x80; 23 | static const int MSH = 0xa0; 24 | /* alu/jmp fields */ 25 | static const int ADD = 0x00; 26 | static const int SUB = 0x10; 27 | static const int MUL = 0x20; 28 | static const int DIV = 0x30; 29 | static const int OR = 0x40; 30 | static const int AND = 0x50; 31 | static const int LSH = 0x60; 32 | static const int RSH = 0x70; 33 | static const int NEG = 0x80; 34 | static const int MOD = 0x90; 35 | static const int XOR = 0xa0; 36 | static const int JA = 0x00; 37 | static const int JEQ = 0x10; 38 | static const int JGT = 0x20; 39 | static const int JGE = 0x30; 40 | static const int JSET = 0x40; 41 | static const int K = 0x00; 42 | static const int X = 0x08; 43 | static const int JNE = 0x50; /* jump != */ 44 | static const int JSGT = 0x60; /* SGT is signed '>', GT in x86 */ 45 | static const int JSGE = 0x70; /* SGE is signed '>=', GE in x86 */ 46 | static const int CALL = 0x80; /* function call */ 47 | static const int EXIT = 0x90; /* function return */ 48 | /* ld/ldx fields */ 49 | static const int DW = 0x18; /* double word */ 50 | static const int XADD = 0xc0; /* exclusive add */ 51 | /* alu/jmp fields */ 52 | static const int MOV = 0xb0; /* mov reg to reg */ 53 | static const int ARSH = 0xc0; /* sign extending arithmetic shift right */ 54 | /* change endianness of a register */ 55 | static const int END = 0xd0; /* flags for endianness conversion: */ 56 | static const int TO_LE = 0x00; /* convert to little-endian */ 57 | static const int TO_BE = 0x08; /* convert to big-endian */ 58 | /* misc */ 59 | static const int PSEUDO_MAP_FD = 0x01; 60 | /* helper functions */ 61 | static const int F_CURRENT_CPU = 0xffffffff; 62 | static const int F_USER_STACK = 1 << 8; 63 | static const int F_FAST_STACK_CMP = 1 << 9; 64 | static const int F_REUSE_STACKID = 1 << 10; 65 | }; 66 | /* eBPF commands */ 67 | struct bpf_cmd { 68 | static const int MAP_CREATE = 0; 69 | static const int MAP_LOOKUP_ELEM = 1; 70 | static const int MAP_UPDATE_ELEM = 2; 71 | static const int MAP_DELETE_ELEM = 3; 72 | static const int MAP_GET_NEXT_KEY = 4; 73 | static const int PROG_LOAD = 5; 74 | static const int 
OBJ_PIN = 6; 75 | static const int OBJ_GET = 7; 76 | }; 77 | /* eBPF helpers */ 78 | struct bpf_func_id { 79 | static const int unspec = 0; 80 | static const int map_lookup_elem = 1; 81 | static const int map_update_elem = 2; 82 | static const int map_delete_elem = 3; 83 | static const int probe_read = 4; 84 | static const int ktime_get_ns = 5; 85 | static const int trace_printk = 6; 86 | static const int get_prandom_u32 = 7; 87 | static const int get_smp_processor_id = 8; 88 | static const int skb_store_bytes = 9; 89 | static const int l3_csum_replace = 10; 90 | static const int l4_csum_replace = 11; 91 | static const int tail_call = 12; 92 | static const int clone_redirect = 13; 93 | static const int get_current_pid_tgid = 14; 94 | static const int get_current_uid_gid = 15; 95 | static const int get_current_comm = 16; 96 | static const int get_cgroup_classid = 17; 97 | static const int skb_vlan_push = 18; 98 | static const int skb_vlan_pop = 19; 99 | static const int skb_get_tunnel_key = 20; 100 | static const int skb_set_tunnel_key = 21; 101 | static const int perf_event_read = 22; 102 | static const int redirect = 23; 103 | static const int get_route_realm = 24; 104 | static const int perf_event_output = 25; 105 | static const int skb_load_bytes = 26; 106 | static const int get_stackid = 27; 107 | }; 108 | /* BPF_MAP_STACK_TRACE structures and constants */ 109 | static const int BPF_MAX_STACK_DEPTH = 127; 110 | struct bpf_stacktrace { 111 | uint64_t ip[BPF_MAX_STACK_DEPTH]; 112 | }; 113 | ]] 114 | 115 | -- Compatibility: ljsyscall doesn't have support for BPF syscall 116 | if not S.bpf then 117 | function S.bpf () error("ljsyscall doesn't support bpf(), must be updated") end 118 | end 119 | 120 | -- Compatibility: ljsyscall<=0.12 121 | if not S.c.BPF_MAP.PERCPU_HASH then 122 | S.c.BPF_MAP.PERCPU_HASH = 5 123 | S.c.BPF_MAP.PERCPU_ARRAY = 6 124 | S.c.BPF_MAP.STACK_TRACE = 7 125 | S.c.BPF_MAP.CGROUP_ARRAY = 8 126 | end 127 | if not S.c.BPF_PROG.TRACEPOINT then 128 | S.c.BPF_PROG.TRACEPOINT = 5 129 | end 130 | 131 | -- Compatibility: metatype for stacktrace 132 | function stacktrace_iter(t, i) 133 | i = i + 1 134 | if i < #t and t.ip[i] > 0 then 135 | return i, t.ip[i] 136 | end 137 | end 138 | ffi.metatype('struct bpf_stacktrace', { 139 | __len = function (t) return ffi.sizeof(t.ip) / ffi.sizeof(t.ip[0]) end, 140 | __ipairs = function (t) return stacktrace_iter, t, -1 end, 141 | }) 142 | 143 | -- Reflect cdata type 144 | function M.typename(v) 145 | if not v or type(v) ~= 'cdata' then return nil end 146 | return string.match(tostring(ffi.typeof(v)), '<([^>]+)') 147 | end 148 | 149 | -- Reflect if cdata type can be pointer (accepts array or pointer) 150 | function M.isptr(v, noarray) 151 | local ctname = M.typename(v) 152 | if ctname then 153 | ctname = string.sub(ctname, -1) 154 | ctname = ctname == '*' or (not noarray and ctname == ']') 155 | end 156 | return ctname 157 | end 158 | 159 | function M.osversion() 160 | -- We have no better way to extract current kernel hex-string other 161 | -- than parsing headers, compiling a helper function or reading /proc 162 | local ver_str, count = S.sysctl('kernel.version'):match('%d+.%d+.%d+'), 2 163 | if not ver_str then -- kernel.version is freeform, fallback to kernel.osrelease 164 | ver_str = S.sysctl('kernel.osrelease'):match('%d+.%d+.%d+') 165 | end 166 | local version = 0 167 | for i in ver_str:gmatch('%d+') do -- Convert 'X.Y.Z' to 0xXXYYZZ 168 | version = bit.bor(version, bit.lshift(tonumber(i), 8*count)) 169 | count = count - 1 170 | end 
171 | return version 172 | end 173 | 174 | function M.event_reader(reader, event_type) 175 | -- Caller can specify event message binary format 176 | if event_type then 177 | assert(type(event_type) == 'string' and ffi.typeof(event_type), 'not a valid type for event reader') 178 | event_type = ffi.typeof(event_type .. '*') -- Convert type to pointer-to-type 179 | end 180 | -- Wrap reader in interface that can interpret read event messages 181 | return setmetatable({reader=reader,type=event_type}, {__index = { 182 | block = function(self) 183 | return S.select { readfds = {reader.fd} } 184 | end, 185 | next = function(self, k) 186 | local len, ev = reader:next(k) 187 | -- Filter out only sample frames 188 | while ev and ev.type ~= S.c.PERF_RECORD.SAMPLE do 189 | len, ev = reader:next(len) 190 | end 191 | if ev and event_type then 192 | -- The perf event reader returns framed data with header and variable length 193 | -- This is going skip the frame header and cast data to given type 194 | ev = ffi.cast(event_type, ffi.cast('char *', ev) + ffi.sizeof('struct perf_event_header') + ffi.sizeof('uint32_t')) 195 | end 196 | return len, ev 197 | end, 198 | read = function(self) 199 | return self.next, self, nil 200 | end, 201 | }}) 202 | end 203 | 204 | function M.tracepoint_type(tp) 205 | -- Read tracepoint format string 206 | local fp = assert(io.open('/sys/kernel/debug/tracing/events/'..tp..'/format', 'r')) 207 | local fmt = fp:read '*a' 208 | fp:close() 209 | -- Parse struct fields 210 | local fields = {} 211 | for f in fmt:gmatch 'field:([^;]+;)' do 212 | table.insert(fields, f) 213 | end 214 | return string.format('struct { %s }', table.concat(fields)) 215 | end 216 | 217 | return M -------------------------------------------------------------------------------- /bpf/elf.lua: -------------------------------------------------------------------------------- 1 | -- This is a tiny wrapper over libelf to extract load address 2 | -- and offsets of dynamic symbols 3 | 4 | local S = require('syscall') 5 | local ffi = require('ffi') 6 | ffi.cdef [[ 7 | /* Type for a 16-bit quantity. */ 8 | typedef uint16_t Elf32_Half; 9 | typedef uint16_t Elf64_Half; 10 | 11 | /* Types for signed and unsigned 32-bit quantities. */ 12 | typedef uint32_t Elf32_Word; 13 | typedef int32_t Elf32_Sword; 14 | typedef uint32_t Elf64_Word; 15 | typedef int32_t Elf64_Sword; 16 | 17 | /* Types for signed and unsigned 64-bit quantities. */ 18 | typedef uint64_t Elf32_Xword; 19 | typedef int64_t Elf32_Sxword; 20 | typedef uint64_t Elf64_Xword; 21 | typedef int64_t Elf64_Sxword; 22 | 23 | /* Type of addresses. */ 24 | typedef uint32_t Elf32_Addr; 25 | typedef uint64_t Elf64_Addr; 26 | 27 | /* Type of file offsets. */ 28 | typedef uint32_t Elf32_Off; 29 | typedef uint64_t Elf64_Off; 30 | 31 | /* Type for section indices, which are 16-bit quantities. */ 32 | typedef uint16_t Elf32_Section; 33 | typedef uint16_t Elf64_Section; 34 | 35 | /* Constants */ 36 | struct Elf_Cmd 37 | { 38 | static const int READ = 1; 39 | static const int RDWR = 2; 40 | static const int WRITE = 3; 41 | static const int CLR = 4; 42 | static const int SET = 5; 43 | static const int FDDONE = 6; 44 | static const int FDREAD = 7; 45 | static const int READ_MMAP = 8; 46 | static const int RDWR_MMAP = 9; 47 | static const int WRITE_MMAP =10; 48 | static const int READ_MMAP_PRIVATE =11; 49 | static const int EMPTY =12; 50 | static const int NUM =13; 51 | }; 52 | 53 | /* Descriptor for the ELF file. 
*/ 54 | typedef struct Elf Elf; 55 | /* Descriptor for ELF file section. */ 56 | typedef struct Elf_Scn Elf_Scn; 57 | /* Container type for metatable */ 58 | struct Elf_object { int fd; Elf *elf; }; 59 | /* Program segment header. */ 60 | typedef struct 61 | { 62 | Elf64_Word p_type; /* Segment type */ 63 | Elf64_Word p_flags; /* Segment flags */ 64 | Elf64_Off p_offset; /* Segment file offset */ 65 | Elf64_Addr p_vaddr; /* Segment virtual address */ 66 | Elf64_Addr p_paddr; /* Segment physical address */ 67 | Elf64_Xword p_filesz; /* Segment size in file */ 68 | Elf64_Xword p_memsz; /* Segment size in memory */ 69 | Elf64_Xword p_align; /* Segment alignment */ 70 | } Elf64_Phdr; 71 | typedef Elf64_Phdr GElf_Phdr; 72 | /* Section header. */ 73 | typedef struct 74 | { 75 | Elf64_Word sh_name; /* Section name (string tbl index) */ 76 | Elf64_Word sh_type; /* Section type */ 77 | Elf64_Xword sh_flags; /* Section flags */ 78 | Elf64_Addr sh_addr; /* Section virtual addr at execution */ 79 | Elf64_Off sh_offset; /* Section file offset */ 80 | Elf64_Xword sh_size; /* Section size in bytes */ 81 | Elf64_Word sh_link; /* Link to another section */ 82 | Elf64_Word sh_info; /* Additional section information */ 83 | Elf64_Xword sh_addralign; /* Section alignment */ 84 | Elf64_Xword sh_entsize; /* Entry size if section holds table */ 85 | } Elf64_Shdr; 86 | typedef Elf64_Shdr GElf_Shdr; 87 | /* Descriptor for data to be converted to or from memory format. */ 88 | typedef struct 89 | { 90 | void *d_buf; /* Pointer to the actual data. */ 91 | int d_type; /* Type of this piece of data. */ 92 | unsigned int d_version; /* ELF version. */ 93 | size_t d_size; /* Size in bytes. */ 94 | uint64_t d_off; /* Offset into section. */ 95 | size_t d_align; /* Alignment in section. */ 96 | } Elf_Data; 97 | /* Symbol table entry. */ 98 | typedef struct 99 | { 100 | Elf64_Word st_name; /* Symbol name (string tbl index) */ 101 | unsigned char st_info; /* Symbol type and binding */ 102 | unsigned char st_other; /* Symbol visibility */ 103 | Elf64_Section st_shndx; /* Section index */ 104 | Elf64_Addr st_value; /* Symbol value */ 105 | Elf64_Xword st_size; /* Symbol size */ 106 | } Elf64_Sym; 107 | typedef Elf64_Sym GElf_Sym; 108 | 109 | /* Coordinate ELF library and application versions. */ 110 | unsigned int elf_version (unsigned int __version); 111 | /* Return descriptor for ELF file to work according to CMD. */ 112 | Elf *elf_begin (int __fildes, int __cmd, Elf *__ref); 113 | /* Free resources allocated for ELF. */ 114 | int elf_end (Elf *__elf); 115 | /* Get the number of program headers in the ELF file. If the file uses 116 | more headers than can be represented in the e_phnum field of the ELF 117 | header the information from the sh_info field in the zeroth section 118 | header is used. */ 119 | int elf_getphdrnum (Elf *__elf, size_t *__dst); 120 | /* Retrieve program header table entry. */ 121 | GElf_Phdr *gelf_getphdr (Elf *__elf, int __ndx, GElf_Phdr *__dst); 122 | /* Retrieve section header. */ 123 | GElf_Shdr *gelf_getshdr (Elf_Scn *__scn, GElf_Shdr *__dst); 124 | /* Retrieve symbol information from the symbol table at the given index. */ 125 | GElf_Sym *gelf_getsym (Elf_Data *__data, int __ndx, GElf_Sym *__dst); 126 | /* Get section with next section index. */ 127 | Elf_Scn *elf_nextscn (Elf *__elf, Elf_Scn *__scn); 128 | /* Get data from section while translating from file representation 129 | to memory representation. 
*/ 130 | Elf_Data *elf_getdata (Elf_Scn *__scn, Elf_Data *__data); 131 | /* Return pointer to string at OFFSET in section INDEX. */ 132 | char *elf_strptr (Elf *__elf, size_t __index, size_t __offset); 133 | ]] 134 | 135 | local elf = ffi.load('elf') 136 | local EV = { NONE=0, CURRENT=1, NUM=2 } 137 | local PT = { NULL=0, LOAD=1, DYNAMIC=2, INTERP=3, NOTE=4, SHLIB=5, PHDR=6, TLS=7, NUM=8 } 138 | local SHT = { NULL=0, PROGBITS=1, SYMTAB=2, STRTAB=3, RELA=4, HASH=5, DYNAMIC=6, NOTE=7, 139 | NOBITS=8, REL=9, SHLIB=10, DYNSYM=11, INIT_ARRAY=14, FINI_ARRAY=15, PREINIT_ARRAY=16, 140 | GROUP=17, SYMTAB_SHNDX=18, NUM=19 } 141 | local ELF_C = ffi.new('struct Elf_Cmd') 142 | local M = {} 143 | 144 | -- Optional poor man's C++ demangler 145 | local cpp_demangler = os.getenv('CPP_DEMANGLER') 146 | if not cpp_demangler then 147 | for prefix in string.gmatch(os.getenv('PATH'), '[^;:]+') do 148 | if S.statfs(prefix..'/c++filt') then 149 | cpp_demangler = prefix..'/c++filt' 150 | break 151 | end 152 | end 153 | end 154 | local cpp_demangle = function (name) return name end 155 | if cpp_demangler then 156 | cpp_demangle = function (name) 157 | local cmd = string.format('%s -p %s', cpp_demangler, name) 158 | local fp = assert(io.popen(cmd, 'r')) 159 | local output = fp:read('*all') 160 | fp:close() 161 | return output:match '^(.-)%s*$' 162 | end 163 | end 164 | 165 | -- Metatable for ELF object 166 | ffi.metatype('struct Elf_object', { 167 | __gc = function (t) t:close() end, 168 | __index = { 169 | close = function (t) 170 | if t.elf ~= nil then 171 | elf.elf_end(t.elf) 172 | S.close(t.fd) 173 | t.elf = nil 174 | end 175 | end, 176 | -- Load library load address 177 | loadaddr = function(t) 178 | local phnum = ffi.new('size_t [1]') 179 | if elf.elf_getphdrnum(t.elf, phnum) == nil then 180 | return nil, 'cannot get phdrnum' 181 | end 182 | local header = ffi.new('GElf_Phdr [1]') 183 | for i = 0, tonumber(phnum[0])-1 do 184 | if elf.gelf_getphdr(t.elf, i, header) ~= nil 185 | and header[0].p_type == PT.LOAD then 186 | return header[0].p_vaddr 187 | end 188 | end 189 | end, 190 | -- Resolve symbol address 191 | resolve = function (t, k, pattern) 192 | local section = elf.elf_nextscn(t.elf, nil) 193 | while section ~= nil do 194 | local header = ffi.new('GElf_Shdr [1]') 195 | if elf.gelf_getshdr(section, header) ~= nil then 196 | if header[0].sh_type == SHT.SYMTAB or header[0].sh_type == SHT.DYNSYM then 197 | local data = elf.elf_getdata(section, nil) 198 | while data ~= nil do 199 | if data.d_size % header[0].sh_entsize > 0 then 200 | return nil, 'bad section header entity size' 201 | end 202 | local symcount = tonumber(data.d_size / header[0].sh_entsize) 203 | local sym = ffi.new('GElf_Sym [1]') 204 | for i = 0, symcount - 1 do 205 | if elf.gelf_getsym(data, i, sym) ~= nil then 206 | local name = elf.elf_strptr(t.elf, header[0].sh_link, sym[0].st_name) 207 | if name ~= nil then 208 | -- Demangle C++ symbols if necessary 209 | name = ffi.string(name) 210 | if name:sub(1,2) == '_Z' then 211 | name = cpp_demangle(name) 212 | end 213 | -- Match symbol name against pattern 214 | if pattern and string.match(name, k) or k == name then 215 | return sym[0] 216 | end 217 | end 218 | end 219 | end 220 | data = elf.elf_getdata(section, data) 221 | end 222 | end 223 | end 224 | section = elf.elf_nextscn(t.elf, section) 225 | end 226 | end, 227 | } 228 | }) 229 | 230 | -- Open an ELF object 231 | function M.open(path) 232 | if elf.elf_version(EV.CURRENT) == EV.NONE then 233 | return nil, 'bad version' 234 | end 235 | 
	local fd, err = S.open(path, 'rdonly')
236 | 	if not fd then return nil, err end
237 | 	local pt = ffi.new('Elf *')
238 | 	pt = elf.elf_begin(fd:getfd(), ELF_C.READ, pt)
239 | 	if pt == nil then -- NULL cdata pointers are truthy in Lua, compare explicitly
240 | 		fd:close()
241 | 		return nil, 'cannot open elf object'
242 | 	end
243 | 	return ffi.new('struct Elf_object', fd:nogc():getfd(), pt)
244 | end
245 | 
246 | return M
--------------------------------------------------------------------------------
/bpf/ljbytecode.lua:
--------------------------------------------------------------------------------
1 | local jutil = require("jit.util")
2 | local vmdef = require("jit.vmdef")
3 | local bit = require('bit')
4 | local shr, band = bit.rshift, bit.band
5 | 
6 | -- Decode LuaJIT 2.0 bytecode format
7 | -- Reference: http://wiki.luajit.org/Bytecode-2.0
8 | -- Thanks to LJ, we get code in portable bytecode with constants folded and
9 | -- basic virtual registers allocated, etc.
10 | -- No SSA IR, type inference or advanced optimizations, because the code wasn't traced yet.
11 | local function decode_ins(func, pc)
12 | 	local ins, m = jutil.funcbc(func, pc)
13 | 	if not ins then return nil end
14 | 	local op, ma, mb, mc = band(ins, 0xff), band(m, 7), band(m, 15*8), band(m, 15*128)
15 | 	local a, b, c, d = band(shr(ins, 8), 0xff), nil, nil, shr(ins, 16)
16 | 	if mb ~= 0 then
17 | 		d = band(d, 0xff)
18 | 		b = shr(ins, 24)
19 | 	end
20 | 	if ma == 5 then -- BCMuv
21 | 		a = jutil.funcuvname(func, a)
22 | 	end
23 | 	if mc == 13*128 then -- BCMjump
24 | 		c = pc+d-0x7fff
25 | 	elseif mc == 9*128 then -- BCMint
26 | 		c = jutil.funck(func, d)
27 | 	elseif mc == 10*128 then -- BCMstr
28 | 		c = jutil.funck(func, -d-1)
29 | 	elseif mc == 5*128 then -- BCMuv
30 | 		c = jutil.funcuvname(func, d)
31 | 	end
32 | 	-- Convert version-specific opcode to string
33 | 	op = 6*op
34 | 	op = string.sub(vmdef.bcnames, op+1, op+6):match('[^%s]+')
35 | 	return pc, op, a, b, c, d
36 | end
37 | 
38 | -- Decoder closure
39 | local function decoder(func)
40 | 	local pc = 0
41 | 	return function ()
42 | 		pc = pc + 1
43 | 		return decode_ins(func, pc)
44 | 	end
45 | end
46 | 
47 | -- Dump generated bytecode
48 | local function dump(func)
49 | 	return require('jit.bc').dump(func)
50 | end
51 | 
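-- A minimal sketch of decoding a Lua function's bytecode with the decoder
-- above (not part of the original file; opcode names come from jit.vmdef):
--
--   local ljbytecode = require('bpf.ljbytecode')
--   local f = function (x) return x + 1 end
--   for pc, op, a, b, c, d in ljbytecode.decoder(f) do
--   	print(pc, op, a, b, c, d)
--   end
--
52 | return {
53 | 	decode = decode_ins,
54 | 	decoder = decoder,
55 | 	dump = dump,
56 | 	funcinfo = function (...) return jutil.funcinfo(...)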
end, 57 | } -------------------------------------------------------------------------------- /bpf/proto.lua: -------------------------------------------------------------------------------- 1 | local ffi = require('ffi') 2 | local BPF = ffi.typeof('struct bpf') 3 | 4 | ffi.cdef [[ 5 | struct sk_buff { 6 | uint32_t len; 7 | uint32_t pkt_type; 8 | uint32_t mark; 9 | uint32_t queue_mapping; 10 | uint32_t protocol; 11 | uint32_t vlan_present; 12 | uint32_t vlan_tci; 13 | uint32_t vlan_proto; 14 | uint32_t priority; 15 | uint32_t ingress_ifindex; 16 | uint32_t ifindex; 17 | uint32_t tc_index; 18 | uint32_t cb[5]; 19 | uint32_t hash; 20 | uint32_t tc_classid; 21 | }; 22 | 23 | struct eth_t { 24 | uint8_t dst[6]; 25 | uint8_t src[6]; 26 | uint16_t type; 27 | } __attribute__((packed)); 28 | 29 | struct dot1q_t { 30 | uint16_t pri:3; 31 | uint16_t cfi:1; 32 | uint16_t vlanid:12; 33 | uint16_t type; 34 | } __attribute__((packed)); 35 | 36 | struct arp_t { 37 | uint16_t htype; 38 | uint16_t ptype; 39 | uint8_t hlen; 40 | uint8_t plen; 41 | uint16_t oper; 42 | uint8_t sha[6]; 43 | uint32_t spa; 44 | uint8_t tha[6]; 45 | uint32_t tpa; 46 | } __attribute__((packed)); 47 | 48 | struct ip_t { 49 | uint8_t ver:4; 50 | uint8_t hlen:4; 51 | uint8_t tos; 52 | uint16_t tlen; 53 | uint16_t identification; 54 | uint16_t ffo_unused:1; 55 | uint16_t df:1; 56 | uint16_t mf:1; 57 | uint16_t foffset:13; 58 | uint8_t ttl; 59 | uint8_t proto; 60 | uint16_t hchecksum; 61 | uint32_t src; 62 | uint32_t dst; 63 | } __attribute__((packed)); 64 | 65 | struct icmp_t { 66 | uint8_t type; 67 | uint8_t code; 68 | uint16_t checksum; 69 | } __attribute__((packed)); 70 | 71 | struct ip6_t { 72 | uint32_t ver:4; 73 | uint32_t priority:8; 74 | uint32_t flow_label:20; 75 | uint16_t payload_len; 76 | uint8_t next_header; 77 | uint8_t hop_limit; 78 | uint64_t src_hi; 79 | uint64_t src_lo; 80 | uint64_t dst_hi; 81 | uint64_t dst_lo; 82 | } __attribute__((packed)); 83 | 84 | struct ip6_opt_t { 85 | uint8_t next_header; 86 | uint8_t ext_len; 87 | uint8_t pad[6]; 88 | } __attribute__((packed)); 89 | 90 | struct icmp6_t { 91 | uint8_t type; 92 | uint8_t code; 93 | uint16_t checksum; 94 | } __attribute__((packed)); 95 | 96 | struct udp_t { 97 | uint16_t src_port; 98 | uint16_t dst_port; 99 | uint16_t length; 100 | uint16_t crc; 101 | } __attribute__((packed)); 102 | 103 | struct tcp_t { 104 | uint16_t src_port; 105 | uint16_t dst_port; 106 | uint32_t seq_num; 107 | uint32_t ack_num; 108 | uint8_t offset:4; 109 | uint8_t reserved:4; 110 | uint8_t flag_cwr:1; 111 | uint8_t flag_ece:1; 112 | uint8_t flag_urg:1; 113 | uint8_t flag_ack:1; 114 | uint8_t flag_psh:1; 115 | uint8_t flag_rst:1; 116 | uint8_t flag_syn:1; 117 | uint8_t flag_fin:1; 118 | uint16_t rcv_wnd; 119 | uint16_t cksum; 120 | uint16_t urg_ptr; 121 | } __attribute__((packed)); 122 | 123 | struct vxlan_t { 124 | uint32_t rsv1:4; 125 | uint32_t iflag:1; 126 | uint32_t rsv2:3; 127 | uint32_t rsv3:24; 128 | uint32_t key:24; 129 | uint32_t rsv4:8; 130 | } __attribute__((packed)); 131 | ]] 132 | 133 | 134 | -- Architecture-specific ptrace register layout 135 | local S = require('syscall') 136 | local arch = S.abi.arch 137 | local parm_to_reg = {} 138 | if arch == 'x64' then 139 | ffi.cdef [[ 140 | struct pt_regs { 141 | unsigned long r15; 142 | unsigned long r14; 143 | unsigned long r13; 144 | unsigned long r12; 145 | unsigned long bp; 146 | unsigned long bx; 147 | unsigned long r11; 148 | unsigned long r10; 149 | unsigned long r9; 150 | unsigned long r8; 151 | unsigned long ax; 152 | 
unsigned long cx; 153 | unsigned long dx; 154 | unsigned long si; 155 | unsigned long di; 156 | unsigned long orig_ax; 157 | unsigned long ip; 158 | unsigned long cs; 159 | unsigned long flags; 160 | unsigned long sp; 161 | unsigned long ss; 162 | };]] 163 | parm_to_reg = {parm1='di', parm2='si', parm3='dx', parm4='cx', parm5='r8', ret='sp', fp='bp'} 164 | else 165 | ffi.cdef 'struct pt_regs {};' 166 | end 167 | -- Map symbolic registers to architecture ABI 168 | ffi.metatype('struct pt_regs', { 169 | __index = function (t,k) 170 | return assert(parm_to_reg[k], 'no such register: '..k) 171 | end, 172 | }) 173 | 174 | local M = {} 175 | 176 | -- Dissector interface 177 | local function dissector(type, e, dst, src, field) 178 | local parent = e.V[src].const 179 | -- Create new dissector variable 180 | e.vcopy(dst, src) 181 | -- Compute and materialize new dissector offset from parent 182 | e.V[dst].const = {off=e.V[src].const.off, __dissector=e.V[src].const.__dissector} 183 | parent.__dissector[field](e, dst) 184 | e.V[dst].const.__dissector = type 185 | end 186 | M.dissector = dissector 187 | 188 | -- Get current effective offset, load field value at an offset relative to it and 189 | -- add its value to compute next effective offset (e.g. udp_off = ip_off + pkt[ip_off].hlen) 190 | local function next_offset(e, var, type, off, mask, shift) 191 | local d = e.V[var].const 192 | -- Materialize relative offset value in R0 193 | local dst_reg, tmp_reg 194 | if d.off then 195 | dst_reg = e.vreg(var, 0, true) 196 | tmp_reg = dst_reg -- Use target register to avoid copy 197 | e.emit(BPF.LD + BPF.ABS + e.const_width[ffi.sizeof(type)], tmp_reg, 0, 0, d.off + off or 0) 198 | else 199 | tmp_reg = e.vreg(e.tmpvar, 0, true, type) -- Reserve R0 for temporary relative offset 200 | dst_reg = e.vreg(var) -- Must rematerialize (if it was spilled by tmp var) 201 | e.emit(BPF.LD + BPF.IND + e.const_width[ffi.sizeof(type)], tmp_reg, dst_reg, 0, off or 0) 202 | end 203 | -- Finalize relative offset 204 | if mask then 205 | e.emit(BPF.ALU + BPF.AND + BPF.K, tmp_reg, 0, 0, mask) 206 | end 207 | if shift then 208 | local op = BPF.LSH 209 | if shift < 0 then 210 | op = BPF.RSH 211 | shift = -shift 212 | end 213 | e.emit(BPF.ALU + op + BPF.K, tmp_reg, 0, 0, shift) 214 | end 215 | -- Add to base offset to turn it into effective address 216 | if dst_reg ~= tmp_reg then 217 | e.emit(BPF.ALU + BPF.ADD + BPF.X, dst_reg, tmp_reg, 0, 0) 218 | else 219 | e.emit(BPF.ALU + BPF.ADD + BPF.K, dst_reg, 0, 0, d.off) 220 | end 221 | -- Discard temporary allocations 222 | d.off = nil 223 | e.V[e.tmpvar].reg = nil 224 | end 225 | 226 | local function next_skip(e, var, off) 227 | local d = e.V[var].const 228 | if not d.off then 229 | local dst_reg = e.vreg(var) 230 | e.emit(BPF.ALU64 + BPF.ADD + BPF.K, dst_reg, 0, 0, off) 231 | else 232 | d.off = d.off + off 233 | end 234 | end 235 | 236 | local function skip_eth(e, dst) 237 | -- IP starts right after ETH header (fixed size) 238 | local d = e.V[dst].const 239 | d.off = d.off + ffi.sizeof('struct eth_t') 240 | end 241 | 242 | -- Export types 243 | M.type = function(typestr, t) 244 | t = t or {} 245 | t.__dissector=ffi.typeof(typestr) 246 | return t 247 | end 248 | M.skb = M.type('struct sk_buff', {__base=true}) 249 | M.pt_regs = M.type('struct pt_regs', {__base=true, source='probe'}) 250 | M.pkt = {off=0, __dissector=ffi.typeof('struct eth_t')} -- skb needs special accessors 251 | -- M.eth = function (...) return dissector(ffi.typeof('struct eth_t'), ...) 
end 252 | M.dot1q = function (...) return dissector(ffi.typeof('struct dot1q_t'), ...) end 253 | M.arp = function (...) return dissector(ffi.typeof('struct arp_t'), ...) end 254 | M.icmp = function (...) return dissector(ffi.typeof('struct icmp_t'), ...) end 255 | M.ip = function (...) return dissector(ffi.typeof('struct ip_t'), ...) end 256 | M.icmp6 = function (...) return dissector(ffi.typeof('struct icmp6_t'), ...) end 257 | M.ip6 = function (...) return dissector(ffi.typeof('struct ip6_t'), ...) end 258 | M.ip6_opt = function (...) return dissector(ffi.typeof('struct ip6_opt_t'), ...) end 259 | M.udp = function (...) return dissector(ffi.typeof('struct udp_t'), ...) end 260 | M.tcp = function (...) return dissector(ffi.typeof('struct tcp_t'), ...) end 261 | M.vxlan = function (...) return dissector(ffi.typeof('struct vxlan_t'), ...) end 262 | M.data = function (...) return dissector(ffi.typeof('uint8_t'), ...) end 263 | 264 | -- Metatables 265 | ffi.metatype(ffi.typeof('struct eth_t'), { 266 | __index = { 267 | ip = skip_eth, 268 | ip6 = skip_eth, 269 | } 270 | }) 271 | 272 | ffi.metatype(ffi.typeof('struct ip_t'), { 273 | __index = { 274 | -- Skip IP header length (stored as number of words) 275 | -- e.g. hlen = 5, Header Length = 5 x sizeof(u32) = 20 octets 276 | -- Mask first nibble and shift by 2 (multiplication by 4) 277 | icmp = function(e, dst) next_offset(e, dst, ffi.typeof('uint8_t'), 0, 0x0f, 2) end, 278 | udp = function(e, dst) next_offset(e, dst, ffi.typeof('uint8_t'), 0, 0x0f, 2) end, 279 | tcp = function(e, dst) next_offset(e, dst, ffi.typeof('uint8_t'), 0, 0x0f, 2) end, 280 | } 281 | }) 282 | 283 | ffi.metatype(ffi.typeof('struct tcp_t'), { 284 | __index = { 285 | -- Skip TCP header length (stored as number of words) 286 | -- e.g. 
hlen = 5, Header Length = 5 x sizeof(u32) = 20 octets 287 | data = function(e, dst) 288 | next_offset(e, dst, ffi.typeof('uint8_t'), ffi.offsetof('struct tcp_t', 'offset'), 0xf0, -2) 289 | end, 290 | } 291 | }) 292 | 293 | ffi.metatype(ffi.typeof('struct udp_t'), { 294 | __index = { 295 | -- Skip UDP header length (8 octets) 296 | data = function(e, dst) 297 | next_skip(e, dst, ffi.sizeof('struct udp_t')) 298 | end, 299 | } 300 | }) 301 | 302 | -- Constants 303 | M.c = { 304 | eth = { -- Constants http://standards.ieee.org/regauth/ethertype 305 | ip = 0x0800, -- IP (v4) protocol 306 | ip6 = 0x86dd, -- IP (v6) protocol 307 | arp = 0x0806, -- Address resolution protocol 308 | revarp = 0x8035, -- Reverse addr resolution protocol 309 | vlan = 0x8100, -- IEEE 802.1Q VLAN tagging 310 | }, 311 | ip = { 312 | -- Reserved Addresses 313 | addr_any = 0x00000000, -- 0.0.0.0 314 | addr_broadcast = 0xffffffff, -- 255.255.255.255 315 | addr_loopback = 0x7f000001, -- 127.0.0.1 316 | addr_mcast_all = 0xe0000001, -- 224.0.0.1 317 | addr_mcast_local = 0xe00000ff, -- 224.0.0.255 318 | -- Type of service (ip_tos), RFC 1349 ("obsoleted by RFC 2474") 319 | tos_default = 0x00, -- default 320 | tos_lowdelay = 0x10, -- low delay 321 | tos_throughput = 0x08, -- high throughput 322 | tos_reliability = 0x04, -- high reliability 323 | tos_lowcost = 0x02, -- low monetary cost - XXX 324 | tos_ect = 0x02, -- ECN-capable transport 325 | tos_ce = 0x01, -- congestion experienced 326 | -- Fragmentation flags (ip_off) 327 | rf = 0x8000, -- reserved 328 | df = 0x4000, -- don't fragment 329 | mf = 0x2000, -- more fragments (not last frag) 330 | offmask = 0x1fff, -- mask for fragment offset 331 | -- Time-to-live (ip_ttl), seconds 332 | ttl_default = 64, -- default ttl, RFC 1122, RFC 1340 333 | ttl_max = 255, -- maximum ttl 334 | -- Protocol (ip_p) - http://www.iana.org/assignments/protocol-numbers 335 | proto_ip = 0, -- dummy for IP 336 | proto_hopopts = 0, -- IPv6 hop-by-hop options 337 | proto_icmp = 1, -- ICMP 338 | proto_igmp = 2, -- IGMP 339 | proto_ggp = 3, -- gateway-gateway protocol 340 | proto_ipip = 4, -- IP in IP 341 | proto_st = 5, -- ST datagram mode 342 | proto_tcp = 6, -- TCP 343 | proto_cbt = 7, -- CBT 344 | proto_egp = 8, -- exterior gateway protocol 345 | proto_igp = 9, -- interior gateway protocol 346 | proto_bbnrcc = 10, -- BBN RCC monitoring 347 | proto_nvp = 11, -- Network Voice Protocol 348 | proto_pup = 12, -- PARC universal packet 349 | proto_argus = 13, -- ARGUS 350 | proto_emcon = 14, -- EMCON 351 | proto_xnet = 15, -- Cross Net Debugger 352 | proto_chaos = 16, -- Chaos 353 | proto_udp = 17, -- UDP 354 | proto_mux = 18, -- multiplexing 355 | proto_dcnmeas = 19, -- DCN measurement 356 | proto_hmp = 20, -- Host Monitoring Protocol 357 | proto_prm = 21, -- Packet Radio Measurement 358 | proto_idp = 22, -- Xerox NS IDP 359 | proto_trunk1 = 23, -- Trunk-1 360 | proto_trunk2 = 24, -- Trunk-2 361 | proto_leaf1 = 25, -- Leaf-1 362 | proto_leaf2 = 26, -- Leaf-2 363 | proto_rdp = 27, -- "Reliable Datagram" proto 364 | proto_irtp = 28, -- Inet Reliable Transaction 365 | proto_tp = 29, -- ISO TP class 4 366 | proto_netblt = 30, -- Bulk Data Transfer 367 | proto_mfpnsp = 31, -- MFE Network Services 368 | proto_meritinp= 32, -- Merit Internodal Protocol 369 | proto_sep = 33, -- Sequential Exchange proto 370 | proto_3pc = 34, -- Third Party Connect proto 371 | proto_idpr = 35, -- Interdomain Policy Route 372 | proto_xtp = 36, -- Xpress Transfer Protocol 373 | proto_ddp = 37, -- Datagram Delivery Proto 374 | proto_cmtp 
= 38, -- IDPR Ctrl Message Trans 375 | proto_tppp = 39, -- TP++ Transport Protocol 376 | proto_il = 40, -- IL Transport Protocol 377 | proto_ip6 = 41, -- IPv6 378 | proto_sdrp = 42, -- Source Demand Routing 379 | proto_routing = 43, -- IPv6 routing header 380 | proto_fragment= 44, -- IPv6 fragmentation header 381 | proto_rsvp = 46, -- Reservation protocol 382 | proto_gre = 47, -- General Routing Encap 383 | proto_mhrp = 48, -- Mobile Host Routing 384 | proto_ena = 49, -- ENA 385 | proto_esp = 50, -- Encap Security Payload 386 | proto_ah = 51, -- Authentication Header 387 | proto_inlsp = 52, -- Integated Net Layer Sec 388 | proto_swipe = 53, -- SWIPE 389 | proto_narp = 54, -- NBMA Address Resolution 390 | proto_mobile = 55, -- Mobile IP, RFC 2004 391 | proto_tlsp = 56, -- Transport Layer Security 392 | proto_skip = 57, -- SKIP 393 | proto_icmp6 = 58, -- ICMP for IPv6 394 | proto_none = 59, -- IPv6 no next header 395 | proto_dstopts = 60, -- IPv6 destination options 396 | proto_anyhost = 61, -- any host internal proto 397 | proto_cftp = 62, -- CFTP 398 | proto_anynet = 63, -- any local network 399 | proto_expak = 64, -- SATNET and Backroom EXPAK 400 | proto_kryptolan = 65, -- Kryptolan 401 | proto_rvd = 66, -- MIT Remote Virtual Disk 402 | proto_ippc = 67, -- Inet Pluribus Packet Core 403 | proto_distfs = 68, -- any distributed fs 404 | proto_satmon = 69, -- SATNET Monitoring 405 | proto_visa = 70, -- VISA Protocol 406 | proto_ipcv = 71, -- Inet Packet Core Utility 407 | proto_cpnx = 72, -- Comp Proto Net Executive 408 | proto_cphb = 73, -- Comp Protocol Heart Beat 409 | proto_wsn = 74, -- Wang Span Network 410 | proto_pvp = 75, -- Packet Video Protocol 411 | proto_brsatmon= 76, -- Backroom SATNET Monitor 412 | proto_sunnd = 77, -- SUN ND Protocol 413 | proto_wbmon = 78, -- WIDEBAND Monitoring 414 | proto_wbexpak = 79, -- WIDEBAND EXPAK 415 | proto_eon = 80, -- ISO CNLP 416 | proto_vmtp = 81, -- Versatile Msg Transport 417 | proto_svmtp = 82, -- Secure VMTP 418 | proto_vines = 83, -- VINES 419 | proto_ttp = 84, -- TTP 420 | proto_nsfigp = 85, -- NSFNET-IGP 421 | proto_dgp = 86, -- Dissimilar Gateway Proto 422 | proto_tcf = 87, -- TCF 423 | proto_eigrp = 88, -- EIGRP 424 | proto_ospf = 89, -- Open Shortest Path First 425 | proto_spriterpc= 90, -- Sprite RPC Protocol 426 | proto_larp = 91, -- Locus Address Resolution 427 | proto_mtp = 92, -- Multicast Transport Proto 428 | proto_ax25 = 93, -- AX.25 Frames 429 | proto_ipipencap= 94, -- yet-another IP encap 430 | proto_micp = 95, -- Mobile Internet Ctrl 431 | proto_sccsp = 96, -- Semaphore Comm Sec Proto 432 | proto_etherip = 97, -- Ethernet in IPv4 433 | proto_encap = 98, -- encapsulation header 434 | proto_anyenc = 99, -- private encryption scheme 435 | proto_gmtp = 100, -- GMTP 436 | proto_ifmp = 101, -- Ipsilon Flow Mgmt Proto 437 | proto_pnni = 102, -- PNNI over IP 438 | proto_pim = 103, -- Protocol Indep Multicast 439 | proto_aris = 104, -- ARIS 440 | proto_scps = 105, -- SCPS 441 | proto_qnx = 106, -- QNX 442 | proto_an = 107, -- Active Networks 443 | proto_ipcomp = 108, -- IP Payload Compression 444 | proto_snp = 109, -- Sitara Networks Protocol 445 | proto_compaqpeer= 110, -- Compaq Peer Protocol 446 | proto_ipxip = 111, -- IPX in IP 447 | proto_vrrp = 112, -- Virtual Router Redundancy 448 | proto_pgm = 113, -- PGM Reliable Transport 449 | proto_any0hop = 114, -- 0-hop protocol 450 | proto_l2tp = 115, -- Layer 2 Tunneling Proto 451 | proto_ddx = 116, -- D-II Data Exchange (DDX) 452 | proto_iatp = 117, -- Interactive Agent Xfer 453 | 
proto_stp = 118, -- Schedule Transfer Proto 454 | proto_srp = 119, -- SpectraLink Radio Proto 455 | proto_uti = 120, -- UTI 456 | proto_smp = 121, -- Simple Message Protocol 457 | proto_sm = 122, -- SM 458 | proto_ptp = 123, -- Performance Transparency 459 | proto_isis = 124, -- ISIS over IPv4 460 | proto_fire = 125, -- FIRE 461 | proto_crtp = 126, -- Combat Radio Transport 462 | proto_crudp = 127, -- Combat Radio UDP 463 | proto_sscopmce= 128, -- SSCOPMCE 464 | proto_iplt = 129, -- IPLT 465 | proto_sps = 130, -- Secure Packet Shield 466 | proto_pipe = 131, -- Private IP Encap in IP 467 | proto_sctp = 132, -- Stream Ctrl Transmission 468 | proto_fc = 133, -- Fibre Channel 469 | proto_rsvpign = 134, -- RSVP-E2E-IGNORE 470 | proto_raw = 255, -- Raw IP packets 471 | proto_reserved= 255, -- Reserved 472 | }, 473 | } 474 | 475 | return M -------------------------------------------------------------------------------- /examples/kprobe-latency.lua: -------------------------------------------------------------------------------- 1 | -- This example program measures latency of block device operations and plots it 2 | -- in a histogram. It is similar to BPF example: 3 | -- https://github.com/torvalds/linux/blob/master/samples/bpf/tracex3_kern.c 4 | local ffi = require('ffi') 5 | local bpf = require('bpf') 6 | local S = require('syscall') 7 | 8 | -- Shared part of the program 9 | local bins = 100 10 | local map = bpf.map('hash', 512, ffi.typeof('uint64_t'), ffi.typeof('uint64_t')) 11 | local lat_map = bpf.map('array', bins) 12 | 13 | -- Kernel-space part of the program 14 | local trace_start = assert(bpf(function (ptregs) 15 | local req = ffi.cast('struct pt_regs', ptregs) 16 | map[req.parm1] = time() 17 | end)) 18 | local trace_end = assert(bpf(function (ptregs) 19 | local req = ffi.cast('struct pt_regs', ptregs) 20 | -- The lines below are computing index 21 | -- using log10(x)*10 = log2(x)*10/log2(10) = log2(x)*3 22 | -- index = 29 ~ 1 usec 23 | -- index = 59 ~ 1 msec 24 | -- index = 89 ~ 1 sec 25 | -- index = 99 ~ 10sec or more 26 | local delta = time() - map[req.parm1] 27 | local index = 3 * math.log2(delta) 28 | if index >= bins then 29 | index = bins-1 30 | end 31 | xadd(lat_map[index], 1) 32 | return true 33 | end)) 34 | local probes = { 35 | bpf.kprobe('myprobe:blk_start_request', trace_start, false, -1, 0), 36 | bpf.kprobe('myprobe2:blk_account_io_completion', trace_end, false, -1, 0), 37 | } 38 | -- User-space part of the program 39 | pcall(function() 40 | local counter = 0 41 | local sym = {' ',' ','.','.','*','*','o','o','O','O','#','#'} 42 | while true do 43 | -- Print header once in a while 44 | if counter % 50 == 0 then 45 | print('|1us |10us |100us |1ms |10ms |100ms |1s |10s') 46 | counter = 0 47 | end 48 | counter = counter + 1 49 | -- Collect all events 50 | local hist, events = {}, 0 51 | for i=29,bins-1 do 52 | local v = tonumber(lat_map[i] or 0) 53 | if v > 0 then 54 | hist[i] = hist[i] or 0 + v 55 | events = events + v 56 | end 57 | end 58 | -- Print histogram symbols based on relative frequency 59 | local s = '' 60 | for i=29,bins-1 do 61 | if hist[i] then 62 | local c = math.ceil((hist[i] / (events + 1)) * #sym) 63 | s = s .. sym[c] 64 | else s = s .. ' ' end 65 | end 66 | print(s .. 
string.format(' ; %d events', events)) 67 | S.sleep(1) 68 | end 69 | end) -------------------------------------------------------------------------------- /examples/kprobe-write.lua: -------------------------------------------------------------------------------- 1 | -- Simple tracing example that executes a program on 2 | -- return from sys_write() and tracks the number of hits 3 | local ffi = require('ffi') 4 | local bpf = require('bpf') 5 | local S = require('syscall') 6 | 7 | -- Shared part of the program 8 | local map = bpf.map('array', 1) 9 | -- Kernel-space part of the program 10 | local probe = assert(bpf.kprobe('myprobe:sys_write', bpf(function (ptregs) 11 | xadd(map[0], 1) 12 | end), true)) 13 | -- User-space part of the program 14 | pcall(function() 15 | for _ = 1, 10 do 16 | print('hits: ', tonumber(map[0])) 17 | S.sleep(1) 18 | end 19 | end) 20 | probe:close() 21 | -------------------------------------------------------------------------------- /examples/sock-parse-dns.lua: -------------------------------------------------------------------------------- 1 | -- Simple parsing example of UDP/DNS that counts frequency of QTYPEs. 2 | -- It shows how to parse packet variable-length packet structures. 3 | local ffi = require("ffi") 4 | local bpf = require("bpf") 5 | local S = require("syscall") 6 | 7 | -- Shared part of the program 8 | local map = assert(bpf.map('array', 256)) 9 | -- Kernel-space part of the program 10 | local prog = bpf.socket('lo', bpf(function (skb) 11 | local ip = pkt.ip -- Accept only UDP messages 12 | if ip.proto ~= c.ip.proto_udp then return false end 13 | local udp = ip.udp -- Only messages >12 octets (DNS header) 14 | if udp.length < 12 then return false end 15 | -- Unroll QNAME (up to 2 labels) 16 | udp = udp.data + 12 17 | local label = udp[0] 18 | if label > 0 then 19 | udp = udp + label + 1 20 | label = udp[0] 21 | if label > 0 then 22 | udp = udp + label + 1 23 | end 24 | end 25 | -- Track QTYPE (low types) 26 | if udp[0] == 0 then 27 | local qtype = udp[2] -- Low octet from QTYPE 28 | xadd(map[qtype], 1) 29 | end 30 | end)) 31 | -- User-space part of the program 32 | for _ = 1, 10 do 33 | for k,v in map.pairs,map,0 do 34 | v = tonumber(v) 35 | if v > 0 then 36 | print(string.format('TYPE%d: %d', k, v)) 37 | end 38 | end 39 | S.sleep(1) 40 | end -------------------------------------------------------------------------------- /examples/sock-parse-http.lua: -------------------------------------------------------------------------------- 1 | -- Simple parsing example of TCP/HTTP that counts frequency of types of requests 2 | -- and shows more complicated pattern matching constructions and slices. 
/examples/sock-parse-http.lua:
--------------------------------------------------------------------------------
1 | -- Simple parsing example of TCP/HTTP that counts frequency of types of requests
2 | -- and shows more complicated pattern matching constructions and slices.
3 | -- Rewrite of a BCC example:
4 | -- https://github.com/iovisor/bcc/blob/master/examples/networking/http_filter/http-parse-simple.c
5 | local ffi = require("ffi")
6 | local bpf = require("bpf")
7 | local S = require("syscall")
8 |
9 | -- Shared part of the program
10 | local map = bpf.map('hash', 64)
11 | -- Kernel-space part of the program
12 | local prog = bpf(function (skb)
13 |     -- Only ingress so we don't count twice on loopback
14 |     if skb.ingress_ifindex == 0 then return end
15 |     local data = pkt.ip.tcp.data -- Get TCP protocol dissector
16 |     -- Continue only if we have 7 bytes of TCP data
17 |     if data + 7 > skb.len then return end
18 |     -- Fetch 4 bytes of TCP data and compare
19 |     local h = data(0, 4)
20 |     if h == 'HTTP' or h == 'GET ' or
21 |        h == 'POST' or h == 'PUT ' or
22 |        h == 'HEAD' or h == 'DELE' then
23 |         -- If hash key doesn't exist, create it,
24 |         -- otherwise increment counter
25 |         local v = map[h]
26 |         if not v then map[h] = 1
27 |         else xadd(map[h], 1)
28 |         end
29 |     end
30 | end)
31 | bpf.dump(prog)
32 | bpf.socket('lo', prog)
33 | -- User-space part of the program
34 | for _ = 1, 10 do
35 |     local strkey = ffi.new('uint32_t [1]')
36 |     local s = ''
37 |     for k,v in map.pairs,map,0 do
38 |         strkey[0] = bpf.ntoh(k)
39 |         s = s..string.format('%s %d ', ffi.string(strkey, 4):match '^%s*(.-)%s*$', tonumber(v))
40 |     end
41 |     if #s > 0 then print(s..'messages') end
42 |     S.sleep(1)
43 | end
--------------------------------------------------------------------------------
/examples/sock-proto.lua:
--------------------------------------------------------------------------------
1 | -- This program looks at TCP, UDP and ICMP packets and
2 | -- increments a counter for each packet of a given type seen
3 | -- Rewrite of https://github.com/torvalds/linux/blob/master/samples/bpf/sock_example.c
4 | local ffi = require("ffi")
5 | local bpf = require("bpf")
6 | local S = require("syscall")
7 |
8 | -- Shared part of the program
9 | local map = bpf.map('hash', 256)
10 | map[1], map[6], map[17] = 0, 0, 0
11 | -- Kernel-space part of the program
12 | bpf.socket('lo', bpf(function ()
13 |     local proto = pkt.ip.proto -- Get byte (ip.proto) from frame at [23]
14 |     xadd(map[proto], 1) -- Atomic `map[proto] += 1`
15 | end))
16 | -- User-space part of the program
17 | for _ = 1, 10 do
18 |     local icmp, udp, tcp = map[1], map[17], map[6]
19 |     print(string.format('TCP %d UDP %d ICMP %d packets',
20 |         tonumber(tcp or 0), tonumber(udp or 0), tonumber(icmp or 0)))
21 |     S.sleep(1)
22 | end
--------------------------------------------------------------------------------
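Where the `[23]` in the comment above comes from, assuming an untagged Ethernet frame: the 14-byte Ethernet header plus the 9-byte offset of the protocol field within the IPv4 header:

    local ETH_HLEN, IPV4_PROTO_OFF = 14, 9  -- no VLAN tag assumed
    assert(ETH_HLEN + IPV4_PROTO_OFF == 23) -- the frame byte read by pkt.ip.proto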
/examples/sock-protolen.lua:
--------------------------------------------------------------------------------
1 | -- This program counts total bytes received per-protocol in 64-bit counters.
2 | -- The map backend is array in this case to avoid key allocations.
3 | -- It accumulates skb.len per protocol instead of a packet count.
4 | -- Rewrite of https://github.com/torvalds/linux/blob/master/samples/bpf/sock_example.c
5 | local ffi = require("ffi")
6 | local bpf = require("bpf")
7 | local S = require("syscall")
8 |
9 | -- Shared part of the program
10 | local map = bpf.map('array', 256, ffi.typeof('uint32_t'), ffi.typeof('uint64_t'))
11 | -- Kernel-space part of the program
12 | bpf.socket('lo', bpf(function (skb)
13 |     local proto = pkt.ip.proto -- Get byte (ip.proto) from frame at [23]
14 |     xadd(map[proto], skb.len) -- Atomic `map[proto] += skb.len`
15 | end))
16 | -- User-space part of the program
17 | for _ = 1, 10 do
18 |     local icmp, udp, tcp = map[1], map[17], map[6]
19 |     print(string.format('TCP %d UDP %d ICMP %d bytes',
20 |         tonumber(tcp or 0), tonumber(udp or 0), tonumber(icmp or 0)))
21 |     S.sleep(1)
22 | end
--------------------------------------------------------------------------------
/examples/tracepoint-offcputime.lua:
--------------------------------------------------------------------------------
1 | -- Summarize off-CPU time by stack trace
2 | -- Related tool: https://github.com/iovisor/bcc/blob/master/tools/offcputime.py
3 | local ffi = require('ffi')
4 | local bpf = require('bpf')
5 | local S = require('syscall')
6 | -- Create BPF maps
7 | -- TODO: made smaller to fit default memory limits
8 | local key_t = 'struct { char name[16]; int32_t stack_id; }'
9 | local starts = assert(bpf.map('hash', 128, ffi.typeof('uint32_t'), ffi.typeof('uint64_t')))
10 | local counts = assert(bpf.map('hash', 128, ffi.typeof(key_t), ffi.typeof('uint64_t')))
11 | local stack_traces = assert(bpf.map('stack_trace', 16))
12 | -- Open tracepoint and attach BPF program
13 | -- The 'arg' is parsed from the tracepoint format automatically
14 | local tp = bpf.tracepoint('sched/sched_switch', function (arg)
15 |     -- Update previous thread sleep time
16 |     local pid = arg.prev_pid
17 |     local now = time()
18 |     starts[pid] = now
19 |     -- Calculate current thread's delta time
20 |     pid = arg.next_pid
21 |     local from = starts[pid]
22 |     if not from then
23 |         return 0
24 |     end
25 |     local delta = (now - from) / 1000
26 |     starts[pid] = nil
27 |     -- Skip if the delta is below 1us
28 |     if delta < 1 then
29 |         return
30 |     end
31 |     -- Create key for this thread
32 |     local key = ffi.new(key_t)
33 |     comm(key.name)
34 |     key.stack_id = stack_id(stack_traces, BPF.F_FAST_STACK_CMP)
35 |     -- Update current thread's off-CPU time with delta
36 |     local val = counts[key]
37 |     if not val then
38 |         counts[key] = 0
39 |     end
40 |     xadd(counts[key], delta)
41 | end, 0, -1)
42 | -- Helper: load kernel symbols
43 | ffi.cdef 'unsigned long long strtoull(const char *, char **, int);'
44 | local ksyms = {}
45 | for l in io.lines('/proc/kallsyms') do
46 |     local addr, sym = l:match '(%w+) %w (%S+)'
47 |     if addr then ksyms[ffi.C.strtoull(addr, nil, 16)] = sym end
48 | end
49 | -- User-space part of the program
50 | while true do
51 |     for k,v in counts.pairs,counts,nil do
52 |         local s = ''
53 |         local traces = stack_traces[k.stack_id]
54 |         if traces then
55 |             for i, ip in ipairs(traces) do
56 |                 s = s .. string.format("    %-16p %s", ip, ksyms[ip])
57 |             end
58 |         end
59 |         s = s .. string.format("    %-16s %s", "-", ffi.string(k.name))
60 |         s = s .. string.format(" %d", tonumber(v))
61 |         print(s)
62 |     end
63 |     S.sleep(1)
64 | end
65 |
--------------------------------------------------------------------------------
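The `ksyms[ip]` lookup above only resolves addresses that exactly match a symbol's start. A fuller symbolizer would pick the nearest symbol at or below the instruction pointer; a sketch reusing the `ksyms` table built above (it assumes `ip` is not below the first symbol):

    local addrs = {}
    for a in pairs(ksyms) do addrs[#addrs + 1] = a end
    table.sort(addrs)
    local function symbolize(ip)
        local lo, hi = 1, #addrs
        while lo < hi do -- binary search for the last address <= ip
            local mid = math.ceil((lo + hi) / 2)
            if addrs[mid] > ip then hi = mid - 1 else lo = mid end
        end
        return ksyms[addrs[lo]]
    end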
/examples/uprobe-readline-perf.lua:
--------------------------------------------------------------------------------
1 | -- Trace readline() call from all bash instances (print bash commands from all running shells).
2 | -- This is a rough equivalent of `bashreadline` with output through the perf event API.
3 | -- Source: http://www.brendangregg.com/blog/2016-02-08/linux-ebpf-bcc-uprobes.html
4 | local ffi = require('ffi')
5 | local bpf = require('bpf')
6 | local S = require('syscall')
7 | -- Perf event map
8 | local sample_t = 'struct { uint64_t pid; char str[80]; }'
9 | local events = bpf.map('perf_event_array')
10 | -- Kernel-space part of the program
11 | local prog = bpf(function (ptregs)
12 |     local req = ffi.cast('struct pt_regs', ptregs) -- Cast to pt_regs, specialized type.
13 |     local sample = ffi.new(sample_t)
14 |     sample.pid = pid_tgid()
15 |     ffi.copy(sample.str, ffi.cast('char *', req.ax)) -- Cast `ax` to string pointer and copy to buffer
16 |     perf_submit(events, sample) -- Write buffer to perf event map
17 | end)
18 | bpf.dump(prog)
19 | local probe = assert(bpf.uprobe('/bin/bash:readline', prog, true, -1, 0))
20 | -- User-space part of the program
21 | local log = events:reader(nil, 0, sample_t) -- Must specify PID or CPU_ID to observe
22 | print(' TASK-PID TIMESTAMP FUNCTION')
23 | print(' | | | |')
24 | while true do
25 |     log:block() -- Wait until event reader is readable
26 |     for _,e in log:read() do -- Collect available reader events
27 |         print(string.format('%12s%-16s %-10s %s', '', tonumber(e.pid), os.date("%H:%M:%S"), ffi.string(e.str)))
28 |     end
29 | end
30 |
--------------------------------------------------------------------------------
/examples/uprobe-readline.lua:
--------------------------------------------------------------------------------
1 | -- Trace readline() call from all bash instances (print bash commands from all running shells).
2 | -- This is a rough equivalent of `bashreadline`
3 | -- Source: http://www.brendangregg.com/blog/2016-02-08/linux-ebpf-bcc-uprobes.html
4 | local ffi = require('ffi')
5 | local bpf = require('bpf')
6 | local S = require('syscall')
7 | -- Kernel-space part of the program
8 | local prog = bpf(function (ptregs)
9 |     local req = ffi.cast('struct pt_regs', ptregs) -- Cast to pt_regs, specialized type.
10 |     local line = ffi.new('char [40]') -- Create a 40 byte buffer on stack
11 |     ffi.copy(line, ffi.cast('char *', req.ax)) -- Cast `ax` to string pointer and copy to buffer
12 |     print('%s\n', line) -- Print to trace_pipe
13 | end)
14 | bpf.dump(prog)
15 | local probe = assert(bpf.uprobe('/bin/bash:readline', prog, true, -1, 0))
16 | -- User-space part of the program
17 | local ok, err = pcall(function()
18 |     local log = bpf.tracelog()
19 |     print(' TASK-PID CPU# TIMESTAMP FUNCTION')
20 |     print(' | | | | |')
21 |     while true do
22 |         print(log:read())
23 |     end
24 | end)
25 |
--------------------------------------------------------------------------------
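Both readline examples assume `/bin/bash` exports a `readline` symbol. The bundled ELF reader (exercised in spec/elf_spec.lua below) can confirm that before attaching; a sketch:

    local elf = require('bpf.elf')
    local bash = assert(elf.open('/bin/bash'))
    assert(bash:resolve('readline'), 'readline not found in /bin/bash')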
/examples/uprobe-tailkt.lua:
--------------------------------------------------------------------------------
1 | -- Trace operations on keys matching a given pattern in the KyotoTycoon daemon.
2 | -- This can show you whether certain keys were modified or read during the daemon's
3 | -- lifetime, even if KT itself doesn't support such tracing. It also shows how to attach to C++ mangled symbols.
4 | local ffi = require('ffi')
5 | local bpf = require('bpf')
6 | local S = require('syscall')
7 | local function help(err)
8 |     print(string.format('%s [get|set] [key]', arg[0]))
9 |     if err then print('error: '..err) end
10 |     os.exit(1)
11 | end
12 | -- Accept the same format as ktremotemgr for clarity:
13 | local writeable, watch_key, klen = 'any', arg[2] or '*', 80
14 | if arg[1] == 'get' then writeable = 0
15 | elseif arg[1] == 'set' then writeable = 1
16 | elseif arg[1] == '-h' or arg[1] == '--help' then help()
17 | elseif arg[1] and arg[1] ~= 'any' then
18 |     help(string.format('bad cmd: "%s"', arg[1]))
19 | end
20 | if watch_key ~= '*' then klen = #watch_key end
21 |
22 | -- Find a good entrypoint that sees both the key and the read/write flag in KT.
23 | -- It is going to serve as the attachment point for the BPF program.
24 | -- ABI: bool accept(void *this, const char* kbuf, size_t ksiz, Visitor* visitor, bool writable)
25 | local key_type = string.format('char [%d]', klen)
26 | local prog = bpf(function (ptregs)
27 |     local req = ffi.cast('struct pt_regs', ptregs) -- Cast to pt_regs, specialized type.
28 |     -- Watch either get/set or both
29 |     if writeable ~= 'any' then
30 |         if req.parm5 ~= writeable then return end
31 |     end
32 |     local line = ffi.new(key_type)
33 |     ffi.copy(line, ffi.cast('char *', req.parm2))
34 |     -- Check if we're looking for a specific key
35 |     if watch_key ~= '*' then
36 |         if req.parm3 ~= klen then return false end
37 |         if line ~= watch_key then return false end
38 |     end
39 |     print('%s write:%d\n', line, req.parm5)
40 | end)
41 | local probe = assert(bpf.uprobe('/usr/local/bin/ktserver:kyotocabinet::StashDB::accept', prog, false, -1, 0))
42 | -- User-space part of the program
43 | local ok, err = pcall(function()
44 |     local log = bpf.tracelog()
45 |     print(' TASK-PID CPU# TIMESTAMP FUNCTION')
46 |     print(' | | | | |')
47 |     while true do
48 |         print(log:read())
49 |     end
50 | end)
51 |
--------------------------------------------------------------------------------
/rockspec/bpf-scm-1.rockspec:
--------------------------------------------------------------------------------
1 | package = "bpf"
2 | version = "scm-1"
3 | source = {
4 |     url = "git+https://github.com/vavrusa/luajit-bpf.git"
5 | }
6 | description = {
7 |     summary = "A LuaJIT to BPF compiler.",
8 |     detailed = [[
9 |     ]],
10 |     homepage = "https://github.com/vavrusa/luajit-bpf",
11 |     license = "BSD"
12 | }
13 | dependencies = {
14 |     "lua >= 5.1",
15 |     "ljsyscall >= 0.12",
16 | }
17 | external_dependencies = {
18 |     LIBELF = {
19 |         library = "elf"
20 |     }
21 | }
22 | build = {
23 |     type = "builtin",
24 |     install = {
25 |         bin = {
26 |         }
27 |     },
28 |     modules = {
29 |         ["bpf.builtins"] = "bpf/builtins.lua",
30 |         ["bpf.cdef"] = "bpf/cdef.lua",
31 |         ["bpf.elf"] = "bpf/elf.lua",
32 |         ["bpf.proto"] = "bpf/proto.lua",
33 |         ["bpf.ljbytecode"] = "bpf/ljbytecode.lua",
34 |         bpf = "bpf.lua",
35 |     }
36 | }
37 |
--------------------------------------------------------------------------------
/spec/compile_spec.lua:
--------------------------------------------------------------------------------
1 | describe('compile', function()
2 |     local ffi = require('ffi')
3 |     local bpf = require('bpf')
4 |
5 |     it('can compile socket filter', function()
6 |         -- Create mock BPF map
7 |         local mock_map = {
8 |             max_entries = 16,
9 |             key_type = ffi.typeof('uint64_t [1]'),
10 |             val_type = ffi.typeof('uint64_t [1]'),
11 |             fd = 1,
12 |             __map = true,
13 |         }
14 |         -- Compile small code example
15 |         local code = bpf(function ()
16 |             local proto = pkt.ip.proto
17 |             xadd(mock_map[proto], 1)
18 |         end)
19 |         assert.truthy(code)
20 |         assert.same(type(code), 'table')
21 |         assert.same(code.pc, 15)
22 |     end)
23 | end)
24 |
--------------------------------------------------------------------------------
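The `code.pc` assertion above counts the eBPF instructions the compiler emitted. When such a count drifts after a compiler change, `bpf.dump` (already used by several examples above) prints the program for inspection; a sketch of the same flow outside busted:

    local bpf = require('bpf')
    local code = bpf(function (skb)
        return skb.len
    end)
    bpf.dump(code) -- prints the compiled program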
/spec/decoder_spec.lua:
--------------------------------------------------------------------------------
1 | describe('decoder', function()
2 |
3 |     -- Decode simple function
4 |     local bytecode = require('bpf.ljbytecode')
5 |     local f = function (x) return x + 1 end
6 |
7 |     it('should decode functions', function()
8 |         -- Make sure it calls LJ decoder
9 |         local bc = bytecode.decoder(f)
10 |         assert.truthy(bc)
11 |         -- Decode bytecode to instructions
12 |         local jutil = require("jit.util")
13 |         spy.on(jutil, 'funcbc')
14 |         local pc, op = bc()
15 |         -- Check bytecode for sanity (starts with ADDVN(x, 1))
16 |         assert.equal(pc, 1)
17 |         assert.equal(op, 'ADDVN')
18 |         for pc, op in bc do
19 |             assert.truthy(pc and op)
20 |         end
21 |         assert.spy(jutil.funcbc).was.called()
22 |     end)
23 |     it('should fail on bad input', function()
24 |         assert.has_error(function() bytecode.decoder(nil)() end)
25 |         assert.has_error(function() bytecode.decoder(5)() end)
26 |         assert.has_error(function() bytecode.decoder('test')() end)
27 |     end)
28 |     it('should dump bytecode', function()
29 |         bytecode.dump(f)
30 |     end)
31 | end)
32 |
--------------------------------------------------------------------------------
/spec/elf_spec.lua:
--------------------------------------------------------------------------------
1 | describe('elf reader', function()
2 |
3 |     local elf = require('bpf.elf')
4 |     it('should handle C library', function()
5 |         -- Open /bin/sh binary
6 |         local sh = elf.open('/bin/sh')
7 |         assert.truthy(sh)
8 |         -- Find load address
9 |         local base = sh:loadaddr()
10 |         assert.truthy(base)
11 |         -- Find something from ISO C
12 |         local malloc_addr = sh:resolve('malloc')
13 |         assert.truthy(malloc_addr)
14 |         -- Find something that doesn't exist
15 |         local bad_addr = sh:resolve('thisnotexists')
16 |         assert.falsy(bad_addr)
17 |     end)
18 |     it('should fail on bad input', function()
19 |         assert.falsy(elf.open(nil))
20 |         assert.falsy(elf.open('/tmp'):loadaddr())
21 |     end)
22 | end)
23 |
--------------------------------------------------------------------------------
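Loading programs and attaching probes in the examples above requires root (or the equivalent capabilities). A guard one might add near the top of an example, using ljsyscall, which is already a dependency:

    local S = require('syscall')
    assert(S.getuid() == 0, 'this example must be run as root')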