├── .github └── workflows │ ├── docs.yml │ └── doctests.yml ├── .gitignore ├── README.md ├── bitspec.py ├── pyproject.toml └── tox.ini /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: Deploy to GitHub pages 2 | on: 3 | push: 4 | branches: [ master ] 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | 9 | steps: 10 | - uses: actions/checkout@v2 11 | 12 | - name: Generate static pydoc html 13 | uses: actions/setup-python@v2 14 | with: 15 | python-version: '3.10' 16 | - run: | 17 | python -m pip install --upgrade pip 18 | python -m pip install git+https://github.com/pdoc3/pdoc.git@f358893e4fcfd7f29857a7ff5491b606ff146d39 19 | python -m pdoc --html bitspec.py 20 | 21 | - name: Build docs and commit to fresh git repo 22 | run: | 23 | cd html 24 | mv bitspec.html index.html 25 | git init 26 | git add -A 27 | git config --local user.email "action@users.noreply.github.com" 28 | git config --local user.name "GitHub Action" 29 | git commit -m 'deploy' 30 | 31 | - name: Force push to destination branch 32 | uses: ad-m/github-push-action@v0.5.0 33 | with: 34 | github_token: ${{ secrets.GITHUB_TOKEN }} 35 | branch: gh-pages 36 | force: true 37 | directory: ./html 38 | -------------------------------------------------------------------------------- /.github/workflows/doctests.yml: -------------------------------------------------------------------------------- 1 | name: doctests 2 | on: 3 | push: 4 | branches: [master] 5 | pull_request: 6 | branches: [master] 7 | workflow_dispatch: 8 | jobs: 9 | test: 10 | runs-on: ubuntu-latest 11 | strategy: 12 | matrix: 13 | python-version: [3.8, 3.9, '3.10'] 14 | env: 15 | USING_COVERAGE: "3.10" 16 | steps: 17 | - name: Checkout sources 18 | uses: actions/checkout@v2 19 | 20 | - name: Set up Python 21 | uses: actions/setup-python@v2 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | 25 | - name: Install dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | python -m pip install coverage flake8 flit mccabe pylint pytest tox tox-gh-actions 29 | 30 | - name: Run tox 31 | run: | 32 | python -m tox 33 | 34 | - name: Upload coverage to Codecov 35 | uses: codecov/codecov-action@v1 36 | if: contains(env.USING_COVERAGE, matrix.python-version) 37 | with: 38 | fail_ci_if_error: false 39 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | dist/ 3 | .* 4 | !/.gitignore -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Bit pattern mini-language for instruction encodings. Bytes in, IR out! 2 | 3 | [![PyPI Version][pypi-image]][pypi-url] 4 | [![Doctests Status][build-image]][build-url] 5 | [![Code Coverage][coverage-image]][coverage-url] 6 | 7 | See [docs](https://amtal.github.io/bitspec) for a step-by-step tutorial and API 8 | reference. Here's a complete example: 9 | ```python 10 | >>> import bitspec 11 | >>> @bitspec.bitspec('11 xy:1 11101 0xCB nn:s8 00 000 r:3', op='RLC') 12 | ... @bitspec.bitspec('11 xy:1 11101 0xCB nn:s8 00 001 r:3', op='RRC') 13 | ... @bitspec.bitspec('11 xy:1 11101 0xCB nn:s8 00 010 r:3', op='RL ') 14 | ... @bitspec.bitspec('11 xy:1 11101 0xCB nn:s8 00 011 r:3', op='RR ') 15 | ... @bitspec.bitspec('11 xy:1 11101 0xCB nn:s8 00 100 r:3', op='SLA') 16 | ... @bitspec.bitspec('11 xy:1 11101 0xCB nn:s8 00 101 r:3', op='SRA') 17 | ... @bitspec.bitspec('11 xy:1 11101 0xCB nn:s8 00 110 r:3', op='SL1') # "SLL" 18 | ... @bitspec.bitspec('11 xy:1 11101 0xCB nn:s8 00 111 r:3', op='SRL') 19 | ... @bitspec.bitspec('11 xy:1 11101 0xCB nn:s8 10 b:3 r:3', op='RES') 20 | ... @bitspec.bitspec('11 xy:1 11101 0xCB nn:s8 11 b:3 r:3', op='SET') 21 | ... class Z80UndocBitOps: # NORTHERN BYTES Volume 3 #10 (October 1982) 22 | ... def __str__(self): # mirrored at http://z80.info/z80undoc.htm 23 | ... dst = ['B,', 'C,', 'D,', 'E,', 'H,', 'L,', '', 'A,'][self.r] 24 | ... bit = '' if self.b == None else f'{self.b},' 25 | ... src = ['IX', 'IY'][self.xy] 26 | ... return f'{self.op} {dst}{bit}({src}{self.nn:+})' 27 | ... def __repr__(self): return f'<{self!s}>' 28 | ... def __init__(self, xy, nn, r, op, b=None): 29 | ... self.xy, self.nn, self.r, self.op, self.b = xy, nn, r, op, b 30 | 31 | >>> code = bytes.fromhex('fdCB7f17 ddCBfe88 fdCB0125') 32 | >>> Z80UndocBitOps.from_bytes(code) 33 | 34 | 35 | >>> {hex(op.addr):op for op in Z80UndocBitOps.iter_bytes(code, addr=0x50)} 36 | {'0x50': , '0x54': , '0x58': } 37 | ``` 38 | 39 | Install from [Pypi](https://pypi.org/project/bitspec) or just copy `bitspec.py` 40 | into your project. [Bugs](https://github.com/amtal/bitspec/issues), questions, 41 | or [other feedback](https://github.com/amtal/bitspec/discussions) are welcome! 42 | 43 | 44 | 45 | [pypi-image]: https://img.shields.io/pypi/v/bitspec 46 | [pypi-url]: https://pypi.org/project/bitspec/ 47 | [build-image]: https://github.com/amtal/bitspec/actions/workflows/doctests.yml/badge.svg 48 | [build-url]: https://github.com/amtal/bitspec/actions/workflows/doctests.yml 49 | [coverage-image]: https://codecov.io/gh/amtal/bitspec/branch/master/graph/badge.svg 50 | [coverage-url]: https://codecov.io/gh/amtal/bitspec 51 | -------------------------------------------------------------------------------- /bitspec.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 amtal 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | """Bit pattern mini-language for instruction encodings. Bytes in, IR out! 23 | 24 | Machine code and interpreter bytecode usually have densely-packed semantic 25 | structure that speeds up tool development when exposed. Specifying encodings 26 | declaratively allows the intermediate representation (IR) to be laid out based 27 | on that structure, not bit-level quirks. 28 | 29 | # Example Disassembler 30 | 31 | Here's a peculiar group of 4-byte long Zilog Z80 instructions. The architecture 32 | is a byte-prefixed extension of the Intel 8080 crammed into 4 unused opcodes. 33 | Some encoding behaviors were left undefined - possibly to leave room for 34 | further extension. 35 | 36 | >>> import bitspec 37 | >>> @bitspec.bitspec('11 xy:1 11101 0xCB nn:s8 00 000 r:3', op='RLC') 38 | ... @bitspec.bitspec('11 xy:1 11101 0xCB nn:s8 00 001 r:3', op='RRC') 39 | ... @bitspec.bitspec('11 xy:1 11101 0xCB nn:s8 00 010 r:3', op='RL ') 40 | ... @bitspec.bitspec('11 xy:1 11101 0xCB nn:s8 00 011 r:3', op='RR ') 41 | ... @bitspec.bitspec('11 xy:1 11101 0xCB nn:s8 00 100 r:3', op='SLA') 42 | ... @bitspec.bitspec('11 xy:1 11101 0xCB nn:s8 00 101 r:3', op='SRA') 43 | ... @bitspec.bitspec('11 xy:1 11101 0xCB nn:s8 00 110 r:3', op='SL1') # "SLL" 44 | ... @bitspec.bitspec('11 xy:1 11101 0xCB nn:s8 00 111 r:3', op='SRL') 45 | ... @bitspec.bitspec('11 xy:1 11101 0xCB nn:s8 10 b:3 r:3', op='RES') 46 | ... @bitspec.bitspec('11 xy:1 11101 0xCB nn:s8 11 b:3 r:3', op='SET') 47 | ... class Z80UndocBitOps: # NORTHERN BYTES Volume 3 #10 (October 1982) 48 | ... def __str__(self): # mirrored at http://z80.info/z80undoc.htm 49 | ... dst = ['B,', 'C,', 'D,', 'E,', 'H,', 'L,', '', 'A,'][self.r] 50 | ... bit = '' if self.b == None else f'{self.b},' 51 | ... src = ['IX', 'IY'][self.xy] 52 | ... return f'{self.op} {dst}{bit}({src}{self.nn:+})' 53 | ... def __repr__(self): return f'<{self!s}>' 54 | ... def __init__(self, xy, nn, r, op, b=None): 55 | ... self.xy, self.nn, self.r, self.op, self.b = xy, nn, r, op, b 56 | 57 | The class decorators add `Bitspec.from_bytes` and `Bitspec.iter_bytes` 58 | classmethods that build pattern-matched objects based on the bitfields 59 | specified. Endianness and signed fields are supported. 60 | 61 | >>> code = bytes.fromhex('fdCB7f17 ddCBfe88 fdCB0125') 62 | >>> Z80UndocBitOps.from_bytes(code) 63 | 64 | >>> {hex(op.addr):op for op in Z80UndocBitOps.iter_bytes(code, addr=0x50)} 65 | {'0x50': , '0x54': , '0x58': } 66 | 67 | The objects get a `__len__` implementation based on which decorator matched. 68 | There's also a `Bitspec.to_bytes` method in case you ever need to patch or 69 | re-assemble code. It's a niche use case, but with a declarative spec it comes 70 | for free! 71 | 72 | >>> Z80UndocBitOps.from_bytes(b'\\xdd\\xcb\\xfe\\x88\\x00\\x00') 73 | 74 | >>> i = _; assert len(i) == 4 75 | >>> i.nn = 2; {str(i): i.to_bytes()} 76 | {'RES B,1,(IX+2)': b'\\xdd\\xcb\\x02\\x88'} 77 | 78 | # Identifying Structure in an ISA 79 | 80 | The data structure above is terse but inconvenient to lift or interpret. We're 81 | already pattern matching once - why switch over the values again in order to 82 | separate simple shifter operations from bit flips? Someone familiar with the Z80's 83 | history would trivially carve it up into octet prefixes and hextet operands. 84 | 85 | This library is for incremental tool development on unfamiliar or poorly 86 | documented targets. 87 | 88 | ## Specializing Instructions 89 | 90 | To decode multiple kinds of instructions, group them in a class hierarchy. All 91 | the subclass specifications are merged together and matched at once. (The 92 | `dataclass` decorator is identical to `bitspec`, but also adds default PEP 557 93 | dataclass methods.) 94 | 95 | >>> @bitspec.dataclass 96 | ... class HPPA: pass # Hewlett Packard Precision Architecture (PA-RISC) 97 | >>> @bitspec.dataclass('000010 ----- ----- 0000 001001 0 00000') 98 | ... class NOP(HPPA): # ^^^^^ 99 | ... pass # R0 is always zero. Writing to it is a NOP. 100 | >>> @bitspec.dataclass('000010 00000 r1:5 0000 001001 0 rt:5') 101 | ... @bitspec.dataclass('000010 r1:5 00000 0000 001001 0 rt:5') # hack! 102 | ... class COPY(NOP): # ^^^^^ ^^^^^ 103 | ... r1: int # If one operand is R0, boolean OR is actually a COPY. 104 | ... rt: int # To simplify lifting, this IR pretends r2 is r1. 105 | >>> @bitspec.dataclass('000010 r2:5 r1:5 cf:4 001001 0 rt:5') 106 | ... class OR(COPY): # Finally here's the full instruction encoding. 107 | ... cf: int # You *could* lift OR, COPY, and NOP from it by checking 108 | ... r2: int # for 0-fields, but those cases are already handled. 109 | >>> import dataclasses; [dataclasses.is_dataclass(c) for c in OR.__mro__] 110 | [True, True, True, True, False] 111 | 112 | Matches are prioritized based on the maximum number of constant bits, with 113 | shortest class distance from the root class as a tie-breaker. This is handy for 114 | architectures where specialized instructions are actually subsets of more 115 | generic ones, or an architecture extension was allocated within a no-op 116 | encoding of an unrelated operation. 117 | 118 | >>> HPPA.from_bytes(b'\\x08\\x1f\\x02\\x5f') 119 | COPY(r1=31, rt=31) 120 | >>> HPPA.from_bytes(OR(cf=0, r2=0, r1=31, rt=31).to_bytes()) 121 | COPY(r1=31, rt=31) 122 | >>> HPPA.from_bytes(COPY(r1=31, rt=0).to_bytes()) 123 | NOP() 124 | 125 | ## Factoring Operands 126 | 127 | While keyword argument values usually get assigned as-is, bitspec-decorated 128 | classes get instantiated with `from_bytes` on the same input as the top-level 129 | object. This allows commonly-used addressing modes and operand types to be 130 | factored into their own classes for easy lifting. 131 | 132 | >>> class Operand: # Intel MCS-51 addressing modes 133 | ... def lift_load(self, il): raise NotImplementedError 134 | ... def lift_store(self, il): raise NotImplementedError 135 | >>> @bitspec.dataclass('.... ....') 136 | ... class Acc(Operand): pass # A 137 | >>> @bitspec.dataclass('.... .... addr:8') 138 | ... class Dir(Operand): addr: int # [addr] 139 | >>> @bitspec.dataclass('.... . reg:3') 140 | ... class Reg(Operand): reg: int # R0..R7 141 | >>> @bitspec.dataclass('.... ... i:1') 142 | ... class RegInd(Operand): i: int # @R0, @R1 143 | >>> @bitspec.dataclass('.:8 val:8') 144 | ... class Imm(Operand): val: int # #val 145 | >>> @bitspec.dataclass('.:16 val:8') 146 | ... class Imm_(Imm): pass # re-use lift semantics on a new encoding! 147 | >>> @bitspec.dataclass('0100 0100', name='ORL', dst=Acc, src=Imm) 148 | ... @bitspec.dataclass('0100 0101', name='ORL', dst=Acc, src=Dir) 149 | ... @bitspec.dataclass('0100 011.', name='ORL', dst=Acc, src=RegInd) 150 | ... @bitspec.dataclass('0100 1...', name='ORL', dst=Acc, src=Reg) 151 | ... @bitspec.dataclass('0100 0010', name='ORL', dst=Dir, src=Acc) 152 | ... @bitspec.dataclass('0100 0011', name='ORL', dst=Dir, src=Imm_) 153 | ... @bitspec.dataclass('0101 0100', name='ANL', dst=Acc, src=Imm) 154 | ... @bitspec.dataclass('0101 0101', name='ANL', dst=Acc, src=Dir) 155 | ... @bitspec.dataclass('0101 011.', name='ANL', dst=Acc, src=RegInd) 156 | ... @bitspec.dataclass('0101 1...', name='ANL', dst=Acc, src=Reg) 157 | ... @bitspec.dataclass('0101 0010', name='ANL', dst=Dir, src=Acc) 158 | ... @bitspec.dataclass('0101 0011', name='ANL', dst=Dir, src=Imm_) 159 | ... # [...] repetitive code for XRL, ADD, ADDC, etc. 160 | ... class MCS51ALU: 161 | ... name: str 162 | ... dst: Operand 163 | ... src: Operand 164 | 165 | The resulting `__len__` is adjusted based on any arguments `from_bytes` was 166 | called on, so variable-length encodings work as you'd expect. 167 | 168 | >>> code = bytes.fromhex('438010 58 52ff') 169 | >>> for i in MCS51ALU.iter_bytes(code, addr=0): 170 | ... print(f'{hex(i.addr)}({len(i)}): {i}') 171 | 0x0(3): MCS51ALU(name='ORL', dst=Dir(addr=128), src=Imm_(val=16)) 172 | 0x3(1): MCS51ALU(name='ANL', dst=Acc(), src=Reg(reg=0)) 173 | 0x4(2): MCS51ALU(name='ANL', dst=Dir(addr=255), src=Acc()) 174 | 175 | ## Incremental Development 176 | 177 | What if we're implementing tools for an unfamiliar architecture, and it's not 178 | obvious how addressing modes are encoded? 179 | 180 | Suppose we thought the MCS-51 instruction set was orthogonal and operand 181 | addressing mode encodings were independent. The preceding code could be 182 | code-golfed by having the operand arguments be an orthogonal pattern match. 183 | 184 | But wait - MCS-51 isn't orthogonal! One of the MOV encodings that would be a 185 | NOP on an architecture with bits to spare is actually a JMP, and another had a 186 | bit-level ORL variant squeezed in. 187 | 188 | >>> @bitspec.dataclass('.... 0100 n:8', dst='A', src='#{n}') # immediate 189 | ... @bitspec.dataclass('.... 0101 n:8', dst='A', src='0x{n:x}') # direct 190 | ... @bitspec.dataclass('.... 011 n:1', dst='A', src='@R{n}' ) # I-ram 191 | ... @bitspec.dataclass('.... 1 n:3', dst='A', src='R{n}') # register 192 | ... @bitspec.dataclass('0111 0010 n:8', dst='C', src='{n}') # bit addr (hack) 193 | ... @bitspec.dataclass('.... 0011 n:8 m:8', dst='0x{n:x}', src='#{m}') 194 | ... @bitspec.dataclass('.... 0010 n:8', dst='0x{n:x}', src='A') 195 | ... class Operands: 196 | ... src:str; dst:str; n:int = -1; m:int = -1 197 | ... def __str__(self): 198 | ... return f'{self.dst},{self.src}'.format(**self.__dict__) 199 | ... def __repr__(self): return f'<{len(self)}: {self!s}>' 200 | >>> @bitspec.dataclass('0100 ....', name='ORL', ops=Operands) 201 | ... @bitspec.dataclass('0101 ....', name='ANL', ops=Operands) 202 | ... @bitspec.dataclass('0110 ....', name='XRL', ops=Operands) 203 | ... @bitspec.dataclass('0111 ....', name='MOV', ops=Operands) 204 | ... # [...] 205 | ... @bitspec.dataclass('0111 0011', name='JMP', ops='@A + DPTR') # (hack) 206 | ... class OrthMCS51ALU: 207 | ... name:str; ops:Operands 208 | ... def __repr__(self): return f'<{len(self)}: {self.name} {self.ops!s}>' 209 | 210 | And that's fine. Since `Bitspec.from_bytes` only directly pattern matches on 211 | the class hierarchy it is called on, with any operands being pattern matched 212 | only after the first-level match succeeds, some conflicts and overlaps with 213 | bitspec arguments are okay. 214 | 215 | >>> OrthMCS51ALU.from_bytes(b'C'*3) 216 | <3: ORL 0x43,#67> 217 | >>> jmp_indirect = b'\\x73' + b'C' * 10 218 | >>> OrthMCS51ALU.from_bytes(jmp_indirect) # correct disassembly 219 | <1: JMP @A + DPTR> 220 | >>> Operands.from_bytes(jmp_indirect) # top-level match takes priority! 221 | <3: 0x43,#67> 222 | 223 | A disassembler and lifter can be quickly brought up based on common encoding 224 | structures. Edge cases can be filled in later as they're figured out. 225 | 226 | ## Debugging and Partial Decoding 227 | 228 | Bringing up a new ISA target will often raise questions: 229 | 230 | * "Am I looking at code or data? The disassembly doesn't make sense." 231 | * "Did I typo this instruction's encoding or is the reference manual wrong?" 232 | * On bad days, "Does my memory dump contain semi-random bitflips?" 233 | 234 | This library can't fix signal integrity in 1' of 0.1" jumper wire spaghetti. It 235 | *can* disassemble just one class of instructions at a time, which helps debug 236 | encoding typos from an interpreter. 237 | 238 | >>> NOP.from_bytes(b'\\x08\\x1f\\x02\\x41') 239 | COPY(r1=31, rt=1) 240 | 241 | You can even peek under the hood for a quick encoding reference while doing it. 242 | 243 | >>> from pprint import pprint as pp 244 | >>> pp(COPY.__bitspec__) 245 | {: {'r1': , 246 | 'rt': }, 247 | : {'r1': , 248 | 'rt': }} 249 | >>> pp(NOP.__bitspec__) 250 | {: {}} 251 | 252 | There's also a sometimes-useful (sometimes-annoying) edge case when decoding 253 | short bytestrings on variable length architectures: it's possible to decode an 254 | instruction prefix, but not have enough bytes left for the operand. 255 | 256 | >>> OrthMCS51ALU.from_bytes(b'\\x72' + b'C') # ok 257 | <2: MOV C,67> 258 | >>> OrthMCS51ALU.from_bytes(b'\x72') # no operand! 259 | <1: MOV None> 260 | 261 | Usually that's a sign the ISA specification is missing an instruction or needs 262 | the top-level match padded out to a full instruction length. However, at the 263 | end of a basic block or data section it's a good sign you're disassembling 264 | data. 265 | 266 | # Motivation and Similar Tools 267 | 268 | Isolating bit-level encoding details to a flexible mini-language allows the 269 | low-level IR to be designed entirely around the target ISA's semantic quirks 270 | rather than its encoding quirks. Building around addressing modes, ALU design, 271 | memory banking schemes, or significant ISA extensions greatly speeds up lifter 272 | bringup, debugging, and maintenance. 273 | 274 | Q: But I wanna go *fast*! Why not just write table-based (dis)assemblers?
275 | A: You should - in Rust. Python's GIL and ctypes back-and-forth to your 276 | binary analysis framework of choice will limit scalability anyway. 277 | 278 | Q: Isn't this overthinking a problem that doesn't exist?
279 | A: In theory the number of ISAs in this world is finite and we can 280 | brute-force our way to nice tool support for all of them. 281 | In practice appearance of interesting targets, weird bytecode machines, 282 | and binary analysis platforms seems to outpace tool publications. 283 | 284 | Q: The DSL style looks familiar.
285 | A: See GDB's opc2c.c, QEMU's decodetree.py, or Amoco's @ispec as other examples. 286 | 287 | Q: Do all sufficiently complex binary analysis projects really 288 | contain an ad-hoc implementation of LLVM's MCInst?
289 | A: Yes, but only touch LLVM when paid to. 290 | """ 291 | ## read docs locally with `python -m pydoc -b bitspec` 292 | ## (or ideally with the `pdoc3` package, but keep it out of CI? hmm) 293 | __version__ = '0.4.4' 294 | __author__ = 'amtal' 295 | __license__ = 'MIT' # https://opensource.org/licenses/MIT 296 | __all__ = ['dataclass', 'bitspec', 'is_bitspec', 'Bitspec'] 297 | # TODO top-level from_bytes API rework 298 | # addr=n argument to iter_bytes not present on from_bytes, weird 299 | # maybe drop from_bytes, add a fromhex to keep examples next()-free? 300 | # TODO want a len_iter_bytes length-decoder fastpath? 301 | # - need a way to signal full-decoding need though 302 | # since you usually want branch decoding not just length 303 | # - also very inconvenient to do with current arglen behavior, 304 | # arguments might expand final matched length so whole match tree is 305 | # too dynamic to try and extract a closed-form solution 306 | # - __bitspec_match__ is also a hack currently breaks "frozen" objects, 307 | # maybe dynamic length isn't worth it even in Python 308 | # TODO examples of NamedTuple / other __slots__-based IRs 309 | # TODO re-examine python version high watermark 310 | # TODO changelog in comments? 311 | from itertools import groupby 312 | # Python >= 3: 313 | import inspect 314 | from types import FunctionType 315 | import typing # Python >= 3.5 for NamedTuple subclass 316 | import abc, collections.abc 317 | 318 | 319 | # Attribute name for storing bitfield spec, canonical indicator for is_bitspec. 320 | _SPEC = '__bitspec__' 321 | # Cache of all subclasses, updated when __hash__ changes or on first use? 322 | _CACHE = '__bitspec_cache__' 323 | # Which pattern an object was matched from. (Used only in re-assembly.) 324 | _MATCH = '__bitspec_match__' 325 | 326 | 327 | class Bitspec(collections.abc.Sized, metaclass=abc.ABCMeta): 328 | """ABC[^1] for type-annotating `bitspec`-decorated classes. 329 | 330 | >>> import typing 331 | >>> @bitspec.dataclass('0x414141') 332 | ... class Aaa: pass 333 | >>> def decode(bs: bytes) -> typing.List[bitspec.Bitspec]: 334 | ... return list(Aaa.iter_bytes(bs)) 335 | 336 | The library follows the same extension pattern as PEP 557 dataclasses. No 337 | new class is created. As a result, inheritance is untouched! 338 | 339 | [^1]: Not a real PEP 3119 ABC, just here for annotations and docs. :) 340 | 341 | >>> xs = decode(b'A' * 9); xs 342 | [Aaa(), Aaa(), Aaa()] 343 | >>> b''.join(x.to_bytes() for x in xs) # this'll typecheck nicely, though 344 | b'AAAAAAAAA' 345 | >>> assert not any(isinstance(x, bitspec.Bitspec) for x in xs) 346 | 347 | *[ISA]: Instruction Set Architecture 348 | *[IR]: Intermediate Representation 349 | *[ALU]: Arithmetic Logic Unit 350 | *[DSL]: Domain Specific Language 351 | *[GIL]: Global Interpreter Lock 352 | *[ABC]: Abstract Base Class 353 | *[PEP 557]: dataclasses module added in Py3.7 354 | *[PEP 3119]: Introducing Abstract Base Classes 355 | """ 356 | # Dataclass-style extension doesn't actually re-create the class, so we 357 | # can't add an extra parent. This is purely a pydoc / mypy hallucination. 358 | # Could add it as a decoration? 359 | # 360 | # For type annotation reasons, it has to be declared here and defined at 361 | # the bottom. Every release of Python strays further from God's light. 362 | __slots__ = _SPEC, _MATCH 363 | from_bytes:classmethod 364 | iter_bytes:classmethod 365 | addr:typing.Optional[int] 366 | """Address of decoded instruction. 367 | 368 | Optional, only set if a base `addr` is passed to `Bitspec.iter_bytes`. 369 | """ 370 | to_bytes:typing.Callable[[typing.Any, int], bytes] 371 | __len__:typing.Callable[[typing.Any], int] 372 | 373 | 374 | def bitspec(specification='', **const_fields): 375 | """Class decorator which adds `Bitspec` methods to an existing class. 376 | 377 | Mini-language grammar, with whitespace separating constants and fields: 378 | 379 | specification := (const_bits | const_bytes | variable)+ 380 | const_bits := ('.' | '0' | '1')+ 381 | const_bytes := '0' 'x' hex_value 382 | variable := (py_identifier | '.') ':' 's'? int_size 383 | 384 | Bits and bytes are indexed in big-endian order, with the most significant 385 | bit at the start of the specification and bit 0 at the end. The byte order 386 | can be flipped for little-endian memory platforms, but bit order remains 387 | the same. 388 | 389 | Any bitspec argument fields are part of the full width; don't-care bits 390 | will be appended to the end of any too-short declarations. 391 | 392 | >>> # byte-endianness: [ 3] [ 2] [ 1] [0] 393 | >>> # bit-endianness: 31-----24 23-----16 15----8 7-0 394 | >>> @bitspec.dataclass('.... .... .... .... a:4 b:4 c:8') 395 | ... class Op: 396 | ... a: int; b: int; c: int 397 | >>> @bitspec.dataclass('1000 0100 0010 0001', op=Op) 398 | ... class BitOrder: 399 | ... op: Op 400 | >>> BitOrder.from_bytes(b'\\x84\\x21\\x53\\x10') 401 | BitOrder(op=Op(a=5, b=3, c=16)) 402 | >>> BitOrder.from_bytes(b'\\x21\\x84\\x10\\x53', byteswap=2) 403 | BitOrder(op=Op(a=5, b=3, c=16)) 404 | 405 | Total size should be aligned on byte boundaries, but internal bit fields 406 | can have arbitrary widths and alignments. 407 | 408 | Syntax sugar includes: 409 | 410 | - An empty spec (default) can't be matched, but still gets all the extra 411 | methods. This is often the case for top-level "instruction" classes that 412 | anchor multiple subclassed instruction types. Either `@bitspec()` or 413 | `@bitspec` decorator syntax can be used. 414 | - Long runs of ..... don't-care bits can be listed as a '.' or '-' variable. 415 | - Wildcard characters '.' and '-' are completely interchangeable. This can 416 | be handy to differentiate between actual don't-care bits and parts of the 417 | pattern match that are matched/extracted in an argument, but is purely a 418 | hint to the reader and isn't validated in any way. 419 | - Byte fields for instruction sets with known prefixes or big constants. 420 | They have to be byte-aligned and a multiple of 8 bits wide. 421 | 422 | >>> @bitspec.dataclass('-:7 .:9 a:4 b:4 c:8') 423 | ... class ShortOp(Op): pass 424 | >>> @bitspec.dataclass('0x8421', op=ShortOp) 425 | ... class ShortBitOrder(BitOrder): pass 426 | >>> ShortBitOrder.from_bytes(b'\\x84\\x21\\x53\\x10') 427 | ShortBitOrder(op=ShortOp(a=5, b=3, c=16)) 428 | 429 | Ambiguities are resolved by: 430 | 431 | 1. Maximizing the number of constant bits matched. 432 | 2. Prioritizing the shallowest reachable class. (This is why the above 433 | example isn't ambiguous, even though BitOrder is part of its pattern 434 | match.) 435 | 436 | Prefixing field size with 's' will read and sign-extend a 2s complement 437 | signed value. 438 | 439 | >>> @bitspec.dataclass('imm:32 off:s16 src:4 dst:4 op:8') 440 | ... class EBPF: 441 | ... imm:int; off:int; src:int; dst:int; op:int 442 | >>> ja_neg4 = bytes.fromhex('05 00 fcff 00000000') 443 | >>> EBPF.from_bytes(ja_neg4, byteswap=8) 444 | EBPF(imm=0, off=-4, src=0, dst=0, op=5) 445 | 446 | .. todo:: Multiple fields with the same name will be concatenated. 447 | 448 | Detection of dead code due to ambiguous or over-constrained specifications 449 | is best-effort and not guaranteed; this is an instruction decoder not a 450 | general-purpose parser. That said, the load-time checks provide a bit more 451 | assurance than usual for a statically unityped language like Python. 452 | 453 | Raises: 454 | SyntaxError: certain bugs (e.g. field name not a valid variable) 455 | SyntaxWarning: suspected bugs (e.g. ambiguous or overconstrained specs) 456 | IndexError: top-level byte alignment violated 457 | NameError: field names don't match constructor arguments 458 | """ 459 | if callable(specification) and len(const_fields) == 0: # @bitspec 460 | return install_methods(specification, None, {}, {}) 461 | 462 | match, var_fields = load_time_parse(specification) 463 | check_duplicate_args(var_fields, const_fields) 464 | def add_bitspec(cls): 465 | check_class_args(cls, var_fields, const_fields, specification) 466 | return install_methods(cls, match, var_fields, const_fields) 467 | return add_bitspec 468 | 469 | 470 | def test_bitspec(): # hack to make doctest see tests despite function 471 | pass # clashing with "import" glob (breaks line #s, oh well) 472 | test_bitspec.__doc__ = bitspec.__doc__ 473 | 474 | 475 | import dataclasses # Python >= 3.7 (maybe ImportError gate @dataclass?) 476 | def dataclass(specification='', **const_fields): 477 | """Same class decorator as @`bitspec` but with a PEP 557 @dataclass inside. 478 | 479 | You'll still need the extra line if using non-default dataclass arguments. 480 | 481 | >>> import dataclasses 482 | >>> @bitspec.bitspec('0xf000') 483 | ... @dataclasses.dataclass # or just do @bitspec.dataclass('0xf000') 484 | ... class Foo: pass 485 | """ 486 | if callable(specification) and len(const_fields) == 0: # @dataclass 487 | return install_methods(install_dataclass(specification), None, {}, {}) 488 | 489 | match, var_fields = load_time_parse(specification) 490 | check_duplicate_args(var_fields, const_fields) 491 | def add_bitspec_with_dataclass(cls): 492 | cls = install_dataclass(cls) 493 | check_class_args(cls, var_fields, const_fields, specification) 494 | return install_methods(cls, match, var_fields, const_fields) 495 | return add_bitspec_with_dataclass 496 | 497 | 498 | def install_dataclass(cls): 499 | # dataclasses.is_dataclass returns true for *inherited* dataclasses due 500 | # to using `hasattr`, we're just looking to see if @dataclass has been 501 | # applied already to avoid re-running it. 502 | is_already = dataclasses._FIELDS in cls.__dict__ 503 | return cls if is_already else dataclasses.dataclass(cls) 504 | # this was the only use of the module, could drop dep for py2.7 505 | 506 | 507 | def is_bitspec(cls_or_obj: typing.Any) -> bool: 508 | """True if `cls_or_obj` has been directly decorated with a `bitspec`. 509 | 510 | >>> @bitspec.bitspec 511 | ... class Foo: pass # can be matched, has extra methods 512 | >>> class Bar(Foo): pass # can't be matched, no extra methods 513 | >>> @bitspec.bitspec("0x0badf00d") 514 | ... class Baz(Bar): pass # part of the match for Foo 515 | >>> [bitspec.is_bitspec(x) for x in [Foo, Bar, Baz]] 516 | [True, False, True] 517 | >>> [cls.from_bytes(b'\\x0b\\xad\\xf0\\x0d') for cls in Baz.__mro__ 518 | ... if bitspec.is_bitspec(cls)] 519 | [<__main__.Baz object at ...>, <__main__.Baz object at ...>] 520 | 521 | Since `Bitspec.from_bytes` returns instances of the specific class that's 522 | been decorated, there's no clear meaning for calling it on a non-decorated 523 | subclass. As such while bitspec methods may resolve on a subclass, calling 524 | them is currently an error. 525 | 526 | >>> try: Bar.from_bytes(b'\\xde\\xad\\xbe\\xef') 527 | ... except TypeError: True 528 | True 529 | """ 530 | cls = cls_or_obj if isinstance(cls_or_obj, type) else type(cls_or_obj) 531 | return _SPEC in cls.__dict__ 532 | 533 | def iter_bytes(cls, bytes:bytes, byteswap=0, addr=None) -> typing.Iterable[Bitspec]: 534 | """Generate a sequence of objects pattern-matched from bytes. 535 | 536 | Yields results until a match fails. Un-decoded bytes can be identified 537 | based on last-decoded instruction address: 538 | 539 | >>> @bitspec.bitspec('0x41414141') 540 | ... class AAA: pass 541 | >>> mash, start = b'A'*1024 + b'\\xde\\xad\\xbe\\xef', 0x8000 542 | >>> for a in AAA.iter_bytes(mash, addr=start): 543 | ... pass 544 | >>> mash[a.addr - start + len(a):] 545 | b'\\xde\\xad\\xbe\\xef' 546 | 547 | Args: 548 | byteswap: little-endian word width in bytes, 0 for big-endian 549 | addr: set an `addr` attribute on generated objects, incrementing it as 550 | generator advances 551 | 552 | Raises: 553 | TypeError: no bitspec decorator found on cls 554 | """ 555 | # perform byte-endianness swap ASAP, if matches aren't word-aligned at 556 | # least the resulting bugs won't be confusing 557 | bytes = swap_endianness(bytes, byteswap) 558 | 559 | remaining = len(bytes) 560 | while remaining: 561 | if obj := from_bytes(cls, bytes[-remaining:]): 562 | if addr != None: 563 | obj.addr = addr 564 | addr += len(obj) 565 | remaining -= len(obj) 566 | yield obj 567 | else: # remaining length can be reconstructed from obj.addr 568 | return 569 | 570 | 571 | def reachable_bitspec_classes(root): 572 | # enumerate all possible subclasses that are part of pattern match 573 | # breadth-first traversal storing shortest path as later tie-breaker 574 | # 575 | # if A->B->C and is_bitspec(x) is True,False,True then C isn't reached... 576 | # document? full traverse? 577 | cls_tree = {} 578 | level = {root,} 579 | next_level = set() 580 | depth = 0 581 | while level: 582 | for cls in level: 583 | # Subtle: traverse *entire* class tree, but only return bitspec 584 | # matches. This means A->B->C where only A and C are decorated will 585 | # match both in A.from_bytes, but ignores B. 586 | # 587 | # This can come up when doing complicated things with operands for 588 | # the sake of code-golfing a lifter into something legible. 589 | if cls not in cls_tree and is_bitspec(cls): 590 | cls_tree[cls] = depth 591 | {next_level.add(sub_cls) for sub_cls in cls.__subclasses__()} 592 | level, next_level = next_level, set() 593 | depth += 1 594 | max_depth = depth 595 | return cls_tree, max_depth 596 | 597 | 598 | import functools 599 | @functools.lru_cache(maxsize=512) 600 | def __precompute(cls): 601 | cls_tree, max_depth = reachable_bitspec_classes(cls) 602 | possible_matches = [] 603 | for a_cls, depth in cls_tree.items(): 604 | for match in a_cls.__dict__[_SPEC]: 605 | score = bin(match.mask).count('1') # popcnt 606 | result = (score, max_depth - depth, match, a_cls) 607 | possible_matches.append((match, result)) 608 | opaque = Match.multimatch_precompute(possible_matches) 609 | return opaque 610 | 611 | @functools.lru_cache(maxsize=4096) 612 | def from_bytes(cls, bytes: bytes, byteswap=0) -> typing.Optional[Bitspec]: 613 | """Constructor classmethod. 614 | 615 | Args: 616 | byteswap: little-endian word width in bytes, 0 for big-endian 617 | 618 | Returns: 619 | `None` if match unsuccessful due to insufficient bytes or wrong prefix. 620 | 621 | Raises: 622 | TypeError: no bitspec decorator found on cls 623 | """ 624 | if not is_bitspec(cls): 625 | msg = f'''{cls} has not been decorated by @bitspec directly 626 | 627 | Either the intent was to match subclasses ({cls.__subclasses__()}) 628 | in which case {cls} needs a trivial decorator, or a decorated class 629 | (one of {cls.__mro__[1:-1]}) 630 | has been subclassed. Since bitspec from_bytes returns very specific classes on 631 | successful matches, deserializing non-bitspec subclasses do not make sense. 632 | ''' 633 | raise TypeError(msg) 634 | 635 | opaque = __precompute(cls) 636 | 637 | bytes = swap_endianness(bytes, byteswap) 638 | if not (possible_matches := Match.multimatch_execute(opaque, bytes)): 639 | return None 640 | _, _, match, matched_cls = max(possible_matches) 641 | 642 | # build object 643 | def slice_off_argument(val): 644 | if isinstance(val, Slice): 645 | return val.from_bytes(bytes) 646 | elif isinstance(val, type) and is_bitspec(val): 647 | # ^ Check that it was an uninstantiated class, not an instance. 648 | # Otherwise passing an initialized object will silently 649 | # replace it with a brand-new one, initialized from the bytes. 650 | return val.from_bytes(bytes) 651 | else: 652 | return val 653 | spec_args = matched_cls.__dict__[_SPEC][match] 654 | kwargs = {name:slice_off_argument(spec_args[name]) for name in spec_args} 655 | obj = matched_cls(**kwargs) 656 | 657 | # adjust length for matched args; they're not part of the pattern match, 658 | # but not propagating length up means user would have to max(len(i), 659 | # len(i.src), len(i.dst)) or something dumb 660 | matched_length = max([match.byte_length] + [len(k) for k in kwargs.values() 661 | if is_bitspec(k)]) 662 | if matched_length != match.byte_length: 663 | match = match.expand_by(matched_length - match.byte_length) 664 | setattr(obj, _MATCH, match) # for easy `to_bytes`, not slots-compatible 665 | return obj 666 | 667 | # Matches are prioritized on number of constant bits first, matched class 668 | # distance from root second. 669 | # 670 | # **Design Rationale:** 671 | # 672 | # Class depth is a clean tie-breaker for slicing out operand fields, which 673 | # usually has no constant-bits, only don't-care bits and field slices. As 674 | # such it's common to have a complex class hierarchy you want to decode 675 | # from bytes, but none of the classes actually have constant bit matches to 676 | # disambiguate them. This behavior guarantees the "nearest" class gets 677 | # decoded, which usually ends up being the one the decode classmethod 678 | # was called on. 679 | # 680 | # It's an ugly tie-breaker for rare cases where you might want to decode 681 | # macroinstructions or special-cased instruction aliases. The PA-RISC 682 | # example in the module docs does this to decode a no-op encoding as 683 | # a specific NOP, although practically there are multiple no-op encodings 684 | # and which one gets used by a given target's assembler isn't guaranteed. 685 | # 686 | # There might be CISC var-length architectures where the max-number of 687 | # constant bits heuristic isn't valid and you need a proper parser. Can't 688 | # think of any off the top of my head, easily solved via a first-pass 689 | # decoder that looks for prefix bytes or whatever and dispatches to three 690 | # or four appropriate instruction types. 691 | # 692 | # **Alternatives Considered:** 693 | # 694 | # * manually prioritizing conflicts via foo.decorator, too much effort 695 | # for common case 696 | # * passing a floating point weight-adjustment as an extra arg, too much 697 | # internal details leakage + might collide with field names 698 | # 699 | # tl;dr this is where not treating this as a proper parser bites us, 700 | # but afaict it's still worth it and full LR(8) bitparsing or w/e is 701 | # complete overkill 702 | 703 | 704 | # Bitspec arguments affect the pattern match by increasing its byte length. 705 | # Bitspec arguments do not affect the matched constant values, failure to match 706 | # argument constants just results in that argument returning None. 707 | # 708 | # This should be fairly easy to debug (AttributeError: `NoneType` object has 709 | # no attribute 'lift`) and should only really happen if the argument 710 | # does some complex operand dispatch... But it's usually easier to 711 | # specify operand types in a top-level instruction pattern, than to 712 | # figure out operand encoding invariants that hold across all 713 | # instruction classes and push them down to an operand argument. 714 | # 715 | # tl;dr Bitspec pattern matches traverse subclasses, but not arguments. 716 | # This is an implementation detail that also simplifies fast matching, 717 | # but should still be documented somewhere? 718 | 719 | import array 720 | def swap_endianness(bytes, word_length): 721 | """Silently truncates non-word-sized inputs.""" 722 | if not word_length: 723 | return bytes 724 | elif word_length in [2,4,8]: 725 | if tail := len(bytes) % word_length: 726 | bytes, tail = bytes[:-tail], bytes[-tail:] 727 | 728 | arr = array.array({2:'H',4:'I',8:'Q'}[word_length]) 729 | arr.frombytes(bytes) 730 | arr.byteswap() 731 | return arr.tobytes() # Py>=3.2 lol 732 | else: 733 | msg = f'{word_length*8}-bit byte endianness not implemented' 734 | raise NotImplementedError(msg) 735 | 736 | 737 | def to_bytes(self: Bitspec, byteswap=0) -> bytes: 738 | """Assemble IR result of `Bitspec.from_bytes`. 739 | 740 | Only works if object has fields that exactly match its constructor 741 | arguments. This is a common Python convention and is true for PEP 557 742 | dataclasses, but if you don't intend to use this method you can completely 743 | ignore the convention. 744 | 745 | Ambiguities are resolved as follows: 746 | 747 | - Don't-care bit positions will be set to zero. 748 | - Fixed bits will be set to whatever they were originally decoded from. 749 | - If the object was constructed manually rather than via from_bytes, 750 | fixed bits will be chosen from an arbitrary match decorator. If there are 751 | multiple decorators, exact one chosen is undefined. 752 | 753 | As an added benefit, this means instantiated bitspec arguments can be 754 | passed to pattern matches. A common example would be specializing a 755 | particular encoding. Doing so isn't a meaningful optimization, but it's 756 | nice to have it "just work"[^1] rather than throw errors. 757 | 758 | [^1]: This is probably a nitpicky implementation detail, but the provided 759 | example *could* return a length of 1 since `SReg(0)` was never pattern 760 | matched and might have valid definitions with other lengths? Current 761 | solution is dead simple and just treats len(arg.to_bytes()) as a lower 762 | bound on length. FIXME rm docs leave test 763 | 764 | >>> @bitspec.dataclass('.:8 n:4 ....') 765 | ... class SReg: n:int 766 | >>> @bitspec.dataclass('0x01 .... imm:s4', r=SReg, op='load-rel') 767 | ... @bitspec.dataclass('0x01 0000 imm:4', r=SReg(0), op='load-abs') 768 | ... class SIns: r:SReg; op: str; imm:int 769 | >>> SIns.from_bytes(b'\\x01\\x0f') 770 | SIns(r=SReg(n=0), op='load-abs', imm=15) 771 | >>> len(_.r) # re-calculated off of .to_bytes(), not cached! 772 | 2 773 | 774 | Disassembly is expected to be more performance-sensitive than re-assembly, so 775 | the implementation is likely to be slower than from_bytes. 776 | """ 777 | try: 778 | match = getattr(self, _MATCH) 779 | except AttributeError: 780 | # fully synthetic object constructed w/o matching 781 | # FIXME actually walk down matches to find correct match 782 | match = list(getattr(self.__class__, _SPEC).keys())[0] # lol guess 783 | fields = getattr(self.__class__, _SPEC)[match] 784 | 785 | acc = match.const 786 | for name in fields: 787 | field = fields[name] 788 | if isinstance(field, Slice): 789 | val = getattr(self, name) # assume field names == __init__ args 790 | val <<= field.shift 791 | acc |= val 792 | 793 | big_endian = acc.to_bytes(length=match.byte_length, byteorder='big') 794 | return swap_endianness(big_endian, byteswap) 795 | 796 | 797 | def byte_length(self: Bitspec) -> int: 798 | """Byte length of matched value. 799 | 800 | If the object wasn't built by calling `from_bytes` or `iter_bytes`, length 801 | should still be correct in simple cases. Multiple matches of variable 802 | lengths *might* result in wrong length being returned. 803 | """ 804 | if match := getattr(self, _MATCH, None): 805 | return match.byte_length 806 | else: 807 | # Synthetic-constructed instance; realistically, most likely reason to 808 | # check len() on this is the default truthiness implementation. e.g. 809 | # testing an operand with a default value of None will call __len__ if 810 | # it's present. 811 | # So, actual accurate length probably doesn't matter so just guess :) 812 | # If multiple lengths are possible lol good luck. 813 | # FIXME actually walk down matches to find correct match 814 | assert type(self) != type 815 | match = list(getattr(self.__class__, _SPEC).keys())[0] # lol guess 816 | return match.byte_length 817 | 818 | 819 | BIT_CONST_CHARS = set('01.-') 820 | def load_time_parse(specification): 821 | """Parses spec into a match pattern and dict of variables. 822 | 823 | The hand-rolled parser implementation is super-gnarly, but that's okay if 824 | it never leaks out of this function. This is a key source of user-facing 825 | error messages so it's worth keeping the exception call stack minimal. 826 | 827 | Gnarly implementation regression-test hall of shame: 828 | 829 | >>> parse = bitspec.load_time_parse 830 | >>> assert (x := parse('....0001')) == parse('.... 0001'); x 831 | (, {}) 832 | >>> assert (x := parse('0x0008')) != parse('0x08'); x 833 | (, {}) 834 | """ 835 | spec_width = 0 # running total in bits 836 | fields = [] # tagged-tuple IR := ('c' index mask const bit_width) 837 | # | ('v' index mask name is_signed) 838 | 839 | # parse spec, indexing bit-offsets starting from zero @ most significant 840 | # (will get re-indexed later once full width is known) 841 | for token in specification.split(): 842 | if set(token).issubset(BIT_CONST_CHARS): 843 | const_value = int('0b' + token.replace('.','0') 844 | .replace('-','0'), 2) 845 | const_mask = int('0b' + token.replace('0','1') 846 | .replace('.','0') 847 | .replace('-','0'), 2) 848 | const_width = len(token) 849 | fields.append(('c', spec_width, const_mask, const_value, const_width)) 850 | spec_width += const_width 851 | elif token.startswith('0x'): 852 | const_value = int(token, 16) 853 | const_width = len(token[2:]) * 4 854 | const_mask = (1 << const_width) - 1 855 | if len(token) == 3: 856 | suggest_a = token[:2] + '0' + token[2] 857 | suggest_b = bin(const_value)[2:] 858 | suggest_b = '0' * (const_width - len(suggest_b)) + suggest_b 859 | msg = f'''byte constant {token} may be confused for 4-bit nibble 860 | 861 | Please either explicitly pad byte values smaller than 16 (e.g. {suggest_a}) or 862 | write nibble-sized constants in bit form (e.g. {suggest_b}.)''' 863 | raise SyntaxWarning(msg) 864 | if spec_width % 8 != 0: 865 | suggest = bin(const_value)[2:] 866 | suggest = '0' * (const_width - len(suggest)) + suggest 867 | msg = f'''byte constant {token} is not byte-aligned 868 | 869 | Check declaration for bugs, or re-write constant as {suggest} bits.''' 870 | raise IndexError(msg) 871 | fields.append(('c', spec_width, const_mask, const_value, const_width)) 872 | spec_width += const_width 873 | elif ':' in token: 874 | var_name, var_kind = token.split(':') 875 | if not var_name.isidentifier() and var_name not in ('.', '-'): 876 | msg = f"{var_name} in {token} isn't a Python identifier or . or -" 877 | raise SyntaxError(msg) 878 | if is_signed := var_kind.startswith('s'): 879 | var_kind = var_kind[1:] 880 | var_width = int(var_kind) 881 | if var_name in ('.', '-'): 882 | const_value = 0 883 | const_mask = 0 884 | fields.append(('c', spec_width, const_mask, const_value, var_width)) 885 | else: 886 | var_mask = (1 << var_width) - 1 887 | fields.append(('v', spec_width, var_mask, var_name, is_signed)) 888 | spec_width += var_width 889 | else: 890 | # could add a opc2c-style name-length-encoding here, but it's more 891 | # cutesy code golf than a real space saver 892 | raise SyntaxError(f'''unidentified field {token!r} 893 | 894 | Valid fields should be bytes (0xf00f) bits (010..11. where . is a "don't-care" 895 | wildcard) or variable bindings (signed_jump:s24, imm:16, etc.)''') 896 | 897 | if spec_width % 8 != 0: 898 | msg = f'''{spec_width}-bit pattern width isn't a multiple of 8 899 | 900 | Most encodings are byte-aligned. There might be a subtle error in the 901 | specification, or just some forgotten '....' don't-care padding.''' 902 | raise IndexError(msg) 903 | 904 | # accumulate constant fields (remember that index is wrong bit-endianness) 905 | const_bits = [(mask, const, spec_width - index - width) 906 | for ty,index,mask,const,width in fields if ty == 'c'] 907 | mask, const = 0, 0 908 | for m,c,shift in const_bits: 909 | mask |= m << shift 910 | const |= c << shift 911 | match = Match(mask, const, spec_width // 8) 912 | 913 | # accumulate variable fields 914 | var_fields = [(name,Slice(mask << (spec_width - index - mask.bit_length()), 915 | spec_width - index - mask.bit_length(), 916 | signed, 917 | spec_width // 8)) 918 | for ty,index,mask,name,signed in fields if ty == 'v'] 919 | 920 | assert len(var_fields) == len(set(v[0] for v in var_fields)) # TODO dups 921 | var_fields = {name:slicer for name,slicer in var_fields} 922 | 923 | return match, var_fields 924 | 925 | class Match(typing.NamedTuple): 926 | """Match some constant bits inside an exact length of bytes.""" 927 | mask: int 928 | const: int 929 | byte_length: int 930 | def __repr__(self): 931 | fmt = f'{{0:0{self.byte_length * 8}b}}' 932 | lut = {('0','0'):'.', ('0','1'):'.', 933 | ('1','0'):'0', ('1','1'):'1'} 934 | wildcard = ''.join([lut[w] for w in zip(fmt.format(self.mask), 935 | fmt.format(self.const))]) 936 | return f'' 937 | 938 | def matches(self, bytes): # slow path 939 | """Exact-length pattern match. 940 | 941 | >>> m = bitspec.Match(0xffff00, 0xdead00, 4) 942 | >>> [m.matches(bs) for bs in [b'\\xff\\xde\\xad\\xff', 943 | ... b'\\xff\\xde\\xad\\xffAAA']] 944 | [True, False] 945 | """ 946 | if len(bytes) != self.byte_length: 947 | return False 948 | n = int.from_bytes(bytes, byteorder='big') 949 | return (n & self.mask) == self.const 950 | 951 | def expand_by(self, byte_length): 952 | """Don't change pattern, but add some don't-care bytes after it.""" 953 | if byte_length == 0: 954 | return self 955 | elif byte_length > 0: 956 | bits = byte_length * 8 957 | return Match(self.mask << bits, self.const << bits, 958 | self.byte_length + byte_length) 959 | else: 960 | raise NotImplementedError(f'{byte_length} on {self}') 961 | 962 | @staticmethod 963 | def multimatch_precompute(matches) -> object: 964 | """Match lots of stuff at slightly better than O(n) 965 | 966 | [(Match, any)] -> bytes -> [any] 967 | """ 968 | unique_masks = set((m.byte_length, m.mask) for m,_ in matches) 969 | first = lambda t:t[0] 970 | lut = {} 971 | for _, mask in unique_masks: 972 | const_lut = [] 973 | for match,result in matches: 974 | const_lut += [(m.const, result) for m,result in matches 975 | if m.mask == mask] 976 | lut[mask] = {const: list(t[1] for t in group) 977 | for const,group in groupby(sorted(const_lut, key=first), 978 | key=first)} 979 | return unique_masks,lut 980 | 981 | @staticmethod 982 | def multimatch_execute(opaque, bytes) -> list: 983 | unique_masks,lut = opaque 984 | acc = [] 985 | for byte_length, mask in unique_masks: 986 | if len(bytes) < byte_length: 987 | continue 988 | n = int.from_bytes(bytes[:byte_length], byteorder='big') 989 | acc += lut[mask].get(n & mask, []) 990 | return acc 991 | 992 | 993 | class Slice(typing.NamedTuple): 994 | """Extract a contiguous region of bits from fixed length of bytes. 995 | 996 | >>> s = bitspec.Slice(0x00ff00, 8, True, 3) 997 | >>> [s.from_bytes(bs) for bs in [b'\\xfa\\xfe\\xaf', b'\\xaf\\xff\\xfa', 998 | ... b'\\xaf\\x00\\xfa', b'\\xfa\\x01\\xaf', 999 | ... b'A ABBB']] 1000 | [-2, -1, 0, 1, 32] 1001 | >>> 1002 | """ 1003 | mask: int 1004 | shift: int 1005 | signed: bool 1006 | byte_length: int 1007 | 1008 | def __repr__(self): 1009 | fmt = f'{{0:0{self.byte_length * 8}b}}' 1010 | val = 'i' if self.signed else 'u' 1011 | wildcard = fmt.format(self.mask).replace('1',val) 1012 | if self.shift: # s[:-0] == '', so 0 shift breaks printing 1013 | wildcard = wildcard[:-self.shift] + '_'*self.shift 1014 | return f'' 1015 | 1016 | def from_bytes(self, bs): # slow path 1017 | """Extract value from byte_length-sized prefix of bs. 1018 | 1019 | Raises: 1020 | EOFError: not enough bytes 1021 | """ 1022 | if len(bs) < self.byte_length: 1023 | raise EOFError 1024 | val = int.from_bytes(bs[:self.byte_length], byteorder='big') 1025 | val &= self.mask 1026 | val >>= self.shift 1027 | if self.signed: 1028 | width = (self.mask >> self.shift).bit_length() 1029 | sign_mask = 1 << (width - 1) 1030 | if val & sign_mask: 1031 | val ^= sign_mask 1032 | val -= sign_mask 1033 | return val 1034 | 1035 | 1036 | def rle(s): return ' '.join(f'{c}:{len(list(g))}' for c, g in groupby(s)) 1037 | 1038 | 1039 | def check_duplicate_args(var_fields, const_fields): 1040 | sliced = var_fields.keys() 1041 | const = const_fields.keys() 1042 | if collision := set(sliced).intersection(set(const)): 1043 | msg = f'''{collision} is both bit-sliced and assigned a constant value 1044 | 1045 | If it's constant but not needed to disambiguate the pattern match, replace the 1046 | variable match with don't-care "." bits. Otherwise remove the {collision} 1047 | keyword argument while keeping the specification the same.''' 1048 | raise NameError(msg) 1049 | 1050 | 1051 | def check_class_args(cls, var_fields, const_fields, specification): 1052 | """Check if class constructor has matching arguments. 1053 | 1054 | This has to happen inside the decorator's closure in order to get access to 1055 | the actual class object. As a result the stack trace no longer references 1056 | the exact line that caused error, so print it manually. 1057 | """ 1058 | # maybe check if dataclass fields also match, then don't implement 1059 | # assembler if mismatching? 1060 | if not (constructor := getattr(cls, '__init__')): 1061 | msg = f'{cls} has no __init__ method, cannot validate bitspec variables' 1062 | raise RuntimeError(msg) 1063 | 1064 | argspec = inspect.getfullargspec(constructor) 1065 | if argspec.varargs != None or argspec.varkw != None: 1066 | return # we can't reason about *args or **kwargs, assume correctness 1067 | 1068 | fields = set(list(var_fields.keys()) + list(const_fields.keys())) 1069 | 1070 | if argspec.defaults == None: 1071 | unfilled_args = set(argspec.args) 1072 | else: 1073 | unfilled_args = set(argspec.args[:-len(argspec.defaults)]) 1074 | unfilled_args.remove('self') 1075 | if argspec.kwonlydefaults: 1076 | unfilled_args -= set(argspec.kwonlydefaults.keys()) 1077 | unfilled_args -= fields 1078 | 1079 | if extra_fields := fields - set(argspec.args): 1080 | msg_top = f"{extra_fields!r} not present in {cls.__name__}.__init__" 1081 | elif unfilled_args: 1082 | msg_top = f'constructor arguments {unfilled_args!r} not initialized' 1083 | else: 1084 | return # best-effort "static" checks all ok 1085 | 1086 | msg = msg_top + f''' 1087 | 1088 | This would cause an exception when from_bytes is called. 1089 | Specification: {specification} 1090 | Bit-sliced args: {list(var_fields.keys())} 1091 | Additional args: {list(const_fields.keys())} 1092 | Expected __init__ args: {inspect.signature(constructor)}''' 1093 | raise NameError(msg) 1094 | 1095 | 1096 | def install_methods(cls, match, var_fields, const_fields): 1097 | fields = var_fields; fields.update(const_fields) 1098 | # CAUTION: this mutated var_fields, currently this function consumes 'em 1099 | # but that may change in the future :\ 1100 | # Alternatives are {**a,**b} (3.5+) or a|b (3.9+) 1101 | 1102 | spec = cls.__dict__.get(_SPEC, {}) 1103 | if match in spec: 1104 | msg = f'''pattern {match} is ambiguous 1105 | 1106 | First option: {spec[match]!r} 1107 | Second option: {fields!r}''' 1108 | raise SyntaxError(msg) 1109 | if match != None: 1110 | # Sometimes we decorate a class that can't be matched but anchors other 1111 | # matches. Still set _SPEC so is_bitspec works. 1112 | spec[match] = fields 1113 | setattr(cls, _SPEC, spec) 1114 | 1115 | if not getattr(cls, 'from_bytes', None): 1116 | setattr(cls, 'from_bytes', classmethod(from_bytes)) 1117 | if not getattr(cls, 'iter_bytes', None): 1118 | setattr(cls, 'iter_bytes', classmethod(iter_bytes)) 1119 | set_new_attr(cls, '__len__', byte_length) 1120 | set_new_attr(cls, 'to_bytes', to_bytes) 1121 | return cls 1122 | 1123 | 1124 | def set_new_attr(cls, name, value): 1125 | if name in cls.__dict__: 1126 | assert getattr(cls, name) is value # TODO allow overrides? how? 1127 | return # has already been set, don't need to do multiple times 1128 | 1129 | if isinstance(value, FunctionType): 1130 | # adjust generated code (incorrect for statics) (TODO move to codegen) 1131 | value.__qualname__ = f'{cls.__qualname__}.{value.__name__}' 1132 | 1133 | setattr(cls, name, value) 1134 | 1135 | 1136 | # define declaration @ top of file 1137 | Bitspec.from_bytes = (from_bytes) 1138 | Bitspec.iter_bytes = (iter_bytes) 1139 | Bitspec.__len__ = byte_length 1140 | Bitspec.to_bytes = to_bytes 1141 | 1142 | 1143 | if __name__ == '__main__': 1144 | import doctest 1145 | import bitspec as bs 1146 | doctest.testmod( 1147 | #optionflags=doctest.REPORT_ONLY_FIRST_FAILURE, 1148 | optionflags=doctest.ELLIPSIS, 1149 | #verbose=True, 1150 | globs={ 1151 | 'bitspec':bs, 1152 | }, 1153 | ) 1154 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["flit_core >=3.2,<4"] 3 | build-backend = "flit_core.buildapi" 4 | 5 | [project] 6 | name = "bitspec" 7 | authors = [{name = "amtal"}] 8 | readme = "README.md" 9 | requires-python = ">=3.8" 10 | license = {text = "MIT (https://opensource.org/licenses/MIT)"} 11 | classifiers = [ 12 | "Topic :: Software Development :: Disassemblers", 13 | "License :: OSI Approved :: MIT License", 14 | ] 15 | dynamic = ["version", "description"] 16 | 17 | [project.urls] 18 | Documentation = "https://amtal.github.io/bitspec" 19 | Source = "https://github.com/amtal/bitspec" 20 | Issues = "https://github.com/amtal/bitspec/issues" 21 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | isolated_build = True 3 | envlist = py38,py39,py310 4 | 5 | [gh-actions] 6 | python = 7 | 3.8: py38 8 | 3.9: py39 9 | 3.10: py310 10 | 11 | [testenv] 12 | deps = 13 | coverage 14 | flake8 15 | mccabe 16 | pylint 17 | pytest 18 | commands = 19 | flake8 --exit-zero bitspec.py 20 | pylint --exit-zero bitspec.py 21 | coverage erase 22 | coverage run --include=bitspec.py -m bitspec 23 | coverage report -m --------------------------------------------------------------------------------