├── .gitignore ├── LICENSE ├── README.md ├── __init__.py ├── examples ├── batch_process.py ├── convert_siglib.py ├── libc-scraper │ ├── .gitignore │ ├── README.md │ ├── merge_ubuntu.py │ ├── process-deb.sh │ ├── run.sh │ └── ubuntu-libc-scraper.py ├── merge_multiple_versions.py └── sig_match.py ├── icon.ico ├── images └── explorer.png ├── plugin.json ├── pyproject.toml ├── requirements.txt ├── setup.cfg ├── sigkit ├── FlatbufSignatureLibrary │ ├── CallRef.py │ ├── Function.py │ ├── Pattern.py │ ├── SignatureLibrary.py │ ├── TrieNode.py │ └── __init__.py ├── __init__.py ├── compute_sig.py ├── sig_serialize_fb.py ├── sig_serialize_json.py ├── sigexplorer.py ├── signaturelibrary.py └── trie_ops.py └── signaturelibrary.fbs /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.bndb 3 | *.a 4 | *.so 5 | *.so.6 6 | testcases/ 7 | *.pyc 8 | .idea 9 | 10 | *.pkl 11 | *.fb 12 | *.sig 13 | *.zlib 14 | 15 | sigs/ 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2019-2020 Vecto 35 Inc 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Signature Kit Plugin (v1.2.2) 2 | Author: **Vector 35 Inc** 3 | 4 | _Python tools for working with Signature Libraries_ 5 | 6 | ## Description: 7 | 8 | This plugin provides Python tools for generating, manipulating, viewing, loading, and saving signature libraries (.sig) for the Signature System. This plugin also provides UI integration for easy access from the Binary Ninja UI to common functions in the `Plugins\Signature Library` menu. 9 | 10 | 11 | ![Signature Explorer](https://raw.githubusercontent.com/vector35/sigkit/master/images/explorer.png) 12 | 13 | Also included are [example scripts](https://github.com/Vector35/sigkit/tree/master/examples) which demonstrate batch processing and automatic creation of signature libraries for Ubuntu libc. 14 | You can also run the Signature Explorer GUI as a standalone app. 15 | 16 | 17 | ## Installation Instructions 18 | 19 | ### Windows 20 | 21 | 22 | 23 | ### Linux 24 | 25 | 26 | 27 | ### Darwin 28 | 29 | 30 | 31 | ## Minimum Version 32 | 33 | This plugin requires the following minimum version of Binary Ninja: 34 | 35 | * 1997 36 | 37 | 38 | 39 | ## Required Dependencies 40 | 41 | The following dependencies are required for this plugin: 42 | 43 | * pip - flatbuffers 44 | 45 | 46 | ## License 47 | 48 | This plugin is released under a MIT license. 
49 | ## Metadata Version 50 | 51 | 2 52 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | # Copyright (c) 2019-2020 Vector 35 Inc 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to 7 | # deal in the Software without restriction, including without limitation the 8 | # rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 9 | # sell copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 | # IN THE SOFTWARE. 22 | 23 | from .sigkit.sig_serialize_fb import SignatureLibraryReader, SignatureLibraryWriter 24 | from .sigkit.compute_sig import process_function as generate_function_signature 25 | 26 | def load_signature_library(filename): 27 | """ 28 | Load a signature library from a .sig file. 29 | :param filename: input filename 30 | :return: instance of `TrieNode`, the root of the signature trie. 
def save_signature_library(sig_lib, filename):
    """
    Serialize a signature library and write the result to disk.

    :param sig_lib: instance of `TrieNode`, the root of the signature trie.
    :param filename: destination filename; opened in binary write mode, so
                     any existing file at that path is overwritten.
    """
    writer = SignatureLibraryWriter()
    serialized = writer.serialize(sig_lib)
    with open(filename, 'wb') as out_file:
        out_file.write(serialized)
def process_bv(bv):
    """
    Generate a signature for every function in `bv` that has a symbol and
    queue the results.

    Each `(node, info)` pair returned by `sigkit.generate_function_signature`
    is put on the module-level `results` queue. A failure while processing
    one function is reported with a traceback and does not stop the
    remaining functions from being processed.

    :param bv: the BinaryView whose functions should be processed
    """
    global results
    import traceback

    print(bv.file.filename, ': processing')
    # When the view reports no relocation ranges, this flag is passed to the
    # signature generator as its second argument (guess_relocs).
    guess_relocs = len(bv.relocation_ranges) == 0

    for func in bv.functions:
        try:
            # Functions without a symbol at their start address are skipped.
            if bv.get_symbol_at(func.start) is None:
                continue
            node, info = sigkit.generate_function_signature(func, guess_relocs)
            results.put((node, info))
            print("Processed", func.name)
        except Exception:
            # Best effort per function. `except Exception` (rather than a bare
            # `except:`) lets KeyboardInterrupt and SystemExit propagate so the
            # batch job can still be interrupted.
            traceback.print_exc()
    print(bv.file.filename, ': done')
wg, results 86 | wg, results = wg_, results_ 87 | 88 | if __name__ == '__main__': 89 | import sys 90 | from pathlib import Path 91 | if len(sys.argv) < 3: 92 | print('Usage: %s ' % (sys.argv[0])) 93 | print('The pickle designates the filename of a pickle file that the computed function metadata will be saved to.') 94 | sys.exit(1) 95 | 96 | import multiprocessing as mp 97 | wg = mp.Value('i', 0) 98 | results = mp.Queue() 99 | 100 | func_info = {} 101 | 102 | with mp.Pool(mp.cpu_count(), initializer=init_child, initargs=(wg, results)) as pool: 103 | pool.map(process_binary, map(str, Path('.').glob(sys.argv[1]))) 104 | 105 | while True: 106 | time.sleep(0.1) 107 | with wg.get_lock(): 108 | if wg.value == 0: break 109 | 110 | while not results.empty(): 111 | node, info = results.get() 112 | func_info[node] = info 113 | 114 | import pickle 115 | with open(sys.argv[2], 'wb') as f: 116 | pickle.dump(func_info, f) 117 | -------------------------------------------------------------------------------- /examples/convert_siglib.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015-2020 Vector 35 Inc 2 | # 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy 4 | # of this software and associated documentation files (the "Software"), to 5 | # deal in the Software without restriction, including without limitation the 6 | # rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 7 | # sell copies of the Software, and to permit persons to whom the Software is 8 | # furnished to do so, subject to the following conditions: 9 | # 10 | # The above copyright notice and this permission notice shall be included in 11 | # all copies or substantial portions of the Software. 
12 | # 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 19 | # IN THE SOFTWARE. 20 | 21 | """ 22 | This utility shows how to load and save signature libraries using the sigkit API. 23 | Although many file formats are supported, Binary Ninja will only support signatures 24 | in the .sig (flatbuffer) format. The other formats are provided for debugging 25 | purposes. 26 | """ 27 | 28 | import pickle 29 | import zlib 30 | 31 | from sigkit import * 32 | 33 | if __name__ == '__main__': 34 | import sys 35 | 36 | if len(sys.argv) < 2: 37 | print('Usage: convert_siglib.py ') 38 | sys.exit(1) 39 | 40 | # Load a signature library. 41 | filename = sys.argv[1] 42 | basename, ext = filename[:filename.index('.')], filename[filename.index('.'):] 43 | if ext == '.sig': 44 | with open(filename, 'rb') as f: 45 | sig_trie = sig_serialize_fb.load(f) 46 | elif ext == '.json': 47 | with open(filename, 'r') as f: 48 | sig_trie = sig_serialize_json.load(f) 49 | elif ext == '.json.zlib': 50 | with open(filename, 'rb') as f: 51 | sig_trie = sig_serialize_json.deserialize(json.loads(zlib.decompress(f.read()).decode('utf-8'))) 52 | elif ext == '.pkl': 53 | with open(filename, 'rb') as f: 54 | sig_trie = pickle.load(f) 55 | else: 56 | print('Unsupported file extension ' + ext) 57 | sys.exit(1) 58 | 59 | # Save the signature library to a binary format and write it to a file. 
60 | buf = sig_serialize_fb.dumps(sig_trie) 61 | with open(basename + '.sig', 'wb') as f: 62 | f.write(buf) 63 | 64 | # This is a pretty stringent assertion, but I want to be sure this implementation is correct. 65 | # having the exact same round-trip depends on having a consistent iteration order through the trie as well 66 | # as the ordering of the functions per node. That's enforced by iterating the trie (DFS) in a sorted fashion. 67 | assert buf == sig_serialize_fb.SignatureLibraryWriter().serialize(sig_serialize_fb.SignatureLibraryReader().deserialize(buf)) 68 | -------------------------------------------------------------------------------- /examples/libc-scraper/.gitignore: -------------------------------------------------------------------------------- 1 | ubuntu 2 | requests-cache.sqlite 3 | -------------------------------------------------------------------------------- /examples/libc-scraper/README.md: -------------------------------------------------------------------------------- 1 | # libc-scraper 2 | 3 | This directory includes scripts that demonstrate how sigkit can be scaled up efficiently. 4 | 5 | The goal of libc-scraper is to scrape *.debs for Ubuntu libcs, process them using headless mode, and generate space-efficient signature libraries. 6 | 7 | batch_process.py demonstrates how to generate signatures using headless mode. 8 | 9 | Of special interest is merge_ubuntu.py, which shows how you can create small signature libraries that combine multiple versions of the same library. 10 | Using clever tricks, it is possible to aggressively deduplicate across multiple versions while maintaining accuracy. 
11 | -------------------------------------------------------------------------------- /examples/libc-scraper/merge_ubuntu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright (c) 2015-2020 Vector 35 Inc 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to 7 | # deal in the Software without restriction, including without limitation the 8 | # rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 9 | # sell copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 | # IN THE SOFTWARE. 22 | 23 | """ 24 | This script generates libc signature libraries after precomputing function 25 | signatures using batch_process.py, using all cpus available on the machine. 
def cleanup_info(func_info, maxlen=40):
    """
    Prune and normalize a function-info dictionary in place.

    Keys whose name starts with a `.L<digits>` label are removed from the
    dictionary; the name of every other key is truncated to `maxlen`
    characters.

    :param func_info: dict whose keys are objects with a `name` attribute
    :param maxlen: maximum length of a retained name
    """
    import re
    local_label = re.compile(r'\.L\d+')

    doomed = []
    for func in func_info:
        if local_label.match(func.name):
            doomed.append(func)
        else:
            func.name = func.name[:maxlen]

    # Deletion is deferred so the dictionary is not resized while iterating.
    for func in doomed:
        del func_info[func]
for pkg_version in os.listdir(package): 91 | pkg_version = os.path.join(package, pkg_version) 92 | pkls = Path(pkg_version).glob('**/*.pkl') 93 | pkls = list(map(str, pkls)) 94 | if not pkls: continue 95 | # print(' ' + pkg_version, len(pkls)) 96 | pkl_groups.append(pkls) 97 | if not pkl_groups: 98 | print(package, 'has no versions available') 99 | return 100 | 101 | with tqdm.tqdm(total=len(pkl_groups), desc='generating tries') as pbar: 102 | async def async_load(to_load): 103 | result = await loop.run_in_executor(pool, load_pkls, to_load) 104 | pbar.update(1) 105 | pbar.refresh() 106 | return result 107 | lib_versions = await asyncio.gather(*map(async_load, pkl_groups)) 108 | 109 | # linear merge 110 | # dst_trie, dst_funcs = sigkit.signaturelibrary.new_trie(), {} 111 | # for trie, funcs in tqdm.tqdm(lib_versions): 112 | # sigkit.trie_ops.combine_signature_libraries(dst_trie, dst_funcs, trie, funcs) 113 | 114 | # big brain parallel async binary merge 115 | with tqdm.tqdm(total=len(lib_versions)-1, desc='merging') as pbar: 116 | async def merge(sig_libs): 117 | assert len(sig_libs) 118 | if len(sig_libs) == 1: 119 | return sig_libs[0] 120 | else: 121 | half = len(sig_libs) // 2 122 | sig_lib1, sig_lib2 = await asyncio.gather(merge(sig_libs[:half]), merge(sig_libs[half:])) 123 | sig_libs[:] = [None] * len(sig_libs) # free memory 124 | merged_lib = await loop.run_in_executor(pool, combine_sig_libs, sig_lib1, sig_lib2) 125 | pbar.update(1) 126 | pbar.refresh() 127 | gc.collect() 128 | return merged_lib 129 | sig_lib = await merge(lib_versions) 130 | 131 | dst_trie, dst_funcs = await loop.run_in_executor(pool, finalize_sig_lib, sig_lib) 132 | if not dst_funcs: 133 | print(package, 'has no functions') 134 | return 135 | 136 | buf = sigkit.sig_serialize_fb.SignatureLibraryWriter().serialize(dst_trie) 137 | with open(result_filename, 'wb') as f: 138 | f.write(buf) 139 | print(' saved to', result_filename, ' | size:', len(buf)) 140 | 141 | loop.run_until_complete(inner()) 
def main():
    """
    Build one signature library per package directory found under
    `ubuntu/bionic/<arch>/`.

    Ensures the `sigs` output directory exists (exiting with status 1 if a
    non-directory named `sigs` is in the way), collects the package
    directories, and then re-invokes this script with `-c <package>` once
    per package from a small thread pool.
    """
    if not os.path.exists('sigs'):
        os.mkdir('sigs')
    elif not os.path.isdir('sigs'):
        print('Please delete "sigs" before starting')
        sys.exit(1)

    tasks = []
    distr = 'ubuntu'
    # Only the 'bionic' series is processed; iterate os.listdir(distr)
    # instead to cover every downloaded series.
    for version in ['bionic']:
        version = os.path.join(distr, version)
        for arch in os.listdir(version):
            arch = os.path.join(version, arch)
            for package in os.listdir(arch):
                tasks.append(os.path.join(arch, package))

    # we are going to do some hierarchical multiprocessing because there is a very high pickle message-passing overhead
    # so a lot of cpu time gets burned pickling in the main process simply passing work to worker processes
    import subprocess
    import multiprocessing.pool

    def do_package_in_worker(package):
        # Run the child under the same interpreter (and therefore the same
        # virtualenv / installed packages) as the parent instead of whatever
        # 'python3' happens to resolve to on PATH.
        subprocess.call([sys.executable or 'python3', __file__, '-c', package])

    # The context manager releases the worker threads once every task is done.
    with multiprocessing.pool.ThreadPool(cpu_factor) as pool:
        for _ in pool.imap_unordered(do_package_in_worker, tasks):
            pass
# Feed every .deb below the current directory whose architecture is in the
# supported list to process-deb.sh, running three jobs at a time.
find . -iname '*.deb' | (while IFS= read -r line; do
  # The third '/'-separated field of the path is taken as the architecture.
  arch=$(printf '%s\n' "$line" | awk -F/ '{print $3}')
  # `case` keeps the filter POSIX-compatible; this script has no shebang, so
  # it may not be run by bash.
  case $arch in
    amd64|arm64|armel|armhf|i386|lpia|powerpc)
      printf '%s\n' "$line"
      ;;
  esac
done) | parallel -j 3 "process-deb.sh {}"
async def must(f):
    """
    Call `f(session)` until it yields a response with HTTP status 200.

    The module-level `sem` queue is used as a counting semaphore: a slot is
    taken before the first attempt and released when this coroutine
    finishes, however it finishes. A failed attempt (an exception or a
    non-200 status) is retried after a one second pause; after more than 10
    failures the whole program exits with status 1.

    :param f: callable taking the shared client session and returning an
              awaitable that resolves to a response with a `status` attribute
    :return: the first response whose status is 200
    """
    global session, sem
    await sem.put(None)  # waits here while the queue is full
    try:
        retries = 0
        while True:
            try:
                r = await f(session)
                if r.status == 200:
                    return r
                print(r.status)
            except Exception as exc:
                # Report instead of silently swallowing. `except Exception`
                # (not a bare `except:`) lets task cancellation,
                # KeyboardInterrupt and SystemExit propagate.
                sys.stderr.write('Request failed: %r\n' % (exc,))
            retries += 1
            if retries > 10:
                print('Maximum retry count exceeded')
                sys.exit(1)
            await asyncio.sleep(1.0)
    finally:
        # We still hold the slot taken above, so the queue cannot be empty.
        sem.get_nowait()
async def download_deb(version, deb_url):
    """
    Download the .deb at `deb_url` into a local directory mirroring `version`.

    The file is stored as `<os.curdir><version>/<last path component of
    deb_url>`. Nothing is downloaded when that file already exists, and
    nothing is written when the response body is empty.

    :param version: site-relative path of the package version; it is appended
                    directly to `os.curdir`, so it is expected to start with '/'
    :param deb_url: URL of the .deb file to fetch
    """
    # Imported explicitly: a bare `import urllib` does not guarantee that the
    # `urllib.parse` submodule has been loaded.
    from urllib.parse import urlparse

    deb_name = urlparse(deb_url).path.rsplit('/', 1)[-1]
    target_dir = os.curdir + version
    filename = os.path.join(target_dir, deb_name)
    if os.path.exists(filename):
        print('Skipping existing file', filename)
        return
    os.makedirs(target_dir, exist_ok=True)
    async with (await must(lambda session: session.get(deb_url))) as resp:
        data = await resp.read()
    if not data:
        print('FAILED DOWNLOAD', filename, 'from', deb_url)
        return
    # Write to a temporary name and rename afterwards, so an interrupted run
    # never leaves a truncated file that the existence check above would
    # later mistake for a finished download.
    partial = filename + '.part'
    with open(partial, 'wb') as f:
        f.write(data)
    os.replace(partial, filename)
    print('Downloaded', filename)
if __name__ == '__main__':
    MAX_CONCURRENT = 16
    # Create and install the event loop explicitly: calling
    # asyncio.get_event_loop() with no current loop is deprecated.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    # Bounded queue used as a counting semaphore that caps the number of
    # requests in flight. The `loop=` argument was removed from asyncio.Queue
    # in Python 3.10 (passing it raises TypeError); without it the queue uses
    # the loop installed above.
    sem = asyncio.Queue(maxsize=MAX_CONCURRENT)
    try:
        loop.run_until_complete(main())
    finally:
        loop.close()
# Load the pickled function signatures of two library versions, merge the
# second into the first, and report how many functions remain at each step.

# Garbage collection is switched off only while the pickles are loaded
# (presumably to speed up the bulk load) and switched back on afterwards.
gc.disable() # I AM SPEED - Lightning McQueen
dst_trie, dst_info = load_pkls('.', 'libc_version1/*.pkl')
src_trie, src_info = load_pkls('.', 'libc_version2/*.pkl')
gc.enable() # i am no longer speed. (was a second gc.disable(), which left GC off for the rest of the run)

size1, size2 = func_count(dst_trie), func_count(src_trie)
print("Pre-merge sizes: %d + %d = %d funcs" % (size1, size2, size1+size2))

# Merge src into dst; dst_trie and dst_info hold the combined library afterwards.
sigkit.trie_ops.combine_signature_libraries(dst_trie, dst_info, src_trie, src_info)
print("Post-merge size: %d funcs" % (func_count(dst_trie),))

sigkit.trie_ops.finalize_trie(dst_trie, dst_info)
print("Finalized size: %d funcs" % (func_count(dst_trie),))

# Dump the merged library as JSON, then open it in the signature explorer.
print(json.dumps(sigkit.sig_serialize_json.serialize(dst_trie)))
sigkit.explore_signature_library(dst_trie)
class SignatureMatcher(object):
    """
    Illustrative, slow signature matcher working on a signature trie.

    Scores returned by `does_func_match`:
      0   - no match
      1   - function bytes matched, disambiguation pattern did not
      2   - bytes + disambiguation matched, call sites differ
      3   - bytes + disambiguation matched, a callee failed to match
      999 - full match (also used for wildcard callees)

    `results` maps matched functions to their signature node; `_matches` and
    `_matches_inv` are the forward / inverse maps of confirmed matches.
    """

    def __init__(self, sig_trie, bv):
        """
        :param sig_trie: root of the signature trie to match against
        :param bv: the BinaryView whose functions are being matched
        """
        self.sig_trie = sig_trie
        self.bv = bv

        self._matches = {}
        self._matches_inv = {}
        self.results = {}

        # human-readable reason attached to the next on_match() log line
        self._cur_match_debug = ""

    def resolve_thunk(self, func, level=0):
        """
        Follow tiny (< 8 byte) tail-call / jump thunks to their destination.

        :param func: function to resolve
        :param level: current recursion depth
        :return: the resolved function, `func` itself if it is not a thunk,
                 or None if the destination cannot be determined
        """
        if sigkit.compute_sig.get_func_len(func) >= 8:
            return func

        first_insn = func.mlil[0]
        if first_insn.operation == MediumLevelILOperation.MLIL_TAILCALL:
            thunk_dest = self.bv.get_function_at(first_insn.dest.value.value)
        elif (first_insn.operation == MediumLevelILOperation.MLIL_JUMP
              and first_insn.dest.operation == MediumLevelILOperation.MLIL_LOAD
              and first_insn.dest.src.operation == MediumLevelILOperation.MLIL_CONST_PTR):
            # jump through a pointer stored in a data variable
            data_var = self.bv.get_data_var_at(first_insn.dest.src.value.value)
            if not data_var or not data_var.data_refs_from:
                return None
            thunk_dest = self.bv.get_function_at(data_var.data_refs_from[0])
        else:
            return func

        if thunk_dest is None:
            return None

        if level >= 100:
            # something is wrong here. there's a weird infinite loop of thunks.
            sys.stderr.write('Warning: reached recursion limit while trying to resolve thunk %s!\n' % (func.name,))
            return None

        print('* following thunk %s -> %s' % (func.name, thunk_dest.name))
        return self.resolve_thunk(thunk_dest, level + 1)

    def on_match(self, func, func_node, level=0):
        """
        Record that `func` matched `func_node`, reporting conflicts.

        A function that is already matched is never re-assigned; if the new
        node differs, a warning is written and the function is removed from
        `results`. A node already claimed by another function produces an
        inverse-conflict warning and is not recorded in the match maps.
        """
        if func in self._matches:
            if self._matches[func] != func_node:
                sys.stderr.write('Warning: CONFLICT on %s: %s vs %s' % (func.name, self._matches[func], func_node) + '\n')
                if func in self.results:
                    del self.results[func]
            return

        self.results[func] = func_node

        if func_node in self._matches_inv:
            if self._matches_inv[func_node] != func:
                sys.stderr.write('Warning: INVERSE CONFLICT (%s) on %s: %s vs %s' % (self._cur_match_debug, func_node, self._matches_inv[func_node].name, func.name) + '\n')
            return

        print((' ' * level) + func.name, '=>', func_node.name, 'from', func_node.source_binary, '(' + self._cur_match_debug + ')')
        self._matches[func] = func_node
        self._matches_inv[func_node] = func

    def compute_func_callees(self, func):
        """
        Return a dict mapping call-site offsets (relative to `func.start`) to
        the function called there. Call sites that do not resolve to exactly
        one callee address are left out.
        """
        callees = {}
        for ref in func.call_sites:
            callee_addrs = self.bv.get_callees(ref.address, ref.function, ref.arch)
            if len(callee_addrs) != 1:
                continue
            callees[ref.address - func.start] = self.bv.get_function_at(callee_addrs[0])
        return callees

    def does_func_match(self, func, func_node, visited, level=0):
        """
        Score how well `func` matches the signature `func_node`.

        :param func: candidate function (may be None)
        :param func_node: signature node (None means "no information")
        :param visited: dict of {function: node} assumptions made so far on
                        this walk of the call graph; updated in place
        :param level: recursion depth, used to indent log output
        :return: one of the scores listed in the class docstring
        """
        print((' '*level) + 'compare', 'None' if not func else func.name, 'vs', '*' if not func_node else func_node.name, 'from ' + func_node.source_binary if func_node else '')
        # no information about this function. assume wildcard.
        if func_node is None:
            return 999

        # we expect a function to be here but there isn't one. no match.
        if func is None:
            return 0

        # fix for msvc thunks -.-
        thunk_dest = self.resolve_thunk(func)
        if not thunk_dest:
            sys.stderr.write('Warning: encountered a weird thunk %s, giving up\n' % (func.name,))
            return 0
        func = thunk_dest

        # this is essentially a dfs on the callgraph. if we encounter a backedge,
        # treat it optimistically, implying that the callers match if the callees match.
        # however, we track our previous assumptions, meaning that if we previously
        # optimistically assumed b == a, then later on if we compare b and c, we say
        # that b != c since we already assumed b == a (and c != a)
        if func in visited:
            print("we've already seen visited one before")
            return 999 if visited[func] == func_node else 0
        visited[func] = func_node

        # if we've already figured out what this function is, don't waste our time doing it again.
        if func in self._matches:
            return 999 if self._matches[func] == func_node else 0

        func_len = sigkit.compute_sig.get_func_len(func)
        func_data = self.bv.read(func.start, func_len)
        if not func_node.is_bridge:
            trie_matches = self.sig_trie.find(func_data)
            if func_node not in trie_matches:
                print((' ' * level) + 'trie mismatch!')
                return 0
        else:
            print((' ' * level) + 'this is a bridge node.')

        disambiguation_data = func_data[func_node.pattern_offset:func_node.pattern_offset + len(func_node.pattern)]
        if not func_node.pattern.matches(disambiguation_data):
            print((' ' * level) + 'disambiguation mismatch!')
            return 1

        callees = self.compute_func_callees(func)
        # Every call site in the binary must be known to the signature. The
        # membership test has to be made against the node's callee map (as
        # the loops below do), not against the node object itself.
        for call_site in callees:
            if call_site not in func_node.callees:
                print((' ' * level) + 'call sites mismatch!')
                return 2
        # ...and every non-wildcard call site in the signature must exist in the binary.
        for call_site, callee in func_node.callees.items():
            if callee is not None and call_site not in callees:
                print((' ' * level) + 'call sites mismatch!')
                return 2

        for call_site in callees:
            if self.does_func_match(callees[call_site], func_node.callees[call_site], visited, level + 1) != 999:
                print((' '*level) + 'callee ' + func_node.callees[call_site].name + ' mismatch!')
                return 3

        self._cur_match_debug = 'full match'
        self.on_match(func, func_node, level)
        return 999

    def process_func(self, func):
        """
        Try to sig the given function.

        :return: the list of signature nodes that tied for the best non-zero
                 score; empty when nothing matched, more than one element
                 when the decision has to be deferred
        """
        func_len = sigkit.compute_sig.get_func_len(func)
        func_data = self.bv.read(func.start, func_len)
        trie_matches = self.sig_trie.find(func_data)
        best_score, results = 0, []
        for candidate_func in trie_matches:
            score = self.does_func_match(func, candidate_func, {})
            if score == 0:
                # 0 means "no match": keeping such a candidate would let a
                # lone rejected signature be reported below as a full match.
                continue
            if score > best_score:
                results = [candidate_func]
                best_score = score
            elif score == best_score:
                results.append(candidate_func)
        if len(results) == 0:
            print(func.name, '=>', 'no match', end=", ")
            for x in self.sig_trie.all_values():
                if x.name == func.name:
                    print('but there was a signature from', x.source_binary)
                    break
            else:
                print('but this is OK.')
            assert best_score == 0
            return results
        elif len(results) > 1:
            print(func.name, '=>', 'deferred at level', best_score, results)
            return results

        # Exactly one best candidate: accept it and remember how good it was.
        partial_match_reasons = {
            1: 'bytes match (but disambiguation mismatch?)',
            2: 'bytes + disambiguation match (but callee count mismatch)',
            3: 'bytes + disambiguation match (but callees mismatch)',
        }
        self._cur_match_debug = partial_match_reasons.get(best_score, 'full match')
        self.on_match(func, results[0])
        return results

    def run(self):
        """
        Match every function of the view, re-running ambiguous (deferred)
        functions until a pass no longer reduces their number.
        """
        queue = self.bv.functions
        while True: # silly fixedpoint worklist algorithm
            deferred = []
            print('Start of pass %d functions remaining' % (len(queue)))

            for func in queue:
                if func in self._matches:
                    continue
                if sigkit.compute_sig.get_func_len(func) < 8:
                    continue
                matches = self.process_func(func)
                if len(matches) > 1:
                    deferred.append(func)

            print('Pass complete, %d functions deferred' % (len(deferred),))
            if len(queue) == len(deferred):
                print('No changes. Quit.')
                break
            queue = deferred
restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE." 17 | }, 18 | "platforms": [ 19 | "Windows", 20 | "Linux", 21 | "Darwin" 22 | ], 23 | "installinstructions": { 24 | "Windows": "", 25 | "Linux": "", 26 | "Darwin": "" 27 | }, 28 | "dependencies": { 29 | "pip": [ 30 | "flatbuffers" 31 | ] 32 | }, 33 | "version": "1.2.2", 34 | "author": "Vector 35 Inc", 35 | "minimumbinaryninjaversion": 1997 36 | } 37 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | flatbuffers 2 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = sigkit 3 | version = 1.2.1 4 | license = "MIT" 5 | long_description = file: README.md 6 | 7 | [options] 8 | 
install_requires = flatbuffers 9 | packages=find: 10 | 11 | -------------------------------------------------------------------------------- /sigkit/FlatbufSignatureLibrary/CallRef.py: -------------------------------------------------------------------------------- 1 | # automatically generated by the FlatBuffers compiler, do not modify 2 | 3 | # namespace: FlatbufSignatureLibrary 4 | 5 | import flatbuffers 6 | 7 | class CallRef(object): 8 | __slots__ = ['_tab'] 9 | 10 | # CallRef 11 | def Init(self, buf, pos): 12 | self._tab = flatbuffers.table.Table(buf, pos) 13 | 14 | # CallRef 15 | def Offset(self): return self._tab.Get(flatbuffers.number_types.Int32Flags, self._tab.Pos + flatbuffers.number_types.UOffsetTFlags.py_type(0)) 16 | # CallRef 17 | def DstId(self): return self._tab.Get(flatbuffers.number_types.Int32Flags, self._tab.Pos + flatbuffers.number_types.UOffsetTFlags.py_type(4)) 18 | 19 | def CreateCallRef(builder, offset, dstId): 20 | builder.Prep(4, 8) 21 | builder.PrependInt32(dstId) 22 | builder.PrependInt32(offset) 23 | return builder.Offset() 24 | -------------------------------------------------------------------------------- /sigkit/FlatbufSignatureLibrary/Function.py: -------------------------------------------------------------------------------- 1 | # automatically generated by the FlatBuffers compiler, do not modify 2 | 3 | # namespace: FlatbufSignatureLibrary 4 | 5 | import flatbuffers 6 | 7 | class Function(object): 8 | __slots__ = ['_tab'] 9 | 10 | @classmethod 11 | def GetRootAsFunction(cls, buf, offset): 12 | n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) 13 | x = Function() 14 | x.Init(buf, n + offset) 15 | return x 16 | 17 | # Function 18 | def Init(self, buf, pos): 19 | self._tab = flatbuffers.table.Table(buf, pos) 20 | 21 | # Function 22 | def Name(self): 23 | o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) 24 | if o != 0: 25 | return self._tab.String(o + self._tab.Pos) 26 | return None 27 | 28 
| # Function 29 | def SourceBinary(self): 30 | o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) 31 | if o != 0: 32 | return self._tab.String(o + self._tab.Pos) 33 | return None 34 | 35 | # Function 36 | def Callees(self, j): 37 | o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8)) 38 | if o != 0: 39 | x = self._tab.Vector(o) 40 | x += flatbuffers.number_types.UOffsetTFlags.py_type(j) * 8 41 | from .CallRef import CallRef 42 | obj = CallRef() 43 | obj.Init(self._tab.Bytes, x) 44 | return obj 45 | return None 46 | 47 | # Function 48 | def CalleesLength(self): 49 | o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8)) 50 | if o != 0: 51 | return self._tab.VectorLen(o) 52 | return 0 53 | 54 | # Function 55 | def Pattern(self): 56 | o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10)) 57 | if o != 0: 58 | x = self._tab.Indirect(o + self._tab.Pos) 59 | from .Pattern import Pattern 60 | obj = Pattern() 61 | obj.Init(self._tab.Bytes, x) 62 | return obj 63 | return None 64 | 65 | # Function 66 | def PatternOffset(self): 67 | o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(12)) 68 | if o != 0: 69 | return self._tab.Get(flatbuffers.number_types.Uint32Flags, o + self._tab.Pos) 70 | return 0 71 | 72 | # Function 73 | def IsBridge(self): 74 | o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(14)) 75 | if o != 0: 76 | return bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos)) 77 | return False 78 | 79 | def FunctionStart(builder): builder.StartObject(6) 80 | def FunctionAddName(builder, name): builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(name), 0) 81 | def FunctionAddSourceBinary(builder, sourceBinary): builder.PrependUOffsetTRelativeSlot(1, flatbuffers.number_types.UOffsetTFlags.py_type(sourceBinary), 0) 82 | def FunctionAddCallees(builder, callees): builder.PrependUOffsetTRelativeSlot(2, 
flatbuffers.number_types.UOffsetTFlags.py_type(callees), 0) 83 | def FunctionStartCalleesVector(builder, numElems): return builder.StartVector(8, numElems, 4) 84 | def FunctionAddPattern(builder, pattern): builder.PrependUOffsetTRelativeSlot(3, flatbuffers.number_types.UOffsetTFlags.py_type(pattern), 0) 85 | def FunctionAddPatternOffset(builder, patternOffset): builder.PrependUint32Slot(4, patternOffset, 0) 86 | def FunctionAddIsBridge(builder, isBridge): builder.PrependBoolSlot(5, isBridge, 0) 87 | def FunctionEnd(builder): return builder.EndObject() 88 | -------------------------------------------------------------------------------- /sigkit/FlatbufSignatureLibrary/Pattern.py: -------------------------------------------------------------------------------- 1 | # automatically generated by the FlatBuffers compiler, do not modify 2 | 3 | # namespace: FlatbufSignatureLibrary 4 | 5 | import flatbuffers 6 | 7 | class Pattern(object): 8 | __slots__ = ['_tab'] 9 | 10 | @classmethod 11 | def GetRootAsPattern(cls, buf, offset): 12 | n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) 13 | x = Pattern() 14 | x.Init(buf, n + offset) 15 | return x 16 | 17 | # Pattern 18 | def Init(self, buf, pos): 19 | self._tab = flatbuffers.table.Table(buf, pos) 20 | 21 | # Pattern 22 | def Data(self, j): 23 | o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) 24 | if o != 0: 25 | a = self._tab.Vector(o) 26 | return self._tab.Get(flatbuffers.number_types.Uint8Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 1)) 27 | return 0 28 | 29 | # Pattern 30 | def DataAsNumpy(self): 31 | o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) 32 | if o != 0: 33 | return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Uint8Flags, o) 34 | return 0 35 | 36 | # Pattern 37 | def DataLength(self): 38 | o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) 39 | if o != 0: 40 | return self._tab.VectorLen(o) 41 | return 0 
42 | 43 | # Pattern 44 | def Mask(self, j): 45 | o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) 46 | if o != 0: 47 | a = self._tab.Vector(o) 48 | return self._tab.Get(flatbuffers.number_types.Uint8Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 1)) 49 | return 0 50 | 51 | # Pattern 52 | def MaskAsNumpy(self): 53 | o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) 54 | if o != 0: 55 | return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Uint8Flags, o) 56 | return 0 57 | 58 | # Pattern 59 | def MaskLength(self): 60 | o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) 61 | if o != 0: 62 | return self._tab.VectorLen(o) 63 | return 0 64 | 65 | def PatternStart(builder): builder.StartObject(2) 66 | def PatternAddData(builder, data): builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(data), 0) 67 | def PatternStartDataVector(builder, numElems): return builder.StartVector(1, numElems, 1) 68 | def PatternAddMask(builder, mask): builder.PrependUOffsetTRelativeSlot(1, flatbuffers.number_types.UOffsetTFlags.py_type(mask), 0) 69 | def PatternStartMaskVector(builder, numElems): return builder.StartVector(1, numElems, 1) 70 | def PatternEnd(builder): return builder.EndObject() 71 | -------------------------------------------------------------------------------- /sigkit/FlatbufSignatureLibrary/SignatureLibrary.py: -------------------------------------------------------------------------------- 1 | # automatically generated by the FlatBuffers compiler, do not modify 2 | 3 | # namespace: FlatbufSignatureLibrary 4 | 5 | import flatbuffers 6 | 7 | class SignatureLibrary(object): 8 | __slots__ = ['_tab'] 9 | 10 | @classmethod 11 | def GetRootAsSignatureLibrary(cls, buf, offset): 12 | n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) 13 | x = SignatureLibrary() 14 | x.Init(buf, n + offset) 15 | return x 16 | 17 | # SignatureLibrary 18 | def 
Init(self, buf, pos): 19 | self._tab = flatbuffers.table.Table(buf, pos) 20 | 21 | # SignatureLibrary 22 | def Functions(self, j): 23 | o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) 24 | if o != 0: 25 | x = self._tab.Vector(o) 26 | x += flatbuffers.number_types.UOffsetTFlags.py_type(j) * 4 27 | x = self._tab.Indirect(x) 28 | from .Function import Function 29 | obj = Function() 30 | obj.Init(self._tab.Bytes, x) 31 | return obj 32 | return None 33 | 34 | # SignatureLibrary 35 | def FunctionsLength(self): 36 | o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) 37 | if o != 0: 38 | return self._tab.VectorLen(o) 39 | return 0 40 | 41 | # SignatureLibrary 42 | def Root(self): 43 | o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) 44 | if o != 0: 45 | x = self._tab.Indirect(o + self._tab.Pos) 46 | from .TrieNode import TrieNode 47 | obj = TrieNode() 48 | obj.Init(self._tab.Bytes, x) 49 | return obj 50 | return None 51 | 52 | def SignatureLibraryStart(builder): builder.StartObject(2) 53 | def SignatureLibraryAddFunctions(builder, functions): builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(functions), 0) 54 | def SignatureLibraryStartFunctionsVector(builder, numElems): return builder.StartVector(4, numElems, 4) 55 | def SignatureLibraryAddRoot(builder, root): builder.PrependUOffsetTRelativeSlot(1, flatbuffers.number_types.UOffsetTFlags.py_type(root), 0) 56 | def SignatureLibraryEnd(builder): return builder.EndObject() 57 | -------------------------------------------------------------------------------- /sigkit/FlatbufSignatureLibrary/TrieNode.py: -------------------------------------------------------------------------------- 1 | # automatically generated by the FlatBuffers compiler, do not modify 2 | 3 | # namespace: FlatbufSignatureLibrary 4 | 5 | import flatbuffers 6 | 7 | class TrieNode(object): 8 | __slots__ = ['_tab'] 9 | 10 | @classmethod 11 | def 
GetRootAsTrieNode(cls, buf, offset): 12 | n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) 13 | x = TrieNode() 14 | x.Init(buf, n + offset) 15 | return x 16 | 17 | # TrieNode 18 | def Init(self, buf, pos): 19 | self._tab = flatbuffers.table.Table(buf, pos) 20 | 21 | # TrieNode 22 | def PatternPrefix(self): 23 | o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) 24 | if o != 0: 25 | return self._tab.Get(flatbuffers.number_types.Uint8Flags, o + self._tab.Pos) 26 | return 0 27 | 28 | # TrieNode 29 | def Pattern(self): 30 | o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) 31 | if o != 0: 32 | x = self._tab.Indirect(o + self._tab.Pos) 33 | from .Pattern import Pattern 34 | obj = Pattern() 35 | obj.Init(self._tab.Bytes, x) 36 | return obj 37 | return None 38 | 39 | # TrieNode 40 | def Children(self, j): 41 | o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8)) 42 | if o != 0: 43 | x = self._tab.Vector(o) 44 | x += flatbuffers.number_types.UOffsetTFlags.py_type(j) * 4 45 | x = self._tab.Indirect(x) 46 | from .TrieNode import TrieNode 47 | obj = TrieNode() 48 | obj.Init(self._tab.Bytes, x) 49 | return obj 50 | return None 51 | 52 | # TrieNode 53 | def ChildrenLength(self): 54 | o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8)) 55 | if o != 0: 56 | return self._tab.VectorLen(o) 57 | return 0 58 | 59 | # TrieNode 60 | def WildcardChild(self): 61 | o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10)) 62 | if o != 0: 63 | x = self._tab.Indirect(o + self._tab.Pos) 64 | from .TrieNode import TrieNode 65 | obj = TrieNode() 66 | obj.Init(self._tab.Bytes, x) 67 | return obj 68 | return None 69 | 70 | # TrieNode 71 | def Functions(self, j): 72 | o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(12)) 73 | if o != 0: 74 | a = self._tab.Vector(o) 75 | return self._tab.Get(flatbuffers.number_types.Uint32Flags, a + 
flatbuffers.number_types.UOffsetTFlags.py_type(j * 4)) 76 | return 0 77 | 78 | # TrieNode 79 | def FunctionsAsNumpy(self): 80 | o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(12)) 81 | if o != 0: 82 | return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Uint32Flags, o) 83 | return 0 84 | 85 | # TrieNode 86 | def FunctionsLength(self): 87 | o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(12)) 88 | if o != 0: 89 | return self._tab.VectorLen(o) 90 | return 0 91 | 92 | def TrieNodeStart(builder): builder.StartObject(5) 93 | def TrieNodeAddPatternPrefix(builder, patternPrefix): builder.PrependUint8Slot(0, patternPrefix, 0) 94 | def TrieNodeAddPattern(builder, pattern): builder.PrependUOffsetTRelativeSlot(1, flatbuffers.number_types.UOffsetTFlags.py_type(pattern), 0) 95 | def TrieNodeAddChildren(builder, children): builder.PrependUOffsetTRelativeSlot(2, flatbuffers.number_types.UOffsetTFlags.py_type(children), 0) 96 | def TrieNodeStartChildrenVector(builder, numElems): return builder.StartVector(4, numElems, 4) 97 | def TrieNodeAddWildcardChild(builder, wildcardChild): builder.PrependUOffsetTRelativeSlot(3, flatbuffers.number_types.UOffsetTFlags.py_type(wildcardChild), 0) 98 | def TrieNodeAddFunctions(builder, functions): builder.PrependUOffsetTRelativeSlot(4, flatbuffers.number_types.UOffsetTFlags.py_type(functions), 0) 99 | def TrieNodeStartFunctionsVector(builder, numElems): return builder.StartVector(4, numElems, 4) 100 | def TrieNodeEnd(builder): return builder.EndObject() 101 | -------------------------------------------------------------------------------- /sigkit/FlatbufSignatureLibrary/__init__.py: -------------------------------------------------------------------------------- 1 | import flatbuffers 2 | 3 | if hasattr(flatbuffers, "__version__"): 4 | saved_EndVector = flatbuffers.Builder.EndVector 5 | flatbuffers.Builder.EndVector = lambda self, *args: saved_EndVector(self) 
if core_ui_enabled():
    from .sigexplorer import explore_signature_library
    import binaryninjaui


def signature_explorer(prompt=True):
    """
    Open the signature explorer UI.
    :param prompt: if True, prompt the user to open a file immediately.
    :return: `App`, a QT window
    """
    if "qt_major_version" in binaryninjaui.__dict__ and binaryninjaui.qt_major_version == 6:
        from PySide6.QtWidgets import QApplication
    else:
        from PySide2.QtWidgets import QApplication
    app = QApplication.instance()
    global widget # avoid lifetime issues from it falling out of scope
    widget = sigexplorer.App()
    if prompt:
        widget.open_file()
    widget.show()
    if app: # VERY IMPORTANT to avoiding lifetime issues???
        app.exec_()
    return widget


# UI plugin code
def _generate_signature_library(bv):
    """
    Plugin command: build a signature library from `bv` and save it to disk.

    Only functions that have a symbol at their start address are processed.
    The output path is taken from `bv.session_data['SIGNATURE_FILE_NAME']`
    when present; otherwise the user is asked for one.

    :param bv: the BinaryView to generate signatures for
    :return: None
    """
    guess_relocs = len(bv.relocation_ranges) == 0
    if guess_relocs:
        log.log_debug('Relocation information unavailable; choosing pattern masks heuristically')
    else:
        log.log_debug('Generating pattern masks based on relocation ranges')

    # Read the function list and do the symbol lookups once, instead of once
    # for counting and again for processing.
    all_funcs = bv.functions
    named_funcs = [func for func in all_funcs if bv.get_symbol_at(func.start) is not None]
    func_count = len(named_funcs)
    log.log_info('Generating signatures for %d functions' % (func_count,))
    # Warning for usability purposes. Someone will be confused why it's skipping auto-named functions
    # (guarded: a view without any functions used to raise ZeroDivisionError here)
    if all_funcs and func_count / float(len(all_funcs)) < 0.5:
        num_skipped = len(all_funcs) - func_count
        log.log_warn("%d functions that don't have a name or symbol will be skipped" % (num_skipped,))

    funcs = {}
    for func in named_funcs:
        func_node, info = generate_function_signature(func, guess_relocs)
        if func_node and info:
            funcs[func_node] = info
            log.log_debug('Processed ' + func.name)

    log.log_debug('Constructing signature trie')
    trie = signaturelibrary.new_trie()
    trie_ops.trie_insert_funcs(trie, funcs)
    log.log_debug('Finalizing trie')
    trie_ops.finalize_trie(trie, funcs)

    if 'SIGNATURE_FILE_NAME' in bv.session_data:
        output_filename = bv.session_data['SIGNATURE_FILE_NAME']
    else:
        output_filename = get_save_filename_input("Filename:", "*.sig", bv.file.filename + '.sig')
    if not output_filename:
        log.log_debug('Save cancelled')
        return
    if isinstance(output_filename, bytes):
        output_filename = output_filename.decode('utf-8')
    buf = sig_serialize_fb.SignatureLibraryWriter().serialize(trie)
    with open(output_filename, 'wb') as f:
        f.write(buf)
    log.log_info('Saved to ' + output_filename)


PluginCommand.register(
    "Signature Library\\Generate Signature Library",
    "Create a Signature Library that the Signature Matcher can use to locate functions.",
    _generate_signature_library
)

PluginCommand.register(
    "Signature Library\\Explore Signature Library",
    "View a Signature Library's contents in a graphical interface.",
    lambda bv: signature_explorer()
)
def is_llil_relocatable(llil):
    """
    Guesses whether a LLIL instruction is likely to contain operands that have been or would be relocated by a linker.
    :param llil: the llil instruction
    :return: true if the LLIL instruction contains LLIL_CONST_PTR or LLIL_EXTERN_PTR.
    """
    if not isinstance(llil, LowLevelILInstruction):
        return False
    if llil.operation in [LowLevelILOperation.LLIL_CONST_PTR, LowLevelILOperation.LLIL_EXTERN_PTR]:
        return True
    # otherwise look for a pointer anywhere in the operand tree
    return any(is_llil_relocatable(operand) for operand in llil.operands)


def guess_relocations_mask(func, sig_length):
    """
    Compute the relocations mask on a best-efforts basis using a heuristic based on the LLIL.
    :param func: BinaryNinja api function
    :param sig_length: how long the mask should be
    :return: an array of booleans, signifying whether the byte at each index is significant or not for matching,
             or None if the LLIL of the function could not be retrieved
    """

    mask = [False] * sig_length
    i = 0
    while i < sig_length:
        bb = func.get_basic_block_at(func.start + i)
        if not bb: # not in a basicblock; wildcard
            mask[i] = False
            i += 1
            continue

        bb._buildStartCache()
        if not bb._instLengths:
            i += 1
            continue
        for insn_len in bb._instLengths:
            # This throws an exception for large functions where you need to manually force analysis
            try:
                llil = func.get_low_level_il_at(func.start + i, bb.arch)
            except exceptions.ILException:
                log_warn(f"Skipping function at {hex(func.start)}. You need to force the analysis of this function.")
                return None

            insn_mask = not is_llil_relocatable(llil)
            # if not insn_mask:
            #     func.set_auto_instr_highlight(func.start + i, HighlightStandardColor.BlueHighlightColor)
            # mark the whole instruction, clipped to the end of the signature
            mask[i:min(i + insn_len, sig_length)] = [insn_mask] * min(insn_len, sig_length - i)
            i += insn_len
            if i >= sig_length: break
    return mask


def find_relocation(func, start, end):
    """
    Finds a relocation from `start` to `end`. If `start`==`end`, then they will be expanded to the closest instruction boundary
    :param func: function start and end are contained in
    :param start: start address
    :param end: end address
    :return: (address, length) of the relocation, or (None, None) if a zero-width relocation
             is not covered by any instruction of a basic block
    """

    if end != start: # relocation isn't stupid
        return start, end - start
    # relocation is stupid (start==end), so just expand to the whole instruction
    bb = func.get_basic_block_at(start)
    if not bb: # not in a basicblock, don't care.
        return None, None
    bb._buildStartCache()
    for insn_start, insn_len in zip(bb._instStarts, bb._instLengths):
        if insn_start <= start < insn_start + insn_len:
            return insn_start, insn_len
    # No cached instruction covers the address. Callers unpack the result, so
    # report "not found" explicitly instead of falling off the end with None.
    return None, None


def relocations_mask(func, sig_length):
    """
    Compute the relocations mask based on the relocation metadata contained within the binary.
    :param func: BinaryNinja api function
    :param sig_length: how long the mask should be
    :return: an array of booleans, signifying whether the byte at each index is significant or not for matching
    """

    mask = [True] * sig_length
    for start, end in func.view.relocation_ranges:
        if start > func.start + sig_length or end < func.start: continue
        reloc_start, reloc_len = find_relocation(func, start, end)
        if reloc_start is None: continue # not in a basicblock, don't care.
        reloc_start -= func.start
        if reloc_start < 0:
            # relocation begins before the function: keep only the part inside it
            reloc_len = reloc_len + reloc_start
            reloc_start = 0
        if reloc_len <= 0: continue
        mask[reloc_start:reloc_start + reloc_len] = [False] * reloc_len

    # bytes that are not part of any basic block are never significant
    in_block = [False] * sig_length
    for bb in func.basic_blocks:
        bb_len = get_bb_len(bb)
        bb_start_offset = bb.start - func.start
        bb_end_offset = bb_start_offset + bb_len
        if bb_start_offset > sig_length or bb.start < func.start: continue
        in_block[bb_start_offset:min(bb_end_offset, sig_length)] = [True] * min(bb_len, sig_length - bb_start_offset)

    # zip() also trims `mask` back to sig_length if a relocation ran past the end
    mask = [a and b for a,b in zip(mask, in_block)]
    return mask


def get_bb_len(bb):
    """
    Calculate the length of the basicblock, taking into account weird cases like the block ending with an illegal instruction
    :param bb: BinaryNinja api basic block
    :return: length of the basic block in bytes
    """
    if bb.has_invalid_instructions:
        log.log_warn("Basic block with illegal instructions in " + bb.function.name)
        # stupid ugly HACK to deal with illegal instructions after noreturns that aren't marked noreturn
        bb._buildStartCache()
        if not bb._instLengths: return 0
        # _instStarts holds addresses (find_relocation compares them with
        # addresses), so the end of the last instruction must be made
        # relative to the block start to obtain a length.
        return bb._instLengths[-1] + bb._instStarts[-1] - bb.start
    else:
        return bb.end - bb.start


def get_func_len(func):
    """
    Calculates the length of the function based on the linear addresses of basic blocks.
    The length is truncated so that it never lies outside of the underlying binaryview.
    :param func: BinaryNinja api function
    :return: the distance to the end of the farthest instruction contained within this function
    """
    farthest_end = max(bb.start + get_bb_len(bb) - func.start for bb in func.basic_blocks)
    return min(farthest_end, func.view.end - func.start)
162 | But sometimes, our version has MORE calls than the signature! This is because sometimes libraries 163 | are optionally linked in, and when they aren't, those calls turn into stubs (e.g., jump 0x0) 164 | so we make those callees wildcard (when we finalize the trie and resolve references). 165 | in our matching algorithm, we allow calls to wildcard callee to be optional. 166 | :param func: BinaryNinja api function 167 | :return: dictionary of {offset: (destination name, `ReferenceType`)} 168 | """ 169 | bv = func.view 170 | callees = {} 171 | for ref in func.call_sites: 172 | callee_addrs = bv.get_callees(ref.address, ref.function, ref.arch) 173 | if len(callee_addrs) != 1: continue 174 | sym = bv.get_symbol_at(callee_addrs[0]) 175 | if sym is None: continue 176 | callees[ref.address - func.start] = (sym.name, sym.type) 177 | return callees 178 | 179 | def function_pattern(func, guess_relocs, sig_length=None): 180 | """ 181 | Computes a data and mask for the specified function `func` that can be used to identify this function. 182 | For example, a function may look like: 183 | 184 | 0: 53 push rbx 185 | 1: 83 77 05 lea esi, [rdi+5] 186 | 4: bf a0 07 40 00 mov edi,0x4007a0 187 | 9: 31 c0 xor eax,eax 188 | 189 | In this case, because they constitute a pointer, bytes 5-8 are liable to change when this binary is recompiled or linked. 190 | Thus, we would like to wildcard those bytes out and ignore them while matching. 191 | An appropriate function pattern may look like: 53 83 77 05 bf ?? ?? ?? ?? 31 c0 192 | The pattern data is a the sequence of bytes in the pattern and the mask is an array which specifies which bytes are not wildcards. 193 | For example, the data would be b'\x55\x83\x77\x05\xbf\x00\x00\x00\x00\x31\xc0' and the mask would be [1,1,1,1,1,0,0,0,0,1,1]. 194 | 195 | This function is responsible for computing that data and that mask based on the information available in the binaryview. 
196 | 197 | :param func: BinaryNinja api function 198 | :param guess_relocs: if False, assume relocation information is available for calculating the mask. otherwise, 199 | guess the relocation mask based on the IL. 200 | :param sig_length: the maximum length of the signature. If None, try to calculate it based on basic block addresses. 201 | :return: list of MaskedByte 202 | """ 203 | 204 | if sig_length is None: 205 | sig_length = min(get_func_len(func), 1000) 206 | 207 | if guess_relocs: 208 | mask = guess_relocations_mask(func, sig_length) 209 | else: 210 | mask = relocations_mask(func, sig_length) 211 | if not mask: 212 | return None 213 | mask = list(map(int, mask)) # bool to int 214 | data = b'' 215 | i = 0 216 | while i < len(mask) and func.start + i < func.view.end: 217 | if mask[i]: 218 | next_byte = func.view.read(func.start + i, 1) 219 | if len(next_byte) != 1: break # end of bv 220 | data += next_byte 221 | else: 222 | data += b'\x00' 223 | i += 1 224 | if len(data) < len(mask): 225 | mask = mask[:len(data)] 226 | assert len(data) == len(mask) 227 | while len(mask) and not mask[-1]: 228 | data = data[:len(data) - 1] 229 | mask = mask[:len(mask) - 1] 230 | return signaturelibrary.Pattern(data,mask) 231 | 232 | def process_function(func, guess_relocs): 233 | """ 234 | Generates a signature for a given function. 235 | This signature can be thought of as a semi-unique fingerprint that is able to match copies of this function 236 | found in other binaries. 237 | 238 | :param func: BinaryNinja api function 239 | :param guess_relocs: if False, assume relocation information is available for calculating the mask. otherwise, 240 | guess the relocation mask based on the IL. 
241 | :return: tuple of (FunctionNode, FunctionInfo) 242 | """ 243 | 244 | func_node = signaturelibrary.FunctionNode(func.name) 245 | func_node.source_binary = func.view.file.filename 246 | 247 | info = signaturelibrary.FunctionInfo() 248 | function_pattern_val = function_pattern(func, guess_relocs) 249 | if not function_pattern_val: 250 | return None, None 251 | info.patterns = [function_pattern_val] 252 | info.callees = compute_callees(func) 253 | if hasattr(func.symbol, 'aliases'): 254 | info.aliases = list(map(lambda s: s.decode('utf-8'), func.symbol.aliases)) 255 | else: 256 | info.aliases = [] 257 | return func_node, info 258 | -------------------------------------------------------------------------------- /sigkit/sig_serialize_fb.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015-2020 Vector 35 Inc 2 | # 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy 4 | # of this software and associated documentation files (the "Software"), to 5 | # deal in the Software without restriction, including without limitation the 6 | # rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 7 | # sell copies of the Software, and to permit persons to whom the Software is 8 | # furnished to do so, subject to the following conditions: 9 | # 10 | # The above copyright notice and this permission notice shall be included in 11 | # all copies or substantial portions of the Software. 12 | # 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
SIG_FORMAT_MAGIC = b'BNSG'
SIG_FORMAT_VERSION = 1


class SignatureLibraryWriter(object):
    """
    Serializes signature libraries to a compressed Flatbuffer format usable by Binary Ninja.

    The builder and the caches are created once in the constructor and are never reset by `serialize`.
    """
    def __init__(self, include_source=False):
        """
        :param include_source: if True, the source binary of every function is written to the library
        """
        self.builder = flatbuffers.Builder(4096)
        self.func_node_ids = {None: -1}  # a missing (wildcard) callee is encoded as id -1
        self._bytes_cache = {}
        self._str_cache = {}
        self._pattern_cache = {}
        self.include_source = include_source

    def _serialize_bytes(self, buf):
        """Write `buf` as a byte vector, reusing the offset of an identical, earlier buffer."""
        if buf not in self._bytes_cache:
            self._bytes_cache[buf] = self.builder.CreateByteVector(buf)
        return self._bytes_cache[buf]

    def _serialize_string(self, s):
        """Write `s` as a string, reusing the offset of an identical, earlier string."""
        if s not in self._str_cache:
            self._str_cache[s] = self.builder.CreateString(s)
        return self._str_cache[s]

    def _serialize_pattern_mask(self, mask):
        """
        Pack a mask of 0/1 values into a bit vector (bit `i % 8` of byte `i // 8`) and write it.

        :param mask: iterable of 0/1 values, one per pattern byte
        :return: offset of the serialized byte vector
        """
        mask = bytearray(mask)
        packed = bytearray((len(mask) + 7) // 8)
        for i, bit in enumerate(mask):
            packed[i // 8] |= bit << (i % 8)
        return self._serialize_bytes(bytes(packed))

    def _serialize_pattern(self, pattern):
        """Write a pattern (data plus packed mask), reusing the offset of an equal, earlier pattern."""
        if pattern not in self._pattern_cache:
            data = self._serialize_bytes(bytes(bytearray(pattern.data())))
            mask = self._serialize_pattern_mask(bytes(bytearray(pattern.mask())))
            FlatBufPattern.PatternStart(self.builder)
            FlatBufPattern.PatternAddData(self.builder, data)
            FlatBufPattern.PatternAddMask(self.builder, mask)
            self._pattern_cache[pattern] = FlatBufPattern.PatternEnd(self.builder)
        return self._pattern_cache[pattern]

    def _serialize_func_node(self, func_node):
        """
        Write one function node. Every callee must already have an id in `self.func_node_ids`.

        :param func_node: `FunctionNode` object
        :return: offset of the serialized function table
        """
        # Strings, vectors and the pattern are all written before FunctionStart is called.
        func_name = self._serialize_string(func_node.name)
        if self.include_source and func_node.source_binary:
            source_binary = self._serialize_string(func_node.source_binary)
        else:
            source_binary = None

        if func_node.callees:
            FlatBufFunction.FunctionStartCalleesVector(self.builder, len(func_node.callees))
            # this needs reversed() because we build flatbuffers by prepending
            for call_site, callee in reversed(sorted(func_node.callees.items())):
                FlatBufCallRef.CreateCallRef(self.builder, call_site, self.func_node_ids[callee])
            callees = self.builder.EndVector(len(func_node.callees))
        else:
            callees = None

        if func_node.pattern:
            pattern = self._serialize_pattern(func_node.pattern)
        else:
            pattern = None

        FlatBufFunction.FunctionStart(self.builder)
        if func_name:
            FlatBufFunction.FunctionAddName(self.builder, func_name)
        if source_binary:
            FlatBufFunction.FunctionAddSourceBinary(self.builder, source_binary)
        if callees:
            FlatBufFunction.FunctionAddCallees(self.builder, callees)
        if func_node.is_bridge:
            FlatBufFunction.FunctionAddIsBridge(self.builder, func_node.is_bridge)
        if pattern:
            FlatBufFunction.FunctionAddPattern(self.builder, pattern)
            FlatBufFunction.FunctionAddPatternOffset(self.builder, func_node.pattern_offset)
        return FlatBufFunction.FunctionEnd(self.builder)

    def _serialize_trie_node(self, trie_node, key=None):
        """
        Recursively write a trie node and its children.

        :param trie_node: `TrieNode` object
        :param key: value of the byte under which this node is stored in its parent; None for the root
                    and for wildcard children
        :return: offset of the serialized trie node table
        """
        wildcard_key = signaturelibrary.MaskedByte.wildcard
        pattern = self._serialize_pattern(trie_node.pattern)
        if trie_node.children:
            # The wildcard child is stored in a separate field, so it is left out of the sorted vector.
            children_offs = [self._serialize_trie_node(v, k.value)
                             for k, v in sorted(trie_node.children.items()) if k != wildcard_key]
            FlatBufTrieNode.TrieNodeStartChildrenVector(self.builder, len(children_offs))
            for off in reversed(children_offs):  # this needs reversed() because we build flatbuffers by prepending
                self.builder.PrependUOffsetTRelative(off)
            children = self.builder.EndVector(len(children_offs))
            if wildcard_key in trie_node.children:
                wildcard_child = self._serialize_trie_node(trie_node.children[wildcard_key])
            else:
                wildcard_child = None
        else:
            wildcard_child = None
            children = None
        if trie_node.value:
            FlatBufTrieNode.TrieNodeStartFunctionsVector(self.builder, len(trie_node.value))
            for f in reversed(trie_node.value):  # this needs reversed() because we build flatbuffers by prepending
                self.builder.PrependUint32(self.func_node_ids[f])
            functions = self.builder.EndVector(len(trie_node.value))
        else:
            functions = None

        FlatBufTrieNode.TrieNodeStart(self.builder)
        if key is not None:  # what about duplicate between 0 and wildcard...?
            assert isinstance(key, int) and 0 <= key <= 255
            assert trie_node.pattern[0].mask == 1 and key == trie_node.pattern[0].value
            FlatBufTrieNode.TrieNodeAddPatternPrefix(self.builder, key)
        FlatBufTrieNode.TrieNodeAddPattern(self.builder, pattern)
        if children:
            FlatBufTrieNode.TrieNodeAddChildren(self.builder, children)
        if wildcard_child:
            FlatBufTrieNode.TrieNodeAddWildcardChild(self.builder, wildcard_child)
        if functions:
            FlatBufTrieNode.TrieNodeAddFunctions(self.builder, functions)
        return FlatBufTrieNode.TrieNodeEnd(self.builder)

    def serialize(self, sig_trie):
        """
        Creates a new Flatbuffer and serializes the specified signature trie to it.
        Returns a binary signature library ready for use with Binary Ninja.
        Note that the function lists stored in the trie nodes are re-ordered in place.

        :param sig_trie: `TrieNode` object
        :return: bytes-like object
        """
        # Enforce ordering to make the traversal order consistent
        for n in sig_trie.all_nodes():
            if n.value:
                # `or ''` keeps the sort key a string when a node has no source binary recorded
                n.value = sorted(n.value, key=lambda func_node: (func_node.source_binary or '') + '!' + func_node.name)

        func_nodes = []

        def visit(func_node):
            # Assign ids in depth-first order; callees are visited in call-site order.
            if func_node in self.func_node_ids:
                return
            self.func_node_ids[func_node] = len(func_nodes)
            func_nodes.append(func_node)
            for _, callee in sorted(func_node.callees.items()):
                visit(callee)

        for f in sig_trie.all_values():
            visit(f)

        # this needs reversed() because we build flatbuffers by prepending
        func_offsets = [self._serialize_func_node(f) for f in reversed(func_nodes)]
        FlatBufSignatureLibrary.SignatureLibraryStartFunctionsVector(self.builder, len(func_offsets))
        for off in func_offsets:
            self.builder.PrependUOffsetTRelative(off)
        functions = self.builder.EndVector(len(func_offsets))

        root = self._serialize_trie_node(sig_trie)

        FlatBufSignatureLibrary.SignatureLibraryStart(self.builder)
        FlatBufSignatureLibrary.SignatureLibraryAddFunctions(self.builder, functions)
        FlatBufSignatureLibrary.SignatureLibraryAddRoot(self.builder, root)
        off = FlatBufSignatureLibrary.SignatureLibraryEnd(self.builder)
        self.builder.Finish(off)

        # File layout: 4 magic bytes, 1 version byte, then the zlib-compressed flatbuffer.
        return SIG_FORMAT_MAGIC + bytes(bytearray([SIG_FORMAT_VERSION])) + zlib.compress(bytes(self.builder.Output()))


class SignatureLibraryReader(object):
    """
    Parses and loads compressed Flatbuffer signature libraries.
    """
    def __init__(self):
        self.funcs = []  # FunctionNode objects of the library being loaded, indexed by function id

    def _deserialize_pattern(self, serialized):
        """Rebuild a `Pattern` from its serialized data and bit-packed mask."""
        # we cannot use DataAsNumpy as we don't depend on numpy
        data = bytes(bytearray([serialized.Data(i) for i in range(serialized.DataLength())]))

        mask = []
        for i in range(serialized.MaskLength()):
            b = serialized.Mask(i)
            for j in range(8):
                mask.append((b >> j) & 1)
                if len(mask) == len(data):  # the last mask byte may contain padding bits
                    break

        return signaturelibrary.Pattern(data, mask)

    def _deserialize_func_node(self, serialized):
        """Rebuild a `FunctionNode`; its callees are linked afterwards by `deserialize`."""
        func_node = signaturelibrary.FunctionNode(serialized.Name().decode('utf-8'))
        if serialized.SourceBinary():
            func_node.source_binary = serialized.SourceBinary().decode('utf-8')
        # func_node.is_bridge = serialized.IsBridge()
        if serialized.Pattern():
            func_node.pattern = self._deserialize_pattern(serialized.Pattern())
        func_node.pattern_offset = serialized.PatternOffset()
        return func_node

    def _deserialize_trie_node(self, serialized):
        """Recursively rebuild a `TrieNode`; `self.funcs` must already be populated."""
        children = {}
        prev = float('-inf')
        for i in range(serialized.ChildrenLength()):
            child = serialized.Children(i)
            children[signaturelibrary.MaskedByte.new(child.PatternPrefix(), 1)] = self._deserialize_trie_node(child)
            assert child.PatternPrefix() >= prev  # assert sorted
            prev = child.PatternPrefix()
        wildcard = serialized.WildcardChild()
        if wildcard:
            children[signaturelibrary.MaskedByte.wildcard] = self._deserialize_trie_node(wildcard)
        funcs = [self.funcs[serialized.Functions(i)] for i in range(serialized.FunctionsLength())]
        pattern = self._deserialize_pattern(serialized.Pattern())
        return signaturelibrary.TrieNode(pattern, children, funcs)

    def deserialize(self, buf):
        """
        Loads a signature library from an in-memory buffer.
        This implementation is extremely inefficient! Use it for debugging and signature library generation only.

        :param buf: bytes-like object
        :return: root `TrieNode` of the signature library
        :raises RuntimeError: if the magic is wrong, the header is truncated or the version is unsupported
        """
        if buf[0:4] != SIG_FORMAT_MAGIC:
            raise RuntimeError('invalid signature library magic')
        if len(buf) < 5:
            raise RuntimeError('truncated signature library header')
        version = ord(buf[4:5])
        if version != SIG_FORMAT_VERSION:
            raise RuntimeError('signature version mismatch: got %d, expected %d' % (version, SIG_FORMAT_VERSION))
        buf = zlib.decompress(buf[5:])
        serialized = FlatBufSignatureLibrary.SignatureLibrary.GetRootAsSignatureLibrary(buf, 0)

        # Function ids index into self.funcs, so drop anything left over from an earlier call.
        self.funcs = []
        funcs_serialized = []
        for i in range(serialized.FunctionsLength()):
            f = serialized.Functions(i)
            funcs_serialized.append(f)
            self.funcs.append(self._deserialize_func_node(f))
        for i, f in enumerate(funcs_serialized):  # link callgraph
            callees = {}
            prev = float('-inf')
            for j in range(f.CalleesLength()):
                callsite = f.Callees(j)
                callees[callsite.Offset()] = None if callsite.DstId() == -1 else self.funcs[callsite.DstId()]
                assert callsite.Offset() >= prev  # assert sorted
                prev = callsite.Offset()
            self.funcs[i].callees = callees

        trie = self._deserialize_trie_node(serialized.Root())
        for func in trie.all_values():  # recalculate refcounts
            func.ref_count += 1
        return trie


def dumps(sig_trie, **kwargs):
    """Serialize `sig_trie` to bytes; keyword arguments are passed to `SignatureLibraryWriter`."""
    return SignatureLibraryWriter(**kwargs).serialize(sig_trie)


def dump(sig_trie, fp, **kwargs):
    """Serialize `sig_trie` and write it to the binary file object `fp`."""
    fp.write(dumps(sig_trie, **kwargs))


def loads(serialized):
    """Load a signature trie from a bytes-like object."""
    return SignatureLibraryReader().deserialize(serialized)


def load(fp):
    """Load a signature trie from the binary file object `fp`."""
    return loads(fp.read())
obtaining a copy 4 | # of this software and associated documentation files (the "Software"), to 5 | # deal in the Software without restriction, including without limitation the 6 | # rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 7 | # sell copies of the Software, and to permit persons to whom the Software is 8 | # furnished to do so, subject to the following conditions: 9 | # 10 | # The above copyright notice and this permission notice shall be included in 11 | # all copies or substantial portions of the Software. 12 | # 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 19 | # IN THE SOFTWARE. 20 | 21 | """ 22 | JSON serialization / deserialization 23 | """ 24 | 25 | import json 26 | 27 | from . 
def _serialize_func_node(func_node, func_node_ids):
    """Describe one function node as a JSON-compatible dict; callees are referenced by function id."""
    callee_ids = {}
    for call_site, callee in func_node.callees.items():
        # JSON object keys must be strings, so the call-site offset is stringified
        callee_ids[str(call_site)] = func_node_ids[callee]
    return {
        'name': func_node.name,
        'source_binary': func_node.source_binary,
        'pattern': str(func_node.pattern),
        'pattern_offset': func_node.pattern_offset,
        'callees': callee_ids,
        'is_bridge': func_node.is_bridge
    }


def _serialize_trie_node(trie_node, func_node_ids):
    """Recursively describe a trie node as a JSON-compatible dict."""
    serialized_children = {}
    for key, child in trie_node.children.items():
        serialized_children[str(key)] = _serialize_trie_node(child, func_node_ids)
    function_ids = []
    if trie_node.value:
        function_ids = [func_node_ids[func_node] for func_node in trie_node.value]
    return {
        'pattern': str(trie_node.pattern),
        'children': serialized_children,
        'functions': function_ids,
    }


def serialize(sig_trie):
    """
    Serialize a signature trie to a JSON-compatible format.

    :param sig_trie: `TrieNode` object
    :return: a python dictionary ready for serialization as JSON
    """
    ordered_funcs = []
    ids = {None: -1}  # a missing callee is written as id -1

    def assign_id(func_node):
        # depth-first numbering: a function first, then its not-yet-numbered callees
        if func_node not in ids:
            ids[func_node] = len(ordered_funcs)
            ordered_funcs.append(func_node)
            for callee in func_node.callees.values():
                assign_id(callee)

    for func_node in sig_trie.all_values():
        assign_id(func_node)

    serialized_funcs = [_serialize_func_node(func_node, ids) for func_node in ordered_funcs]
    return {
        'functions': serialized_funcs,
        'trie': _serialize_trie_node(sig_trie, ids)
    }


def _deserialize_pattern(serialized):
    """Parse a pattern from its string form."""
    return signaturelibrary.Pattern.from_str(serialized)


def _deserialize_func_node(serialized):
    """Rebuild a `FunctionNode` from its dict form; callees are linked later by `deserialize`."""
    node = signaturelibrary.FunctionNode(serialized['name'])
    node.source_binary = serialized['source_binary']
    node.pattern = _deserialize_pattern(serialized['pattern'])
    node.pattern_offset = serialized['pattern_offset']
    # node.is_bridge = serialized['is_bridge']
    return node


def _deserialize_trie_node(serialized, funcs_arr):
    """Recursively rebuild a `TrieNode`; function ids are looked up in `funcs_arr`."""
    pattern = _deserialize_pattern(serialized['pattern'])
    children = {}
    for key, child in serialized['children'].items():
        children[signaturelibrary.MaskedByte.from_str(key)] = _deserialize_trie_node(child, funcs_arr)
    if serialized['functions']:
        functions = [funcs_arr[func_id] for func_id in serialized['functions']]
    else:
        functions = []
    return signaturelibrary.TrieNode(pattern, children, functions)


def deserialize(serialized):
    """
    Deserialize a signature trie from JSON data.

    :param serialized: a dict containing JSON-format data to signature trie objects.
    :return: the root `TrieNode`
    """
    funcs_serialized = serialized['functions']
    funcs = [_deserialize_func_node(entry) for entry in funcs_serialized]
    for func_node, entry in zip(funcs, funcs_serialized):  # link callgraph
        linked = {}
        for call_site, callee_id in entry['callees'].items():
            linked[int(call_site)] = None if callee_id == -1 else funcs[callee_id]
        func_node.callees = linked

    return _deserialize_trie_node(serialized['trie'], funcs)


def dumps(sig_trie, *args, **kwargs):
    """Serialize `sig_trie` to a JSON string; extra arguments are passed to `json.dumps`."""
    return json.dumps(serialize(sig_trie), *args, **kwargs)


def dump(sig_trie, fp, *args, **kwargs):
    """Serialize `sig_trie` as JSON into the file object `fp`; extra arguments are passed to `json.dump`."""
    return json.dump(serialize(sig_trie), fp, *args, **kwargs)


def loads(serialized, *args, **kwargs):
    """Load a signature trie from a JSON string; extra arguments are passed to `json.loads`."""
    return deserialize(json.loads(serialized, *args, **kwargs))


def load(fp, *args, **kwargs):
    """Load a signature trie from the JSON file object `fp`; extra arguments are passed to `json.load`."""
    return deserialize(json.load(fp, *args, **kwargs))
merge, publish, distribute, sublicense, and/or 9 | # sell copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 | # IN THE SOFTWARE. 22 | 23 | from __future__ import print_function 24 | 25 | import sys 26 | import os 27 | 28 | import binaryninjaui 29 | if "qt_major_version" in binaryninjaui.__dict__ and binaryninjaui.qt_major_version == 6: 30 | from PySide6.QtCore import (Qt, QRect, QItemSelectionModel, QItemSelection, QSize, Signal) 31 | from PySide6.QtGui import (QStandardItemModel, QIcon, QStandardItem, QKeySequence, QFont, QBrush, QTextDocument, 32 | QCursor, QFontDatabase, QPalette, QAction) 33 | from PySide6.QtWidgets import (QApplication, QTreeView, QVBoxLayout, QWidget, QMenu, QMainWindow, QFileDialog, 34 | QStyledItemDelegate, QStyle, QGroupBox, QHBoxLayout, QPushButton, QAbstractItemView, 35 | QInputDialog, QMessageBox, QLabel) 36 | else: 37 | from PySide2.QtCore import (Qt, QRect, QItemSelectionModel, QItemSelection, QSize, Signal) 38 | from PySide2.QtGui import (QStandardItemModel, QIcon, QStandardItem, QKeySequence, QFont, QBrush, QTextDocument, 39 | QCursor, QFontDatabase, QPalette) 40 | from PySide2.QtWidgets import (QApplication, QTreeView, QVBoxLayout, QWidget, QMenu, QAction, QMainWindow, QFileDialog, 41 | QStyledItemDelegate, QStyle, 
QGroupBox, QHBoxLayout, QPushButton, QAbstractItemView, 42 | QInputDialog, QMessageBox, QLabel) 43 | 44 | import pickle 45 | import json 46 | import zlib 47 | 48 | if __name__ == "__main__" and __package__ is None: 49 | __package__ = os.path.basename(os.getcwd()) 50 | sys.path.append(os.path.dirname(os.getcwd())) 51 | __import__(__package__) # python2 compat 52 | print('Please run with python -m %s.%s instead of %s directly.' % (__package__, os.path.splitext(__file__)[0], __file__)) 53 | 54 | from . import sig_serialize_json 55 | from . import sig_serialize_fb 56 | 57 | class App(QMainWindow): 58 | def __init__(self): 59 | super(App, self).__init__() 60 | 61 | self.treeView = None 62 | self.model = None 63 | self.pattern_delegate = None 64 | self.callee_delegate = None 65 | self.sig_trie = None 66 | 67 | self.searchResults = None 68 | self.searchIndex = -1 69 | self.findNextAction = None 70 | self.findPrevAction = None 71 | 72 | # these two maps are used to make the hyperlinks work 73 | # mapping from href to FunctionNode 74 | self.hrefs_to_funcs = {} 75 | # mapping from FunctionNode to tree view element (QStandardItem) 76 | self.func_node_items = {} 77 | 78 | self.init_ui() 79 | 80 | def init_ui(self): 81 | self.setWindowTitle('Signature Explorer') 82 | self.resize(1000, 640) 83 | app_icon = QIcon() 84 | app_icon.addFile('icon.ico', QSize(48,48)) 85 | self.setWindowIcon(app_icon) 86 | 87 | self.pattern_delegate = PatternDelegate() 88 | self.callee_delegate = CalleesDelegate() 89 | 90 | self.treeView = TrieView() 91 | # self.treeView.setAlternatingRowColors(True) 92 | 93 | self.model = QStandardItemModel(0, 7, self.treeView) 94 | self.model.setHeaderData(0, Qt.Horizontal, 'Signature') 95 | self.model.setHeaderData(1, Qt.Horizontal, 'Function') 96 | self.model.setHeaderData(2, Qt.Horizontal, 'Callees') 97 | self.model.setHeaderData(3, Qt.Horizontal, 'Offset Extra Pattern') 98 | self.model.setHeaderData(4, Qt.Horizontal, 'Extra Pattern') 99 | 
self.model.setHeaderData(5, Qt.Horizontal, 'Source Binary') 100 | self.model.setHeaderData(6, Qt.Horizontal, 'ID') 101 | self.treeView.setModel(self.model) 102 | 103 | self.treeView.setSelectionBehavior(QAbstractItemView.SelectRows) 104 | self.treeView.setColumnWidth(0, 400) 105 | self.treeView.setColumnWidth(1, 200) 106 | self.treeView.setColumnWidth(2, 250) 107 | self.treeView.setColumnWidth(3, 25) 108 | self.treeView.setColumnWidth(4, 100) 109 | self.treeView.setColumnWidth(5, 200) 110 | self.treeView.setColumnWidth(6, 75) 111 | self.treeView.setItemDelegateForColumn(0, self.pattern_delegate) 112 | self.treeView.setItemDelegateForColumn(2, self.callee_delegate) 113 | self.treeView.setItemDelegateForColumn(4, self.pattern_delegate) 114 | self.treeView.horizontalScrollBar().setEnabled(True) 115 | self.treeView.setHorizontalScrollBarPolicy(Qt.ScrollBarAsNeeded) 116 | self.treeView.setEditTriggers(QAbstractItemView.NoEditTriggers) 117 | self.treeView.linkActivated.connect(self.on_func_link_clicked) 118 | # self.treeView.expanded.connect(lambda x: self.treeView.resizeColumnToContents(1)) 119 | # self.treeView.collapsed.connect(lambda x: self.treeView.resizeColumnToContents(1)) 120 | 121 | main_layout = QVBoxLayout() 122 | main_layout.addWidget(self.treeView) 123 | 124 | panel = QWidget() 125 | panel.setLayout(main_layout) 126 | self.setCentralWidget(panel) 127 | 128 | menuBar = self.menuBar() 129 | 130 | fileMenu = QMenu("File") 131 | openAction = QAction("&Open", self) 132 | openAction.setShortcuts(QKeySequence.Open) 133 | openAction.triggered.connect(self.open_file) 134 | fileMenu.addAction(openAction) 135 | 136 | closeAction = QAction("&Close", self) 137 | closeAction.setShortcuts(QKeySequence.Close) 138 | closeAction.triggered.connect(self.close_file) 139 | fileMenu.addAction(closeAction) 140 | 141 | saveAsAction = QAction("Save As...", self) 142 | saveAsAction.setShortcuts(QKeySequence.Save) 143 | saveAsAction.triggered.connect(self.save_as) 144 | 
fileMenu.addAction(saveAsAction) 145 | 146 | menuBar.addMenu(fileMenu) 147 | 148 | editMenu = QMenu("Edit") 149 | 150 | findAction = QAction("&Find", self) 151 | findAction.setShortcuts(QKeySequence.Find) 152 | findAction.triggered.connect(self.search) 153 | editMenu.addAction(findAction) 154 | 155 | self.findNextAction = QAction("&Find Next", self) 156 | self.findNextAction.setShortcuts(QKeySequence.FindNext) 157 | self.findNextAction.triggered.connect(self.select_next) 158 | self.findNextAction.setEnabled(False) 159 | editMenu.addAction(self.findNextAction) 160 | 161 | self.findPrevAction = QAction("&Find Prev", self) 162 | self.findPrevAction.setShortcuts(QKeySequence.FindPrevious) 163 | self.findPrevAction.triggered.connect(self.select_prev) 164 | self.findPrevAction.setEnabled(False) 165 | editMenu.addAction(self.findPrevAction) 166 | 167 | menuBar.addMenu(editMenu) 168 | 169 | viewMenu = QMenu("View") 170 | 171 | expandAction = QAction("&Expand All", self) 172 | expandAction.triggered.connect(self.treeView.expandAll) 173 | viewMenu.addAction(expandAction) 174 | 175 | collapseAction = QAction("&Collapse All", self) 176 | collapseAction.triggered.connect(self.treeView.collapseAll) 177 | viewMenu.addAction(collapseAction) 178 | 179 | menuBar.addMenu(viewMenu) 180 | 181 | def search(self): 182 | query_string, ok = QInputDialog.getText(self, 'Find in Trie', 'Function name') 183 | if not ok or not query_string: 184 | return 185 | 186 | self.searchResults = self.model.findItems(query_string, Qt.MatchContains | Qt.MatchRecursive, 1) 187 | 188 | if self.searchResults: 189 | self.findNextAction.setEnabled(True) 190 | self.findPrevAction.setEnabled(True) 191 | self.searchIndex = 0 192 | self.select_next() 193 | else: 194 | self.findNextAction.setEnabled(False) 195 | self.findPrevAction.setEnabled(False) 196 | self.searchIndex = -1 197 | QMessageBox.warning(self, 'Find in Trie', 'No results found') 198 | 199 | def select_next(self): 200 | next_item = 
def select_prev(self):
    """
    Select the search result that precedes the one currently shown.

    `select_next` displays `searchResults[searchIndex]` and then advances
    `searchIndex`, so the index always points one past the displayed result.
    Going backwards therefore means stepping back two slots from the index,
    showing that result, and leaving the index one past it again so that a
    following `select_next` continues from the right place.
    """
    results = self.searchResults
    if not results:  # nothing to navigate (e.g. the last search had no hits)
        return
    count = len(results)
    shown = (self.searchIndex - 2) % count
    self.searchIndex = (shown + 1) % count
    self.select_tree_item(results[shown])
def save_as(self):
    """
    Ask the user for a destination and write the loaded signature trie to it.

    The output format follows the file-type filter chosen in the dialog:
    flatbuffer (*.sig), zlib-compressed JSON (*.json.zlib), JSON (*.json) or
    pickle (*.pkl). Nothing is written when no trie is loaded, when the dialog
    is cancelled, or when the selected filter is not one of the above.
    """
    sig_filter = 'Signature library (*.sig)'
    json_zlib_filter = 'Compressed JSON signature library (*.json.zlib)'
    json_filter = 'JSON signature library (*.json)'
    pkl_filter = 'Pickled signature library (*.pkl)'

    sig_trie = getattr(self, 'sig_trie', None)
    if sig_trie is None:
        QMessageBox.warning(self, 'Save As', 'No signature library is open')
        return

    fname, selected_filter = QFileDialog.getSaveFileName(
        self, 'Save As', filter=';;'.join([sig_filter, json_zlib_filter, json_filter, pkl_filter]))
    if not fname:  # dialog was cancelled
        return

    # Serialize before opening the destination, so that a serialization
    # failure cannot leave an empty or truncated file behind.
    if selected_filter == json_zlib_filter:
        # serialize() produces a JSON-compatible object (the plain JSON branch
        # hands it to the json module), so it must be dumped to text before it
        # can be encoded and compressed.
        text = json.dumps(sig_serialize_json.serialize(sig_trie))
        data, mode = zlib.compress(text.encode('utf-8')), 'wb'
    elif selected_filter == json_filter:
        data, mode = json.dumps(sig_serialize_json.serialize(sig_trie), indent=4), 'w'
    elif selected_filter == sig_filter:
        data, mode = sig_serialize_fb.SignatureLibraryWriter().serialize(sig_trie), 'wb'
    elif selected_filter == pkl_filter:
        data, mode = pickle.dumps(sig_trie), 'wb'
    else:
        return

    with open(fname, mode) as f:
        f.write(data)
    print('Saved as ' + fname)
# Add bridge nodes to a special node at the root
def add_bridge_nodes(self, parent, sig_trie):
    """
    Append a '(bridge)' row under `parent` and list every bridge function under it.

    Bridge functions are found by walking the call graph from each value stored
    in `sig_trie`; every function is visited at most once.

    :param parent: item that receives the '(bridge)' row
    :param sig_trie: trie whose `all_values()` seed the call-graph walk
    """
    bridge_item = QStandardItem('(bridge)')
    parent.appendRow([bridge_item, QStandardItem('')])

    visited = set()
    for root in sig_trie.all_values():
        # Explicit stack instead of recursion, so a deep call graph cannot hit
        # the interpreter's recursion limit. Callees are pushed in reverse so
        # they are popped, and therefore listed, in their original order.
        stack = [root]
        while stack:
            func = stack.pop()
            if func is None or func in visited:
                continue
            visited.add(func)
            if func.is_bridge:
                self.add_func_node(bridge_item, QStandardItem(''), func)
            # add_func_node tolerates an empty/None `callees`; do the same here.
            callees = func.callees or {}
            stack.extend(reversed(list(callees.values())))
# copy-pasted off https://stackoverflow.com/questions/55923137/ lol
class PatternDelegate(QStyledItemDelegate):
    """
    Item delegate that draws a cell's text one character at a time in the
    system fixed-width font, painting runs of '?' in red and cutting the text
    off with an ellipsis when it would run past the right edge of the cell.
    """

    def __init__(self):
        super(PatternDelegate, self).__init__()
        # font used for every cell painted by this delegate
        self.font = QFontDatabase.systemFont(QFontDatabase.FixedFont)

    def paint(self, painter, option, index):
        """
        Paint the cell at `index`. Cells whose data is None are left unpainted.
        Note that `option` is modified in place (its text and rect are changed).
        """
        if index.data() is None:
            return
        painter.save()

        painter.setFont(self.font)
        defaultPen = painter.pen()  # remembered so the pen can be reset after a red run
        self.initStyleOption(option, index)
        style = option.widget.style()
        option.text = '' # wipe out the text passed to the original renderer, so just have it render the background
        style.drawControl(QStyle.CE_ItemViewItem, option, painter, option.widget)

        offset = 3  # horizontal padding applied on both sides of the text
        ellipsis = '…'
        ellipsisWidth = painter.fontMetrics().horizontalAdvance(ellipsis)
        rightBorder = option.rect.left() + option.rect.width() - offset

        # shift the drawing rect right by the padding before the first character
        option.rect.moveRight(option.rect.right() + offset)

        textRole = QPalette.NoRole
        if option.state & QStyle.State_Selected:
            textRole = QPalette.HighlightedText

        # color is a two-state flag: 0 = drawing with the default pen, 1 = inside a run of '?' (red pen)
        color = 0
        painter.setPen(defaultPen)
        for c in index.data():
            if color == 0 and c == '?': # little fsm
                color = 1
                painter.setPen(Qt.red)
            elif color == 1 and c != '?':
                color = 0
                painter.setPen(defaultPen)

            charWidth = painter.fontMetrics().horizontalAdvance(c)
            drawRect = option.rect
            # not enough room left for this character plus an ellipsis: draw the ellipsis and stop
            if drawRect.left() + charWidth + ellipsisWidth > rightBorder:
                style.drawItemText(painter, drawRect, option.displayAlignment, option.palette, True, ellipsis, textRole)
                break

            style.drawItemText(painter, drawRect, option.displayAlignment, option.palette, True, c, textRole)

            # advance the drawing rect by the width of the character just drawn
            option.rect.moveRight(option.rect.right() + charWidth)


        painter.restore()
class TrieView(QTreeView):
    """
    Tree view that tracks hyperlinks inside cells drawn by `CalleesDelegate`.

    Emits `linkHovered(anchor)` / `linkUnhovered()` as the mouse moves on and
    off a link, and `linkActivated(anchor)` when a link is pressed and released
    without the mouse leaving it. While a link is hovered, the application-wide
    cursor is overridden with a pointing hand.
    """
    linkUnhovered = Signal()
    linkHovered = Signal(str)
    linkActivated = Signal(str)

    def __init__(self, *args, **kwargs):
        super(TrieView, self).__init__(*args, **kwargs)
        self.setMouseTracking(True)  # receive move events without a button held
        self._mousePressAnchor = ''
        self._lastHoveredAnchor = ''

    def mousePressEvent(self, event):
        """Remember which link (if any) the press started on."""
        super(TrieView, self).mousePressEvent(event)
        self._mousePressAnchor = self.anchorAt(event.pos())

    def mouseMoveEvent(self, event):
        """Update hover state, cursor override and hover signals."""
        anchor = self.anchorAt(event.pos())

        # moving off the pressed link cancels the pending activation
        if self._mousePressAnchor != anchor:
            self._mousePressAnchor = ''

        if self._lastHoveredAnchor != anchor:
            was_over_link = bool(self._lastHoveredAnchor)
            self._lastHoveredAnchor = anchor
            if anchor:
                # Override cursors are kept on a stack: push only on the
                # transition from "no link" to "link". Moving straight from one
                # link to another must not push a second entry, because only a
                # single restore happens when the mouse finally leaves.
                if not was_over_link:
                    QApplication.setOverrideCursor(QCursor(Qt.PointingHandCursor))
                self.linkHovered.emit(anchor)
            else:
                QApplication.restoreOverrideCursor()
                self.linkUnhovered.emit()

    def leaveEvent(self, event):
        """Drop the hover state when the mouse leaves the view while over a link."""
        if self._lastHoveredAnchor:
            # No further move events arrive once the mouse is outside the view,
            # so the application-wide override cursor has to be popped here.
            self._lastHoveredAnchor = ''
            QApplication.restoreOverrideCursor()
            self.linkUnhovered.emit()
        super(TrieView, self).leaveEvent(event)

    def mouseReleaseEvent(self, event):
        """Emit `linkActivated` if the release happens on the link that was pressed."""
        if self._mousePressAnchor:
            anchor = self.anchorAt(event.pos())

            if anchor == self._mousePressAnchor:
                self.linkActivated.emit(self._mousePressAnchor)

            self._mousePressAnchor = ''

        super(TrieView, self).mouseReleaseEvent(event)

    def anchorAt(self, pos):
        """
        Return the link target under `pos`, or '' if there is none.

        Only cells rendered by a `CalleesDelegate` are inspected; the lookup is
        delegated to it with `pos` translated to cell-relative coordinates.
        """
        index = self.indexAt(pos)
        if index.isValid():
            delegate = self.itemDelegate(index)
            if isinstance(delegate, CalleesDelegate):
                itemRect = self.visualRect(index)
                relativeClickPosition = pos - itemRect.topLeft()

                html = index.data()
                if html is not None:
                    return delegate.anchorAt(html, relativeClickPosition)

        return ''
506 | :param sig_trie: instance of `TrieNode` 507 | """ 508 | if not QApplication.instance(): 509 | app = QApplication(sys.argv) 510 | else: 511 | app = None 512 | widget = App() 513 | widget.show() 514 | widget.open_trie(sig_trie, '(memory)') 515 | if app: 516 | app.exec_() 517 | 518 | if __name__ == "__main__": 519 | app = QApplication(sys.argv) 520 | 521 | widget = App() 522 | widget.show() 523 | 524 | sys.exit(app.exec_()) 525 | -------------------------------------------------------------------------------- /sigkit/signaturelibrary.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015-2020 Vector 35 Inc 2 | # 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy 4 | # of this software and associated documentation files (the "Software"), to 5 | # deal in the Software without restriction, including without limitation the 6 | # rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 7 | # sell copies of the Software, and to permit persons to whom the Software is 8 | # furnished to do so, subject to the following conditions: 9 | # 10 | # The above copyright notice and this permission notice shall be included in 11 | # all copies or substantial portions of the Software. 12 | # 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 19 | # IN THE SOFTWARE. 20 | 21 | """ 22 | This package contains definitions for the data structures and objects used in 23 | Signature Libraries. To construct a new empty signature trie, use `new_trie`. 
@functools.total_ordering  # for sorted()
class MaskedByte(object):
    """
    Represents a pattern to match a single byte: either a value from 0-255, or a wildcard, '??'
    Algebraically, you can imagine that there is a partial ordering where 0-255 < ??, or
    alternatively, a total ordering where 0 < 1 < 2 < ... < 255 < ??

    This class is backed by a flyweight cache. Use `MaskedByte.new` to construct.
    """

    wildcard = None  # the single wildcard instance, assigned right after the class body
    cache = []       # cache[v] is the instance matching byte value v, filled in after the class body

    def __init__(self, value, mask):
        self._value = value
        self._mask = mask

    @property
    def value(self):
        return self._value

    @property
    def mask(self):
        return self._mask

    @staticmethod
    def new(value, mask):
        """
        Return the shared instance for `value`/`mask`.
        :param value: int in 0..255
        :param mask: 1 to match exactly `value`, 0 for a wildcard (in which case `value` is ignored)
        """
        assert isinstance(value, int)
        assert 0 <= value <= 255
        assert mask == 0 or mask == 1
        if mask == 0:
            return MaskedByte.wildcard
        return MaskedByte.cache[value]

    @staticmethod
    def from_str(s):
        """Parse a two-character token: '??' for a wildcard, otherwise two hex digits."""
        assert len(s) == 2
        if s == '??':
            return MaskedByte.wildcard
        return MaskedByte.new(int(s, 16), 1)

    def __str__(self):
        return '%02x' % (self._value,) if self._mask == 1 else '??'

    def __repr__(self):
        return self.__str__()

    def __eq__(self, other):
        if type(other) != type(self):
            return NotImplemented  # lets Python fall back to its default, i.e. "not equal"
        return self.matches(other) and other.matches(self)

    # Python 2 does not derive != from ==, so spell it out.
    def __ne__(self, other):
        result = self.__eq__(other)
        return result if result is NotImplemented else not result

    # 0-255 for concrete bytes, 256 for the wildcard; also used as the sort key in __le__
    def __hash__(self):
        if self._mask == 0:
            return 256
        return self._value  # 0-255

    # this defines a total ordering.
    # it is only here for sorting purposes in python, no fancy algebraic interpretation behind it.
    def __le__(self, other):
        if type(other) != type(self):
            return NotImplemented
        return self.__hash__() <= other.__hash__()

    def matches(self, other):
        """
        Defines a *partial* ordering, essentially a >= operator algebraically:
        (00...FF) <= ??; other elements are incomparable.
        :param other: MaskedByte, int byte value, or a length-1 byte string
        :return: True if all bytes matched by `other` are also matched by this
        """
        if self._mask == 0:
            return True
        if isinstance(other, MaskedByte):
            return other._mask != 0 and self._value == other._value
        if isinstance(other, (bytes, str)):
            # single element of a byte string (what indexing/iterating one yields on Python 2)
            assert len(other) == 1
            return self._value == ord(other)
        assert isinstance(other, int)
        return self._value == other

    # Meet operator
    def intersect(self, other):
        """Return the MaskedByte matching what both operands match, or None if there is no such byte."""
        assert isinstance(other, MaskedByte)
        if self._mask == 0:
            return other  # wildcard ⨅ x = x (including wildcard ⨅ wildcard)
        if other._mask == 0:
            return self
        if self._value == other._value:
            return self
        return None  # NO intersection!

    # Join operator
    def union(self, other):
        """Return the smallest MaskedByte matching everything either operand matches."""
        assert isinstance(other, MaskedByte)
        if self._mask == 1 and other._mask == 1 and self._value == other._value:
            return self
        return MaskedByte.wildcard  # differing bytes can only be joined by a wildcard
MaskedByte.wildcard = MaskedByte(0, 0)
MaskedByte.cache = [MaskedByte(value, 1) for value in range(256)]
159 | """ 160 | 161 | def __init__(self, data, mask): 162 | """ 163 | Constructs a new pattern object 164 | :param data: bytes-like object, byte sequence of this pattern. 165 | :param mask: wildcard mask for the pattern. must be the same length as `data`. array of 0 or 1, 0 means wildcard at that position 166 | :return: 167 | """ 168 | assert len(data) == len(mask) 169 | assert type(data) == bytes 170 | assert type(mask) == list 171 | for elem in mask: assert elem == 0 or elem == 1 172 | self._array = tuple(MaskedByte.new(bytes_ord(data[i]), mask[i]) for i in range(len(data))) 173 | 174 | @staticmethod 175 | def from_str(s): 176 | if len(s) % 2: 177 | raise ValueError('odd pattern length ' + str(len(s)) + ': ' + s) 178 | p = Pattern(b'', []) 179 | p._array = tuple(MaskedByte.from_str(s[i:i + 2]) for i in range(0, len(s), 2)) 180 | return p 181 | 182 | def __str__(self): 183 | return ''.join(map(str, self._array)) 184 | 185 | def __getitem__(self, item): 186 | if isinstance(item, slice): 187 | p = Pattern(b'', []) 188 | p._array = self._array.__getitem__(item) 189 | return p 190 | return self._array.__getitem__(item) 191 | 192 | def __len__(self): 193 | return self._array.__len__() 194 | 195 | def __iter__(self): 196 | return self._array.__iter__() 197 | 198 | def __eq__(self, other): 199 | if not type(other) == type(self): 200 | return False 201 | return self._array.__eq__(other._array) 202 | 203 | def __hash__(self): 204 | return self._array.__hash__() 205 | 206 | def matches(self, buf): 207 | """ 208 | Checks if this Pattern matches `buf`. 
209 | :param buf: Pattern or bytestring 210 | :return: True if all bytes matched by `other` are also matched by this 211 | """ 212 | if len(self._array) > len(buf): return False 213 | return all(starmap(MaskedByte.matches, zip(self._array, buf))) 214 | 215 | # Meet operator 216 | def intersect(self, other): 217 | assert isinstance(other, Pattern) 218 | # right-pad with wildcard 219 | size = max(len(self._array), len(other._array)) 220 | array1 = self._array + tuple([MaskedByte.wildcard] * (size - len(self._array))) 221 | array2 = other._array + tuple([MaskedByte.wildcard] * (size - len(other._array))) 222 | result_array = tuple(starmap(MaskedByte.intersect, zip(array1, array2))) 223 | if not all(result_array): return None # No intersection! 224 | p = Pattern(b'', []) 225 | p._array = result_array 226 | return p 227 | 228 | # Join operator 229 | def union(self, other): 230 | assert isinstance(other, Pattern) 231 | # length truncated to smallest 232 | result_array = tuple(starmap(MaskedByte.union, zip(self._array, other._array))) 233 | p = Pattern(b'', []) 234 | p._array = result_array 235 | return p 236 | 237 | def data(self): 238 | for b in self._array: 239 | yield b.value 240 | 241 | def mask(self): 242 | for b in self._array: 243 | yield b.mask 244 | 245 | 246 | class FunctionInfo(object): 247 | """ 248 | Stores additional information about functions that are useful while generating and manipulating 249 | signature libraries, but excluded from the finalized signature library to save space. 250 | This information is also used to simulate linking when generating the call-graph. 
251 | """ 252 | def __init__(self): 253 | self.patterns = None 254 | """list of `Pattern`s which match this function""" 255 | 256 | self.callees = None 257 | """dictionary of {offset: (destination name, `ReferenceType`)}; other symbols this function calls""" 258 | 259 | self.aliases = None 260 | """list of string containing other possible names that could link to this function""" 261 | 262 | def __str__(self): 263 | return '' 264 | 265 | 266 | class FunctionNode(object): 267 | """ 268 | Represents a function that we would like to match and contains relevant metadata for matching purposes. 269 | Function nodes are connected with each other by a call graph. This helps not only encode information about 270 | individual functions but also the relationships between them when matching. 271 | Each FunctionNode is a vertex of the call graph, represented by an edge list stored in `callees`. 272 | 273 | To create a FunctionNode for a given function, see `compute_sig.process_function`. 274 | """ 275 | def __init__(self, name): 276 | self.name = name 277 | """The name of the matched function""" 278 | 279 | self.source_binary = '' 280 | """The filename of the binary that the function came from (malloc.o for example). Optional.""" 281 | 282 | # used to disambiguate when multiple FunctionNodes are matched 283 | self.pattern = Pattern(b'', []) 284 | self.pattern_offset = 0 285 | 286 | self.callees = {} 287 | """Forms a callgraph with other `FunctionNodes`. 
Dict of {call_offset: destination}.""" 288 | 289 | self.ref_count = 0 290 | """Number of places this node is in its signature trie""" 291 | 292 | @property 293 | def is_bridge(self): 294 | return self.ref_count == 0 295 | 296 | def __str__(self): 297 | return '' 298 | 299 | def __repr__(self): 300 | result = ' 12 and len(b.name) > 12: 54 | return a.name in b.name or b.name in a.name 55 | return False 56 | 57 | 58 | # mathematically speaking, this is defines a less-than-or-equal-to operation over the 59 | # set of function signatures and therefore a partial ordering over that set. 60 | # Let A, B both be signatures. we say that that A <= B if all functions that A matches 61 | # are also matched by B. In other words, for a signature library containing B, it would be 62 | # redundant to add A, since B already already matches all of the functions that A would. 63 | # 64 | # Let A ⨅ B denote a signature that would match all functions matched by both A and B. 65 | # ⨅ defines a "meet" relationship on the lattice of signatures. 66 | # in other words, A <= B iff A ⨅ B = B (and commutatively, B ⨅ A = A). 67 | # concretely, A ⨅ B is equivalent to the signature with pattern and callees that is the 68 | # intersection those of A and B. thus, we can check if A = (A ⨅ B), and likewise A <= B, 69 | # by matching A's pattern directly against B's pattern. 70 | # 71 | # the greatest element of this lattice is None, a signature that matches all functions. 72 | # 73 | # during optimization during trie finalization, we delete all non-maximal signatures 74 | # in the signature trie; i.e., all signatures which are less than another one (and therefore 75 | # redundant) are eliminated. the downside to this approach is that we will lose a degree 76 | # of specificity: consider a function which matches B but doesn't match A (where A <= B). 77 | # we could choose to keep both A and B in the signature library, but if we encountered such 78 | # a function, what should we do with it? 
how do we distinguish between functions which match 79 | # both A and B as opposed to ones which only match B? more importantly, what name should 80 | # be assigned to such a function? therefore, it's meaningless to include both A and B, and 81 | # eliminating the redundancy also reduces the ambiguity of matches. 82 | # 83 | # this function returns whether A <= B, for two function signatures A and B. 84 | # 85 | # sometimes, we don't have function info (e.g., function bytes) for both nodes. this is typically 86 | # when we're trying to merge additional nodes into a trie that we don't have FunctionInfo for. 87 | # in this case, we only need function info for the nodes we're trying to merge in by exploiting 88 | # the signature trie. We know A <= B if searching for A's data in the trie matches B. 89 | def is_signature_subset(a, func_info, b, sig_trie, visited): 90 | """ 91 | :param a: FunctionNode to check whether is a subset of B 92 | :param func_info: dict containing the function info for A's trie 93 | :param b: FunctionNode to check whether it contains A 94 | :param sig_trie: the trie B belongs to. 95 | :param visited: visited set, should be initialized to {} 96 | :return: whether A matches a subset of what B matches 97 | """ 98 | if a == b: 99 | return True 100 | if int(a is None) < int(b is None): 101 | return True 102 | if int(a is None) > int(b is None): 103 | return False 104 | assert isinstance(a, signaturelibrary.FunctionNode) 105 | assert isinstance(b, signaturelibrary.FunctionNode) 106 | assert a in func_info 107 | 108 | # this is essentially a dfs on the callgraph. if we encounter a backedge, 109 | # treat it optimistically, implying that the callers match if the callees match. 110 | # however, we track our previous assumptions, meaning that if we previously 111 | # optimistically assumed b == a, then later on if we compare b and c, we say 112 | # that b != c since we already assumed b == a (and we already checked above that c != a). 
113 | if b in visited: 114 | return visited[b] == a 115 | visited[b] = a 116 | 117 | # if A is bridge, but B isn't, A is obviously more ambiguous than B. (and vice versa) 118 | if int(a.is_bridge) < int(b.is_bridge): 119 | return True 120 | if int(a.is_bridge) > int(b.is_bridge): 121 | return False 122 | 123 | if not b.is_bridge: 124 | for a_pattern in func_info[a].patterns: 125 | # if A is a subset of B, then B >= A; i.e., searching the trie for A's data should match B. 126 | # A <= B --> A ⨅ B = B 127 | if b not in sig_trie.find(a_pattern): 128 | return False 129 | 130 | # return false if B's additional pattern doesn't match A (B ⨅ A != B) 131 | for a_pattern in func_info[a].patterns: 132 | if b.pattern_offset >= 0 and b.pattern_offset + len(b.pattern) < len(a_pattern): 133 | if not b.pattern.matches(a_pattern[b.pattern_offset:]): 134 | return False 135 | 136 | # check that all callees required by B are also required by A 137 | for call_site, callee in b.callees.items(): 138 | if callee is not None and call_site not in a.callees: 139 | return False 140 | if not all(map(lambda k: is_signature_subset(a.callees[k] if k in a.callees else None, func_info, 141 | b.callees[k], sig_trie, visited), b.callees)): 142 | return False 143 | 144 | return True 145 | 146 | 147 | def rewrite_callgraph(funcs, to_delete): 148 | # complete the DFS first, avoid simultaneous modification and traversal 149 | inverse_callgraph = defaultdict(set) 150 | for func in funcs: 151 | if func in to_delete: continue 152 | for callee in func.callees.values(): 153 | if callee in to_delete: 154 | inverse_callgraph[callee].add(func) 155 | 156 | def follow(k): 157 | while k in to_delete: 158 | k = to_delete[k] 159 | return k 160 | 161 | # rewrite callgraph 162 | for k in to_delete: 163 | v = follow(k) 164 | for func in inverse_callgraph[k]: 165 | for call_site in func.callees: 166 | if func.callees[call_site] == k: 167 | func.callees[call_site] = v 168 | assert k != v 169 | # print('replace', k.name, 
# one-way deduplication (trie1 to trie2)
def find_redundant(trie1, info1, trie2):
    """
    Find functions described by `info1` that are made redundant by functions in `trie2`.

    :param trie1: trie the functions in `info1` belong to (kept for interface
                  compatibility; candidates are looked up in `trie2`)
    :param info1: dict of {FunctionNode: FunctionInfo} for the functions to check
    :param trie2: trie searched for functions that already cover those in `info1`
    :return: dict of {redundant FunctionNode: FunctionNode that replaces it}
    """
    cache = {}

    def cached_is_signature_subset(a, func_info, b, sig_trie, visited):
        key = (a, b)
        if key not in cache:
            cache[key] = is_signature_subset(a, func_info, b, sig_trie, visited)
        return cache[key]

    # search trie2 for funcs from trie1. if `A` is matched by `B`, then `A` matches a subset of `B`
    # and should be discarded. references to `A` should be replaced by references to `B`.
    # algebraically if A ⨅ B = A, then A <= B, so A is redundant.
    to_delete = {}

    def check_if_redundant(func_a, func_b):
        while func_b in to_delete:  # avoid cycles
            func_b = to_delete[func_b]
        if func_a == func_b:  # avoid infinite loop
            return False
        if not are_names_compatible(func_a, func_b):
            return False
        if cached_is_signature_subset(func_a, info1, func_b, trie2, {}):  # func <= cand. func is redundant
            to_delete[func_a] = func_b
            return True
        return False

    for func in info1:  # func is our `A`
        for pattern in info1[func].patterns:
            # Candidates must come from trie2: is_signature_subset is told that
            # `B` belongs to trie2 and verifies it by searching that trie.
            for cand in trie2.find(pattern):  # cand is our `B`
                check_if_redundant(func, cand)

    # also clean up useless bridge nodes
    bridges1 = [f for f in info1 if f.is_bridge]
    bridges2 = [f for f in trie2.all_functions() if f.is_bridge]
    for func in bridges1:
        for cand in bridges2:
            check_if_redundant(func, cand)

    return to_delete
# deal with signatures with the same name at different parts in the signature trie that can be merged
def collapse_by_name(func_info):
    """
    Merge functions that share a name and can stand in for one another.

    For every function that `can_substitute` allows to be replaced by another
    function of the same name, the replacement is recorded and the replaced
    function's patterns and aliases are moved into the replacement's info.

    :param func_info: dict of {FunctionNode: FunctionInfo}; modified in place
    :return: dict of {replaced FunctionNode: FunctionNode that replaces it}
    """
    by_name = defaultdict(set)
    for f in func_info:
        by_name[f.name].add(f)

    to_delete = {}
    for family in by_name.values():
        for func in family:
            for cand in family:
                while cand in to_delete:  # avoid cycles
                    cand = to_delete[cand]
                if func == cand:  # avoid infinite loop
                    continue
                if not can_substitute(func, cand):
                    continue

                to_delete[func] = cand
                # transfer patterns and aliases from deleted functioninfo to cand's
                cand_info = func_info[cand]
                deleted_info = func_info[func]
                cand_info.patterns.extend(deleted_info.patterns)
                cand_info.aliases.extend(deleted_info.aliases)
                deleted_info.patterns = []  # free memory (!)
                deleted_info.aliases = []
                # func is now merged; trying further candidates could re-point
                # it at a different function than the one that just received
                # its patterns and aliases.
                break
    return to_delete
# so, we give that a wildcard. in our matching algorithm, we allow calls to wildcard callee to be optional.
def resolve_reference(name, sym_type, source_binary, source_to_node):
    """
    Pick the `FunctionNode` a call should be linked to, or None to leave the callee as a wildcard.

    :param name: name of the referenced symbol (not used for the lookup itself)
    :param sym_type: symbol type of the reference; `SymbolType.FunctionSymbol` selects same-object lookup
    :param source_binary: the object file or binary containing the caller
    :param source_to_node: dict mapping a source binary to the node that defines `name` in it
    :return: the resolved node, or None when there is no definition or the choice is ambiguous
    """
    if sym_type == SymbolType.FunctionSymbol:
        # static reference: look for callee from the same object file only
        return source_to_node.get(source_binary)

    # extern reference: look for callee in a different object file, and accept it only if it is unique
    possible_callees = [node for source, node in source_to_node.items() if source != source_binary]
    if len(possible_callees) == 1:
        return possible_callees[0]
    return None  # either no definition at all, or multiple competing definitions


def _is_valid_call_site(patterns, i):
    """Return True if offset `i` is non-negative, inside every pattern, and truthy (not masked out) in each."""
    if i < 0:
        return False
    return all(i < len(pattern) and pattern[i] for pattern in patterns)


def link_callgraph(func_info):
    """
    Construct the callgraph based on `FunctionInfo` and link all the `FunctionNode`s together.

    Each node's `callees` is replaced by a dict mapping call site to the resolved node, or to None
    (wildcard) when the reference cannot be resolved or the call site is masked out of a pattern.

    :param func_info: dict mapping each `FunctionNode` to its `FunctionInfo`
    :return: None; the nodes are updated in place
    """
    # index every definition by each of its names, then by the binary it came from
    name_to_source_to_node = defaultdict(dict)
    for node, info in func_info.items():
        for name in [node.name] + info.aliases:
            name_to_source_to_node[name][node.source_binary] = node

    for node, info in func_info.items():
        linked = {}
        for call_site, (name, sym_type) in info.callees.items():
            if _is_valid_call_site(info.patterns, call_site):
                linked[call_site] = resolve_reference(name, sym_type, node.source_binary,
                                                      name_to_source_to_node[name])
            else:
                # Wildcard out callees that are masked out.
                linked[call_site] = None
        node.callees = linked


def choose_disambiguation_bytes(sig_trie, func_info, min_offset=32, maxlen=5):
    """
    Assign a disambiguation pattern to functions that share a trie node.

    Every function stored at a trie node first has `pattern` reset to an empty pattern and
    `pattern_offset` reset to 0. Where a node holds several functions, a window is searched in which
    their joined patterns are pairwise disjoint; if found, each function gets its slice of that window.

    :param sig_trie: trie whose `all_nodes()` are examined
    :param func_info: dict mapping each `FunctionNode` to its `FunctionInfo`
    :param min_offset: first offset considered; earlier bytes are already in the trie
    :param maxlen: upper bound on how far a window is grown while searching
    :return: None; the function nodes are updated in place
    """
    for node in sig_trie.all_nodes():
        if not node.value:
            continue
        for f in node.value:
            assert f in func_info
        for f in node.value:  # reset patterns
            f.pattern = signaturelibrary.Pattern(b'', [])
            f.pattern_offset = 0
        if len(node.value) <= 1:
            continue

        # since a FunctionNode can have multiple patterns in its FunctionInfo, we say that the set of functions
        # it matches is based on the *join* ⨆ of all of these patterns. our goal here is to find some substring
        # in all of these patterns that share no intersection.
        #
        # let P(f) denote the patterns belonging to FunctionNode f's FunctionInfo.
        # then let PU(f) = ⨆ P(f) ; i.e. the join of all patterns, a pattern that would match the union of
        # functions matched by those patterns.
        # given some functions f1,f2,... at this trie node, we want to find some substring (i,j) in PU(f1),PU(f2),...
        # such that PU(fx)[i:j] ⨅ PU(fy)[i:j] = 0 for all pairs fx,fy in f1,f2,...
        # then we will choose PU(f)[i:j] as f's disambiguation pattern for each FunctionNode f in f1,f2,...

        pu = {func: reduce(signaturelibrary.Pattern.union, func_info[func].patterns) for func in node.value}
        min_len = min(map(len, pu.values()))
        if min_len <= min_offset:
            # this is hopeless. all those bytes are already in the trie
            continue
        joined = list(pu.values())
        if all(p == joined[0] for p in joined[1:]):
            # every joined pattern is identical, so no window can tell them apart. a fold with
            # operator.eq would compare a bool against a pattern from the third element onwards.
            continue

        def ok(i, j):
            for fx in node.value:
                for fy in node.value:
                    if fx == fy:
                        continue
                    if pu[fx][i:j].intersect(pu[fy][i:j]) is not None:
                        return False
            return True

        for i in range(min_offset, min_len-1):  # unfortunately, this is O(min_len*maxlen).
            j = i+1
            # grow the window until it disambiguates, or until it hits a limit
            while not ok(i, j) and j < min_len and j-i < maxlen:
                j += 1
            # shrink from the left; test the bound first so ok() never sees an empty window
            while i+1 < j and ok(i+1, j):
                i += 1
            if ok(i, j):
                for f in node.value:
                    f.pattern = pu[f][i:j]
                    f.pattern_offset = i
                break


def _drop_redundant(sig_trie, func_info, to_delete):
    """Rewrite callgraph and trie for `to_delete`, then remove the deleted nodes from `func_info`."""
    rewrite_callgraph(func_info, to_delete)
    rewrite_trie(sig_trie, to_delete)
    for k in to_delete:
        assert k.ref_count == 0
    for k in to_delete:
        del func_info[k]


# finalizing a trie links the call graph and removes any redundant nodes, and adds disambiguation bytes
def finalize_trie(sig_trie, func_info):
    """
    Link the call graph, remove redundant and same-name mergeable nodes, and add disambiguation bytes.

    :param sig_trie: the signature trie; modified in place
    :param func_info: dict mapping each `FunctionNode` to its `FunctionInfo`; deleted nodes are removed
    :return: None
    """
    link_callgraph(func_info)
    sanity_check(sig_trie)

    _drop_redundant(sig_trie, func_info, find_redundant(sig_trie, func_info, sig_trie))
    _drop_redundant(sig_trie, func_info, collapse_by_name(func_info))
    sanity_check(sig_trie)

    choose_disambiguation_bytes(sig_trie, func_info)


# inserts functions from FunctionInfo dict `src_info` into trie `dst_trie`.
def trie_insert_funcs(dst_trie, src_info, maxlen=32):
    """
    Insert every pattern of every function in `src_info` into `dst_trie`.

    :param dst_trie: trie receiving the functions via `insert(pattern, function)`
    :param src_info: dict mapping each `FunctionNode` to its `FunctionInfo`
    :param maxlen: patterns are truncated to this many leading elements before insertion
    :return: None
    """
    for to_add, info in src_info.items():
        to_add.ref_count = 0  # we are repatriating this function node. reset refcount
        for pattern in info.patterns:
            dst_trie.insert(pattern[:maxlen], to_add)


# merges a signature trie `src_trie` into another signature trie `dst_trie`, with FunctionInfo only available
# for `src_trie`. `dst_trie` is modified.
def update_signature_library(dst_trie, src_trie, src_info):
    """
    Merge `src_trie` into `dst_trie` when FunctionInfo is only available for the source.

    :param dst_trie: trie that receives the functions; modified in place
    :param src_trie: trie being merged in
    :param src_info: dict mapping each source `FunctionNode` to its `FunctionInfo`
    :return: None
    """
    link_callgraph(src_info)  # build callgraph

    # identify redundant signatures
    to_delete = find_redundant(src_trie, src_info, dst_trie)

    # merge
    trie_insert_funcs(dst_trie, src_info)
    rewrite_callgraph(dst_trie.all_functions(), to_delete)
    rewrite_trie(dst_trie, to_delete)

    sanity_check(dst_trie)


# combines two signature tries, `src_trie` into `dst_trie` where FunctionInfo is available for both tries.
# both `dst_trie` and `dst_info` are mutated: functions from `src_trie` and `src_info` are added to `dst_trie` and `dst_info`.
def combine_signature_libraries(dst_trie, dst_info, src_trie, src_info):
    """
    Fold the source signature library into the destination one, in place.

    :param dst_trie: destination trie; receives the functions from `src_info`
    :param dst_info: destination FunctionInfo dict; updated with `src_info`, minus redundant nodes
    :param src_trie: source trie
    :param src_info: source FunctionInfo dict
    :return: None
    """
    # bring the source functions over
    trie_insert_funcs(dst_trie, src_info)
    dst_info.update(src_info)

    # work out which signatures are now redundant and rewrite around them
    redundant = find_redundant(dst_trie, dst_info, src_trie)
    rewrite_callgraph(dst_info, redundant)
    rewrite_trie(dst_trie, redundant)

    # nothing may still reference a node that is about to be dropped
    assert all(node.ref_count == 0 for node in redundant)
    for node in redundant:
        del dst_info[node]

    sanity_check(dst_trie)
typically, this is 41 | // because the function is too short to put in the signature trie 42 | // while avoiding false positives. 43 | is_bridge: bool; 44 | } 45 | 46 | table TrieNode { 47 | // this is the first byte of the pattern. this is used as the key 48 | // in the children map so that children can be selected using binary search 49 | // if the pattern's first byte is a wildcard, then this field is left 50 | // as zero and `wildcard_child' is used in place of `children'. 51 | patternPrefix: ubyte (key); 52 | 53 | // pattern this trie node matches 54 | pattern: Pattern (required); 55 | 56 | // map of child nodes from their pattern's first byte to the node. 57 | children: [TrieNode]; 58 | 59 | // child node whose pattern begins with a wildcard, if such a child node exists 60 | wildcard_child: TrieNode; 61 | 62 | // the functions this trie node matches. 63 | functions: [uint]; 64 | } 65 | 66 | table SignatureLibrary { 67 | // a vector of all of the function nodes present in this signature library. 68 | functions: [Function] (required); 69 | 70 | // the root trie node. 71 | root: TrieNode (required); 72 | } 73 | 74 | file_extension "sig"; 75 | root_type SignatureLibrary; 76 | --------------------------------------------------------------------------------