├── .gitignore
├── LICENSE
├── README.md
├── __init__.py
├── examples
│   ├── batch_process.py
│   ├── convert_siglib.py
│   ├── libc-scraper
│   │   ├── .gitignore
│   │   ├── README.md
│   │   ├── merge_ubuntu.py
│   │   ├── process-deb.sh
│   │   ├── run.sh
│   │   └── ubuntu-libc-scraper.py
│   ├── merge_multiple_versions.py
│   └── sig_match.py
├── icon.ico
├── images
│   └── explorer.png
├── plugin.json
├── pyproject.toml
├── requirements.txt
├── setup.cfg
├── sigkit
│   ├── FlatbufSignatureLibrary
│   │   ├── CallRef.py
│   │   ├── Function.py
│   │   ├── Pattern.py
│   │   ├── SignatureLibrary.py
│   │   ├── TrieNode.py
│   │   └── __init__.py
│   ├── __init__.py
│   ├── compute_sig.py
│   ├── sig_serialize_fb.py
│   ├── sig_serialize_json.py
│   ├── sigexplorer.py
│   ├── signaturelibrary.py
│   └── trie_ops.py
└── signaturelibrary.fbs
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | *.bndb
3 | *.a
4 | *.so
5 | *.so.6
6 | testcases/
7 | *.pyc
8 | .idea
9 |
10 | *.pkl
11 | *.fb
12 | *.sig
13 | *.zlib
14 |
15 | sigs/
16 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2019-2020 Vector 35 Inc
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4 |
5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6 |
7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Signature Kit Plugin (v1.2.2)
2 | Author: **Vector 35 Inc**
3 |
4 | _Python tools for working with Signature Libraries_
5 |
6 | ## Description:
7 |
8 | This plugin provides Python tools for generating, manipulating, viewing, loading, and saving signature libraries (.sig) for the Signature System. It also integrates with the Binary Ninja UI, giving easy access to common operations through the `Plugins\Signature Library` menu.
9 |
10 |
11 | ![Signature Explorer](images/explorer.png)
12 |
13 | Also included are [example scripts](https://github.com/Vector35/sigkit/tree/master/examples) which demonstrate batch processing and automatic creation of signature libraries for Ubuntu libc.
14 | You can also run the Signature Explorer GUI as a standalone app.
15 |
16 |
17 | ## Installation Instructions
18 |
19 | ### Windows
20 |
21 |
22 |
23 | ### Linux
24 |
25 |
26 |
27 | ### Darwin
28 |
29 |
30 |
31 | ## Minimum Version
32 |
33 | This plugin requires the following minimum version of Binary Ninja:
34 |
35 | * 1997
36 |
37 |
38 |
39 | ## Required Dependencies
40 |
41 | The following dependencies are required for this plugin:
42 |
43 | * pip - flatbuffers
44 |
45 |
46 | ## License
47 |
48 | This plugin is released under an MIT license.
49 | ## Metadata Version
50 |
51 | 2
52 |
--------------------------------------------------------------------------------
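
A minimal sketch of the workflow the README above describes, runnable from the Binary Ninja scripting console; it assumes the package is importable as `sigkit` (e.g. installed per setup.cfg) and the output path is hypothetical:

```python
# Hedged sketch: build a signature library for the currently open binary.
# `bv` is the BinaryView provided by the Binary Ninja scripting console.
import sigkit
import sigkit.signaturelibrary, sigkit.trie_ops, sigkit.sig_serialize_fb

trie, func_info = sigkit.signaturelibrary.new_trie(), {}
guess_relocs = len(bv.relocation_ranges) == 0  # same heuristic batch_process.py uses
for func in bv.functions:
    if bv.get_symbol_at(func.start) is None:
        continue  # skip functions without symbols
    node, info = sigkit.generate_function_signature(func, guess_relocs)
    func_info[node] = info

sigkit.trie_ops.trie_insert_funcs(trie, func_info)
sigkit.trie_ops.finalize_trie(trie, func_info)

with open('mylib.sig', 'wb') as f:  # hypothetical output path
    f.write(sigkit.sig_serialize_fb.SignatureLibraryWriter().serialize(trie))
```
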
/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | # Copyright (c) 2019-2020 Vector 35 Inc
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to
7 | # deal in the Software without restriction, including without limitation the
8 | # rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
9 | # sell copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in
13 | # all copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 | # IN THE SOFTWARE.
22 |
23 | from .sigkit.sig_serialize_fb import SignatureLibraryReader, SignatureLibraryWriter
24 | from .sigkit.compute_sig import process_function as generate_function_signature
25 |
26 | def load_signature_library(filename):
27 | """
28 | Load a signature library from a .sig file.
29 | :param filename: input filename
30 | :return: instance of `TrieNode`, the root of the signature trie.
31 | """
32 | with open(filename, 'rb') as f:
33 | buf = f.read()
34 | return SignatureLibraryReader().deserialize(buf)
35 |
36 | def save_signature_library(sig_lib, filename):
37 | """
38 | Save the given signature library to a file.
39 | :param sig_lib: instance of `TrieNode`, the root of the signature trie.
40 | :param filename: destination filename
41 | """
42 | buf = SignatureLibraryWriter().serialize(sig_lib)
43 | with open(filename, 'wb') as f:
44 | f.write(buf)
45 |
--------------------------------------------------------------------------------
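
A brief, hedged usage sketch of the two helpers defined above; the file names are hypothetical, and the helpers are called however the surrounding package is imported in your environment:

```python
# Hedged sketch: round-trip a signature library through the helpers above.
sig_trie = load_signature_library('libc.sig')            # root TrieNode (hypothetical file)
print(len(set(sig_trie.all_functions())), 'functions')   # all_functions() as used in merge_multiple_versions.py
save_signature_library(sig_trie, 'libc-copy.sig')        # hypothetical output file
```
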
/examples/batch_process.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | # Copyright (c) 2015-2020 Vector 35 Inc
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to
7 | # deal in the Software without restriction, including without limitation the
8 | # rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
9 | # sell copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in
13 | # all copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 | # IN THE SOFTWARE.
22 |
23 | """
24 | This script processes many object files using headless mode and generates
25 | function signatures for the functions in them in a highly parallelized fashion.
26 | The result is a dictionary of {FunctionNode: FunctionInfo} that is then pickled
27 | and saved to disk. These pickles can be processed with a merging script, i.e.
28 | merge_multiple_versions.py or libc-scraper's merge_ubuntu.py.
29 | """
30 |
31 | import time
32 |
33 | from binaryninja import *
34 |
35 | import sigkit
36 |
37 | def process_bv(bv):
38 | global results
39 | print(bv.file.filename, ': processing')
40 | guess_relocs = len(bv.relocation_ranges) == 0
41 |
42 | for func in bv.functions:
43 | try:
44 | if bv.get_symbol_at(func.start) is None: continue
45 | node, info = sigkit.generate_function_signature(func, guess_relocs)
46 | results.put((node, info))
47 | print("Processed", func.name)
48 | except:
49 | import traceback
50 | traceback.print_exc()
51 | print(bv.file.filename, ': done')
52 |
53 | def on_analysis_complete(self):
54 | global wg
55 | process_bv(self.view)
56 | with wg.get_lock():
57 | wg.value -= 1
58 | self.view.file.close()
59 |
60 | def process_binary(input_binary):
61 | global wg
62 | print(input_binary, ': loading')
63 | if input_binary.endswith('.dll'):
64 | bv = binaryninja.BinaryViewType["PE"].open(input_binary)
65 | cxt = PluginCommandContext(bv)
66 | PluginCommand.get_valid_list(cxt)['PDB\\Load (BETA)'].execute(cxt)
67 | elif input_binary.endswith('.o'):
68 | bv = binaryninja.BinaryViewType["ELF"].open(input_binary)
69 | else:
70 | raise ValueError('unsupported input file', input_binary)
71 | if not bv:
72 | print('Failed to load', input_binary)
73 | return
74 | AnalysisCompletionEvent(bv, on_analysis_complete)
75 | bv.update_analysis()
76 | with wg.get_lock():
77 | wg.value += 1
78 |
79 | def async_process(input_queue):
80 | for input_binary in input_queue:
81 | process_binary(input_binary)
82 | yield
83 |
84 | def init_child(wg_, results_):
85 | global wg, results
86 | wg, results = wg_, results_
87 |
88 | if __name__ == '__main__':
89 | import sys
90 | from pathlib import Path
91 | if len(sys.argv) < 3:
92 |         print('Usage: %s <input glob> <output pickle>' % (sys.argv[0]))
93 | print('The pickle designates the filename of a pickle file that the computed function metadata will be saved to.')
94 | sys.exit(1)
95 |
96 | import multiprocessing as mp
97 | wg = mp.Value('i', 0)
98 | results = mp.Queue()
99 |
100 | func_info = {}
101 |
102 | with mp.Pool(mp.cpu_count(), initializer=init_child, initargs=(wg, results)) as pool:
103 | pool.map(process_binary, map(str, Path('.').glob(sys.argv[1])))
104 |
105 | while True:
106 | time.sleep(0.1)
107 | with wg.get_lock():
108 | if wg.value == 0: break
109 |
110 | while not results.empty():
111 | node, info = results.get()
112 | func_info[node] = info
113 |
114 | import pickle
115 | with open(sys.argv[2], 'wb') as f:
116 | pickle.dump(func_info, f)
117 |
--------------------------------------------------------------------------------
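
As the docstring above notes, the output is a pickled dict of {FunctionNode: FunctionInfo}. A minimal, hedged sketch of turning one such pickle into a .sig library (file names hypothetical); the merge scripts later in this repo do the same thing at larger scale:

```python
# Hedged sketch: convert a single batch_process.py pickle into a .sig library.
import pickle
import sigkit.signaturelibrary, sigkit.trie_ops, sigkit.sig_serialize_fb

with open('libfoo.pkl', 'rb') as f:    # hypothetical pickle from batch_process.py
    func_info = pickle.load(f)         # dict of {FunctionNode: FunctionInfo}

trie = sigkit.signaturelibrary.new_trie()
sigkit.trie_ops.trie_insert_funcs(trie, func_info)
sigkit.trie_ops.finalize_trie(trie, func_info)   # deduplicate/prune before serializing

with open('libfoo.sig', 'wb') as f:    # hypothetical output
    f.write(sigkit.sig_serialize_fb.SignatureLibraryWriter().serialize(trie))
```
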
/examples/convert_siglib.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2015-2020 Vector 35 Inc
2 | #
3 | # Permission is hereby granted, free of charge, to any person obtaining a copy
4 | # of this software and associated documentation files (the "Software"), to
5 | # deal in the Software without restriction, including without limitation the
6 | # rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
7 | # sell copies of the Software, and to permit persons to whom the Software is
8 | # furnished to do so, subject to the following conditions:
9 | #
10 | # The above copyright notice and this permission notice shall be included in
11 | # all copies or substantial portions of the Software.
12 | #
13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
18 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
19 | # IN THE SOFTWARE.
20 |
21 | """
22 | This utility shows how to load and save signature libraries using the sigkit API.
23 | Although sigkit supports several file formats, Binary Ninja itself only loads signatures
24 | in the .sig (flatbuffer) format. The other formats are provided for debugging
25 | purposes.
26 | """
27 |
28 | import json, pickle
29 | import zlib
30 |
31 | from sigkit import *
32 |
33 | if __name__ == '__main__':
34 | import sys
35 |
36 | if len(sys.argv) < 2:
37 |         print('Usage: convert_siglib.py <signature library file>')
38 | sys.exit(1)
39 |
40 | # Load a signature library.
41 | filename = sys.argv[1]
42 | basename, ext = filename[:filename.index('.')], filename[filename.index('.'):]
43 | if ext == '.sig':
44 | with open(filename, 'rb') as f:
45 | sig_trie = sig_serialize_fb.load(f)
46 | elif ext == '.json':
47 | with open(filename, 'r') as f:
48 | sig_trie = sig_serialize_json.load(f)
49 | elif ext == '.json.zlib':
50 | with open(filename, 'rb') as f:
51 | sig_trie = sig_serialize_json.deserialize(json.loads(zlib.decompress(f.read()).decode('utf-8')))
52 | elif ext == '.pkl':
53 | with open(filename, 'rb') as f:
54 | sig_trie = pickle.load(f)
55 | else:
56 | print('Unsupported file extension ' + ext)
57 | sys.exit(1)
58 |
59 | # Save the signature library to a binary format and write it to a file.
60 | buf = sig_serialize_fb.dumps(sig_trie)
61 | with open(basename + '.sig', 'wb') as f:
62 | f.write(buf)
63 |
64 | # This is a pretty stringent assertion, but I want to be sure this implementation is correct.
65 | # having the exact same round-trip depends on having a consistent iteration order through the trie as well
66 | # as the ordering of the functions per node. That's enforced by iterating the trie (DFS) in a sorted fashion.
67 | assert buf == sig_serialize_fb.SignatureLibraryWriter().serialize(sig_serialize_fb.SignatureLibraryReader().deserialize(buf))
68 |
--------------------------------------------------------------------------------
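
Since the non-.sig formats above exist mainly for debugging, here is a small hedged sketch of dumping a .sig library to human-readable JSON for inspection (file names hypothetical):

```python
# Hedged sketch: dump a .sig library to pretty-printed JSON for inspection.
import json
from sigkit import sig_serialize_fb, sig_serialize_json

with open('libfoo.sig', 'rb') as f:      # hypothetical input
    sig_trie = sig_serialize_fb.load(f)

with open('libfoo.json', 'w') as f:      # hypothetical output
    json.dump(sig_serialize_json.serialize(sig_trie), f, indent=2)
```
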
/examples/libc-scraper/.gitignore:
--------------------------------------------------------------------------------
1 | ubuntu
2 | requests-cache.sqlite
3 |
--------------------------------------------------------------------------------
/examples/libc-scraper/README.md:
--------------------------------------------------------------------------------
1 | # libc-scraper
2 |
3 | This directory includes scripts that demonstrate how sigkit can be scaled up efficiently.
4 |
5 | The goal of libc-scraper is to scrape *.debs for Ubuntu libcs, process them using headless mode, and generate space-efficient signature libraries.
6 |
7 | batch_process.py demonstrates how to generate signatures using headless mode.
8 |
9 | Of special interest is merge_ubuntu.py, which shows how you can create small signature libraries that combine multiple versions of the same library.
10 | Using clever tricks, it is possible to aggressively deduplicate across multiple versions while maintaining accuracy.
11 |
--------------------------------------------------------------------------------
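
A hedged sketch of the merge-and-deduplicate idea described above, following the calling convention used by merge_multiple_versions.py (each library is a (trie, func_info) pair built from batch_process.py pickles):

```python
# Hedged sketch: fold one (trie, func_info) pair into another, then re-finalize
# to prune duplicated entries, mirroring the trie_ops calls in the merge scripts.
import sigkit.trie_ops

def merge_libraries(dst, src):
    dst_trie, dst_info = dst
    src_trie, src_info = src
    sigkit.trie_ops.combine_signature_libraries(dst_trie, dst_info, src_trie, src_info)
    sigkit.trie_ops.finalize_trie(dst_trie, dst_info)
    return dst_trie, dst_info
```
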
/examples/libc-scraper/merge_ubuntu.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | # Copyright (c) 2015-2020 Vector 35 Inc
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to
7 | # deal in the Software without restriction, including without limitation the
8 | # rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
9 | # sell copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in
13 | # all copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 | # IN THE SOFTWARE.
22 |
23 | """
24 | This script generates libc signature libraries from the function signatures
25 | precomputed by batch_process.py, using all CPUs available on the machine.
26 | """
27 |
28 | import os, sys
29 | import gc
30 | import pickle
31 | from pathlib import Path
32 | import tqdm
33 | import asyncio
34 | import concurrent.futures
35 | import math
36 |
37 | import sigkit.signaturelibrary
38 | import sigkit.trie_ops
39 | import sigkit.sig_serialize_fb
40 |
41 | cpu_factor = int(math.ceil(math.sqrt(os.cpu_count())))
42 |
43 | # delete weird, useless funcs and truncate names
44 | def cleanup_info(func_info, maxlen=40):
45 | import re
46 | to_delete = set()
47 | for f in func_info:
48 | if re.match(r'\.L\d+', f.name):
49 | to_delete.add(f)
50 | continue
51 | f.name = f.name[:maxlen]
52 | for f in to_delete:
53 | del func_info[f]
54 |
55 | # load all pickles into a single signature library
56 | def load_pkls(pkls):
57 | # rarely-used libgcc stuff
58 | pkl_blacklist = {'libcilkrts.pkl', 'libubsan.pkl', 'libitm.pkl', 'libgcov.pkl', 'libmpx.pkl', 'libmpxwrappers.pkl', 'libquadmath.pkl', 'libgomp.pkl'}
59 | trie, func_info = sigkit.signaturelibrary.new_trie(), {}
60 | for pkl in pkls:
61 | if os.path.basename(pkl) in pkl_blacklist: continue
62 | with open(pkl, 'rb') as f:
63 | pkl_funcs = pickle.load(f)
64 | cleanup_info(pkl_funcs)
65 | sigkit.trie_ops.trie_insert_funcs(trie, pkl_funcs)
66 | func_info.update(pkl_funcs)
67 | sigkit.trie_ops.finalize_trie(trie, func_info)
68 | return trie, func_info
69 |
70 | def combine_sig_libs(sig_lib1, sig_lib2):
71 | sigkit.trie_ops.combine_signature_libraries(*sig_lib1, *sig_lib2)
72 | return sig_lib1
73 |
74 | def finalize_sig_lib(sig_lib):
75 | sigkit.trie_ops.finalize_trie(*sig_lib)
76 | return sig_lib
77 |
78 | def do_package(package):
79 | loop = asyncio.get_event_loop()
80 | pool = concurrent.futures.ProcessPoolExecutor(cpu_factor)
81 |
82 | async def inner():
83 | print('Processing', package)
84 | result_filename = os.path.join('sigs', package.replace('/', '-') + '.sig')
85 | if os.path.exists(result_filename):
86 | print(result_filename + ' exists')
87 | return
88 |
89 | pkl_groups = []
90 | for pkg_version in os.listdir(package):
91 | pkg_version = os.path.join(package, pkg_version)
92 | pkls = Path(pkg_version).glob('**/*.pkl')
93 | pkls = list(map(str, pkls))
94 | if not pkls: continue
95 | # print(' ' + pkg_version, len(pkls))
96 | pkl_groups.append(pkls)
97 | if not pkl_groups:
98 | print(package, 'has no versions available')
99 | return
100 |
101 | with tqdm.tqdm(total=len(pkl_groups), desc='generating tries') as pbar:
102 | async def async_load(to_load):
103 | result = await loop.run_in_executor(pool, load_pkls, to_load)
104 | pbar.update(1)
105 | pbar.refresh()
106 | return result
107 | lib_versions = await asyncio.gather(*map(async_load, pkl_groups))
108 |
109 | # linear merge
110 | # dst_trie, dst_funcs = sigkit.signaturelibrary.new_trie(), {}
111 | # for trie, funcs in tqdm.tqdm(lib_versions):
112 | # sigkit.trie_ops.combine_signature_libraries(dst_trie, dst_funcs, trie, funcs)
113 |
114 | # big brain parallel async binary merge
115 | with tqdm.tqdm(total=len(lib_versions)-1, desc='merging') as pbar:
116 | async def merge(sig_libs):
117 | assert len(sig_libs)
118 | if len(sig_libs) == 1:
119 | return sig_libs[0]
120 | else:
121 | half = len(sig_libs) // 2
122 | sig_lib1, sig_lib2 = await asyncio.gather(merge(sig_libs[:half]), merge(sig_libs[half:]))
123 | sig_libs[:] = [None] * len(sig_libs) # free memory
124 | merged_lib = await loop.run_in_executor(pool, combine_sig_libs, sig_lib1, sig_lib2)
125 | pbar.update(1)
126 | pbar.refresh()
127 | gc.collect()
128 | return merged_lib
129 | sig_lib = await merge(lib_versions)
130 |
131 | dst_trie, dst_funcs = await loop.run_in_executor(pool, finalize_sig_lib, sig_lib)
132 | if not dst_funcs:
133 | print(package, 'has no functions')
134 | return
135 |
136 | buf = sigkit.sig_serialize_fb.SignatureLibraryWriter().serialize(dst_trie)
137 | with open(result_filename, 'wb') as f:
138 | f.write(buf)
139 | print(' saved to', result_filename, ' | size:', len(buf))
140 |
141 | loop.run_until_complete(inner())
142 |
143 | def main():
144 | if not os.path.exists('sigs'):
145 | os.mkdir('sigs')
146 | elif not os.path.isdir('sigs'):
147 | print('Please delete "sigs" before starting')
148 | sys.exit(1)
149 |
150 | tasks = []
151 | distr = 'ubuntu'
152 | # for version in os.listdir(distr):
153 | for version in ['bionic']:
154 | version = os.path.join(distr, version)
155 | for arch in os.listdir(version):
156 | arch = os.path.join(version, arch)
157 | for package in os.listdir(arch):
158 | package = os.path.join(arch, package)
159 | tasks.append(package)
160 |
161 |     # we are going to do some hierarchical multiprocessing because there is a very high pickle message-passing overhead:
162 |     # a lot of cpu time gets burned in the main process just pickling work to pass it to worker processes
163 | import subprocess
164 | import multiprocessing.pool
165 | pool = multiprocessing.pool.ThreadPool(cpu_factor)
166 | def do_package_in_worker(package):
167 | subprocess.call(['python3', __file__, '-c', package])
168 | for _ in pool.imap_unordered(do_package_in_worker, tasks):
169 | pass
170 |
171 | if __name__ == '__main__':
172 | if len(sys.argv) <= 1:
173 | main()
174 | elif len(sys.argv) >= 3 and sys.argv[1] == '-c':
175 | # child
176 | do_package(sys.argv[2])
177 |
--------------------------------------------------------------------------------
/examples/libc-scraper/process-deb.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | debfile=$1
3 | pushd `dirname $debfile`
4 | debfilename=`basename $debfile`
5 | echo Now processing $debfilename
6 | debfile_extract=${debfilename%.*};
7 | if [ -d $debfile_extract ]; then
8 | echo $debfile_extract already exists, exiting
9 | exit
10 | fi
11 | dpkg-deb -x $debfilename $debfile_extract
12 | pushd $debfile_extract
13 | for libfile in `find . -iname '*.a'`; do
14 | f=`basename $libfile`
15 | if [[ $f = libasan.a || $f = libtsan.a ]]; then
16 | echo Skipping $libfile
17 | continue
18 | fi
19 | pushd `dirname $libfile`
20 | echo ..Now processing $f
21 | g=${f%.*};
22 | if [ ! -d $g ]; then
23 | mkdir -p $g
24 | pushd $g
25 | ar vx ../$f >> ../"$g"_log.txt
26 | python3 ~/sigkit/batch_process.py "*.o" ../"$g".pkl ../"$g"_checkpoint.pkl >> ../"$g"_log.txt 2>&1
27 | rm -f *.o # free disk space
28 | popd
29 | else
30 | echo Skipping existing $g
31 | fi
32 | popd
33 | done
34 | g=objs
35 | python3 ~/sigkit/batch_process.py "**/*.o" "$g".pkl "$g"_checkpoint.pkl >> "$g"_log.txt 2>&1
36 | find . -iname "*.so" -delete
37 | find . -iname "*.x" -delete
38 | find . -iname "*.h" -delete
39 | popd
40 | popd
--------------------------------------------------------------------------------
/examples/libc-scraper/run.sh:
--------------------------------------------------------------------------------
1 | find . -iname '*.deb' | (while read line; do
2 | arch=`echo $line | awk -F/ '{print $3}'`
3 | if [[ $arch = amd64 || $arch = arm64 || $arch = armel || $arch = armhf || $arch = i386 || $arch = lpia || $arch = powerpc ]]; then
4 | echo "$line"
5 | fi
6 | done) | parallel -j 3 "process-deb.sh {}"
7 |
--------------------------------------------------------------------------------
/examples/libc-scraper/ubuntu-libc-scraper.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | # Copyright (c) 2015-2020 Vector 35 Inc
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to
7 | # deal in the Software without restriction, including without limitation the
8 | # rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
9 | # sell copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in
13 | # all copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 | # IN THE SOFTWARE.
22 |
23 | """
24 | This script downloads .debs for the libc-dev packages from Ubuntu Launchpad,
25 | leveraging high-performance asynchronous I/O.
26 | """
27 |
28 | import sys, os
29 | from bs4 import BeautifulSoup
30 | import urllib
31 |
32 | import aiohttp
33 | import asyncio
34 |
35 | packages = ['libc6-dev', 'libgcc-8-dev', 'libgcc-7-dev', 'libgcc-6-dev', 'libgcc-5-dev']
36 |
37 | session, sem = None, None
38 | async def must(f):
39 | global session, sem
40 | await sem.put(None)
41 | retries = 0
42 | while True:
43 | try:
44 | r = await f(session)
45 | if r.status == 200: break
46 | print(r.status)
47 | except: pass
48 | retries += 1
49 | if retries > 10:
50 | print('Maximum retry count exceeded')
51 | sys.exit(1)
52 | await asyncio.sleep(1.0)
53 | await sem.get()
54 | return r
55 |
56 | async def get_html(url):
57 | async with (await must(lambda session: session.get(url))) as resp:
58 | sys.stderr.write('GET ' + url + '\n')
59 | return BeautifulSoup(await resp.text(), features="html.parser")
60 |
61 | async def get_series():
62 | series = set()
63 | soup = await get_html('https://launchpad.net/ubuntu/+series')
64 | for strong in soup.find_all('strong'):
65 | for a in strong.find_all('a'):
66 | series.add(a['href'])
67 | return series
68 |
69 | async def get_archs(series):
70 | soup = await get_html('https://launchpad.net' + series + '/+builds')
71 | for select in soup.find_all('select', {'id': 'arch_tag'}):
72 | for option in select.find_all('option'):
73 | if option['value'] == 'all': continue
74 | yield series + '/' + option['value']
75 |
76 | async def get_versions(arch, package):
77 | soup = await get_html('https://launchpad.net' + arch + '/' + package)
78 | for tr in soup.find_all('tr'):
79 | if len(tr.find_all('td')) != 10: continue
80 | yield tr.find_all('td')[9].find_all('a')[0]['href']
81 |
82 | async def get_deb_link(version):
83 | soup = await get_html('https://launchpad.net' + version)
84 | for a in soup.find_all('a', {'class': 'sprite'}):
85 | if a['href'].endswith('.deb'):
86 | return a['href']
87 |
88 | async def download_deb(version, deb_url):
89 | filename = urllib.parse.urlparse(deb_url).path
90 | filename = filename[filename.rindex('/') + 1:]
91 | version = os.curdir + version
92 | filename = os.path.join(version, filename)
93 | if os.path.exists(filename):
94 | print('Skipping existing file', filename)
95 | return
96 | os.makedirs(version, exist_ok=True)
97 | async with (await must(lambda session: session.get(deb_url))) as resp:
98 | data = await resp.read()
99 | if not data:
100 | print('FAILED DOWNLOAD', filename, 'from', deb_url)
101 | return
102 | with open(filename, 'wb') as f:
103 | f.write(data)
104 | print('Downloaded', filename)
105 |
106 | async def process_version(version):
107 | deb_link = await get_deb_link(version)
108 | if deb_link:
109 | await download_deb(version, deb_link)
110 | else:
111 | print('No .deb for', version)
112 |
113 | async def process_arch(arch):
114 | await asyncio.gather(*[asyncio.create_task(process_version(version)) for package in packages async for version in get_versions(arch, package)])
115 |
116 | async def process_series(series):
117 | await asyncio.gather(*[asyncio.create_task(process_arch(arch)) async for arch in get_archs(series)])
118 |
119 | async def main():
120 | global session
121 | async with aiohttp.ClientSession() as session:
122 | await asyncio.gather(*[asyncio.create_task(process_series(series)) for series in await get_series()])
123 |
124 | if __name__ == '__main__':
125 | MAX_CONCURRENT = 16
126 | loop = asyncio.get_event_loop()
127 | sem = asyncio.Queue(loop=loop, maxsize=MAX_CONCURRENT)
128 | loop.run_until_complete(main())
129 | loop.close()
130 |
--------------------------------------------------------------------------------
/examples/merge_multiple_versions.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | # Copyright (c) 2015-2020 Vector 35 Inc
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to
7 | # deal in the Software without restriction, including without limitation the
8 | # rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
9 | # sell copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in
13 | # all copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 | # IN THE SOFTWARE.
22 |
23 | """
24 | This script shows how you can merge the signature libraries generated for
25 | different versions of the same library. We want to do this because there is
26 | usually a lot of overlap between the two, i.e. many duplicated functions.
27 | We want to avoid creating huge signature libraries that are bloated with these
28 | duplicated functions, so we will deduplicate them using the trie_ops module.
29 |
30 | This script loads pickled dicts of {FunctionNode: FunctionInfo} generated
31 | by batch_process.py.
32 | """
33 |
34 | import pickle, json
35 | import gc
36 | from pathlib import Path
37 |
38 | import sigkit.signaturelibrary, sigkit.trie_ops, sigkit.sig_serialize_json, sigkit.sigexplorer
39 |
40 |
41 | def func_count(trie):
42 | return len(set(trie.all_functions()))
43 |
44 | # Clean up the functions list, exclude some garbage functions, etc.
45 | def preprocess_funcs_list(func_info):
46 | import re
47 | to_delete = set()
48 | for f in func_info:
49 | if re.match(r'\.L\d+', f.name):
50 | to_delete.add(f)
51 | continue
52 | f.name = f.name[:40] # trim long names
53 | for f in to_delete:
54 | del func_info[f]
55 |
56 | def load_pkls(path, glob):
57 | pkls = list(map(str, Path(path).glob(glob)))
58 | trie, func_info = sigkit.signaturelibrary.new_trie(), {}
59 | for pkl in pkls:
60 | with open(pkl, 'rb') as f:
61 | pkl_funcs = pickle.load(f)
62 | preprocess_funcs_list(pkl_funcs)
63 | sigkit.trie_ops.trie_insert_funcs(trie, pkl_funcs)
64 | func_info.update(pkl_funcs)
65 | sigkit.trie_ops.finalize_trie(trie, func_info)
66 | return trie, func_info
67 |
68 | gc.disable() # I AM SPEED - Lightning McQueen
69 | dst_trie, dst_info = load_pkls('.', 'libc_version1/*.pkl')
70 | src_trie, src_info = load_pkls('.', 'libc_version2/*.pkl')
71 | gc.enable() # i am no longer speed.
72 |
73 | size1, size2 = func_count(dst_trie), func_count(src_trie)
74 | print("Pre-merge sizes: %d + %d = %d funcs" % (size1, size2, size1+size2))
75 |
76 | sigkit.trie_ops.combine_signature_libraries(dst_trie, dst_info, src_trie, src_info)
77 | print("Post-merge size: %d funcs" % (func_count(dst_trie),))
78 |
79 | sigkit.trie_ops.finalize_trie(dst_trie, dst_info)
80 | print("Finalized size: %d funcs" % (func_count(dst_trie),))
81 |
82 | print(json.dumps(sigkit.sig_serialize_json.serialize(dst_trie)))
83 | sigkit.explore_signature_library(dst_trie)
84 |
--------------------------------------------------------------------------------
/examples/sig_match.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2015-2020 Vector 35 Inc
2 | #
3 | # Permission is hereby granted, free of charge, to any person obtaining a copy
4 | # of this software and associated documentation files (the "Software"), to
5 | # deal in the Software without restriction, including without limitation the
6 | # rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
7 | # sell copies of the Software, and to permit persons to whom the Software is
8 | # furnished to do so, subject to the following conditions:
9 | #
10 | # The above copyright notice and this permission notice shall be included in
11 | # all copies or substantial portions of the Software.
12 | #
13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
18 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
19 | # IN THE SOFTWARE.
20 |
21 | """
22 | This file contains a signature matcher implementation in Python. This
23 | implementation is only an illustrative example and should be used for testing
24 | purposes only. It is extremely slow compared to the native implementation
25 | found in Binary Ninja. Furthermore, the algorithm shown here is outdated
26 | compared to the native implementation, so matcher results will be of inferior
27 | quality.
28 | """
29 |
30 | from __future__ import print_function
31 |
32 | from binaryninja import *
33 |
34 | import sigkit.compute_sig
35 |
36 | class SignatureMatcher(object):
37 | def __init__(self, sig_trie, bv):
38 | self.sig_trie = sig_trie
39 | self.bv = bv
40 |
41 | self._matches = {}
42 | self._matches_inv = {}
43 | self.results = {}
44 |
45 | self._cur_match_debug = ""
46 |
47 | def resolve_thunk(self, func, level=0):
48 | if sigkit.compute_sig.get_func_len(func) >= 8:
49 | return func
50 |
51 | first_insn = func.mlil[0]
52 | if first_insn.operation == MediumLevelILOperation.MLIL_TAILCALL:
53 | thunk_dest = self.bv.get_function_at(first_insn.dest.value.value)
54 | elif first_insn.operation == MediumLevelILOperation.MLIL_JUMP and first_insn.dest.operation == MediumLevelILOperation.MLIL_LOAD and first_insn.dest.src.operation == MediumLevelILOperation.MLIL_CONST_PTR:
55 | data_var = self.bv.get_data_var_at(first_insn.dest.src.value.value)
56 | if not data_var or not data_var.data_refs_from: return None
57 | thunk_dest = self.bv.get_function_at(data_var.data_refs_from[0])
58 | else:
59 | return func
60 |
61 | if thunk_dest is None:
62 | return None
63 |
64 | if level >= 100:
65 | # something is wrong here. there's a weird infinite loop of thunks.
66 | sys.stderr.write('Warning: reached recursion limit while trying to resolve thunk %s!\n' % (func.name,))
67 | return None
68 |
69 | print('* following thunk %s -> %s' % (func.name, thunk_dest.name))
70 | return self.resolve_thunk(thunk_dest, level + 1)
71 |
72 | def on_match(self, func, func_node, level=0):
73 | if func in self._matches:
74 | if self._matches[func] != func_node:
75 | sys.stderr.write('Warning: CONFLICT on %s: %s vs %s' % (func.name, self._matches[func], func_node) + '\n')
76 | if func in self.results:
77 | del self.results[func]
78 | return
79 |
80 | self.results[func] = func_node
81 |
82 | if func_node in self._matches_inv:
83 | if self._matches_inv[func_node] != func:
84 | sys.stderr.write('Warning: INVERSE CONFLICT (%s) on %s: %s vs %s' % (self._cur_match_debug, func_node, self._matches_inv[func_node].name, func.name) + '\n')
85 | return
86 |
87 | print((' ' * level) + func.name, '=>', func_node.name, 'from', func_node.source_binary, '(' + self._cur_match_debug + ')')
88 | self._matches[func] = func_node
89 | self._matches_inv[func_node] = func
90 |
91 | def compute_func_callees(self, func):
92 | """
93 | Return a list of the names of symbols the function calls.
94 | """
95 | callees = {}
96 | for ref in func.call_sites:
97 | callee_addrs = self.bv.get_callees(ref.address, ref.function, ref.arch)
98 | if len(callee_addrs) != 1: continue
99 | callees[ref.address - func.start] = self.bv.get_function_at(callee_addrs[0])
100 | return callees
101 |
102 | def does_func_match(self, func, func_node, visited, level=0):
103 | print((' '*level) + 'compare', 'None' if not func else func.name, 'vs', '*' if not func_node else func_node.name, 'from ' + func_node.source_binary if func_node else '')
104 | # no information about this function. assume wildcard.
105 | if func_node is None:
106 | return 999
107 |
108 | # we expect a function to be here but there isn't one. no match.
109 | if func is None:
110 | return 0
111 |
112 | # fix for msvc thunks -.-
113 | thunk_dest = self.resolve_thunk(func)
114 | if not thunk_dest:
115 | sys.stderr.write('Warning: encountered a weird thunk %s, giving up\n' % (func.name,))
116 | return 0
117 | func = thunk_dest
118 |
119 | # this is essentially a dfs on the callgraph. if we encounter a backedge,
120 | # treat it optimistically, implying that the callers match if the callees match.
121 | # however, we track our previous assumptions, meaning that if we previously
122 | # optimistically assumed b == a, then later on if we compare b and c, we say
123 | # that b != c since we already assumed b == a (and c != a)
124 | if func in visited:
125 |             print("we've already visited this one before")
126 | return 999 if visited[func] == func_node else 0
127 | visited[func] = func_node
128 |
129 | # if we've already figured out what this function is, don't waste our time doing it again.
130 | if func in self._matches:
131 | return 999 if self._matches[func] == func_node else 0
132 |
133 | func_len = sigkit.compute_sig.get_func_len(func)
134 | func_data = self.bv.read(func.start, func_len)
135 | if not func_node.is_bridge:
136 | trie_matches = self.sig_trie.find(func_data)
137 | if func_node not in trie_matches:
138 | print((' ' * level) + 'trie mismatch!')
139 | return 0
140 | else:
141 | print((' ' * level) + 'this is a bridge node.')
142 |
143 | disambiguation_data = func_data[func_node.pattern_offset:func_node.pattern_offset + len(func_node.pattern)]
144 | if not func_node.pattern.matches(disambiguation_data):
145 | print((' ' * level) + 'disambiguation mismatch!')
146 | return 1
147 |
148 | callees = self.compute_func_callees(func)
149 | for call_site in callees:
150 |             if call_site not in func_node.callees:
151 | print((' ' * level) + 'call sites mismatch!')
152 | return 2
153 | for call_site, callee in func_node.callees.items():
154 | if callee is not None and call_site not in callees:
155 | print((' ' * level) + 'call sites mismatch!')
156 | return 2
157 |
158 | for call_site in callees:
159 | if self.does_func_match(callees[call_site], func_node.callees[call_site], visited, level + 1) != 999:
160 | print((' '*level) + 'callee ' + func_node.callees[call_site].name + ' mismatch!')
161 | return 3
162 |
163 | self._cur_match_debug = 'full match'
164 | self.on_match(func, func_node, level)
165 | return 999
166 |
167 |
168 | def process_func(self, func):
169 | """
170 | Try to sig the given function.
171 | Return the list of signatures the function matched against
172 | """
173 | func_len = sigkit.compute_sig.get_func_len(func)
174 | func_data = self.bv.read(func.start, func_len)
175 | trie_matches = self.sig_trie.find(func_data)
176 | best_score, results = 0, []
177 | for candidate_func in trie_matches:
178 | score = self.does_func_match(func, candidate_func, {})
179 | if score > best_score:
180 | results = [candidate_func]
181 | best_score = score
182 | elif score == best_score:
183 | results.append(candidate_func)
184 | if len(results) == 0:
185 | print(func.name, '=>', 'no match', end=", ")
186 | for x in self.sig_trie.all_values():
187 | if x.name == func.name:
188 | print('but there was a signature from', x.source_binary)
189 | break
190 | else:
191 | print('but this is OK.')
192 | assert best_score == 0
193 | return results
194 | elif len(results) > 1:
195 | print(func.name, '=>', 'deferred at level', best_score, results)
196 | return results
197 |
198 | match = results[0]
199 | if best_score == 1:
200 | self._cur_match_debug = 'bytes match (but disambiguation mismatch?)'
201 | self.on_match(func, match)
202 | return results
203 | elif best_score == 2:
204 | self._cur_match_debug = 'bytes + disambiguation match (but callee count mismatch)'
205 | self.on_match(func, match)
206 | return results
207 | elif best_score == 3:
208 | self._cur_match_debug = 'bytes + disambiguation match (but callees mismatch)'
209 | self.on_match(func, match)
210 | return results
211 | else:
212 | self._cur_match_debug = 'full match'
213 | self.on_match(func, match)
214 | return results
215 |
216 | def run(self):
217 | queue = self.bv.functions
218 | while True: # silly fixedpoint worklist algorithm
219 | deferred = []
220 |             print('Start of pass, %d functions remaining' % (len(queue)))
221 |
222 | for func in queue:
223 | if func in self._matches:
224 | continue
225 | if sigkit.compute_sig.get_func_len(func) < 8:
226 | continue
227 | matches = self.process_func(func)
228 | if len(matches) > 1:
229 | deferred.append(func)
230 |
231 | print('Pass complete, %d functions deferred' % (len(deferred),))
232 | if len(queue) == len(deferred):
233 | print('No changes. Quit.')
234 | break
235 | queue = deferred
236 |
--------------------------------------------------------------------------------
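
A minimal, hedged sketch of driving the illustrative matcher above; `bv` is a Binary Ninja BinaryView, the .sig path is hypothetical, and importing this example file as `sig_match` assumes it is on the Python path:

```python
# Hedged sketch: run the example Python matcher against an open binary view.
import sigkit.sig_serialize_fb
from sig_match import SignatureMatcher   # hypothetical import of this example file

with open('libc.sig', 'rb') as f:        # hypothetical signature library
    sig_trie = sigkit.sig_serialize_fb.SignatureLibraryReader().deserialize(f.read())

matcher = SignatureMatcher(sig_trie, bv)
matcher.run()                            # worklist loop defined above
for func, func_node in matcher.results.items():
    print(hex(func.start), func.name, '=>', func_node.name)
```
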
/icon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Vector35/sigkit/a7420964415a875a1e6181ecdc603cfc29e34058/icon.ico
--------------------------------------------------------------------------------
/images/explorer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Vector35/sigkit/a7420964415a875a1e6181ecdc603cfc29e34058/images/explorer.png
--------------------------------------------------------------------------------
/plugin.json:
--------------------------------------------------------------------------------
1 | {
2 | "pluginmetadataversion": 2,
3 | "name": "Signature Kit Plugin",
4 | "type": [
5 | "helper",
6 | "ui",
7 | "core"
8 | ],
9 | "api": [
10 | "python2",
11 | "python3"
12 | ],
13 | "description": "Python tools for working with Signature Libraries",
14 | "license": {
15 | "name": "MIT",
16 | "text": "Copyright (c) 2019-2020 Vector 35 Inc\n\nPermission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the \"Software\"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE."
17 | },
18 | "platforms": [
19 | "Windows",
20 | "Linux",
21 | "Darwin"
22 | ],
23 | "installinstructions": {
24 | "Windows": "",
25 | "Linux": "",
26 | "Darwin": ""
27 | },
28 | "dependencies": {
29 | "pip": [
30 | "flatbuffers"
31 | ]
32 | },
33 | "version": "1.2.2",
34 | "author": "Vector 35 Inc",
35 | "minimumbinaryninjaversion": 1997
36 | }
37 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools"]
3 | build-backend = "setuptools.build_meta"
4 |
5 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | flatbuffers
2 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | name = sigkit
3 | version = 1.2.2
4 | license = "MIT"
5 | long_description = file: README.md
6 |
7 | [options]
8 | install_requires = flatbuffers
9 | packages=find:
10 |
11 |
--------------------------------------------------------------------------------
/sigkit/FlatbufSignatureLibrary/CallRef.py:
--------------------------------------------------------------------------------
1 | # automatically generated by the FlatBuffers compiler, do not modify
2 |
3 | # namespace: FlatbufSignatureLibrary
4 |
5 | import flatbuffers
6 |
7 | class CallRef(object):
8 | __slots__ = ['_tab']
9 |
10 | # CallRef
11 | def Init(self, buf, pos):
12 | self._tab = flatbuffers.table.Table(buf, pos)
13 |
14 | # CallRef
15 | def Offset(self): return self._tab.Get(flatbuffers.number_types.Int32Flags, self._tab.Pos + flatbuffers.number_types.UOffsetTFlags.py_type(0))
16 | # CallRef
17 | def DstId(self): return self._tab.Get(flatbuffers.number_types.Int32Flags, self._tab.Pos + flatbuffers.number_types.UOffsetTFlags.py_type(4))
18 |
19 | def CreateCallRef(builder, offset, dstId):
20 | builder.Prep(4, 8)
21 | builder.PrependInt32(dstId)
22 | builder.PrependInt32(offset)
23 | return builder.Offset()
24 |
--------------------------------------------------------------------------------
/sigkit/FlatbufSignatureLibrary/Function.py:
--------------------------------------------------------------------------------
1 | # automatically generated by the FlatBuffers compiler, do not modify
2 |
3 | # namespace: FlatbufSignatureLibrary
4 |
5 | import flatbuffers
6 |
7 | class Function(object):
8 | __slots__ = ['_tab']
9 |
10 | @classmethod
11 | def GetRootAsFunction(cls, buf, offset):
12 | n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
13 | x = Function()
14 | x.Init(buf, n + offset)
15 | return x
16 |
17 | # Function
18 | def Init(self, buf, pos):
19 | self._tab = flatbuffers.table.Table(buf, pos)
20 |
21 | # Function
22 | def Name(self):
23 | o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
24 | if o != 0:
25 | return self._tab.String(o + self._tab.Pos)
26 | return None
27 |
28 | # Function
29 | def SourceBinary(self):
30 | o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
31 | if o != 0:
32 | return self._tab.String(o + self._tab.Pos)
33 | return None
34 |
35 | # Function
36 | def Callees(self, j):
37 | o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
38 | if o != 0:
39 | x = self._tab.Vector(o)
40 | x += flatbuffers.number_types.UOffsetTFlags.py_type(j) * 8
41 | from .CallRef import CallRef
42 | obj = CallRef()
43 | obj.Init(self._tab.Bytes, x)
44 | return obj
45 | return None
46 |
47 | # Function
48 | def CalleesLength(self):
49 | o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
50 | if o != 0:
51 | return self._tab.VectorLen(o)
52 | return 0
53 |
54 | # Function
55 | def Pattern(self):
56 | o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10))
57 | if o != 0:
58 | x = self._tab.Indirect(o + self._tab.Pos)
59 | from .Pattern import Pattern
60 | obj = Pattern()
61 | obj.Init(self._tab.Bytes, x)
62 | return obj
63 | return None
64 |
65 | # Function
66 | def PatternOffset(self):
67 | o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(12))
68 | if o != 0:
69 | return self._tab.Get(flatbuffers.number_types.Uint32Flags, o + self._tab.Pos)
70 | return 0
71 |
72 | # Function
73 | def IsBridge(self):
74 | o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(14))
75 | if o != 0:
76 | return bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos))
77 | return False
78 |
79 | def FunctionStart(builder): builder.StartObject(6)
80 | def FunctionAddName(builder, name): builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(name), 0)
81 | def FunctionAddSourceBinary(builder, sourceBinary): builder.PrependUOffsetTRelativeSlot(1, flatbuffers.number_types.UOffsetTFlags.py_type(sourceBinary), 0)
82 | def FunctionAddCallees(builder, callees): builder.PrependUOffsetTRelativeSlot(2, flatbuffers.number_types.UOffsetTFlags.py_type(callees), 0)
83 | def FunctionStartCalleesVector(builder, numElems): return builder.StartVector(8, numElems, 4)
84 | def FunctionAddPattern(builder, pattern): builder.PrependUOffsetTRelativeSlot(3, flatbuffers.number_types.UOffsetTFlags.py_type(pattern), 0)
85 | def FunctionAddPatternOffset(builder, patternOffset): builder.PrependUint32Slot(4, patternOffset, 0)
86 | def FunctionAddIsBridge(builder, isBridge): builder.PrependBoolSlot(5, isBridge, 0)
87 | def FunctionEnd(builder): return builder.EndObject()
88 |
--------------------------------------------------------------------------------
/sigkit/FlatbufSignatureLibrary/Pattern.py:
--------------------------------------------------------------------------------
1 | # automatically generated by the FlatBuffers compiler, do not modify
2 |
3 | # namespace: FlatbufSignatureLibrary
4 |
5 | import flatbuffers
6 |
7 | class Pattern(object):
8 | __slots__ = ['_tab']
9 |
10 | @classmethod
11 | def GetRootAsPattern(cls, buf, offset):
12 | n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
13 | x = Pattern()
14 | x.Init(buf, n + offset)
15 | return x
16 |
17 | # Pattern
18 | def Init(self, buf, pos):
19 | self._tab = flatbuffers.table.Table(buf, pos)
20 |
21 | # Pattern
22 | def Data(self, j):
23 | o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
24 | if o != 0:
25 | a = self._tab.Vector(o)
26 | return self._tab.Get(flatbuffers.number_types.Uint8Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 1))
27 | return 0
28 |
29 | # Pattern
30 | def DataAsNumpy(self):
31 | o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
32 | if o != 0:
33 | return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Uint8Flags, o)
34 | return 0
35 |
36 | # Pattern
37 | def DataLength(self):
38 | o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
39 | if o != 0:
40 | return self._tab.VectorLen(o)
41 | return 0
42 |
43 | # Pattern
44 | def Mask(self, j):
45 | o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
46 | if o != 0:
47 | a = self._tab.Vector(o)
48 | return self._tab.Get(flatbuffers.number_types.Uint8Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 1))
49 | return 0
50 |
51 | # Pattern
52 | def MaskAsNumpy(self):
53 | o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
54 | if o != 0:
55 | return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Uint8Flags, o)
56 | return 0
57 |
58 | # Pattern
59 | def MaskLength(self):
60 | o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
61 | if o != 0:
62 | return self._tab.VectorLen(o)
63 | return 0
64 |
65 | def PatternStart(builder): builder.StartObject(2)
66 | def PatternAddData(builder, data): builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(data), 0)
67 | def PatternStartDataVector(builder, numElems): return builder.StartVector(1, numElems, 1)
68 | def PatternAddMask(builder, mask): builder.PrependUOffsetTRelativeSlot(1, flatbuffers.number_types.UOffsetTFlags.py_type(mask), 0)
69 | def PatternStartMaskVector(builder, numElems): return builder.StartVector(1, numElems, 1)
70 | def PatternEnd(builder): return builder.EndObject()
71 |
--------------------------------------------------------------------------------
/sigkit/FlatbufSignatureLibrary/SignatureLibrary.py:
--------------------------------------------------------------------------------
1 | # automatically generated by the FlatBuffers compiler, do not modify
2 |
3 | # namespace: FlatbufSignatureLibrary
4 |
5 | import flatbuffers
6 |
7 | class SignatureLibrary(object):
8 | __slots__ = ['_tab']
9 |
10 | @classmethod
11 | def GetRootAsSignatureLibrary(cls, buf, offset):
12 | n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
13 | x = SignatureLibrary()
14 | x.Init(buf, n + offset)
15 | return x
16 |
17 | # SignatureLibrary
18 | def Init(self, buf, pos):
19 | self._tab = flatbuffers.table.Table(buf, pos)
20 |
21 | # SignatureLibrary
22 | def Functions(self, j):
23 | o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
24 | if o != 0:
25 | x = self._tab.Vector(o)
26 | x += flatbuffers.number_types.UOffsetTFlags.py_type(j) * 4
27 | x = self._tab.Indirect(x)
28 | from .Function import Function
29 | obj = Function()
30 | obj.Init(self._tab.Bytes, x)
31 | return obj
32 | return None
33 |
34 | # SignatureLibrary
35 | def FunctionsLength(self):
36 | o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
37 | if o != 0:
38 | return self._tab.VectorLen(o)
39 | return 0
40 |
41 | # SignatureLibrary
42 | def Root(self):
43 | o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
44 | if o != 0:
45 | x = self._tab.Indirect(o + self._tab.Pos)
46 | from .TrieNode import TrieNode
47 | obj = TrieNode()
48 | obj.Init(self._tab.Bytes, x)
49 | return obj
50 | return None
51 |
52 | def SignatureLibraryStart(builder): builder.StartObject(2)
53 | def SignatureLibraryAddFunctions(builder, functions): builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(functions), 0)
54 | def SignatureLibraryStartFunctionsVector(builder, numElems): return builder.StartVector(4, numElems, 4)
55 | def SignatureLibraryAddRoot(builder, root): builder.PrependUOffsetTRelativeSlot(1, flatbuffers.number_types.UOffsetTFlags.py_type(root), 0)
56 | def SignatureLibraryEnd(builder): return builder.EndObject()
57 |
--------------------------------------------------------------------------------
/sigkit/FlatbufSignatureLibrary/TrieNode.py:
--------------------------------------------------------------------------------
1 | # automatically generated by the FlatBuffers compiler, do not modify
2 |
3 | # namespace: FlatbufSignatureLibrary
4 |
5 | import flatbuffers
6 |
7 | class TrieNode(object):
8 | __slots__ = ['_tab']
9 |
10 | @classmethod
11 | def GetRootAsTrieNode(cls, buf, offset):
12 | n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
13 | x = TrieNode()
14 | x.Init(buf, n + offset)
15 | return x
16 |
17 | # TrieNode
18 | def Init(self, buf, pos):
19 | self._tab = flatbuffers.table.Table(buf, pos)
20 |
21 | # TrieNode
22 | def PatternPrefix(self):
23 | o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
24 | if o != 0:
25 | return self._tab.Get(flatbuffers.number_types.Uint8Flags, o + self._tab.Pos)
26 | return 0
27 |
28 | # TrieNode
29 | def Pattern(self):
30 | o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
31 | if o != 0:
32 | x = self._tab.Indirect(o + self._tab.Pos)
33 | from .Pattern import Pattern
34 | obj = Pattern()
35 | obj.Init(self._tab.Bytes, x)
36 | return obj
37 | return None
38 |
39 | # TrieNode
40 | def Children(self, j):
41 | o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
42 | if o != 0:
43 | x = self._tab.Vector(o)
44 | x += flatbuffers.number_types.UOffsetTFlags.py_type(j) * 4
45 | x = self._tab.Indirect(x)
46 | from .TrieNode import TrieNode
47 | obj = TrieNode()
48 | obj.Init(self._tab.Bytes, x)
49 | return obj
50 | return None
51 |
52 | # TrieNode
53 | def ChildrenLength(self):
54 | o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
55 | if o != 0:
56 | return self._tab.VectorLen(o)
57 | return 0
58 |
59 | # TrieNode
60 | def WildcardChild(self):
61 | o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10))
62 | if o != 0:
63 | x = self._tab.Indirect(o + self._tab.Pos)
64 | from .TrieNode import TrieNode
65 | obj = TrieNode()
66 | obj.Init(self._tab.Bytes, x)
67 | return obj
68 | return None
69 |
70 | # TrieNode
71 | def Functions(self, j):
72 | o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(12))
73 | if o != 0:
74 | a = self._tab.Vector(o)
75 | return self._tab.Get(flatbuffers.number_types.Uint32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4))
76 | return 0
77 |
78 | # TrieNode
79 | def FunctionsAsNumpy(self):
80 | o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(12))
81 | if o != 0:
82 | return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Uint32Flags, o)
83 | return 0
84 |
85 | # TrieNode
86 | def FunctionsLength(self):
87 | o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(12))
88 | if o != 0:
89 | return self._tab.VectorLen(o)
90 | return 0
91 |
92 | def TrieNodeStart(builder): builder.StartObject(5)
93 | def TrieNodeAddPatternPrefix(builder, patternPrefix): builder.PrependUint8Slot(0, patternPrefix, 0)
94 | def TrieNodeAddPattern(builder, pattern): builder.PrependUOffsetTRelativeSlot(1, flatbuffers.number_types.UOffsetTFlags.py_type(pattern), 0)
95 | def TrieNodeAddChildren(builder, children): builder.PrependUOffsetTRelativeSlot(2, flatbuffers.number_types.UOffsetTFlags.py_type(children), 0)
96 | def TrieNodeStartChildrenVector(builder, numElems): return builder.StartVector(4, numElems, 4)
97 | def TrieNodeAddWildcardChild(builder, wildcardChild): builder.PrependUOffsetTRelativeSlot(3, flatbuffers.number_types.UOffsetTFlags.py_type(wildcardChild), 0)
98 | def TrieNodeAddFunctions(builder, functions): builder.PrependUOffsetTRelativeSlot(4, flatbuffers.number_types.UOffsetTFlags.py_type(functions), 0)
99 | def TrieNodeStartFunctionsVector(builder, numElems): return builder.StartVector(4, numElems, 4)
100 | def TrieNodeEnd(builder): return builder.EndObject()
101 |
--------------------------------------------------------------------------------
/sigkit/FlatbufSignatureLibrary/__init__.py:
--------------------------------------------------------------------------------
1 | import flatbuffers
2 |
3 | if hasattr(flatbuffers, "__version__"):
4 | saved_EndVector = flatbuffers.Builder.EndVector
5 | flatbuffers.Builder.EndVector = lambda self, *args: saved_EndVector(self)  # flatbuffers >= 2.0 removed EndVector's element-count argument; wrap it so the count passed by the older-style generated code is ignored
--------------------------------------------------------------------------------
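The generated accessor classes above can also be driven directly, without the higher-level reader in sig_serialize_fb.py further down. The following is an illustrative sketch, not part of the repository: it assumes the .sig container format defined in sig_serialize_fb.py (a 4-byte b'BNSG' magic, one version byte, then a zlib-compressed FlatBuffer), and 'libc.sig' is a placeholder path. Note that importing anything under the `sigkit` package pulls in the Binary Ninja API via `sigkit/__init__.py`.

    import zlib

    from sigkit.FlatbufSignatureLibrary.SignatureLibrary import SignatureLibrary

    def count_trie_nodes(node):
        # Count this node plus all of its concrete children and the wildcard child.
        total = 1
        for i in range(node.ChildrenLength()):
            total += count_trie_nodes(node.Children(i))
        wildcard = node.WildcardChild()
        if wildcard is not None:
            total += count_trie_nodes(wildcard)
        return total

    with open('libc.sig', 'rb') as f:   # placeholder filename
        raw = f.read()
    assert raw[:4] == b'BNSG'           # container magic (see sig_serialize_fb.py)
    buf = zlib.decompress(raw[5:])      # skip magic plus the 1-byte version

    lib = SignatureLibrary.GetRootAsSignatureLibrary(buf, 0)
    print('functions:', lib.FunctionsLength())
    print('trie nodes:', count_trie_nodes(lib.Root()))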
/sigkit/__init__.py:
--------------------------------------------------------------------------------
1 | from binaryninja import *
2 |
3 | # exports
4 | from . import trie_ops
5 | from . import sig_serialize_fb
6 | from . import sig_serialize_json
7 |
8 | from .signaturelibrary import TrieNode, FunctionNode, Pattern, MaskedByte, new_trie
9 | from .compute_sig import process_function as generate_function_signature
10 |
11 | if core_ui_enabled():
12 | from .sigexplorer import explore_signature_library
13 | import binaryninjaui
14 |
15 | def signature_explorer(prompt=True):
16 | """
17 | Open the signature explorer UI.
18 | :param prompt: if True, prompt the user to open a file immediately.
19 | :return: `App`, a Qt window
20 | """
21 | if "qt_major_version" in binaryninjaui.__dict__ and binaryninjaui.qt_major_version == 6:
22 | from PySide6.QtWidgets import QApplication
23 | else:
24 | from PySide2.QtWidgets import QApplication
25 | app = QApplication.instance()
26 | global widget # avoid lifetime issues from it falling out of scope
27 | widget = sigexplorer.App()
28 | if prompt:
29 | widget.open_file()
30 | widget.show()
31 | if app: # VERY IMPORTANT to avoid lifetime issues???
32 | app.exec_()
33 | return widget
34 |
35 |
36 | # UI plugin code
37 | def _generate_signature_library(bv):
38 | guess_relocs = len(bv.relocation_ranges) == 0
39 | if guess_relocs:
40 | log.log_debug('Relocation information unavailable; choosing pattern masks heuristically')
41 | else:
42 | log.log_debug('Generating pattern masks based on relocation ranges')
43 |
44 | func_count = sum(map(lambda func: int(bool(bv.get_symbol_at(func.start))), bv.functions))
45 | log.log_info('Generating signatures for %d functions' % (func_count,))
46 | # Warning for usability purposes. Someone will be confused why it's skipping auto-named functions
47 | if func_count / float(len(bv.functions)) < 0.5:
48 | num_skipped = len(bv.functions) - func_count
49 | log.log_warn("%d functions that don't have a name or symbol will be skipped" % (num_skipped,))
50 |
51 | funcs = {}
52 | for func in bv.functions:
53 | if bv.get_symbol_at(func.start) is None: continue
54 | func_node, info = generate_function_signature(func, guess_relocs)
55 | if func_node and info:
56 | funcs[func_node] = info
57 | log.log_debug('Processed ' + func.name)
58 |
59 |
60 | log.log_debug('Constructing signature trie')
61 | trie = signaturelibrary.new_trie()
62 | trie_ops.trie_insert_funcs(trie, funcs)
63 | log.log_debug('Finalizing trie')
64 | trie_ops.finalize_trie(trie, funcs)
65 |
66 |
67 | if 'SIGNATURE_FILE_NAME' in bv.session_data:
68 | output_filename = bv.session_data['SIGNATURE_FILE_NAME']
69 | else:
70 | output_filename = get_save_filename_input("Filename:", "*.sig", bv.file.filename + '.sig')
71 | if not output_filename:
72 | log.log_debug('Save cancelled')
73 | return
74 | if isinstance(output_filename, bytes):
75 | output_filename = output_filename.decode('utf-8')
76 | buf = sig_serialize_fb.SignatureLibraryWriter().serialize(trie)
77 | with open(output_filename, 'wb') as f:
78 | f.write(buf)
79 | log.log_info('Saved to ' + output_filename)
80 |
81 | PluginCommand.register(
82 | "Signature Library\\Generate Signature Library",
83 | "Create a Signature Library that the Signature Matcher can use to locate functions.",
84 | _generate_signature_library
85 | )
86 |
87 | PluginCommand.register(
88 | "Signature Library\\Explore Signature Library",
89 | "View a Signature Library's contents in a graphical interface.",
90 | lambda bv: signature_explorer()
91 | )
92 |
--------------------------------------------------------------------------------
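For scripted or batch use outside the UI, the same flow as `_generate_signature_library` above can be driven headlessly. This is a rough sketch, not part of the repository: it assumes a Binary Ninja installation with the headless Python API (`binaryninja.load` on recent versions; older versions use `BinaryViewType.get_view_of_file`), and the file paths are placeholders.

    import binaryninja
    import sigkit

    bv = binaryninja.load('/path/to/libfoo.so.6')   # placeholder input binary
    bv.update_analysis_and_wait()

    # Mirror the plugin command: fall back to the IL heuristic when the view
    # carries no relocation information.
    guess_relocs = len(bv.relocation_ranges) == 0

    funcs = {}
    for func in bv.functions:
        if bv.get_symbol_at(func.start) is None:
            continue  # unnamed functions are skipped, just like in the plugin
        func_node, info = sigkit.generate_function_signature(func, guess_relocs)
        if func_node and info:
            funcs[func_node] = info

    trie = sigkit.new_trie()
    sigkit.trie_ops.trie_insert_funcs(trie, funcs)
    sigkit.trie_ops.finalize_trie(trie, funcs)

    with open('libfoo.sig', 'wb') as f:             # placeholder output path
        f.write(sigkit.sig_serialize_fb.dumps(trie))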
/sigkit/compute_sig.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2015-2020 Vector 35 Inc
2 | #
3 | # Permission is hereby granted, free of charge, to any person obtaining a copy
4 | # of this software and associated documentation files (the "Software"), to
5 | # deal in the Software without restriction, including without limitation the
6 | # rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
7 | # sell copies of the Software, and to permit persons to whom the Software is
8 | # furnished to do so, subject to the following conditions:
9 | #
10 | # The above copyright notice and this permission notice shall be included in
11 | # all copies or substantial portions of the Software.
12 | #
13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
18 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
19 | # IN THE SOFTWARE.
20 |
21 | """
22 | This package contains code to compute function signatures using Binary
23 | Ninja's Python API. The most useful function is `process_function`, which
24 | generates a function signature for the specified function.
25 | """
26 |
27 | from binaryninja import *
28 |
29 | from . import signaturelibrary
30 | from . import trie_ops
31 |
32 | def is_llil_relocatable(llil):
33 | """
34 | Guesses whether a LLIL instruction is likely to contain operands that have been or would be relocated by a linker.
35 | :param llil: the llil instruction
36 | :return: true if the LLIL instruction contains LLIL_CONST_PTR or LLIL_EXTERN_PTR.
37 | """
38 | if not isinstance(llil, LowLevelILInstruction):
39 | return False
40 | if llil.operation in [LowLevelILOperation.LLIL_CONST_PTR, LowLevelILOperation.LLIL_EXTERN_PTR]:
41 | return True
42 | for operand in llil.operands:
43 | if is_llil_relocatable(operand):
44 | return True
45 | return False
46 |
47 | def guess_relocations_mask(func, sig_length):
48 | """
49 | Compute the relocations mask on a best-efforts basis using a heuristic based on the LLIL.
50 | :param func: BinaryNinja api function
51 | :param sig_length: how long the mask should be
52 | :return: an array of booleans, signifying whether the byte at each index is significant or not for matching
53 | """
54 |
55 | mask = [False] * sig_length
56 | i = 0
57 | while i < sig_length:
58 | bb = func.get_basic_block_at(func.start + i)
59 | if not bb: # not in a basicblock; wildcard
60 | mask[i] = False
61 | i += 1
62 | continue
63 |
64 | bb._buildStartCache()
65 | if not bb._instLengths:
66 | i += 1
67 | continue
68 | for insn_len in bb._instLengths:
69 | # This throws an exception for large functions where you need to manually force analysis
70 | try:
71 | llil = func.get_low_level_il_at(func.start + i, bb.arch)
72 | except exceptions.ILException:
73 | log_warn(f"Skipping function at {hex(func.start)}. You need to force the analysis of this function.")
74 | return None
75 |
76 | insn_mask = not is_llil_relocatable(llil)
77 | # if not insn_mask:
78 | # func.set_auto_instr_highlight(func.start + i, HighlightStandardColor.BlueHighlightColor)
79 | mask[i:min(i + insn_len, sig_length)] = [insn_mask] * min(insn_len, sig_length - i)
80 | i += insn_len
81 | if i >= sig_length: break
82 | return mask
83 |
84 | def find_relocation(func, start, end):
85 | """
86 | Finds the relocation from `start` to `end`. If `start` == `end`, the range is expanded to the boundaries of the enclosing instruction.
87 | :param func: function start and end are contained in
88 | :param start: start address
89 | :param end: end address
90 | :return: corrected start address and length of the relocation
91 | """
92 |
93 | if end != start: # relocation isn't stupid
94 | return start, end - start
95 | # relocation is stupid (start==end), so just expand to the whole instruction
96 | bb = func.get_basic_block_at(start)
97 | if not bb: # not in a basicblock, don't care.
98 | return None, None
99 | bb._buildStartCache()
100 | for i, insn_start in enumerate(bb._instStarts):
101 | insn_end = insn_start + bb._instLengths[i]
102 | if (insn_start < end and start < insn_end) or (start == end and insn_start <= start < insn_end):
103 | return insn_start, bb._instLengths[i]
104 |
105 | def relocations_mask(func, sig_length):
106 | """
107 | Compute the relocations mask based on the relocation metadata contained within the binary.
108 | :param func: BinaryNinja api function
109 | :param sig_length: how long the mask should be
110 | :return: an array of booleans, signifying whether the byte at each index is significant or not for matching
111 | """
112 |
113 | mask = [True] * sig_length
114 | for start, end in func.view.relocation_ranges:
115 | if start > func.start + sig_length or end < func.start: continue
116 | reloc_start, reloc_len = find_relocation(func, start, end)
117 | if reloc_start is None: continue # not in a basicblock, don't care.
118 | reloc_start -= func.start
119 | if reloc_start < 0:
120 | reloc_len = reloc_len + reloc_start
121 | reloc_start = 0
122 | if reloc_len <= 0: continue
123 | mask[reloc_start:reloc_start + reloc_len] = [False] * reloc_len
124 |
125 | in_block = [False] * sig_length
126 | for bb in func.basic_blocks:
127 | bb_start_offset = bb.start - func.start
128 | bb_end_offset = bb_start_offset + get_bb_len(bb)
129 | if bb_start_offset > sig_length or bb.start < func.start: continue
130 | in_block[bb_start_offset:min(bb_end_offset, sig_length)] = [True] * min(get_bb_len(bb), sig_length - bb_start_offset)
131 |
132 | mask = [a and b for a,b in zip(mask, in_block)]
133 | return mask
134 |
135 | def get_bb_len(bb):
136 | """
137 | Calculate the length of the basicblock, taking into account weird cases like the block ending with an illegal instruction
138 | :param bb: BinaryNinja api basic block
139 | :return: length of the basic block in bytes
140 | """
141 | if bb.has_invalid_instructions:
142 | log.log_warn("Basic block with illegal instructions in " + bb.function.name)
143 | # stupid ugly HACK to deal with illegal instructions after noreturns that aren't marked noreturn
144 | bb._buildStartCache()
145 | if not bb._instLengths: return 0
146 | return bb._instLengths[-1] + bb._instStarts[-1]
147 | else:
148 | return bb.end - bb.start
149 |
150 | def get_func_len(func):
151 | """
152 | Calculates the length of the function based on the linear addresses of basic blocks.
153 | The length is truncated so that it never lies outside of the underlying binaryview.
154 | :param func: BinaryNinja api function
155 | :return: the distance to the end of the farthest instruction contained within this function
156 | """
157 | return min(max(map(lambda bb: bb.start + get_bb_len(bb) - func.start, func.basic_blocks)), func.view.end - func.start)
158 |
159 | def compute_callees(func):
160 | """
161 | Callees are a map of {offset: dest}, where func+offset is the address of a MLIL_CALL (or similar) instruction.
162 | Sometimes our version of a function has MORE calls than the signature's. This happens when a library is optionally
163 | linked in: if it isn't, those calls turn into stubs (e.g., `jump 0x0`). We therefore make such callees wildcards
164 | when we finalize the trie and resolve references, and the matching algorithm treats calls to a wildcard callee
165 | as optional.
166 | :param func: BinaryNinja api function
167 | :return: dictionary of {offset: (destination symbol name, `SymbolType`)}
168 | """
169 | bv = func.view
170 | callees = {}
171 | for ref in func.call_sites:
172 | callee_addrs = bv.get_callees(ref.address, ref.function, ref.arch)
173 | if len(callee_addrs) != 1: continue
174 | sym = bv.get_symbol_at(callee_addrs[0])
175 | if sym is None: continue
176 | callees[ref.address - func.start] = (sym.name, sym.type)
177 | return callees
178 |
179 | def function_pattern(func, guess_relocs, sig_length=None):
180 | """
181 | Computes a data and mask for the specified function `func` that can be used to identify this function.
182 | For example, a function may look like:
183 |
184 | 0: 53 push rbx
185 | 1: 8d 77 05 lea esi, [rdi+5]
186 | 4: bf a0 07 40 00 mov edi,0x4007a0
187 | 9: 31 c0 xor eax,eax
188 |
189 | In this case, because they constitute a pointer, bytes 5-8 are liable to change when this binary is recompiled or linked.
190 | Thus, we would like to wildcard those bytes out and ignore them while matching.
191 | An appropriate function pattern may look like: 53 8d 77 05 bf ?? ?? ?? ?? 31 c0
192 | The pattern data is the sequence of bytes in the pattern and the mask is an array which specifies which bytes are not wildcards.
193 | For example, the data would be b'\x53\x8d\x77\x05\xbf\x00\x00\x00\x00\x31\xc0' and the mask would be [1,1,1,1,1,0,0,0,0,1,1].
194 |
195 | This function is responsible for computing that data and that mask based on the information available in the binaryview.
196 |
197 | :param func: BinaryNinja api function
198 | :param guess_relocs: if False, assume relocation information is available for calculating the mask. otherwise,
199 | guess the relocation mask based on the IL.
200 | :param sig_length: the maximum length of the signature. If None, try to calculate it based on basic block addresses.
201 | :return: list of MaskedByte
202 | """
203 |
204 | if sig_length is None:
205 | sig_length = min(get_func_len(func), 1000)
206 |
207 | if guess_relocs:
208 | mask = guess_relocations_mask(func, sig_length)
209 | else:
210 | mask = relocations_mask(func, sig_length)
211 | if not mask:
212 | return None
213 | mask = list(map(int, mask)) # bool to int
214 | data = b''
215 | i = 0
216 | while i < len(mask) and func.start + i < func.view.end:
217 | if mask[i]:
218 | next_byte = func.view.read(func.start + i, 1)
219 | if len(next_byte) != 1: break # end of bv
220 | data += next_byte
221 | else:
222 | data += b'\x00'
223 | i += 1
224 | if len(data) < len(mask):
225 | mask = mask[:len(data)]
226 | assert len(data) == len(mask)
227 | while len(mask) and not mask[-1]:
228 | data = data[:len(data) - 1]
229 | mask = mask[:len(mask) - 1]
230 | return signaturelibrary.Pattern(data,mask)
231 |
232 | def process_function(func, guess_relocs):
233 | """
234 | Generates a signature for a given function.
235 | This signature can be thought of as a semi-unique fingerprint that is able to match copies of this function
236 | found in other binaries.
237 |
238 | :param func: BinaryNinja api function
239 | :param guess_relocs: if False, assume relocation information is available for calculating the mask. otherwise,
240 | guess the relocation mask based on the IL.
241 | :return: tuple of (FunctionNode, FunctionInfo)
242 | """
243 |
244 | func_node = signaturelibrary.FunctionNode(func.name)
245 | func_node.source_binary = func.view.file.filename
246 |
247 | info = signaturelibrary.FunctionInfo()
248 | function_pattern_val = function_pattern(func, guess_relocs)
249 | if not function_pattern_val:
250 | return None, None
251 | info.patterns = [function_pattern_val]
252 | info.callees = compute_callees(func)
253 | if hasattr(func.symbol, 'aliases'):
254 | info.aliases = list(map(lambda s: s.decode('utf-8'), func.symbol.aliases))
255 | else:
256 | info.aliases = []
257 | return func_node, info
258 |
--------------------------------------------------------------------------------
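The data/mask convention documented in `function_pattern` above is easy to see in isolation. Below is a dependency-free illustration, not part of the repository, using the docstring's own example bytes; it mirrors the trailing-wildcard trimming done at the end of that function and the `??` rendering used by `Pattern`/`MaskedByte`.

    # Wildcarded positions are zeroed in `data` and flagged 0 in `mask`.
    data = b'\x53\x8d\x77\x05\xbf\x00\x00\x00\x00\x31\xc0\x00\x00'
    mask = [1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0]

    # Trim trailing wildcards, as function_pattern() does before building a Pattern.
    while mask and not mask[-1]:
        data = data[:-1]
        mask = mask[:-1]

    # Render in the textual form used throughout sigkit: '??' marks a wildcard byte.
    print(''.join('%02x' % b if m else '??' for b, m in zip(data, mask)))
    # -> 538d7705bf????????31c0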
/sigkit/sig_serialize_fb.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2015-2020 Vector 35 Inc
2 | #
3 | # Permission is hereby granted, free of charge, to any person obtaining a copy
4 | # of this software and associated documentation files (the "Software"), to
5 | # deal in the Software without restriction, including without limitation the
6 | # rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
7 | # sell copies of the Software, and to permit persons to whom the Software is
8 | # furnished to do so, subject to the following conditions:
9 | #
10 | # The above copyright notice and this permission notice shall be included in
11 | # all copies or substantial portions of the Software.
12 | #
13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
18 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
19 | # IN THE SOFTWARE.
20 |
21 | """
22 | Flatbuffers serialization / deserialization
23 | """
24 |
25 | import zlib
26 | import flatbuffers
27 |
28 | from . import signaturelibrary
29 | from .FlatbufSignatureLibrary import CallRef as FlatBufCallRef
30 | from .FlatbufSignatureLibrary import Function as FlatBufFunction
31 | from .FlatbufSignatureLibrary import Pattern as FlatBufPattern
32 | from .FlatbufSignatureLibrary import SignatureLibrary as FlatBufSignatureLibrary
33 | from .FlatbufSignatureLibrary import TrieNode as FlatBufTrieNode
34 |
35 | SIG_FORMAT_MAGIC = b'BNSG'
36 | SIG_FORMAT_VERSION = 1
37 |
38 | class SignatureLibraryWriter(object):
39 | """
40 | Serializes signature libraries to a compressed Flatbuffer format usable by Binary Ninja.
41 | """
42 | def __init__(self, include_source=False):
43 | self.builder = flatbuffers.Builder(4096)
44 | self.func_node_ids = {None: -1}
45 | self._bytes_cache = {}
46 | self._str_cache = {}
47 | self._pattern_cache = {}
48 | self.include_source = include_source
49 |
50 | def _serialize_bytes(self, buf):
51 | if buf not in self._bytes_cache:
52 | self._bytes_cache[buf] = self.builder.CreateByteVector(buf)
53 | return self._bytes_cache[buf]
54 |
55 | def _serialize_string(self, s):
56 | if s not in self._str_cache:
57 | self._str_cache[s] = self.builder.CreateString(s)
58 | return self._str_cache[s]
59 |
60 | def _serialize_pattern_mask(self, mask):
61 | mask = bytearray(mask)
62 | packed = bytearray((len(mask) + 7) // 8)
63 | for i in range(len(mask)):
64 | packed[i // 8] |= mask[i] << (i % 8)
65 | packed = bytes(packed)
66 | return self._serialize_bytes(packed)
67 |
68 | def _serialize_pattern(self, pattern):
69 | if pattern not in self._pattern_cache:
70 | data = self._serialize_bytes(bytes(bytearray(pattern.data())))
71 | mask = self._serialize_pattern_mask(bytes(bytearray(pattern.mask())))
72 | FlatBufPattern.PatternStart(self.builder)
73 | FlatBufPattern.PatternAddData(self.builder, data)
74 | FlatBufPattern.PatternAddMask(self.builder, mask)
75 | self._pattern_cache[pattern] = FlatBufPattern.PatternEnd(self.builder)
76 | return self._pattern_cache[pattern]
77 |
78 | def _serialize_func_node(self, func_node):
79 | func_name = self._serialize_string(func_node.name)
80 | if self.include_source and func_node.source_binary:
81 | source_binary = self._serialize_string(func_node.source_binary)
82 | else:
83 | source_binary = None
84 |
85 | if func_node.callees:
86 | FlatBufFunction.FunctionStartCalleesVector(self.builder, len(func_node.callees))
87 | for call_site, callee in reversed(sorted(func_node.callees.items())): # this needs reversed() because we build flatbuffers by prepending
88 | FlatBufCallRef.CreateCallRef(self.builder, call_site, self.func_node_ids[callee])
89 | callees = self.builder.EndVector(len(func_node.callees))
90 | else:
91 | callees = None
92 |
93 | if func_node.pattern:
94 | pattern = self._serialize_pattern(func_node.pattern)
95 | else:
96 | pattern = None
97 |
98 | FlatBufFunction.FunctionStart(self.builder)
99 | if func_name:
100 | FlatBufFunction.FunctionAddName(self.builder, func_name)
101 | if source_binary:
102 | FlatBufFunction.FunctionAddSourceBinary(self.builder, source_binary)
103 | if callees:
104 | FlatBufFunction.FunctionAddCallees(self.builder, callees)
105 | if func_node.is_bridge:
106 | FlatBufFunction.FunctionAddIsBridge(self.builder, func_node.is_bridge)
107 | if pattern:
108 | FlatBufFunction.FunctionAddPattern(self.builder, pattern)
109 | FlatBufFunction.FunctionAddPatternOffset(self.builder, func_node.pattern_offset)
110 | return FlatBufFunction.FunctionEnd(self.builder)
111 |
112 | def _serialize_trie_node(self, trie_node, key=None):
113 | pattern = self._serialize_pattern(trie_node.pattern)
114 | if trie_node.children:
115 | children_offs = [self._serialize_trie_node(v, k.value) for k, v in sorted(trie_node.children.items()) if k != signaturelibrary.MaskedByte.wildcard]
116 | FlatBufTrieNode.TrieNodeStartChildrenVector(self.builder, len(children_offs))
117 | for off in reversed(children_offs): # this needs reversed() because we build flatbuffers by prepending
118 | self.builder.PrependUOffsetTRelative(off)
119 | children = self.builder.EndVector(len(children_offs))
120 | if signaturelibrary.MaskedByte.wildcard in trie_node.children:
121 | wildcard_child = self._serialize_trie_node(trie_node.children[signaturelibrary.MaskedByte.wildcard])
122 | else:
123 | wildcard_child = None
124 | else:
125 | wildcard_child = None
126 | children = None
127 | if trie_node.value:
128 | FlatBufTrieNode.TrieNodeStartFunctionsVector(self.builder, len(trie_node.value))
129 | for f in reversed(trie_node.value): # this needs reversed() because we build flatbuffers by prepending
130 | self.builder.PrependUint32(self.func_node_ids[f])
131 | functions = self.builder.EndVector(len(trie_node.value))
132 | else:
133 | functions = None
134 |
135 | FlatBufTrieNode.TrieNodeStart(self.builder)
136 | if key is not None: # what about duplicate between 0 and wildcard...?
137 | assert type(key) == int and 0 <= key <= 255
138 | assert trie_node.pattern[0].mask == 1 and key == trie_node.pattern[0].value
139 | FlatBufTrieNode.TrieNodeAddPatternPrefix(self.builder, key)
140 | FlatBufTrieNode.TrieNodeAddPattern(self.builder, pattern)
141 | if children:
142 | FlatBufTrieNode.TrieNodeAddChildren(self.builder, children)
143 | if wildcard_child:
144 | FlatBufTrieNode.TrieNodeAddWildcardChild(self.builder, wildcard_child)
145 | if functions:
146 | FlatBufTrieNode.TrieNodeAddFunctions(self.builder, functions)
147 | return FlatBufTrieNode.TrieNodeEnd(self.builder)
148 |
149 | def serialize(self, sig_trie):
150 | """
151 | Creates a new Flatbuffer and serializes the specified signature trie to it.
152 | Returns a binary signature library ready for use with Binary Ninja.
153 | :param sig_trie: `TrieNode` object
154 | :return: bytes-like object
155 | """
156 | # Enforce ordering to make the traversal order consistent
157 | for n in sig_trie.all_nodes():
158 | if n.value:
159 | n.value = list(sorted(n.value, key=lambda func_node: func_node.source_binary + '!' + func_node.name))
160 |
161 | func_nodes = []
162 | def visit(func_node):
163 | if func_node in self.func_node_ids: return
164 | self.func_node_ids[func_node] = len(func_nodes)
165 | func_nodes.append(func_node)
166 | for k, f in sorted(func_node.callees.items()): visit(f)
167 | for f in sig_trie.all_values(): visit(f)
168 |
169 | func_nodes = [self._serialize_func_node(f) for f in reversed(func_nodes)] # this needs reversed() because we build flatbuffers by prepending
170 | FlatBufSignatureLibrary.SignatureLibraryStartFunctionsVector(self.builder, len(func_nodes))
171 | for off in func_nodes:
172 | self.builder.PrependUOffsetTRelative(off)
173 | functions = self.builder.EndVector(len(func_nodes))
174 |
175 | root = self._serialize_trie_node(sig_trie)
176 |
177 | FlatBufSignatureLibrary.SignatureLibraryStart(self.builder)
178 | FlatBufSignatureLibrary.SignatureLibraryAddFunctions(self.builder, functions)
179 | FlatBufSignatureLibrary.SignatureLibraryAddRoot(self.builder, root)
180 | off = FlatBufSignatureLibrary.SignatureLibraryEnd(self.builder)
181 | self.builder.Finish(off)
182 |
183 | return SIG_FORMAT_MAGIC + bytes(bytearray([SIG_FORMAT_VERSION])) + zlib.compress(bytes(self.builder.Output()))
184 |
185 | class SignatureLibraryReader(object):
186 | """
187 | Parses and loads compressed Flatbuffer signature libraries.
188 | """
189 | def __init__(self):
190 | self.funcs = []
191 |
192 | def _deserialize_pattern(self, serialized):
193 | # we cannot use DataAsNumpy as we don't depend on numpy
194 | data = bytes(bytearray([serialized.Data(i) for i in range(serialized.DataLength())]))
195 |
196 | mask = []
197 | for i in range(serialized.MaskLength()):
198 | b = serialized.Mask(i)
199 | for j in range(8):
200 | mask.append((b >> j) & 1)
201 | if len(mask) == len(data): break
202 |
203 | return signaturelibrary.Pattern(data, mask)
204 |
205 | def _deserialize_func_node(self, serialized):
206 | func_node = signaturelibrary.FunctionNode(serialized.Name().decode('utf-8'))
207 | if serialized.SourceBinary():
208 | func_node.source_binary = serialized.SourceBinary().decode('utf-8')
209 | # func_node.is_bridge = serialized.IsBridge()
210 | if serialized.Pattern():
211 | func_node.pattern = self._deserialize_pattern(serialized.Pattern())
212 | func_node.pattern_offset = serialized.PatternOffset()
213 | return func_node
214 |
215 | def _deserialize_trie_node(self, serialized):
216 | children = {}
217 | prev = float('-inf')
218 | for i in range(serialized.ChildrenLength()):
219 | child = serialized.Children(i)
220 | children[signaturelibrary.MaskedByte.new(child.PatternPrefix(), 1)] = self._deserialize_trie_node(child)
221 | assert child.PatternPrefix() >= prev # assert sorted
222 | prev = child.PatternPrefix()
223 | wildcard = serialized.WildcardChild()
224 | if wildcard:
225 | children[signaturelibrary.MaskedByte.wildcard] = self._deserialize_trie_node(wildcard)
226 | funcs = []
227 | for i in range(serialized.FunctionsLength()):
228 | funcs.append(self.funcs[serialized.Functions(i)])
229 | pattern = self._deserialize_pattern(serialized.Pattern())
230 | return signaturelibrary.TrieNode(pattern, children, funcs)
231 |
232 | def deserialize(self, buf):
233 | """
234 | Loads a signature library from an in-memory buffer.
235 | This implementation is extremely inefficient! Use it for debugging and signature library generation only.
236 | :param buf: bytes-like object
237 | :return: root `TrieNode` of the signature library
238 | """
239 | if buf[0:4] != b'BNSG':
240 | raise RuntimeError('invalid signature library magic')
241 | if ord(buf[4:5]) != SIG_FORMAT_VERSION:
242 | raise RuntimeError('signature version mismatch: got %d, expected %d' % (ord(buf[4:5]), SIG_FORMAT_VERSION))
243 | buf = zlib.decompress(buf[5:])
244 | serialized = FlatBufSignatureLibrary.SignatureLibrary.GetRootAsSignatureLibrary(buf, 0)
245 | funcs_serialized = []
246 | for i in range(serialized.FunctionsLength()):
247 | f = serialized.Functions(i)
248 | funcs_serialized.append(f)
249 | self.funcs.append(self._deserialize_func_node(f))
250 | for i, f in enumerate(funcs_serialized): # link callgraph
251 | callees = {}
252 | prev = float('-inf')
253 | for j in range(f.CalleesLength()):
254 | callsite = f.Callees(j)
255 | callees[callsite.Offset()] = None if callsite.DstId() == -1 else self.funcs[callsite.DstId()]
256 | assert callsite.Offset() >= prev # assert sorted
257 | prev = callsite.Offset()
258 | self.funcs[i].callees = callees
259 |
260 | trie = self._deserialize_trie_node(serialized.Root())
261 | for func in trie.all_values(): # recalculate refcounts
262 | func.ref_count += 1
263 | return trie
264 |
265 |
266 | def dumps(sig_trie, **kwargs):
267 | return SignatureLibraryWriter(**kwargs).serialize(sig_trie)
268 |
269 | def dump(sig_trie, fp, **kwargs):
270 | fp.write(dumps(sig_trie, **kwargs))
271 |
272 | def loads(serialized):
273 | return SignatureLibraryReader().deserialize(serialized)
274 |
275 | def load(fp):
276 | return loads(fp.read())
277 |
--------------------------------------------------------------------------------
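A detail worth calling out from the writer/reader pair above is the mask encoding: `_serialize_pattern_mask` packs one bit per pattern byte, least-significant bit first within each packed byte, and `_deserialize_pattern` unpacks it again. A small standalone illustration, not part of the repository:

    mask = [1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1]

    # Pack: one bit per pattern byte, LSB first within each packed byte.
    packed = bytearray((len(mask) + 7) // 8)
    for i, m in enumerate(mask):
        packed[i // 8] |= m << (i % 8)
    print(packed.hex())  # -> '1f06'

    # Unpack, stopping once the original length is reached (the reader does the same).
    unpacked = []
    for b in packed:
        for j in range(8):
            unpacked.append((b >> j) & 1)
            if len(unpacked) == len(mask):
                break
    assert unpacked == mask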
/sigkit/sig_serialize_json.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2015-2020 Vector 35 Inc
2 | #
3 | # Permission is hereby granted, free of charge, to any person obtaining a copy
4 | # of this software and associated documentation files (the "Software"), to
5 | # deal in the Software without restriction, including without limitation the
6 | # rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
7 | # sell copies of the Software, and to permit persons to whom the Software is
8 | # furnished to do so, subject to the following conditions:
9 | #
10 | # The above copyright notice and this permission notice shall be included in
11 | # all copies or substantial portions of the Software.
12 | #
13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
18 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
19 | # IN THE SOFTWARE.
20 |
21 | """
22 | JSON serialization / deserialization
23 | """
24 |
25 | import json
26 |
27 | from . import signaturelibrary
28 |
29 | def _serialize_func_node(func_node, func_node_ids):
30 | return {
31 | 'name': func_node.name,
32 | 'source_binary': func_node.source_binary,
33 | 'pattern': str(func_node.pattern),
34 | 'pattern_offset': func_node.pattern_offset,
35 | 'callees': {str(call_site): func_node_ids[callee] for call_site, callee in func_node.callees.items()},
36 | 'is_bridge': func_node.is_bridge
37 | }
38 |
39 | def _serialize_trie_node(trie_node, func_node_ids):
40 | children = {str(k) : _serialize_trie_node(v, func_node_ids) for k, v in trie_node.children.items()}
41 | if trie_node.value:
42 | functions = [func_node_ids[f] for f in trie_node.value]
43 | else:
44 | functions = []
45 | return {
46 | 'pattern': str(trie_node.pattern),
47 | 'children': children,
48 | 'functions': functions,
49 | }
50 |
51 | def serialize(sig_trie):
52 | """
53 | Serialize a signature trie to a JSON-compatible format.
54 | :param sig_trie: `TrieNode` object
55 | :return: a python dictionary ready for serialization as JSON
56 | """
57 | func_nodes = []
58 | func_node_ids = {None: -1}
59 | def visit(func_node):
60 | if func_node in func_node_ids: return
61 | func_node_ids[func_node] = len(func_nodes)
62 | func_nodes.append(func_node)
63 | for f in func_node.callees.values(): visit(f)
64 | for f in sig_trie.all_values(): visit(f)
65 |
66 | return {
67 | 'functions': [_serialize_func_node(f, func_node_ids, ) for f in func_nodes],
68 | 'trie': _serialize_trie_node(sig_trie, func_node_ids)
69 | }
70 |
71 | def _deserialize_pattern(serialized):
72 | return signaturelibrary.Pattern.from_str(serialized)
73 |
74 |
75 | def _deserialize_func_node(serialized):
76 | func_node = signaturelibrary.FunctionNode(serialized['name'])
77 | func_node.source_binary = serialized['source_binary']
78 | func_node.pattern = _deserialize_pattern(serialized['pattern'])
79 | func_node.pattern_offset = serialized['pattern_offset']
80 | # func_node.is_bridge = serialized['is_bridge']
81 | return func_node
82 |
83 | def _deserialize_trie_node(serialized, funcs_arr):
84 | return signaturelibrary.TrieNode(
85 | _deserialize_pattern(serialized['pattern']),
86 | {signaturelibrary.MaskedByte.from_str(k): _deserialize_trie_node(v, funcs_arr) for k, v in serialized['children'].items()},
87 | [funcs_arr[i] for i in serialized['functions']] if serialized['functions'] else []
88 | )
89 |
90 | def deserialize(serialized):
91 | """
92 | Deserialize a signature trie from JSON data.
93 | :param serialized: a dict containing the JSON representation of a signature trie (as produced by `serialize`).
94 | :return: the root `TrieNode`
95 | """
96 | funcs_serialized = serialized['functions']
97 | funcs = [_deserialize_func_node(f) for f in funcs_serialized]
98 | for i in range(len(funcs)): # link callgraph
99 | funcs[i].callees = {int(call_site): None if callee_id == -1 else funcs[callee_id]
100 | for call_site, callee_id in funcs_serialized[i]['callees'].items()}
101 |
102 | return _deserialize_trie_node(serialized['trie'], funcs)
103 |
104 | def dumps(sig_trie, *args, **kwargs):
105 | return json.dumps(serialize(sig_trie), *args, **kwargs)
106 |
107 | def dump(sig_trie, fp, *args, **kwargs):
108 | return json.dump(serialize(sig_trie), fp, *args, **kwargs)
109 |
110 | def loads(serialized, *args, **kwargs):
111 | return deserialize(json.loads(serialized, *args, **kwargs))
112 |
113 | def load(fp, *args, **kwargs):
114 | return deserialize(json.load(fp, *args, **kwargs))
115 |
--------------------------------------------------------------------------------
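Since both serializers expose matching `load`/`dump` helpers, converting a binary .sig into the human-readable JSON form is a two-step affair. A short sketch, not part of the repository; importing `sigkit` pulls in the Binary Ninja API via the package `__init__`, and the file names are placeholders. The Signature Explorer below can open either format directly.

    from sigkit import sig_serialize_fb, sig_serialize_json

    with open('libc.sig', 'rb') as f:       # placeholder input
        trie = sig_serialize_fb.load(f)

    with open('libc.json', 'w') as f:       # placeholder output
        sig_serialize_json.dump(trie, f, indent=4)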
/sigkit/sigexplorer.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Copyright (c) 2015-2020 Vector 35 Inc
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to
7 | # deal in the Software without restriction, including without limitation the
8 | # rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
9 | # sell copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in
13 | # all copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 | # IN THE SOFTWARE.
22 |
23 | from __future__ import print_function
24 |
25 | import sys
26 | import os
27 |
28 | import binaryninjaui
29 | if "qt_major_version" in binaryninjaui.__dict__ and binaryninjaui.qt_major_version == 6:
30 | from PySide6.QtCore import (Qt, QRect, QItemSelectionModel, QItemSelection, QSize, Signal)
31 | from PySide6.QtGui import (QStandardItemModel, QIcon, QStandardItem, QKeySequence, QFont, QBrush, QTextDocument,
32 | QCursor, QFontDatabase, QPalette, QAction)
33 | from PySide6.QtWidgets import (QApplication, QTreeView, QVBoxLayout, QWidget, QMenu, QMainWindow, QFileDialog,
34 | QStyledItemDelegate, QStyle, QGroupBox, QHBoxLayout, QPushButton, QAbstractItemView,
35 | QInputDialog, QMessageBox, QLabel)
36 | else:
37 | from PySide2.QtCore import (Qt, QRect, QItemSelectionModel, QItemSelection, QSize, Signal)
38 | from PySide2.QtGui import (QStandardItemModel, QIcon, QStandardItem, QKeySequence, QFont, QBrush, QTextDocument,
39 | QCursor, QFontDatabase, QPalette)
40 | from PySide2.QtWidgets import (QApplication, QTreeView, QVBoxLayout, QWidget, QMenu, QAction, QMainWindow, QFileDialog,
41 | QStyledItemDelegate, QStyle, QGroupBox, QHBoxLayout, QPushButton, QAbstractItemView,
42 | QInputDialog, QMessageBox, QLabel)
43 |
44 | import pickle
45 | import json
46 | import zlib
47 |
48 | if __name__ == "__main__" and __package__ is None:
49 | __package__ = os.path.basename(os.getcwd())
50 | sys.path.append(os.path.dirname(os.getcwd()))
51 | __import__(__package__) # python2 compat
52 | print('Please run with python -m %s.%s instead of %s directly.' % (__package__, os.path.splitext(__file__)[0], __file__))
53 |
54 | from . import sig_serialize_json
55 | from . import sig_serialize_fb
56 |
57 | class App(QMainWindow):
58 | def __init__(self):
59 | super(App, self).__init__()
60 |
61 | self.treeView = None
62 | self.model = None
63 | self.pattern_delegate = None
64 | self.callee_delegate = None
65 | self.sig_trie = None
66 |
67 | self.searchResults = None
68 | self.searchIndex = -1
69 | self.findNextAction = None
70 | self.findPrevAction = None
71 |
72 | # these two maps are used to make the hyperlinks work
73 | # mapping from href to FunctionNode
74 | self.hrefs_to_funcs = {}
75 | # mapping from FunctionNode to tree view element (QStandardItem)
76 | self.func_node_items = {}
77 |
78 | self.init_ui()
79 |
80 | def init_ui(self):
81 | self.setWindowTitle('Signature Explorer')
82 | self.resize(1000, 640)
83 | app_icon = QIcon()
84 | app_icon.addFile('icon.ico', QSize(48,48))
85 | self.setWindowIcon(app_icon)
86 |
87 | self.pattern_delegate = PatternDelegate()
88 | self.callee_delegate = CalleesDelegate()
89 |
90 | self.treeView = TrieView()
91 | # self.treeView.setAlternatingRowColors(True)
92 |
93 | self.model = QStandardItemModel(0, 7, self.treeView)
94 | self.model.setHeaderData(0, Qt.Horizontal, 'Signature')
95 | self.model.setHeaderData(1, Qt.Horizontal, 'Function')
96 | self.model.setHeaderData(2, Qt.Horizontal, 'Callees')
97 | self.model.setHeaderData(3, Qt.Horizontal, 'Offset Extra Pattern')
98 | self.model.setHeaderData(4, Qt.Horizontal, 'Extra Pattern')
99 | self.model.setHeaderData(5, Qt.Horizontal, 'Source Binary')
100 | self.model.setHeaderData(6, Qt.Horizontal, 'ID')
101 | self.treeView.setModel(self.model)
102 |
103 | self.treeView.setSelectionBehavior(QAbstractItemView.SelectRows)
104 | self.treeView.setColumnWidth(0, 400)
105 | self.treeView.setColumnWidth(1, 200)
106 | self.treeView.setColumnWidth(2, 250)
107 | self.treeView.setColumnWidth(3, 25)
108 | self.treeView.setColumnWidth(4, 100)
109 | self.treeView.setColumnWidth(5, 200)
110 | self.treeView.setColumnWidth(6, 75)
111 | self.treeView.setItemDelegateForColumn(0, self.pattern_delegate)
112 | self.treeView.setItemDelegateForColumn(2, self.callee_delegate)
113 | self.treeView.setItemDelegateForColumn(4, self.pattern_delegate)
114 | self.treeView.horizontalScrollBar().setEnabled(True)
115 | self.treeView.setHorizontalScrollBarPolicy(Qt.ScrollBarAsNeeded)
116 | self.treeView.setEditTriggers(QAbstractItemView.NoEditTriggers)
117 | self.treeView.linkActivated.connect(self.on_func_link_clicked)
118 | # self.treeView.expanded.connect(lambda x: self.treeView.resizeColumnToContents(1))
119 | # self.treeView.collapsed.connect(lambda x: self.treeView.resizeColumnToContents(1))
120 |
121 | main_layout = QVBoxLayout()
122 | main_layout.addWidget(self.treeView)
123 |
124 | panel = QWidget()
125 | panel.setLayout(main_layout)
126 | self.setCentralWidget(panel)
127 |
128 | menuBar = self.menuBar()
129 |
130 | fileMenu = QMenu("File")
131 | openAction = QAction("&Open", self)
132 | openAction.setShortcuts(QKeySequence.Open)
133 | openAction.triggered.connect(self.open_file)
134 | fileMenu.addAction(openAction)
135 |
136 | closeAction = QAction("&Close", self)
137 | closeAction.setShortcuts(QKeySequence.Close)
138 | closeAction.triggered.connect(self.close_file)
139 | fileMenu.addAction(closeAction)
140 |
141 | saveAsAction = QAction("Save As...", self)
142 | saveAsAction.setShortcuts(QKeySequence.Save)
143 | saveAsAction.triggered.connect(self.save_as)
144 | fileMenu.addAction(saveAsAction)
145 |
146 | menuBar.addMenu(fileMenu)
147 |
148 | editMenu = QMenu("Edit")
149 |
150 | findAction = QAction("&Find", self)
151 | findAction.setShortcuts(QKeySequence.Find)
152 | findAction.triggered.connect(self.search)
153 | editMenu.addAction(findAction)
154 |
155 | self.findNextAction = QAction("&Find Next", self)
156 | self.findNextAction.setShortcuts(QKeySequence.FindNext)
157 | self.findNextAction.triggered.connect(self.select_next)
158 | self.findNextAction.setEnabled(False)
159 | editMenu.addAction(self.findNextAction)
160 |
161 | self.findPrevAction = QAction("&Find Prev", self)
162 | self.findPrevAction.setShortcuts(QKeySequence.FindPrevious)
163 | self.findPrevAction.triggered.connect(self.select_prev)
164 | self.findPrevAction.setEnabled(False)
165 | editMenu.addAction(self.findPrevAction)
166 |
167 | menuBar.addMenu(editMenu)
168 |
169 | viewMenu = QMenu("View")
170 |
171 | expandAction = QAction("&Expand All", self)
172 | expandAction.triggered.connect(self.treeView.expandAll)
173 | viewMenu.addAction(expandAction)
174 |
175 | collapseAction = QAction("&Collapse All", self)
176 | collapseAction.triggered.connect(self.treeView.collapseAll)
177 | viewMenu.addAction(collapseAction)
178 |
179 | menuBar.addMenu(viewMenu)
180 |
181 | def search(self):
182 | query_string, ok = QInputDialog.getText(self, 'Find in Trie', 'Function name')
183 | if not ok or not query_string:
184 | return
185 |
186 | self.searchResults = self.model.findItems(query_string, Qt.MatchContains | Qt.MatchRecursive, 1)
187 |
188 | if self.searchResults:
189 | self.findNextAction.setEnabled(True)
190 | self.findPrevAction.setEnabled(True)
191 | self.searchIndex = 0
192 | self.select_next()
193 | else:
194 | self.findNextAction.setEnabled(False)
195 | self.findPrevAction.setEnabled(False)
196 | self.searchIndex = -1
197 | QMessageBox.warning(self, 'Find in Trie', 'No results found')
198 |
199 | def select_next(self):
200 | next_item = self.searchResults[self.searchIndex]
201 | self.searchIndex = (self.searchIndex + 1) % len(self.searchResults)
202 | self.select_tree_item(next_item)
203 |
204 | def select_prev(self):
205 | prev_item = self.searchResults[self.searchIndex]
206 | self.searchIndex = (self.searchIndex - 1) % len(self.searchResults)
207 | self.select_tree_item(prev_item)
208 |
209 | def select_tree_item(self, item):
210 | path = []
211 | while item:
212 | path.insert(0, self.model.indexFromItem(item))
213 | item = item.parent()
214 | # print(path)
215 | for index in path:
216 | self.treeView.setExpanded(index, True)
217 | self.treeView.selectionModel().select(path[-1], QItemSelectionModel.ClearAndSelect | QItemSelectionModel.Rows)
218 | self.treeView.scrollTo(path[-1])
219 |
220 | def close_file(self):
221 | self.model.removeRows(0, self.model.rowCount())
222 | self.sig_trie = None
223 | self.hrefs_to_funcs = {}
224 | self.func_node_items = {}
225 |
226 | def open_file(self):
227 | sig_filter = 'Signature library (*.sig)'
228 | json_zlib_filter = 'Compressed JSON signature library (*.json.zlib)'
229 | json_filter = 'JSON signature library (*.json)'
230 | pkl_filter = 'Pickled signature library (*.pkl)'
231 | fname, filter = QFileDialog.getOpenFileName(self, 'Open file', filter=';;'.join([sig_filter, json_zlib_filter, json_filter, pkl_filter]))
232 | if filter and fname:
233 | print('Opening signature library %s' % (fname,))
234 |
235 | if filter == json_zlib_filter:
236 | with open(fname, 'rb') as f:
237 | json_trie = zlib.decompress(f.read()).decode('utf-8')
238 | sig_trie = sig_serialize_json.deserialize(json.loads(json_trie))
239 | elif filter == json_filter:
240 | with open(fname, 'r') as f:
241 | json_trie = f.read()
242 | sig_trie = sig_serialize_json.deserialize(json.loads(json_trie))
243 | elif filter == sig_filter:
244 | with open(fname, 'rb') as f:
245 | fb_trie = f.read()
246 | sig_trie = sig_serialize_fb.SignatureLibraryReader().deserialize(fb_trie)
247 | elif filter == pkl_filter:
248 | with open(fname, 'rb') as f:
249 | sig_trie = pickle.load(f)
250 | else:
251 | return
252 |
253 | self.open_trie(sig_trie, os.path.basename(fname))
254 |
255 | def save_as(self):
256 | sig_filter = 'Signature library (*.sig)'
257 | json_zlib_filter = 'Compressed JSON signature library (*.json.zlib)'
258 | json_filter = 'JSON signature library (*.json)'
259 | pkl_filter = 'Pickled signature library (*.pkl)'
260 | fname, filter = QFileDialog.getSaveFileName(self, 'Save file', filter=';;'.join([sig_filter, json_zlib_filter, json_filter, pkl_filter]))
261 |
262 | if filter == json_zlib_filter:
263 | with open(fname, 'wb') as f:
264 | f.write(zlib.compress(sig_serialize_json.dumps(self.sig_trie).encode('utf-8')))
265 | elif filter == json_filter:
266 | with open(fname, 'w') as f:
267 | json.dump(sig_serialize_json.serialize(self.sig_trie), f, indent=4)
268 | elif filter == sig_filter:
269 | with open(fname, 'wb') as f:
270 | f.write(sig_serialize_fb.SignatureLibraryWriter().serialize(self.sig_trie))
271 | elif filter == pkl_filter:
272 | with open(fname, 'wb') as f:
273 | pickle.dump(self.sig_trie, f)
274 | else:
275 | return
276 | print('Saved as ' + fname)
277 |
278 | @staticmethod
279 | def generate_href(func):
280 | return str(id(func))
281 |
282 | def get_func_name(self, func_node):
283 | if func_node is None:
284 | return ''
285 | else:
286 | return '<a href="' + self.generate_href(func_node) + '">' + func_node.name + '</a>'
287 |
288 | # handles when the user clicks on a hyperlink to a function node
289 | def on_func_link_clicked(self, link):
290 | print('Hyperlink clicked: ' + link)
291 | self.select_tree_item(self.func_node_items[self.hrefs_to_funcs[link]])
292 |
293 | # Generate treeview row for function (leaf) node in the trie
294 | def add_func_node(self, parent, pattern_col_item, func):
295 | self.hrefs_to_funcs[self.generate_href(func)] = func
296 | self.func_node_items[func] = pattern_col_item
297 |
298 | if not func.callees: func.callees = {}
299 | callees_text = ' '.join([str(k) + ': ' + self.get_func_name(v) for k,v in func.callees.items()])
300 | callees_item = QStandardItem(callees_text)
301 | cols = [pattern_col_item,
302 | QStandardItem(func.name),
303 | callees_item,
304 | QStandardItem(str(func.pattern_offset) if func.pattern else ''),
305 | QStandardItem(str(func.pattern) if func.pattern else ''),
306 | QStandardItem(func.source_binary),
307 | QStandardItem(self.generate_href(func))]
308 | boldface = cols[1].font()
309 | boldface.setBold(True)
310 | cols[1].setFont(boldface)
311 | parent.appendRow(cols)
312 |
313 | # Recursively add rows for this trie node and its children
314 | def add_trie_node(self, parent, pattern_text, node):
315 | left_item = QStandardItem(pattern_text)
316 |
317 | if not node.value: # Stem node
318 | parent.appendRow([left_item, QStandardItem('')])
319 | else: # Leaf node
320 | self.add_func_node(parent, left_item, node.value[0])
321 | for func in node.value[1:]:
322 | self.add_func_node(parent, QStandardItem(''), func)
323 |
324 | pairs = map(lambda node: (str(node.pattern), node), node.children.values())
325 | pairs = sorted(pairs, key=lambda kv: kv[0].replace('?', '\xff'))
326 | for text, child in pairs:
327 | self.add_trie_node(left_item, text, child)
328 | return left_item
329 |
330 | # Add bridge nodes to a special node at the root
331 | def add_bridge_nodes(self, parent, sig_trie):
332 | bridge_item = QStandardItem('(bridge)')
333 | parent.appendRow([bridge_item, QStandardItem('')])
334 | def visit(func, visited):
335 | if func is None or func in visited: return
336 | visited.add(func)
337 | if func.is_bridge:
338 | self.add_func_node(bridge_item, QStandardItem(''), func)
339 | for callee in func.callees.values():
340 | visit(callee, visited)
341 | visited = set()
342 | for func in sig_trie.all_values():
343 | visit(func, visited)
344 |
345 | def open_trie(self, sig_trie, filename):
346 | self.close_file()
347 | self.sig_trie = sig_trie
348 | root_node = self.add_trie_node(self.model, filename, sig_trie)
349 | self.add_bridge_nodes(root_node, sig_trie)
350 |
351 |
352 | # copy-pasted off https://stackoverflow.com/questions/55923137/ lol
353 | class PatternDelegate(QStyledItemDelegate):
354 | def __init__(self):
355 | super(PatternDelegate, self).__init__()
356 | self.font = QFontDatabase.systemFont(QFontDatabase.FixedFont)
357 |
358 | def paint(self, painter, option, index):
359 | if index.data() is None:
360 | return
361 | painter.save()
362 |
363 | painter.setFont(self.font)
364 | defaultPen = painter.pen()
365 | self.initStyleOption(option, index)
366 | style = option.widget.style()
367 | option.text = '' # wipe out the text passed to the original renderer, so just have it render the background
368 | style.drawControl(QStyle.CE_ItemViewItem, option, painter, option.widget)
369 |
370 | offset = 3
371 | ellipsis = '…'
372 | ellipsisWidth = painter.fontMetrics().horizontalAdvance(ellipsis)
373 | rightBorder = option.rect.left() + option.rect.width() - offset
374 |
375 | option.rect.moveRight(option.rect.right() + offset)
376 |
377 | textRole = QPalette.NoRole
378 | if option.state & QStyle.State_Selected:
379 | textRole = QPalette.HighlightedText
380 |
381 | color = 0
382 | painter.setPen(defaultPen)
383 | for c in index.data():
384 | if color == 0 and c == '?': # little fsm
385 | color = 1
386 | painter.setPen(Qt.red)
387 | elif color == 1 and c != '?':
388 | color = 0
389 | painter.setPen(defaultPen)
390 |
391 | charWidth = painter.fontMetrics().horizontalAdvance(c)
392 | drawRect = option.rect
393 | if drawRect.left() + charWidth + ellipsisWidth > rightBorder:
394 | style.drawItemText(painter, drawRect, option.displayAlignment, option.palette, True, ellipsis, textRole)
395 | break
396 |
397 | style.drawItemText(painter, drawRect, option.displayAlignment, option.palette, True, c, textRole)
398 |
399 | option.rect.moveRight(option.rect.right() + charWidth)
400 |
401 |
402 | painter.restore()
403 |
404 |
405 | # https://stackoverflow.com/questions/35397943/how-to-make-a-fast-qtableview-with-html-formatted-and-clickable-cells
406 | class CalleesDelegate(QStyledItemDelegate):
407 | def __init__(self):
408 | super(CalleesDelegate, self).__init__()
409 |
410 | def anchorAt(self, html, point):
411 | doc = QTextDocument()
412 | doc.setHtml(html)
413 |
414 | textLayout = doc.documentLayout()
415 | assert textLayout != None
416 | return textLayout.anchorAt(point)
417 |
418 | def paint(self, painter, option, index):
419 | options = option
420 | self.initStyleOption(options, index)
421 |
422 | painter.save()
423 |
424 | doc = QTextDocument()
425 | doc.setHtml(options.text)
426 |
427 | options.text = ""
428 | options.widget.style().drawControl(QStyle.CE_ItemViewItem, option, painter, option.widget)
429 |
430 | painter.translate(options.rect.left(), options.rect.top())
431 | clip = QRect(0, 0, options.rect.width(), options.rect.height())
432 | doc.drawContents(painter, clip)
433 |
434 | painter.restore()
435 |
436 | def sizeHint(self, option, index):
437 | options = option
438 | self.initStyleOption(options, index)
439 |
440 | doc = QTextDocument()
441 | doc.setHtml(options.text)
442 | doc.setTextWidth(options.rect.width())
443 | return QSize(doc.idealWidth(), doc.size().height())
444 |
445 |
446 | class TrieView(QTreeView):
447 | linkUnhovered = Signal()
448 | linkHovered = Signal(str)
449 | linkActivated = Signal(str)
450 |
451 | def __init__(self, *args, **kwargs):
452 | super(TrieView, self).__init__(*args, **kwargs)
453 | self.setMouseTracking(True)
454 | self._mousePressAnchor = ''
455 | self._lastHoveredAnchor = ''
456 |
457 | def mousePressEvent(self, event):
458 | super(TrieView, self).mousePressEvent(event)
459 | anchor = self.anchorAt(event.pos())
460 | self._mousePressAnchor = anchor
461 |
462 | def mouseMoveEvent(self, event):
463 | anchor = self.anchorAt(event.pos())
464 |
465 | if self._mousePressAnchor != anchor:
466 | self._mousePressAnchor = ''
467 |
468 | if self._lastHoveredAnchor != anchor:
469 | self._lastHoveredAnchor = anchor
470 | if self._lastHoveredAnchor:
471 | QApplication.setOverrideCursor(QCursor(Qt.PointingHandCursor))
472 | self.linkHovered.emit(self._lastHoveredAnchor)
473 | else:
474 | QApplication.restoreOverrideCursor()
475 | self.linkUnhovered.emit()
476 |
477 | def mouseReleaseEvent(self, event):
478 | if self._mousePressAnchor:
479 | anchor = self.anchorAt(event.pos())
480 |
481 | if anchor == self._mousePressAnchor:
482 | self.linkActivated.emit(self._mousePressAnchor)
483 |
484 | self._mousePressAnchor = ''
485 |
486 | super(TrieView, self).mouseReleaseEvent(event)
487 |
488 | def anchorAt(self, pos):
489 | index = self.indexAt(pos)
490 | if index.isValid():
491 | delegate = self.itemDelegate(index)
492 | wordDelegate = delegate
493 | if isinstance(wordDelegate, CalleesDelegate):
494 | itemRect = self.visualRect(index)
495 | relativeClickPosition = pos - itemRect.topLeft()
496 |
497 | html = index.data()
498 | if html is not None:
499 | return wordDelegate.anchorAt(html, relativeClickPosition)
500 |
501 | return ''
502 |
503 | def explore_signature_library(sig_trie):
504 | """
505 | Display an in-memory signature trie in the signature explorer GUI.
506 | :param sig_trie: instance of `TrieNode`
507 | """
508 | if not QApplication.instance():
509 | app = QApplication(sys.argv)
510 | else:
511 | app = None
512 | widget = App()
513 | widget.show()
514 | widget.open_trie(sig_trie, '(memory)')
515 | if app:
516 | app.exec_()
517 |
518 | if __name__ == "__main__":
519 | app = QApplication(sys.argv)
520 |
521 | widget = App()
522 | widget.show()
523 |
524 | sys.exit(app.exec_())
525 |
--------------------------------------------------------------------------------
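Besides the standalone `python -m sigkit.sigexplorer` mode hinted at in the module, the explorer can also be opened on an in-memory trie from a script via `explore_signature_library`. A brief sketch, not part of the repository; it assumes Binary Ninja's UI Python bindings (`binaryninjaui`) are importable, and 'libc.sig' is a placeholder.

    from sigkit import sig_serialize_fb
    from sigkit.sigexplorer import explore_signature_library

    with open('libc.sig', 'rb') as f:   # placeholder path
        trie = sig_serialize_fb.load(f)

    # Creates a QApplication if one isn't already running and blocks until the
    # explorer window is closed (see explore_signature_library above).
    explore_signature_library(trie)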
/sigkit/signaturelibrary.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2015-2020 Vector 35 Inc
2 | #
3 | # Permission is hereby granted, free of charge, to any person obtaining a copy
4 | # of this software and associated documentation files (the "Software"), to
5 | # deal in the Software without restriction, including without limitation the
6 | # rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
7 | # sell copies of the Software, and to permit persons to whom the Software is
8 | # furnished to do so, subject to the following conditions:
9 | #
10 | # The above copyright notice and this permission notice shall be included in
11 | # all copies or substantial portions of the Software.
12 | #
13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
18 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
19 | # IN THE SOFTWARE.
20 |
21 | """
22 | This package contains definitions for the data structures and objects used in
23 | Signature Libraries. To construct a new empty signature trie, use `new_trie`.
24 | """
25 |
26 | # 2-3 compatibility
27 | import sys
28 |
29 | PY2 = sys.version_info[0] == 2
30 | PY3 = sys.version_info[0] == 3
31 |
32 | if PY2:
33 | bytes_ord = ord
34 | else:
35 | bytes_ord = lambda x: x
36 |
37 | import functools
38 | from itertools import starmap
39 |
40 |
41 | @functools.total_ordering # for sorted()
42 | class MaskedByte(object):
43 | """
44 | Represents a pattern to match a single byte: either a value from 0-255, or a wildcard, '??'
45 | Algebraically, you can imagine that there is a partial ordering where 0-255 < ??, or
46 | alternatively, a total ordering where 0 < 1 < 2 < ... < 255 < ??
47 |
48 | This class is backed by a flyweight cache. Use `MaskedByte.new` to construct.
49 | """
50 |
51 | wildcard = None
52 | cache = []
53 |
54 | def __init__(self, value, mask):
55 | self._value = value
56 | self._mask = mask
57 |
58 | @property
59 | def value(self):
60 | return self._value
61 |
62 | @property
63 | def mask(self):
64 | return self._mask
65 |
66 | @staticmethod
67 | def new(value, mask):
68 | assert type(value) == int
69 | assert 0 <= value <= 255
70 | assert mask == 0 or mask == 1
71 | if mask == 0:
72 | return MaskedByte.wildcard
73 | else:
74 | return MaskedByte.cache[value]
75 |
76 | @staticmethod
77 | def from_str(s):
78 | assert len(s) == 2
79 | if s == '??':
80 | return MaskedByte.wildcard
81 | else:
82 | return MaskedByte.new(int(s, 16), 1)
83 |
84 | def __str__(self):
85 | return '%02x' % (self._value,) if self._mask == 1 else '??'
86 |
87 | def __repr__(self):
88 | return self.__str__()
89 |
90 | def __eq__(self, other):
91 | if not type(other) == type(self):
92 | return False
93 | return self.matches(other) and other.matches(self)
94 |
95 | # this defines a total ordering
96 | def __hash__(self):
97 | if self._mask == 0:
98 | return 256
99 | else:
100 | return self._value # 0-255
101 |
102 | # this is only here for sorting purposes in python, no fancy algebraic interpretation behind it.
103 | def __le__(self, other):
104 | assert type(other) == type(self)
105 | return self.__hash__() <= other.__hash__()
106 |
107 | def matches(self, other):
108 | """
109 | Defines a *partial* ordering, essentially a >= operator algebraically:
110 | (00...FF) <= ??; other elements are incomparable.
111 | :param other: MaskedByte or byte
112 | :return: True if all bytes matched by `other` are also matched by this
113 | """
114 | if self._mask == 0:
115 | return True
116 | if isinstance(other, MaskedByte):
117 | if other._mask == 0:
118 | return False
119 | else:
120 | return self._value == other._value
121 | if PY2 and type(other) == str:
122 | assert len(other) == 1
123 | return self._value == ord(other)
124 | assert type(other) == int
125 | return self._value == other
126 |
127 | # Meet operator
128 | def intersect(self, other):
129 | assert isinstance(other, MaskedByte)
130 | if self._mask == 0 and other._mask == 0:
131 | return MaskedByte.wildcard
132 | elif self._mask == 0 and other._mask == 1:
133 | return other
134 | elif self._mask == 1 and other._mask == 0:
135 | return self
136 | elif self._value == other._value:
137 | return self
138 | else:
139 | return None # NO intersection!
140 |
141 | # Join operator
142 | def union(self, other):
143 | assert isinstance(other, MaskedByte)
144 | if self._mask == 0 or other._mask == 0:
145 | return MaskedByte.wildcard
146 | elif self._value == other._value:
147 | return self
148 | else:
149 | return MaskedByte.wildcard # !!
150 | MaskedByte.wildcard = MaskedByte(0, 0)
151 | MaskedByte.cache = [MaskedByte(value, 1) for value in range(256)]
152 |
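# A quick sketch of the algebra described above (the byte values are chosen
# only for illustration):
#
#     a = MaskedByte.from_str('41')    # matches only 0x41
#     w = MaskedByte.from_str('??')    # wildcard, matches any byte
#     assert w.matches(a) and not a.matches(w)
#     assert a.intersect(w) is a                                          # meet keeps the more specific byte
#     assert a.union(MaskedByte.from_str('42')) is MaskedByte.wildcard    # join widens to ??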
153 |
154 | class Pattern():
155 | """
156 | Represents a pattern used for matching byte sequences; a sequence of MaskedByte.
157 | For example, the string representation of a Pattern looks like `1234??56??78` .
158 | Behaves like an array.
159 | """
160 |
161 | def __init__(self, data, mask):
162 | """
163 | Constructs a new pattern object
164 | :param data: bytes-like object, byte sequence of this pattern.
165 | :param mask: wildcard mask for the pattern. must be the same length as `data`. array of 0 or 1, 0 means wildcard at that position
166 | :return:
167 | """
168 | assert len(data) == len(mask)
169 | assert type(data) == bytes
170 | assert type(mask) == list
171 | for elem in mask: assert elem == 0 or elem == 1
172 | self._array = tuple(MaskedByte.new(bytes_ord(data[i]), mask[i]) for i in range(len(data)))
173 |
174 | @staticmethod
175 | def from_str(s):
176 | if len(s) % 2:
177 | raise ValueError('odd pattern length ' + str(len(s)) + ': ' + s)
178 | p = Pattern(b'', [])
179 | p._array = tuple(MaskedByte.from_str(s[i:i + 2]) for i in range(0, len(s), 2))
180 | return p
181 |
182 | def __str__(self):
183 | return ''.join(map(str, self._array))
184 |
185 | def __getitem__(self, item):
186 | if isinstance(item, slice):
187 | p = Pattern(b'', [])
188 | p._array = self._array.__getitem__(item)
189 | return p
190 | return self._array.__getitem__(item)
191 |
192 | def __len__(self):
193 | return self._array.__len__()
194 |
195 | def __iter__(self):
196 | return self._array.__iter__()
197 |
198 | def __eq__(self, other):
199 | if not type(other) == type(self):
200 | return False
201 | return self._array.__eq__(other._array)
202 |
203 | def __hash__(self):
204 | return self._array.__hash__()
205 |
206 | def matches(self, buf):
207 | """
208 | Checks if this Pattern matches `buf`.
209 | :param buf: Pattern or bytestring
210 |         :return: True if this pattern matches the beginning of `buf`
211 | """
212 | if len(self._array) > len(buf): return False
213 | return all(starmap(MaskedByte.matches, zip(self._array, buf)))
214 |
215 | # Meet operator
216 | def intersect(self, other):
217 | assert isinstance(other, Pattern)
218 | # right-pad with wildcard
219 | size = max(len(self._array), len(other._array))
220 | array1 = self._array + tuple([MaskedByte.wildcard] * (size - len(self._array)))
221 | array2 = other._array + tuple([MaskedByte.wildcard] * (size - len(other._array)))
222 | result_array = tuple(starmap(MaskedByte.intersect, zip(array1, array2)))
223 | if not all(result_array): return None # No intersection!
224 | p = Pattern(b'', [])
225 | p._array = result_array
226 | return p
227 |
228 | # Join operator
229 | def union(self, other):
230 | assert isinstance(other, Pattern)
231 | # length truncated to smallest
232 | result_array = tuple(starmap(MaskedByte.union, zip(self._array, other._array)))
233 | p = Pattern(b'', [])
234 | p._array = result_array
235 | return p
236 |
237 | def data(self):
238 | for b in self._array:
239 | yield b.value
240 |
241 | def mask(self):
242 | for b in self._array:
243 | yield b.mask
244 |
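# Illustrative sketch of Pattern usage (byte values are made up):
#
#     p = Pattern.from_str('1234??56')
#     assert p.matches(b'\x12\x34\x99\x56\xff')        # extra trailing bytes are ignored
#     q = Pattern(b'\x12\x34\xab\x56', [1, 1, 1, 0])   # mask 0 = wildcard at that index
#     assert str(q) == '1234ab??'
#     assert str(p.intersect(q)) == '1234ab56'         # meet: wildcards are refined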
245 |
246 | class FunctionInfo(object):
247 | """
248 | Stores additional information about functions that are useful while generating and manipulating
249 | signature libraries, but excluded from the finalized signature library to save space.
250 | This information is also used to simulate linking when generating the call-graph.
251 | """
252 | def __init__(self):
253 | self.patterns = None
254 | """list of `Pattern`s which match this function"""
255 |
256 | self.callees = None
257 |         """dictionary of {offset: (destination name, `SymbolType`)}; other symbols this function calls"""
258 |
259 | self.aliases = None
260 | """list of string containing other possible names that could link to this function"""
261 |
262 |     def __str__(self):
263 |         return '<FunctionInfo: %d patterns, %d aliases>' % (len(self.patterns or []), len(self.aliases or []))
264 |
265 |
266 | class FunctionNode(object):
267 | """
268 | Represents a function that we would like to match and contains relevant metadata for matching purposes.
269 | Function nodes are connected with each other by a call graph. This helps not only encode information about
270 | individual functions but also the relationships between them when matching.
271 | Each FunctionNode is a vertex of the call graph, represented by an edge list stored in `callees`.
272 |
273 | To create a FunctionNode for a given function, see `compute_sig.process_function`.
274 | """
275 | def __init__(self, name):
276 | self.name = name
277 | """The name of the matched function"""
278 |
279 | self.source_binary = ''
280 | """The filename of the binary that the function came from (malloc.o for example). Optional."""
281 |
282 | # used to disambiguate when multiple FunctionNodes are matched
283 | self.pattern = Pattern(b'', [])
284 | self.pattern_offset = 0
285 |
286 | self.callees = {}
287 | """Forms a callgraph with other `FunctionNodes`. Dict of {call_offset: destination}."""
288 |
289 | self.ref_count = 0
290 | """Number of places this node is in its signature trie"""
291 |
292 | @property
293 | def is_bridge(self):
294 | return self.ref_count == 0
295 |
296 |     def __str__(self):
297 |         return '<func:' + self.name + '>'
298 |
299 |     def __repr__(self):
300 |         result = '<func:' + self.name + ':' + str(self.source_binary)
301 |         if len(self.pattern):
302 |             result += ':' + str(self.pattern) + '@' + str(self.pattern_offset)
303 |         if self.callees:
304 |             callee_names = {k: (v.name if v is not None else None) for k, v in self.callees.items()}
305 |             result += ':callees=' + str(callee_names)
306 |         if self.is_bridge:
307 |             result += ':bridge'
308 |         return result + '>'
309 |
310 |
311 | class TrieNode(object):
312 | """
313 | A prefix tree, aka a Trie.
314 | This trie has several special characteristics:
315 | - The bytestrings of stem nodes can contain wildcards, which match any byte.
316 | - Bytestrings can start with a wildcard.
317 | - Nodes contain an array of function nodes, which represent functions matched by the pattern corresponding to that trie position.
318 | - Most importantly, the function nodes are themselves connected by a call graph (a directed graph).
319 |     This means that all of the function nodes are actually interconnected in a way that is orthogonal to the trie itself.
320 | In fact, a trie node may contain a function node that has a call edge to a function node which itself is not contained within the trie!
321 | In such cases, we refer to such nodes as "bridge" nodes, as they have no purpose for function matching other than
322 | to link two related functions via the call graph.
323 |
324 | Here is an example to illustrate:
325 | 01
326 | 2345: func1 (calls func2)
327 | 4567: func3
328 | 02
329 | 5678: func4 (calls func3)
330 | func2 (not in any trie leaf node) calls func4
331 |
332 | In this case, there are six trie nodes (including the root), four function nodes, and `func2` is a bridge node.
333 | """
334 |
335 | def __init__(self, pattern, children, value):
336 | """
337 | Don't call me directly. Call new_trie() instead to construct an empty trie and use insert() to add to it.
338 |
339 | :param pattern: Pattern object
340 | :param children: forms a trie of TrieNode. dict of {MaskedByte: child node}.
341 | :param value: array of FunctionNode present at this TrieNode
342 | """
343 | assert isinstance(pattern, Pattern)
344 | for elem in pattern: assert isinstance(elem, MaskedByte)
345 |
346 | self.pattern = pattern
347 | self.children = children
348 | self.value = value
349 |
350 | def __repr__(self):
351 | result = str(self.pattern)
352 | if self.value is not None:
353 | result += ':' + str(self.value)
354 | return result
355 |
356 | def find(self, buf):
357 | """
358 | Traverses this prefix trie to find matched function nodes in a specified buffer of data.
359 | At each trie node visited, all function nodes contained by that node are appended to the results list.
360 | :param buf: bytes-like object
361 | :return: a list of `FunctionNode`s which match the given bytes
362 | """
363 | if not self.pattern.matches(buf):
364 | return [] # no match
365 |
366 | matches = []
367 | if self.value is not None:
368 | matches.extend(self.value)
369 |
370 | if len(self.pattern) == len(buf): return matches
371 | buf = buf[len(self.pattern):]
372 |
373 | next_byte = buf[0]
374 | if next_byte in self.children:
375 | matches.extend(self.children[next_byte].find(buf))
376 |
377 | return matches
378 |
379 | def _is_degenerate(self):
380 | """
381 |         A trie node is degenerate if it would match any byte sequence.
382 |         :return: True if the pattern is empty or all wildcards
383 | """
384 | if not self.pattern:
385 | return True
386 | for m in self.pattern:
387 | if m.mask: return False
388 | return True
389 |
390 | def _split(self, j):
391 | split_node = TrieNode(self.pattern[j:], self.children, self.value)
392 | self.pattern = self.pattern[:j]
393 | self.value = None
394 | if split_node._is_degenerate() and not split_node.children:
395 | # print('deleting degenerate node ', repr(split_node))
396 | for f in split_node.value:
397 | f.ref_count -= 1
398 | self.children = {}
399 | return
400 | self.children = {split_node.pattern[0]: split_node}
401 |
402 | def _add_child(self, child):
403 | assert child.pattern[0] not in self.children
404 | assert isinstance(child.pattern[0], MaskedByte)
405 | self.children[child.pattern[0]] = child
406 |
407 | def insert(self, pattern, value):
408 | """
409 |         Inserts a new FunctionNode into this trie at the position specified by `pattern`.
410 |         To avoid false positives, the function node may be rejected from the trie and not inserted if the specified
411 | pattern is too short or too ambiguous.
412 |
413 | :param pattern: Pattern object
414 | :param value: `FunctionNode`
415 | :return: True if the function node was inserted, or False if it was rejected
416 | """
417 | if len(pattern) < 8:
418 | # sys.stderr.write('Too short pattern for %s\n' % (value,))
419 | return False
420 | if sum(map(lambda e: e.mask, pattern)) < 8:
421 | # sys.stderr.write('Too ambiguous mask for %s\n' % (value,))
422 | return False
423 |
424 | i = 0
425 | j = 0
426 | node = self
427 | while i < len(pattern):
428 | if j == len(node.pattern): # end of node
429 | j = 0
430 | if pattern[i] in node.children: # next node
431 | node = node.children[pattern[i]]
432 | else: # we need to insert a new node
433 | new_node = TrieNode(pattern[i:], {}, None)
434 | node._add_child(new_node)
435 | node = new_node
436 | break
437 | elif pattern[i] != node.pattern[j]: # need to split node
438 | node._split(j)
439 | new_node = TrieNode(pattern[i:], {}, None)
440 | node._add_child(new_node)
441 | node = new_node
442 | break
443 | else:
444 | i += 1
445 | j += 1
446 |
447 | if node.value is None:
448 | node.value = [value]
449 | else:
450 | node.value.append(value)
451 | # sys.stderr.write('Ambiguous functions %s\n' % (node,))
452 | value.ref_count += 1
453 | return True
454 |
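# Illustrative sketch of insert()/find() (the pattern is made up; real patterns
# come from compute_sig.process_function, and trie_ops queries find() with
# Pattern objects):
#
#     trie = new_trie()
#     f = FunctionNode('example_func')
#     pat = Pattern.from_str('554889e54883ec20??????????e8????????c9c3')
#     assert trie.insert(pat, f)    # accepted: >= 8 bytes and >= 8 non-wildcard bytes
#     assert f in trie.find(pat)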
455 | def pretty_print(self, prefix_len=0):
456 | indent = ' ' * prefix_len
457 | result = indent + repr(self)
458 | for child in self.children.values():
459 | result += '\n' + child.pretty_print(prefix_len + len(self.pattern))
460 | return result
461 |
462 | def all_nodes(self):
463 | """
464 | Yields all the trie nodes in this subtree using a simple DFS.
465 | :return: generator of `TrieNode`
466 | """
467 | yield self
468 | for k, child in sorted(self.children.items()):
469 | for node in child.all_nodes():
470 | yield node
471 |
472 | def all_values(self):
473 | """
474 | Yields function nodes that are directly contained by some trie node within this subtrie.
475 | Doesn't include "bridge" nodes!
476 | :return: generator of `FunctionNode`
477 | """
478 | for node in self.all_nodes():
479 | if node.value:
480 | for val in node.value:
481 | yield val
482 |
483 | def all_functions(self):
484 | """
485 | Yields ALL function nodes, including bridge nodes by performing a DFS on the callgraph as well.
486 |         Note that if this is called on a subtree, these functions may not be under this subtree!
487 | Therefore, it only really makes sense to call this on the root node.
488 | :return: generator of `FunctionNode`
489 | """
490 | def visit(func_node, visited): # callgraph dfs
491 | if func_node is None or func_node in visited: return
492 | visited.add(func_node)
493 | yield func_node
494 | for callee in func_node.callees.values():
495 | for func in visit(callee, visited):
496 | yield func
497 | visited = set()
498 | for func_node in self.all_values():
499 | for func in visit(func_node, visited):
500 | yield func
501 |
502 |
503 | def new_trie():
504 | """
505 | Constructs a new, empty signature trie.
506 | :return: an empty trie
507 | """
508 | return TrieNode(Pattern(b'', []), {}, None)
509 |
--------------------------------------------------------------------------------
/sigkit/trie_ops.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | # Copyright (c) 2015-2020 Vector 35 Inc
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to
7 | # deal in the Software without restriction, including without limitation the
8 | # rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
9 | # sell copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in
13 | # all copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 | # IN THE SOFTWARE.
22 |
23 | """
24 | This package contains code for advanced manipulation of signature library
25 | data structures and operations like signature trie merging and finalization.
26 | This package is intended for users who are creating their own signature
27 | libraries.
28 |
29 | The most useful functions are `trie_insert_funcs`, `combine_signature_libraries`,
30 | `update_signature_library`, and `finalize_trie`.
31 | """
32 |
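# Typical workflow (a rough sketch; `func_info` maps FunctionNode -> FunctionInfo
# and would normally come from compute_sig / the example scripts):
#
#     trie = signaturelibrary.new_trie()
#     trie_insert_funcs(trie, func_info)   # insert raw patterns into the trie
#     finalize_trie(trie, func_info)       # link callgraph, dedupe, disambiguate
#     # ...then serialize the finished trie (e.g., with sig_serialize_fb)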
33 | # You know the old joke, right?
34 | # When I wrote this code, only God and I understood it.
35 | # now only God understands it
36 |
37 | import sys
38 | from collections import defaultdict
39 | from functools import reduce
40 | import operator
41 |
42 | from binaryninja import SymbolType
43 |
44 | from . import signaturelibrary
45 |
46 | def are_names_compatible(a, b):
47 | if a == b:
48 | return True
49 | if a.name == b.name:
50 | return True
51 | if a.name.startswith(b.name) or b.name.startswith(a.name):
52 | return True
53 | if len(a.name) > 12 and len(b.name) > 12:
54 | return a.name in b.name or b.name in a.name
55 | return False
56 |
57 |
58 | # mathematically speaking, this defines a less-than-or-equal-to operation over the
59 | # set of function signatures and therefore a partial ordering over that set.
60 | # Let A, B both be signatures. We say that A <= B if all functions that A matches
61 | # are also matched by B. In other words, for a signature library containing B, it would be
62 | # redundant to add A, since B already matches all of the functions that A would.
63 | #
64 | # Let A ⨅ B denote a signature that would match all functions matched by both A and B.
65 | # ⨅ defines a "meet" relationship on the lattice of signatures.
66 | # in other words, A <= B iff A ⨅ B = A (and commutatively, B ⨅ A = A).
67 | # concretely, A ⨅ B is equivalent to the signature with pattern and callees that is the
68 | # intersection of those of A and B. thus, we can check if A = (A ⨅ B), and likewise A <= B,
69 | # by matching A's pattern directly against B's pattern.
70 | #
71 | # the greatest element of this lattice is None, a signature that matches all functions.
72 | #
73 | # during optimization at trie finalization, we delete all non-maximal signatures
74 | # in the signature trie; i.e., all signatures which are less than another one (and therefore
75 | # redundant) are eliminated. the downside to this approach is that we will lose a degree
76 | # of specificity: consider a function which matches B but doesn't match A (where A <= B).
77 | # we could choose to keep both A and B in the signature library, but if we encountered such
78 | # a function, what should we do with it? how do we distinguish between functions which match
79 | # both A and B as opposed to ones which only match B? more importantly, what name should
80 | # be assigned to such a function? therefore, it's meaningless to include both A and B, and
81 | # eliminating the redundancy also reduces the ambiguity of matches.
82 | #
83 | # this function returns whether A <= B, for two function signatures A and B.
84 | #
85 | # sometimes, we don't have function info (e.g., function bytes) for both nodes. this is typically
86 | # when we're trying to merge additional nodes into a trie that we don't have FunctionInfo for.
87 | # in this case, we only need function info for the nodes we're trying to merge in by exploiting
88 | # the signature trie. We know A <= B if searching for A's data in the trie matches B.
89 | def is_signature_subset(a, func_info, b, sig_trie, visited):
90 | """
91 | :param a: FunctionNode to check whether is a subset of B
92 | :param func_info: dict containing the function info for A's trie
93 | :param b: FunctionNode to check whether it contains A
94 | :param sig_trie: the trie B belongs to.
95 |     :param visited: dict tracking assumed matches during the callgraph DFS; should be initialized to {}
96 | :return: whether A matches a subset of what B matches
97 | """
98 | if a == b:
99 | return True
100 | if int(a is None) < int(b is None):
101 | return True
102 | if int(a is None) > int(b is None):
103 | return False
104 | assert isinstance(a, signaturelibrary.FunctionNode)
105 | assert isinstance(b, signaturelibrary.FunctionNode)
106 | assert a in func_info
107 |
108 | # this is essentially a dfs on the callgraph. if we encounter a backedge,
109 | # treat it optimistically, implying that the callers match if the callees match.
110 | # however, we track our previous assumptions, meaning that if we previously
111 | # optimistically assumed b == a, then later on if we compare b and c, we say
112 | # that b != c since we already assumed b == a (and we already checked above that c != a).
113 | if b in visited:
114 | return visited[b] == a
115 | visited[b] = a
116 |
117 | # if A is bridge, but B isn't, A is obviously more ambiguous than B. (and vice versa)
118 | if int(a.is_bridge) < int(b.is_bridge):
119 | return True
120 | if int(a.is_bridge) > int(b.is_bridge):
121 | return False
122 |
123 | if not b.is_bridge:
124 | for a_pattern in func_info[a].patterns:
125 | # if A is a subset of B, then B >= A; i.e., searching the trie for A's data should match B.
126 |             # A <= B --> A ⨅ B = A
127 | if b not in sig_trie.find(a_pattern):
128 | return False
129 |
130 | # return false if B's additional pattern doesn't match A (B ⨅ A != B)
131 | for a_pattern in func_info[a].patterns:
132 | if b.pattern_offset >= 0 and b.pattern_offset + len(b.pattern) < len(a_pattern):
133 | if not b.pattern.matches(a_pattern[b.pattern_offset:]):
134 | return False
135 |
136 | # check that all callees required by B are also required by A
137 | for call_site, callee in b.callees.items():
138 | if callee is not None and call_site not in a.callees:
139 | return False
140 | if not all(map(lambda k: is_signature_subset(a.callees[k] if k in a.callees else None, func_info,
141 | b.callees[k], sig_trie, visited), b.callees)):
142 | return False
143 |
144 | return True
145 |
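# A tiny concrete illustration of the ordering above, using only the pattern
# component (real signatures also compare callees):
#
#     A = signaturelibrary.Pattern.from_str('1234??56')
#     B = signaturelibrary.Pattern.from_str('1234????')   # strictly more ambiguous
#     assert str(A.intersect(B)) == str(A)                 # A ⨅ B = A, hence A <= B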
146 |
147 | def rewrite_callgraph(funcs, to_delete):
148 | # complete the DFS first, avoid simultaneous modification and traversal
149 | inverse_callgraph = defaultdict(set)
150 | for func in funcs:
151 | if func in to_delete: continue
152 | for callee in func.callees.values():
153 | if callee in to_delete:
154 | inverse_callgraph[callee].add(func)
155 |
156 | def follow(k):
157 | while k in to_delete:
158 | k = to_delete[k]
159 | return k
160 |
161 | # rewrite callgraph
162 | for k in to_delete:
163 | v = follow(k)
164 | for func in inverse_callgraph[k]:
165 | for call_site in func.callees:
166 | if func.callees[call_site] == k:
167 | func.callees[call_site] = v
168 | assert k != v
169 | # print('replace', k.name, id(k), '=>', v.name, id(v),'in', func.name)
170 |
171 |
172 | def rewrite_trie(sig_trie, to_delete, update=False):
173 | def follow(k):
174 | while k in to_delete:
175 | k = to_delete[k]
176 | return k
177 |
178 | # rewrite trie values
179 | for node in sig_trie.all_nodes():
180 | if not node.value: continue
181 | new_value = []
182 | for func in node.value:
183 | func.ref_count -= 1
184 | if func in to_delete:
185 | if update:
186 | v = follow(func)
187 | if v not in new_value:
188 | v.ref_count += 1
189 | new_value.append(v)
190 | else:
191 | if func not in new_value:
192 | func.ref_count += 1
193 | new_value.append(func)
194 | node.value = new_value
195 |
196 | # dfs; delete functionless subtries
197 | def prune(node):
198 | if not node.children:
199 | should_delete = not node.value
200 | return should_delete
201 | new_children = {}
202 | for b, c in node.children.items():
203 | should_delete = prune(c)
204 | if not should_delete:
205 | new_children[b] = c
206 | node.children = new_children
207 | should_delete = not node.children and not node.value
208 | return should_delete
209 | prune(sig_trie)
210 |
211 |
212 | # one-way deduplication (trie1 to trie2)
213 | def find_redundant(trie1, info1, trie2):
214 | cache = {}
215 | def cached_is_signature_subset(a, func_info, b, sig_trie, visited):
216 | if (a, b) in cache:
217 | return cache[(a, b)]
218 | result = is_signature_subset(a, func_info, b, sig_trie, visited)
219 | cache[(a, b)] = result
220 | return result
221 |
222 |
223 | # search trie2 for funcs from trie1. if `A` is matched by `B`, then `A` matches a subset of `B`
224 | # and should be discarded. references to `A` should be replaced by references to `B`.
225 | # algebraically if A ⨅ B = A, then A <= B, so A is redundant.
226 | to_delete = {}
227 |
228 | def check_if_redundant(func_a, func_b):
229 | while func_b in to_delete: # avoid cycles
230 | func_b = to_delete[func_b]
231 | if func_a == func_b: # avoid infinite loop
232 | return False
233 | if not are_names_compatible(func_a, func_b):
234 | return False
235 | if cached_is_signature_subset(func_a, info1, func_b, trie2, {}): # func <= cand. func is redundant
236 | to_delete[func_a] = func_b
237 | return True
238 | return False
239 |
240 | for func in info1: # func is our `A`
241 | for pattern in info1[func].patterns:
242 | candidates = trie1.find(pattern)
243 | for cand in candidates: # cand is our `B`
244 | check_if_redundant(func, cand)
245 |
246 | # also clean up useless bridge nodes
247 | bridges1 = list(filter(lambda f: f.is_bridge, info1))
248 | bridges2 = list(filter(lambda f: f.is_bridge, trie2.all_functions()))
249 | for func in bridges1:
250 | for cand in bridges2:
251 | check_if_redundant(func, cand)
252 |
253 | return to_delete
254 |
255 |
256 | # Would it be ok to substitute A with B if they have the same name? In that case, trie position is irrelevant as
257 | # we can just have multiple leaf nodes pointing to the same function node
258 | def can_substitute(a, b):
259 | if a == b: return True
260 | if (b is None) != (a is None): return False
261 | assert isinstance(a, signaturelibrary.FunctionNode)
262 | assert isinstance(b, signaturelibrary.FunctionNode)
263 |
264 | if not are_names_compatible(a, b):
265 | return False
266 |
267 | # if A is bridge, but B isn't, A is obviously more ambiguous than B.
268 | if int(a.is_bridge) < int(b.is_bridge):
269 | return False
270 |
271 | # check that all callees required by B are also required by A
272 | for call_site, callee in b.callees.items():
273 | if callee is not None and call_site not in a.callees:
274 | return False
275 |
276 | return True
277 |
278 | # deal with signatures with the same name at different parts in the signature trie that can be merged
279 | def collapse_by_name(func_info):
280 | by_name = defaultdict(set)
281 | for f in func_info:
282 | by_name[f.name].add(f)
283 | to_delete = {}
284 | for family in by_name.values():
285 | for func in family:
286 | for cand in family:
287 | while cand in to_delete: # avoid cycles
288 | cand = to_delete[cand]
289 | if func == cand: # avoid infinite loop
290 | continue
291 | if can_substitute(func, cand):
292 | to_delete[func] = cand
293 | # transfer patterns and aliases from deleted functioninfo to cand's
294 | cand_info = func_info[cand]
295 | deleted_info = func_info[func]
296 | cand_info.patterns.extend(deleted_info.patterns)
297 | cand_info.aliases.extend(deleted_info.aliases)
298 | deleted_info.patterns = [] # free memory (!)
299 | deleted_info.aliases = []
300 | return to_delete
301 |
302 |
303 | def sanity_check(sig_trie):
304 | if not sig_trie.children:
305 | sys.stderr.write('Warning: no functions in trie\n')
306 | return
307 |
308 | count = defaultdict(lambda: 0)
309 | for func in sig_trie.all_values():
310 | count[func] += 1
311 | for func in sig_trie.all_functions():
312 | assert func.ref_count == count[func]
313 |
314 |
315 | # we avoid linking across library boundaries ... they're discrete compilation units and we shouldn't assume
316 | # anything about inter-module calls. who knows which version will be linked with what!
317 | # if we can't resolve the reference, exclude that from the signature! if an optional library isn't linked,
318 | # the call will turn into a stub (like jump 0x0), and will not be a call in the real binary.
319 | # so, we give that a wildcard. in our matching algorithm, we allow calls to wildcard callee to be optional.
320 | def resolve_reference(name, sym_type, source_binary, source_to_node):
321 | if sym_type == SymbolType.FunctionSymbol:
322 | # look for callee from the same object file
323 | if source_binary in source_to_node:
324 | result = source_to_node[source_binary]
325 | # print('resolved static reference', name, '=', result.name, 'from', source_binary)
326 | return result
327 | else:
328 | # sys.stderr.write('Warning: missing static reference ' + name + ' from ' + source_binary + '\n')
329 | return None
330 | else:
331 | # look for callee in a different object file
332 | possible_callees = []
333 | for source in source_to_node:
334 | if source != source_binary:
335 | possible_callees.append(source_to_node[source])
336 | if not possible_callees:
337 | # sys.stderr.write('Warning: missing extern reference ' + name + ' from ' + source_binary + '\n')
338 | return None
339 | elif len(possible_callees) > 1:
340 | # sys.stderr.write('Warning: multiple definitions for external reference ' + name + ' from ' + source_binary + ': '+ ', '.join(map(lambda n: n.name, possible_callees)) + '\n')
341 | return None
342 | else:
343 | # print('resolved extern reference', name, '=', possible_callees[0].name)
344 | return possible_callees[0]
345 |
346 |
347 | def link_callgraph(func_info):
348 | """
349 | Construct the callgraph based on `FunctionInfo` and link all the `FunctionNode`s together.
350 | :param func_info:
351 | :return:
352 | """
353 | name_to_source_to_node = defaultdict(dict)
354 | for node, info in func_info.items():
355 | for name in [node.name] + info.aliases:
356 | name_to_source_to_node[name][node.source_binary] = node
357 |
358 | for node, info in func_info.items():
359 | node.callees = {call_site: resolve_reference(name, sym_type, node.source_binary, name_to_source_to_node[name])
360 | for call_site, (name, sym_type) in info.callees.items()}
361 | # Wildcard out callees that are masked out.
362 | def is_valid_call_site(i):
363 | if i < 0: return False
364 | for pattern in info.patterns:
365 | if i >= len(pattern): return False
366 |                 if not pattern[i].mask: return False  # call-site byte is wildcarded out
367 | return True
368 | node.callees = {call_site: callee if is_valid_call_site(call_site) else None
369 | for call_site, callee in node.callees.items()}
370 |
371 |
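# For reference, a FunctionInfo.callees entry as consumed above might look like
# this (names and offsets are made up for illustration):
#
#     info.callees = {0x12: ('memcpy', SymbolType.FunctionSymbol),
#                     0x30: ('malloc', SymbolType.ImportedFunctionSymbol)}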
372 | def choose_disambiguation_bytes(sig_trie, func_info, min_offset=32, maxlen=5):
373 | for node in sig_trie.all_nodes():
374 | if not node.value: continue
375 | for f in node.value: assert f in func_info
376 | for f in node.value: # reset patterns
377 | f.pattern = signaturelibrary.Pattern(b'', [])
378 | f.pattern_offset = 0
379 | if len(node.value) <= 1: continue
380 |
381 | # since a FunctionNode can have multiple patterns in its FunctionInfo, we say that the set of functions
382 | # it matches is based on the *join* ⨆ of all of these patterns. our goal here is to find some substring
383 | # in all of these patterns that share no intersection.
384 | #
385 | # let P(f) denote the patterns belonging to FunctionNode f's FunctionInfo.
386 | # then let PU(f) = ⨆ P(f) ; i.e. the join of all patterns, a pattern that would match the union of functions matched by those patterns.
387 | # given some functions f1,f2,... at this trie node, we want to find some substring (i,j) in PU(f1),PU(f2),...
388 | # such that PU(fx)[i:j] ⨅ PU(fy)[i:j] = 0 for all pairs fx,fy in f1,f2,...
389 | # then we will choose PU(f)[i:j] as f's disambiguation pattern for each FunctionNode f in f1,f2,...
390 |
391 | pu = {func: reduce(signaturelibrary.Pattern.union, func_info[func].patterns) for func in node.value}
392 | min_len = min(map(len, pu.values()))
393 | if min_len <= min_offset: # this is hopeless. all those bytes are already in the trie
394 | # print('Warn: no possible disambiguation (length) for', repr(node))
395 | continue
396 | if reduce(operator.eq, pu.values()):
397 | # print('Warn: no possible disambiguation (content) for', repr(node))
398 | continue
399 |
400 | def ok(i, j):
401 | for fx in node.value:
402 | for fy in node.value:
403 | if fx == fy: continue
404 | if pu[fx][i:j].intersect(pu[fy][i:j]) is not None:
405 | return False
406 | return True
407 |
408 | for i in range(min_offset, min_len-1): # unfortunately, this is O(min_len*maxlen).
409 | j = i+1
410 | while not ok(i, j) and j < min_len and j-i < maxlen:
411 | j += 1
412 | while ok(i+1, j) and i+1 < j:
413 | i += 1
414 | if ok(i, j):
415 | for f in node.value:
416 | f.pattern = pu[f][i:j]
417 | f.pattern_offset = i
418 | break
419 | # else:
420 | # print('Warn: failed to choose disambiguation for', repr(node))
421 |
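# Sketch of the join/intersection idea above with two made-up unioned patterns
# that disagree at offset 32 (min_offset), making that byte usable for
# disambiguation:
#
#     p1 = signaturelibrary.Pattern.from_str('11' * 32 + 'aa')
#     p2 = signaturelibrary.Pattern.from_str('11' * 32 + 'bb')
#     assert p1[32:33].intersect(p2[32:33]) is None   # no overlap -> unambiguous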
422 |
423 | # finalizing a trie links the call graph and removes any redundant nodes, and adds disambiguation bytes
424 | def finalize_trie(sig_trie, func_info):
425 | link_callgraph(func_info)
426 | sanity_check(sig_trie)
427 |
428 | to_delete = find_redundant(sig_trie, func_info, sig_trie)
429 | rewrite_callgraph(func_info, to_delete)
430 | rewrite_trie(sig_trie, to_delete)
431 | for k in to_delete: assert k.ref_count == 0
432 | for k in to_delete: del func_info[k]
433 | to_delete = collapse_by_name(func_info)
434 |
435 | rewrite_callgraph(func_info, to_delete)
436 | rewrite_trie(sig_trie, to_delete)
437 | for k in to_delete: assert k.ref_count == 0
438 | for k in to_delete: del func_info[k]
439 | sanity_check(sig_trie)
440 |
441 | choose_disambiguation_bytes(sig_trie, func_info)
442 |
443 |
444 | # inserts functions from FunctionInfo dict `src_info` into trie `dst_trie`.
445 | def trie_insert_funcs(dst_trie, src_info, maxlen=32):
446 | for to_add in src_info:
447 | to_add.ref_count = 0 # we are repatriating this function node. reset refcount
448 | for pattern in src_info[to_add].patterns:
449 | pattern = pattern[:maxlen]
450 | inserted = dst_trie.insert(pattern, to_add)
451 |
452 |
453 | # merges a signature trie `src_trie` into another signature trie `dst_trie`, with FunctionInfo only available for `src_trie`.
454 | # `dst_trie` is modified.
455 | def update_signature_library(dst_trie, src_trie, src_info):
456 | link_callgraph(src_info) # build callgraph
457 |
458 | # identify redundant signatures
459 | to_delete = find_redundant(src_trie, src_info, dst_trie)
460 |
461 | # merge
462 | trie_insert_funcs(dst_trie, src_info)
463 | rewrite_callgraph(dst_trie.all_functions(), to_delete)
464 | rewrite_trie(dst_trie, to_delete)
465 |
466 | sanity_check(dst_trie)
467 |
468 |
469 | # combines two signature tries, `src_trie` into `dst_trie` where FunctionInfo is available for both tries.
470 | # both `dst_trie` and `dst_info` are mutated: functions from `src_trie` and `src_info` are added to `dst_trie` and `dst_info`.
471 | def combine_signature_libraries(dst_trie, dst_info, src_trie, src_info):
472 | # merge
473 | trie_insert_funcs(dst_trie, src_info)
474 | dst_info.update(src_info)
475 |
476 | # identify redundant signatures
477 | to_delete = find_redundant(dst_trie, dst_info, src_trie)
478 | rewrite_callgraph(dst_info, to_delete)
479 | rewrite_trie(dst_trie, to_delete)
480 | for k in to_delete: assert k.ref_count == 0
481 | for k in to_delete: del dst_info[k]
482 |
483 | sanity_check(dst_trie)
484 |
--------------------------------------------------------------------------------
/signaturelibrary.fbs:
--------------------------------------------------------------------------------
1 | namespace FlatbufSignatureLibrary;
2 |
3 | struct CallRef {
4 | // offset from the start of the function to the call instruction
5 | offset: int (key);
6 |
7 | // the index of the callee function in the signature library's
8 | // function vector. a value of -1 indicates a missing, or 'null'
9 | // node, that becomes a wildcard when matching.
10 | dst_id: int;
11 | }
12 |
13 | table Pattern {
14 | // pattern data.
15 | // e.g., for 4142??3132 it would be \x41\x42\x00\x31\x32
16 | data: [ubyte] (required);
17 |
18 | // bitfield of the mask in LSB order.
19 |   // e.g., for x?xxxxxx xxx? the mask would be [0b11111101, 0b0111]
20 | mask: [ubyte] (required);
21 | }
22 |
23 | table Function {
24 | // name of the function
25 | name: string (required);
26 |
27 | // which object file or binary the function came from
28 | source_binary: string;
29 |
30 | // a map representing the functions this function calls.
31 | callees: [CallRef];
32 |
33 | // optional disambiguation pattern
34 | pattern: Pattern;
35 |
36 | pattern_offset: uint;
37 |
38 | // if true, this function is a "bridge" node that serves only to link
39 | // two function nodes on the callgraph. bridge nodes don't exist in
40 | // the signature trie, only in the callgraph. typically, this is
41 | // because the function is too short to put in the signature trie
42 | // while avoiding false positives.
43 | is_bridge: bool;
44 | }
45 |
46 | table TrieNode {
47 | // this is the first byte of the pattern. this is used as the key
48 |   // in the children map so that children can be selected using binary search.
49 | // if the pattern's first byte is a wildcard, then this field is left
50 | // as zero and `wildcard_child' is used in place of `children'.
51 | patternPrefix: ubyte (key);
52 |
53 | // pattern this trie node matches
54 | pattern: Pattern (required);
55 |
56 | // map of child nodes from their pattern's first byte to the node.
57 | children: [TrieNode];
58 |
59 | // child node whose pattern begins with a wildcard, if such a child node exists
60 | wildcard_child: TrieNode;
61 |
62 | // the functions this trie node matches.
63 | functions: [uint];
64 | }
65 |
66 | table SignatureLibrary {
67 | // a vector of all of the function nodes present in this signature library.
68 | functions: [Function] (required);
69 |
70 | // the root trie node.
71 | root: TrieNode (required);
72 | }
73 |
74 | file_extension "sig";
75 | root_type SignatureLibrary;
76 |
--------------------------------------------------------------------------------