├── .gitignore ├── .travis.yml ├── LICENSE ├── MANIFEST.in ├── README.md ├── cpytraceafl ├── __init__.py ├── _testheadermodule.c ├── _tracehookmodule.c ├── cpytraceafl.h ├── rewriter.py ├── tracehook.py └── version.py ├── default.nix ├── dummy-afl-qemu-trace ├── examples ├── hpack_example.py ├── pillow_pcx_example.py ├── pypdf2_example.py └── simplejson_example.py ├── pytest.ini ├── setup.cfg ├── setup.py ├── tcplistenfeeder.py └── tests ├── test_header.py ├── test_rewriter.py └── test_tracehook.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.so 3 | *.o 4 | .eggs/ 5 | build/ 6 | *.egg-info/ 7 | dist/ 8 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: nix 2 | nix: 2.2.2 3 | jobs: 4 | include: 5 | - name: Unit tests 6 | script: nix-shell . --pure --run 'python setup.py test' 7 | 8 | - name: Non-nix installation 9 | script: 10 | - nix-shell . 
--pure --run 'python setup.py sdist' 11 | - mkdir -p tmp/venv 12 | - pushd tmp 13 | - nix-shell --pure -p python37Packages.pip --run \ 14 | 'python -m venv venv; source venv/bin/activate; pip install ../dist/cpytraceafl-*.tar.gz; python -c "import cpytraceafl"' 15 | - popd 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Robert Scott 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include cpytraceafl *.h 2 | recursive-include tests *.py 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # cpytraceafl 2 | 3 | CPython bytecode instrumentation and forkserver tools for fuzzing python code using AFL. 4 | 5 | The tools in this repository enable coverage-guided fuzzing of pure python and mixed python/c 6 | code using [American Fuzzy Lop](https://github.com/google/AFL) (even better, 7 | [AFL++](https://github.com/vanhauser-thc/AFLplusplus)). 8 | 9 | There are three main parts to this: 10 | 11 | - A bytecode rewriter using a technique inspired by Ned Batchelder's "wicked hack" 12 | detailed at https://nedbatchelder.com/blog/200804/wicked_hack_python_bytecode_tracing.html. 13 | In this case, the rewriter identifies "basic blocks" in the python bytecode and abuses the 14 | `code` object's `lnotab` (line-number table) to mark each basic block as a new "line". 15 | These new "lines" are what trigger CPython's line-level trace hooks. The result is 16 | that we can get our trace hook executed on every new basic block. 17 | - A minimal & fast tracehook written in C, tallying visited locations to sysv shared memory. 18 | - A basic forkserver implementation. 19 | 20 | Preparing code for fuzzing involves a couple of steps. The first thing that should happen in 21 | the python process is a call to `install_rewriter()`. It's important that this is done very 22 | early as any modules that are imported before this will not be properly instrumented. 
23 | 24 | ```python 25 | from cpytraceafl.rewriter import install_rewriter 26 | 27 | install_rewriter() 28 | ``` 29 | 30 | `install_rewriter()` can optionally be provided with a `selector` controlling which code objects 31 | are instrumented and to what degree. 32 | 33 | Following this, modules can be imported as normal and will be instrumented by the monkeypatched 34 | `compile` functions. It's usually a good idea to initialize the test environment next, 35 | performing as many setup procedures as possible before the input file is read. This may 36 | include doing an initial run of the function under test to ensure any internal imports or caches 37 | are set up. This is because we want to minimize work that has to be done post-fork - any work 38 | done now only has to be done once. 39 | 40 | After calling 41 | 42 | ```python 43 | from cpytraceafl import fuzz_from_here 44 | 45 | fuzz_from_here() 46 | ``` 47 | 48 | the `fork()` will have been made and tracing started. You now simply read your input file and 49 | call your function under test. 50 | 51 | Examples for fuzzing some common packages are provided in [examples/](./examples/). 52 | 53 | As for hooking this script up to AFL, I tend to use the included 54 | [dummy-afl-qemu-trace](./dummy-afl-qemu-trace) shim script to fool AFL's QEMU mode into 55 | communicating directly with the python process. 56 | 57 | ## Fuzzing mixed python/c code 58 | 59 | As of version 0.4.0, `cpytraceafl` can gather trace information from C extension modules that 60 | have been compiled with AFL instrumentation (e.g. using `llvm_mode`). This means that it can 61 | be used to seamlessly fuzz projects which have a mix of python and C "speedups". This is 62 | important not only because a lot of python format-parsing packages use this approach, but 63 | because issues revealed in native code are far more likely to have security implications. 
64 | 65 | Including instrumented native code requires a little more care when preparing a target for 66 | fuzzing. For instance, it's important to ensure the `cpytraceafl.tracehook` module has been 67 | imported and it has had its `set_map_start(...)` function provided with a valid memory 68 | area *before* any instrumented extension modules are loaded. This is because simply loading an 69 | instrumented native module will cause it to attempt to log its execution trace somewhere. 70 | 71 | The example [pillow_pcx_example.py](./examples/pillow_pcx_example.py) demonstrates a fuzzing 72 | target taking the necessary precautions into account. 73 | 74 | It's possible that you're _only_ interested in tracing the native code, using `cpytraceafl` 75 | just as a driver, in which case you can omit the early `install_rewriter()` call and all 76 | the weirdness involved with that. 77 | 78 | ## Regular expressions 79 | 80 | [cpytraceafl-regex](https://github.com/risicle/cpytraceafl-regex) is a companion, 81 | `re`-replacement regex implementation with added instrumentation that should aid AFL in 82 | generating examples that pass regular expressions used in the target code, or 83 | exercise them in interesting ways. Without this, AFL will just see regular expressions 84 | as a black box that will act as a barrier to path exploration. 85 | 86 | ## Trophy cabinet 87 | 88 | `cpytraceafl` has been used to find: 89 | 90 | - Pillow: [CVE-2020-10177](https://nvd.nist.gov/vuln/detail/CVE-2020-10177), 91 | [CVE-2020-10378](https://nvd.nist.gov/vuln/detail/CVE-2020-10378), 92 | [CVE-2020-10379](https://nvd.nist.gov/vuln/detail/CVE-2020-10379), 93 | [CVE-2020-10994](https://nvd.nist.gov/vuln/detail/CVE-2020-10994), 94 | [CVE-2020-11538](https://nvd.nist.gov/vuln/detail/CVE-2020-11538). 
95 | - bsdiff4: [CVE-2020-15904](https://nvd.nist.gov/vuln/detail/CVE-2020-15904) 96 | - asyncpg: [CVE-2020-17446](https://nvd.nist.gov/vuln/detail/CVE-2020-17446) 97 | - clickhouse-driver: [CVE-2020-26759](https://nvd.nist.gov/vuln/detail/CVE-2020-26759) 98 | 99 | ## Q & A 100 | 101 | ### Is there any point in fuzzing python? Isn't it too slow? 102 | 103 | Well, yes and no. My experience has been that fuzzing python code is simply "a bit different" 104 | from fuzzing native code - you tend to be looking for different things. In terms of raw speed, 105 | fuzzing python is certainly not fast, but iteration rates I tend to work with aren't completely 106 | dissimilar to what I'm used to getting with AFL's Qemu mode (of course, no two fuzzing targets 107 | are really directly comparable). 108 | 109 | Because of the memory-safe nature of pure python code, it's also more uncommon for issues 110 | uncovered through fuzzing to be security issues - logical flaws in parsing tend to lead to 111 | unexpected/unhandled exceptions. So it's still a rather useful tool in simply looking for bugs. 112 | It can be used, for example, to generate a corpus of example inputs for your test suite which 113 | exercise a large amount of the code. 114 | 115 | However, note that while *pure* python code may be memory safe, as soon as you start using 116 | the C api, Cython, or even start playing with the `ctypes` module, it is *not*. 117 | 118 | ### Does basic block analysis make any sense for python code? 119 | 120 | From a rigorous academic stance, and for some uses, possibly not - you've got to keep in mind 121 | that half the bytecode instructions could result in calls out to more arbitrary python or 122 | (uninstrumented) native code that could have arbitrary side effects. But for our needs it works 123 | well enough (recall that AFL coverage analysis is robust to random instrumentation 124 | sites being omitted through `AFL_INST_RATIO` or `AFL_INST_LIBS`). 
125 | 126 | ### Doesn't abusing `lnotab` break python's debugging mechanisms? 127 | 128 | Absolutely it does. Don't use instrumented programs to debug problematic cases - use them to 129 | generate problematic inputs. Analyze them with instrumentation turned off. 130 | 131 | ### I'm getting `undefined symbol: __afl_area_ptr` 132 | 133 | Looks like you're trying to import an (instrumented) native extension module before the 134 | `cpytraceafl.tracehook` module has been loaded (which is what provides that symbol). 135 | 136 | ### I'm getting Segmentation Faults after importing an instrumented native module 137 | 138 | You probably also need to provide `cpytraceafl.tracehook.set_map_start(...)` with a valid 139 | writeable memory area before the import. Assuming you're not interested in the trace associated 140 | with the import process, this can just be a dummy which you later discard. I'd recommend either 141 | using an `mmap` object or `sysv_ipc.SharedMemory`. When `fuzz_from_here()` is called, this will 142 | be replaced with the right one. 143 | 144 | It's also possible the instrumented module was built with a different AFL `MAP_SIZE_POW2` from 145 | that in `cpytraceafl.MAP_SIZE_BITS`. 146 | 147 | ### Do I need a specially-built/instrumented version of cpython to use this? 148 | 149 | No, you can use your normal distribution-installed python. If you're just looking at 150 | fuzzing pure python, you don't need to even think about building any binaries with 151 | funny compilers. 152 | 153 | You may be interested in building c/c++/cython-based modules or their underlying native 154 | libraries with instrumentation if that's what you're trying to fuzz, but I suspect using 155 | a natively-instrumented _cpython_ would be quite complicated and extremely slow. 156 | 157 | ### Do you have any tips on detecting memory errors in cpython extensions? 
158 | 159 | I have tended to use `tcmalloc`'s debugging modes with `TCMALLOC_PAGE_FENCE` and 160 | `TCMALLOC_PAGE_FENCE_NEVER_RECLAIM` enabled. In fact I have 161 | [a fork](https://github.com/gperftools/gperftools/compare/master...risicle:ris-extras) 162 | of `gperftools` containing some additional `tcmalloc` hacks I've found useful. 163 | 164 | One problem with this of course is that much of cpython's memory is allocated 165 | using its own memory pool allocator, which is largely invisible to the `malloc` 166 | implementation. So I've also got 167 | [a patch for cpython](https://gist.github.com/risicle/12c6f20518807699d816b8cb4389b840) 168 | which adds a very basic canary mechanism to its pool allocator (at the slight expense of 169 | memory efficiency). 170 | -------------------------------------------------------------------------------- /cpytraceafl/__init__.py: -------------------------------------------------------------------------------- 1 | import ctypes 2 | import os 3 | import signal 4 | import struct 5 | import sys 6 | 7 | import sysv_ipc 8 | 9 | from cpytraceafl import tracehook 10 | 11 | 12 | # these values *must* agree with those set in afl's config.h, and also those used when compiling 13 | # any instrumented native modules 14 | FORKSRV_FD = 198 15 | DEFAULT_MAP_SIZE_BITS = 16 16 | DEFAULT_SHM_ENV_VAR = "__AFL_SHM_ID" 17 | 18 | MAP_SIZE_ENV_VAR = "AFL_MAP_SIZE" 19 | NGRAM_SIZE_ENV_VAR = "AFL_NGRAM_SIZE" 20 | 21 | 22 | def get_map_size_bits_env(): 23 | if MAP_SIZE_ENV_VAR in os.environ: 24 | map_size = int(os.environ[MAP_SIZE_ENV_VAR]) 25 | map_size_bits = map_size.bit_length() - 1 26 | if 1< 3 | 4 | #include "cpytraceafl.h" 5 | 6 | 7 | static PyObject * _test_record_loc(PyObject *self, PyObject *args) { 8 | uint32_t this_loc; 9 | 10 | if (!PyArg_ParseTuple(args, "I", &this_loc)) 11 | return NULL; 12 | 13 | cpytraceafl_record_loc(this_loc); 14 | 15 | Py_INCREF(Py_None); 16 | return Py_None; 17 | } 18 | 19 | static PyMethodDef TracehookMethods[] = { 
20 | { 21 | "_test_record_loc", 22 | _test_record_loc, 23 | METH_VARARGS, 24 | "Method testing cpytraceafl_record_loc" 25 | }, 26 | {NULL, NULL, 0, NULL} 27 | }; 28 | 29 | static struct PyModuleDef testheadermodule = { 30 | PyModuleDef_HEAD_INIT, 31 | // name 32 | "_testheader", 33 | // documentation 34 | "Module to aid in testing use of cpytraceafl's c header interface", 35 | // per-interpreter state size 36 | -1, 37 | TracehookMethods 38 | }; 39 | 40 | PyMODINIT_FUNC 41 | PyInit__testheader(void) 42 | { 43 | return PyModule_Create(&testheadermodule); 44 | } 45 | -------------------------------------------------------------------------------- /cpytraceafl/_tracehookmodule.c: -------------------------------------------------------------------------------- 1 | #define PY_SSIZE_T_CLEAN 2 | #include 3 | 4 | #include "cpytraceafl.h" 5 | 6 | #define HASH_PRIME 0xedb6417b 7 | 8 | unsigned char afl_map_size_bits = 16; 9 | unsigned char afl_ngram_size = 0; 10 | 11 | char* __afl_area_ptr = NULL; 12 | 13 | // non-ngram-aware AFL should be able to interpret this symbol as a 14 | // plain old uint32 and work fine 15 | __thread afl_prev_loc_vector_t __afl_prev_loc; 16 | 17 | // needed by AFL++'s context sensitive coverage feature 18 | __thread uint32_t __afl_prev_ctx; 19 | 20 | static PyObject * tracehook_set_map_start(PyObject *self, PyObject *args) { 21 | unsigned long long _afl_map_start; 22 | 23 | if (!PyArg_ParseTuple(args, "K", &_afl_map_start)) 24 | return NULL; 25 | 26 | __afl_area_ptr = (char *) _afl_map_start; 27 | 28 | Py_INCREF(Py_None); 29 | return Py_None; 30 | } 31 | 32 | static PyObject * tracehook_set_map_size_bits(PyObject *self, PyObject *args) { 33 | if (!PyArg_ParseTuple(args, "b", &afl_map_size_bits)) 34 | return NULL; 35 | 36 | Py_INCREF(Py_None); 37 | return Py_None; 38 | } 39 | 40 | static PyObject * tracehook_set_ngram_size(PyObject *self, PyObject *args) { 41 | unsigned char ngram_size; 42 | if (!PyArg_ParseTuple(args, "b", &ngram_size)) 43 | return 
NULL; 44 | 45 | if (ngram_size != 0 && (ngram_size < 2 || ngram_size > NGRAM_SIZE_MAX)) { 46 | PyErr_SetString(PyExc_ValueError, "ngram size must be 0 or between 2 and NGRAM_SIZE_MAX"); 47 | return NULL; 48 | } 49 | 50 | afl_ngram_size = ngram_size; 51 | 52 | Py_INCREF(Py_None); 53 | return Py_None; 54 | } 55 | 56 | static PyObject * tracehook_global_trace_hook(PyObject *self, PyObject *args) { 57 | PyObject* frame; 58 | char* event; 59 | PyObject* arg; 60 | 61 | if (!PyArg_ParseTuple(args, "OsO", &frame, &event, &arg)) 62 | return NULL; 63 | 64 | if (!strcmp(event, "call")) { 65 | PyObject* code = PyObject_GetAttrString(frame, "f_code"); 66 | if (code == NULL) return NULL; 67 | PyObject* lnotab = PyObject_GetAttrString(code, "co_lnotab"); 68 | Py_DECREF(code); 69 | if (lnotab == NULL) return NULL; 70 | Py_ssize_t len = PyObject_Length(lnotab); 71 | Py_DECREF(lnotab); 72 | if (len > 0) { // else this is not a function we're interested in 73 | PyObject* line_trace_hook = PyObject_GetAttrString(self, "line_trace_hook"); 74 | if (line_trace_hook == NULL) return NULL; 75 | Py_INCREF(line_trace_hook); 76 | return line_trace_hook; 77 | } 78 | } 79 | 80 | Py_INCREF(Py_None); 81 | return Py_None; 82 | } 83 | 84 | static PyObject * tracehook_line_trace_hook(PyObject *self, PyObject *args) { 85 | PyObject* frame; 86 | char* event; 87 | PyObject* arg; 88 | 89 | if (!PyArg_ParseTuple(args, "OsO", &frame, &event, &arg)) 90 | return NULL; 91 | 92 | // In instrumented code objects, this number is effectively the current basic block number 93 | // added to the code object's co_firstlineno (which is used as a "base hash" for the code 94 | // object). Previously we used the raw memory location of the code object for this, but 95 | // that has the potential to be chaotic if an execution path affects the order in which 96 | // various memory allocations are made. 
97 | PyObject* f_lineno = PyObject_GetAttrString(frame, "f_lineno"); 98 | if (f_lineno == NULL) return NULL; 99 | uint32_t lineno = (uint32_t)PyLong_AsUnsignedLong(f_lineno); 100 | Py_DECREF(f_lineno); 101 | 102 | // bytecode offset is also useful & consistent entropy - we'll have that too. 103 | PyObject* f_lasti = PyObject_GetAttrString(frame, "f_lasti"); 104 | if (f_lasti == NULL) return NULL; 105 | uint32_t bytecode_offset = (uint32_t)PyLong_AsUnsignedLong(f_lasti); 106 | Py_DECREF(f_lasti); 107 | 108 | if (!lineno) // avoid zero multiplication 109 | lineno = ~(uint32_t)0; 110 | if (!bytecode_offset) // avoid zero multiplication 111 | bytecode_offset = ~(uint32_t)0; 112 | 113 | // multiplicative hashing - keep most significant bits of a modular multiplication as "hash" 114 | uint32_t state = HASH_PRIME; 115 | state *= lineno; 116 | state *= bytecode_offset; 117 | 118 | cpytraceafl_record_loc(state >> (32-afl_map_size_bits)); 119 | 120 | PyObject* line_trace_hook = PyObject_GetAttrString(self, "line_trace_hook"); 121 | if (line_trace_hook == NULL) return NULL; 122 | Py_INCREF(line_trace_hook); 123 | return line_trace_hook; 124 | } 125 | 126 | static PyMethodDef TracehookMethods[] = { 127 | { 128 | "set_map_start", 129 | tracehook_set_map_start, 130 | METH_VARARGS, 131 | "Set start address of AFL shared memory region" 132 | }, 133 | { 134 | "set_map_size_bits", 135 | tracehook_set_map_size_bits, 136 | METH_VARARGS, 137 | "Set log2 of size of AFL shared memory region" 138 | }, 139 | { 140 | "set_ngram_size", 141 | tracehook_set_ngram_size, 142 | METH_VARARGS, 143 | "Set number of branches to remember, 0 to disable ngram mode" 144 | }, 145 | { 146 | "global_trace_hook", 147 | tracehook_global_trace_hook, 148 | METH_VARARGS, 149 | "Global tracehook callable for passing to sys.settrace()" 150 | }, 151 | { 152 | "line_trace_hook", 153 | tracehook_line_trace_hook, 154 | METH_VARARGS, 155 | "'line' tracehook callable, returned by global_trace_hook when appropriate" 156 
| }, 157 | {NULL, NULL, 0, NULL} 158 | }; 159 | 160 | static struct PyModuleDef tracehookmodule = { 161 | PyModuleDef_HEAD_INIT, 162 | // name 163 | "_tracehook", 164 | // documentation 165 | NULL, 166 | // per-interpreter state size 167 | -1, 168 | TracehookMethods 169 | }; 170 | 171 | PyMODINIT_FUNC 172 | PyInit__tracehook(void) 173 | { 174 | return PyModule_Create(&tracehookmodule); 175 | } 176 | -------------------------------------------------------------------------------- /cpytraceafl/cpytraceafl.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | extern unsigned char afl_map_size_bits; 5 | extern unsigned char afl_ngram_size; 6 | 7 | #define NGRAM_SIZE_MAX 16U 8 | #define PREV_LOC_SIZE_MAX sizeof(uint64_t) 9 | #define PREV_LOC_VECTOR_SIZE_MAX NGRAM_SIZE_MAX*PREV_LOC_SIZE_MAX 10 | 11 | // we over-allocate as much prev_loc space as we could possibly need 12 | // (given the above values) because at compile-time we don't know what 13 | // ngram settings (if any) will be in use 14 | typedef struct { 15 | union { 16 | char byte[PREV_LOC_VECTOR_SIZE_MAX]; 17 | uint32_t u32[NGRAM_SIZE_MAX]; 18 | } as; 19 | } afl_prev_loc_vector_t; 20 | 21 | extern __thread afl_prev_loc_vector_t __afl_prev_loc; 22 | extern char* __afl_area_ptr; 23 | 24 | // the least significant afl_map_size_bits bits will be used to generate the final 25 | // map location, so callers should ensure that's where the most entropy is packed 26 | static inline void cpytraceafl_record_loc(uint32_t this_loc) { 27 | uint32_t prev_loc = __afl_prev_loc.as.u32[0]; 28 | uint32_t loc_mask = (~(uint32_t)0) >> (32-afl_map_size_bits); 29 | if (afl_ngram_size) { 30 | // reduce ngram elements into prev_loc 31 | for (int i=1; i < afl_ngram_size; i++) { 32 | prev_loc ^= __afl_prev_loc.as.u32[i]; 33 | } 34 | } 35 | 36 | uint32_t map_slot = this_loc ^ prev_loc; 37 | // ensure we can't be addressing outside our allocated region for whatever reason 38 | 
map_slot &= loc_mask; 39 | // mimic "never zero" behaviour when incrementing visits 40 | uint8_t visits = __afl_area_ptr[map_slot] + 1; 41 | __afl_area_ptr[map_slot] = visits ? visits : 1; 42 | 43 | if (afl_ngram_size) { 44 | // advance the conveyor belt 45 | memmove(&__afl_prev_loc.as.u32[1], __afl_prev_loc.as.u32, sizeof(uint32_t) * (afl_ngram_size-1)); 46 | } 47 | __afl_prev_loc.as.u32[0] = (this_loc>>1) & loc_mask; 48 | } 49 | -------------------------------------------------------------------------------- /cpytraceafl/rewriter.py: -------------------------------------------------------------------------------- 1 | INST_RATIO_PRECISION_BITS = 7 2 | 3 | 4 | # here we rely on injected dependencies, the `dis` module and the class `random.Random`, 5 | # because of the risk of recursion during import. we avoid module-level imports so we can 6 | # ensure we're only importing whatever's strictly necessary before the rewriter has been 7 | # installed 8 | def rewrite(python_version, dis, random_class, code, selector=True): 9 | code_type = type(code) 10 | consts = tuple( 11 | rewrite(python_version, dis, random_class, const, selector) if isinstance(const, code_type) else const 12 | for const in code.co_consts 13 | ) 14 | 15 | selection = selector(code) if callable(selector) else selector 16 | 17 | if selection is True: 18 | inst_sel = True 19 | elif not selection: 20 | inst_sel = False 21 | else: 22 | # use (hash of) code object to seed the random instance 23 | rng = random_class(code) 24 | inst_sel = lambda: ( 25 | rng.getrandbits(INST_RATIO_PRECISION_BITS) <= ( 26 | (1<= 4 and args[3]) or kwargs.get("flags") or 0 155 | original_retval = original_compile(*args, **kwargs) 156 | if flags & PyCF_ONLY_AST: 157 | return original_retval 158 | return rewrite(version_info, dis, random.Random, original_retval, selector) 159 | builtins.compile = rewriting_compile 160 | 161 | original_compile_bytecode = _frozen_importlib_external._compile_bytecode 162 | 
@functools.wraps(original_compile_bytecode) 163 | def rewriting_compile_bytecode(*args, **kwargs): 164 | return rewrite(version_info, dis, random.Random, original_compile_bytecode(*args, **kwargs), selector) 165 | _frozen_importlib_external._compile_bytecode = rewriting_compile_bytecode 166 | -------------------------------------------------------------------------------- /cpytraceafl/tracehook.py: -------------------------------------------------------------------------------- 1 | import ctypes 2 | import sys 3 | 4 | # this file is simply a facade for the (underscored) native module which ensures the module 5 | # is loaded in a specific way... 6 | 7 | # set RTLD_GLOBAL for tracehook import so that afl-instrumented native libs are able to find 8 | # our __afl_area_ptr and __afl_prev_loc globals 9 | _prev_dlopenflags = sys.getdlopenflags() 10 | sys.setdlopenflags(_prev_dlopenflags | ctypes.RTLD_GLOBAL) 11 | from cpytraceafl._tracehook import * 12 | sys.setdlopenflags(_prev_dlopenflags) 13 | -------------------------------------------------------------------------------- /cpytraceafl/version.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.7.1" 2 | -------------------------------------------------------------------------------- /default.nix: -------------------------------------------------------------------------------- 1 | { 2 | pkgs ? import {}, 3 | pythonPackages ? pkgs.python37Packages, 4 | forTest ? 
true 5 | }: 6 | { 7 | cpytraceaflEnv = pkgs.stdenv.mkDerivation { 8 | name = "cpytraceafl-env"; 9 | buildInputs = [ 10 | pythonPackages.sysv_ipc 11 | ] ++ pkgs.stdenv.lib.optionals forTest [ 12 | pythonPackages.pytest 13 | pythonPackages.pytestrunner 14 | ]; 15 | }; 16 | } 17 | -------------------------------------------------------------------------------- /dummy-afl-qemu-trace: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # put a symlink to this script in your AFL_PATH directory under the name afl-qemu-trace 4 | # to fuzz a python forkserver using an unmodified AFL (or AFL++) in "qemu mode". 5 | 6 | if [ $QEMU_SET_ENV ] ; then 7 | export $(echo $QEMU_SET_ENV | tr ',' ' ') 8 | fi 9 | 10 | shift 11 | exec "$@" > forkserver.out 2> forkserver.err 12 | -------------------------------------------------------------------------------- /examples/hpack_example.py: -------------------------------------------------------------------------------- 1 | from cpytraceafl.rewriter import install_rewriter 2 | 3 | install_rewriter() 4 | 5 | import hpack 6 | 7 | # initial call to set up any internal caches or imports before the fork 8 | d = hpack.Decoder() 9 | d.decode(b'\x82\x86\x84\x01\x8c\xf1\xe3\xc2\xe5\xf2:k\xa0\xab\x90\xf4\xff') 10 | d.decode( 11 | b'\x82\x86\x84\x01\x8c\xf1\xe3\xc2\xe5\xf2:k\xa0\xab\x90\xf4\xff' 12 | b'\x0f\t\x86\xa8\xeb\x10d\x9c\xbf' 13 | ) 14 | 15 | import sys 16 | from cpytraceafl import fuzz_from_here, crashing_excepthook 17 | 18 | fuzz_from_here(excepthook=crashing_excepthook) 19 | 20 | with open(sys.argv[1], "rb") as f: 21 | try: 22 | d = hpack.Decoder() 23 | # we have inserted the sentinel 0xdeadbeef into our examples as a marker signifying 24 | # a new part of a differentially encoded header 25 | for fragment in f.read().split(b"\xde\xad\xbe\xef"): 26 | if fragment: 27 | d.decode(fragment) 28 | except hpack.HPACKDecodingError: 29 | pass 30 | 
-------------------------------------------------------------------------------- /examples/pillow_pcx_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | An example of mixed python/c code fuzzing. In this case, it is assumed that the Pillow 3 | package in use has been compiled with e.g. afl's llvm_mode. 4 | """ 5 | from cpytraceafl.rewriter import install_rewriter 6 | 7 | install_rewriter() 8 | 9 | from cpytraceafl import fuzz_from_here, DEFAULT_MAP_SIZE_BITS, get_map_size_bits_env 10 | # must ensure the tracehook module gets imported *before* any instrumented native modules, 11 | # so that the __afl_area_ptr and __afl_prev_loc global symbols have been loaded 12 | from cpytraceafl.tracehook import set_map_start 13 | import sysv_ipc 14 | 15 | # if we're going to "warm up" the code under test in a way that executes native instrumented 16 | # code *before* we do the fork & start tracing, we need to provide a dummy memory area for 17 | # __afl_area_ptr to point to. here, use some fresh sysv shared memory because it's what we 18 | # have to hand. 
19 | map_size_bits = get_map_size_bits_env() or DEFAULT_MAP_SIZE_BITS 20 | dummy_sm = sysv_ipc.SharedMemory(None, size=1< MAX_DATA_LEN: 38 | # make sure afl doesnt see this trace as "interesting" 39 | return 40 | for i, chunk in enumerate(data.split(CHUNK_SEPARATOR)): 41 | if (i or not SERVER_FIRST_CHUNK) and self._receive_data() == 0: 42 | print(f"{time.monotonic()} \tDone, chunks remaining") 43 | break 44 | print(f"{time.monotonic()} \tSending {len(chunk)} B") 45 | self.request.sendall(chunk) 46 | else: 47 | self._receive_data() 48 | print(f"{time.monotonic()} \tDone, all chunks sent") 49 | 50 | 51 | return _TCPListenFeederHandler 52 | 53 | 54 | # usage: tcplistenfeeder.py 55 | if __name__ == "__main__": 56 | import sys 57 | 58 | with TCPServer(("127.0.0.1", int(sys.argv[1])), get_handler(sys.argv[2])) as server: 59 | server.serve_forever() 60 | -------------------------------------------------------------------------------- /tests/test_header.py: -------------------------------------------------------------------------------- 1 | import ctypes 2 | import mmap 3 | 4 | import pytest 5 | 6 | from cpytraceafl._testheader import _test_record_loc 7 | from cpytraceafl import tracehook 8 | 9 | 10 | @pytest.mark.parametrize("map_size_bits", (8, 13, 16, 18,)) 11 | def test_record_loc_masks_prev_loc(map_size_bits): 12 | with mmap.mmap(-1, 1<= 2 or foo: 37 | raise StopIteration 38 | 39 | def qux(i, **j): 40 | i.oof += i("oof")[0] 41 | try: 42 | i.oof(j["rab"], len(j), lambda l: l or (i % 2)) 43 | except XYZException as e: 44 | print(e) 45 | return i 46 | 47 | while a > b: 48 | a -= [a[0] or b[a] for k in bar(123)] 49 | yield qux(a and 321) 50 | if b: 51 | break 52 | 53 | def zab(x, y, z, *w): 54 | z, _ = x if y(z) else w + "xuq" 55 | try: 56 | return (v(z) if v.y else v(y) for v in w if v and v({})) 57 | except A: 58 | return x() 59 | except B: 60 | return y() 61 | except C: 62 | return w[0]() 63 | finally: 64 | z[0] = 2 65 | """ 66 | 67 | 68 | def 
_extract_lnotabs(code_obj): 69 | return ( 70 | tuple(_extract_lnotabs(const) for const in code_obj.co_consts if isinstance(const, CodeType)), 71 | code_obj.co_lnotab, 72 | ) 73 | 74 | 75 | # allows a compact way of representing a lnotab bytestring, handling interleaving & conversion 76 | def _l(*a): 77 | return bytes(chain.from_iterable((b, 1) for b in a)) 78 | 79 | 80 | @pytest.mark.parametrize("selector,expected_lnotabs", tuple(chain.from_iterable( 81 | # keep together a number of "aliases" of a selector that should result in the same output 82 | ((selector, expected_lnotabs) for selector in selector_aliases) 83 | for selector_aliases, expected_lnotabs in ( 84 | ( 85 | (True, lambda _: True, 100, lambda _: 100, 99.99, lambda _: 99.99, 1000,), 86 | ( 87 | ( 88 | ( 89 | ( 90 | ( 91 | ( 92 | # lambda 93 | ((), _l(0, 6, 7,) if pv < (3, 6) else _l(0, 4, 6,)), 94 | ), 95 | # qux 96 | ( 97 | _l(0, 73, 10, 23, 13, 1,) 98 | if pv < (3, 6) else ( 99 | _l(0, 58, 8, 22, 10, 2,) 100 | if pv < (3, 7) else 101 | _l(0, 58, 8, 20, 12, 2,) 102 | ) 103 | ), 104 | ), 105 | # listcomp 106 | ((), _l(0, 6, 16, 7, 6,) if pv < (3, 6) else _l(0, 4, 12, 6, 4,),), 107 | ), 108 | # baz 109 | ( 110 | _l(0, 38, 40, 10, 17, 4, 11, 6, 4, 12, 10, 12, 6, 6, 16, 12, 47, 3, 4, 7, 4, 1,) 111 | if pv < (3, 6) else ( 112 | _l(0, 28, 28, 8, 14, 4, 10, 4, 4, 8, 8, 8, 4, 4, 12, 8, 34, 2, 4, 6, 4, 2,) 113 | if pv < (3, 8) else 114 | _l(0, 26, 28, 8, 14, 4, 8, 4, 4, 8, 8, 8, 4, 4, 10, 8, 34, 2, 4, 6, 4,) 115 | ) 116 | ), 117 | ), 118 | ( 119 | ( 120 | # genexpr 121 | ( 122 | (), 123 | ( 124 | _l(0, 3, 12, 12, 9, 12, 9, 1, 4,) 125 | if pv < (3, 6) else 126 | _l(0, 2, 8, 8, 6, 8, 6, 2, 4,) 127 | ), 128 | ), 129 | ), 130 | # zab 131 | ( 132 | _l(0, 12, 6, 7, 45, 10, 14, 10, 14, 10, 18, 1, 4,) 133 | if pv < (3, 6) else ( 134 | _l(0, 8, 4, 6, 30, 8, 12, 8, 12, 8, 16, 6,) 135 | if pv < (3, 8) else 136 | _l(0, 8, 4, 6, 36, 8, 20, 8, 20, 8, 24, 6,) 137 | ) 138 | ), 139 | ), 140 | ), 141 | # module 142 | _l(0,), 143 
| ), 144 | ), 145 | ( 146 | (False, lambda _: False, 0, lambda _: 0,), 147 | ( 148 | ( 149 | ( 150 | ( 151 | ( 152 | ( 153 | # lambda 154 | ((), b""), 155 | ), 156 | # qux 157 | b"", 158 | ), 159 | # listcomp 160 | ((), b"",), 161 | ), 162 | # baz 163 | b"", 164 | ), 165 | ( 166 | ( 167 | # genexpr 168 | ((), b"",), 169 | ), 170 | # zab 171 | b"", 172 | ), 173 | ), 174 | # module 175 | b"", 176 | ), 177 | ), 178 | ( 179 | ( 180 | lambda code: code.co_name in ("baz", "zab"), 181 | lambda code: 100 if code.co_name in ("baz", "zab") else 0, 182 | ), 183 | ( 184 | ( 185 | ( 186 | ( 187 | ( 188 | ( 189 | # lambda 190 | ((), b""), 191 | ), 192 | # qux 193 | b"", 194 | ), 195 | # listcomp 196 | ((), b"",), 197 | ), 198 | # baz 199 | ( 200 | _l(0, 38, 40, 10, 17, 4, 11, 6, 4, 12, 10, 12, 6, 6, 16, 12, 47, 3, 4, 7, 4, 1,) 201 | if pv < (3, 6) else ( 202 | _l(0, 28, 28, 8, 14, 4, 10, 4, 4, 8, 8, 8, 4, 4, 12, 8, 34, 2, 4, 6, 4, 2,) 203 | if pv < (3, 8) else 204 | _l(0, 26, 28, 8, 14, 4, 8, 4, 4, 8, 8, 8, 4, 4, 10, 8, 34, 2, 4, 6, 4,) 205 | ) 206 | ), 207 | ), 208 | ( 209 | ( 210 | # genexpr 211 | ((), b"",), 212 | ), 213 | # zab 214 | ( 215 | _l(0, 12, 6, 7, 45, 10, 14, 10, 14, 10, 18, 1, 4,) 216 | if pv < (3, 6) else ( 217 | _l(0, 8, 4, 6, 30, 8, 12, 8, 12, 8, 16, 6,) 218 | if pv < (3, 8) else 219 | _l(0, 8, 4, 6, 36, 8, 20, 8, 20, 8, 24, 6,) 220 | ) 221 | ), 222 | ), 223 | ), 224 | # module 225 | b"", 226 | ), 227 | ), 228 | ( 229 | (20, lambda _: 20,), 230 | ( 231 | ( 232 | ( 233 | ( 234 | ( 235 | ( 236 | # lambda 237 | ((), b"",), 238 | ), 239 | # qux 240 | _l(0,), 241 | ), 242 | # listcomp 243 | ((), b"",), 244 | ), 245 | # baz 246 | ( 247 | _l(0, 38, 88, 66, 77, 1,) 248 | if pv < (3, 6) else ( 249 | _l(0, 28, 68, 48, 58, 2,) 250 | if pv < (3, 8) else 251 | _l(0, 26, 66, 46, 58,) 252 | ) 253 | ), 254 | ), 255 | ( 256 | ( 257 | # genexpr 258 | ((), _l(48,) if pv < (3, 6) else _l(32,),), 259 | ), 260 | # zab 261 | ( 262 | _l(80, 14, 57,) 263 | if pv < (3, 6) else ( 264 | 
_l(56, 12,) 265 | if pv < (3, 8) else 266 | _l(62, 20,) 267 | ) 268 | ), 269 | ), 270 | ), 271 | # module 272 | _l(0,), 273 | ), 274 | ), 275 | ) 276 | ))) 277 | def test_rewrite(selector, expected_lnotabs): 278 | orig_code = builtins.compile(test_source, "foo.py", "exec") 279 | 280 | def mk_mock_random_instance(code_obj): 281 | # a simple pseudo-pseudo-random number generator "seeded" on the co_firstlineno of 282 | # the provided code object - something that shouldn't be volatile between python 283 | # versions. 284 | # a new mock random instance is created for each use to confine the state of the 285 | # generator to one code object, limiting the amount changes in calling patterns can 286 | # propagate. 287 | inst = mock.create_autospec(random.Random, instance=True) 288 | inst_counter = count(code_obj.co_firstlineno, 19) 289 | inst.getrandbits.side_effect = lambda bits: next(inst_counter) % (1<