├── .gitignore ├── README.md ├── testing ├── parse.py └── verify.py └── queries ├── malloc_to_memcpy.yaml ├── wireshark_infinite_loop.yaml └── linux_kaslr_leak.yaml /.gitignore: -------------------------------------------------------------------------------- 1 | testing/intermediates 2 | .queriesTestDB 3 | *.swp 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | joern-traversals 2 | ------------- 3 | 4 | A collection of example [joern](https://github.com/fabsx00/joern) queries with unit tests. These queries are for the old version of joern that used neo4j and tinkerpop v2, for some examples using octopus (the new update of joern using tinkerpop3 on TitanDB) look [here](https://tsyrklevich.net/2016/10/31/notes-on-octopus-gremlin3/). 5 | 6 | Run unit tests 7 | -------------- 8 | 9 | Install dependencies with `pip install pyyaml`. To run tests: 10 | 11 | rm -rf testing/intermediates .queriesTestDB 12 | ./testing/parse.py testing/intermediates queries/*.yaml 13 | joern testing/intermediates -outdir .queriesTestDB 14 | 15 | # Start the DB with the correct path to ./.queriesTestDB 16 | ./testing/verify.py queries/*.yaml 17 | -------------------------------------------------------------------------------- /testing/parse.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # 3 | # Parse the YAML files, pull out positive and negative tests and write them to disk 4 | # indexed by their md5 hash. 
These are intended to be processed by joern, and then 5 | # a second step (verify.py) will go through and verify that tests succeeded or failed 6 | # by correlating results to tests by their md5 hashes 7 | 8 | import sys 9 | import os.path 10 | import hashlib 11 | 12 | from yaml import load 13 | 14 | try: 15 | from yaml import CLoader as Loader 16 | except ImportError: 17 | from yaml import Loader 18 | 19 | if len(sys.argv) < 3: 20 | print "Usage: parse.py ..." 21 | exit(1) 22 | 23 | sys.argv.pop(0) 24 | output_dir = sys.argv.pop(0) 25 | 26 | if not os.path.exists(output_dir): 27 | os.makedirs(output_dir) 28 | 29 | for arg in sys.argv: 30 | yaml = load(file(arg, 'r'), Loader) 31 | 32 | for entry in yaml: 33 | tests = [] 34 | if entry.has_key('POSITIVE_TESTS') and entry['POSITIVE_TESTS']: 35 | tests += entry['POSITIVE_TESTS'] 36 | 37 | if entry.has_key('NEGATIVE_TESTS') and entry['NEGATIVE_TESTS']: 38 | tests += entry['NEGATIVE_TESTS'] 39 | 40 | for t in tests: 41 | digest = hashlib.md5() 42 | digest.update(t) 43 | 44 | f = open(os.path.join(output_dir, digest.hexdigest() + '.c'), 'w') 45 | f.write(t) 46 | f.close() 47 | -------------------------------------------------------------------------------- /queries/malloc_to_memcpy.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | - DESCRIPTION: |- 3 | Fabian's example from his 31c3 talk. Find cases where an additive expression used for a 4 | malloc (e.g. len+1) is different from the additive expression used in the length argument 5 | directly following its next use; this indicates a potential integer overflow. 6 | 7 | Limitations: Only additive expressions, doesn't work for cases where one of the statements 8 | is not an additive expression [e.g. 
malloc(len+1) -> memcpy(.., len);] 9 | QUERY: |- 10 | getCallsTo("malloc").ithArguments("0") 11 | .sideEffect { cnt = it.code } 12 | .match { it.type == "AdditiveExpression" }.statements() 13 | .out("REACHES") 14 | .match { it.type == "CallExpression" && it.code.startsWith("memcpy") }.ithArguments("2") 15 | .filter { it.code != cnt } 16 | .match { it.type == "AdditiveExpression" } 17 | POSITIVE_TESTS: 18 | - |- 19 | main() { 20 | void *bad_ptr = malloc(len + 8); 21 | memcpy(bad_ptr, spooky_buf, len + 7); 22 | } 23 | NEGATIVE_TESTS: 24 | - |- 25 | main() { 26 | void *bad_ptr = malloc(len + 1); 27 | memcpy(bad_ptr, spooky_buf, len); 28 | } 29 | 30 | - DESCRIPTION: |- 31 | My modification of Fabian's example: 32 | 33 | Look for malloc($expression) where $expression involves only a single variable (and 34 | potentially multiple constants/operations), and a later memcpy involving only the same 35 | single variable with a different expression. 36 | 37 | Limitations: Only catches cases where a single identifier is used in the expression for the 38 | allocation, this means multiple variables (or even addition with the result of a function 39 | call or a macro) will not show up. 40 | 41 | TODO: Could this be more elegantly expressed using #groupBy to isolate expressions with 42 | a single variable? 
43 | QUERY: |- 44 | getCallsTo("malloc").ithArguments("0") 45 | .sideEffect { expression = it.code } 46 | .filter { it.match { it.type == "Identifier" }.count() == 1 } 47 | .sideEffect { variable = it.match { it.type == "Identifier" }.code } 48 | .statements().out("REACHES") 49 | .match { it.type == "CallExpression" && it.code.startsWith("memcpy") }.ithArguments("2") 50 | .filter { it.code != expression } 51 | .filter { it.match { it.type == "Identifier" }.count() == 1 } 52 | .filter { it.match { it.type == "Identifier" }.code.toList()[0] == variable.toList()[0] } 53 | POSITIVE_TESTS: 54 | - |- 55 | main() { 56 | void *bad_ptr = malloc(len + 1); 57 | memcpy(bad_ptr, spooky_buf, len); 58 | } 59 | NEGATIVE_TESTS: 60 | - |- 61 | main() { 62 | void *bad_ptr = malloc(len + MACRO_VALUE); 63 | memcpy(bad_ptr, spooky_buf, len); 64 | } 65 | -------------------------------------------------------------------------------- /testing/verify.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # 3 | # Run queries against a database with unit tests loaded. The unit tests are in files 4 | # with md5 hashes of their contents so we can cross-reference a test to the results 5 | # in the query. Before we run positive/negative unit tests we sanity check that tests 6 | # are in the database. 7 | 8 | import sys 9 | import re 10 | import os.path 11 | import hashlib 12 | 13 | from yaml import load 14 | from joern.all import JoernSteps 15 | 16 | try: 17 | from yaml import CLoader as Loader 18 | except ImportError: 19 | from yaml import Loader 20 | 21 | if len(sys.argv) < 2: 22 | print "Usage: verify.py ..." 
23 | exit(1) 24 | 25 | j = JoernSteps() 26 | j.setGraphDbURL('http://localhost:7474/db/data') 27 | j.connectToDatabase() 28 | 29 | sys.argv.pop(0) 30 | print "Running tests:" 31 | 32 | # tests hashes are encoded in the intermediate path names, this extracts them 33 | def extract_paths(paths): 34 | paths = map(lambda p: str.split(str(p), "/")[-1], paths) 35 | return map(lambda p: str.split(str(p), ".c")[0], paths) 36 | 37 | all_tests = extract_paths(j.runGremlinQuery("getNodesWithType('File').filepath")) 38 | 39 | for arg in sys.argv: 40 | yaml = load(file(arg, 'r'), Loader) 41 | 42 | for idx, entry in enumerate(yaml): 43 | query = entry['QUERY'] 44 | query = re.sub("^ +", "", query, flags=re.MULTILINE) 45 | query = re.sub(" +$", "", query, flags=re.MULTILINE) 46 | query = str.split(query, "\n") 47 | query = filter(lambda l: not re.match('//', l), query) 48 | query = str.join("", query) 49 | 50 | query = """%s 51 | .transform { g.v(it.functionId).functionToFile().filepath }.scatter() 52 | """ % (query) 53 | 54 | try: 55 | result = j.runGremlinQuery(query) 56 | 57 | if not isinstance(result, list): 58 | raise Exception("Result of query for files was not a list") 59 | 60 | result = extract_paths(result) 61 | 62 | if entry.has_key('POSITIVE_TESTS') and entry['POSITIVE_TESTS']: 63 | test_names = [hashlib.md5(test).hexdigest() for test in entry['POSITIVE_TESTS']] 64 | missing = set(test_names) - set(all_tests) 65 | if len(missing) > 0: 66 | raise Exception("Unit test for hashes %s are not present in your database, re-create intermediates" % (missing)) 67 | 68 | failures = set(test_names) - set(result) 69 | if len(failures) > 0: 70 | raise Exception("Positive test failure %s" % (failures)) 71 | else: 72 | sys.stdout.write('.') 73 | 74 | if entry.has_key('NEGATIVE_TESTS') and entry['NEGATIVE_TESTS']: 75 | test_names = [hashlib.md5(test).hexdigest() for test in entry['NEGATIVE_TESTS']] 76 | missing = set(test_names) - set(all_tests) 77 | if len(missing) > 0: 78 | raise 
Exception("Unit test for hashes %s are not present in your database, re-create intermediates" % (missing)) 79 | 80 | failures = set(result) & set(test_names) 81 | if len(failures) > 0: 82 | raise Exception("Negative test failure %s" % (failures)) 83 | else: 84 | sys.stdout.write('.') 85 | 86 | except: 87 | print "Error (%s:entry %i): %s" % (arg, idx + 1, sys.exc_info()[1]) 88 | pass 89 | 90 | print 91 | -------------------------------------------------------------------------------- /queries/wireshark_infinite_loop.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | - DESCRIPTION: |- 3 | Look for a for statement with an increment based on user input. Based on this bug: 4 | 5 | https://anonsvn.wireshark.org/viewvc/trunk/epan/dissectors/packet-assa_r3.c?r1=51196&r2=51195&pathrev=51196 6 | 7 | QUERY: |- 8 | getNodesWithType("ForStatement") 9 | .children() 10 | .filter { it.type == "AssignmentExpr" } 11 | .filter { it.rval().code.toList()[0].matches(".*tvb_get_.*") } 12 | 13 | POSITIVE_TESTS: 14 | - |- 15 | main() { 16 | for (i = 0; i < len; i += tvb_get_guint8(tvb, i)) { 17 | } 18 | } 19 | 20 | NEGATIVE_TESTS: 21 | 22 | - DESCRIPTION: |- 23 | More generically, Wireshark dissectors normally update their position in a packet/stream in 24 | a variable named offset as they process. There are a number of previous infinite loop DoS 25 | vulnerabilities based on the offset being incremented in a loop by user-controlled data set 26 | to 0 or being able to wrap around the offset variable in a loop, for example: 27 | https://code.wireshark.org/review/#/c/5338/2/epan/dissectors/packet-tn5250.c 28 | 29 | Check a value that comes from tvb_get_* is later added to the offset variable in a loop. 30 | 31 | Limitations: 32 | + Assumes an assignment, does not work for offset += tvb_... 
33 | 34 | QUERY: |- 35 | getCallsTo("tvb_get_*") 36 | .statements() 37 | .filter { it.defines().count() > 0 } 38 | .sideEffect { output_var_name = it.defines().toList()[-1].code } 39 | .out("REACHES") 40 | .match { it.type == "AssignmentExpr" } 41 | .and( 42 | _().lval().match { it.code.matches(".*[oO]ff.*") }, 43 | _().rval().filter { it.type != "CallExpression" }.match { it.code == output_var_name }) 44 | .dedup() 45 | .filter { isInLoop(it) } 46 | 47 | POSITIVE_TESTS: 48 | - |- 49 | tvb_get_ntohl() {} // Declaration required to be able to search for functions by name in the index 50 | main() { 51 | while(1) { 52 | guint32 len = tvb_get_ntohl(tvb, offset); 53 | offset += len; 54 | } 55 | } 56 | 57 | NEGATIVE_TESTS: 58 | - |- 59 | tvb_get_ntohl() {} // Declaration required to be able to search for functions by name in the index 60 | main() { 61 | guint32 len = tvb_get_ntohl(tvb, offset); 62 | offset += len; 63 | } 64 | 65 | - DESCRIPTION: |- 66 | The above is a good start, but it turns out that many core Wireshark functions check bounds 67 | for the offset so there is an unacceptable number of false-positives. There are two types 68 | of bugs the above query could find: 1) infinite loops caused by loops updating the offset 69 | using only user supplied input [e.g. no constants are added], or 2) infinite loops caused 70 | by integer overflows to the offset variable, causing the loop to start processing at the 71 | beginning of the last iteration. This query looks for the latter. 72 | 73 | There is an 'unsanitized' joern step that allows you to pick a statement in the CFG and 74 | find all paths in the CFG to that statement where user input has not been sanitized. We 75 | would like to do the opposite, start from a call to tvb_get_*, end at a statement that 76 | adds that value to the offset, and make sure the argument has not been passed to a 77 | bounds-checking function in between. 
We can do this by using the 'cfgPaths' step (which is 78 | used by the 'unsanitized' step.) This allows us to specify a source and a destination statement, 79 | along with a closure expressing what 'sanitizing' statements look like, and receive any 80 | paths from the source to the destination where a given set of sanitizing statements is 81 | avoided. 82 | 83 | Since we are interested in integer overflows, we only care to target functions that return 84 | 32-bit or 64-bit integers. Afterwards we use slightly more complex logic to find instances 85 | where that tainted user input is added to an offset variable, and then perform the cfgPaths 86 | sanitization search. 87 | 88 | This query found the following bugs: 89 | + https://bugs.wireshark.org/bugzilla/show_bug.cgi?id=11023 90 | + https://bugs.wireshark.org/bugzilla/show_bug.cgi?id=11024 91 | + https://bugs.wireshark.org/bugzilla/show_bug.cgi?id=11037 92 | 93 | Limitations: 94 | + Assumes an assignment, fails for offset += tvb_... 95 | + Assumes a call to proto_tree_add_item/etc sanitizes, but it might miss cases where the 96 | tainted value is added to something [e.g. proto_tree_add_item(tainted + 4) allowing an 97 | integer overflow to bypass the bounds checking] 98 | + The tainted user data might be passed as an argument to a sanitizing function so that 99 | it does not cause a bounds check; we should check specific parameters. 
100 | 101 | QUERY: |- 102 | getFunctionsByName("tvb_get_*").as('func') 103 | .out("IS_FUNCTION_OF_AST").out.filter { it.type == "ReturnType" } 104 | .filter { it.code.matches(".*int(32|64).*") } 105 | .back('func') 106 | .transform { getCallsTo(it.name) }.scatter() 107 | .sideEffect { src = it } 108 | .statements() 109 | .filter { it.defines().count() > 0 } 110 | .sideEffect { output_var_name = it.defines().code.toList()[0].replace("*", "") } 111 | .out("REACHES") 112 | .match { it.type == "AssignmentExpr" } 113 | .filter { it.rval().toList()[0].code.matches(".*$output_var_name.*") } 114 | .filter { it.rval().toList()[0].type != 'CallExpression' } 115 | // The following is to get around null accesses stemming from: 116 | // https://github.com/fabsx00/joern/issues/49 117 | .filter { it.parents().toList()[0].type != "IdentifierDecl" } 118 | .filter { lval = it.lval().toList()[0]; lval.code == 'offset' || (src.ithArguments("1").toList()[0].code != 'offset' && lval.code.matches(".*[oO]ff.*")) } 119 | .dedup() 120 | .filter { isInLoop(it) } 121 | .sideEffect { dst = it } 122 | .transform { cfgPaths(output_var_name, { cur, sym -> 123 | cur._().filter { it.uses().code.toList().contains(output_var_name) }.or( 124 | _().codeContains('.*proto_tree_add_(text|item).*'), 125 | _().codeContains('.*tvb_get_(str|ptr).*'), 126 | _().codeContains('.*tvb_new_subset.*'), 127 | _().codeContains('.*tlv_length_remaining.*')) 128 | }, src.statements().toList()[0], dst.statements().toList()[0]) } 129 | .scatter().transform { it.toList()[0] } 130 | .dedup() 131 | 132 | POSITIVE_TESTS: 133 | - |- 134 | guint32 tvb_get_ntohl() {} // Declaration required to be able to search for functions by name in the index 135 | main() { 136 | guint32 len; 137 | while(1) { 138 | len = tvb_get_ntohl(tvb, offset); 139 | offset += len; 140 | } 141 | } 142 | 143 | NEGATIVE_TESTS: 144 | - |- 145 | guint32 tvb_get_ntohl() {} // Declaration required to be able to search for functions by name in the index 146 | 
main() { 147 | guint32 len; 148 | while(1) { 149 | len = tvb_get_ntohl(tvb, offset); 150 | proto_tree_add_text(len); // Sanitizer! 151 | offset += len; 152 | } 153 | } 154 | 155 | - DESCRIPTION: |- 156 | The above query is good for finding cases where an infinite loop is possible by 157 | overflowing the offset counter and causing the processing to start back at the original 158 | point; however, it is also possible that we could cause an infinite loop if we could cause 159 | the offset to only be incremented by zero. This means looking for loops where the offset 160 | is not incremented by fixed amounts but only by user-controlled data. 161 | 162 | This query requires a modification to cfgPaths, hence there are no unit tests because they 163 | would fail with modifications :/ The modification is to allow specifying the same source and 164 | destination node, e.g. making sure that we find no expressions that modify the offset 165 | variable in a way that might force it to increment in an entire loop iteration. 166 | 167 | Found https://bugs.wireshark.org/bugzilla/show_bug.cgi?id=11036 168 | 169 | Requires cfgPaths modification: 170 | --- a/joern/joernsteps/taintTracking/dataflow.groovy 171 | +++ b/joern/joernsteps/taintTracking/dataflow.groovy 172 | @@ -127,7 +127,7 @@ Object.metaClass._cfgPaths = {symbol, sanitizer, curNode, dst, visited, path -> 173 | } 174 | 175 | // return path when destination has been reached 176 | - if(curNode == dst){ 177 | + if (curNode == dst && path != []) { 178 | 179 | Limitations: 180 | + We only find the infinite loop case where only a single user-controlled zero variable is 181 | added to the offset; however, there could be multiple such zero-value variables. 182 | 183 | QUERY: |- 184 | getFunctionsByName("tvb_get_ntohl").transform { getCallsTo(it.name) }.scatter() 185 | .sideEffect { offset_var_regex = (it.ithArguments("1").toList()[0].code == 'offset' ? 
'offset' : "[oO]ff") } 186 | .sideEffect { src = it } 187 | .statements() 188 | .filter { it.defines().count() > 0 } 189 | .sideEffect { output_var_name = it.defines().code.toList()[0].replace("*", "") } 190 | .out("REACHES") 191 | .match { it.type == "AssignmentExpr" } 192 | .filter { it.rval().toList()[0].code.matches(".*$output_var_name.*") } 193 | .filter { it.rval().toList()[0].type != 'CallExpression' } 194 | .filter { it.rval().toList()[0].type != 'AdditiveExpression' } 195 | .filter { it.parents().toList()[0].type != "IdentifierDecl" } 196 | .filter { lval = it.lval().toList()[0]; lval.code == 'offset' || (src.ithArguments("1").toList()[0].code != 'offset' && lval.code.matches(".*[oO]ff.*")) } 197 | .dedup() 198 | .filter { isInLoop(it) } 199 | .transform { cfgPaths('FAKEFAKE', { cur, sym -> 200 | //println cur._().code.toList(); 201 | //println cur._().match { it.type == "AssignmentExpr" }.toList(); 202 | cur._().or( 203 | _().match { it.type == "AssignmentExpr" }.filter { it.lval().toList().size > 0 } 204 | .filter { it.lval().toList()[0].code.matches(".*$offset_var_regex.*") } 205 | .filter { !it.rval().toList()[0].code.matches(".*$output_var_name.*") }, 206 | _().match { it.type == "IncDecOp" && it.lval().toList().size > 0 } 207 | .filter { it.lval.toList()[0].code.matches(".*$offset_var_regex.*") }, 208 | _().isCheck(".*$output_var_name .*")) 209 | }, src.statements().toList()[0], src.statements().toList()[0]) } 210 | .scatter().transform { it.toList()[0] } 211 | .dedup() 212 | 213 | POSITIVE_TESTS: 214 | NEGATIVE_TESTS: 215 | -------------------------------------------------------------------------------- /queries/linux_kaslr_leak.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | - DESCRIPTION: |- 3 | Look at the simplest possible case, inspired by this bug 4 | https://git.backbone.ws/linux/backbone-sources/commit/a117dac 5 | 6 | Look at calls to copy_to_user where the data being copied out is a 
structure allocated as a 7 | local stack variable that is not sanitized correctly. Unfortunately, joern does not store much 8 | useful type information (e.g. searchable struct definitions and elements), so the best we can 9 | do is make sure the struct is not sanitized; a second step is required to see if all the 10 | individual struct fields are written to. This first step reduces the search space of 11 | copy_to_user calls on Linux 4.7.2 from 2491 to 191. Ultimately, after the 12 | number of possible hits was further narrowed down by a post-processing step that looked for 13 | structs where all members were assigned to, just one memory leak was found. 14 | 15 | Limitations: 16 | - Only looks for calls to copy_to_user() with sizeof() in the length parameter. 17 | - Ignores pointers (e.g. dynamically allocated structs, buffers, etc.) 18 | - Ignores structs passed in an argument (e.g. we could do some simple interprocedural 19 | analysis to look backwards to see if the data was initialized correctly in the calling 20 | functions.) 21 | - Returns too many results; needs a post-processing step due to lack of struct type info. 
22 | 23 | Ideas: 24 | - Fix any limitation listed above 25 | - Look specifically at unsanitized unions involved in a copy_to_user with different width 26 | members 27 | - Look specifically at unsanitized structs involved in a copy_to_user with members named 28 | something like 'padding' or 'reserved' 29 | - Look specifically at unsanitized unpacked structs involved in a copy_to_user that would 30 | include compiler added padding on 32 or 64-bit platforms 31 | 32 | Found https://github.com/torvalds/linux/commit/02a9079c66341836c4914c33c06a73245060df2e 33 | 34 | This requires a modified reachableCfgNodes to run 35 | 36 | diff --git a/joern/joernsteps/cfg.groovy b/joern/joernsteps/cfg.groovy 37 | index 7e8a9a0..2e7c05b 100644 38 | --- a/joern/joernsteps/cfg.groovy 39 | +++ b/joern/joernsteps/cfg.groovy 40 | @@ -12,20 +12,25 @@ Gremlin.defineStep('toExitNode', [Vertex,Pipe], { 41 | /** 42 | Search the CFG breadth-first so that we can keep track of all nodes we've visited in 43 | the entire search rather than just along the current path (massive optimization for 44 | - high branching-factor CFGs, e.g. state machines). 45 | + high branching-factor CFGs, e.g. state machines.) Can search forwards or backwards. 
46 | */ 47 | -Object.metaClass._reachableCfgNodes = { curNodes, visited -> 48 | - nextNodes = curNodes._().out('FLOWS_TO').toSet() - visited 49 | +Object.metaClass._reachableCfgNodes = { curNodes, visited, forward -> 50 | + if (forward == true) { 51 | + nextNodes = curNodes._().out('FLOWS_TO').toSet() - visited 52 | + } else { 53 | + nextNodes = curNodes._().in('FLOWS_TO').toSet() - visited 54 | + } 55 | + 56 | if (nextNodes.isEmpty()) { return visited } 57 | 58 | visited.addAll(nextNodes) 59 | - return _reachableCfgNodes(nextNodes.toList(), visited) 60 | + return _reachableCfgNodes(nextNodes.toList(), visited, forward) 61 | } 62 | 63 | -Gremlin.defineStep('reachableCfgNodes', [Vertex, Pipe], { 64 | - _().transform { _reachableCfgNodes(it.statements().toList(), new HashSet())}.scatter() 65 | +Gremlin.defineStep('reachableCfgNodes', [Vertex, Pipe], { forward -> 66 | + _().transform { _reachableCfgNodes(it.statements().toList(), new HashSet(), forward)}.scatter() 67 | }) 68 | 69 | Object.metaClass.isInLoop = { it -> 70 | - it._().reachableCfgNodes().toSet().contains(it.statements().toList()[0]) 71 | + it._().reachableCfgNodes(true).toSet().contains(it.statements().toList()[0]) 72 | 73 | QUERY: |- 74 | getCallsTo('copy_to_user') 75 | .as('copy') 76 | .sideEffect { end_node = it.statements().toList()[0] } 77 | .ithArguments('1') 78 | .match { it.type == 'Identifier' } 79 | .sideEffect { var_name = it.code } 80 | .back('copy') 81 | .ithArguments('2') 82 | // Whole struct is being copied out. 
83 | .filter { it.codeContains('.*sizeof.*').count() > 0 } 84 | .back('copy') 85 | // Search backwards to find the declaration 86 | .reachableCfgNodes(false) 87 | .filter { it.type == 'IdentifierDeclStatement' } 88 | .match { it.type == 'Identifier' && it.code == var_name } 89 | .back(1) 90 | .sideEffect { start_node = it } 91 | // If it assigned a value assigned from the beginning, there is no memory leak 92 | .filter { !it.astNodes().type.toList().contains('AssignmentExpr') } 93 | .match { it.type == 'IdentifierDeclType' && !it.codeContains('^(const )?(float|off_t|compat_int_t|unsigned|s8|s16|s32|s64|u8|u16|u32|u64|__u8|__u16|__u32|__u64|int|unsigned long|unsigned int|u_int16_t|uint32_t|uint64_t|ssize_t|size_t|unsigned char)( const)?$') && !it.codeContains('.*\\*$') } 94 | .transform { 95 | cfgPaths('FAKE', { cur, sym -> 96 | cur._().or( 97 | // Ignore code paths that require capabilities 98 | _().codeContains('.*capable \\(.*'), 99 | // Ignore code paths where the entire variable is written to with memcpy 100 | // - hacky, could do this cleaner with AST parsing but I am being lazy. Also 101 | // misses cases where sizeof has no parens or its sizeof the type 102 | _().codeContains('.*memcpy \\( (& )?' + var_name + ' ,.*sizeof \\( ' + var_name + ' \\).*'), 103 | // Ignore code paths where the value is assigned to directly 104 | _().match { it.type == 'AssignmentExpr' && it.lval().code.toList().contains(var_name) }, 105 | // Look for references to 'var' or '& var' passed to a sanitizing functions 106 | _().match { it.type == 'CallExpression' }.as('call') 107 | .match { it.type == 'Argument' && it.code.matches('(& )?' 
+ var_name) } 108 | .sideEffect { arg_num = it.childNum }.back('call') 109 | .or( 110 | // Simple sanitizers 111 | _().callToCallee().codeContains('(memset|copy_from_user)'), 112 | // Do some real hacky interprocedural analysis, check if the arg is passed to 113 | // a function that appears to memset the argument 114 | _().transform { 115 | getFunctionsByName(it.callToCallee().code.first()) 116 | .out.match { it.type == 'ParameterList' } 117 | .out.filter { it.childNum == arg_num } 118 | .out('REACHES').codeContains('memset.*') 119 | }.scatter() 120 | ) 121 | ) 122 | }, start_node, end_node) 123 | } 124 | .scatter() 125 | 126 | // make unit tests happy 127 | .transform { it.toList()[0] } 128 | 129 | // pretty-print 130 | //.transform { 131 | // [it.first()] + it._().filter { it.uses().code.toList().contains(var_name) }.toList() + [it.last()] 132 | //} 133 | //.transform { [it.first().id.toString() + '\n'] + it.code.toList() + ["\n\n"] } 134 | //.scatter() 135 | 136 | POSITIVE_TESTS: 137 | - |- 138 | copy_to_user() {} // Declaration required to be able to search for functions by name in the index 139 | main() { 140 | struct foo bar; 141 | copy_to_user(fake, &bar, sizeof(bar)); 142 | } 143 | 144 | - |- 145 | copy_to_user() {} // Declaration required to be able to search for functions by name in the index 146 | main() { 147 | struct foo bar; 148 | if (condition) { 149 | memset(&bar, 0, sizeof(bar)); 150 | } 151 | copy_to_user(fake, &bar, sizeof(bar)); 152 | } 153 | 154 | NEGATIVE_TESTS: 155 | - |- 156 | copy_to_user() {} // Declaration required to be able to search for functions by name in the index 157 | main() { 158 | struct foo bar; 159 | memset(&bar, 0, sizeof(bar)); 160 | copy_to_user(fake, &bar, sizeof(bar)); 161 | } 162 | 163 | - |- 164 | copy_to_user() {} // Declaration required to be able to search for functions by name in the index 165 | 166 | func(void *foo) { memset(foo, 0, sizeof(*foo)); } 167 | 168 | main() { 169 | struct foo bar; 170 | func(&bar); 171 | 
copy_to_user(fake, &bar, sizeof(bar)); 172 | } 173 | 174 | - DESCRIPTION: |- 175 | This performs a similar search to the above, we look for unions or union pointers where the 176 | contents of the type are not sanitized. Unions specifically are interesting because of the 177 | possibility that different sized types exist within the union and the contents of the entire 178 | union are copied out when a code path is hit that only fills part of the union. This search 179 | returns 7 results, of which 2 have bugs. 180 | 181 | Found https://github.com/torvalds/linux/commit/30f939feaeee23e21391cfc7b484f012eb189c3c 182 | Found https://github.com/torvalds/linux/commit/d69bb92e402ff948bdcd39f19c9067874fb86873 183 | 184 | QUERY: |- 185 | getCallsTo('copy_to_user') 186 | .as('copy') 187 | .sideEffect { end_node = it.statements().toList()[0] } 188 | .ithArguments('1') 189 | .match { it.type == 'Identifier' } 190 | .sideEffect { var_name = it.code } 191 | .back('copy') 192 | .back('copy') 193 | // Search backwards to find the declaration 194 | .reachableCfgNodes(false) 195 | .filter { it.type == 'IdentifierDeclStatement' } 196 | .match { it.type == 'Identifier' && it.code == var_name } 197 | .back(1) 198 | .sideEffect { start_node = it } 199 | // If it assigned a value assigned from the beginning, there is no memory leak 200 | .filter { !it.astNodes().type.toList().contains('AssignmentExpr') } 201 | .match { it.type == 'IdentifierDeclType' && it.codeContains(".*union.*") } 202 | .transform { 203 | cfgPaths('FAKE', { cur, sym -> 204 | cur._().or( 205 | // Ignore code paths that require capabilities 206 | _().codeContains('.*capable \\(.*'), 207 | _().codeContains('.*' + var_name + '.*=.*(kzalloc|vzalloc|memdup_user).*'), 208 | _().codeContains('.*(memset|memcpy).*' + var_name + '.*') 209 | ) 210 | }, start_node, end_node) 211 | } 212 | .scatter() 213 | 214 | // make unit tests happy 215 | .transform { it.toList()[0] } 216 | 217 | // pretty-print 218 | //.transform { 
it._().filter { it.codeContains('.*' + var_name + '.*').count() > 0 }.toList() } 219 | //.transform { [it.first().id.toString() + '\n'] + it.code.toList() + ["\n\n"] } 220 | //.scatter() 221 | 222 | POSITIVE_TESTS: 223 | - |- 224 | copy_to_user() {} // Declaration required to be able to search for functions by name in the index 225 | main() { 226 | union foo *bar; 227 | bar = kmalloc(sizeof(struct foo)); 228 | copy_to_user(fake, &bar, sizeof(bar)); 229 | } 230 | 231 | - |- 232 | copy_to_user() {} // Declaration required to be able to search for functions by name in the index 233 | main() { 234 | union foo bar; 235 | copy_to_user(fake, &bar, sizeof(bar)); 236 | } 237 | 238 | NEGATIVE_TESTS: 239 | - |- 240 | copy_to_user() {} // Declaration required to be able to search for functions by name in the index 241 | main() { 242 | union foo *bar; 243 | bar = kzalloc(sizeof(struct foo)); 244 | copy_to_user(fake, &bar, sizeof(bar)); 245 | } 246 | 247 | - |- 248 | copy_to_user() {} // Declaration required to be able to search for functions by name in the index 249 | main() { 250 | union foo bar; 251 | memcpy(bar, baz, sizeof(bar)); 252 | copy_to_user(fake, &bar, sizeof(bar)); 253 | } 254 | --------------------------------------------------------------------------------