├── .gitignore
├── README.md
├── testing
    ├── parse.py
    └── verify.py
└── queries
    ├── malloc_to_memcpy.yaml
    ├── wireshark_infinite_loop.yaml
    └── linux_kaslr_leak.yaml


/.gitignore:
--------------------------------------------------------------------------------
1 | testing/intermediates
2 | .queriesTestDB
3 | *.swp
4 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | joern-traversals
 2 | -------------
 3 | 
 4 | A collection of example [joern](https://github.com/fabsx00/joern) queries with unit tests. These queries are for the old version of joern that used neo4j and tinkerpop v2; for some examples using octopus (the newer version of joern, built on tinkerpop3 and TitanDB), look [here](https://tsyrklevich.net/2016/10/31/notes-on-octopus-gremlin3/).
 5 | 
 6 | Run unit tests
 7 | --------------
 8 | 
 9 | Install dependencies with `pip install pyyaml`. To run tests:
10 | 
11 |     rm -rf testing/intermediates .queriesTestDB
12 |     ./testing/parse.py testing/intermediates queries/*.yaml
13 |     joern testing/intermediates -outdir .queriesTestDB
14 | 
15 |     # Start the DB with the correct path to ./.queriesTestDB
16 |     ./testing/verify.py queries/*.yaml
17 | 


--------------------------------------------------------------------------------
/testing/parse.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python2
 2 | #
 3 | # Parse the YAML files, pull out positive and negative tests and write them to disk
 4 | #  indexed by their md5 hash. These are intended to be processed by joern, and then
 5 | #  a second step (verify.py) will go through and verify that tests succeeded or failed
 6 | #  by correlating results to tests by their md5 hashes
 7 | 
 8 | import sys
 9 | import os.path
10 | import hashlib
11 | 
12 | from yaml import load
13 | 
14 | try:
15 |   from yaml import CLoader as Loader
16 | except ImportError:
17 |   from yaml import Loader
18 | 
19 | if len(sys.argv) < 3:
20 |   print "Usage: parse.py <output directory> <file 1> <file 2> ..."
21 |   exit(1)
22 | 
23 | sys.argv.pop(0)
24 | output_dir = sys.argv.pop(0)
25 | 
26 | if not os.path.exists(output_dir):
27 |   os.makedirs(output_dir)
28 | 
29 | for arg in sys.argv:
30 |   yaml = load(file(arg, 'r'), Loader)
31 | 
32 |   for entry in yaml:
33 |     tests = []
34 |     if entry.has_key('POSITIVE_TESTS') and entry['POSITIVE_TESTS']:
35 |       tests += entry['POSITIVE_TESTS']
36 | 
37 |     if entry.has_key('NEGATIVE_TESTS') and entry['NEGATIVE_TESTS']:
38 |       tests += entry['NEGATIVE_TESTS']
39 | 
40 |     for t in tests:
41 |       digest = hashlib.md5()
42 |       digest.update(t)
43 | 
44 |       f = open(os.path.join(output_dir, digest.hexdigest() + '.c'), 'w')
45 |       f.write(t)
46 |       f.close()
47 | 


--------------------------------------------------------------------------------
/queries/malloc_to_memcpy.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | - DESCRIPTION: |-
 3 |     Fabian's example from his 31c3 talk. Find cases where an additive expression used for a
 4 |     malloc (e.g. len+1) is different from the additive expression used in the length argument
 5 |     directly following its next use; this indicates a potential integer overflow.
 6 | 
 7 |     Limitations: Only additive expressions, doesn't work for cases where one of the statements
 8 |     is not an additive expression [e.g. malloc(len+1) -> memcpy(.., len);]
 9 |   QUERY: |-
10 |     getCallsTo("malloc").ithArguments("0")
11 |       .sideEffect { cnt = it.code }
12 |       .match { it.type == "AdditiveExpression" }.statements()
13 |       .out("REACHES")
14 |       .match { it.type == "CallExpression" && it.code.startsWith("memcpy") }.ithArguments("2") 
15 |       .filter { it.code != cnt }
16 |       .match { it.type == "AdditiveExpression" }
17 |   POSITIVE_TESTS:
18 |     - |-
19 |       main() {
20 |         void *bad_ptr = malloc(len + 8);
21 |         memcpy(bad_ptr, spooky_buf, len + 7);
22 |       }
23 |   NEGATIVE_TESTS:
24 |     - |-
25 |       main() {
26 |         void *bad_ptr = malloc(len + 1);
27 |         memcpy(bad_ptr, spooky_buf, len);
28 |       }
29 | 
30 | - DESCRIPTION: |-
31 |     My modification of Fabian's example:
32 | 
33 |     Look for malloc($expression) where $expression involves only a single variable (and
34 |     potentially multiple constants/operations), and a later memcpy involving only the same
35 |     single variable with a different expression.
36 | 
37 |     Limitations: Only catches cases where a single identifier is used in the expression for the
38 |     allocation, this means multiple variables (or even addition with the result of a function
39 |     call or a macro) will not show up.
40 | 
41 |     TODO: Could this be more elegantly expressed using #groupBy to isolate expressions with
42 |     a single variable?
43 |   QUERY: |-
44 |     getCallsTo("malloc").ithArguments("0")
45 |       .sideEffect { expression = it.code }
46 |       .filter { it.match { it.type == "Identifier" }.count() == 1 }
47 |       .sideEffect { variable = it.match { it.type == "Identifier" }.code }
48 |       .statements().out("REACHES")
49 |       .match { it.type == "CallExpression" && it.code.startsWith("memcpy") }.ithArguments("2")
50 |       .filter { it.code != expression }
51 |       .filter { it.match { it.type == "Identifier" }.count() == 1 }
52 |       .filter { it.match { it.type == "Identifier" }.code.toList()[0] == variable.toList()[0] }
53 |   POSITIVE_TESTS:
54 |     - |-
55 |       main() {
56 |         void *bad_ptr = malloc(len + 1);
57 |         memcpy(bad_ptr, spooky_buf, len);
58 |       }
59 |   NEGATIVE_TESTS:
60 |     - |-
61 |       main() {
62 |         void *bad_ptr = malloc(len + MACRO_VALUE);
63 |         memcpy(bad_ptr, spooky_buf, len);
64 |       }
65 | 


--------------------------------------------------------------------------------
/testing/verify.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python2
 2 | #
 3 | # Run queries against a database with unit tests loaded. The unit tests are in files
 4 | #  with md5 hashes of their contents so we can cross-reference a test to the results
 5 | #  in the query. Before we run positive/negative unit tests we sanity check that tests
 6 | #  are in the database.
 7 | 
 8 | import sys
 9 | import re
10 | import os.path
11 | import hashlib
12 | 
13 | from yaml import load
14 | from joern.all import JoernSteps
15 | 
16 | try:
17 |   from yaml import CLoader as Loader
18 | except ImportError:
19 |   from yaml import Loader
20 | 
21 | if len(sys.argv) < 2:
22 |   print "Usage: verify.py <file 1> <file 2> ..."
23 |   exit(1)
24 | 
25 | j = JoernSteps()
26 | j.setGraphDbURL('http://localhost:7474/db/data')
27 | j.connectToDatabase()
28 | 
29 | sys.argv.pop(0)
30 | print "Running tests:"
31 | 
32 | # tests hashes are encoded in the intermediate path names, this extracts them
33 | def extract_paths(paths):
34 |   paths = map(lambda p: str.split(str(p), "/")[-1], paths)
35 |   return map(lambda p: str.split(str(p), ".c")[0], paths)
36 |   
37 | all_tests = extract_paths(j.runGremlinQuery("getNodesWithType('File').filepath"))
38 | 
39 | for arg in sys.argv:
40 |   yaml = load(file(arg, 'r'), Loader)
41 | 
42 |   for idx, entry in enumerate(yaml):
43 |     query = entry['QUERY']
44 |     query = re.sub("^ +", "", query, flags=re.MULTILINE)
45 |     query = re.sub(" +$", "", query, flags=re.MULTILINE)
46 |     query = str.split(query, "\n")
47 |     query = filter(lambda l: not re.match('//', l), query)
48 |     query = str.join("", query)
49 | 
50 |     query = """%s
51 |       .transform { g.v(it.functionId).functionToFile().filepath }.scatter()
52 |     """ % (query)
53 | 
54 |     try:
55 |       result = j.runGremlinQuery(query)
56 | 
57 |       if not isinstance(result, list):
58 |         raise Exception("Result of query for files was not a list")
59 | 
60 |       result = extract_paths(result)
61 | 
62 |       if entry.has_key('POSITIVE_TESTS') and entry['POSITIVE_TESTS']:
63 |         test_names = [hashlib.md5(test).hexdigest() for test in entry['POSITIVE_TESTS']]
64 |         missing = set(test_names) - set(all_tests)
65 |         if len(missing) > 0:
66 |           raise Exception("Unit test for hashes %s are not present in your database, re-create intermediates" % (missing))
67 | 
68 |         failures = set(test_names) - set(result)
69 |         if len(failures) > 0:
70 |           raise Exception("Positive test failure %s" % (failures))
71 |         else:
72 |           sys.stdout.write('.')
73 | 
74 |       if entry.has_key('NEGATIVE_TESTS') and entry['NEGATIVE_TESTS']:
75 |         test_names = [hashlib.md5(test).hexdigest() for test in entry['NEGATIVE_TESTS']]
76 |         missing = set(test_names) - set(all_tests)
77 |         if len(missing) > 0:
78 |           raise Exception("Unit test for hashes %s are not present in your database, re-create intermediates" % (missing))
79 | 
80 |         failures = set(result) & set(test_names)
81 |         if len(failures) > 0:
82 |           raise Exception("Negative test failure %s" % (failures))
83 |         else:
84 |           sys.stdout.write('.')
85 | 
86 |     except:
87 |       print "Error (%s:entry %i): %s" % (arg, idx + 1, sys.exc_info()[1])
88 |       pass
89 | 
90 | print
91 | 


--------------------------------------------------------------------------------
/queries/wireshark_infinite_loop.yaml:
--------------------------------------------------------------------------------
  1 | ---
  2 | - DESCRIPTION: |-
  3 |     Look for a for statement with an increment based on user input.  Based on this bug:
  4 |     
  5 |     https://anonsvn.wireshark.org/viewvc/trunk/epan/dissectors/packet-assa_r3.c?r1=51196&r2=51195&pathrev=51196
  6 | 
  7 |   QUERY: |-
  8 |     getNodesWithType("ForStatement")
  9 |       .children()
 10 |       .filter { it.type == "AssignmentExpr" }
 11 |       .filter { it.rval().code.toList()[0].matches(".*tvb_get_.*") }
 12 | 
 13 |   POSITIVE_TESTS:
 14 |     - |-
 15 |       main() {
 16 |         for (i = 0; i < len; i += tvb_get_guint8(tvb, i)) {
 17 |         }
 18 |       }
 19 | 
 20 |   NEGATIVE_TESTS:
 21 | 
 22 | - DESCRIPTION: |-
 23 |     More generically, Wireshark dissectors normally update their position in a packet/stream in
 24 |     a variable named offset as they process. There are a number of previous infinite loop DoS
 25 |     vulnerabilities based on the offset being incremented in a loop by user-controlled data set
 26 |     to 0 or being able to wrap around the offset variable in a loop, for example:
 27 |     https://code.wireshark.org/review/#/c/5338/2/epan/dissectors/packet-tn5250.c
 28 | 
 29 |     Check a value that comes from tvb_get_* is later added to the offset variable in a loop.
 30 | 
 31 |     Limitations:
 32 |      + Assumes an assignment, does not work for offset += tvb_...
 33 | 
 34 |   QUERY: |-
 35 |     getCallsTo("tvb_get_*")
 36 |       .statements()
 37 |       .filter { it.defines().count() > 0 }
 38 |       .sideEffect { output_var_name = it.defines().toList()[-1].code }
 39 |       .out("REACHES")
 40 |       .match { it.type == "AssignmentExpr" }
 41 |       .and(
 42 |         _().lval().match { it.code.matches(".*[oO]ff.*") },
 43 |         _().rval().filter { it.type != "CallExpression" }.match { it.code == output_var_name })
 44 |       .dedup()
 45 |       .filter { isInLoop(it) }
 46 | 
 47 |   POSITIVE_TESTS:
 48 |     - |-
 49 |       tvb_get_ntohl() {}      // Declaration required to be able to search for functions by name in the index
 50 |       main() {
 51 |         while(1) {
 52 |           guint32 len = tvb_get_ntohl(tvb, offset);
 53 |           offset += len;
 54 |         }
 55 |       }
 56 | 
 57 |   NEGATIVE_TESTS:
 58 |     - |-
 59 |       tvb_get_ntohl() {}      // Declaration required to be able to search for functions by name in the index
 60 |       main() {
 61 |         guint32 len = tvb_get_ntohl(tvb, offset);
 62 |         offset += len;
 63 |       }
 64 | 
 65 | - DESCRIPTION: |-
 66 |     The above is a good start, but it turns out that many core Wireshark functions check bounds
 67 |     for the offset so there is an unacceptable number of false-positives. There are two types
 68 |     of bugs the above query could find: 1) infinite loops caused by loops updating the offset
 69 |     using only user supplied input [e.g. no constants are added], or 2) infinite loops caused
 70 |     by integer overflows to the offset variable, causing the loop to start processing at the
 71 |     beginning of the last iteration. This query looks for the latter.
 72 | 
 73 |     There is an 'unsanitized' joern step that allows you to pick a statement in the CFG and
 74 |     find all paths in the CFG to that statement where user input has not been sanitized. We
 75 |     would like to do the opposite, start from a call to tvb_get_*, end at a statement that
 76 |     adds that value to the offset, and make sure the argument has not been passed to a
 77 |     bounds-checking function in between. We can do this by using the 'cfgPaths' step (which is
  78 |     used by the 'unsanitized' step.) This allows us to specify a source and a destination statement,
 79 |     along with a closure expressing what 'sanitizing' statements look like, and receive any
 80 |     paths from the source to the destination where a given set of sanitizing statements are
 81 |     avoided.
 82 | 
 83 |     Since we are interested in integer overflows, we only care to target functions that return
 84 |     32-bit or 64-bit integers. Afterwards we use slightly more complex logic to find instances
 85 |     where that tainted user input is added to an offset variable, and then perform the cfgPaths
 86 |     sanitization search.
 87 | 
 88 |     This query found the following bugs:
 89 |      + https://bugs.wireshark.org/bugzilla/show_bug.cgi?id=11023
 90 |      + https://bugs.wireshark.org/bugzilla/show_bug.cgi?id=11024
 91 |      + https://bugs.wireshark.org/bugzilla/show_bug.cgi?id=11037
 92 | 
 93 |     Limitations:
 94 |      + Assumes an assignment, fails for offset += tvb_...
 95 |      + Assumes a call to proto_tree_add_item/etc sanitizes, but it might miss cases where the
  96 |        tainted value is added to something [e.g. proto_tree_add_item(tainted + 4) allowing an
  97 |        integer overflow to bypass the bounds checking]
  98 |      + The tainted user data might be passed as an argument to a sanitizing function in a
  99 |        way that does not trigger a bounds check; we should check specific parameters.
100 | 
101 |   QUERY: |-
102 |     getFunctionsByName("tvb_get_*").as('func')
103 |       .out("IS_FUNCTION_OF_AST").out.filter { it.type == "ReturnType" }
104 |       .filter { it.code.matches(".*int(32|64).*") }
105 |       .back('func')
106 |       .transform { getCallsTo(it.name) }.scatter()
107 |       .sideEffect { src = it }
108 |       .statements()
109 |       .filter { it.defines().count() > 0 }
110 |       .sideEffect { output_var_name = it.defines().code.toList()[0].replace("*", "") }
111 |       .out("REACHES")
112 |       .match { it.type == "AssignmentExpr" }
113 |       .filter { it.rval().toList()[0].code.matches(".*$output_var_name.*") }
114 |       .filter { it.rval().toList()[0].type != 'CallExpression' }
115 |       // The following is to get around null accesses stemming from:
116 |       // https://github.com/fabsx00/joern/issues/49
117 |       .filter { it.parents().toList()[0].type != "IdentifierDecl" }
118 |       .filter { lval = it.lval().toList()[0]; lval.code == 'offset' || (src.ithArguments("1").toList()[0].code != 'offset' && lval.code.matches(".*[oO]ff.*")) }
119 |       .dedup()
120 |       .filter { isInLoop(it) }
121 |       .sideEffect { dst = it }
122 |       .transform { cfgPaths(output_var_name, { cur, sym ->
123 |         cur._().filter { it.uses().code.toList().contains(output_var_name) }.or(
124 |           _().codeContains('.*proto_tree_add_(text|item).*'),
125 |           _().codeContains('.*tvb_get_(str|ptr).*'),
126 |           _().codeContains('.*tvb_new_subset.*'),
127 |           _().codeContains('.*tlv_length_remaining.*'))
128 |         }, src.statements().toList()[0], dst.statements().toList()[0]) }
129 |       .scatter().transform { it.toList()[0] }
130 |       .dedup()
131 | 
132 |   POSITIVE_TESTS:
133 |     - |-
134 |       guint32 tvb_get_ntohl() {}      // Declaration required to be able to search for functions by name in the index
135 |       main() {
136 |         guint32 len;
137 |         while(1) {
138 |           len = tvb_get_ntohl(tvb, offset);
139 |           offset += len;
140 |         }
141 |       }
142 | 
143 |   NEGATIVE_TESTS:
144 |     - |-
145 |       guint32 tvb_get_ntohl() {}      // Declaration required to be able to search for functions by name in the index
146 |       main() {
147 |         guint32 len;
148 |         while(1) {
149 |           len = tvb_get_ntohl(tvb, offset);
150 |           proto_tree_add_text(len); // Sanitizer!
151 |           offset += len;
152 |         }
153 |       }
154 | 
155 | - DESCRIPTION: |-
156 |     The above query is good for finding cases where an infinite loop is possible by
 157 |     overflowing the offset counter and causing the processing to start back at the original
 158 |     point; however, it is also possible that we could cause an infinite loop if we could cause
 159 |     the offset to only be incremented by zero. This means looking for loops where the offset
 160 |     is not incremented by fixed amounts and only by user-controlled data.
161 | 
162 |     This query requires a modification to cfgPaths, hence there are no unit tests because they
163 |     would fail with modifications :/ The modification is to allow specifying the same source and
164 |     destination node, e.g. making sure that we find no expressions that modify the offset
165 |     variable in a way that might force it to increment in an entire loop iteration.
166 | 
167 |     Found https://bugs.wireshark.org/bugzilla/show_bug.cgi?id=11036
168 | 
169 |     Requires cfgPaths modification:
170 |     --- a/joern/joernsteps/taintTracking/dataflow.groovy
171 |     +++ b/joern/joernsteps/taintTracking/dataflow.groovy
172 |     @@ -127,7 +127,7 @@ Object.metaClass._cfgPaths = {symbol, sanitizer, curNode, dst, visited, path ->
173 |        }
174 | 
175 |           // return path when destination has been reached
176 |           -  if(curNode == dst){
177 |           +  if (curNode == dst && path != []) {
178 | 
179 |     Limitations:
180 |      + We only find the infinite loop case where only a single user-controlled zero variable is
181 |        added to the offset; however, there could be multiple such zero-value variables.
182 | 
183 |   QUERY: |-
184 |     getFunctionsByName("tvb_get_ntohl").transform { getCallsTo(it.name) }.scatter()
185 |       .sideEffect { offset_var_regex = (it.ithArguments("1").toList()[0].code == 'offset' ? 'offset' : "[oO]ff") }
186 |       .sideEffect { src = it }
187 |       .statements()
188 |       .filter { it.defines().count() > 0 }
189 |       .sideEffect { output_var_name = it.defines().code.toList()[0].replace("*", "") }
190 |       .out("REACHES")
191 |       .match { it.type == "AssignmentExpr" }
192 |       .filter { it.rval().toList()[0].code.matches(".*$output_var_name.*") }
193 |       .filter { it.rval().toList()[0].type != 'CallExpression' }
194 |       .filter { it.rval().toList()[0].type != 'AdditiveExpression' }
195 |       .filter { it.parents().toList()[0].type != "IdentifierDecl" }
196 |       .filter { lval = it.lval().toList()[0]; lval.code == 'offset' || (src.ithArguments("1").toList()[0].code != 'offset' && lval.code.matches(".*[oO]ff.*")) }
197 |       .dedup()
198 |       .filter { isInLoop(it) }
199 |       .transform { cfgPaths('FAKEFAKE', { cur, sym ->
200 |         //println cur._().code.toList();
201 |         //println cur._().match { it.type == "AssignmentExpr" }.toList();
202 |         cur._().or(
203 |           _().match { it.type == "AssignmentExpr" }.filter { it.lval().toList().size > 0 }
204 |             .filter { it.lval().toList()[0].code.matches(".*$offset_var_regex.*") }
205 |             .filter { !it.rval().toList()[0].code.matches(".*$output_var_name.*") },
206 |           _().match { it.type == "IncDecOp" && it.lval().toList().size > 0 }
207 |             .filter { it.lval.toList()[0].code.matches(".*$offset_var_regex.*") },
208 |           _().isCheck(".*$output_var_name .*"))
209 |         }, src.statements().toList()[0], src.statements().toList()[0]) }
210 |       .scatter().transform { it.toList()[0] }
211 |       .dedup()
212 | 
213 |   POSITIVE_TESTS:
214 |   NEGATIVE_TESTS:
215 | 


--------------------------------------------------------------------------------
/queries/linux_kaslr_leak.yaml:
--------------------------------------------------------------------------------
  1 | ---
  2 | - DESCRIPTION: |-
  3 |     Look at the simplest possible case, inspired by this bug
  4 |     https://git.backbone.ws/linux/backbone-sources/commit/a117dac
  5 | 
  6 |     Look at calls to copy_to_user where the data being copied out is a structure allocated as a
  7 |     local stack variable that is not sanitized correctly. Unfortunately joern does not store much
  8 |     useful type information (e.g. searchable struct definitions and elements), so the best we can
  9 |     do is make sure the struct is not sanitized, a second step is required to see if all the
 10 |     individual struct fields are written to. This first step reduces the search space of
  11 |     copy_to_user calls on Linux 4.7.2 from 2491 to 191. Ultimately, after the number of
  12 |     possible hits was further narrowed down by a post-processing step that looked for
  13 |     structs where all members were assigned to, just one memory leak was found.
 14 | 
 15 |     Limitations:
 16 |     - Only looks for calls to copy_to_user() with sizeof() in the length parameter.
 17 |     - Ignores pointers (e.g. dynamically allocated structs, buffers, etc.)
 18 |     - Ignores structs passed in an argument (e.g. we could do some simple interprocedural
 19 |       analysis to look backwards to see if the data was initialized correctly in the calling
 20 |       functions.)
 21 |     - Returns too many results, needs post-processing step due to lack of struct type info.
 22 | 
 23 |     Ideas:
 24 |     - Fix any limitation listed above
 25 |     - Look specifically at unsanitized unions involved in a copy_to_user with different width
 26 |       members
 27 |     - Look specifically at unsanitized structs involved in a copy_to_user with members named
 28 |       something like 'padding' or 'reserved'
 29 |     - Look specifically at unsanitized unpacked structs involved in a copy_to_user that would
 30 |       include compiler added padding on 32 or 64-bit platforms
 31 | 
 32 |     Found https://github.com/torvalds/linux/commit/02a9079c66341836c4914c33c06a73245060df2e
 33 | 
 34 |     This requires a modified reachableCfgNodes to run
 35 | 
 36 |     diff --git a/joern/joernsteps/cfg.groovy b/joern/joernsteps/cfg.groovy
 37 |     index 7e8a9a0..2e7c05b 100644
 38 |     --- a/joern/joernsteps/cfg.groovy
 39 |     +++ b/joern/joernsteps/cfg.groovy
 40 |     @@ -12,20 +12,25 @@ Gremlin.defineStep('toExitNode', [Vertex,Pipe], {
 41 |      /**
 42 |         Search the CFG breadth-first so that we can keep track of all nodes we've visited in
 43 |          the entire search rather than just along the current path (massive optimization for
 44 |     -    high branching-factor CFGs, e.g. state machines).
 45 |     +    high branching-factor CFGs, e.g. state machines.) Can search forwards or backwards.
 46 |      */
 47 |     -Object.metaClass._reachableCfgNodes = { curNodes, visited ->
 48 |     -  nextNodes = curNodes._().out('FLOWS_TO').toSet() - visited
 49 |     +Object.metaClass._reachableCfgNodes = { curNodes, visited, forward ->
 50 |     +  if (forward == true) {
 51 |     +    nextNodes = curNodes._().out('FLOWS_TO').toSet() - visited
 52 |     +  } else {
 53 |     +    nextNodes = curNodes._().in('FLOWS_TO').toSet() - visited
 54 |     +  }
 55 |     +
 56 |        if (nextNodes.isEmpty()) { return visited }
 57 | 
 58 |        visited.addAll(nextNodes)
 59 |     -  return _reachableCfgNodes(nextNodes.toList(), visited)
 60 |     +  return _reachableCfgNodes(nextNodes.toList(), visited, forward)
 61 |      }
 62 | 
 63 |     -Gremlin.defineStep('reachableCfgNodes', [Vertex, Pipe], {
 64 |     -  _().transform { _reachableCfgNodes(it.statements().toList(), new HashSet())}.scatter()
 65 |     +Gremlin.defineStep('reachableCfgNodes', [Vertex, Pipe], { forward ->
 66 |     +  _().transform { _reachableCfgNodes(it.statements().toList(), new HashSet(), forward)}.scatter()
 67 |      })
 68 | 
 69 |      Object.metaClass.isInLoop = { it ->
 70 |     -  it._().reachableCfgNodes().toSet().contains(it.statements().toList()[0])
 71 |     +  it._().reachableCfgNodes(true).toSet().contains(it.statements().toList()[0])
 72 | 
 73 |   QUERY: |-
 74 |     getCallsTo('copy_to_user')
 75 |       .as('copy')
 76 |       .sideEffect { end_node = it.statements().toList()[0] }
 77 |       .ithArguments('1')
 78 |       .match { it.type == 'Identifier' }
 79 |       .sideEffect { var_name = it.code }
 80 |       .back('copy')
 81 |       .ithArguments('2')
 82 |       // Whole struct is being copied out.
 83 |       .filter { it.codeContains('.*sizeof.*').count() > 0 }
 84 |       .back('copy')
 85 |       // Search backwards to find the declaration
 86 |       .reachableCfgNodes(false)
 87 |       .filter { it.type == 'IdentifierDeclStatement' }
 88 |       .match { it.type == 'Identifier' && it.code == var_name }
 89 |       .back(1)
 90 |       .sideEffect { start_node = it }
 91 |       // If it assigned a value assigned from the beginning, there is no memory leak
 92 |       .filter { !it.astNodes().type.toList().contains('AssignmentExpr') }
 93 |       .match { it.type == 'IdentifierDeclType' && !it.codeContains('^(const )?(float|off_t|compat_int_t|unsigned|s8|s16|s32|s64|u8|u16|u32|u64|__u8|__u16|__u32|__u64|int|unsigned long|unsigned int|u_int16_t|uint32_t|uint64_t|ssize_t|size_t|unsigned char)( const)?$') && !it.codeContains('.*\\*$') }
 94 |       .transform {
 95 |         cfgPaths('FAKE', { cur, sym ->
 96 |           cur._().or(
 97 |             // Ignore code paths that require capabilities
 98 |             _().codeContains('.*capable \\(.*'),
 99 |             // Ignore code paths where the entire variable is written to with memcpy
100 |             // - hacky, could do this cleaner with AST parsing but I am being lazy. Also
101 |             // misses cases where sizeof has no parens or its sizeof the type
102 |             _().codeContains('.*memcpy \\( (& )?' + var_name + ' ,.*sizeof \\( ' + var_name + ' \\).*'),
103 |             // Ignore code paths where the value is assigned to directly
104 |             _().match { it.type == 'AssignmentExpr' && it.lval().code.toList().contains(var_name) },
105 |             // Look for references to 'var' or '& var' passed to a sanitizing functions
106 |             _().match { it.type == 'CallExpression' }.as('call')
107 |               .match { it.type == 'Argument' && it.code.matches('(& )?' + var_name) }
108 |               .sideEffect { arg_num = it.childNum }.back('call')
109 |               .or(
110 |                 // Simple sanitizers
111 |                 _().callToCallee().codeContains('(memset|copy_from_user)'),
112 |                 // Do some real hacky interprocedural analysis, check if the arg is passed to
113 |                 // a function that appears to memset the argument
114 |                 _().transform {
115 |                   getFunctionsByName(it.callToCallee().code.first())
116 |                     .out.match { it.type == 'ParameterList' }
117 |                     .out.filter { it.childNum == arg_num }
118 |                     .out('REACHES').codeContains('memset.*')
119 |                 }.scatter()
120 |               )
121 |           )
122 |         }, start_node, end_node)
123 |       }
124 |       .scatter()
125 | 
126 |       // make unit tests happy
127 |       .transform { it.toList()[0] }
128 | 
129 |       // pretty-print
130 |       //.transform {
131 |       //  [it.first()] + it._().filter { it.uses().code.toList().contains(var_name) }.toList() + [it.last()]
132 |       //}
133 |       //.transform { [it.first().id.toString() + '\n'] + it.code.toList() + ["\n\n"] }
134 |       //.scatter()
135 | 
136 |   POSITIVE_TESTS:
137 |     - |-
138 |       copy_to_user() {}      // Declaration required to be able to search for functions by name in the index
139 |       main() {
140 |         struct foo bar;
141 |         copy_to_user(fake, &bar, sizeof(bar));
142 |       }
143 | 
144 |     - |-
145 |       copy_to_user() {}      // Declaration required to be able to search for functions by name in the index
146 |       main() {
147 |         struct foo bar;
148 |         if (condition) {
149 |           memset(&bar, 0, sizeof(bar));
150 |         }
151 |         copy_to_user(fake, &bar, sizeof(bar));
152 |       }
153 | 
154 |   NEGATIVE_TESTS:
155 |     - |-
156 |       copy_to_user() {}      // Declaration required to be able to search for functions by name in the index
157 |       main() {
158 |         struct foo bar;
159 |         memset(&bar, 0, sizeof(bar));
160 |         copy_to_user(fake, &bar, sizeof(bar));
161 |       }
162 | 
163 |     - |-
164 |       copy_to_user() {}      // Declaration required to be able to search for functions by name in the index
165 | 
166 |       func(void *foo) { memset(foo, 0, sizeof(*foo)); }
167 | 
168 |       main() {
169 |         struct foo bar;
170 |         func(&bar);
171 |         copy_to_user(fake, &bar, sizeof(bar));
172 |       }
173 | 
174 | - DESCRIPTION: |-
175 |     This performs a similar search to the above, we look for unions or union pointers where the
176 |     contents of the type are not sanitized. Unions specifically are interesting because of the
177 |     possibility that different sized types exist within the union and the contents of the entire
178 |     union are copied out when a code path is hit that only fills part of the union. This search
179 |     returns 7 results, of which 2 have bugs.
180 | 
181 |     Found https://github.com/torvalds/linux/commit/30f939feaeee23e21391cfc7b484f012eb189c3c
182 |     Found https://github.com/torvalds/linux/commit/d69bb92e402ff948bdcd39f19c9067874fb86873
183 | 
184 |   QUERY: |-
185 |     getCallsTo('copy_to_user')
186 |       .as('copy')
187 |       .sideEffect { end_node = it.statements().toList()[0] }
188 |       .ithArguments('1')
189 |       .match { it.type == 'Identifier' }
190 |       .sideEffect { var_name = it.code }
191 |       .back('copy')
192 |       .back('copy')
193 |       // Search backwards to find the declaration
194 |       .reachableCfgNodes(false)
195 |       .filter { it.type == 'IdentifierDeclStatement' }
196 |       .match { it.type == 'Identifier' && it.code == var_name }
197 |       .back(1)
198 |       .sideEffect { start_node = it }
199 |       // If it assigned a value assigned from the beginning, there is no memory leak
200 |       .filter { !it.astNodes().type.toList().contains('AssignmentExpr') }
201 |       .match { it.type == 'IdentifierDeclType' && it.codeContains(".*union.*") }
202 |       .transform {
203 |         cfgPaths('FAKE', { cur, sym ->
204 |           cur._().or(
205 |             // Ignore code paths that require capabilities
206 |             _().codeContains('.*capable \\(.*'),
207 |             _().codeContains('.*' + var_name + '.*=.*(kzalloc|vzalloc|memdup_user).*'),
208 |             _().codeContains('.*(memset|memcpy).*' + var_name + '.*')
209 |           )
210 |         }, start_node, end_node)
211 |       }
212 |       .scatter()
213 | 
214 |       // make unit tests happy
215 |       .transform { it.toList()[0] }
216 | 
217 |       // pretty-print
218 |       //.transform { it._().filter { it.codeContains('.*' + var_name + '.*').count() > 0 }.toList() }
219 |       //.transform { [it.first().id.toString() + '\n'] + it.code.toList() + ["\n\n"] }
220 |       //.scatter()
221 | 
222 |   POSITIVE_TESTS:
223 |     - |-
224 |       copy_to_user() {}      // Declaration required to be able to search for functions by name in the index
225 |       main() {
226 |         union foo *bar;
227 |         bar = kmalloc(sizeof(struct foo));
228 |         copy_to_user(fake, &bar, sizeof(bar));
229 |       }
230 | 
231 |     - |-
232 |       copy_to_user() {}      // Declaration required to be able to search for functions by name in the index
233 |       main() {
234 |         union foo bar;
235 |         copy_to_user(fake, &bar, sizeof(bar));
236 |       }
237 | 
238 |   NEGATIVE_TESTS:
239 |     - |-
240 |       copy_to_user() {}      // Declaration required to be able to search for functions by name in the index
241 |       main() {
242 |         union foo *bar;
243 |         bar = kzalloc(sizeof(struct foo));
244 |         copy_to_user(fake, &bar, sizeof(bar));
245 |       }
246 | 
247 |     - |-
248 |       copy_to_user() {}      // Declaration required to be able to search for functions by name in the index
249 |       main() {
250 |         union foo bar;
251 |         memcpy(bar, baz, sizeof(bar));
252 |         copy_to_user(fake, &bar, sizeof(bar));
253 |       }
254 | 


--------------------------------------------------------------------------------