├── README.md
├── logs
│   └── .keep
├── main.py
└── requirements.txt

/README.md:
--------------------------------------------------------------------------------
1 | # Project REGULATOR: Automated learning of regexes for DNS discovery
2 | 
3 | I had a lot of fun making this and I hope this project will change the way you
4 | see subdomain enumeration. The method explored here is highly effective and
5 | efficient.
6 | 
7 | With this said, it's not a silver bullet. Not every DNS zone performs well with
8 | this method. It fails when there are no latent text structures in the hostnames
9 | (ie. they are seemingly random) or you have limited observational data.
10 | 
11 | This project was developed primarily to showcase the power of regular language
12 | ranking via the `dank` (https://github.com/cramppet/dank) library. I wanted to
13 | show that the concept of ranking and using regexes as templates for fuzzing can
14 | work very well.
15 | 
16 | For more information see the blog post here: https://cramppet.github.io/regulator/index.html
17 | 
18 | ## Install
19 | 
20 | 1. clone the repository
21 | 2. install the dependencies `pip3 install -r requirements.txt`
22 | 
23 | ## Usage
24 | 
25 | 1. Run your subdomain enumeration tool of choice
26 | 2. Supply the hostnames found to REGULATOR: `python3 main.py -t <target> -f <hosts_file> -o <output_file>`
27 | 
28 | ## Example
29 | 
30 | 1. `python3 main.py -t adobe.com -f adobe.subs -o adobe.brute`
31 | 2. `puredns resolve adobe.brute --write adobe.valid`
32 | 
33 | Be advised that the discovered hosts will overlap with your original input data. 
34 | If you want the subdomains that were not previously found by the subdomain 35 | enumeration tool, use the following command: 36 | 37 | `comm -23 <(sort -u adobe.valid) <(sort -u adobe.subs) > adobe.final` 38 | -------------------------------------------------------------------------------- /logs/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cramppet/regulator/2371a0607de25f0a01a138905ccd841cc465bba5/logs/.keep -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import re 2 | import string 3 | import logging 4 | import argparse 5 | 6 | from typing import List, Set 7 | from itertools import combinations_with_replacement 8 | 9 | import datrie 10 | import tldextract 11 | import editdistance 12 | 13 | from dank.DankEncoder import DankEncoder 14 | from dank.DankGenerator import DankGenerator 15 | 16 | 17 | MEMO = {} 18 | LOGFILE_NAME = 'logs/regulator.log' 19 | DNS_CHARS = string.ascii_lowercase + string.digits + '._-' 20 | 21 | 22 | def edit_closures(items: List[str], delta: int = 5) -> List[Set[str]]: 23 | """computes all subsets of items bounded by fixed edit distance""" 24 | global MEMO 25 | ret = [] 26 | for a in items: 27 | found = False 28 | r = set([a]) 29 | for b in items: 30 | dist = MEMO[a+b] if a+b in MEMO else MEMO[b+a] 31 | if dist < delta: 32 | r.add(b) 33 | for s in ret: 34 | if r == s: 35 | found = True 36 | break 37 | if not found: 38 | ret.append(r) 39 | return ret 40 | 41 | 42 | def tokenize(items: List[str]): 43 | """tokenize DNS hostnames into leveled word tokens""" 44 | ret = [] 45 | hosts = [] 46 | for item in items: 47 | t = tldextract.extract(item) 48 | hosts.append(t.subdomain) 49 | labels = [host.split('.') for host in hosts] 50 | for label in labels: 51 | n = [] 52 | for item in label: 53 | t = [] 54 | tokens = [f'-{e}' if i != 0 else e for 
i, e in enumerate(item.split('-'))] 55 | for token in tokens: 56 | subtokens = [x for x in re.split('([0-9]+)', token) if len(x) > 0] 57 | for i in range(len(subtokens)): 58 | # Special case where we have a hyphenated number: foo-12.example.com 59 | if subtokens[i] == '-' and i+1 < len(subtokens): 60 | subtokens[i+1] = ('-' + subtokens[i+1]) 61 | else: 62 | t.append(subtokens[i]) 63 | n.append(t) 64 | ret.append(n) 65 | return ret 66 | 67 | 68 | def compress_number_ranges(regex: str) -> str: 69 | """given an 'uncompressed' regex, returns a regex with ranges instead""" 70 | ret = regex[:] 71 | stack, groups, repl, extra, hyphen = [], [], {}, {}, {} 72 | for i, e in enumerate(regex): 73 | if e == '(': 74 | stack.append(i) 75 | elif e == ')': 76 | start = stack.pop() 77 | group = regex[start+1:i] 78 | tokens = group.split('|') 79 | numbers = [token for token in tokens if token.isnumeric()] 80 | nonnumbers = [token for token in tokens if not token.isnumeric() and not re.match('-[0-9]+', token)] 81 | hyphenatednumbers = [token[1:] for token in tokens if re.match('-[0-9]+', token)] 82 | # Only primitive groups: a single alteration of tokens 83 | if '?' 
in group or ')' in group or '(' in group: 84 | continue 85 | # Only allow one or the other for now 86 | elif len(numbers) != 0 and len(hyphenatednumbers) != 0: 87 | continue 88 | # At least 2 numerical tokens 89 | elif len(numbers) > 1: 90 | g1 = '|'.join(numbers) 91 | g2 = '|'.join(nonnumbers) 92 | repl[g1] = group 93 | extra[g1] = g2 94 | groups.append(g1) 95 | # At least 2 hyphenated numerical tokens 96 | elif len(hyphenatednumbers) > 1: 97 | g1 = '|'.join(hyphenatednumbers) 98 | g2 = '|'.join(nonnumbers) 99 | repl[g1] = group 100 | extra[g1] = g2 101 | groups.append(g1) 102 | hyphen[g1] = True 103 | for group in groups: 104 | generalized = '(' if not group in hyphen else '(-' 105 | positions = {} 106 | # Reverse because of the way integers are interpreted in hostnames 107 | tokens = [g[::-1] for g in group.split('|')] 108 | for token in tokens: 109 | for position, symbol in enumerate(token): 110 | if not position in positions: 111 | positions[position] = set([]) 112 | positions[position].add(int(symbol)) 113 | # A position is optional iff some token doesn't have that position 114 | s = sorted(tokens, key=lambda x: len(x)) 115 | start, end = len(s[-1])-1, len(s[0])-1 116 | for i in range(start, end, -1): 117 | positions[i].add(None) 118 | # We go in reverse because of reversing the token order above 119 | for i, symbols in sorted(positions.items(), key=lambda x: x[0], reverse=True): 120 | optional = None in symbols 121 | if optional: 122 | symbols.remove(None) 123 | s = sorted(symbols) 124 | start, end = s[0], s[-1] 125 | if start != end: 126 | generalized += f'[{start}-{end}]{"?" if optional else ""}' 127 | else: 128 | generalized += f'{start}{"?" 
if optional else ""}' 129 | generalized += ')' 130 | ext = extra[group] 131 | rep = repl[group] 132 | if ext != '': 133 | generalized = f'({generalized}|({ext}))' 134 | ret = ret.replace(f'({rep})', generalized) 135 | return ret 136 | 137 | 138 | def closure_to_regex(domain: str, members: List[str]) -> str: 139 | """converts edit closure to a regular language""" 140 | ret, levels, optional = '', {}, {} 141 | tokens = tokenize(members) 142 | for member in tokens: 143 | for i, level in enumerate(member): 144 | if i not in levels: 145 | levels[i] = {} 146 | optional[i] = {} 147 | for j, token in enumerate(level): 148 | if not j in levels[i]: 149 | levels[i][j] = set([]) 150 | optional[i][j] = [] 151 | levels[i][j].add(token) 152 | optional[i][j].append(token) 153 | for i, level in enumerate(levels): 154 | n = '(.' if i != 0 else '' 155 | for j, position in enumerate(levels[level]): 156 | k = len(levels[level][position]) 157 | # Special case: first token in DNS name 158 | if i == 0 and j == 0: 159 | n += f"({'|'.join(levels[level][position])})" 160 | # Special case: single element in alternation at start of level 161 | elif k == 1 and j == 0: 162 | # TODO: Should we make this optional too? 163 | n += f"{'|'.join(levels[level][position])}" 164 | # General case 165 | else: 166 | # A position is optional if some token doesn't have that position 167 | isoptional = len(optional[level][position]) != len(members) 168 | n += f"({'|'.join(levels[level][position])}){'?' if isoptional else ''}" 169 | # A level is optional if either not every host has the level, or if there 170 | # are distinct level values 171 | values = list(map(lambda x: ''.join(x), zip(*optional[level].values()))) 172 | isoptional = len(set(values)) != 1 or len(values) != len(members) 173 | ret += (n + ")?" 
if isoptional else n + ")") if i != 0 else n 174 | return compress_number_ranges(f'{ret}.{domain}') 175 | 176 | 177 | def is_good_rule(regex: str, nkeys: int, threshold: int, max_ratio: float) -> bool: 178 | """applies ratio test to determine if a rule is acceptable""" 179 | e = DankEncoder(regex,256) 180 | nwords = e.num_words(1,256) 181 | return nwords < threshold or (nwords/nkeys) < max_ratio 182 | 183 | def sort_and_unique(file_name: str): 184 | with open(file_name, "r") as file: 185 | data = file.readlines() 186 | data = sorted(set(data)) 187 | with open(file_name, "w") as file: 188 | file.writelines(data) 189 | 190 | def main(): 191 | global DNS_CHARS, MEMO 192 | 193 | logging.basicConfig(format='%(asctime)-15s - %(name)s - %(levelname)s - %(message)s', level=logging.INFO, filename=LOGFILE_NAME, filemode='a') 194 | parser = argparse.ArgumentParser(description='DNS Regulator') 195 | parser.add_argument('-th', '--threshold', required=False, type=int, default=500, help='Threshold to start performing ratio test') 196 | parser.add_argument('-mr', '--max-ratio', required=False, type=float, default=25.0, help='Ratio test parameter R: len(Synth)/len(Obs) < R') 197 | parser.add_argument('-ml', '--max-length', required=False, type=int, default=1000, help='Maximum rule length for global search') 198 | parser.add_argument('-dl', '--dist-low', required=False, type=int, default=2, help='Lower bound on string edit distance range') 199 | parser.add_argument('-dh', '--dist-high', required=False, type=int, default=10, help='Upper bound on string edit distance range') 200 | parser.add_argument('-t', '--target', required=True, type=str, help='The domain to target') 201 | parser.add_argument('-f', '--hosts', required=True, type=str, help='The observed hosts file') 202 | parser.add_argument('-o', '--output', required=False, type=str, help='Output filename (default: output)', default="output") 203 | args = vars(parser.parse_args()) 204 | 205 | logging.info(f'REGULATOR starting: 
MAX_RATIO={args["max_ratio"]}, THRESHOLD={args["threshold"]}') 206 | 207 | trie = datrie.Trie(DNS_CHARS) 208 | known_hosts, new_rules = set([]), set([]) 209 | 210 | def first_token(item: str): 211 | tokens = tokenize([item]) 212 | return tokens[0][0][0] 213 | 214 | with open(args['hosts'], 'r') as handle: 215 | known_hosts = sorted(list(set([line.strip() for line in handle.readlines()]))) 216 | for host in known_hosts: 217 | if host != args['target']: 218 | tokens = tokenize([host]) 219 | if len(tokens) > 0 and len(tokens[0]) > 0 and len(tokens[0][0]) > 0: 220 | trie[host] = True 221 | else: 222 | logging.warning(f'Rejecting malformed input: {host}') 223 | known_hosts.remove(host) 224 | 225 | logging.info(f'Loaded {len(known_hosts)} observations') 226 | logging.info('Building table of all pairwise distances...') 227 | 228 | for s, t in combinations_with_replacement(known_hosts, 2): 229 | MEMO[s+t] = editdistance.eval(s,t) 230 | 231 | logging.info('Table building complete') 232 | 233 | # No enforced prefix 234 | for k in range(args['dist_low'], args['dist_high']): 235 | logging.info(f'k={k}') 236 | closures = edit_closures(known_hosts, delta=k) 237 | for closure in closures: 238 | if len(closure) > 1: 239 | r = closure_to_regex(args['target'], closure) 240 | # This is probably the only place you'd want to apply this check; rules 241 | # inferred using this method tend to be very big which makes this part 242 | # slow, especially at scale. 243 | if len(r) > args['max_length']: 244 | continue 245 | if r not in new_rules and is_good_rule(r, len(closure), args['threshold'], args['max_ratio']): 246 | new_rules.add(r) 247 | else: 248 | # TODO: What should we do here? 
249 | pass 250 | 251 | # Unigrams + bigrams as fixed prefixes 252 | ngrams = sorted(list(set(DNS_CHARS) | set([''.join([i,j]) for i in DNS_CHARS for j in DNS_CHARS]))) 253 | for ngram in ngrams: 254 | keys = trie.keys(ngram) 255 | if len(keys) == 0: 256 | continue 257 | 258 | # First chance: try ngrams first because they are the shortest 259 | r = closure_to_regex(args['target'], keys) 260 | if r not in new_rules and is_good_rule(r, len(keys), args['threshold'], args['max_ratio']): 261 | new_rules.add(r) 262 | 263 | last, prefixes = None, sorted(list(set([first_token(k) for k in trie.keys(ngram)]))) 264 | for prefix in prefixes: 265 | logging.info(f'Prefix={prefix}') 266 | keys = trie.keys(prefix) 267 | 268 | # Second chance: use prefix tokens starting with the ngram 269 | r = closure_to_regex(args['target'], keys) 270 | if r not in new_rules and is_good_rule(r, len(keys), args['threshold'], args['max_ratio']): 271 | if last is None or not prefix.startswith(last): 272 | last = prefix 273 | else: 274 | logging.warning(f"Rejecting redundant prefix: {prefix}") 275 | continue 276 | new_rules.add(r) 277 | 278 | if len(prefix) > 1: 279 | for k in range(args['dist_low'], args['dist_high']): 280 | closures = edit_closures(keys, delta=k) 281 | for closure in closures: 282 | # Third chance: deconstruct prefix using edit distance 283 | r = closure_to_regex(args['target'], closure) 284 | if r not in new_rules and is_good_rule(r, len(closure), args['threshold'], args['max_ratio']): 285 | new_rules.add(r) 286 | 287 | # Failure: we have no strategy for dealing with this 288 | elif r not in new_rules: 289 | logging.error(f'Rule cannot be processed: {r}') 290 | 291 | #Saving rules with a static name 292 | with open(f"{args['target']}.rules", 'w') as handle: 293 | for rule in new_rules: 294 | handle.write(f'{rule}\n') 295 | 296 | with open(args['output'], 'w') as handle: 297 | for line in new_rules: 298 | for item in DankGenerator(line.strip()): 299 | 
handle.write(item.decode('utf-8')+'\n') 300 | 301 | #Sorting and uniquifying files(So we can handle a smaller number of hosts) 302 | sort_and_unique(args['output']) 303 | 304 | #Replacing incorrect/malformed subdomains (e.g. test..example.com) 305 | with open(args['output'], 'r+') as handle: 306 | #Sorting and uniquifying is required since for example before replacing test..example.com, test.example.com could have existed 307 | replaced = sorted(set(map(lambda line: re.sub('\.{2,}', '.', line) ,handle.readlines()))) 308 | with open(args['output'], 'w') as handle: 309 | handle.writelines(replaced) 310 | 311 | 312 | if __name__ == '__main__': 313 | main() 314 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | dank 2 | datrie 3 | dnspython 4 | tldextract 5 | editdistance 6 | --------------------------------------------------------------------------------