├── .gitignore
├── README.md
├── genmarkov.py
├── tarpyt-fcgi.py
├── tarpyt-wsgiref.py
├── tarpyt.cfg
├── tarpyt.py
└── www
    └── robots.txt
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.pyc
*.mkv
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
TarPyt
======
TarPyt is a Python Web tarpit, inspired by a description of a PHP
equivalent called [Labyrinth](https://code.google.com/p/weblabyrinth/).
It's an automated scanner's worst nightmare: a maze of twisty URIs, all alike.

Running TarPyt
--------------

TarPyt can be run on its own or as a
[WSGI](http://wsgi.readthedocs.org/en/latest/index.html) app. In its current
(testing) state, running under a separate webserver as a WSGI app has not been
tested, but should work just fine. To run standalone, just run:

    python tarpyt.py
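
Since `Tarpyt` exposes a plain WSGI callable, mounting it under an external
server should only need a small glue module. A minimal sketch, untested like
the rest of this mode (the `wsgi.py` filename and the config path are
illustrative assumptions, not part of the project):

    # wsgi.py: hypothetical glue for an external WSGI server
    from tarpyt import Tarpyt

    # Pass a config file path, or None to use the built-in defaults
    application = Tarpyt('tarpyt.cfg').application

Any WSGI server can then serve `wsgi:application`; the bundled
`tarpyt-wsgiref.py` and `tarpyt-fcgi.py` do the same thing with `wsgiref`
and `flup` respectively.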

Generating Markov chains
------------------------

The included `genmarkov.py` can be used to generate and
[pickle](http://docs.python.org/library/pickle.html) a simple Markov chain for
building HTML-like content. I've had decent luck pointing it at the
[Alexa](http://www.alexa.com/topsites) top 20 web sites, downloaded with
`wget`. Currently, TarPyt only uses these chains to generate URLs, but full
pages will be coming soon.
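
For example, assuming a handful of pages saved locally (the exact `wget`
flags and filenames here are illustrative, not prescribed by the project):

    wget --page-requisites http://example.com/
    python genmarkov.py --pickle html.mkv example.com/index.html

The resulting `html.mkv` is what the (commented-out) `markov_file` option in
`tarpyt.cfg` expects, and `genmarkov.py` also prints a sample of generated
markup so you can sanity-check the chain.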

Features
--------
A random list of features, to be better organized later:

* WSGI-compatible interface
* "Random" different responses (HTTP redirects, link pages, etc.)
* Responses chosen by hashing the request, so a given URI always gets the same response
* Generate Markov chains of HTML and URI paths for realistic responses
* Infinite redirect loops
* Artificially slow responses (1 byte per second)
* Artificially large (4GB) Content-Length headers for agents that pre-allocate storage

Todo
----
* Shell script to get top 100 Alexa sites and build Markov chains
* Use Markov chains to build HTML in responses
* False positives for scanners: SQLi (database errors), etc.
* Alerting, stats?

Attacks
-------
Possible ideas for cruelty to scanners/spiders:

* Pathological-case compression (high resource use for recipient)
* Broken markup: research edge cases for XML parsers
--------------------------------------------------------------------------------
/genmarkov.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

from HTMLParser import HTMLParser, HTMLParseError
from optparse import OptionParser
import pickle
import random
import sys
from codecs import open

class MarkovChain(object):
    """First-order chain mapping each word to weighted follower counts.
    Callers use '^' and '$' as start and end sentinels."""

    def __init__(self):
        self.words = {}
        self.maxcount = 0

    def add(self, prev_word, next_word):
        if prev_word not in self.words:
            self.words[prev_word] = {}
        if next_word not in self.words[prev_word]:
            self.words[prev_word][next_word] = 0
        self.words[prev_word][next_word] += 1
        self.maxcount = max(self.maxcount, self.words[prev_word][next_word])

    def get(self, prev_word):
        # Weighted random choice among observed successors
        if prev_word not in self.words:
            return random.choice(self.words.keys())
        followers = self.words[prev_word]
        allcounts = sum(followers.itervalues())
        randval = random.randint(1, allcounts)
        partial_sum = 0
        for word, count in followers.iteritems():
            partial_sum += count
            if partial_sum >= randval:
                return word

class TagState(object):
    def __init__(self, tag='^', prev='^'):
        self.prev_child = prev
        self.tag = tag

class MarkovBuilder(HTMLParser, object):
    def setup_chain(self):
        self.siblings = MarkovChain()
        self.children = MarkovChain()
        self.attrs = MarkovChain()
        self.data = MarkovChain()
        self.uripaths = MarkovChain()
        self.tagstack = [TagState()]
        self.maxsibs = 0
        self.maxdepth = 0
        self.gendepth = 0
        self.popped = True

    def handle_starttag(self, tag, attrs):
        self.popped = False
        state = self.tagstack[-1]
        #print >>sys.stderr, "{0: >10}{1}{2}".format(state.tag," "*len(self.tagstack), tag)
        if state.tag in ('meta','input','img','br','script','style','link'):
            self.handle_endtag(state.tag)
            state = self.tagstack[-1]
        if state.prev_child == '^':
            self.children.add(state.tag, tag)
        else:
            self.siblings.add(state.prev_child, tag)
        state.prev_child = tag
        prev_attr = tag
        for attr in attrs:
            if attr[0] == 'href':
                prev_path = '^'
                elems = attr[1].split('/')
                if len(elems) > 1 and elems[1] == '':
                    elems = elems[3:]
                for elem in elems:
                    self.uripaths.add(prev_path, elem)
                    prev_path = elem
                self.uripaths.add(prev_path, '$')
            str_attr = u'{0}="{1}"'.format(*attr)
            self.attrs.add(prev_attr, str_attr)
            prev_attr = str_attr
        self.attrs.add(prev_attr, '$')
        newstate = TagState(tag=tag)
        self.tagstack.append(newstate)
        self.maxdepth = max(self.maxdepth, len(self.tagstack))

    def handle_endtag(self, tag):
        state = self.tagstack.pop()
        if len(self.tagstack) == 0:
            self.tagstack = [TagState()]
        if state.prev_child == '^':
            self.children.add(state.tag, '$')
        if self.popped:
            self.siblings.add(state.tag, '$')
        self.popped = True

    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    def handle_data(self, data):
        self.data.add(self.tagstack[-1].tag, data)

    def unknown_decl(self, data):
        pass

    def save(self, filename='html.mkv'):
        out = open(filename, mode='wb')
        pickle.dump(self, out, protocol=2)
        out.close()

    def generate(self, tag='html', generate_links=False, depth=0):
        out = []
        count = 0
        while tag != '$' and count <= self.siblings.maxcount:
            count += 1
            contents = []
            if generate_links and tag == 'a':
                path = ['^']
                while path[-1] != '$':
                    path.append(self.uripaths.get(path[-1]))
                contents = [tag]+[u'href="/{0}"'.format('/'.join(path[1:-1]))]
            else:
                attr = tag
                while attr != '$':
                    contents.append(attr)
                    attr = self.attrs.get(attr)
            out.append(u'<{0}>\n'.format(' '.join(contents)))
            data = self.data.get(tag)
            if data and data != '$':
                out.append(data)
            first_child = self.children.get(tag)
            if first_child and first_child != '$':
                depth += 1
                if depth <= self.maxdepth:
                    out.append(self.generate(
                        first_child,
                        generate_links=generate_links,
                        depth=depth))
            out.append(u'</{0}>\n'.format(tag))
            tag = self.siblings.get(tag)
        return ''.join(out)

    def reset(self):
        super(MarkovBuilder, self).reset()
        self.tagstack = [TagState()]

class GotCharset(Exception):
    def __init__(self, charset):
        assert charset
        self.charset = charset

class EncodingDetector(HTMLParser):
    def handle_starttag(self, tag, attrs):
        if tag == 'meta':
            attrhash = {}
            for attr in attrs:
                attrhash[attr[0]] = attr[1]
            if 'charset' in attrhash:
                raise GotCharset(attrhash['charset'])
            elif 'http-equiv' in attrhash:
                if attrhash['http-equiv'].lower() == 'content-type':
                    for chunk in attrhash['content'].split(';'):
                        if 'charset' in chunk:
                            raise GotCharset(chunk.split('=')[-1])

def parse(filenames):
    builder = MarkovBuilder()
    builder.setup_chain()
    for fname in filenames:
        filein = open(fname, mode='rb', encoding='utf-8', errors='replace')
        try:
            try:
                getcharset = EncodingDetector()
                for line in filein:
                    getcharset.feed(line)
            except GotCharset as e:
                # Reopen with the encoding declared in the document itself
                filein.close()
                filein = open(fname, mode='rb', encoding=e.charset, errors='replace')
            filein.seek(0)
            builder.feed(filein.read())
            builder.close()
        except HTMLParseError:
            pass
        builder.reset()
        filein.close()
    return builder


if __name__=='__main__':
    parser = OptionParser()
    parser.add_option('-p', '--pickle',
        help='Pickle the MarkovBuilder into FILE', metavar='FILE')
    (options, args) = parser.parse_args()
    builder = parse(args)
    if options.pickle:
        builder.save(options.pickle)
    print builder.generate()
--------------------------------------------------------------------------------
/tarpyt-fcgi.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

from flup.server.fcgi import WSGIServer
from optparse import OptionParser
from tarpyt import Tarpyt
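
# Usage sketch (an assumption, not from the original docs): this script is
# meant to be spawned by a FastCGI-capable frontend (e.g. Apache mod_fcgid
# or lighttpd), in which case flup's WSGIServer picks up the FastCGI socket
# it inherits from the parent:
#
#   python tarpyt-fcgi.py -f tarpyt.cfg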

if __name__=='__main__':
    parser = OptionParser()
    parser.add_option('-f', '--config',
        help='Tarpyt config file', metavar='FILE')
    (options, args) = parser.parse_args()
    tarpyt = Tarpyt(options.config)
    WSGIServer(tarpyt.application).run()
--------------------------------------------------------------------------------
/tarpyt-wsgiref.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

from wsgiref.simple_server import make_server
from optparse import OptionParser
from tarpyt import Tarpyt

if __name__=='__main__':
    parser = OptionParser()
    parser.add_option('-f', '--config',
        help='Tarpyt config file', metavar='FILE')
    (options, args) = parser.parse_args()
    tarpyt = Tarpyt(options.config)
    httpd = make_server('', 8080, tarpyt.application)
    httpd.serve_forever()
--------------------------------------------------------------------------------
/tarpyt.cfg:
--------------------------------------------------------------------------------
# Config file for TarPyt
[tarpyt]
# A pickled instance of a genmarkov.MarkovBuilder object
#markov_file= html.mkv
# A directory out of which to serve files. Overrides [responses] below
www_dir= ./www

# Responses to use, chosen proportional to their weight
[responses]
# A page full of links
linkpage= 7
# A redirect to some other page
redirect= 1
# A chain of redirects that loops back on itself
inf_redirect= 1
# An empty response with an oversize Content-Length header
oversize= 1
# A response returned one byte per second
slow= 1
# XSL template recursion
xslt_recurse= 1
# External entity references to large/infinite local system files
xxe_dos= 1
# Memory exhaustion through recursive entity definitions
entity_dos= 1
--------------------------------------------------------------------------------
/tarpyt.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

import random
from zlib import adler32
from ConfigParser import SafeConfigParser, NoOptionError
from bisect import bisect_right
import pickle
import time
import urllib
import os
#import sys #stderr

from genmarkov import MarkovBuilder, TagState, MarkovChain

class Tarpyt(object):
    def __init__(self, config=None):
        self.builder = None
        self.www_dir = None
        conf = SafeConfigParser()
        if config:
            if hasattr(config, 'readline'):
                conf.readfp(config)
            else:
                conf.read(config)
        if conf.has_section('tarpyt'):
            try:
                mkvfile = conf.get('tarpyt','markov_file')
                mfile = open(mkvfile, 'rb')
                self.set_builder(pickle.load(mfile))
            except NoOptionError:
                self.builder = None
            try:
                www = conf.get('tarpyt', 'www_dir')
                self.www_dir = os.path.abspath(www) if os.path.isdir(www) else None
            except NoOptionError:
                self.www_dir = None
        self.weight_total = 0
        self.responses = []
        if conf.has_section('responses'):
            # Build a cumulative weight list alongside self.responses,
            # sorted so the heaviest response comes last
            def update(response):
                self.responses.append(
                    getattr(self, 'response_' + response[0]) )
                self.weight_total += int(response[1])
                return self.weight_total
            self.weights = map(update,
                sorted( conf.items('responses'), key=lambda x: int(x[1]) ))
        else:
            self.responses.append(self.response_linkpage)
            self.weights = [1]
            self.weight_total = 1

    def getresponse(self, key):
        index = adler32(key) % self.weight_total
        return self.responses[bisect_right(self.weights, index)]
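
    # Illustrative worked example (an added note, not from the original
    # source): the shipped tarpyt.cfg defines seven responses of weight 1
    # plus linkpage at weight 7, so update() above builds the cumulative
    # list [1, 2, 3, 4, 5, 6, 7, 14] with weight_total == 14, and
    # adler32(key) % 14 picks linkpage for 7 of the 14 possible residues.
    # The same key always hashes to the same response.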

    def getlink(self, path='/'):
        next_path = path.rstrip('/') + '/{0}'
        return next_path.format(chr(random.randint(0x61,0x7A)))

    def set_builder(self, builder):
        self.builder = builder
        def getlink(path='/'):
            pathlist = ['^']
            pathlist.extend(filter(lambda x: x, path.split('/')))
            elem = self.builder.uripaths.get(pathlist[-1])
            if elem == '$':
                elem = self.builder.uripaths.get(None)
            return '/'+'/'.join(pathlist[1:]+[elem])
        self.getlink = getlink

    def response_slow(self, environ, start_response):
        """ Category: tarpit
        Returns an html page, but very slowly
        """
        content = None
        if self.builder:
            content = self.builder.generate(generate_links=True)
        else:
            content = u"A" * 4096
        # Content-Length must count encoded bytes, not unicode characters
        body = content.encode('utf-8')
        status = '200 OK'
        headers = [('Content-Length',str(len(body)))]
        start_response(status, headers)
        for char in body:
            yield char
            time.sleep(1)

    def response_linkpage(self, environ, start_response):
        """ Category: tarpit
        Returns an html page full of links
        """
        response_body = None
        if self.builder:
            response_body = self.builder.generate(generate_links=True)
        else:
            page_string = "