147 |
148 |
149 | |
150 |
151 |
152 |
153 |
154 |
155 | |
156 |
157 | |
158 |
159 | Close
160 | Help
161 | |
162 |
163 |
164 |
165 | """
166 |
--------------------------------------------------------------------------------
/liveweb/tools/wsgiapp.py:
--------------------------------------------------------------------------------
1 | """Really simple wsgi framework.
2 | """
3 |
4 | import re
5 | import traceback
6 |
class wsgiapp:
    """Simple WSGI web framework.

    Subclasses list ``(pattern, name)`` pairs in ``urls`` and implement one
    ``<HTTP-METHOD>_<name>`` method per route. The regex groups captured
    from the path are passed to that method as positional arguments::

        class application(wsgiapp):
            urls = [
                ("/", "index")
            ]
            def GET_index(self):
                self.header("Content-Type", "text/plain")
                return "hello, world!"
    """

    # Routing table of (regex pattern, handler name) pairs; subclasses
    # override it. The empty default makes a bare wsgiapp answer 404.
    urls = []

    def __init__(self, environ, start_response):
        """Store the WSGI environ and start_response callable for this request."""
        self.start = start_response
        self.environ = environ

        self.status = "200 OK"
        self._headers = {}

    def input(self):
        """Return the query-string parameters as a dict.

        Tokens without "=" are ignored. Only the first "=" separates the key
        from the value, so values may themselves contain "=". Keys and values
        are returned as found in QUERY_STRING (no URL-decoding is done).
        """
        tokens = self.environ.get("QUERY_STRING", "").split("&")
        return dict(kv.split("=", 1) for kv in tokens if "=" in kv)

    def header(self, name, value):
        """Set a response header; the name is normalized to Title-Case."""
        self._headers[name.title()] = value

    def __iter__(self):
        """Run the request and return an iterator over the response body.

        Any exception raised while handling the request is turned into a
        "500 Internal Error" plain-text response containing the traceback.
        """
        try:
            x = self.delegate()
            # WSGI requires the headers to be a real list of tuples.
            self.start(self.status, list(self._headers.items()))
            return iter(x)
        except Exception:
            headers = [("Content-Type", "text/plain")]
            self.start("500 Internal Error", headers)
            out = "Internal Error:\n\n"
            exc = traceback.format_exc()
            return iter([out, exc])

    def delegate(self):
        """Delegates the request to appropriate method.

        Returns whatever the matched handler returns, or the result of
        ``notfound()`` when no pattern in ``urls`` matches PATH_INFO.
        """
        path = self.environ['PATH_INFO']
        method = self.environ['REQUEST_METHOD']

        # Try each pattern and dispatch to the right method
        for pattern, name in self.urls:
            m = re.match('^' + pattern + '$', path)
            if m:
                funcname = method.upper() + "_" + name
                # A missing handler raises AttributeError, which __iter__
                # reports as a 500 response.
                f = getattr(self, funcname)
                return f(*m.groups())

        # give "404 Not Found" if all the patterns are exhausted
        return self.notfound()

    def notfound(self):
        """Prepare a "404 Not Found" response and return its body."""
        self.status = "404 Not Found"
        # Go through header() so the value lands in self._headers, which is
        # what __iter__ actually sends.
        self.header("Content-Type", "text/html")
        return ["Not Found"]
66 |
67 |
--------------------------------------------------------------------------------
/liveweb/webapp.py:
--------------------------------------------------------------------------------
1 | """The webapp for arc proxy.
2 | """
3 |
4 | from cStringIO import StringIO
5 | import gzip
6 | import logging
7 | import socket
8 | import datetime
9 |
10 | from warc.arc import ARCRecord, ARCFile
11 |
12 | from . import proxy
13 | from . import errors
14 | from . import config
15 | from . import file_pool
16 | from . import cache
17 |
18 | pool = None
19 | _cache = None
20 |
def init_arc_file(fileobj):
    """Write the ARC file header into a freshly created file.

    The header is written through a gzip stream layered over ``fileobj``;
    ``fileobj`` itself is flushed afterwards. ``fileobj`` must expose a
    ``name`` attribute, which is recorded as the ARC file name.
    """
    gz_stream = gzip.GzipFile(fileobj=fileobj, filename=None, mode="w")

    # File-level metadata stored in the ARC header record.
    file_headers = {
        'date': datetime.datetime.utcnow(),
        'ip_address': socket.gethostbyname(socket.gethostname()),
        'org': "InternetArchive",
    }

    arc_file = ARCFile(
        fileobj=gz_stream,
        filename=fileobj.name,
        mode='wb',
        version=1,
        file_headers=file_headers,
    )
    arc_file._write_header()
    arc_file.close()
    fileobj.flush()
35 |
def setup():
    """Initialize the module-level ``pool`` and ``_cache`` globals.

    Called from main before the application starts serving requests.
    """
    global pool, _cache

    # Only the "arc" archive format needs a file header written whenever a
    # new output file is created.
    init_file = init_arc_file if config.archive_format == "arc" else None

    pool = file_pool.FilePool(
        config.output_directory,
        pattern=config.filename_pattern,
        max_files=config.num_writers,
        max_file_size=config.filesize_limit,
        init_file_func=init_file,
    )
    _cache = cache.create(type=config.cache, config=config)

    # With the redis cache, redis also keeps track of the file number
    # sequence used by the pool.
    if 'redis' == config.cache:
        pool.set_sequence(_cache)
57 |
class application:
    """WSGI application for liveweb proxy.

    One instance is created per request; iterating over it handles the
    request and yields the response body.
    """
    def __init__(self, environ, start_response):
        """Store the WSGI environ and start_response callable."""
        self.environ = environ
        self.start_response = start_response
        # Defined up front so the error path in __iter__ can always log
        # them, even when parse_request() fails before setting them.
        self.method = None
        self.url = None

    def parse_request(self):
        """Extract the request method and the absolute target URL.

        Sets ``self.method`` and ``self.url``. Raises KeyError when a
        required environ key (REQUEST_METHOD, or HTTP_HOST for a relative
        URL) is missing.
        """
        environ = self.environ
        self.method = environ['REQUEST_METHOD']

        if 'RAW_URI' in environ:  # This is for gunicorn
            self.url = environ['RAW_URI']
        elif 'REQUEST_URI' in environ:  # This is for uwsgi
            self.url = environ['REQUEST_URI']
        else:
            # Neither server-specific key is available: rebuild the request
            # target from the standard WSGI keys.
            # NOTE(review): PATH_INFO may already be URL-decoded by the
            # server, so this can differ from the raw request URI.
            self.url = environ.get('PATH_INFO', '')
            query = environ.get('QUERY_STRING')
            if query:
                self.url += "?" + query

        # Allow accessing the proxy using regular URL so that we can use
        # tools like ab.
        if self.url.startswith("/_web/"):
            self.url = self.url[len("/_web/"):]

        # Since this is a proxy, the URL is always of the form http://hostname/path
        # nginx is stripping the http://host from the passed URL and just passing the /path here.
        # This is a work-around for that issue.
        if self.url.startswith("/"):
            self.url = "http://" + environ['HTTP_HOST'] + self.url

    def __iter__(self):
        """Handle the request and return an iterator over the response body.

        Any failure is logged with its traceback and answered with an empty
        "500 Internal Server Error" response.
        """
        try:
            self.parse_request()

            record = self.get_record()
            if config.http_passthrough:
                return self.proxy_response(record)
            else:
                return self.success(record.content_length, record.content_iter)
        except Exception:
            logging.error("Internal Error - %s", self.url, exc_info=True)
            return self.error("500 Internal Server Error")

    def get_record(self):
        """Fetches the Record object from cache or constructs from web.

        On a cache miss the URL is fetched, written to the file pool and
        the resulting record is stored in the cache.
        """
        record = _cache.get(self.url)
        if record is None:
            http_response = proxy.urlopen(self.url)
            record = http_response.write_arc(pool)
            _cache.set(self.url, record)
        return record

    def proxy_response(self, record):
        """Send the original HTTP response stored in the record as it is."""
        # TODO: This is very inefficient. Improve.

        # Now we only have the ARC record data: gunzip it and parse the
        # ARC record to get at the stored HTTP response.
        record_payload = record.read_all()
        record_payload = gzip.GzipFile(fileobj=StringIO(record_payload)).read()
        arc = ARCRecord.from_string(record_payload, version=1)

        # Create a FakeSocket and read HTTP headers and payload.
        sock = proxy._FakeSocket(StringIO(arc.payload))
        response = proxy.ProxyHTTPResponse(self.url, sock)
        response.begin()

        status = "%d %s" % (response.status, response.reason)
        headers = response.getheaders()
        self.start_response(status, headers)
        return response.get_payload()

    def success(self, clen, data):
        """Start a "200 OK" ARC-record response.

        ``clen`` is sent as the Content-Length and ``data`` is the iterable
        providing the body; an iterator over it is returned.
        """
        status = '200 OK'
        response_headers = [
            ('Content-type', 'application/x-arc-record'),
            ('Content-Length', str(clen))
        ]
        self.start_response(status, response_headers)
        return iter(data)

    def error(self, status, headers=None):
        """Start an error response with the given status and an empty body.

        When ``headers`` is None, plain-text headers with a zero
        Content-Length are sent.
        """
        if headers is None:
            headers = [
                ('Content-Type', 'text/plain'),
                ('Content-Length', '0'),
            ]
        self.start_response(status, headers)
        return iter([])
142 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | BeautifulSoup==3.2.1
2 | Genshi==0.6
3 | PyYAML==3.10
4 | hiredis==0.1.1
5 | py==1.4.7
6 | pytest==2.2.3
7 | redis==2.4.12
8 | uWSGI==1.9.14
9 | warc
10 | wsgiref==0.1.2
11 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 |
import os

from setuptools import setup


def _read_requirements(filename="requirements.txt"):
    """Return the requirement specifiers listed in *filename*.

    The file is looked up next to this script, so the result does not depend
    on the current working directory. Blank lines and ``#`` comment lines
    are skipped.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    with open(os.path.join(here, filename)) as f:
        stripped = (line.strip() for line in f)
        return [line for line in stripped if line and not line.startswith("#")]


requirements = _read_requirements()

setup(
    name="liveweb",
    version="2.0.dev",
    description="Liveweb proxy",
    license='GPL v2',
    author="Internet Archive",
    author_email="info@archive.org",
    url="http://github.com/internetarchive/liveweb",
    packages=["liveweb", "liveweb.tools"],
    platforms=["any"],
    entry_points={
        "console_scripts": [
            "liveweb-proxy=liveweb.cli:main"
        ]
    },
    install_requires=requirements
)
23 |
24 |
--------------------------------------------------------------------------------
|