├── static
│   └── robots.txt
├── README
├── app.yaml
├── index.yaml
├── main.html
├── transform_content.py
└── mirror.py

--------------------------------------------------------------------------------
/static/robots.txt:
--------------------------------------------------------------------------------
User-agent: *
Disallow: /
Allow: /main

--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
Set up your own proxy server using Google App Engine

Tutorial: http://www.labnol.org/internet/setup-proxy-server/12890/

Video: https://www.youtube.com/watch?v=3f6Zq4prys0

Support: http://twitter.com/labnol

--------------------------------------------------------------------------------
/app.yaml:
--------------------------------------------------------------------------------
application: YOUR_APP_ID
version: secureable
runtime: python27
api_version: 1
threadsafe: true

handlers:

- url: /robots\.txt
  static_files: static/robots.txt
  upload: static/robots\.txt

- url: /static
  static_dir: static
  secure: optional

- url: /.*
  script: mirror.app
  secure: optional

--------------------------------------------------------------------------------
/index.yaml:
--------------------------------------------------------------------------------
indexes:

# AUTOGENERATED

# This index.yaml is automatically updated whenever the dev_appserver
# detects that a new type of query is run. If you want to manage the
# index.yaml file manually, remove the above marker line (the line
# saying "# AUTOGENERATED"). If you want to manage some indexes
# manually, move them above the marker line. The index.yaml file is
# automatically uploaded to the admin console when you next deploy
# your application using appcfg.py.

--------------------------------------------------------------------------------
/main.html:
--------------------------------------------------------------------------------
(The template's markup was not preserved in this dump.)
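A minimal, hypothetical sketch of what this template could look like, based only on the `latest_urls` and `secure_url` context values that HomeHandler passes to template.render() in mirror.py; every tag and label below is an assumption, not recovered markup:

<!-- Hypothetical reconstruction; see mirror.py's HomeHandler for the context. -->
<html>
<body>
  <form action="/" method="get">
    <input type="text" name="url">
    <input type="submit" value="Mirror">
  </form>
  {% if secure_url %}
    <a href="{{ secure_url }}">Try the HTTPS version of this mirror</a>
  {% endif %}
  <ul>
  {% for entry_point in latest_urls %}
    <li><a href="/{{ entry_point.translated_address }}">{{ entry_point.display_address }}</a></li>
  {% endfor %}
  </ul>
</body>
</html>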
--------------------------------------------------------------------------------
/transform_content.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# Copyright 2008 Brett Slatkin
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

__author__ = "Brett Slatkin (bslatkin@gmail.com)"

import os
import re
import urlparse

################################################################################

# URLs that have absolute addresses
ABSOLUTE_URL_REGEX = r"(http(s?):)?//(?P<url>[^\"'> \t\)]+)"

# URLs that are relative to the base of the current hostname.
BASE_RELATIVE_URL_REGEX = r"/(?!(/)|(http(s?)://)|(url\())(?P<url>[^\"'> \t\)]*)"

# URLs that have '../' or './' to start off their paths.
TRAVERSAL_URL_REGEX = r"(?P<relative>\.(\.)?)/(?!(/)|(http(s?)://)|(url\())(?P<url>[^\"'> \t\)]*)"

# URLs that are in the same directory as the requested URL.
SAME_DIR_URL_REGEX = r"(?!(/)|(http(s?)://)|(url\())(?P<url>[^\"'> \t\)]+)"

# URL matches the root directory.
ROOT_DIR_URL_REGEX = r"(?!//(?!>))/(?P<url>)(?=[ \t\n]*[\"'\)>/])"

# Start of a tag using 'src' or 'href'
TAG_START = r"(?i)\b(?P<tag>src|href|action|url|background)(?P<equals>[\t ]*=[\t ]*)(?P<quote>[\"']?)"

# Start of a CSS import
CSS_IMPORT_START = r"(?i)@import(?P<spacing>[\t ]+)(?P<quote>[\"']?)"

# CSS url() call
CSS_URL_START = r"(?i)\burl\((?P<quote>[\"']?)"

REPLACEMENT_REGEXES = [
  (TAG_START + SAME_DIR_URL_REGEX,
   "\g<tag>\g<equals>\g<quote>%(accessed_dir)s\g<url>"),

  (TAG_START + TRAVERSAL_URL_REGEX,
   "\g<tag>\g<equals>\g<quote>%(accessed_dir)s/\g<relative>/\g<url>"),

  (TAG_START + BASE_RELATIVE_URL_REGEX,
   "\g<tag>\g<equals>\g<quote>/%(base)s/\g<url>"),

  (TAG_START + ROOT_DIR_URL_REGEX,
   "\g<tag>\g<equals>\g<quote>/%(base)s/"),

  # Need this because HTML tags could end with '/>', which confuses the
  # tag-matching regex above, since that's the end-of-match signal.
  (TAG_START + ABSOLUTE_URL_REGEX,
   "\g<tag>\g<equals>\g<quote>/\g<url>"),

  (CSS_IMPORT_START + SAME_DIR_URL_REGEX,
   "@import\g<spacing>\g<quote>%(accessed_dir)s\g<url>"),

  (CSS_IMPORT_START + TRAVERSAL_URL_REGEX,
   "@import\g<spacing>\g<quote>%(accessed_dir)s/\g<relative>/\g<url>"),

  (CSS_IMPORT_START + BASE_RELATIVE_URL_REGEX,
   "@import\g<spacing>\g<quote>/%(base)s/\g<url>"),

  (CSS_IMPORT_START + ABSOLUTE_URL_REGEX,
   "@import\g<spacing>\g<quote>/\g<url>"),

  (CSS_URL_START + SAME_DIR_URL_REGEX,
   "url(\g<quote>%(accessed_dir)s\g<url>"),

  (CSS_URL_START + TRAVERSAL_URL_REGEX,
   "url(\g<quote>%(accessed_dir)s/\g<relative>/\g<url>"),

  (CSS_URL_START + BASE_RELATIVE_URL_REGEX,
   "url(\g<quote>/%(base)s/\g<url>"),

  (CSS_URL_START + ABSOLUTE_URL_REGEX,
   "url(\g<quote>/\g<url>"),
]

################################################################################

def TransformContent(base_url, accessed_url, content):
  url_obj = urlparse.urlparse(accessed_url)
  accessed_dir = os.path.dirname(url_obj.path)
  if not accessed_dir.endswith("/"):
    accessed_dir += "/"

  for pattern, replacement in REPLACEMENT_REGEXES:
    fixed_replacement = replacement % {
        "base": base_url,
        "accessed_dir": accessed_dir,
    }
    content = re.sub(pattern, fixed_replacement, content)
  return content

--------------------------------------------------------------------------------
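For illustration, here is how TransformContent is expected to rewrite a fragment. The host and paths are hypothetical, and the expected outputs assume the regexes above:

import transform_content

html = '<a href="/about">About</a> <img src="logo.png"/>'
result = transform_content.TransformContent(
    "example.com",                        # base_url: the mirrored hostname
    "http://example.com/blog/post.html",  # accessed_url: the page fetched
    html)
# href="/about" should become href="/example.com/about", and
# src="logo.png" should become src="/example.com/blog/logo.png"
# (same-directory rewrite, then the base-relative pass prepends the host),
# so both links now resolve through the proxy's /<hostname>/... URL space.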
/mirror.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# Copyright 2008 Brett Slatkin
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

__author__ = "Brett Slatkin (bslatkin@gmail.com)"

import datetime
import hashlib
import logging
import pickle
import re
import time
import urllib
import wsgiref.handlers

from google.appengine.api import memcache
from google.appengine.api import urlfetch
from google.appengine.ext import db
import webapp2
from google.appengine.ext.webapp import template
from google.appengine.runtime import apiproxy_errors

import transform_content

################################################################################

DEBUG = False
EXPIRATION_DELTA_SECONDS = 3600
EXPIRATION_RECENT_URLS_SECONDS = 90

## DEBUG = True
## EXPIRATION_DELTA_SECONDS = 10
## EXPIRATION_RECENT_URLS_SECONDS = 1

HTTP_PREFIX = "http://"
HTTPS_PREFIX = "https://"

IGNORE_HEADERS = frozenset([
  'set-cookie',
  'expires',
  'cache-control',

  # Ignore hop-by-hop headers
  'connection',
  'keep-alive',
  'proxy-authenticate',
  'proxy-authorization',
  'te',
  'trailers',
  'transfer-encoding',
  'upgrade',
])

TRANSFORMED_CONTENT_TYPES = frozenset([
  "text/html",
  "text/css",
])


MAX_CONTENT_SIZE = 10 ** 6

MAX_URL_DISPLAY_LENGTH = 50

################################################################################

def get_url_key_name(url):
  url_hash = hashlib.sha256()
  url_hash.update(url)
  return "hash_" + url_hash.hexdigest()

################################################################################
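# For illustration (hypothetical URL): get_url_key_name("http://example.com/")
# returns "hash_" plus hashlib.sha256("http://example.com/").hexdigest(). The
# digest is always 64 hex characters, so the key name has a fixed, small size
# no matter how long the mirrored URL is, which is why it is safe to use as
# both a datastore key_name and a memcache key.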

class EntryPoint(db.Model):
  translated_address = db.TextProperty(required=True)
  last_updated = db.DateTimeProperty(auto_now=True)
  display_address = db.TextProperty()


class MirroredContent(object):
  def __init__(self, original_address, translated_address,
               status, headers, data, base_url):
    self.original_address = original_address
    self.translated_address = translated_address
    self.status = status
    self.headers = headers
    self.data = data
    self.base_url = base_url

  @staticmethod
  def get_by_key_name(key_name):
    return memcache.get(key_name)

  @staticmethod
  def fetch_and_store(key_name, base_url, translated_address, mirrored_url):
    """Fetch and cache a page.

    Args:
      key_name: Hash to use to store the cached page.
      base_url: The hostname of the page that's being mirrored.
      translated_address: The URL of the mirrored page on this site.
      mirrored_url: The URL of the original page. Hostname should match
        the base_url.

    Returns:
      A new MirroredContent object, if the page was successfully retrieved.
      None if any errors occurred or the content could not be retrieved.
    """
    logging.debug("Fetching '%s'", mirrored_url)
    try:
      response = urlfetch.fetch(mirrored_url)
    except (urlfetch.Error, apiproxy_errors.Error):
      logging.exception("Could not fetch URL")
      return None

    adjusted_headers = {}
    for key, value in response.headers.iteritems():
      adjusted_key = key.lower()
      if adjusted_key not in IGNORE_HEADERS:
        adjusted_headers[adjusted_key] = value

    content = response.content
    page_content_type = adjusted_headers.get("content-type", "")
    for content_type in TRANSFORMED_CONTENT_TYPES:
      # Startswith() because there could be a 'charset=UTF-8' in the header.
      if page_content_type.startswith(content_type):
        content = transform_content.TransformContent(base_url, mirrored_url,
                                                     content)
        break

    # If the transformed content is over 1MB, truncate it (yikes!)
    if len(content) > MAX_CONTENT_SIZE:
      logging.warning('Content is over 1MB; truncating')
      content = content[:MAX_CONTENT_SIZE]

    new_content = MirroredContent(
      base_url=base_url,
      original_address=mirrored_url,
      translated_address=translated_address,
      status=response.status_code,
      headers=adjusted_headers,
      data=content)
    if not memcache.add(key_name, new_content, time=EXPIRATION_DELTA_SECONDS):
      logging.error('memcache.add failed: key_name = "%s", '
                    'original_url = "%s"', key_name, mirrored_url)

    return new_content

################################################################################
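# End-to-end flow, for illustration (hypothetical URL): a request for
# /example.com/blog/post.html reaches MirrorHandler below, which computes
# key_name = get_url_key_name("http://example.com/blog/post.html") and, on a
# cache miss, calls MirroredContent.fetch_and_store(key_name, "example.com",
# "example.com/blog/post.html", "http://example.com/blog/post.html"). Since
# the fetched content-type starts with "text/html", the body is rewritten by
# transform_content, cached for EXPIRATION_DELTA_SECONDS, and served back.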

class BaseHandler(webapp2.RequestHandler):
  def get_relative_url(self):
    slash = self.request.url.find("/", len(self.request.scheme + "://"))
    if slash == -1:
      return "/"
    return self.request.url[slash:]


class HomeHandler(BaseHandler):
  def get(self):
    # Handle the input form to redirect the user to a relative url
    form_url = self.request.get("url")
    if form_url:
      # Accept URLs that still have a leading 'http://'
      inputted_url = urllib.unquote(form_url)
      if inputted_url.startswith(HTTP_PREFIX):
        inputted_url = inputted_url[len(HTTP_PREFIX):]
      return self.redirect("/" + inputted_url)

    latest_urls = memcache.get('latest_urls')
    if latest_urls is None:
      latest_urls = EntryPoint.gql("ORDER BY last_updated DESC").fetch(25)

      # Generate a display address that truncates the URL and adds an
      # ellipsis. This is never actually saved in the Datastore.
      for entry_point in latest_urls:
        entry_point.display_address = \
            entry_point.translated_address[:MAX_URL_DISPLAY_LENGTH]
        if len(entry_point.display_address) == MAX_URL_DISPLAY_LENGTH:
          entry_point.display_address += '...'

      if not memcache.add('latest_urls', latest_urls,
                          time=EXPIRATION_RECENT_URLS_SECONDS):
        logging.error('memcache.add failed: latest_urls')

    # Do this dictionary construction here, to decouple presentation from
    # how we store data.
    secure_url = None
    if self.request.scheme == "http":
      secure_url = "https://mirrorrr.appspot.com"
    context = {
      "latest_urls": latest_urls,
      "secure_url": secure_url,
    }
    self.response.out.write(template.render("main.html", context))


class MirrorHandler(BaseHandler):
  def get(self, base_url):
    assert base_url

    # Log the user-agent and referrer, to see who is linking to us.
    logging.debug('User-Agent = "%s", Referrer = "%s"',
                  self.request.user_agent,
                  self.request.referer)
    logging.debug('Base_url = "%s", url = "%s"', base_url, self.request.url)

    translated_address = self.get_relative_url()[1:]  # remove leading /
    mirrored_url = HTTP_PREFIX + translated_address

    # Use a sha256 hash instead of the mirrored URL for the key name, since
    # key names can only be 500 bytes in length; URLs may be up to 2KB.
    key_name = get_url_key_name(mirrored_url)
    logging.info("Handling request for '%s' = '%s'", mirrored_url, key_name)

    content = MirroredContent.get_by_key_name(key_name)
    cache_miss = False
    if content is None:
      logging.debug("Cache miss")
      cache_miss = True
      content = MirroredContent.fetch_and_store(key_name, base_url,
                                                translated_address,
                                                mirrored_url)
    if content is None:
      return self.error(404)

    # Store the entry point down here, once we know the request is good and
    # there has been a cache miss (i.e., the page expired). If the referrer
    # wasn't local, or it was '/', then this is an entry point.
    if (cache_miss and
        'Googlebot' not in self.request.user_agent and
        'Slurp' not in self.request.user_agent and
        (not self.request.referer.startswith(self.request.host_url) or
         self.request.referer == self.request.host_url + "/")):
      # Ignore favicons as entry points; they're a browser fetch on every
      # request for a new site, so we special-case them here.
      if not self.request.url.endswith("favicon.ico"):
        logging.info("Inserting new entry point")
        entry_point = EntryPoint(
          key_name=key_name,
          translated_address=translated_address)
        try:
          entry_point.put()
        except (db.Error, apiproxy_errors.Error):
          logging.exception("Could not insert EntryPoint")

    for key, value in content.headers.iteritems():
      self.response.headers[key] = value
    if not DEBUG:
      self.response.headers['cache-control'] = \
          'max-age=%d' % EXPIRATION_DELTA_SECONDS

    self.response.out.write(content.data)


app = webapp2.WSGIApplication([
  (r"/", HomeHandler),
  (r"/main", HomeHandler),
  (r"/([^/]+).*", MirrorHandler)
], debug=DEBUG)
--------------------------------------------------------------------------------
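For a quick local sanity check, the handlers can be exercised with the App Engine SDK's testbed stubs and the third-party webtest package. This is a hypothetical sketch, not part of the repository:

import unittest

import webtest
from google.appengine.ext import testbed

import mirror


class HomeHandlerTest(unittest.TestCase):
  def setUp(self):
    # Stand up in-memory stubs for the App Engine services mirror.py uses.
    self.testbed = testbed.Testbed()
    self.testbed.activate()
    self.testbed.init_memcache_stub()
    self.testbed.init_datastore_v3_stub()
    self.app = webtest.TestApp(mirror.app)

  def tearDown(self):
    self.testbed.deactivate()

  def test_form_url_redirects_into_mirror_namespace(self):
    # HomeHandler strips the leading "http://" and redirects to /<host>/...
    response = self.app.get("/?url=http://example.com/")
    self.assertEqual(302, response.status_int)
    self.assertTrue(response.headers["Location"].endswith("/example.com/"))


if __name__ == "__main__":
  unittest.main()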