├── .gitignore ├── README.md ├── app.yaml ├── main.html ├── mirror.py ├── static ├── base.css ├── favicon.ico ├── favicon.png ├── lock.png ├── mirrorrr_logo.png ├── mirrorrr_screenshot.png ├── nolock.png └── robots.txt ├── transform_content.py └── transform_content_test.py /.gitignore: -------------------------------------------------------------------------------- 1 | .svn 2 | *.pyc 3 | .DS_Store 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Google App Engine app that Mirrors the content of URLs you supply. Rewrites the fetched page to mirror all content, including images, Flash, Javascript, CSS, and even favicons. You stay within the cache when you follow links. Useful for pulling load off of slashdotted servers. Also can be used to anonymize web access. 2 | 3 | Example live version: 4 | 5 | [https://mirrorrr.appspot.com](https://mirrorrr.appspot.com) 6 | 7 | Instructions on how to setup your own proxy: 8 | 9 | [http://www.hongkiat.com/blog/proxy-with-google-app-engine/](http://www.hongkiat.com/blog/proxy-with-google-app-engine/) 10 | 11 | For POST support and other features, see mirrorrr-plus: 12 | 13 | [https://code.google.com/p/mirrorrr-plus/](https://code.google.com/p/mirrorrr-plus/) 14 | -------------------------------------------------------------------------------- /app.yaml: -------------------------------------------------------------------------------- 1 | application: your-app-id-here 2 | version: first 3 | module: default 4 | runtime: python27 5 | api_version: 1 6 | threadsafe: yes 7 | 8 | inbound_services: 9 | - warmup 10 | 11 | instance_class: F1 12 | automatic_scaling: 13 | min_idle_instances: 1 14 | max_idle_instances: 1 15 | max_concurrent_requests: 40 16 | 17 | handlers: 18 | - url: /robots\.txt 19 | static_files: static/robots.txt 20 | upload: static/robots\.txt 21 | 22 | - url: /favicon\.ico 23 | static_files: static/favicon.ico 24 | upload: static/favicon\.ico 25 | secure: optional 26 | 27 | - url: /static 28 | static_dir: static 29 | secure: optional 30 | 31 | - url: /_ah/warmup 32 | script: mirror.app 33 | secure: optional 34 | 35 | - url: /.* 36 | script: mirror.app 37 | secure: optional 38 | -------------------------------------------------------------------------------- /main.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 |[\"']?)" 41 | 42 | # Start of a CSS import 43 | CSS_IMPORT_START = r"(?i)@import(?P[\t ]+)(?P [\"']?)" 44 | 45 | # CSS url() call 46 | CSS_URL_START = r"(?i)\burl\((?P[\"']?)" 47 | 48 | 49 | REPLACEMENT_REGEXES = [ 50 | (TAG_START + SAME_DIR_URL_REGEX, 51 | "\g\g \g %(accessed_dir)s\g"), 52 | 53 | (TAG_START + TRAVERSAL_URL_REGEX, 54 | "\g \g \g %(accessed_dir)s/\g/\g "), 55 | 56 | (TAG_START + BASE_RELATIVE_URL_REGEX, 57 | "\g \g \g /%(base)s/\g"), 58 | 59 | (TAG_START + ROOT_DIR_URL_REGEX, 60 | "\g \g \g /%(base)s/"), 61 | 62 | # Need this because HTML tags could end with '/>', which confuses the 63 | # tag-matching regex above, since that's the end-of-match signal. 64 | (TAG_START + ABSOLUTE_URL_REGEX, 65 | "\g\g \g /\g"), 66 | 67 | (CSS_IMPORT_START + SAME_DIR_URL_REGEX, 68 | "@import\g \g %(accessed_dir)s\g"), 69 | 70 | (CSS_IMPORT_START + TRAVERSAL_URL_REGEX, 71 | "@import\g \g %(accessed_dir)s/\g/\g "), 72 | 73 | (CSS_IMPORT_START + BASE_RELATIVE_URL_REGEX, 74 | "@import\g \g /%(base)s/\g"), 75 | 76 | (CSS_IMPORT_START + ABSOLUTE_URL_REGEX, 77 | "@import\g \g /\g"), 78 | 79 | (CSS_URL_START + SAME_DIR_URL_REGEX, 80 | "url(\g %(accessed_dir)s\g"), 81 | 82 | (CSS_URL_START + TRAVERSAL_URL_REGEX, 83 | "url(\g %(accessed_dir)s/\g/\g "), 84 | 85 | (CSS_URL_START + BASE_RELATIVE_URL_REGEX, 86 | "url(\g /%(base)s/\g"), 87 | 88 | (CSS_URL_START + ABSOLUTE_URL_REGEX, 89 | "url(\g /\g"), 90 | ] 91 | 92 | ################################################################################ 93 | 94 | def TransformContent(base_url, accessed_url, content): 95 | url_obj = urlparse.urlparse(accessed_url) 96 | accessed_dir = os.path.dirname(url_obj.path) 97 | if not accessed_dir.endswith("/"): 98 | accessed_dir += "/" 99 | 100 | for pattern, replacement in REPLACEMENT_REGEXES: 101 | fixed_replacement = replacement % { 102 | "base": base_url, 103 | "accessed_dir": accessed_dir, 104 | } 105 | content = re.sub(pattern, fixed_replacement, content) 106 | return content 107 | -------------------------------------------------------------------------------- /transform_content_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2008 Brett Slatkin 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | __author__ = "Brett Slatkin (bslatkin@gmail.com)" 17 | 18 | import logging 19 | import unittest 20 | 21 | import transform_content 22 | 23 | ################################################################################ 24 | 25 | class TransformTest(unittest.TestCase): 26 | 27 | def _RunTransformTest(self, base_url, accessed_url, original, expected): 28 | tag_tests = [ 29 | ' ', 30 | "
", 31 | "
", 32 | "
", 33 | "
", 35 | "
", 36 | "
", 37 | '', 38 | "", 39 | "", 40 | "", 41 | "", 43 | "", 44 | "", 45 | "
", 46 | " ", 47 | ' ', 48 | '