├── data
│   └── .gitignore
├── web
│   ├── requirements.txt
│   ├── Dockerfile
│   ├── uwsgi.ini
│   ├── config.py
│   ├── browser.py
│   ├── worker.py
│   ├── app.py
│   ├── handlers.py
│   └── views
│       └── index.html
├── set-scale.sh
├── browser-chrome
│   ├── register.sh
│   └── Dockerfile
├── browser-firefox
│   ├── register.sh
│   └── Dockerfile
├── .gitignore
├── LICENSE
├── docker-compose.yml
└── README.md

/data/.gitignore:
--------------------------------------------------------------------------------
# Ignore everything in this directory
*
# Except this file
!.gitignore
--------------------------------------------------------------------------------
/web/requirements.txt:
--------------------------------------------------------------------------------
bottle
selenium
redis
hiredis
uwsgi
gevent
requests
--------------------------------------------------------------------------------
/set-scale.sh:
--------------------------------------------------------------------------------
docker-compose scale chrome=$1 firefox=$1
docker-compose scale workerchrome=$1 workerfirefox=$1
--------------------------------------------------------------------------------
/browser-chrome/register.sh:
--------------------------------------------------------------------------------
#!/bin/bash

ip=$(head -n 1 /etc/hosts | cut -f 1)
redis-cli -h redis_1 lpush $NODE_KEY $ip

bash /opt/bin/entry_point.sh
--------------------------------------------------------------------------------
/browser-firefox/register.sh:
--------------------------------------------------------------------------------
#!/bin/bash

ip=$(head -n 1 /etc/hosts | cut -f 1)
redis-cli -h redis_1 lpush $NODE_KEY $ip

bash /opt/bin/entry_point.sh
--------------------------------------------------------------------------------
/web/Dockerfile:
--------------------------------------------------------------------------------
FROM python:2.7

WORKDIR /code

ADD requirements.txt /code/

RUN pip install -r requirements.txt

ADD . /code
--------------------------------------------------------------------------------
/browser-chrome/Dockerfile:
--------------------------------------------------------------------------------
FROM selenium/standalone-chrome

USER root
RUN apt-get update && apt-get install -y redis-tools

WORKDIR /reg

ADD register.sh /reg/

RUN chmod +x /reg/register.sh

USER seluser

CMD /reg/register.sh
--------------------------------------------------------------------------------
/browser-firefox/Dockerfile:
--------------------------------------------------------------------------------
FROM selenium/standalone-firefox

USER root
RUN apt-get update && apt-get install -y redis-tools

WORKDIR /reg

ADD register.sh /reg/

RUN chmod +x /reg/register.sh

USER seluser

CMD /reg/register.sh
--------------------------------------------------------------------------------
/web/uwsgi.ini:
--------------------------------------------------------------------------------
[uwsgi]
if-not-env = PORT
http-socket = :8080
endif =

master = true
buffer-size = 65536
die-on-term = true

if-env = VIRTUAL_ENV
venv = $(VIRTUAL_ENV)
endif =

gevent = 1000
gevent-early-monkey-patch =
#processes = 10

file = app.py
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.py[cod]

# C extensions
*.so

# Packages
*.egg
*.egg-info
dist
build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg
lib
lib64
__pycache__

# Installer logs
pip-log.txt

# Unit test / coverage reports
.coverage
.tox
nosetests.xml

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
.project
.pydevproject

.vagrant
--------------------------------------------------------------------------------
/web/config.py:
--------------------------------------------------------------------------------
from handlers import WebRecorderHandler, SavePageNowHandler
from collections import OrderedDict

def get_config():
    config = {}

    archives = OrderedDict()
    archives['webrecorder'] = WebRecorderHandler()
    # archives['test'] = WebRecorderHandler('https://webrecorder.io/preview/', desc='Preview with webrecorder.io (Not Recording)')
    archives['ia-save'] = SavePageNowHandler()

    config['archives'] = archives

    config['default_archive'] = 'webrecorder'

    config['redis_url'] = 'redis://redis_1/'
    config['chrome_url_log'] = True

    config['archive_cache_secs'] = 600
    config['err_cache_secs'] = 10

    config['wait_timeout_secs'] = 30
    return config
--------------------------------------------------------------------------------
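Editor's note: since each entry in `archives` is just a handler object keyed by name, another prefix-based recording service could be registered with the generic `PrefixHandler` from `handlers.py`. A minimal sketch — the `example-archive` name and prefix URL below are hypothetical, not a real service:

```python
# Hypothetical addition inside get_config(): register another prefix-based
# backend. Any service that records a page loaded at `prefix + url` would
# work the same way as the built-in handlers.
from handlers import PrefixHandler

archives['example-archive'] = PrefixHandler(
    'https://archiver.example.com/record/',
    desc='Example Prefix-Based Archiver')
```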
/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2015 Ilya Kreymer

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
# Redis
redis:
  restart: always
  image: redis:latest
  ports:
    - "6379:6379"

  volumes:
    - ./data/:/data/


# Selenium Chrome Web Driver
chrome:
  build: ./browser-chrome
  restart: always
  links:
    - redis:redis

  environment:
    - NODE_KEY=nodes-chrome

  # see https://github.com/SeleniumHQ/docker-selenium/issues/79
  volumes:
    - /dev/shm:/dev/shm

# Selenium FF Web Driver
firefox:
  build: ./browser-firefox
  restart: always
  links:
    - redis:redis

  environment:
    - NODE_KEY=nodes-firefox


# Worker for Chrome Browsers
workerchrome:
  restart: always
  build: ./web
  links:
    - redis:redis
    - chrome:browser

  environment:
    - NODE_KEY=nodes-chrome

  command: python worker.py chrome


# Worker for FF Browsers
workerfirefox:
  restart: always
  build: ./web
  links:
    - redis:redis
    - firefox:browser

  environment:
    - NODE_KEY=nodes-firefox

  command: python worker.py firefox


# Web App
web:
  restart: always
  build: ./web

  command: uwsgi uwsgi.ini

  links:
    - redis:redis

  ports:
    - "8080:8080"
--------------------------------------------------------------------------------
/web/browser.py:
--------------------------------------------------------------------------------
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver import Chrome, Remote, Firefox

from selenium.webdriver.chrome.options import Options
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile

import json
import logging


# ============================================================================
class Browser(object):
    def __init__(self, host_name=None, readlog=False):
        self.readlog = readlog
        self.host_name = host_name

        self.caps = self._init_caps()
        self._init_driver()

    def _init_local(self):
        raise NotImplementedError()

    def _init_driver(self):
        self.driver = None

        if not self.host_name:
            self.driver = self._init_local()
            return

        while True:
            try:
                self.driver = Remote(command_executor='http://{0}:4444/wd/hub'.format(self.host_name),
                                     desired_capabilities=self.caps)
                break
            except:
                import traceback
                traceback.print_exc()
                print('RETRY CONN')

    def close(self):
        if self.driver:
            self.driver.quit()

    def visit(self, url):
        try:
            self.driver.get(url)
        except:
            self._init_driver()
            self.driver.get(url)

        results = {}
        return results


# ============================================================================
class ChromeBrowser(Browser):
    def _init_caps(self):
        caps = DesiredCapabilities.CHROME

        if self.readlog:
            caps['loggingPrefs'] = {'performance': 'ALL'}
            caps['chromeOptions'] = {'perfLoggingPrefs': {'enableTimeline': False, 'enablePage': False}}

        return caps

    def _init_local(self):
        return Chrome(chrome_options=Options(), desired_capabilities=self.caps)

    def visit(self, url):
        results = super(ChromeBrowser, self).visit(url)

        if not self.readlog:
            return results

        try:
            log = self.driver.get_log('performance')

        except Exception as e:
            import traceback
            traceback.print_exc()
            return results

        for entry in log:
            message = entry.get('message')
            try:
                message = json.loads(message)
                message = message['message']
                if message['method'].startswith('Network'):
                    resp = message['params'].get('response')
                    if not resp:
                        continue

                    resp_url = resp.get('url', '')
                    if resp_url and resp_url.startswith('http'):
                        results[resp_url] = {'status': resp.get('status')}
            except:
                continue

        return results


# ============================================================================
class FirefoxBrowser(Browser):
    def _init_caps(self):
        caps = DesiredCapabilities.FIREFOX
        return caps

    def _init_local(self):
        firefox_profile = FirefoxProfile()
        firefox_profile.set_preference('extensions.logging.enabled', False)
        firefox_profile.set_preference('network.dns.disableIPv6', False)
        return Firefox(firefox_profile)


# ============================================================================
if __name__ == "__main__":
    import sys

    if len(sys.argv) <= 1 or sys.argv[1] != 'firefox':
        browser = ChromeBrowser()
    else:
        browser = FirefoxBrowser()
--------------------------------------------------------------------------------
/web/worker.py:
--------------------------------------------------------------------------------
from redis import StrictRedis
from redis.utils import pipeline

from browser import ChromeBrowser, FirefoxBrowser

import json
import sys
import logging
import socket
import os

from config import get_config


def get_avail_browser(config, rc, browser_type):
    key = os.environ['NODE_KEY']
    while True:
        try:
            host = rc.blpop(key, 10)
            if not host:
                continue

            host = host[1]

            logging.debug('Got host ' + host)

            browser = create_browser(host, config, browser_type)
            logging.debug('Mapped to ' + host)
            return browser
        except Exception as e:
            logging.debug(e)
            logging.debug('Failed to map to ' + host)


def create_browser(host, config, browser_type):
    if browser_type == 'chrome':
        browser = ChromeBrowser(host, config.get('chrome_url_log', False))
    elif browser_type == 'firefox':
        browser = FirefoxBrowser(host, False)
    else:
        raise Exception('Invalid Browser Type: ' + str(browser_type))

    return browser


def get_cache_key(archive, browser_type, url):
    """ Return redis key for given url and cache"""
    return 'r:' + browser_type + ':' + archive + ':' + url


def get_wait_key(archive, browser_type, url):
    """ Redis key for pending operation"""
    return 'w:' + browser_type + ':' + archive + ':' + url


def get_queue_key(browser_type):
    return 'q:urls:' + browser_type


def init_redis(config):
    """ Init redis from config, with fallback to localhost
    """
    try:
        rc = StrictRedis.from_url(config['redis_url'])
        rc.ping()
    except:
        rc = StrictRedis.from_url('redis://localhost/')
        rc.ping()

    return rc


def init(browser_type):
    """ Initialize the worker, which reads urls to archive from a redis queue
        and uses the associated web driver to connect to a remote web browser
    """
    logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s',
                        level=logging.DEBUG)
    logging.debug('WebDriver Worker Started')

    config = get_config()

    archives = config['archives']

    rc = init_redis(config)

    browser = get_avail_browser(config, rc, browser_type)

    run(rc, browser, archives, config, browser_type)


def run(rc, browser, archives, config, browser_type):
    """ Read from redis queue in a loop and use associated web driver
        to load page on demand
    """
    url = None
    queue_key = get_queue_key(browser_type)
    logging.debug(queue_key)

    while True:
        cmd = rc.blpop(queue_key, 10)

        if not cmd:
            continue

        val = json.loads(cmd[1])
        archive = val['archive']
        url = val['url']

        result_key = get_cache_key(archive, browser_type, url)
        wait_key = get_wait_key(archive, browser_type, url)

        try:
            result = archives[archive](browser, url)
            cache_time = config['archive_cache_secs']
        except Exception as e:
            import traceback
            traceback.print_exc()

            result = {'archived': False, 'error': {'msg': str(e)}}
            cache_time = config['err_cache_secs']

        json_result = json.dumps(result)
        actual_url = result.get('actual_url')

        with pipeline(rc) as pi:
            if actual_url and actual_url != url:
                actual_key = get_cache_key(archive, browser_type, actual_url)
                pi.setex(actual_key, cache_time, json_result)

            pi.setex(result_key, cache_time, json_result)

            pi.rpush(wait_key, 1)
            pi.expire(wait_key, cache_time)


if __name__ == "__main__":
    if len(sys.argv) < 2:
        browser = 'chrome'
    else:
        browser = sys.argv[1]

    init(browser)
--------------------------------------------------------------------------------
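Editor's note: the Redis protocol between `app.py` and the workers can be exercised by hand, which is useful for debugging. A minimal producer sketch, assuming a local Redis and the key layout defined above (`q:urls:<browser>`, plus the `w:` and `r:` keys):

```python
# Sketch of the queue protocol that run() consumes, mirroring what app.py
# does: enqueue a JSON job, block on the wait key, then read the cached
# result. Assumes Redis on localhost and a running chrome worker.
import json
from redis import StrictRedis

rc = StrictRedis.from_url('redis://localhost/')

url = 'http://example.com/'
rc.rpush('q:urls:chrome', json.dumps({'archive': 'ia-save', 'url': url}))

# the worker signals completion by pushing to the wait key
rc.blpop('w:chrome:ia-save:' + url, 30)

result = rc.get('r:chrome:ia-save:' + url)
print(json.loads(result) if result else 'still queued')
```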
| """ Init the application and add routes """ 24 | 25 | logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s', 26 | level=logging.DEBUG) 27 | 28 | global theconfig 29 | theconfig = get_config() 30 | 31 | global rc 32 | rc = init_redis(theconfig) 33 | 34 | app = default_app() 35 | 36 | return app 37 | 38 | 39 | 40 | @route(['/', '/index.html', '/index.htm']) 41 | @view('index') 42 | def home(): 43 | return {'archives': theconfig['archives'], 44 | 'default_archive': theconfig.get('default_archive')} 45 | 46 | 47 | 48 | def get_params(): 49 | url = request.query.get('url') 50 | 51 | archive = request.query.get('archive') 52 | 53 | browser_type = request.query.get('browser', 'chrome') 54 | 55 | if not url: 56 | raise HTTPError(status=400, body='No url= specified') 57 | 58 | if archive not in theconfig['archives']: 59 | raise HTTPError(status=400, body='No archive {0}'.format(archive)) 60 | 61 | if not url.startswith(('http://', 'https://')): 62 | url = 'http://' + url 63 | 64 | return browser_type, archive, url 65 | 66 | 67 | @route('/archivepage') 68 | def archive_page(): 69 | browser_type, archive, url = get_params() 70 | 71 | response_key = get_cache_key(archive, browser_type, url) 72 | wait_key = get_wait_key(archive, browser_type, url) 73 | 74 | queue_key = get_queue_key(browser_type) 75 | 76 | result = None 77 | 78 | if not rc.exists(response_key): 79 | cmd = dict(request.query) 80 | cmd['url'] = url 81 | 82 | num = rc.incr('total_urls:' + browser_type) 83 | cmd['num'] = num 84 | 85 | cmd = json.dumps(cmd) 86 | 87 | with pipeline(rc) as pi: 88 | waiting_str = {'archived': False, 89 | 'queued': True, 90 | 'num': num} 91 | 92 | pi.set(response_key, json.dumps(waiting_str)) 93 | pi.rpush(queue_key, cmd) 94 | 95 | rc.blpop(wait_key, theconfig['wait_timeout_secs']) 96 | 97 | result = rc.get(response_key) 98 | 99 | if result: 100 | result = json.loads(result) 101 | 102 | if 'queued' in result: 103 | result['queue_pos'] = 0 104 | front = rc.lindex(queue_key, 0) 105 | if front: 106 | front = json.loads(front) 107 | front_num = front.get('num', 0) 108 | 109 | # pos == 1 implies this url is next up 110 | # pos <= 0 implies this url was removed from queue and is being processed 111 | pos = result['num'] - front_num + 1 112 | result['queue_pos'] = pos 113 | else: 114 | result['ttl'] = rc.ttl(response_key) 115 | else: 116 | result = ERROR_RESP 117 | 118 | return result 119 | 120 | 121 | @route('/download') 122 | def download(): 123 | browser_type, archive, url = get_params() 124 | 125 | response_key = get_cache_key(archive, browser_type, url) 126 | 127 | result = rc.get(response_key) 128 | if not result: 129 | raise HTTPError(status=404, body='Url Not Archived') 130 | 131 | result = json.loads(result) 132 | if not 'download_url' in result: 133 | raise HTTPError(status=404, body='Download Not Available') 134 | 135 | headers = {} 136 | session = result.get('download_session') 137 | 138 | if session: 139 | headers['Cookie'] = session 140 | 141 | r = requests.get(result['download_url'], 142 | headers=headers, 143 | stream=True) 144 | 145 | if r.status_code != 200: 146 | raise HTTPError(status=400, body='Invalid Download Result: {0} {1}'.format(r.status_code, r.reason)) 147 | 148 | pass_headers = ('Content-Disposition', 'Content-Length', 'Content-Type') 149 | 150 | for h in pass_headers: 151 | response.set_header(h, r.headers.get(h)) 152 | 153 | response.body = r.iter_content() 154 | return response 155 | 156 | 157 | application = init() 158 | 
/web/handlers.py:
--------------------------------------------------------------------------------
from selenium.common.exceptions import NoSuchElementException
from datetime import datetime
from urllib import urlencode


# ============================================================================
class PrefixHandler(object):
    def __init__(self, prefix, desc='Url Prefix Archiving Handler'):
        self.prefix = prefix
        self.desc = desc

    def __call__(self, browser, url):
        log_results = browser.visit(self.prefix + url)

        try:
            error = self.get_error(log_results, browser, url)
        except NoSuchElementException:
            # no error
            error = None
        except Exception as e:
            error = {'msg': str(e)}

        results = {'time': str(datetime.utcnow())}

        if error:
            results['error'] = error
            results['archived'] = False
        else:
            results['archived'] = True
            results['actual_url'] = self.get_actual_url(browser)
            self.set_success_results(browser, url, results)

        results['browser_url'] = self.get_browser_url(browser)

        for n in list(log_results.keys()):
            if not self.is_archived_url(n):
                del log_results[n]

        results['log'] = log_results

        return results

    def set_success_results(self, browser, url, results):
        pass

    def get_error(self, log_results, browser, url):
        return None

    def is_archived_url(self, url):
        return url.startswith(self.prefix)

    def get_desc(self):
        return self.desc

    def get_browser_url(self, browser):
        try:
            return browser.driver.current_url
        except:
            return ''

    def get_actual_url(self, browser):
        url = self.get_browser_url(browser)
        try:
            inx = url[1:].index('/http')
            url = url[inx + 2:]
        except:
            pass

        return url


# ============================================================================
class SavePageNowHandler(PrefixHandler):
    BLOCKED_MSGS = ('Sorry.', 'Page cannot be crawled or displayed due to robots.txt.')

    def __init__(self, prefix='https://web.archive.org/save/',
                 desc='Internet Archive Save Page Now Archiving'):
        super(SavePageNowHandler, self).__init__(prefix, desc)

    def set_success_results(self, browser, url, results):
        # not exact but close enough
        results['replay_url'] = 'https://web.archive.org/web/' + url

    def get_error(self, log_results, browser, url):
        err_text = browser.driver.find_element_by_css_selector("div#positionHome #error h2").text
        info = err_text + ' ' + browser.driver.find_element_by_css_selector("div#positionHome #error p").text

        if err_text in self.BLOCKED_MSGS:
            return {'msg': info, 'type': 'blocked'}
        else:
            return {'msg': info}


# ============================================================================
class WebRecorderHandler(PrefixHandler):
    def __init__(self, prefix='https://webrecorder.io/record/',
                 desc='webrecorder.io Archiving'):
        super(WebRecorderHandler, self).__init__(prefix, desc)

    def get_error(self, log_results, browser, url):
        browser.driver.switch_to.frame('iframe')
        err_elem = browser.driver.find_element_by_css_selector('div.webrec-error div.page-header span.h2')
        if err_elem.text == 'WebRecorder.io error':
            try:
                msg = browser.driver.find_element_by_css_selector('div.webrec-error p.h4').text
                if 'Name or service not known' in msg:
                    msg = 'This url could not be reached'
            except:
                msg = 'unknown'

            return {'msg': msg}

        return None

    def set_success_results(self, browser, url, results):
        cookie = browser.driver.get_cookie('webrecorder.session')

        if cookie:
            query = urlencode({'url': url, 'sesh': cookie['value']})
            #results['download_session'] = cookie['name'] + '=' + cookie['value']
            results['download_url'] = 'https://webrecorder.io/cmd/sesh_download?' + query
            results['replay_url'] = 'https://webrecorder.io/cmd/setsesh?' + query

        return results

    def is_archived_url(self, url):
        if url.startswith(self.prefix) and '_/' in url:
            return True

        return False
--------------------------------------------------------------------------------
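Editor's note: supporting a new backend mostly means subclassing `PrefixHandler` and overriding its hooks. A hypothetical skeleton — the prefix URL, CSS selector, and replay URL are invented for illustration, not a real service's markup:

```python
# Hypothetical handler for a new prefix-based archiving service.
class ExampleArchiveHandler(PrefixHandler):
    def __init__(self, prefix='https://archiver.example.com/record/',
                 desc='Example Archiver'):
        super(ExampleArchiveHandler, self).__init__(prefix, desc)

    def get_error(self, log_results, browser, url):
        # If the selector is absent, NoSuchElementException is raised and
        # PrefixHandler.__call__ treats that as "no error".
        err_elem = browser.driver.find_element_by_css_selector('div.error-box')
        return {'msg': err_elem.text}

    def set_success_results(self, browser, url, results):
        # advertise where the archived copy can be replayed
        results['replay_url'] = 'https://archiver.example.com/replay/' + url
```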
/README.md:
--------------------------------------------------------------------------------
## Note: This repository is obsolete and represents an original attempt at browser automation.
## Please see the new Browsertrix at [webrecorder/browsertrix](https://github.com/webrecorder/browsertrix)

## Browsertrix 0.1.1

Browsertrix is a web archiving automation system, designed to create high-fidelity web archives
by automating real browsers running in containers (Docker) using Selenium and other automation tools.
The system does not currently do any archiving of its own, but automates page loading through existing archiving
and recording tools.

By loading pages directly through a browser, it will be possible to fully recreate a page as the user experiences it, including all dynamic content
and interaction.

Browsertrix is named after Heritrix, the venerable web crawler technology which has become a standard for web archiving.

## What Browsertrix Does

The first iteration of Browsertrix supports archiving a single web page through an existing archiving back-end.

Urls can be submitted to Browsertrix via HTTP, and it will attempt to load each url in an available browser right away.
Browsertrix can operate synchronously or asynchronously. If the operation does not complete within the specified timeout
(default 30 secs), a `queued` response is returned and the user may retry the operation to get the result at a later time.
The results of the archiving operation are cached (for 10 mins if successful, for 10 secs otherwise) so that future requests will return the cached result.

Redis is used to queue urls for archiving and to cache results of the archiving operation. Configurable options
are currently available in the `config.py` module.

Additional automated browser "crawling" and multi-url features are planned for the next iteration.


### Installation

Docker and Docker Compose are the only requirements for running Browsertrix.

Install Docker as recommended at: https://docs.docker.com/installation/

Install Docker Compose with: `pip install docker-compose`

After cloning this repository, run `docker-compose up`

### Web Interface

In this version, a basic 'Archive This Website' UI is available on the home page and provides a form to submit urls
to be archived through Chrome or Firefox. The interface wraps the Archiving API explained below.

The supported backends are https://webrecorder.io/ and the Internet Archive's Save Page Now feature.

The UI is available at `http://$DOCKER_HOST:8080/`, where `DOCKER_HOST` is the host where Docker is running.


### Scaling Workers

By default, Browsertrix starts with one Chrome and one Firefox worker. `docker-compose scale` can be used
to set the number of workers as needed.

The `set-scale.sh` script is provided as a convenience to resize the number of workers, resizing both
the Chrome and Firefox workers. For example, to have 4 of each browser, you can run:

`./set-scale.sh 4`


### Archiving API `/archivepage`

This first iteration of Browsertrix provides an API endpoint at `/archivepage` for archiving a single page.

To archive a url, make a GET request to `http://<host>/archivepage?url=URL&archive=ARCHIVE[&browser=BROWSER]`

* `url` - The URL to be archived

* `archive` - One of the available archives specified in `config.py`. Current archives are `ia-save` and `webrecorder`

* `browser` - (Optional) Currently either `chrome` or `firefox`. Chrome is the default if omitted.

### Results

The result of the archiving operation is a JSON block. The block contains one of the following.

* `error` is set if archiving failed, and its `msg` field contains more details about the error.
The `type` field indicates a specific type of error, eg: `type: blocked` currently indicates the archiving service can not
archive this page.

* `queued: true` is set if the timeout for archiving the page (currently 30 secs) has been exceeded. If this is the case, the url has been put on a queue and the query should be retried until the page is archived. The `queue_pos` field indicates the position in the queue, where `queue_pos: 1` means the url is up next and `queue_pos: 0` means the url is currently being loaded in the browser.

* `archived: true` is set if the archiving of the page has fully finished. The following additional properties may be set in the JSON result:

  - `replay_url` - if the archived page is immediately available for replay, this is the url to access the archived content.

  - `download_url` - if the archived content is available for download as a WARC file, this is the link to the WARC.

  - `actual_url` - if the original url caused a redirect, this will contain the actual url that was archived (only present if different from the original).

  - `browser_url` - The actual url loaded by the browser to "seed" the archive.

  - `time` - Timestamp of when the page was archived.

  - `ttl` - time remaining (in seconds) for this entry to be stored in the cache. After the entry expires, a subsequent query will re-archive the page. Default is 10 min (600 secs) and can be configured in `config.py`.

  - `log` - HTTP response log from the browser, available only in Chrome. The format is `{url: {"status": code}}` for each url loaded to archive the current page.


### Support

Initial work on this project was sponsored by the [Hypothes.is Annotation Fund](http://anno.fund/#portfolioModal2)
--------------------------------------------------------------------------------
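Editor's note: as a concrete illustration of the API described in the README, a minimal client sketch using `requests` (which is already in `web/requirements.txt`). The host/port are an assumption based on the docker-compose port mapping:

```python
# Illustrative client for the /archivepage endpoint. The printed fields
# mirror the Results section of the README; actual responses depend on the
# archive backend.
import requests

resp = requests.get('http://localhost:8080/archivepage',
                    params={'url': 'example.com',
                            'archive': 'ia-save',
                            'browser': 'chrome'})
result = resp.json()

if result.get('archived'):
    print('replay at: ' + result.get('replay_url', ''))
elif result.get('queued'):
    print('queued at position {0}'.format(result.get('queue_pos')))
else:
    print('error: ' + result.get('error', {}).get('msg', 'unknown'))
```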
/web/views/index.html:
--------------------------------------------------------------------------------
[The markup of this Bottle template was stripped during extraction; only its visible text survives. The page renders a "Fork me on GitHub" ribbon, an "Archive This Website" form ("Enter URL, Create Web Archive") with an archive selector built from `archives.keys()` ("Archive using:"), status blocks for errors ("Sorry, an error has occurred", "Reason:"), queued urls ("The url is queued and will be recorded soon", "Position in Queue:"), and success ("Archived!", "total url(s) were archived"), plus a "Powered by Browsertrix" footer.]
--------------------------------------------------------------------------------