├── data
    └── .gitignore
├── web
    ├── requirements.txt
    ├── Dockerfile
    ├── uwsgi.ini
    ├── config.py
    ├── browser.py
    ├── worker.py
    ├── app.py
    ├── handlers.py
    └── views
    │   └── index.html
├── set-scale.sh
├── browser-chrome
    ├── register.sh
    └── Dockerfile
├── browser-firefox
    ├── register.sh
    └── Dockerfile
├── .gitignore
├── LICENSE
├── docker-compose.yml
└── README.md


/data/.gitignore:
--------------------------------------------------------------------------------
1 | # Ignore everything in this directory
2 | *
3 | # Except this file
4 | !.gitignore
5 | 


--------------------------------------------------------------------------------
/web/requirements.txt:
--------------------------------------------------------------------------------
1 | bottle
2 | selenium
3 | redis
4 | hiredis
5 | uwsgi
6 | gevent
7 | requests
8 | 


--------------------------------------------------------------------------------
/set-scale.sh:
--------------------------------------------------------------------------------
1 | docker-compose scale chrome=$1 firefox=$1
2 | docker-compose scale workerchrome=$1 workerfirefox=$1
3 | 
4 | 


--------------------------------------------------------------------------------
/browser-chrome/register.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | ip=$(head -n 1 /etc/hosts | cut -f 1)
4 | redis-cli -h redis_1 lpush $NODE_KEY $ip
5 | 
6 | bash /opt/bin/entry_point.sh
7 | 


--------------------------------------------------------------------------------
/browser-firefox/register.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | ip=$(head -n 1 /etc/hosts | cut -f 1)
4 | redis-cli -h redis_1 lpush $NODE_KEY $ip
5 | 
6 | bash /opt/bin/entry_point.sh
7 | 


--------------------------------------------------------------------------------
/web/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:2.7
 2 | 
 3 | WORKDIR /code
 4 | 
 5 | ADD requirements.txt /code/
 6 | 
 7 | RUN pip install -r requirements.txt
 8 | 
 9 | ADD . /code
10 | 


--------------------------------------------------------------------------------
/browser-chrome/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM selenium/standalone-chrome
 2 | 
 3 | USER root
 4 | RUN apt-get update && apt-get install -y redis-tools
 5 | 
 6 | WORKDIR /reg
 7 | 
 8 | ADD register.sh /reg/
 9 | 
10 | RUN chmod +x /reg/register.sh
11 | 
12 | USER seluser
13 | 
14 | CMD /reg/register.sh
15 | 
16 | 


--------------------------------------------------------------------------------
/browser-firefox/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM selenium/standalone-firefox
 2 | 
 3 | USER root
 4 | RUN apt-get update && apt-get install -y redis-tools
 5 | 
 6 | WORKDIR /reg
 7 | 
 8 | ADD register.sh /reg/
 9 | 
10 | RUN chmod +x /reg/register.sh
11 | 
12 | USER seluser
13 | 
14 | CMD /reg/register.sh
15 | 
16 | 


--------------------------------------------------------------------------------
/web/uwsgi.ini:
--------------------------------------------------------------------------------
 1 | [uwsgi]
 2 | if-not-env = PORT
 3 | http-socket = :8080
 4 | endif =
 5 | 
 6 | master = true
 7 | buffer-size = 65536
 8 | die-on-term = true
 9 | 
10 | if-env = VIRTUAL_ENV
11 | venv = $(VIRTUAL_ENV)
12 | endif =
13 | 
14 | gevent = 1000
15 | gevent-early-monkey-patch =
16 | #processes = 10
17 | 
18 | file = app.py
19 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.py[cod]
 2 | 
 3 | # C extensions
 4 | *.so
 5 | 
 6 | # Packages
 7 | *.egg
 8 | *.egg-info
 9 | dist
10 | build
11 | eggs
12 | parts
13 | bin
14 | var
15 | sdist
16 | develop-eggs
17 | .installed.cfg
18 | lib
19 | lib64
20 | __pycache__
21 | 
22 | # Installer logs
23 | pip-log.txt
24 | 
25 | # Unit test / coverage reports
26 | .coverage
27 | .tox
28 | nosetests.xml
29 | 
30 | # Translations
31 | *.mo
32 | 
33 | # Mr Developer
34 | .mr.developer.cfg
35 | .project
36 | .pydevproject
37 | 
38 | .vagrant
39 | 


--------------------------------------------------------------------------------
/web/config.py:
--------------------------------------------------------------------------------
 1 | from handlers import WebRecorderHandler, SavePageNowHandler
 2 | from collections import OrderedDict
 3 | 
 4 | def get_config():
 5 |     config = {}
 6 | 
 7 |     archives = OrderedDict()
 8 |     archives['webrecorder'] = WebRecorderHandler()
 9 | #    archives['test'] = WebRecorderHandler('https://webrecorder.io/preview/', desc='Preview with <a href="https://webrecorder.io">webrecorder.io</a> (Not Recording)')
10 |     archives['ia-save'] = SavePageNowHandler()
11 | 
12 |     config['archives'] = archives
13 | 
14 |     config['default_archive'] = 'webrecorder'
15 | 
16 |     config['redis_url'] = 'redis://redis_1/'
17 |     config['chrome_url_log'] = True
18 | 
19 |     config['archive_cache_secs'] = 600
20 |     config['err_cache_secs'] = 10
21 | 
22 |     config['wait_timeout_secs'] = 30
23 |     return config
24 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2015 Ilya Kreymer
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | # Redis
 2 | redis:
 3 |   restart: always
 4 |   image: redis:latest
 5 |   ports:
 6 |     - "6379:6379"
 7 |   
 8 |   volumes:
 9 |     - ./data/:/data/
10 | 
11 | 
12 | # Selenium Chrome Web Driver
13 | chrome:
14 |   build: ./browser-chrome
15 |   restart: always
16 |   links:
17 |     - redis:redis
18 | 
19 |   environment:
20 |     - NODE_KEY=nodes-chrome
21 | 
22 |   # see https://github.com/SeleniumHQ/docker-selenium/issues/79
23 |   volumes:
24 |     - /dev/shm:/dev/shm
25 | 
26 | # Selenium FF Web Driver
27 | firefox:
28 |   build: ./browser-firefox
29 |   restart: always
30 |   links:
31 |     - redis:redis
32 | 
33 |   environment:
34 |     - NODE_KEY=nodes-firefox
35 | 
36 | 
37 | # Worker for Chrome Browsers
38 | workerchrome:
39 |   restart: always
40 |   build: ./web
41 |   links:
42 |     - redis:redis
43 |     - chrome:browser
44 | 
45 |   environment:
46 |     - NODE_KEY=nodes-chrome
47 | 
48 |   command: python worker.py chrome
49 | 
50 | 
51 | # Worker for FF Browsers
52 | workerfirefox:
53 |   restart: always
54 |   build: ./web
55 |   links:
56 |     - redis:redis
57 |     - firefox:browser
58 | 
59 |   environment:
60 |     - NODE_KEY=nodes-firefox
61 | 
62 |   command: python worker.py firefox
63 | 
64 | 
65 | # Web App
66 | web:
67 |   restart: always
68 |   build: ./web
69 | 
70 |   command: uwsgi uwsgi.ini
71 | 
72 |   links:
73 |     - redis:redis
74 | 
75 |   ports:
76 |     - "8080:8080"
77 | 
78 | 


--------------------------------------------------------------------------------
/web/browser.py:
--------------------------------------------------------------------------------
  1 | from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  2 | from selenium.webdriver import Chrome, Remote, Firefox
  3 | 
  4 | from selenium.webdriver.chrome.options import Options
  5 | from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
  6 | 
  7 | import json
  8 | import logging
  9 | 
 10 | 
 11 | # ============================================================================
 12 | class Browser(object):
 13 |     def __init__(self, host_name=None, readlog=False):
 14 |         self.readlog = readlog
 15 |         self.host_name = host_name
 16 | 
 17 |         self.caps = self._init_caps()
 18 |         self._init_driver()
 19 | 
 20 |     def _init_local(self):
 21 |         raise NotImplemented()
 22 | 
 23 |     def _init_driver(self):
 24 |         self.driver = None
 25 | 
 26 |         if not self.host_name:
 27 |             self.driver = self._init_local()
 28 |             return
 29 | 
 30 |         while True:
 31 |             try:
 32 |                 self.driver = Remote(command_executor='http://{0}:4444/wd/hub'.format(self.host_name),
 33 |                                      desired_capabilities=self.caps)
 34 |                 break
 35 |             except:
 36 |                 import traceback
 37 |                 traceback.print_exc()
 38 |                 print('RETRY CONN')
 39 | 
 40 |     def close(self):
 41 |         if self.driver:
 42 |             self.driver.quit()
 43 | 
 44 |     def visit(self, url):
 45 |         try:
 46 |             self.driver.get(url)
 47 |         except:
 48 |             self._init_driver()
 49 |             self.driver.get(url)
 50 | 
 51 |         results = {}
 52 |         return results
 53 | 
 54 | 
 55 | # ============================================================================
 56 | class ChromeBrowser(Browser):
 57 |     def _init_caps(self):
 58 |         caps = DesiredCapabilities.CHROME
 59 | 
 60 |         if self.readlog:
 61 |             caps['loggingPrefs'] = {'performance': 'ALL'}
 62 |             caps['chromeOptions'] = {'perfLoggingPrefs': {'enableTimeline': False, 'enablePage': False}}
 63 | 
 64 |         return caps
 65 | 
 66 |     def _init_local(self):
 67 |         return Chrome(chrome_options=Options(), desired_capabilities=self.caps)
 68 | 
 69 |     def visit(self, url):
 70 |         results = super(ChromeBrowser, self).visit(url)
 71 | 
 72 |         if not self.readlog:
 73 |             return results
 74 | 
 75 |         try:
 76 |             log = self.driver.get_log('performance')
 77 | 
 78 |         except Exception as e:
 79 |             import traceback
 80 |             traceback.print_exc()
 81 |             return results
 82 | 
 83 |         for entry in log:
 84 |             message = entry.get('message')
 85 |             try:
 86 |                 message = json.loads(message)
 87 |                 message = message['message']
 88 |                 if message['method'].startswith('Network'):
 89 |                     resp = message['params'].get('response')
 90 |                     if not resp:
 91 |                         continue
 92 | 
 93 |                     resp_url = resp.get('url', '')
 94 |                     if resp_url and resp_url.startswith('http'):
 95 |                         results[resp_url] = {'status': resp.get('status')}
 96 |             except:
 97 |                 continue
 98 | 
 99 |         return results
100 | 
101 | 
102 | # ============================================================================
103 | class FirefoxBrowser(Browser):
104 |     def _init_caps(self):
105 |         caps = DesiredCapabilities.FIREFOX
106 |         return caps
107 | 
108 |     def _init_local(self):
109 |         firefox_profile = FirefoxProfile()
110 |         firefox_profile.set_preference('extensions.logging.enabled', False)
111 |         firefox_profile.set_preference('network.dns.disableIPv6', False)
112 |         return Firefox(firefox_profile)
113 | 
114 | 
115 | # ============================================================================
116 | if __name__ == "__main__":
117 |     global browser
118 | 
119 |     import sys
120 | 
121 |     if len(sys.argv) <= 1 or sys.argv[1] != 'firefox':
122 |         browser = ChromeBrowser()
123 |     else:
124 |         browser = FirefoxBrowser()
125 | 


--------------------------------------------------------------------------------
/web/worker.py:
--------------------------------------------------------------------------------
  1 | from redis import StrictRedis
  2 | from redis.utils import pipeline
  3 | 
  4 | from browser import ChromeBrowser, FirefoxBrowser
  5 | 
  6 | import json
  7 | import sys
  8 | import logging
  9 | import socket
 10 | import os
 11 | 
 12 | from config import get_config
 13 | 
 14 | 
 15 | def get_avail_browser(config, rc, browser_type):
 16 |     key = os.environ['NODE_KEY']
 17 |     while True:
 18 |         try:
 19 |             host = rc.blpop(key, 10)
 20 |             if not host:
 21 |                 continue
 22 | 
 23 |             host = host[1]
 24 | 
 25 |             logging.debug('Got host ' + host)
 26 | 
 27 |             browser = create_browser(host, config, browser_type)
 28 |             logging.debug('Mapped to ' + host)
 29 |             return browser
 30 |         except Exception as e:
 31 |             logging.debug(e)
 32 |             logging.debug('Failed to map to ' + host)
 33 | 
 34 | 
 35 | 
 36 | def create_browser(host, config, browser_type):
 37 |     if browser_type == 'chrome':
 38 |         browser = ChromeBrowser(host, config.get('chrome_url_log', False))
 39 |     elif browser_type == 'firefox':
 40 |         browser = FirefoxBrowser(host, False)
 41 |     else:
 42 |         raise Exception('Invalid Browser Type: ' + str(browser_type))
 43 | 
 44 |     return browser
 45 | 
 46 | 
 47 | def get_cache_key(archive, browser_type, url):
 48 |     """ Return redis key for given url and cache"""
 49 |     return 'r:' + browser_type + ':' + archive + ':' + url
 50 | 
 51 | 
 52 | def get_wait_key(archive, browser_type, url):
 53 |     """ Redis key for pending operation"""
 54 |     return 'w:' + browser_type + ':' + archive + ':' + url
 55 | 
 56 | 
 57 | def get_queue_key(browser_type):
 58 |     return 'q:urls:' + browser_type
 59 | 
 60 | 
 61 | def init_redis(config):
 62 |     """ Init redis from config, with fallback to localhost
 63 |     """
 64 |     try:
 65 |         rc = StrictRedis.from_url(config['redis_url'])
 66 |         rc.ping()
 67 |     except:
 68 |         rc = StrictRedis.from_url('redis://localhost/')
 69 |         rc.ping()
 70 | 
 71 |     return rc
 72 | 
 73 | 
 74 | def init(browser_type):
 75 |     """ Initialize the uwsgi worker which will read urls to archive from redis queue
 76 |     and use associated web driver to connect to remote web browser
 77 |     """
 78 |     logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s',
 79 |                         level=logging.DEBUG)
 80 |     logging.debug('WebDriver Worker Started')
 81 | 
 82 |     config = get_config()
 83 | 
 84 |     archives = config['archives']
 85 | 
 86 |     rc = init_redis(config)
 87 | 
 88 |     browser = get_avail_browser(config, rc, browser_type)
 89 | 
 90 |     run(rc, browser, archives, config, browser_type)
 91 | 
 92 | 
 93 | def run(rc, browser, archives, config, browser_type):
 94 |     """ Read from redis queue in a loop and use associated web driver
 95 |     to load page on demand
 96 |     """
 97 |     url = None
 98 |     queue_key = get_queue_key(browser_type)
 99 |     logging.debug(queue_key)
100 | 
101 |     while True:
102 |         cmd = rc.blpop(queue_key, 10)
103 | 
104 |         if not cmd:
105 |             continue
106 | 
107 |         val= json.loads(cmd[1])
108 |         archive = val['archive']
109 |         url = val['url']
110 | 
111 |         result_key = get_cache_key(archive, browser_type, url)
112 |         wait_key = get_wait_key(archive, browser_type, url)
113 | 
114 |         try:
115 |             result = archives[archive](browser, url)
116 |             cache_time = config['archive_cache_secs']
117 |         except Exception as e:
118 |             import traceback
119 |             traceback.print_exc()
120 | 
121 |             result = {'archived': False, 'error': {'msg': str(e) }}
122 |             cache_time = config['err_cache_secs']
123 | 
124 |         json_result = json.dumps(result)
125 |         actual_url = result.get('actual_url')
126 | 
127 |         with pipeline(rc) as pi:
128 |             if actual_url and actual_url != url:
129 |                 actual_key = get_cache_key(archive, browser_type, actual_url)
130 |                 pi.setex(actual_key, cache_time, json_result)
131 | 
132 |             pi.setex(result_key, cache_time, json_result)
133 | 
134 |             pi.rpush(wait_key, 1)
135 |             pi.expire(wait_key, cache_time)
136 | 
137 | 
138 | if __name__ == "__main__":
139 |     import sys
140 |     if len(sys.argv) < 2:
141 |         browser = 'chrome'
142 |     else:
143 |         browser = sys.argv[1]
144 | 
145 |     init(browser)
146 | 
147 | 


--------------------------------------------------------------------------------
/web/app.py:
--------------------------------------------------------------------------------
  1 | from bottle import route, Route, request, default_app, view, HTTPError, response
  2 | 
  3 | from redis import StrictRedis
  4 | from redis.utils import pipeline
  5 | 
  6 | import json
  7 | import uwsgi
  8 | import os
  9 | import logging
 10 | import requests
 11 | 
 12 | from config import get_config
 13 | 
 14 | from worker import get_cache_key, get_wait_key, get_queue_key
 15 | from worker import init_redis
 16 | 
 17 | application = None
 18 | 
 19 | ERROR_RESP = {'archived': False, 'queued': False, 'error': {'msg': 'unknown'}}
 20 | 
 21 | 
 22 | def init():
 23 |     """ Init the application and add routes """
 24 | 
 25 |     logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s',
 26 |                         level=logging.DEBUG)
 27 | 
 28 |     global theconfig
 29 |     theconfig = get_config()
 30 | 
 31 |     global rc
 32 |     rc = init_redis(theconfig)
 33 | 
 34 |     app = default_app()
 35 | 
 36 |     return app
 37 | 
 38 | 
 39 | 
 40 | @route(['/', '/index.html', '/index.htm'])
 41 | @view('index')
 42 | def home():
 43 |     return {'archives': theconfig['archives'],
 44 |             'default_archive': theconfig.get('default_archive')}
 45 | 
 46 | 
 47 | 
 48 | def get_params():
 49 |     url = request.query.get('url')
 50 | 
 51 |     archive = request.query.get('archive')
 52 | 
 53 |     browser_type = request.query.get('browser', 'chrome')
 54 | 
 55 |     if not url:
 56 |         raise HTTPError(status=400, body='No url= specified')
 57 | 
 58 |     if archive not in theconfig['archives']:
 59 |         raise HTTPError(status=400, body='No archive {0}'.format(archive))
 60 | 
 61 |     if not url.startswith(('http://', 'https://')):
 62 |         url = 'http://' + url
 63 | 
 64 |     return browser_type, archive, url
 65 | 
 66 | 
 67 | @route('/archivepage')
 68 | def archive_page():
 69 |     browser_type, archive, url = get_params()
 70 | 
 71 |     response_key = get_cache_key(archive, browser_type, url)
 72 |     wait_key = get_wait_key(archive, browser_type, url)
 73 | 
 74 |     queue_key = get_queue_key(browser_type)
 75 | 
 76 |     result = None
 77 | 
 78 |     if not rc.exists(response_key):
 79 |         cmd = dict(request.query)
 80 |         cmd['url'] = url
 81 | 
 82 |         num = rc.incr('total_urls:' + browser_type)
 83 |         cmd['num'] = num
 84 | 
 85 |         cmd = json.dumps(cmd)
 86 | 
 87 |         with pipeline(rc) as pi:
 88 |             waiting_str = {'archived': False,
 89 |                            'queued': True,
 90 |                            'num': num}
 91 | 
 92 |             pi.set(response_key, json.dumps(waiting_str))
 93 |             pi.rpush(queue_key, cmd)
 94 | 
 95 |         rc.blpop(wait_key, theconfig['wait_timeout_secs'])
 96 | 
 97 |     result = rc.get(response_key)
 98 | 
 99 |     if result:
100 |         result = json.loads(result)
101 | 
102 |         if 'queued' in result:
103 |             result['queue_pos'] = 0
104 |             front = rc.lindex(queue_key, 0)
105 |             if front:
106 |                 front = json.loads(front)
107 |                 front_num = front.get('num', 0)
108 | 
109 |                 # pos == 1 implies this url is next up
110 |                 # pos <= 0 implies this url was removed from queue and is being processed
111 |                 pos = result['num'] - front_num + 1
112 |                 result['queue_pos'] = pos
113 |         else:
114 |             result['ttl'] = rc.ttl(response_key)
115 |     else:
116 |         result = ERROR_RESP
117 | 
118 |     return result
119 | 
120 | 
121 | @route('/download')
122 | def download():
123 |     browser_type, archive, url = get_params()
124 | 
125 |     response_key = get_cache_key(archive, browser_type, url)
126 | 
127 |     result = rc.get(response_key)
128 |     if not result:
129 |         raise HTTPError(status=404, body='Url Not Archived')
130 | 
131 |     result = json.loads(result)
132 |     if not 'download_url' in result:
133 |         raise HTTPError(status=404, body='Download Not Available')
134 | 
135 |     headers = {}
136 |     session = result.get('download_session')
137 | 
138 |     if session:
139 |         headers['Cookie'] = session
140 | 
141 |     r = requests.get(result['download_url'],
142 |                      headers=headers,
143 |                      stream=True)
144 | 
145 |     if r.status_code != 200:
146 |         raise HTTPError(status=400, body='Invalid Download Result: {0} {1}'.format(r.status_code, r.reason))
147 | 
148 |     pass_headers = ('Content-Disposition', 'Content-Length', 'Content-Type')
149 | 
150 |     for h in pass_headers:
151 |         response.set_header(h, r.headers.get(h))
152 | 
153 |     response.body = r.iter_content()
154 |     return response
155 | 
156 | 
157 | application = init()
158 | 


--------------------------------------------------------------------------------
/web/handlers.py:
--------------------------------------------------------------------------------
  1 | from selenium.common.exceptions import NoSuchElementException
  2 | from datetime import datetime
  3 | from urllib import urlencode
  4 | 
  5 | 
  6 | # ============================================================================
  7 | class PrefixHandler(object):
  8 |     def __init__(self, prefix, desc='Url Prefix Archiving Handler'):
  9 |         self.prefix = prefix
 10 |         self.desc = desc
 11 | 
 12 |     def __call__(self, browser, url):
 13 |         log_results = browser.visit(self.prefix + url)
 14 | 
 15 |         try:
 16 |             error = self.get_error(log_results, browser, url)
 17 |         except NoSuchElementException:
 18 |             # no error
 19 |             error = None
 20 |         except Exception as e:
 21 |             error = {'msg': str(e)}
 22 | 
 23 |         results = {'time': str(datetime.utcnow())}
 24 | 
 25 |         if error:
 26 |             results['error'] = error
 27 |             results['archived'] = False
 28 |         else:
 29 |             results['archived'] = True
 30 |             results['actual_url'] = self.get_actual_url(browser)
 31 |             self.set_success_results(browser, url, results)
 32 | 
 33 |         results['browser_url'] = self.get_browser_url(browser)
 34 | 
 35 |         for n in list(log_results.keys()):
 36 |             if not self.is_archived_url(n):
 37 |                 del log_results[n]
 38 | 
 39 |         results['log'] = log_results
 40 | 
 41 |         return results
 42 | 
 43 |     def set_success_results(self, browser, url, results):
 44 |         pass
 45 | 
 46 |     def get_error(self, log_results, browser, url):
 47 |         return None
 48 | 
 49 |     def is_archived_url(self, url):
 50 |         return url.startswith(self.prefix)
 51 | 
 52 |     def get_desc(self):
 53 |         return self.desc
 54 | 
 55 |     def get_browser_url(self, browser):
 56 |         try:
 57 |             return browser.driver.current_url
 58 |         except:
 59 |             return ''
 60 | 
 61 |     def get_actual_url(self, browser):
 62 |         url = self.get_browser_url(browser)
 63 |         try:
 64 |             inx = url[1:].index('/http')
 65 |             url = url[inx + 2:]
 66 |         except:
 67 |             pass
 68 | 
 69 |         return url
 70 | 
 71 | 
 72 | # ============================================================================
 73 | class SavePageNowHandler(PrefixHandler):
 74 |     BLOCKED_MSGS = ('Sorry.', 'Page cannot be crawled or displayed due to robots.txt.')
 75 | 
 76 |     def __init__(self, prefix='https://web.archive.org/save/',
 77 |                        desc='Internet Archive <a href="https://web.archive.org/web/">Save Page Now</a> Archiving'):
 78 |         super(SavePageNowHandler, self).__init__(prefix, desc)
 79 | 
 80 |     def set_success_results(self, browser, url, results):
 81 |         cookie = browser.driver.get_cookie('webrecorder.session')
 82 | 
 83 |         # not exact but close enough
 84 |         results['replay_url'] = 'https://web.archive.org/web/' + url
 85 | 
 86 |     def get_error(self, log_results, browser, url):
 87 |         err_text = browser.driver.find_element_by_css_selector("div#positionHome #error h2").text
 88 |         info = err_text + ' ' + browser.driver.find_element_by_css_selector("div#positionHome #error p").text
 89 | 
 90 |         if err_text in self.BLOCKED_MSGS:
 91 |             return {'msg': info, 'type': 'blocked'}
 92 |         else:
 93 |             return {'msg': info}
 94 | 
 95 |         return None
 96 | 
 97 | 
 98 | # ============================================================================
 99 | class WebRecorderHandler(PrefixHandler):
100 |     def __init__(self, prefix='https://webrecorder.io/record/',
101 |                        desc='<a href="https://webrecorder.io/">webrecorder.io</a> Archiving'):
102 |         super(WebRecorderHandler, self).__init__(prefix, desc)
103 | 
104 |     def get_error(self, log_results, browser, url):
105 |         browser.driver.switch_to.frame('iframe')
106 |         err_elem = browser.driver.find_element_by_css_selector('div.webrec-error div.page-header span.h2')
107 |         if err_elem.text == 'WebRecorder.io error':
108 |             try:
109 |                 msg = browser.driver.find_element_by_css_selector('div.webrec-error p.h4').text
110 |                 if 'Name or service not known' in msg:
111 |                     msg = 'This url could not be reached'
112 |             except:
113 |                 msg = 'unknown'
114 | 
115 |             return {'msg': msg}
116 | 
117 |         return None
118 | 
119 |     def set_success_results(self, browser, url, results):
120 |         cookie = browser.driver.get_cookie('webrecorder.session')
121 | 
122 |         if cookie:
123 |             query = urlencode({'url': url, 'sesh': cookie['value']})
124 |             #results['download_session'] = cookie['name'] + '=' + cookie['value']
125 |             results['download_url'] = 'https://webrecorder.io/cmd/sesh_download?' + query
126 |             results['replay_url'] = 'https://webrecorder.io/cmd/setsesh?' + query
127 | 
128 |         return results
129 | 
130 |     def is_archived_url(self, url):
131 |         if url.startswith(self.prefix) and '_/' in url:
132 |             return True
133 | 
134 |         return False
135 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | ## Note: This repository is obsolete and represents an original attempt at browser automation.
  2 | ## Please see the new Browsertrix at [webrecorder/browsertrix](https://github.com/webrecorder/browsertrix)
  3 | 
  4 | ## Browsertrix 0.1.1
  5 | 
  6 | Browsertrix is a web archiving automation system, desgined to create high-fidelity web archives
  7 | by automating real browsers running in containers (Docker) using Selenium and other automation tools.
  8 | The system does not currently do any archiving of its own, but automates browsing loading through existing archiving
  9 | and recording tools.
 10 | 
 11 | By loading pages directly through a browser, it will be possible to fully recreate a page as the user experiences it, including all dynamic content
 12 | and interaction.
 13 | 
 14 | Browsertrix is named after Heritrix, the venerable web crawler technology which has become a standard for web archiving.
 15 | 
 16 | ## What Browsertrix Does
 17 | 
 18 | The first iteration of Browsertrix supports archiving a single web page, through an existing archiving back-end.
 19 | 
 20 | Urls can be submitted to Browsertrix via HTTP and it will attempt to load the urls in an available browser right away.
 21 | Browsertrix can operate synchronously or asynchronously. If the operation does not complete within the specified timeout
 22 | (default 30 secs), a `queued` response is returned and the user may retry the operation to get the result at a later time.
 23 | The results of the archiving operation are cached (for 10 mins if successful, for 30 secs otherwise) so that future requests will return the cached result.
 24 | 
 25 | Redis is used to queue urls for archiving, and cache results for the archiving operation. Configurable options
 26 | are currently available in the `config.py` module.
 27 | 
 28 | Additional automated browser "crawling" and multi-url features are planned for the next iteration.
 29 | 
 30 | 
 31 | ### Installation
 32 | 
 33 | Docker and Docker Compose are the only requirements for running Browsertrix.
 34 | 
 35 | Install Docker as recommended at: https://docs.docker.com/installation/
 36 | 
 37 | Install Docker Compose with: `pip install docker-compose`
 38 | 
 39 | After cloning this repository, run `docker-compose up`
 40 | 
 41 | ### Web Interface
 42 | 
 43 | In this version, a basic 'Archive This Website' UI is available on the home page and provides a form to submit urls
 44 | to be archived through Chrome or Firefox. The interfaces wraps the Archiving API explained below.
 45 | 
 46 | The supported backends are https://webrecorder.io/ and IA Save Page Now feature.
 47 | 
 48 | `http://$DOCKER_HOST/` where `DOCKER_HOST` is the host where Docker is running.
 49 | 
 50 | 
 51 | ### Scaling Workers
 52 | 
 53 | By default, Browsertrix starts with one Chrome and one Firefox worker. `docker-compose scale` can be used
 54 | to set the number of workers as needed.
 55 | 
 56 | The `set-scale.sh` script is provided as a convenience to resize the number of workers, resizing both
 57 | the Chrome and Firefox workers. For example, to have 4 of each browser, you can run:
 58 | 
 59 | `./set-scale.sh 4`
 60 | 
 61 | 
 62 | ### Archiving API `/archivepage`
 63 | 
 64 | This first iteration of Browsertrix provides an api endpoint at the `/archivepage` endpoint for archiving a single page.
 65 | 
 66 | To archive a url, a GET request can be made to `http://<DOCKER HOST>/archivepage?url=URL&archive=ARCHIVE[&browser=browser]`
 67 | 
 68 | * `url` - The URL to be archived
 69 | 
 70 | * `archive` - One of the available archives specified in `config.py`. Current archives are `ia-save` and `webrecorder`
 71 | 
 72 | * `browser` - (Optional) Currently either `chrome` or `firefox`. Chrome is the default if omitted.
 73 | 
 74 | ### Results
 75 | 
 76 | The result of the archiving operation is a JSON block. The block contains one of the following.
 77 | 
 78 | * `error: true` is set and `msg` field contains more details about the error.
 79 |   The `type` field indicates a specific type of error, eg: `type: blocked` currently indicates the archiving service can not
 80 |   archive this page.
 81 | 
 82 | * `queued: true` is the timeout for archiving the page (currently 30 secs) has been exceeded. If this is the case, the url has been put on a queue and the query should be retried until the page is archived. `queue-pos` field indicates the position in the queue, with `queue-pos: 1` means the url is up next, and `queue-pos: 0` means the url is currently being loaded in the browser.
 83 | 
 84 | * `archived: true` is set if the archiving of the page has fully finished. The following additional properties may be set in the JSON result:
 85 | 
 86 |    - `replay_url` - if the archived page is immediately available for replay, this is the url to access the archived content.
 87 |   
 88 |    - `download_url` - if the archived content is available for download as a WARC file, this is the link to the WARC.
 89 |    
 90 |    - `actual_url` - if the original url caused a redirect, this will contain the actual url that was archived (only present if different from original).
 91 |     
 92 |    - `browser_url` - The actual url loaded by the browser to "seed" the archive.
 93 |     
 94 |    - `time` - Timestamp of when the page was archived.
 95 |    
 96 |    - `ttl` - time remaining (in seconds) for this entry to be stored in the cache. After the entry expires, a subsequent query will re-archive the page. Default is 10 min (600 secs) and can be configured in `config.py`
 97 |    
 98 |    - `log` HTTP response log from the browser, available only in Chrome. The format is `{<URL>: <STATUS>}` for each url loaded to archive the current page.
 99 | 
100 | 
101 | 
102 | #### Support
103 | 
104 | Initial work on this project was sponsored by the [Hypothes.is Annotation Fund](http://anno.fund/#portfolioModal2)
105 | 


--------------------------------------------------------------------------------
/web/views/index.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <html>
  3 |     <head>
  4 |         <style>
  5 |             @import url("//d2v52k3cl9vedd.cloudfront.net/basscss/6.1.6/basscss.min.css");
  6 |             body { margin-top: 5%; margin-left: 0px; font-family: "myriad-pro", sans-serif; }
  7 | 
  8 |             .error {
  9 |                 font-weight: bold;
 10 |                 display: none;
 11 |             }
 12 | 
 13 |             .queued {
 14 |                 font-weight: bold;
 15 |                 display: none;
 16 |             }
 17 | 
 18 |             .archived {
 19 |                 font-weight: bold;
 20 |                 display: none;
 21 |             }
 22 | 
 23 |             #showresponse {
 24 |                 font-variant: small-caps;
 25 |                 display: none;
 26 |                 color: black;
 27 |             }
 28 | 
 29 |             #response {
 30 |                 font-family: monospace;
 31 |                 display: none;
 32 |                 min-width: 100%;
 33 |                 min-height: 400px;
 34 |             }
 35 |         </style>
 36 |         <script src="//code.jquery.com/jquery-2.1.4.min.js"></script>
 37 |         <script>
 38 |             $(function() {
 39 |                 $("#archivethispage").submit(function(e) {
 40 |                     e.preventDefault();
 41 |                     var query = $(this).serialize();
 42 | 
 43 |                     var url = $("#url").val();
 44 | 
 45 |                     if (!url) {
 46 |                         return false;
 47 |                     }
 48 | 
 49 |                     //$("#loading").show();
 50 | 
 51 |                     $("#submit").prop("disabled", true);
 52 |                     $("#submit").text("Archiving, Please Wait...");
 53 | 
 54 |                     $(".error").hide();
 55 |                     $(".queued").hide();
 56 |                     $(".archived").hide();
 57 | 
 58 |                     $("#response").hide();
 59 |                     $("#showresponse").hide();
 60 | 
 61 |                     $.getJSON("/archivepage?" + query, function(data) {
 62 |                         //$("#loading").hide();
 63 | 
 64 |                         $("#submit").prop("disabled", false);
 65 |                         $("#submit").text("Archive This Website!");
 66 | 
 67 |                         $("#showresponse").show();
 68 | 
 69 |                         if (data) {
 70 |                             handle_results(data, query);
 71 |                         }
 72 |                     });
 73 |                 });
 74 | 
 75 |                 $("#showresponse").click(function() {
 76 |                     $("#response").toggle();
 77 |                 });
 78 |             });
 79 | 
 80 |             function handle_results(results, query) {
 81 |                 $("#response").val(JSON.stringify(results, null, 2));
 82 | 
 83 |                 if (results.error) {
 84 |                     if (results.error.type == "blocked") {
 85 |                         $(".error #type").text(" -- Page Blocked");
 86 |                     }
 87 |                     $(".error #reason").text(results.error.msg);
 88 |                     $(".error").show();
 89 |                     return;
 90 |                 }
 91 | 
 92 |                 if (results.queued) {
 93 |                     var pos = results.queue_pos;
 94 |                     if (pos != undefined) {
 95 |                         if (pos <= 0) {
 96 |                             pos = "0 (Page is currently being archived)";
 97 |                         }
 98 |                         $(".queued #pos").text(pos);
 99 |                         $(".queued").show();
100 |                     }
101 |                     return;
102 |                 }
103 | 
104 |                 if (results.archived) {
105 |                     var num = 0;
106 |                     
107 |                     if (results.log) {
108 |                         num = Object.keys(results.log).length;
109 |                     }
110 |                         
111 |                     if (num > 0) {
112 |                         $(".archived #num").text(num);
113 |                         $(".archived #log-count").show();
114 |                     } else {
115 |                         $(".archived #log-count").hide();
116 |                     }
117 | 
118 |                     if (results.download_url) {
119 |                         $(".archived #download a").attr("href", results.download_url);
120 |                         $(".archived #download").show();
121 |                     } else {
122 |                         $(".archived #download").hide();
123 |                     }
124 | 
125 |                     if (results.replay_url) {
126 |                         $(".archived #replay a").attr("href", results.replay_url);
127 |                         $(".archived #replay").show();
128 |                     } else {
129 |                         $(".archived #replay").hide();
130 |                     }
131 | 
132 |                     $(".archived").show();
133 |                 }
134 |             }
135 |         </script>
136 |     </head>
137 |     <body>
138 |         <a href="https://github.com/ikreymer/browsertrix"><img style="position: absolute; top: 0; right: 0; border: 0;" src="https://camo.githubusercontent.com/38ef81f8aca64bb9a64448d0d70f1308ef5341ab/68747470733a2f2f73332e616d617a6f6e6177732e636f6d2f6769746875622f726962626f6e732f666f726b6d655f72696768745f6461726b626c75655f3132313632312e706e67" alt="Fork me on GitHub" data-canonical-src="https://s3.amazonaws.com/github/ribbons/forkme_right_darkblue_121621.png"></a>
139 | 
140 |         <div class="sm-col-12 full-width block">
141 |             <h2 class="mx-auto sm-col-4">Archive This Website</h2>
142 |             <p class="sm-col-4 mx-auto">Enter URL, Create Web Archive</p>
143 |             <form id="archivethispage" class="sm-col-4 h5 mx-auto mb2" method="get" action="archivepage">
144 |                 <input type="text" name="url" id="url" class="block full-width mb2 field-light" placeholder="Enter url to archive here, eg: http://example.com/"/>  
145 |                 % for name in archives.keys():
146 |                 <label for="{{name}}" class="block full-width">
147 |                     <input type="radio" name="archive" value="{{name}}" class="" {{! 'checked="checked"' if name == default_archive else '' }} required="true"/>
148 |                     <span>{{!archives[name].get_desc()}}</span>
149 |                 </label>
150 |                 % end
151 | 
152 |                 <span class="sm-col-12 mt2 full-width block">
153 |                     <span>Archive using:</span>
154 |                     <label for="chrome" class="">
155 |                         <input type="radio" name="browser" value="chrome" checked="checked"/>Chrome
156 |                     </label>
157 | 
158 |                     <label for="chrome" class="">
159 |                         <input type="radio" name="browser" value="firefox"/>Firefox
160 |                     </label>
161 | 
162 |                 </span>
163 | 
164 |                 <button id="submit" class="button mt2" type="submit">Archive This Website!</button>
165 |             </form>
166 | 
167 |             <div class="sm-col-4 mx-auto" id="loading" style="display: none">
168 |                 <h3>Archiving...</h3>
169 |             </div>
170 | 
171 |             <div class="sm-col-4 mx-auto error red">
172 |                 <div class="block full-width">Sorry, an error has occured<span id="type"></span></div>
173 |                 <div class="reason block full-width">Reason: <span id="reason"></span></div>
174 |             </div>
175 | 
176 |             <div class="sm-col-4 mx-auto queued orange">
177 |                 <div class="block full-width">The url is queued and will be recorded soon</div>
178 |                 <div class="block full-width">Position in Queue: <span id="pos"></span></div>
179 |             </div>
180 | 
181 |             <div class="sm-col-4 mx-auto archived olive">
182 |                 <div class="block full-width">Archived!</div>
183 |                 <div id="log-count" class="block full-width"><span id="num"></span>&nbsp;total url(s) were archived</div>
184 |                 <div id="replay" class="block full-width"><a href="#">View Archived Page</a></div>
185 |                 <div id="download" class="block full-width"><a href="#">Download Archive (WARC)</a></div>
186 |             </div>
187 | 
188 |             <div class="sm-col-4 mx-auto mt2">
189 |                 <a id="showresponse" href="#">show full json response</a>
190 |             </div>
191 |         </div>
192 | 
193 |         <div class="sm-col-10 full-width mx-auto block">
194 |             <textarea id="response"></textarea>
195 | 
196 |             <div style="margin-top: 200px">Powered by <a href="https://github.com/ikreymer/browsertrix">Browsertrix</a></div>
197 |         </div>
198 | 
199 |     </body>
200 | </html>
201 | 


--------------------------------------------------------------------------------