├── .coveragerc
├── .dockerignore
├── .flake8
├── .gitignore
├── .travis.yml
├── Dockerfile
├── LICENSE
├── NOTICE
├── README.md
├── browsertrix
│   ├── __init__.py
│   ├── api.py
│   ├── crawl.py
│   ├── schema.py
│   └── utils.py
├── browsertrix_cli
│   ├── basecli.py
│   ├── crawl.py
│   ├── main.py
│   └── profile.py
├── cli-requirements.txt
├── dev-requirements.txt
├── docker-compose.yml
├── flocks
│   ├── browsers-headless.yaml
│   └── browsers.yaml
├── frontend
│   ├── .eslintrc.js
│   ├── .prettierrc
│   ├── package.json
│   ├── public
│   │   └── index.html
│   ├── src
│   │   ├── actions
│   │   │   ├── crawls.js
│   │   │   ├── httpRequests.js
│   │   │   └── index.js
│   │   ├── components
│   │   │   ├── Crawl
│   │   │   │   ├── Control.js
│   │   │   │   ├── Info.js
│   │   │   │   └── index.js
│   │   │   ├── CrawlCreator
│   │   │   │   ├── CreationForm.js
│   │   │   │   ├── fields.js
│   │   │   │   ├── index.js
│   │   │   │   └── validate.js
│   │   │   ├── Crawls
│   │   │   │   ├── LoadingCrawls.js
│   │   │   │   ├── SelectCrawl.js
│   │   │   │   └── index.js
│   │   │   └── Header
│   │   │       ├── HeaderLink.js
│   │   │       └── index.js
│   │   ├── containers
│   │   │   └── App.js
│   │   ├── reducers
│   │   │   ├── crawls.js
│   │   │   └── index.js
│   │   ├── root.js
│   │   ├── store
│   │   │   ├── dev.js
│   │   │   ├── index.js
│   │   │   ├── middleware.js
│   │   │   └── prod.js
│   │   ├── styles
│   │   │   └── global.scss
│   │   ├── utils
│   │   │   ├── bootstrap.js
│   │   │   ├── endpoints.js
│   │   │   ├── index.js
│   │   │   └── rhlConfig.js
│   │   └── wrap-with-provider.js
│   ├── webpack
│   │   ├── development-server.js
│   │   └── webpack.config.js
│   └── yarn.lock
├── install-browsers.sh
├── mypy.ini
├── pool_config.yaml
├── pyproject.toml
├── pytest.ini
├── pywb
│   ├── Dockerfile
│   ├── config.yaml
│   ├── crawlapp.py
│   ├── run.sh
│   ├── static
│   │   └── browsertrix-logo.svg
│   ├── templates
│   │   └── fullsearch.html
│   └── uwsgi.ini
├── requirements.txt
├── sample-crawls
│   ├── custom-scopes.yaml
│   ├── emulate-mobile-browser.yaml
│   ├── example.yaml
│   ├── override-browser-http-cookies-language.yaml
│   ├── social-media-replay.yaml
│   └── social-media.yaml
├── scripts
│   ├── format.sh
│   └── lint.sh
├── setup.py
├── static
│   ├── .gitkeep
│   ├── app.js
│   ├── browsertrix-logo.svg
│   └── index.html
├── test-docker-requirements.txt
├── test-local-requirements.txt
└── tests
    ├── __init__.py
    ├── conftest.py
    ├── crawl_tests.yaml
    ├── start-test-compose.sh
    ├── stop-test-compose.sh
    ├── test-docker-compose.yml
    ├── test_api.py
    ├── test_live_crawl.py
    ├── test_pool_config.yaml
    └── utils.py
/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | source = codecov
3 | branch = True
4 | omit =
5 | */test/*
6 | */tests/*
7 |
8 |
--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | pywb
2 | build
3 | webarchive
4 | browsertrix_cli
5 |
6 | # Created by .ignore support plugin (hsz.mobi)
7 | ### Node template
8 | # Logs
9 | logs
10 | *.log
11 | npm-debug.log*
12 | yarn-debug.log*
13 | yarn-error.log*
14 |
15 | # Runtime data
16 | pids
17 | *.pid
18 | *.seed
19 | *.pid.lock
20 |
21 | # Directory for instrumented libs generated by jscoverage/JSCover
22 | lib-cov
23 |
24 | # Coverage directory used by tools like istanbul
25 | coverage
26 |
27 | # nyc test coverage
28 | .nyc_output
29 |
30 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
31 | .grunt
32 |
33 | # Bower dependency directory (https://bower.io/)
34 | bower_components
35 |
36 | # node-waf configuration
37 | .lock-wscript
38 |
39 | # Compiled binary addons (https://nodejs.org/api/addons.html)
40 | build/Release
41 |
42 | # Dependency directories
43 | node_modules/
44 | jspm_packages/
45 |
46 | # TypeScript v1 declaration files
47 | typings/
48 |
49 | # Optional npm cache directory
50 | .npm
51 |
52 | # Optional eslint cache
53 | .eslintcache
54 |
55 | # Optional REPL history
56 | .node_repl_history
57 |
58 | # Output of 'npm pack'
59 | *.tgz
60 |
61 | # Yarn Integrity file
62 | .yarn-integrity
63 |
64 | # dotenv environment variables file
65 | .env
66 |
67 | # next.js build output
68 | .next
69 | ### JetBrains template
70 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
71 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
72 |
73 | # User-specific stuff
74 | .idea/**/workspace.xml
75 | .idea/**/tasks.xml
76 | .idea/**/dictionaries
77 | .idea/**/shelf
78 |
79 | # Sensitive or high-churn files
80 | .idea/**/dataSources/
81 | .idea/**/dataSources.ids
82 | .idea/**/dataSources.local.xml
83 | .idea/**/sqlDataSources.xml
84 | .idea/**/dynamic.xml
85 | .idea/**/uiDesigner.xml
86 | .idea/**/dbnavigator.xml
87 |
88 | # Gradle
89 | .idea/**/gradle.xml
90 | .idea/**/libraries
91 |
92 | # CMake
93 | cmake-build-debug/
94 | cmake-build-release/
95 |
96 | # Mongo Explorer plugin
97 | .idea/**/mongoSettings.xml
98 |
99 | # File-based project format
100 | *.iws
101 |
102 | # IntelliJ
103 | out/
104 |
105 | # mpeltonen/sbt-idea plugin
106 | .idea_modules/
107 |
108 | # JIRA plugin
109 | atlassian-ide-plugin.xml
110 |
111 | # Cursive Clojure plugin
112 | .idea/replstate.xml
113 |
114 | # Crashlytics plugin (for Android Studio and IntelliJ)
115 | com_crashlytics_export_strings.xml
116 | crashlytics.properties
117 | crashlytics-build.properties
118 | fabric.properties
119 |
120 | # Editor-based Rest Client
121 | .idea/httpRequests
122 | ### Example user template template
123 | ### Example user template
124 |
125 | # IntelliJ project files
126 | .idea
127 | *.iml
128 | out
129 | gen### Python template
130 | # Byte-compiled / optimized / DLL files
131 | __pycache__/
132 | *.py[cod]
133 | *$py.class
134 |
135 | # C extensions
136 | *.so
137 |
138 | # Distribution / packaging
139 | .Python
140 | build/
141 | develop-eggs/
142 | dist/
143 | downloads/
144 | eggs/
145 | .eggs/
146 | lib/
147 | lib64/
148 | parts/
149 | sdist/
150 | var/
151 | wheels/
152 | *.egg-info/
153 | .installed.cfg
154 | *.egg
155 | MANIFEST
156 |
157 | # PyInstaller
158 | # Usually these files are written by a python script from a template
159 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
160 | *.manifest
161 | *.spec
162 |
163 | # Installer logs
164 | pip-log.txt
165 | pip-delete-this-directory.txt
166 |
167 | # Unit test / coverage reports
168 | htmlcov/
169 | .tox/
170 | .coverage
171 | .coverage.*
172 | .cache
173 | nosetests.xml
174 | coverage.xml
175 | *.cover
176 | .hypothesis/
177 | .pytest_cache/
178 |
179 | # Translations
180 | *.mo
181 | *.pot
182 |
183 | # Django stuff:
184 | *.log
185 | local_settings.py
186 | db.sqlite3
187 |
188 | # Flask stuff:
189 | instance/
190 | .webassets-cache
191 |
192 | # Scrapy stuff:
193 | .scrapy
194 |
195 | # Sphinx documentation
196 | docs/_build/
197 |
198 | # PyBuilder
199 | target/
200 |
201 | # Jupyter Notebook
202 | .ipynb_checkpoints
203 |
204 | # pyenv
205 | .python-version
206 |
207 | # celery beat schedule file
208 | celerybeat-schedule
209 |
210 | # SageMath parsed files
211 | *.sage.py
212 |
213 | # Environments
214 | .env
215 | .venv
216 | env/
217 | venv/
218 | ENV/
219 | env.bak/
220 | venv.bak/
221 |
222 | # Spyder project settings
223 | .spyderproject
224 | .spyproject
225 |
226 | # Rope project settings
227 | .ropeproject
228 |
229 | # mkdocs documentation
230 | /site
231 |
232 |
233 | # mypy
234 | .mypy_cache/
235 | localCompose
236 | pip-wheel-metadata
237 | scripts
238 | #tests
239 | poetry.lock
240 | pyproject.toml
241 | pytest.ini
242 | README.md
243 | mypy.ini
244 | .flake8
245 | frontend
246 | webarchive
247 |
--------------------------------------------------------------------------------
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | ignore = E203, E266, E501, W503
3 | max-line-length = 88
4 | max-complexity = 18
5 | select = B,C,E,F,W,T4,B950
6 | exclude = .git,
7 | __pycache__,
8 | .mypy_cache,
9 | venv,
10 | .venv
11 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by .ignore support plugin (hsz.mobi)
2 | ### Python template
3 | # Byte-compiled / optimized / DLL files
4 | __pycache__/
5 | *.py[cod]
6 | *$py.class
7 |
8 | # C extensions
9 | *.so
10 |
11 | # Distribution / packaging
12 | .Python
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | .hypothesis/
50 | .pytest_cache/
51 |
52 | # Translations
53 | *.mo
54 | *.pot
55 |
56 | # Django stuff:
57 | *.log
58 | local_settings.py
59 | db.sqlite3
60 |
61 | # Flask stuff:
62 | instance/
63 | .webassets-cache
64 |
65 | # Scrapy stuff:
66 | .scrapy
67 |
68 | # Sphinx documentation
69 | docs/_build/
70 |
71 | # PyBuilder
72 | target/
73 |
74 | # Jupyter Notebook
75 | .ipynb_checkpoints
76 |
77 | # pyenv
78 | .python-version
79 |
80 | # celery beat schedule file
81 | celerybeat-schedule
82 |
83 | # SageMath parsed files
84 | *.sage.py
85 |
86 | # Environments
87 | .env
88 | .venv
89 | env/
90 | venv/
91 | ENV/
92 | env.bak/
93 | venv.bak/
94 |
95 | # Spyder project settings
96 | .spyderproject
97 | .spyproject
98 |
99 | # Rope project settings
100 | .ropeproject
101 |
102 | # mkdocs documentation
103 | /site
104 |
105 | # mypy
106 | .mypy_cache/
107 | ### Example user template template
108 | ### Example user template
109 |
110 | # IntelliJ project files
111 | .idea
112 | *.iml
113 | out
114 | gen### JetBrains template
115 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
116 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
117 |
118 | # User-specific stuff
119 | .idea/**/workspace.xml
120 | .idea/**/tasks.xml
121 | .idea/**/dictionaries
122 | .idea/**/shelf
123 |
124 | # Sensitive or high-churn files
125 | .idea/**/dataSources/
126 | .idea/**/dataSources.ids
127 | .idea/**/dataSources.local.xml
128 | .idea/**/sqlDataSources.xml
129 | .idea/**/dynamic.xml
130 | .idea/**/uiDesigner.xml
131 | .idea/**/dbnavigator.xml
132 |
133 | # Gradle
134 | .idea/**/gradle.xml
135 | .idea/**/libraries
136 |
137 | # CMake
138 | cmake-build-debug/
139 | cmake-build-release/
140 |
141 | # Mongo Explorer plugin
142 | .idea/**/mongoSettings.xml
143 |
144 | # File-based project format
145 | *.iws
146 |
147 | # IntelliJ
148 | out/
149 |
150 | # mpeltonen/sbt-idea plugin
151 | .idea_modules/
152 |
153 | # JIRA plugin
154 | atlassian-ide-plugin.xml
155 |
156 | # Cursive Clojure plugin
157 | .idea/replstate.xml
158 |
159 | # Crashlytics plugin (for Android Studio and IntelliJ)
160 | com_crashlytics_export_strings.xml
161 | crashlytics.properties
162 | crashlytics-build.properties
163 | fabric.properties
164 |
165 | # Editor-based Rest Client
166 | .idea/httpRequests
167 |
168 | ### Node template
169 | # Logs
170 | logs
171 | *.log
172 | npm-debug.log*
173 | yarn-debug.log*
174 | yarn-error.log*
175 |
176 | # Runtime data
177 | pids
178 | *.pid
179 | *.seed
180 | *.pid.lock
181 |
182 | # Directory for instrumented libs generated by jscoverage/JSCover
183 | lib-cov
184 |
185 | # Coverage directory used by tools like istanbul
186 | coverage
187 |
188 | # nyc test coverage
189 | .nyc_output
190 |
191 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
192 | .grunt
193 |
194 | # Bower dependency directory (https://bower.io/)
195 | bower_components
196 |
197 | # node-waf configuration
198 | .lock-wscript
199 |
200 | # Compiled binary addons (https://nodejs.org/api/addons.html)
201 | build/Release
202 |
203 | # Dependency directories
204 | node_modules/
205 | jspm_packages/
206 |
207 | # TypeScript v1 declaration files
208 | typings/
209 |
210 | # Optional npm cache directory
211 | .npm
212 |
213 | # Optional eslint cache
214 | .eslintcache
215 |
216 | # Optional REPL history
217 | .node_repl_history
218 |
219 | # Output of 'npm pack'
220 | *.tgz
221 |
222 | # Yarn Integrity file
223 | .yarn-integrity
224 |
225 | *.tar.gz
226 | **/test-webarchive/
227 | **/webarchive/
228 |
229 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 |
3 | python:
4 | - "3.7"
5 |
6 | os:
7 | - linux
8 |
9 | dist: xenial
10 |
11 | sudo: required
12 |
13 | jobs:
14 | include:
15 | - stage: local tests
16 | script:
17 | - python setup.py install
18 | - pip install -U -r test-local-requirements.txt
19 | - py.test ./tests/test_api.py
20 |
21 | - stage: docker integration tests
22 |
23 | services:
24 | - docker
25 |
26 | env:
27 | - DOCKER_COMPOSE_VERSION=1.23.2
28 |
29 | before_install:
30 | - ./install-browsers.sh --headless
31 | - sudo rm /usr/local/bin/docker-compose
32 | - curl -L https://github.com/docker/compose/releases/download/${DOCKER_COMPOSE_VERSION}/docker-compose-`uname -s`-`uname -m` > docker-compose
33 | - chmod +x docker-compose
34 | - sudo mv docker-compose /usr/local/bin
35 |
36 | script:
37 | - bash ./tests/start-test-compose.sh
38 | - pip install -U -r test-docker-requirements.txt
39 | - py.test --headless ./tests/test_live_crawl.py
40 | - docker-compose logs
41 | - bash ./tests/stop-test-compose.sh
42 |
43 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.7.3
2 |
3 | WORKDIR /app
4 |
5 | COPY requirements.txt ./
6 |
7 | RUN pip install --no-cache-dir -r requirements.txt
8 |
9 | COPY browsertrix ./browsertrix
10 | COPY static ./static
11 |
12 | CMD uvicorn --reload --host 0.0.0.0 --port 8000 browsertrix.api:app
13 |
14 |
15 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document.
10 |
11 | "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License.
12 |
13 | "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.
14 |
15 | "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License.
16 |
17 | "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files.
18 |
19 | "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.
20 |
21 | "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below).
22 |
23 | "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof.
24 |
25 | "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."
26 |
27 | "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work.
28 |
29 | 2. Grant of Copyright License.
30 |
31 | Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form.
32 |
33 | 3. Grant of Patent License.
34 |
35 | Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed.
36 |
37 | 4. Redistribution.
38 |
39 | You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:
40 |
41 | You must give any other recipients of the Work or Derivative Works a copy of this License; and
42 | You must cause any modified files to carry prominent notices stating that You changed the files; and
43 | You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and
44 | If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License.
45 |
46 | You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License.
47 |
48 | 5. Submission of Contributions.
49 |
50 | Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions.
51 |
52 | 6. Trademarks.
53 |
54 | This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file.
55 |
56 | 7. Disclaimer of Warranty.
57 |
58 | Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License.
59 |
60 | 8. Limitation of Liability.
61 |
62 | In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.
63 |
64 | 9. Accepting Warranty or Additional Liability.
65 |
66 | While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.
67 |
68 | END OF TERMS AND CONDITIONS
69 |
--------------------------------------------------------------------------------
/NOTICE:
--------------------------------------------------------------------------------
1 | Browsertrix
2 | Copyright 2018-2020 Webrecorder Software, Rhizome, and Contributors.
3 |
4 | Distributed under the Apache License 2.0.
5 | See LICENSE for details.
6 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | **Deprecated**: The Browsertrix system is being refactored into more modular individual components. The main component, [Browsertrix Crawler](https://github.com/webrecorder/browsertrix-crawler), will soon support most of the same crawling features via an integrated Docker image that can be deployed via the command line. The UI and scheduling components will soon be reimplemented as additional components.
2 |
3 | Please see Browsertrix Crawler for latest development.
4 |
5 |
6 |
7 |
8 |
9 | [Code style: black](https://github.com/ambv/black) [Build status](https://travis-ci.org/webrecorder/browsertrix)
10 |
11 | ## High Fidelity Browser-Based Crawling Automation
12 |
13 | Browsertrix is a brand new toolset from the Webrecorder project for automating browsers to perform complex scripted behaviors
14 | as well as crawl multiple pages. (The name was originally used for an older project with similar goals).
15 |
16 | Browsertrix is a system for orchestrating Docker-based Chrome browsers, crawling processes, behavior systems, web archiving capture and replay, and full-text search.
17 |
18 | It includes the following features:
19 | * Crawling via customizable YAML-based crawl spec
20 | * High-fidelity browser-based crawlers (controlled via [webrecorder/autobrowser](https://github.com/webrecorder/autobrowser))
21 | * Execution of complex, domain-specific in-page behaviors (provided by [webrecorder/behaviors](https://github.com/webrecorder/behaviors))
22 | * Capture or replay into designated [pywb](https://github.com/webrecorder/pywb) collections
23 | * Screenshot creation of each page (optional).
24 | * Text extraction for each page and full text search via Solr (optional).
25 | * Support for customized browser profiles to minimize capture of private information.
26 |
27 | ## Getting Started
28 |
29 | ### Installing Browsertrix
30 |
31 | Browsertrix is currently designed to run with Docker and Docker Compose.
32 | The Browsertrix CLI requires local Python 3.6+.
33 |
34 | To install, run:
35 |
36 | ```bash
37 | git clone https://github.com/webrecorder/browsertrix
38 | cd browsertrix
39 | python setup.py install
40 | ./install-browsers.sh
41 | docker-compose build
42 | docker-compose up -d
43 | ```
44 |
45 | The `install-browsers.sh` script installs additional Docker images necessary for dynamic browser creation.
46 | The script can be used to update the images as well.
47 |
48 | ### Installing Browsertrix CLI
49 |
50 | The Browsertrix CLI is installed by running `python setup.py install` and includes full functionality for running crawls and creating browser profiles.
51 |
52 | Once installed, browsertrix commands are available via the `browsertrix` command.
53 |
54 | ## Creating a Crawl
55 |
56 | To create a crawl, a crawl spec should first be defined in a YAML file.
57 | An example spec, [sample-crawls/example.yaml](sample-crawls/example.yaml) might look as follows:
58 |
59 | ```yaml
60 | crawls:
61 | - name: example
62 | crawl_type: all-links
63 | num_browsers: 1
64 |
65 | coll: example
66 | mode: record
67 |
68 | seed_urls:
69 | - https://www.iana.org/
70 | ```
71 |
72 | Then, simply run `browsertrix crawl create sample-crawls/example.yaml --watch`
73 |
74 | The `--watch` param will also result in the crawling browser opening in a new browser window via a VNC connection.
75 |
76 | If started successfully, the output will be similar to:
77 | ```
78 | Crawl Created and Started: cf30281efc7a
79 | Status: running
80 | Opening Browser 1 of 1 (CKVEMACNI6YBUKLQI6UKKBLB) for crawl cf30281efc7a
81 | ```
82 |
83 | To view all running crawls, simply run `browsertrix crawl list` which should result in output similar to:
84 |
85 | ```
86 | CRAWL ID NAME STARTED DURATION STATUS CRAWL TYPE COLL MODE TO CRAWL PENDING SEEN BROWSERS TABS
87 | cf30281efc7a example 0:00:35 ago 0:00:10 running all-links example record 15 1 25 1 1
88 | ```
89 |
90 | To get more detailed info on the crawl, run `browsertrix crawl info --urls <crawl_id>` (where `<crawl_id> = cf30281efc7a` in this example)
91 |
92 | To follow the crawl log in the console window, add the `--log` option (the log followed will be from the first browser).
93 |
94 | ### Crawling Options
95 |
96 | Browsertrix supports a number of options, with a key option being the `crawl_type`, which can be:
97 |
98 | - `single-page` -- crawl only the specified seed urls
99 | - `all-links` -- crawl the seed url(s) and all links discovered until max depth is exceeded
100 | - `same-domain` -- crawl the seed url(s) and all links discovered that are on the same domain or sub-domain (up to a depth of 100)
101 | - `custom` -- Supports custom depth and scope rules!
102 |
103 | The first 3 options are designed to be a simple way to specify common options, and more may be added later.
104 |
105 | When using `custom`, the `crawl_depth` param can specify the crawl depth (hops) from each seed url.
106 |
107 | The `scopes` list can contain one or more [urlcanon MatchRules](https://github.com/iipc/urlcanon/blob/master/python/urlcanon/rules.py#L70) specifying urls that are in scope for the crawl.
108 |
109 | See [custom-scopes.yaml](sample-crawls/custom-scopes.yaml) for an example on how to use the custom option.
110 |
111 |
112 | The `coll` option specifies the pywb collection to use for crawling, and `mode` specifies `record` (default), `replay`, or
113 | `live` (direct live web connection).
114 |
115 | The `num_browsers` and `num_tabs` options allow for selecting the total number of browsers and the number of tabs per browser to use for this crawl.
116 |
117 | The seed urls for the crawl should be provided in the `seed_urls` list.
118 |
119 | The `cache` option specifies caching options for a crawl, with available options:
120 | - `always` -- Strict caching via `Cache-Control` on almost every resource to limit duplicate URLs in a single browser session (default option when omitted)
121 | - `default` -- Keep default caching for a page
122 | - `never` -- Disables all caching for all URLs.
123 |
124 | All example crawl configs demonstrating these options are available in: [sample-crawls](sample-crawls/)
125 |
126 | ### In-Page Behaviors
127 |
128 | For every page, Browsertrix runs a designated behavior before collecting outlinks, (optionally) taking screenshots,
129 | and moving on to the next page.
130 |
131 | The behaviors are served via a separate behavior API server. The current list of available behaviors is available at:
132 | https://github.com/webrecorder/behaviors/tree/master/behaviors
133 |
134 | The behaviors are built using a special library of behavior functions (preliminary docs available here:
135 | https://webrecorder.github.io/behaviors/)
136 |
137 | If no site-specific behavior is found, the default `autoscroll.js` behavior is run.
138 |
139 | The `behavior_max_time` crawl option specifies the maximum time a behavior can run (current default is 60 seconds).
140 | When crawling sites with infinite scroll, it is recommended to set the `behavior_max_time` to be much higher.
141 |
142 |
143 | ### pywb Collections and Access
144 |
145 | All data crawled is placed in the `./webarchive/collections/` directory, which corresponds to the [standard pywb directory structure conventions](https://pywb.readthedocs.io/en/latest/manual/configuring.html#directory-structure), e.g. a collection `test` would be found under `./webarchive/collections/test`.
146 |
147 | Collections are created automatically on first use and can also be managed via `wb-manager` with `webarchive` as the working directory.
148 |
149 | The running pywb instance can also be accessed via `http://localhost:8180/`
150 |
151 | ### Replay Crawling and Screenshots
152 |
153 | Currently, screenshot creation is automatically enabled when crawling in record mode and screenshots are added automatically
154 | to the same collection.
155 |
156 | Browsertrix supports crawling in replay mode, over an existing collection, which may be useful for QA processes,
157 | especially when combined with screenshot creation.
158 |
159 | By setting the `mode` and `screenshot_coll` properties for each crawl, it is possible to run Browsertrix over replay and generate screenshots into a different collection, which may be used for QA comparison.
160 |
161 | Additional screenshot options are to be added soon. (Currently, the screenshot is taken after the behavior is run but this will likely change).
162 |
163 | Crawl options can also be overridden via the command line.
164 |
165 | For example, given a crawl spec `./my_crawl.yaml`, one could first capture with:
166 | ```
167 | browsertrix crawl create ./my_crawl.yaml --screenshot_coll screenshots-capture
168 | ```
169 |
170 | and then run:
171 | ```
172 | browsertrix crawl create ./my_crawl.yaml --mode replay --screenshot_coll screenshots-qa
173 | ```
174 |
175 | By default, screenshots are saved with `urn:screenshot:` prefix.
176 | Based on the above crawls, one could then query all capture and qa screenshots in pywb via:
177 | ```
178 | http://localhost:8180/screenshots-capture/*/urn:screenshot:*
179 | http://localhost:8180/screenshots-qa/*/urn:screenshot:*
180 | ```
181 |
182 | Sample record and replay configs, [social-media.yaml](sample-crawls/social-media.yaml) and [social-media-replay.yaml](sample-crawls/social-media-replay.yaml), are also available.
183 |
184 | (Note: The screenshot functionality will likely change and additional options will be added)
185 |
186 | ### Other Crawl operations
187 |
188 | Other crawl operations include:
189 | * `browsertrix crawl stop` for stopping a crawl
190 | * `browsertrix crawl logs` for printing and following logs for one or all crawlers
191 | * `browsertrix crawl watch <crawl_id>` for attaching and watching all the browsers in a given crawl.
192 | * `browsertrix crawl remove` for removing a crawl
193 | * `browsertrix crawl remove-all` for stopping and removing all crawls.
194 |
195 | See `browsertrix crawl -h` for a complete reference of available commands.
196 |
197 | ## Full Text Search
198 |
199 | Browsertrix now includes a prototype integration with Apache Solr. Text is extracted for each page, after taking a screenshot, and ingested into Solr. The extracted text (as provided via raw DOM text nodes) from all frames,
200 | as well as the page title and URL, are indexed in Solr using the default schema. (This is likely to evolve as well.)
201 |
202 | The search is available for each collection via the pywb replay interface at: `http://localhost:8180/`
203 |
204 | The replay interface currently includes a list of pages, screenshots (if enabled), and the ability to search the collection.
205 |
206 | (Note: Solr data is stored in the `./solr` volume, and may require a permission adjustment on certain systems via `chmod a+w ./solr`)
207 |
208 |
209 | ## Browser Profiles
210 |
211 | It is often useful to prepare a browser, for example by logging into social media or other password-protected sites,
212 | in order to capture content that is not generally accessible. However, doing so during a crawl is tedious and, worse,
213 | may result in passwords being recorded to a WARC.
214 |
215 | Browsertrix addresses this problem with the support of browser profiles. A profile can be created by running a base
216 | Chrome browser, performing custom actions, and then 'saving' the running browser into a new 'profile' image.
217 |
218 | To create a profile:
219 |
220 | 1. Run:
221 | ```browsertrix profile create```
222 |
223 | 2. This should start a new remote browser (Chrome 73 by default) and open it in a new window. You can now interact with the browser and log in to any sites as needed.
224 |
225 | 3. The command line should show the following message and a prompt to enter the profile name, e.g. `logged-in`
226 |
227 | ```
228 | A new browser window should have been opened
229 | You can use the browser to log-in to accounts or otherwise prepare the browser profile
230 | (The content will not be recorded to WARC)
231 | When done, please enter a new name to save the browser profile:
232 | ```
233 |
234 | 4. Once the name is entered, the profile is saved. You can continue browsing to make another profile, or select 'no' and close the browser.
235 |
236 | If everything worked, running ```browsertrix profile list``` should show:
237 |
238 | ```
239 | PROFILE BASE BROWSER
240 | logged-in chrome:73
241 | ```
242 |
243 | 5. To use the profile, set the `profile` property in the crawl spec YAML, or simply include `--profile` in the command line:
244 |
245 | ```
246 | browsertrix crawl create ./my_crawl.yaml --profile logged-in
247 | ```
248 |
249 | The browsers used for the crawl will be a copy of the browser saved during profile creation.
250 |
251 | `browsertrix profile remove` can be used to remove an unneeded profile.
252 |
253 | Note: The profile functionality is brand new and subject to change. At present, it is tied to the particular browser Docker image used and extends that image. The system may switch to Docker volumes in the future.
254 |
255 | ## Testing
256 |
257 | Browsertrix includes several test suites, which are also run automatically via Travis CI.
258 |
259 | ### Docker Integration Tests
260 |
261 | Browsertrix includes a Docker-based test suite that runs crawls over content replayed from a WARC
262 | (no live web content is accessed). This test suite requires Python 3.6+.
263 |
264 | To run this test suite, run:
265 |
266 | ```bash
267 | bash ./tests/start-test-compose.sh
268 | pip install -U -r test-docker-requirements.txt
269 | py.test --headless ./tests/test_live_crawl.py
270 | bash ./tests/stop-test-compose.sh
271 | ```
272 |
273 | The test suite does not perform any live crawling, but runs all the crawls defined in [tests/crawl_tests.yaml](tests/crawl_tests.yaml) in replay mode using an existing test WARC downloaded from S3.
274 |
275 | ### Local API Tests
276 |
277 | To install and run local tests of the API (without Docker), run the following:
278 | (Python 3.7+ is required)
279 |
280 | ```bash
281 | pip install -U -r requirements.txt -r test-local-requirements.txt
282 | py.test ./tests/test_api.py
283 | ```
284 |
285 | ## UI
286 |
287 | Browsertrix also includes a UI (still under development) which will
288 | have the same features as the CLI.
289 |
290 | To access the browsertrix UI, load `http://localhost:8000/`
291 |
292 | The frontend React app is found in `./frontend` and can be started via:
293 |
294 | ```
295 | yarn run develop
296 | ```
297 |
298 | (The develop server is started at `http://localhost:8001` to avoid conflict with production)
299 |
300 | To build the production bundle, run:
301 | ```
302 | yarn run build-prod
303 | ```
304 |
305 | This should update the production server running at `http://localhost:8000`
306 |
--------------------------------------------------------------------------------
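The README drives crawl creation through the `browsertrix` CLI; the CLI itself is a thin client for the FastAPI service defined in `browsertrix/api.py` below. As a rough, illustrative sketch only (not part of the repository), the example crawl spec from the README could also be submitted straight to that REST API. Field names follow `CreateCrawlRequest` in `browsertrix/schema.py`, and the server URL assumes the default `docker-compose` setup on `localhost:8000`.

```python
# Illustrative only: create and poll a crawl via the HTTP API that the CLI wraps.
# Assumes the docker-compose stack is running on localhost:8000.
import requests

crawl_spec = {
    'name': 'example',
    'crawl_type': 'all-links',
    'num_browsers': 1,
    'coll': 'example',
    'mode': 'record',
    'seed_urls': ['https://www.iana.org/'],
}

# POST /crawls -> CreateStartResponse: {success, id, status, browsers}
res = requests.post('http://localhost:8000/crawls', json=crawl_spec)
res.raise_for_status()
info = res.json()
print('Crawl Created and Started:', info['id'])

# GET /crawl/{crawl_id}/done -> CrawlDoneResponse: {done}
done = requests.get('http://localhost:8000/crawl/{0}/done'.format(info['id'])).json()
print('Done yet?', done['done'])
```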
/browsertrix/__init__.py:
--------------------------------------------------------------------------------
1 | from better_exceptions import hook
2 |
3 | __version__ = '0.1.0'
4 |
5 | hook()
6 |
--------------------------------------------------------------------------------
/browsertrix/api.py:
--------------------------------------------------------------------------------
1 | from fastapi import APIRouter, FastAPI
2 | from starlette.middleware.cors import ALL_METHODS, CORSMiddleware
3 | from starlette.responses import FileResponse, UJSONResponse
4 | from starlette.staticfiles import StaticFiles
5 |
6 | from .crawl import CrawlManager
7 | from .schema import *
8 |
9 | app = FastAPI(debug=True)
10 | app.add_middleware(
11 | CORSMiddleware, allow_origins=["*"], allow_methods=ALL_METHODS, allow_headers=["*"]
12 | )
13 | crawl_man = CrawlManager()
14 | crawl_router = APIRouter()
15 |
16 |
17 | # ============================================================================
18 | @app.post('/crawls', response_model=CreateStartResponse, response_class=UJSONResponse)
19 | async def create_crawl(new_crawl: CreateCrawlRequest):
20 | return await crawl_man.create_new(new_crawl)
21 |
22 |
23 | @app.get('/crawls', response_model=CrawlInfosResponse, response_class=UJSONResponse)
24 | async def get_all_crawls():
25 | return await crawl_man.get_all_crawls()
26 |
27 |
28 | @crawl_router.put(
29 | '/{crawl_id}/urls',
30 | response_model=OperationSuccessResponse,
31 | response_class=UJSONResponse,
32 | )
33 | async def queue_urls(crawl_id: str, url_list: QueueUrlsRequest):
34 | return await crawl_man.queue_crawl_urls(crawl_id, url_list.urls)
35 |
36 |
37 | @crawl_router.get(
38 | '/{crawl_id}', response_model=CrawlInfoResponse, response_class=UJSONResponse
39 | )
40 | async def get_crawl(crawl_id: str):
41 | return await crawl_man.get_crawl_info(crawl_id)
42 |
43 |
44 | @crawl_router.get(
45 | '/{crawl_id}/urls',
46 | response_model=CrawlInfoUrlsResponse,
47 | response_class=UJSONResponse,
48 | )
49 | async def get_crawl_urls(crawl_id: str):
50 | return await crawl_man.get_crawl_urls(crawl_id)
51 |
52 |
53 | @crawl_router.get(
54 | '/{crawl_id}/info',
55 | response_model=FullCrawlInfoResponse,
56 | response_class=UJSONResponse,
57 | )
58 | async def get_full_crawl_info(crawl_id: str):
59 | return await crawl_man.get_full_crawl_info(crawl_id)
60 |
61 |
62 | @crawl_router.post(
63 | '/{crawl_id}/start',
64 | response_model=CreateStartResponse,
65 | response_class=UJSONResponse,
66 | )
67 | async def start_crawl(crawl_id: str):
68 | return await crawl_man.start_crawl(crawl_id)
69 |
70 |
71 | @crawl_router.post(
72 | '/{crawl_id}/stop',
73 | response_model=OperationSuccessResponse,
74 | response_class=UJSONResponse,
75 | )
76 | async def stop_crawl(crawl_id: str):
77 | return await crawl_man.stop_crawl(crawl_id)
78 |
79 |
80 | @crawl_router.get(
81 | '/{crawl_id}/done', response_model=CrawlDoneResponse, response_class=UJSONResponse
82 | )
83 | async def is_done_crawl(crawl_id: str):
84 | return await crawl_man.is_crawl_done(crawl_id)
85 |
86 |
87 | @crawl_router.delete(
88 | '/{crawl_id}', response_model=OperationSuccessResponse, response_class=UJSONResponse
89 | )
90 | async def delete_crawl(crawl_id: str):
91 | return await crawl_man.delete_crawl(crawl_id)
92 |
93 |
94 | @app.route('/')
95 | def ui(*args, **kwargs):
96 | return FileResponse('static/index.html')
97 |
98 |
99 | app.include_router(crawl_router, prefix='/crawl', tags=['crawl'])
100 | app.mount('/static', StaticFiles(directory='static', check_dir=True), 'static')
101 | app.add_event_handler('startup', crawl_man.startup)
102 | app.add_event_handler('shutdown', crawl_man.shutdown)
103 |
--------------------------------------------------------------------------------
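A hedged sketch of exercising this app in-process with Starlette's `TestClient`, roughly the approach a local API test such as `tests/test_api.py` might take (that test module is not shown here, so treat the details as assumptions). The startup handler connects `CrawlManager` to Redis, so a reachable Redis instance is assumed.

```python
# Sketch only: drive the FastAPI app without a running uvicorn/Docker stack.
from starlette.testclient import TestClient

from browsertrix.api import app

# Using the client as a context manager runs the startup/shutdown handlers
# (crawl_man.startup / crawl_man.shutdown), so Redis must be reachable.
with TestClient(app) as client:
    res = client.get('/crawls')
    assert res.status_code == 200
    print(res.json())  # CrawlInfosResponse: {"crawls": [...]}
```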
/browsertrix/schema.py:
--------------------------------------------------------------------------------
1 | import math
2 | from enum import Enum
3 | from typing import Any, Dict, List, Optional, Set, Union
4 |
5 | from pydantic import BaseModel, Schema, UrlStr
6 |
7 | __all__ = [
8 | 'BrowserCookie',
9 | 'BrowserOverrides',
10 | 'CacheMode',
11 | 'CaptureMode',
12 | 'CookieSameSite',
13 | 'CrawlDoneResponse',
14 | 'CrawlInfo',
15 | 'CrawlInfoResponse',
16 | 'CrawlInfoUrlsResponse',
17 | 'CrawlInfosResponse',
18 | 'CrawlType',
19 | 'CreateCrawlRequest',
20 | 'CreateStartResponse',
21 | 'EmulatedDevice',
22 | 'EmulatedGeoLocation',
23 | 'FullCrawlInfoResponse',
24 | 'OperationSuccessResponse',
25 | 'QueueUrlsRequest',
26 | ]
27 |
28 | # ============================================================================
29 | OptionalList = Optional[List[str]]
30 | OptionalSet = Optional[Set[str]]
31 | Number = Union[int, float]
32 |
33 | UrlStr.max_length = math.inf
34 | UrlStr.relative = True
35 |
36 |
37 | class CrawlType(str, Enum):
38 | SINGLE_PAGE = 'single-page'
39 | ALL_LINKS = 'all-links'
40 | SAME_DOMAIN = 'same-domain'
41 | CUSTOM = 'custom'
42 |
43 |
44 | class CaptureMode(str, Enum):
45 | RECORD = 'record'
46 | REPLAY = 'replay'
47 | LIVE = 'live'
48 |
49 |
50 | class CacheMode(str, Enum):
51 | ALWAYS = 'always'
52 | NEVER = 'never'
53 | DEFAULT = 'default'
54 |
55 |
56 | class CookieSameSite(str, Enum):
57 | STRICT = 'Strict'
58 | LAX = 'LAX'
59 | EXTENDED = 'Extended'
60 | NONE = 'None'
61 |
62 |
63 | class EmulatedDevice(BaseModel):
64 | width: Number
65 | height: Number
66 | deviceScaleFactor: Optional[Number] = None
67 | maxTouchPoints: Optional[Number] = None
68 | isMobile: Optional[bool] = None
69 | hasTouch: Optional[bool] = None
70 | isLandscape: Optional[bool] = None
71 |
72 |
73 | class EmulatedGeoLocation(BaseModel):
74 | latitude: Number
75 | longitude: Number
76 |
77 |
78 | class BrowserCookie(BaseModel):
79 | name: str
80 | value: str
81 | url: Optional[UrlStr] = None
82 | domain: Optional[str] = None
83 | path: Optional[str] = None
84 | secure: Optional[bool] = None
85 | httpOnly: Optional[bool] = None
86 | expires: Optional[Number] = None
87 | sameSite: Optional[CookieSameSite] = None
88 |
89 |
90 | class BrowserOverrides(BaseModel):
91 | user_agent: Optional[str] = None
92 | accept_language: Optional[str] = None
93 | navigator_platform: Optional[str] = None
94 | extra_headers: Optional[Dict[str, str]] = None
95 | cookies: Optional[List[BrowserCookie]] = None
96 | geo_location: Optional[EmulatedGeoLocation] = None
97 | device: Optional[EmulatedDevice] = None
98 |
99 |
100 | class BaseCreateCrawl(BaseModel):
101 | crawl_type: CrawlType = Schema(
102 | CrawlType.SINGLE_PAGE, description='What type of crawl should be launched'
103 | )
104 | crawl_depth: Optional[int] = None
105 | num_browsers: int = Schema(
106 | 2, description='How many browsers should be used for the crawl'
107 | )
108 | num_tabs: int = Schema(1, description='How many tabs should be used for the crawl')
109 | name: Optional[str] = Schema('', description='User friendly name for the crawl')
110 | coll: Optional[str] = Schema('live', description='Default Collection')
111 |
112 | mode: CaptureMode = Schema(CaptureMode.RECORD, description='Default Mode')
113 |
114 | screenshot_coll: Optional[str] = Schema(
115 | '', description='Collection to store screenshots, if any'
116 | )
117 |
118 | text_coll: Optional[str] = Schema(
119 | '', description='Collection to store full-text indexes, if any'
120 | )
121 |
122 |
123 | class CreateCrawlRequest(BaseCreateCrawl):
124 | class Config:
125 | extra = 'forbid'
126 |
127 | seed_urls: List[UrlStr] = []
128 | scopes: List[Dict[Any, Any]] = []
129 |
130 | cache: CacheMode = CacheMode.ALWAYS
131 |
132 | browser: Optional[str] = 'chrome:73'
133 | user_params: Dict[Any, Any] = dict()
134 |
135 | profile: Optional[str] = None
136 |
137 | ignore_extra: Optional[Dict[Any, Any]] = None
138 |
139 | behavior_max_time: int = 0
140 | headless: bool = False
141 | screenshot_target_uri: Optional[str] = None
142 |
143 | start: bool = True
144 | browser_overrides: Optional[BrowserOverrides] = None
145 |
146 |
147 | class OperationSuccessResponse(BaseModel):
148 | success: bool
149 |
150 |
151 | class CreateStartResponse(OperationSuccessResponse):
152 | id: str
153 | status: str = 'new'
154 | browsers: Optional[List[str]]
155 |
156 |
157 | class CrawlInfoResponse(BaseCreateCrawl):
158 | id: str
159 | status: str = 'new'
160 | start_time: int = 0
161 | finish_time: int = 0
162 | browsers: OptionalList
163 | tabs_done: List[Dict[Any, Any]]
164 | headless: bool = False
165 | num_queue: int = 0
166 | num_seen: int = 0
167 | num_pending: int = 0
168 |
169 |
170 | class CrawlInfosResponse(BaseModel):
171 | crawls: List[CrawlInfoResponse]
172 |
173 |
174 | class CrawlInfo(BaseModel):
175 | """ Model for validate a:{crawl_id}:info key
176 | All fields should be set in the model
177 | """
178 |
179 | id: str
180 | name: str
181 | coll: str
182 | screenshot_coll: str
183 | text_coll: str
184 | mode: str
185 | status: str
186 | crawl_type: str
187 | crawl_depth: int
188 | num_browsers: int
189 | num_tabs: int
190 | start_time: int = 0
191 | finish_time: int = 0
192 | headless: bool = False
193 | browser_overrides: Optional[BrowserOverrides] = None
194 |
195 |
196 | class CrawlInfoUrlsResponse(BaseModel):
197 | scopes: List[Dict[Any, Any]]
198 | queue: List[Dict[Any, Any]]
199 | pending: OptionalList
200 | seen: OptionalSet
201 |
202 |
203 | class FullCrawlInfoResponse(CrawlInfo, CrawlInfoUrlsResponse):
204 | success: bool
205 |
206 |
207 | class QueueUrlsRequest(BaseModel):
208 | urls: List[str]
209 |
210 |
211 | class CrawlDoneResponse(BaseModel):
212 | done: bool
213 |
--------------------------------------------------------------------------------
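As a quick illustration (not from the repository), the request model above can be exercised directly; this assumes the older pydantic API that matches the `Schema`/`UrlStr` usage in this module.

```python
# Illustrative usage of the crawl request model defined above.
from browsertrix.schema import CacheMode, CrawlType, CreateCrawlRequest

req = CreateCrawlRequest(
    name='example',
    crawl_type=CrawlType.ALL_LINKS,
    num_browsers=1,
    coll='example',
    mode='record',                 # coerced to CaptureMode.RECORD
    cache=CacheMode.ALWAYS,
    seed_urls=['https://www.iana.org/'],
)
print(req.dict())

# Config.extra = 'forbid' means unknown fields are rejected, e.g.
# CreateCrawlRequest(bogus_option=True) raises a ValidationError.
```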
/browsertrix/utils.py:
--------------------------------------------------------------------------------
1 | from asyncio import AbstractEventLoop
2 | from os import environ
3 | from typing import Any, Dict, Optional, Type, Union
4 | from urllib.parse import urlsplit
5 |
6 | from aioredis import Redis, create_redis
7 | from ujson import loads as ujson_loads
8 |
9 | __all__ = ['env', 'extract_domain', 'init_redis']
10 |
11 |
12 | async def init_redis(redis_url: str, loop: AbstractEventLoop) -> Redis:
13 | return await create_redis(redis_url, encoding='utf-8', loop=loop)
14 |
15 |
16 | def env(
17 | key: str,
18 | type_: Type[Union[str, bool, int, dict, float]] = str,
19 | default: Optional[Any] = None,
20 | ) -> Union[str, int, bool, float, Dict]:
21 | """Returns the value of the supplied env key name converting
22 | the env key's value to the specified type.
23 |
24 | If the env key does not exist the default value is returned.
25 |
26 | Boolean values for env keys are expected to be:
27 | - true: 1, true, yes, y, ok, on
28 | - false: 0, false, no, n, nok, off
29 |
30 | :param key: The name of the environment variable
31 |     :param type_: What type the env key's value should be converted to,
32 | defaults to str
33 | :param default: The default value of the env key, defaults to None
34 | :return: The value of the env key or the supplied default
35 | """
36 | if key not in environ:
37 | return default
38 |
39 | val = environ[key]
40 |
41 | if type_ == str:
42 | return val
43 | elif type_ == bool:
44 | if val.lower() in ['1', 'true', 'yes', 'y', 'ok', 'on']:
45 | return True
46 | if val.lower() in ['0', 'false', 'no', 'n', 'nok', 'off']:
47 | return False
48 | raise ValueError(
49 | f'Invalid environment variable "{key}" (expected a boolean): "{val}"'
50 | )
51 | elif type_ == int:
52 | try:
53 | return int(val)
54 | except ValueError:
55 | raise ValueError(
56 |                 f'Invalid environment variable "{key}" (expected an integer): "{val}"'
57 | )
58 | elif type_ == float:
59 | try:
60 | return float(val)
61 | except ValueError:
62 | raise ValueError(
63 | f'Invalid environment variable "{key}" (expected a float): "{val}"'
64 | )
65 | elif type_ == dict:
66 | return ujson_loads(val)
67 |
68 |
69 | def extract_domain(url: str) -> str:
70 | """Extracts and returns the domain, including the suffix,
71 | of the supplied URL
72 |
73 | :param url: The url to have its domain extracted from
74 | :return: The extracted domain
75 | """
76 | extracted = urlsplit(url).netloc
77 | return extracted.replace('www.', '')
78 |
--------------------------------------------------------------------------------
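A small usage sketch for these helpers (illustrative only; the environment variable names below are made up for the example):

```python
# Illustrative usage of env() and extract_domain(); the variable names here
# (CRAWL_RETRIES, HEADLESS, REDIS_URL) are hypothetical examples.
import os

from browsertrix.utils import env, extract_domain

os.environ['CRAWL_RETRIES'] = '3'
os.environ['HEADLESS'] = 'yes'

print(env('CRAWL_RETRIES', type_=int, default=1))       # -> 3
print(env('HEADLESS', type_=bool, default=False))       # -> True
print(env('REDIS_URL', default='redis://localhost/0'))  # unset -> default

print(extract_domain('https://www.iana.org/domains'))   # -> 'iana.org'
```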
/browsertrix_cli/basecli.py:
--------------------------------------------------------------------------------
1 | import click
2 | import sys
3 |
4 | import requests
5 |
6 |
7 | # ============================================================================
8 | class Settings:
9 | quiet_mode = False
10 | sesh = None
11 |
12 | server_prefix = None
13 | shepherd_prefix = None
14 | view_browsers_prefix = None
15 |
16 |
17 | settings = Settings()
18 |
19 |
20 | # ============================================================================
21 | @click.group()
22 | @click.option(
23 | '--server',
24 | metavar='',
25 | type=str,
26 | default='http://localhost:8000',
27 | help='The Browsertrix server url',
28 | )
29 | @click.option(
30 | '--shepherd',
31 | metavar='',
32 | type=str,
33 | default='http://localhost:9020',
34 | help='The Shepherd server url',
35 | )
36 | @click.option(
37 | '-q',
38 | '--quiet',
39 | is_flag=True,
40 | default=False,
41 | type=bool,
42 | help='quiet mode: print only crawl ids if success',
43 | )
44 | def cli(server, quiet, shepherd):
45 | settings.server_prefix = server
46 |
47 | settings.shepherd_prefix = shepherd
48 | settings.view_browsers_prefix = shepherd + '/attach/'
49 |
50 | settings.sesh = requests.session()
51 |
52 | settings.quiet_mode = quiet
53 |
54 |
55 | # ============================================================================
56 | def is_quiet():
57 | return settings.quiet_mode
58 |
59 |
60 | # ============================================================================
61 | def ensure_success(res, exit=True):
62 | """ Ensure API response is successful
63 | print error and exit if not
64 |
65 | :param res: Response from requests
66 | :param exit: Exit on any error
67 | :return: parsed JSON response as dict
68 | """
69 | if res.status_code == 200:
70 | json = res.json()
71 | return json
72 |
73 | if not is_quiet():
74 | print('Error response from API server')
75 | print('{0}: {1}'.format(res.status_code, res.text))
76 |
77 | if exit:
78 | sys.exit(1)
79 |
80 |
81 | # ============================================================================
82 | def conn_error_exit(url):
83 | if not is_quiet():
84 | print(
85 | 'Unable to connect to {0}. Is Browsertrix container running in Docker?'.format(
86 | url
87 | )
88 | )
89 | sys.exit(2)
90 |
91 |
92 | # ============================================================================
93 | def sesh_get(url, prefix=None):
94 | url = (prefix or settings.server_prefix) + url
95 | try:
96 | res = settings.sesh.get(url)
97 | return ensure_success(res)
98 | except requests.exceptions.ConnectionError:
99 | conn_error_exit(url)
100 |
101 |
102 | # ============================================================================
103 | def sesh_post(url, json=None, prefix=None):
104 | url = (prefix or settings.server_prefix) + url
105 | try:
106 | res = settings.sesh.post(url, json=json)
107 | return ensure_success(res)
108 | except requests.exceptions.ConnectionError:
109 | conn_error_exit(url)
110 |
111 |
112 | # ============================================================================
113 | def sesh_delete(url, prefix=None):
114 | url = (prefix or settings.server_prefix) + url
115 | try:
116 | res = settings.sesh.delete(url)
117 | return ensure_success(res, exit=False)
118 | except requests.exceptions.ConnectionError:
119 | conn_error_exit(url)
120 |
--------------------------------------------------------------------------------
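The helpers above are what the real subcommands in `browsertrix_cli/crawl.py` and `browsertrix_cli/profile.py` build on. As a hedged sketch, a hypothetical extra subcommand (not part of the CLI) could plug into the same `cli` group and session helpers like this:

```python
# Hypothetical subcommand, for illustration only.
import click

from browsertrix_cli.basecli import cli, is_quiet, sesh_get


@cli.command(name='ping', help='Check that the Browsertrix API is reachable')
def ping():
    # sesh_get() prefixes settings.server_prefix (default http://localhost:8000)
    # and exits with an error message if the API container is not running.
    res = sesh_get('/crawls')
    if not is_quiet():
        print('API reachable, {0} crawl(s) registered'.format(len(res['crawls'])))
```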
/browsertrix_cli/crawl.py:
--------------------------------------------------------------------------------
1 | import click
2 | import datetime
3 | import docker
4 | import sys
5 | import time
6 | import yaml
7 | import webbrowser
8 |
9 | from collections import defaultdict
10 |
11 |
12 | from browsertrix_cli.basecli import (
13 | cli,
14 | is_quiet,
15 | sesh_get,
16 | sesh_post,
17 | sesh_delete,
18 | settings,
19 | )
20 | from browsertrix_cli.profile import get_profile_image
21 |
22 |
23 | COLUMNS = [
24 | ('id', 'CRAWL ID', 12),
25 | ('name', 'NAME', 12),
26 | ('start_time', 'STARTED', 12),
27 | ('finish_time', 'DURATION', 12),
28 | ('status', 'STATUS', 7),
29 | ('crawl_type', 'CRAWL TYPE', 12),
30 | ('coll', 'COLL', 16),
31 | ('mode', 'MODE', 8),
32 | ('num_queue', 'TO CRAWL', 8),
33 | ('num_pending', 'PENDING', 8),
34 | ('num_seen', 'SEEN', 8),
35 | ('num_browsers', 'BROWSERS', 9),
36 | ('num_tabs', 'TABS', 3),
37 | ]
38 |
39 |
40 | # ============================================================================
41 | @cli.group(help='Commands for working with crawls')
42 | def crawl():
43 | pass
44 |
45 |
46 | # ============================================================================
47 | def format_duration(start_time, finish_time):
48 | """ Format duration of crawl
49 |
50 | :param start_time: start time of crawl
51 | :param finish_time: finish time of crawl
52 |     :return: formatted string of the elapsed time between start_time and finish_time
53 | """
54 | try:
55 | if start_time == 0:
56 | return '-'
57 |
58 | if not finish_time:
59 | finish = datetime.datetime.now()
60 | else:
61 | finish = datetime.datetime.fromtimestamp(int(finish_time))
62 |
63 | start = datetime.datetime.fromtimestamp(int(start_time))
64 | elapsed = finish - start
65 | return str(elapsed).split('.', 1)[0]
66 | except Exception:
67 | return start_time
68 |
69 |
70 | # ============================================================================
71 | def print_container_log(
72 | docker_api, reqid, name='autobrowser-', follow=False, wait=False
73 | ):
74 |
75 | full_name = name + reqid
76 | while True:
77 | try:
78 | container = docker_api.containers.get(full_name)
79 | break
80 | except docker.errors.NotFound:
81 | if not wait:
82 | return False
83 |
84 | print('Waiting for Logs...')
85 | time.sleep(0.25)
86 | continue
87 |
88 | print('---- Logs for Crawl {0}: {1} ----'.format(reqid, full_name))
89 | res = container.logs(follow=follow, stream=True)
90 | for line in res:
91 | sys.stdout.write(line.decode('utf-8'))
92 |
93 | print('-----------------------------------')
94 | print('')
95 | print('')
96 |
97 | return True
98 |
99 |
100 | # ============================================================================
101 | def print_logs(browsers, follow=False, wait=False, all_containers=False):
102 | docker_api = docker.from_env(version='auto')
103 |
104 | if follow is None:
105 | follow = False
106 |
107 | for reqid in browsers:
108 | if all_containers:
109 | print_container_log(
110 | docker_api, reqid, wait=False, follow=False, name='browser-'
111 | )
112 |
113 | print_container_log(
114 | docker_api, reqid, wait=False, follow=False, name='xserver-'
115 | )
116 |
117 | print_container_log(docker_api, reqid, wait=wait, follow=follow)
118 |
119 |
120 | # ============================================================================
121 | def open_browsers(browsers, crawl_id, tabs_done=None, num_tabs=-1):
122 | count = 1
123 | for reqid in browsers:
124 | skip = False
125 | if not tabs_done or tabs_done.get(reqid) != num_tabs:
126 | msg = 'Opening Browser {0} of {1} ({2}) for crawl {3}'
127 | else:
128 |             msg = 'Skipping Finished Browser {0} of {1} ({2}) for crawl {3}'
129 | skip = True
130 |
131 | if not is_quiet():
132 | print(msg.format(count, len(browsers), reqid, crawl_id))
133 |
134 | if not skip:
135 | webbrowser.open(settings.view_browsers_prefix + reqid)
136 | count += 1
137 |
138 |
139 | # ============================================================================
140 | @crawl.command(name='list', help='List all crawls')
141 | def list_crawls():
142 | """ List all available crawls
143 | """
144 | res = sesh_get('/crawls')
145 |
146 | sorted_list = sorted(res['crawls'], key=lambda x: x['start_time'], reverse=True)
147 |
148 | if is_quiet():
149 | for crawl in sorted_list:
150 | print(crawl['id'])
151 |
152 | return
153 |
154 | format_str = '{value: <{size}} '
155 |
156 | for _, text, size in COLUMNS:
157 | sys.stdout.write(format_str.format(value=text, size=size))
158 | print()
159 |
160 | for crawl in sorted_list:
161 | for field, _, size in COLUMNS:
162 | value = crawl[field]
163 | if field == 'start_time':
164 | value = format_duration(value, None) + ' ago'
165 | elif field == 'finish_time':
166 | value = format_duration(crawl['start_time'], value)
167 |
168 | sys.stdout.write(format_str.format(value=value, size=size))
169 | print()
170 | print()
171 |
172 |
173 | # ============================================================================
174 | @crawl.command(
175 | name='create', help='Create (and optionally start) new crawl from yaml crawl spec'
176 | )
177 | @click.option(
178 | '--start/--no-start',
179 | default=True,
180 | help="Start/Don't start crawl immediately after creation",
181 | )
182 | @click.option(
183 | '--browser',
184 | type=str,
185 | default=None,
186 |     help='Browser Docker image to use for crawling (overrides setting in spec)',
187 | )
188 | @click.option(
189 | '--profile',
190 | type=str,
191 | default=None,
192 | help='Browser Profile Docker image to use for crawling (overrides "browser" option)',
193 | )
194 | @click.option(
195 | '--coll',
196 | type=str,
197 | default=None,
198 | help='Set the collection (overrides setting in spec)',
199 | )
200 | @click.option(
201 | '--mode',
202 | type=str,
203 | default=None,
204 | help='Set the capture mode (overrides setting in spec)',
205 | )
206 | @click.option(
207 | '--screenshot_coll',
208 | type=str,
209 | default=None,
210 | help='Set the collection to save screenshots (overrides setting in spec)',
211 | )
212 | @click.option(
213 |     '--headless',
214 |     is_flag=True,
215 |     default=None,
216 |     help='Use headless mode. Browsers cannot be opened for watching the crawl',
217 | )
218 | @click.option(
219 | '--behavior-time',
220 | default=None,
221 | type=int,
222 |     help='Max duration (in seconds) to run each in-page behavior',
223 | )
224 | @click.option(
225 | '--watch',
226 | is_flag=True,
227 | default=False,
228 | type=bool,
229 | help='Watch all started browsers in a local browser (only if starting crawl)',
230 | )
231 | @click.option(
232 | '--log',
233 | is_flag=True,
234 | default=False,
235 | type=bool,
236 | help='Tail the log for the browser crawler',
237 | )
238 | @click.argument('crawl_spec_file', type=click.File('rt'))
239 | def create_crawl(
240 | crawl_spec_file,
241 | start,
242 | browser,
243 | profile,
244 | coll,
245 | mode,
246 | screenshot_coll,
247 | headless,
248 | behavior_time,
249 | watch,
250 | log,
251 | ):
252 | """ Create a new crawl!
253 |
254 | :param crawl_spec_file: YAML file with one or more crawls in 'crawls' key
255 | :param start: If true, start crawl immediately after creation
256 | :param browser: Browser Docker image to use for crawling (overrides setting in spec)
257 | :param profile: Browser Profile Docker image to use for crawling (overrides "browser" setting)
258 | :param coll: Set the collection (overrides setting in spec)
259 | :param mode: Set the capture mode (overrides setting in spec)
260 | :param screenshot_coll: Set the collection to save screenshots (overrides setting in spec)
261 |     :param headless: Use headless mode. Browsers cannot be opened for watching the crawl
262 | :param behavior_time: Max duration (in seconds) to run each in-page behavior
263 |     :param watch: Watch all started browsers in a local browser (only if starting crawl)
264 |     :param log: Tail the log for the browser crawler
265 |     """
266 | root = yaml.load(crawl_spec_file, Loader=yaml.Loader)
267 |
268 | for crawl_spec in root['crawls']:
269 | if not start:
270 | msg = 'Created'
271 | else:
272 | msg = 'Created and Started'
273 |
274 | if headless is not None:
275 | crawl_spec['headless'] = headless
276 |
277 | if behavior_time is not None:
278 | crawl_spec['behavior_time'] = behavior_time
279 |
280 | if profile is not None:
281 | crawl_spec['profile'] = profile
282 |
283 |         if 'profile' in crawl_spec:
284 |             crawl_spec['browser'] = get_profile_image(crawl_spec['profile'])
285 |
286 |         elif browser is not None:
287 |             crawl_spec['browser'] = browser
288 |
289 | if coll is not None:
290 | crawl_spec['coll'] = coll
291 |
292 | if mode is not None:
293 | crawl_spec['mode'] = mode
294 |
295 | if screenshot_coll is not None:
296 | crawl_spec['screenshot_coll'] = screenshot_coll
297 |
298 | if not is_quiet():
299 | print('Creating New Crawl, Please Wait...')
300 |
301 | res = sesh_post('/crawls', json=crawl_spec)
302 |
303 | if is_quiet():
304 | print(res['id'])
305 | else:
306 | print('Crawl {0}: {1}'.format(msg, res['id']))
307 | print('Status: {0}'.format(res['status']))
308 |
309 | if watch:
310 | if not start:
311 | if not is_quiet():
312 | print("Can't watch, crawl not started")
313 |
314 | elif headless:
315 | if not is_quiet():
316 | print("Can't watch, crawl is running in headless mode")
317 |
318 | else:
319 | open_browsers(res['browsers'], res['id'])
320 |
321 | if log:
322 | print_logs(res['browsers'], follow=True, wait=True)
323 |
324 |
325 | # ============================================================================
326 | @crawl.command(name='start', help='Start an existing crawl')
327 | @click.argument('crawl_id', nargs=-1)
328 | def start_crawl(crawl_id):
329 | """ Start an existing crawl
330 |
331 | :param crawl_id: list of crawl ids to start
332 | """
333 | for id_ in crawl_id:
334 | res = sesh_post('/crawl/{0}/start'.format(id_))
335 |
336 | if is_quiet():
337 | print(res['id'])
338 | else:
339 | print('Started Crawl: {0}'.format(res['id']))
340 |
341 |
342 | # ============================================================================
343 | @crawl.command(name='info', help='Get info on existing crawl(s)')
344 | @click.argument('crawl_id', nargs=-1)
345 | @click.option(
346 | '--urls/--no-urls',
347 | default=False,
348 | help='Get detailed info on crawl, listing all urls',
349 | )
350 | def get_info(crawl_id, urls):
351 | """ Get info on existing crawl(s)
352 |
353 | :param crawl_id: list of crawl ids to get info on
354 | :param urls: Get detailed info on crawl, listing all urls
355 | """
356 | for id_ in crawl_id:
357 | if urls:
358 | res = sesh_get('/crawl/{0}/info'.format(id_))
359 | else:
360 | res = sesh_get('/crawl/{0}'.format(id_))
361 |
362 | print(yaml.dump(res))
363 |
364 |
365 | # ============================================================================
366 | @crawl.command(name='watch', help='Watch crawling browsers in local browser')
367 | @click.argument('crawl_id', nargs=-1)
368 | def watch_crawl(crawl_id):
369 | """ Watch crawling browsers in local browser
370 |
371 | :param crawl_id: list of crawl ids to watch
372 | """
373 | for id_ in crawl_id:
374 | res = sesh_get('/crawl/{0}'.format(id_))
375 |
376 | if res.get('headless'):
377 | if not is_quiet():
378 |                 print("Cannot watch, crawl is running in headless mode")
379 | continue
380 |
381 | if res.get('status') != 'running':
382 | if not is_quiet():
383 | print('Crawl not running: {0}'.format(id_))
384 | continue
385 |
386 | browsers = res['browsers']
387 |
388 | done_count = defaultdict(int)
389 |
390 |         for info in res.get('tabs_done', []):
391 | done_count[info['id']] += 1
392 |
393 | if not browsers:
394 | if not is_quiet():
395 | print('No Browsers')
396 | continue
397 |
398 | open_browsers(browsers, id_, done_count, res['num_tabs'])
399 |
400 |
401 | # ============================================================================
402 | @crawl.command(name='stop', help='Stop one or more existing crawls')
403 | @click.argument('crawl_id', nargs=-1)
404 | def stop_crawl(crawl_id):
405 | """ Stop one or more existing crawls
406 |
407 | :param crawl_id: list of crawl ids to stop
408 | """
409 | for id_ in crawl_id:
410 | res = sesh_post('/crawl/{0}/stop'.format(id_))
411 |
412 | if not res.get('success'):
413 |             print('Error stopping {0}: {1}'.format(id_, res))
414 | return
415 |
416 | if is_quiet():
417 | print(id_)
418 | else:
419 | print('Stopped Crawl: {0}'.format(id_))
420 |
421 |
422 | # ============================================================================
423 | @crawl.command(name='remove', help='Remove one or more existing crawls')
424 | @click.argument('crawl_id', nargs=-1)
425 | def remove_crawl(crawl_id):
426 | """ Remove one or more existing crawls
427 |
428 |     :param crawl_id: list of crawl ids to remove
429 | """
430 | for id_ in crawl_id:
431 | res = sesh_delete('/crawl/{0}'.format(id_))
432 |
433 | if not res.get('success'):
434 |             print('Error removing {0}: {1}'.format(id_, res))
435 | return
436 |
437 | if is_quiet():
438 | print(id_)
439 | else:
440 | print('Removed Crawl: {0}'.format(id_))
441 |
442 |
443 | # ============================================================================
444 | @crawl.command(name='remove-all', help='Stop and remove all crawls')
445 | def remove_all():
446 | """ Stop and remove all crawls
447 | """
448 | res = sesh_get('/crawls')
449 |
450 | crawls = res['crawls']
451 |
452 | for crawl in crawls:
453 | id_ = crawl['id']
454 | res = sesh_delete('/crawl/{0}'.format(id_))
455 | if not is_quiet():
456 | print('Removed Crawl: {0}'.format(id_))
457 |
458 |
459 | # ============================================================================
460 | @crawl.command(name='logs', help='View crawl logs for one or all crawlers')
461 | @click.argument('crawl_id', nargs=1)
462 | @click.option(
463 | '-b',
464 | '--browser',
465 | type=int,
466 | default=0,
467 | help='1-based index of browser to show logs for, or 0 for all (default)',
468 | )
469 | @click.option(
470 | '-f',
471 | '--follow',
472 | type=bool,
473 | default=False,
474 | is_flag=True,
475 | help='follow crawl log in real-time',
476 | )
477 | @click.option(
478 | '-a',
479 | '--all-containers',
480 | type=bool,
481 | default=False,
482 | is_flag=True,
483 | help='include logs from all containers, not just crawler',
484 | )
485 | def logs(crawl_id, browser, follow, all_containers):
486 | """ View crawl logs for one or all crawlers
487 | :param crawl_id: The crawl_id to view logs for
488 | :param browser: 1-based index of browser to show logs for, or 0 for all (default)
489 | :param follow: follow crawl log in real-time (for one browser only)
490 | :param all_containers: include logs from all containers, not just crawler
491 | """
492 | res = sesh_get('/crawl/{0}'.format(crawl_id))
493 |
494 | num_browsers = len(res['browsers'])
495 | if browser <= 0:
496 | print_logs(res['browsers'], follow=follow, all_containers=all_containers)
497 | elif browser > num_browsers:
498 | print(
499 |             'Crawl has {0} browsers. Index must be 1 to {0}'.format(
500 |                 num_browsers
501 | )
502 | )
503 | else:
504 | print_logs(
505 | [res['browsers'][browser - 1]], follow=follow, all_containers=all_containers
506 | )
507 |
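
For orientation, a minimal sketch (not taken from the repository; the field values are illustrative) of the crawl-spec shape that create_crawl consumes: a YAML document with a top-level 'crawls' list whose entries may carry the same keys the CLI options above can override (browser, profile, coll, mode, screenshot_coll, headless, behavior_time).

import yaml

# Hypothetical spec: only the top-level 'crawls' key and the override
# field names come from create_crawl() above; the values are made up.
SPEC = """
crawls:
  - name: example-crawl
    coll: example-coll
    mode: record
    browser: chrome:73
    behavior_time: 60
"""

root = yaml.load(SPEC, Loader=yaml.Loader)
for crawl_spec in root['crawls']:
    # create_crawl() applies any CLI overrides at this point, then posts
    # each spec with sesh_post('/crawls', json=crawl_spec).
    print(crawl_spec['name'], crawl_spec['coll'], crawl_spec['browser'])
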
--------------------------------------------------------------------------------
/browsertrix_cli/main.py:
--------------------------------------------------------------------------------
1 | from browsertrix_cli.basecli import cli
2 | import browsertrix_cli.profile
3 | import browsertrix_cli.crawl
4 |
5 |
6 | # ============================================================================
7 | if __name__ == '__main__':
8 | cli()
9 |
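
main.py only needs the side-effect imports: loading browsertrix_cli.crawl and browsertrix_cli.profile registers their command groups on the shared cli object. A minimal sketch, assuming nothing beyond click's bundled test runner, of invoking that same entry point in-process (only --help is called, so no running Browsertrix API is required):

from click.testing import CliRunner

from browsertrix_cli.basecli import cli
import browsertrix_cli.crawl    # noqa: F401  registers the 'crawl' group
import browsertrix_cli.profile  # noqa: F401  registers the 'profile' group

runner = CliRunner()
result = runner.invoke(cli, ['--help'])
print(result.exit_code)  # 0 if help rendered successfully
print(result.output)     # should list the 'crawl' and 'profile' groups
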
--------------------------------------------------------------------------------
/browsertrix_cli/profile.py:
--------------------------------------------------------------------------------
1 | import click
2 | import docker
3 | import sys
4 | import time
5 | import webbrowser
6 |
7 |
8 | from browsertrix_cli.basecli import cli, is_quiet, sesh_get, settings
9 |
10 |
11 | # ============================================================================
12 | docker_api = None
13 |
14 | PROFILE_PREFIX = 'oldwebtoday/profile:'
15 |
16 | LABEL_BROWSERPROFILE = 'wr.browserprofile'
17 | LABEL_BASEBROWSER = 'wr.basebrowser'
18 |
19 |
20 | # ============================================================================
21 | def get_profile_image(profile):
22 | try:
23 | global docker_api
24 | if not docker_api:
25 | docker_api = docker.from_env(version='auto')
26 |
27 | image_name = PROFILE_PREFIX + profile
28 | image = docker_api.images.get(image_name)
29 | assert image.labels.get(LABEL_BROWSERPROFILE) == profile
30 | return 'profile:' + profile
31 |
32 | except (docker.errors.ImageNotFound, AssertionError):
33 | if not is_quiet():
34 | print('Profile "{0}" not found'.format(profile))
35 | sys.exit(1)
36 |
37 |
38 | # ============================================================================
39 | @cli.group(help='Commands for creating/removing browser profiles')
40 | def profile():
41 | global docker_api
42 | docker_api = docker.from_env(version='auto')
43 |
44 |
45 | # ============================================================================
46 | @profile.command(name='list', help='List Profiles')
47 | def list_profiles():
48 | res = docker_api.images.list(filters={'label': LABEL_BROWSERPROFILE})
49 |
50 | format_str = '{profile: <16} {base}'
51 | if not is_quiet():
52 | print(format_str.format(profile='PROFILE', base='BASE BROWSER'))
53 |
54 | for image in res:
55 | if not image.tags:
56 | continue
57 |
58 | if not image.tags[0].startswith(PROFILE_PREFIX):
59 | continue
60 |
61 | profile = image.tags[0][len(PROFILE_PREFIX) :]
62 | base_browser = image.labels.get(LABEL_BASEBROWSER, '(unknown)')
63 |
64 | if not is_quiet():
65 | print(format_str.format(profile=profile, base=base_browser))
66 | else:
67 | print(profile)
68 |
69 | if not is_quiet():
70 | print()
71 |
72 |
73 | # ============================================================================
74 | @profile.command(name='remove', help='Remove Profile')
75 | @click.argument('profile', type=str)
76 | def remove_profile(profile):
77 | full_tag = PROFILE_PREFIX + profile
78 |
79 | try:
80 | docker_api.images.remove(full_tag, force=True, noprune=False)
81 | if not is_quiet():
82 | print('Removed profile "{0}"!'.format(profile))
83 |
84 | except docker.errors.ImageNotFound:
85 | if not is_quiet():
86 | print('Profile "{0}" not found'.format(profile))
87 | sys.exit(1)
88 |
89 |
90 | # ============================================================================
91 | @profile.command(name='create', help='Create Profile')
92 | @click.option(
93 | '--browser', default='chrome:73', type=str, help='Base Browser Image to Extend'
94 | )
95 | def create_profile(browser):
96 | res = sesh_get(
97 | '/api/request/{0}/about:blank'.format(browser), prefix=settings.shepherd_prefix
98 | )
99 |
100 | reqid = res.get('reqid')
101 |
102 | curr_browser = None
103 |
104 | webbrowser.open(settings.view_browsers_prefix + reqid)
105 |
106 | print('A new browser window should have been opened')
107 | print(
108 |         'You can use the browser to log in to accounts or otherwise prepare the browser profile'
109 | )
110 | print('(The content will not be recorded to WARC)')
111 |
112 | while True:
113 | profile_name = click.prompt(
114 | 'When done, please enter a new name to save the browser profile', type=str
115 | )
116 |
117 | if not curr_browser:
118 | curr_browser = docker_api.containers.get('browser-' + reqid)
119 |
120 | # exit_code, output = curr_browser.exec_run('/app/prep-commit.sh')
121 | exit_code, output = curr_browser.exec_run('pkill -f "/usr/bin/google-chrome"')
122 | if not is_quiet():
123 | print('Killed Chrome to Save Profile for Commit')
124 | print('Result: {0}'.format(exit_code))
125 | print(output.decode('utf-8'))
126 |
127 | time.sleep(1.5)
128 |
129 | conf = {
130 | 'Labels': {LABEL_BROWSERPROFILE: profile_name, LABEL_BASEBROWSER: browser}
131 | }
132 |
133 | res = curr_browser.commit(
134 | repository=PROFILE_PREFIX[:-1],
135 | tag=profile_name,
136 | message='Browser Profile',
137 | conf=conf,
138 | )
139 |
140 | if not is_quiet():
141 | print('Created Image: {0} ({1})'.format(res.tags[0], res.short_id))
142 |
143 | print('The browser should have restarted to about:blank')
144 | if not click.confirm('Continue browsing to create another profile?'):
145 | break
146 |
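
Profiles are ordinary Docker images tagged oldwebtoday/profile:<name> and labelled wr.browserprofile=<name>; get_profile_image resolves such an image to the 'profile:<name>' string that create_crawl places in a spec's browser field. A minimal sketch of checking for a profile locally with the Docker SDK, reusing those same conventions ('my-profile' is a hypothetical name):

import docker

PROFILE_PREFIX = 'oldwebtoday/profile:'
LABEL_BROWSERPROFILE = 'wr.browserprofile'


def has_profile(name):
    """Return True if a usable profile image exists locally."""
    client = docker.from_env(version='auto')
    try:
        image = client.images.get(PROFILE_PREFIX + name)
    except docker.errors.ImageNotFound:
        return False
    # The label must echo the profile name, mirroring get_profile_image().
    return image.labels.get(LABEL_BROWSERPROFILE) == name


if __name__ == '__main__':
    print(has_profile('my-profile'))  # hypothetical profile name
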
--------------------------------------------------------------------------------
/cli-requirements.txt:
--------------------------------------------------------------------------------
1 | click
2 | docker
3 | pyyaml
4 | requests
5 |
--------------------------------------------------------------------------------
/dev-requirements.txt:
--------------------------------------------------------------------------------
1 | flake8
2 | flake8-bugbear
3 | flake8-mypy
4 | mypy
5 | pytest
6 | pytest-asyncio
7 | black
8 | mock
9 | requests
10 | fakeredis
11 | PyYAML
12 |
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '3.5'
2 |
3 | services:
4 | browsertrix:
5 | image: webrecorder/browsertrix
6 | build:
7 | context: .
8 | environment:
9 | - REDIS_URL=redis://redis/0
10 | - DEFAULT_POOL=auto-pool
11 | - SCREENSHOT_API_URL=http://pywb:8080/api/screenshot/{coll}
12 | - EXTRACTED_RAW_DOM_API_URL=http://pywb:8080/api/dom/{coll}
13 | - PROXY_HOST=pywb
14 |
15 | depends_on:
16 | - redis
17 | - pywb
18 |
19 | ports:
20 | - 8000:8000
21 |
22 | #volumes:
23 | # - ./:/app/
24 |
25 | pywb:
26 | build: ./pywb/
27 |
28 | environment:
29 | - REDIS_URL=redis://redis/0
30 |
31 | volumes:
32 | - ./webarchive:/webarchive
33 |
34 | networks:
35 | - default
36 | - browsers
37 |
38 | ports:
39 | - 8180:8080
40 |
41 | depends_on:
42 | - redis
43 |
44 | solr:
45 | image: solr
46 |
47 | volumes:
48 | - ./solr:/var/solr
49 |
50 | entrypoint:
51 | - docker-entrypoint.sh
52 | - solr-precreate
53 | - browsertrix
54 |
55 | ports:
56 | - 8983:8983
57 |
58 | shepherd:
59 | image: oldwebtoday/shepherd:1.1.0-dev
60 |
61 | environment:
62 | - BROWSER_NET=browsertrix_browsers
63 | - MAIN_NET=browsertrix_default
64 |
65 | - PROXY_HOST=pywb
66 | - PROXY_PORT=8080
67 |
68 | depends_on:
69 | - redis
70 |
71 | volumes:
72 | - /var/run/docker.sock:/var/run/docker.sock
73 | - ./flocks:/app/flocks
74 | - ./pool_config.yaml:/app/pool_config.yaml
75 |
76 | ports:
77 | - 9020:9020
78 |
79 | redis:
80 | image: redis:3.2.4
81 |
82 | behaviors:
83 | image: webrecorder/behaviors
84 |
85 | ports:
86 | - 3030:3030
87 |
88 | networks:
89 | default:
90 | driver: bridge
91 |
92 | browsers:
93 | driver: bridge
94 |
95 |
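
With this stack up (docker-compose up), the API is published on host port 8000 and pywb on 8180. A minimal sketch of listing crawls from the host with requests, assuming the default port mapping above and the /crawls endpoint used by the CLI:

import requests

# Assumes the 8000:8000 mapping declared for the browsertrix service above.
API = 'http://localhost:8000'

resp = requests.get(API + '/crawls', timeout=5)
resp.raise_for_status()
for crawl in resp.json().get('crawls', []):
    print(crawl['id'], crawl.get('status'))
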
--------------------------------------------------------------------------------
/flocks/browsers-headless.yaml:
--------------------------------------------------------------------------------
1 | name: browsers-headless
2 | auto_remove: false
3 |
4 | volumes:
5 | tmpdir: '/tmp/.X11-unix'
6 |
7 | containers:
8 | - name: browser
9 | image: oldwebtoday/base-browser
10 | set_user_params: true
11 | external_network: '${BROWSER_NET}'
12 |
13 | environment:
14 | DISPLAY: ':99'
15 | PULSE_SERVER: '/tmp/.X11-unix/pulse-socket'
16 |
17 | PROXY_HOST: 'pywb'
18 | PROXY_PORT: '8080'
19 | PROXY_CA_URL: 'http://wsgiprox/download/pem'
20 | PROXY_CA_FILE: '/tmp/proxy-ca.pem'
21 |
22 | - name: autobrowser
23 | image: webrecorder/autobrowser
24 |
25 | external_network: '${MAIN_NET}'
26 |
27 | environment:
28 | BROWSER_HOST: 'browser'
29 | REDIS_URL: 'redis://redis:6379/0'
30 | TAB_TYPE: 'CrawlerTab'
31 |
32 |
33 |
--------------------------------------------------------------------------------
/flocks/browsers.yaml:
--------------------------------------------------------------------------------
1 | name: browsers
2 | auto_remove: false
3 |
4 | volumes:
5 | tmpdir: '/tmp/.X11-unix'
6 |
7 | containers:
8 | - name: xserver
9 | image: oldwebtoday/vnc-webrtc-audio
10 | ports:
11 | vnc_port: 6080
12 | cmd_port: 6082
13 | ice_tcp_port: 10235
14 | ice_udp_port: '10235/udp'
15 |
16 | environment:
17 | IDLE_TIMEOUT: '${IDLE_TIMEOUT}'
18 |
19 | - name: browser
20 | image: oldwebtoday/base-browser
21 | image_label: wr.name
22 | set_user_params: true
23 | external_network: '${BROWSER_NET}'
24 |
25 | environment:
26 | DISPLAY: ':99'
27 | PULSE_SERVER: '/tmp/.X11-unix/pulse-socket'
28 |
29 | #default to no proxy, set by crawler
30 | #PROXY_HOST: 'pywb'
31 | PROXY_PORT: '8080'
32 | PROXY_CA_URL: 'http://wsgiprox/download/pem'
33 | PROXY_CA_FILE: '/tmp/proxy-ca.pem'
34 |
35 | - name: autobrowser
36 | image: webrecorder/autobrowser
37 |
38 | external_network: '${MAIN_NET}'
39 |
40 | environment:
41 | BROWSER_HOST: 'browser'
42 | REDIS_URL: 'redis://redis:6379/0'
43 | TAB_TYPE: 'CrawlerTab'
44 |
45 |
46 |
--------------------------------------------------------------------------------
/frontend/.eslintrc.js:
--------------------------------------------------------------------------------
1 | module.exports = {
2 | extends: ['plugin:prettier/recommended', 'prettier/react'],
3 | parser: 'babel-eslint',
4 | parserOptions: {
5 | ecmaVersion: 10
6 | },
7 | env: {
8 | browser: true,
9 | node: true
10 | }
11 | };
12 |
--------------------------------------------------------------------------------
/frontend/.prettierrc:
--------------------------------------------------------------------------------
1 | {
2 | "singleQuote": true,
3 | "jsxSingleQuote": true
4 | }
5 |
--------------------------------------------------------------------------------
/frontend/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "crawlmanager-frontend",
3 | "version": "1.0.0",
4 | "main": "index.js",
5 |   "license": "Apache-2.0",
6 | "dependencies": {
7 | "@hot-loader/react-dom": "16.8.6",
8 | "fstream": "^1.0.12",
9 | "immutable": "^4.0.0-rc.12",
10 | "js-yaml": "^3.13.1",
11 | "plur": "^3.1.1",
12 | "prop-types": "^15.7.2",
13 | "react": "^16.8.6",
14 | "react-dom": "^16.8.6",
15 | "react-redux": "^6.0.1",
16 | "react-router": "^5.0.0",
17 | "react-router-dom": "^5.0.0",
18 | "react-table": "^6.9.2",
19 | "react-toastify": "^5.0.0-rc.3",
20 | "react-virtualized": "^9.21.0",
21 | "redux": "^4.0.1",
22 | "redux-actions": "^2.6.5",
23 | "redux-batched-actions": "^0.4.1",
24 | "redux-devtools-extension": "^2.13.8",
25 | "redux-form": "^8.1.0",
26 | "redux-immutable": "^4.0.0",
27 | "redux-promise": "^0.6.0",
28 | "redux-thunk": "^2.3.0",
29 | "uikit": "^3.0.3",
30 | "url-regex": "^4.1.1"
31 | },
32 | "devDependencies": {
33 | "@babel/cli": "^7.4.3",
34 | "@babel/core": "^7.4.3",
35 | "@babel/plugin-proposal-class-properties": "^7.4.0",
36 | "@babel/plugin-proposal-decorators": "^7.4.0",
37 | "@babel/preset-env": "^7.4.3",
38 | "@babel/preset-react": "^7.0.0",
39 | "@types/react": "^16.8.13",
40 | "babel-eslint": "^10.0.1",
41 | "babel-loader": "^8.0.5",
42 | "babel-plugin-add-module-exports": "^1.0.0",
43 | "babel-plugin-transform-react-remove-prop-types": "^0.4.24",
44 | "babel-preset-env": "^1.7.0",
45 | "css-loader": "^2.1.1",
46 | "eslint": "^5.16.0",
47 | "eslint-config-prettier": "^4.1.0",
48 | "eslint-plugin-prettier": "^3.0.1",
49 | "eslint-plugin-react": "^7.12.4",
50 | "fastify": "^2.2.0",
51 | "fastify-static": "^2.4.0",
52 | "fastify-webpack-hmr": "^2.0.1",
53 | "file-loader": "^3.0.1",
54 | "html-webpack-plugin": "^3.2.0",
55 | "node-sass": "^4.11.0",
56 | "prettier": "^1.16.4",
57 | "react-hot-loader": "^4.8.3",
58 | "sass-loader": "^7.1.0",
59 | "style-loader": "^0.23.1",
60 | "url-loader": "^1.1.2",
61 | "webpack": "^4.29.6",
62 | "webpack-cli": "^3.3.0",
63 | "webpack-dev-middleware": "^3.6.2",
64 | "webpack-hot-middleware": "^2.24.3"
65 | },
66 | "scripts": {
67 | "develop": "node ./webpack/development-server.js",
68 | "build-prod": "NODE_ENV=production webpack --config ./webpack/webpack.config.js"
69 | }
70 | }
71 |
--------------------------------------------------------------------------------
/frontend/public/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | HI
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/frontend/src/actions/crawls.js:
--------------------------------------------------------------------------------
1 | import { toast } from 'react-toastify';
2 | import { makeHTTPRequest } from './httpRequests';
3 | import { EndpointRequests } from '../utils';
4 |
5 | export const ActionTypes = {
6 | getAll: Symbol('crawl-get-all'),
7 | gotAll: Symbol('crawl-got-all'),
8 | gotAllInit: Symbol('crawl-got-all-init'),
9 | create: Symbol('crawl-create'),
10 | urls: Symbol('crawl-get-urls'),
11 | addURLs: Symbol('crawl-add-urls'),
12 | updateURLInfo: Symbol('crawl-update-url-info'),
13 | info: Symbol('crawl-get-info'),
14 | stop: Symbol('crawl-stop'),
15 | start: Symbol('crawl-start'),
16 | isDone: Symbol('crawl-is-done'),
17 | deleteCrawl: Symbol('crawl-delete'),
18 | updateInfo: Symbol('crawl-update-info')
19 | };
20 |
21 | export function getAllCrawls(init = false) {
22 | const request = EndpointRequests.retrieveAllCrawls();
23 | return makeHTTPRequest(request, {
24 | onError({ error }) {
25 | toast(`Failed to retrieve info about all crawls: ${error}`, {
26 | type: toast.TYPE.ERROR
27 | });
28 | },
29 | async onResponse({ response }) {
30 | if (!response.ok) {
31 | toast(
32 | `Failed to retrieve info about all crawls: Details
33 | HTTP ${response.status}`,
34 | {
35 | type: toast.TYPE.ERROR
36 | }
37 | );
38 | return;
39 | }
40 | return {
41 | type: init ? ActionTypes.gotAllInit : ActionTypes.gotAll,
42 | payload: await response.json()
43 | };
44 | }
45 | });
46 | }
47 |
48 | export function getCrawlInfo(id) {
49 | const request = EndpointRequests.crawlInfo(id);
50 | return makeHTTPRequest(request, {
51 | onError({ error }) {
52 | toast(`Failed to retrieve the info for crawl - ${id}: ${error}`, {
53 | type: toast.TYPE.ERROR
54 | });
55 | },
56 | async onResponse({ response }) {
57 | const json = await response.json();
58 | if (!response.ok) {
59 | toast(
60 | `Failed to get the crawl info - ${id}: Details
61 | ${json.detail}`,
62 | {
63 | type: toast.TYPE.ERROR
64 | }
65 | );
66 | return;
67 | }
68 | return {
69 | type: ActionTypes.info,
70 | payload: json
71 | };
72 | }
73 | });
74 | }
75 |
76 | export function addCrawlURLs(id, urls) {
77 | const { request } = EndpointRequests.addCrawlURLs(id, urls);
78 | return makeHTTPRequest(request, {
79 | onError({ error }) {
80 | toast(`Failed to add the urls to the crawl - ${id}: ${error}`, {
81 | type: toast.TYPE.ERROR
82 | });
83 | },
84 | async onResponse({ response }) {
85 | const json = await response.json();
86 | if (!response.ok) {
87 | toast(
88 | `Failed to add urls to the crawl - ${id}: Details
89 | ${json.detail}`,
90 | {
91 | type: toast.TYPE.ERROR
92 | }
93 | );
94 | return;
95 | }
96 | return {
97 | type: ActionTypes.addURLs,
98 | payload: {
99 | id,
100 | urls
101 | }
102 | };
103 | }
104 | });
105 | }
106 |
107 | export function getCrawlURLs(id) {
108 | const { request } = EndpointRequests.getCrawlURLs(id);
109 | return makeHTTPRequest(request, {
110 | onError({ error }) {
111 |       toast(`Failed to retrieve the crawl's URLs - ${id}: ${error}`, {
112 | type: toast.TYPE.ERROR
113 | });
114 | },
115 | async onResponse({ response }) {
116 | const json = await response.json();
117 | if (!response.ok) {
118 | toast(
119 |           `Failed to retrieve the crawl's URLs - ${id}: Details
120 | ${json.detail}`,
121 | {
122 | type: toast.TYPE.ERROR
123 | }
124 | );
125 | return;
126 | }
127 | return {
128 | type: ActionTypes.updateURLInfo,
129 | payload: Object.assign(
130 | {
131 | id
132 | },
133 | json
134 | )
135 | };
136 | }
137 | });
138 | }
139 |
140 | /**
141 | *
142 | * @param {Object} [newCrawlConfig]
143 | */
144 | export function createCrawl(newCrawlConfig) {
145 | const { body, request } = EndpointRequests.createNewCrawl(newCrawlConfig);
146 | return makeHTTPRequest(request, {
147 | onError({ error }) {
148 | toast(`Failed to create the new crawl ${error}`, {
149 | type: toast.TYPE.ERROR
150 | });
151 | },
152 | async onResponse({ dispatch, response }) {
153 | const json = await response.json();
154 | if (!response.ok) {
155 | toast(
156 | `Failed to create the crawl - ${json.id}: Details
157 | ${json.detail}`,
158 | {
159 | type: toast.TYPE.ERROR
160 | }
161 | );
162 | return;
163 | }
164 | const request = EndpointRequests.crawlInfo(json.id);
165 | const infoResponse = await fetch(request);
166 | const result = infoResponse.ok
167 | ? await infoResponse.json()
168 | : Object.assign(
169 | {
170 | id: json.id
171 | },
172 | body,
173 | newCrawlConfig.crawlRunInfo
174 | );
175 |
176 | console.log('newly created crawl info', result);
177 | return {
178 | type: ActionTypes.create,
179 | payload: result
180 | };
181 | }
182 | });
183 | }
184 |
185 | /**
186 | *
187 | * @param {string} id
188 | * @param {Object} [startConfig]
189 | */
190 | export function startCrawl(id, startConfig) {
191 | const { body, request } = EndpointRequests.startCrawl(id, startConfig);
192 | return makeHTTPRequest(request, {
193 | onError({ error }) {
194 | toast(`Failed to start the crawl - ${id}: ${error}`, {
195 | type: toast.TYPE.ERROR
196 | });
197 | },
198 | async onResponse({ response }) {
199 | const json = await response.json();
200 | if (!response.ok) {
201 | toast(
202 | `Failed to start the crawl - ${id}: Details
203 | ${json.detail}`,
204 | {
205 | type: toast.TYPE.ERROR
206 | }
207 | );
208 | return;
209 | }
210 | const result = {
211 | id,
212 | ...body
213 | };
214 | console.log('started crawl info', result);
215 | return {
216 | type: ActionTypes.start,
217 | payload: result
218 | };
219 | }
220 | });
221 | }
222 |
223 | export function stopCrawl(id) {
224 | const request = EndpointRequests.stopCrawl(id);
225 | return makeHTTPRequest(request, {
226 | onError({ error }) {
227 |       toast(`Failed to stop the crawl - ${id}: ${error}`, {
228 | type: toast.TYPE.ERROR
229 | });
230 | },
231 | async onResponse({ dispatch, response }) {
232 | const json = await response.json();
233 | if (!response.ok) {
234 | toast(
235 | `Failed to stop the crawl - ${id}: Details
236 | ${json.detail}`,
237 | {
238 | type: toast.TYPE.ERROR
239 | }
240 | );
241 | return;
242 | }
243 | return {
244 | type: ActionTypes.stop,
245 | payload: { id }
246 | };
247 | }
248 | });
249 | }
250 |
251 | export function removeCrawl(id) {
252 | const request = EndpointRequests.removeCrawl(id);
253 | return makeHTTPRequest(request, {
254 | onError({ error }) {
255 | toast(`Failed to remove the crawl - ${id}: ${error}`, {
256 | type: toast.TYPE.ERROR
257 | });
258 | },
259 | async onResponse({ dispatch, response }) {
260 | const json = await response.json();
261 | if (!response.ok) {
262 | toast(
263 | `Failed to remove the crawl - ${id}: Details
264 | ${json.detail}`,
265 | {
266 | type: toast.TYPE.ERROR
267 | }
268 | );
269 | return;
270 | }
271 | return {
272 | type: ActionTypes.deleteCrawl,
273 | payload: { id }
274 | };
275 | }
276 | });
277 | }
278 |
--------------------------------------------------------------------------------
/frontend/src/actions/httpRequests.js:
--------------------------------------------------------------------------------
1 | export const HTTPRequestAction = Symbol('http-request-maker');
2 |
3 | export const FetchStates = {
4 | preflight: Symbol('http-request-preflight'),
5 | inflight: Symbol('http-request-inflight'),
6 | done: Symbol('http-request-done'),
7 | error: Symbol('http-request-errored')
8 | };
9 |
10 | export function requestErrorAction({ error, payload }) {
11 | return {
12 | type: HTTPRequestAction,
13 | payload: Object.assign({ error }, payload)
14 | };
15 | }
16 |
17 | /**
18 | * @typedef {Object} MakeHTTPRequestInit
19 | * @property {function ({dispatch: Function, response: Response}): *} onResponse
20 | * @property {function ({dispatch: Function, error: Error}): *} onError
21 | */
22 |
23 | function requestComplete(nextAction, wasError, url) {
24 | nextAction.meta = nextAction.meta || {};
25 | nextAction.meta.httpRequest = {
26 | url,
27 | state: wasError ? FetchStates.error : FetchStates.done
28 | };
29 | return nextAction;
30 | }
31 |
32 | /**
33 | *
34 | * @param {Request} request
35 | * @param {MakeHTTPRequestInit} init
36 | */
37 | export function makeHTTPRequest(request, { onResponse, onError }) {
38 | return dispatch => {
39 | const init = {
40 | type: HTTPRequestAction,
41 | meta: {
42 | httpRequest: {
43 | state: FetchStates.preflight,
44 | url: request.url
45 | }
46 | }
47 | };
48 | if (!dispatch(init)) return; // no op, this is a duplicate request
49 | let wasError = false;
50 | dispatch(
51 | fetch(request)
52 | .then(response => onResponse({ dispatch, response }))
53 | .catch(error => {
54 | wasError = true;
55 | return onError({ dispatch, error });
56 | })
57 | .then(requestFinished =>
58 | requestComplete(
59 | requestFinished || { type: HTTPRequestAction },
60 | wasError,
61 | request.url
62 | )
63 | )
64 | );
65 | };
66 | }
67 |
--------------------------------------------------------------------------------
/frontend/src/actions/index.js:
--------------------------------------------------------------------------------
1 | export * from './crawls';
2 |
--------------------------------------------------------------------------------
/frontend/src/components/Crawl/Control.js:
--------------------------------------------------------------------------------
1 | import React, { Component } from 'react';
2 | import * as PropTypes from 'prop-types';
3 | import { CrawlRecord } from '../../reducers/crawls';
4 |
5 | export default class Control extends Component {
6 | static propTypes = {
7 | crawl: PropTypes.instanceOf(CrawlRecord).isRequired,
8 | getCrawlInfo: PropTypes.func.isRequired,
9 | startCrawl: PropTypes.func.isRequired,
10 | stopCrawl: PropTypes.func.isRequired,
11 | removeCrawl: PropTypes.func.isRequired
12 | };
13 |
14 | startCrawl() {
15 | const { crawl } = this.props;
16 | this.props.startCrawl(crawl.startCrawlConfig());
17 | }
18 |
19 | constructor(props) {
20 | super(props);
21 | this.startCrawl = this.startCrawl.bind(this);
22 | }
23 |
24 | render() {
25 | const { crawl, getCrawlInfo, removeCrawl, stopCrawl } = this.props;
26 | return (
27 |