├── .coveragerc ├── .dockerignore ├── .flake8 ├── .gitignore ├── .travis.yml ├── Dockerfile ├── LICENSE ├── NOTICE ├── README.md ├── browsertrix ├── __init__.py ├── api.py ├── crawl.py ├── schema.py └── utils.py ├── browsertrix_cli ├── basecli.py ├── crawl.py ├── main.py └── profile.py ├── cli-requirements.txt ├── dev-requirements.txt ├── docker-compose.yml ├── flocks ├── browsers-headless.yaml └── browsers.yaml ├── frontend ├── .eslintrc.js ├── .prettierrc ├── package.json ├── public │ └── index.html ├── src │ ├── actions │ │ ├── crawls.js │ │ ├── httpRequests.js │ │ └── index.js │ ├── components │ │ ├── Crawl │ │ │ ├── Control.js │ │ │ ├── Info.js │ │ │ └── index.js │ │ ├── CrawlCreator │ │ │ ├── CreationForm.js │ │ │ ├── fields.js │ │ │ ├── index.js │ │ │ └── validate.js │ │ ├── Crawls │ │ │ ├── LoadingCrawls.js │ │ │ ├── SelectCrawl.js │ │ │ └── index.js │ │ └── Header │ │ │ ├── HeaderLink.js │ │ │ └── index.js │ ├── containers │ │ └── App.js │ ├── reducers │ │ ├── crawls.js │ │ └── index.js │ ├── root.js │ ├── store │ │ ├── dev.js │ │ ├── index.js │ │ ├── middleware.js │ │ └── prod.js │ ├── styles │ │ └── global.scss │ ├── utils │ │ ├── bootstrap.js │ │ ├── endpoints.js │ │ ├── index.js │ │ └── rhlConfig.js │ └── wrap-with-provider.js ├── webpack │ ├── development-server.js │ └── webpack.config.js └── yarn.lock ├── install-browsers.sh ├── mypy.ini ├── pool_config.yaml ├── pyproject.toml ├── pytest.ini ├── pywb ├── Dockerfile ├── config.yaml ├── crawlapp.py ├── run.sh ├── static │ └── browsertrix-logo.svg ├── templates │ └── fullsearch.html └── uwsgi.ini ├── requirements.txt ├── sample-crawls ├── custom-scopes.yaml ├── emulate-mobile-browser.yaml ├── example.yaml ├── override-browser-http-cookies-language.yaml ├── social-media-replay.yaml └── social-media.yaml ├── scripts ├── format.sh └── lint.sh ├── setup.py ├── static ├── .gitkeep ├── app.js ├── browsertrix-logo.svg └── index.html ├── test-docker-requirements.txt ├── test-local-requirements.txt └── tests ├── __init__.py ├── conftest.py ├── crawl_tests.yaml ├── start-test-compose.sh ├── stop-test-compose.sh ├── test-docker-compose.yml ├── test_api.py ├── test_live_crawl.py ├── test_pool_config.yaml └── utils.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | source = codecov 3 | branch = True 4 | omit = 5 | */test/* 6 | */tests/* 7 | 8 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | pywb 2 | build 3 | webarchive 4 | browsertrix_cli 5 | 6 | # Created by .ignore support plugin (hsz.mobi) 7 | ### Node template 8 | # Logs 9 | logs 10 | *.log 11 | npm-debug.log* 12 | yarn-debug.log* 13 | yarn-error.log* 14 | 15 | # Runtime data 16 | pids 17 | *.pid 18 | *.seed 19 | *.pid.lock 20 | 21 | # Directory for instrumented libs generated by jscoverage/JSCover 22 | lib-cov 23 | 24 | # Coverage directory used by tools like istanbul 25 | coverage 26 | 27 | # nyc test coverage 28 | .nyc_output 29 | 30 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 31 | .grunt 32 | 33 | # Bower dependency directory (https://bower.io/) 34 | bower_components 35 | 36 | # node-waf configuration 37 | .lock-wscript 38 | 39 | # Compiled binary addons (https://nodejs.org/api/addons.html) 40 | build/Release 41 | 42 | # Dependency directories 43 | node_modules/ 44 | jspm_packages/ 45 | 46 | # TypeScript v1 declaration files 47 | typings/ 48 | 49 
| # Optional npm cache directory 50 | .npm 51 | 52 | # Optional eslint cache 53 | .eslintcache 54 | 55 | # Optional REPL history 56 | .node_repl_history 57 | 58 | # Output of 'npm pack' 59 | *.tgz 60 | 61 | # Yarn Integrity file 62 | .yarn-integrity 63 | 64 | # dotenv environment variables file 65 | .env 66 | 67 | # next.js build output 68 | .next 69 | ### JetBrains template 70 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 71 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 72 | 73 | # User-specific stuff 74 | .idea/**/workspace.xml 75 | .idea/**/tasks.xml 76 | .idea/**/dictionaries 77 | .idea/**/shelf 78 | 79 | # Sensitive or high-churn files 80 | .idea/**/dataSources/ 81 | .idea/**/dataSources.ids 82 | .idea/**/dataSources.local.xml 83 | .idea/**/sqlDataSources.xml 84 | .idea/**/dynamic.xml 85 | .idea/**/uiDesigner.xml 86 | .idea/**/dbnavigator.xml 87 | 88 | # Gradle 89 | .idea/**/gradle.xml 90 | .idea/**/libraries 91 | 92 | # CMake 93 | cmake-build-debug/ 94 | cmake-build-release/ 95 | 96 | # Mongo Explorer plugin 97 | .idea/**/mongoSettings.xml 98 | 99 | # File-based project format 100 | *.iws 101 | 102 | # IntelliJ 103 | out/ 104 | 105 | # mpeltonen/sbt-idea plugin 106 | .idea_modules/ 107 | 108 | # JIRA plugin 109 | atlassian-ide-plugin.xml 110 | 111 | # Cursive Clojure plugin 112 | .idea/replstate.xml 113 | 114 | # Crashlytics plugin (for Android Studio and IntelliJ) 115 | com_crashlytics_export_strings.xml 116 | crashlytics.properties 117 | crashlytics-build.properties 118 | fabric.properties 119 | 120 | # Editor-based Rest Client 121 | .idea/httpRequests 122 | ### Example user template template 123 | ### Example user template 124 | 125 | # IntelliJ project files 126 | .idea 127 | *.iml 128 | out 129 | gen### Python template 130 | # Byte-compiled / optimized / DLL files 131 | __pycache__/ 132 | *.py[cod] 133 | *$py.class 134 | 135 | # C extensions 136 | *.so 137 | 138 | # Distribution / packaging 139 | .Python 140 | build/ 141 | develop-eggs/ 142 | dist/ 143 | downloads/ 144 | eggs/ 145 | .eggs/ 146 | lib/ 147 | lib64/ 148 | parts/ 149 | sdist/ 150 | var/ 151 | wheels/ 152 | *.egg-info/ 153 | .installed.cfg 154 | *.egg 155 | MANIFEST 156 | 157 | # PyInstaller 158 | # Usually these files are written by a python script from a template 159 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
160 | *.manifest 161 | *.spec 162 | 163 | # Installer logs 164 | pip-log.txt 165 | pip-delete-this-directory.txt 166 | 167 | # Unit test / coverage reports 168 | htmlcov/ 169 | .tox/ 170 | .coverage 171 | .coverage.* 172 | .cache 173 | nosetests.xml 174 | coverage.xml 175 | *.cover 176 | .hypothesis/ 177 | .pytest_cache/ 178 | 179 | # Translations 180 | *.mo 181 | *.pot 182 | 183 | # Django stuff: 184 | *.log 185 | local_settings.py 186 | db.sqlite3 187 | 188 | # Flask stuff: 189 | instance/ 190 | .webassets-cache 191 | 192 | # Scrapy stuff: 193 | .scrapy 194 | 195 | # Sphinx documentation 196 | docs/_build/ 197 | 198 | # PyBuilder 199 | target/ 200 | 201 | # Jupyter Notebook 202 | .ipynb_checkpoints 203 | 204 | # pyenv 205 | .python-version 206 | 207 | # celery beat schedule file 208 | celerybeat-schedule 209 | 210 | # SageMath parsed files 211 | *.sage.py 212 | 213 | # Environments 214 | .env 215 | .venv 216 | env/ 217 | venv/ 218 | ENV/ 219 | env.bak/ 220 | venv.bak/ 221 | 222 | # Spyder project settings 223 | .spyderproject 224 | .spyproject 225 | 226 | # Rope project settings 227 | .ropeproject 228 | 229 | # mkdocs documentation 230 | /site 231 | 232 | 233 | # mypy 234 | .mypy_cache/ 235 | localCompose 236 | pip-wheel-metadata 237 | scripts 238 | #tests 239 | poetry.lock 240 | pyproject.toml 241 | pytest.ini 242 | README.md 243 | mypy.ini 244 | .flake8 245 | frontend 246 | webarchive 247 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E203, E266, E501, W503 3 | max-line-length = 88 4 | max-complexity = 18 5 | select = B,C,E,F,W,T4,B950 6 | exclude = .git, 7 | __pycache__, 8 | .mypy_cache, 9 | venv, 10 | .venv 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Python template 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | .pytest_cache/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | db.sqlite3 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # pyenv 78 | .python-version 79 | 80 | # celery beat schedule file 81 | celerybeat-schedule 82 | 83 | # SageMath parsed files 84 | *.sage.py 85 | 86 | # Environments 87 | .env 88 | .venv 89 | env/ 90 | venv/ 91 | ENV/ 92 | env.bak/ 93 | venv.bak/ 94 | 95 | # Spyder project settings 96 | .spyderproject 97 | .spyproject 98 | 99 | # Rope project settings 100 | .ropeproject 101 | 102 | # mkdocs documentation 103 | /site 104 | 105 | # mypy 106 | .mypy_cache/ 107 | ### Example user template template 108 | ### Example user template 109 | 110 | # IntelliJ project files 111 | .idea 112 | *.iml 113 | out 114 | gen### JetBrains template 115 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 116 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 117 | 118 | # User-specific stuff 119 | .idea/**/workspace.xml 120 | .idea/**/tasks.xml 121 | .idea/**/dictionaries 122 | .idea/**/shelf 123 | 124 | # Sensitive or high-churn files 125 | .idea/**/dataSources/ 126 | .idea/**/dataSources.ids 127 | .idea/**/dataSources.local.xml 128 | .idea/**/sqlDataSources.xml 129 | .idea/**/dynamic.xml 130 | .idea/**/uiDesigner.xml 131 | .idea/**/dbnavigator.xml 132 | 133 | # Gradle 134 | .idea/**/gradle.xml 135 | .idea/**/libraries 136 | 137 | # CMake 138 | cmake-build-debug/ 139 | cmake-build-release/ 140 | 141 | # Mongo Explorer plugin 142 | .idea/**/mongoSettings.xml 143 | 144 | # File-based project format 145 | *.iws 146 | 147 | # IntelliJ 148 | out/ 149 | 150 | # mpeltonen/sbt-idea plugin 151 | .idea_modules/ 152 | 153 | # JIRA plugin 154 | atlassian-ide-plugin.xml 155 | 156 | # Cursive Clojure plugin 157 | .idea/replstate.xml 158 | 159 | # Crashlytics plugin (for Android Studio and IntelliJ) 160 | com_crashlytics_export_strings.xml 161 | crashlytics.properties 162 | crashlytics-build.properties 163 | fabric.properties 164 | 165 | # Editor-based Rest Client 166 | .idea/httpRequests 167 | 168 | ### Node template 169 | # Logs 170 | logs 171 | *.log 172 | npm-debug.log* 173 | yarn-debug.log* 174 | yarn-error.log* 175 | 176 | # Runtime data 177 | pids 178 | *.pid 179 | *.seed 180 | *.pid.lock 181 | 182 | # Directory for instrumented libs generated by jscoverage/JSCover 183 | lib-cov 184 | 185 | # Coverage directory used by tools like istanbul 186 | coverage 187 | 188 | # nyc test coverage 189 | .nyc_output 190 | 191 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 192 | .grunt 193 | 194 | # Bower dependency directory (https://bower.io/) 195 | bower_components 196 | 197 | # node-waf configuration 198 | .lock-wscript 199 | 200 | # Compiled binary addons (https://nodejs.org/api/addons.html) 201 | build/Release 202 | 203 | # Dependency directories 204 | node_modules/ 205 | jspm_packages/ 206 | 207 | # TypeScript v1 declaration files 208 | 
typings/ 209 | 210 | # Optional npm cache directory 211 | .npm 212 | 213 | # Optional eslint cache 214 | .eslintcache 215 | 216 | # Optional REPL history 217 | .node_repl_history 218 | 219 | # Output of 'npm pack' 220 | *.tgz 221 | 222 | # Yarn Integrity file 223 | .yarn-integrity 224 | 225 | *.tar.gz 226 | **/test-webarchive/ 227 | **/webarchive/ 228 | 229 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | python: 4 | - "3.7" 5 | 6 | os: 7 | - linux 8 | 9 | dist: xenial 10 | 11 | sudo: required 12 | 13 | jobs: 14 | include: 15 | - stage: local tests 16 | script: 17 | - python setup.py install 18 | - pip install -U -r test-local-requirements.txt 19 | - py.test ./tests/test_api.py 20 | 21 | - stage: docker integration tests 22 | 23 | services: 24 | - docker 25 | 26 | env: 27 | - DOCKER_COMPOSE_VERSION=1.23.2 28 | 29 | before_install: 30 | - ./install-browsers.sh --headless 31 | - sudo rm /usr/local/bin/docker-compose 32 | - curl -L https://github.com/docker/compose/releases/download/${DOCKER_COMPOSE_VERSION}/docker-compose-`uname -s`-`uname -m` > docker-compose 33 | - chmod +x docker-compose 34 | - sudo mv docker-compose /usr/local/bin 35 | 36 | script: 37 | - bash ./tests/start-test-compose.sh 38 | - pip install -U -r test-docker-requirements.txt 39 | - py.test --headless ./tests/test_live_crawl.py 40 | - docker-compose logs 41 | - bash ./tests/stop-test-compose.sh 42 | 43 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7.3 2 | 3 | WORKDIR /app 4 | 5 | COPY requirements.txt ./ 6 | 7 | RUN pip install --no-cache-dir -r requirements.txt 8 | 9 | COPY browsertrix ./browsertrix 10 | COPY static ./static 11 | 12 | CMD uvicorn --reload --host 0.0.0.0 --port 8000 browsertrix.api:app 13 | 14 | 15 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. 10 | 11 | "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. 12 | 13 | "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. 14 | 15 | "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. 16 | 17 | "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. 
18 | 19 | "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. 20 | 21 | "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). 22 | 23 | "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 24 | 25 | "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." 26 | 27 | "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 28 | 29 | 2. Grant of Copyright License. 30 | 31 | Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 32 | 33 | 3. Grant of Patent License. 34 | 35 | Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 36 | 37 | 4. Redistribution. 
38 | 39 | You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: 40 | 41 | You must give any other recipients of the Work or Derivative Works a copy of this License; and 42 | You must cause any modified files to carry prominent notices stating that You changed the files; and 43 | You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and 44 | If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. 45 | 46 | You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 47 | 48 | 5. Submission of Contributions. 49 | 50 | Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 51 | 52 | 6. Trademarks. 53 | 54 | This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 55 | 56 | 7. Disclaimer of Warranty. 57 | 58 | Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 59 | 60 | 8. Limitation of Liability. 
61 | 62 | In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 63 | 64 | 9. Accepting Warranty or Additional Liability. 65 | 66 | While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. 67 | 68 | END OF TERMS AND CONDITIONS 69 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Browsertrix 2 | Copyright 2018-2020 Webrecorder Software, Rhizome, and Contributors. 3 | 4 | Distributed under the Apache License 2.0. 5 | See LICENSE for details. 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | **Deprecated**: The Browsertrix system is being refactored into more modular individual components. The main component, [Browsertrix Crawler](https://github.com/webrecorder/browsertrix-crawler), will soon support most of the same crawling features via an integrated Docker image that can be deployed via the command line. The UI and scheduling components will soon be reimplemented as additional components. 2 | 3 | Please see Browsertrix Crawler for the latest development. 4 | 5 |
6 | 7 | 8 | 9 | [![Code Style](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/ambv/black) [![Build Status](https://travis-ci.org/webrecorder/browsertrix.svg?branch=master)](https://travis-ci.org/webrecorder/browsertrix) 10 | 11 | ## High Fidelity Browser-Based Crawling Automation 12 | 13 | Browsertrix is a brand new toolset from the Webrecorder project for automating browsers to perform complex scripted behaviors 14 | as well as crawl multiple pages. (The name was originally used for an older project with similar goals). 15 | 16 | Browsertrix is a system for orchestrating Docker-based Chrome browsers, crawling processes, behavior systems, web archiving capture and replay, and full-text search. 17 | 18 | It includes the following features: 19 | * Crawling via customizable YAML-based crawl spec 20 | * High-fidelity browser-based crawlers (controlled via [webrecorder/autobrowser](https://github.com/webrecorder/autobrowser)) 21 | * Execution of complex, domain-specific in-page behaviors (provided by [webrecorder/behaviors](https://github.com/webrecorder/behaviors)) 22 | * Capture or replay into designated [pywb](https://github.com/webrecorder/pywb) collections 23 | * Screenshot creation of each page (optional). 24 | * Text extraction for each page and full text search via Solr (optional). 25 | * Support for customized browser profiles to minimize capture of private information. 26 | 27 | ## Getting Started 28 | 29 | ### Installing Browsertrix 30 | 31 | Browsertrix is currently designed to run with Docker and Docker Compose. 32 | The Browsertrix CLI requires local Python 3.6+. 33 | 34 | To install, run: 35 | 36 | ```bash 37 | git clone https://github.com/webrecorder/browsertrix 38 | cd browsertrix 39 | python setup.py install 40 | ./install-browsers.sh 41 | docker-compose build 42 | docker-compose up -d 43 | ``` 44 | 45 | The `install-browsers.sh` script installs additional Docker images necessary for dynamic browser creation. 46 | The script can be used to update the images as well. 47 | 48 | ### Installing Browsertrix CLI 49 | 50 | The Browsertrix CLI is installed by running `python setup.py install` and includes full functionality for running crawls and creating browser profiles. 51 | 52 | Once installed, browsertrix commands are available via the `browsertrix` command. 53 | 54 | ## Creating a Crawl 55 | 56 | To create a crawl, first a crawl spec should be defined in a yaml file. 57 | An example spec, [sample-crawls/example.yaml](sample-crawls/example.yaml) might look as follows: 58 | 59 | ```yaml 60 | crawls: 61 | - name: example 62 | crawl_type: all-links 63 | num_browsers: 1 64 | 65 | coll: example 66 | mode: record 67 | 68 | seed_urls: 69 | - https://www.iana.org/ 70 | ``` 71 | 72 | Then, simply run `browsertrix crawl create sample-crawls/example.yaml --watch` 73 | 74 | The `--watch` param will also result in the crawling browser opening in a new browser window via vnc connection. 
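The CLI is a thin wrapper around the REST API served by `browsertrix/api.py`: `browsertrix crawl create` essentially POSTs the parsed crawl spec as JSON to the `/crawls` endpoint. As a rough sketch (not code from the repo, and assuming the API is reachable at its default address `http://localhost:8000`), the same crawl could be created directly with Python `requests`:

```python
# Minimal sketch of creating a crawl via the REST API directly.
# Assumes the docker-compose stack is running with the API on port 8000.
import requests

API = 'http://localhost:8000'

spec = {
    'name': 'example',
    'crawl_type': 'all-links',
    'num_browsers': 1,
    'coll': 'example',
    'mode': 'record',
    'seed_urls': ['https://www.iana.org/'],
}

res = requests.post(f'{API}/crawls', json=spec)
res.raise_for_status()
crawl = res.json()  # e.g. {'success': True, 'id': '...', 'status': 'running', 'browsers': [...]}
print(crawl['id'], crawl['status'])

# The crawl can then be inspected via GET /crawl/<id>
info = requests.get(f"{API}/crawl/{crawl['id']}").json()
print(info['status'], info['num_queue'], info['num_seen'])
```

Either way, the crawl is created and started on the API server in the same way.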
75 | 76 | If the crawl started successfully, the CLI output will be similar to: 77 | ``` 78 | Crawl Created and Started: cf30281efc7a 79 | Status: running 80 | Opening Browser 1 of 1 (CKVEMACNI6YBUKLQI6UKKBLB) for crawl cf30281efc7a 81 | ``` 82 | 83 | To view all running crawls, simply run `browsertrix crawl list`, which should result in output similar to: 84 | 85 | ``` 86 | CRAWL ID NAME STARTED DURATION STATUS CRAWL TYPE COLL MODE TO CRAWL PENDING SEEN BROWSERS TABS 87 | cf30281efc7a example 0:00:35 ago 0:00:10 running all-links example record 15 1 25 1 1 88 | ``` 89 | 90 | To get more detailed info on the crawl, run `browsertrix crawl info --urls <crawl_id>` (where `<crawl_id> = cf30281efc7a` in this example) 91 | 92 | To follow the crawl log in the console window, add the `--log` option (the log followed will be from the first browser). 93 | 94 | ### Crawling Options 95 | 96 | Browsertrix supports a number of options, with a key option being the `crawl_type`, which can be: 97 | 98 | - `single-page` -- crawl only the specified seed urls 99 | - `all-links` -- crawl the seed url(s) and all links discovered until max depth is exceeded 100 | - `same-domain` -- crawl the seed url(s) and all links discovered that are on the same domain or sub-domain (up to a depth of 100) 101 | - `custom` -- supports custom depth and scope rules! 102 | 103 | The first 3 options are designed to be a simple way to specify common options, and more may be added later. 104 | 105 | When using `custom`, the `crawl_depth` param can specify the crawl depth (hops) from each seed url. 106 | 107 | The `scopes` list can contain one or more [urlcanon MatchRules](https://github.com/iipc/urlcanon/blob/master/python/urlcanon/rules.py#L70) specifying urls that are in scope for the crawl. 108 | 109 | See [custom-scopes.yaml](sample-crawls/custom-scopes.yaml) for an example of how to use the custom option. 110 | 111 | 112 | The `coll` option specifies the pywb collection to use for crawling, and `mode` specifies `record` (default), `replay`, or 113 | `live` (direct live web connection). 114 | 115 | The `num_browsers` and `num_tabs` options select the total number of browsers and the number of tabs per browser to use for this crawl. 116 | 117 | The seed urls for the crawl should be provided in the `seed_urls` list. 118 | 119 | The `cache` option specifies caching options for a crawl, with available options: 120 | - `always` -- strict caching via `Cache-Control` on almost every resource to limit duplicate urls in a single browser session (default option when omitted) 121 | - `default` -- keep default caching for a page 122 | - `never` -- disables all caching for all urls. 123 | 124 | All example crawl configs demonstrating these options are available in: [sample-crawls](sample-crawls/) 125 | 126 | ### In-Page Behaviors 127 | 128 | For every page, Browsertrix runs a designated behavior before collecting outlinks, (optionally) taking screenshots, 129 | and moving on to the next page. 130 | 131 | The behaviors are served via a separate behavior API server. The current list of available behaviors is available at: 132 | https://github.com/webrecorder/behaviors/tree/master/behaviors 133 | 134 | The behaviors are built using a special library of behavior functions (preliminary docs available here: 135 | https://webrecorder.github.io/behaviors/) 136 | 137 | If no site-specific behavior is found, the default `autoscroll.js` behavior is used. 138 | 139 | The `behavior_max_time` crawl option specifies the maximum time a behavior can run (current default is 60 seconds).
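Putting several of these options together, a hypothetical `custom` crawl spec might look like the following (illustrative only; the field names follow the options described above, and the `scopes` entries are passed through to urlcanon `MatchRule`, so the `domain` key below is an assumed rule type):

```yaml
crawls:
  - name: custom-example
    crawl_type: custom
    crawl_depth: 2          # hops from each seed url
    num_browsers: 2
    num_tabs: 1

    coll: custom-example
    mode: record
    cache: always

    behavior_max_time: 120  # allow in-page behaviors to run longer

    seed_urls:
      - https://example.com/

    scopes:
      - domain: example.com  # assumed urlcanon MatchRule argument
```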
When crawling sites with infinite scroll, it is recommended to set `behavior_max_time` much higher. 141 | 142 | 143 | ### pywb Collections and Access 144 | 145 | All data crawled is placed in the `./webarchive/collections/` directory, which corresponds to the [standard pywb directory structure conventions](https://pywb.readthedocs.io/en/latest/manual/configuring.html#directory-structure), e.g. a collection `test` would be found under `./webarchive/collections/test`. 146 | 147 | Collections are created automatically on first use and can also be managed via `wb-manager` with `webarchive` as the working directory. 148 | 149 | The running pywb instance can also be accessed via `http://localhost:8180/` 150 | 151 | ### Replay Crawling and Screenshots 152 | 153 | Currently, screenshot creation is automatically enabled when crawling in record mode and screenshots are added automatically 154 | to the same collection. 155 | 156 | Browsertrix supports crawling in replay mode, over an existing collection, which may be useful for QA processes, 157 | especially when combined with screenshot creation. 158 | 159 | By setting the `mode` and `screenshot_coll` properties for each crawl, it is possible to run Browsertrix over replay and generate screenshots into a different collection, which may be used for QA comparison. 160 | 161 | Additional screenshot options are to be added soon. (Currently, the screenshot is taken after the behavior is run, but this will likely change). 162 | 163 | Crawl options can also be overridden via the command line. 164 | 165 | For example, given a crawl spec `./my_crawl.yaml`, one could first capture with: 166 | ``` 167 | browsertrix crawl create ./my_crawl.yaml --screenshot_coll screenshots-capture 168 | ``` 169 | 170 | and then run: 171 | ``` 172 | browsertrix crawl create ./my_crawl.yaml --mode replay --screenshot_coll screenshots-qa 173 | ``` 174 | 175 | By default, screenshots are saved with the `urn:screenshot:` prefix. 176 | Based on the above crawls, one could then query all capture and QA screenshots in pywb via: 177 | ``` 178 | http://localhost:8180/screenshots-capture/*/urn:screenshot:* 179 | http://localhost:8180/screenshots-qa/*/urn:screenshot:* 180 | ``` 181 | 182 | Sample record and replay configs, [social-media.yaml](sample-crawls/social-media.yaml) and [social-media-replay.yaml](sample-crawls/social-media-replay.yaml), are also available. 183 | 184 | (Note: The screenshot functionality will likely change and additional options will be added) 185 | 186 | ### Other Crawl operations 187 | 188 | Other crawl operations include: 189 | * `browsertrix crawl stop` for stopping a crawl 190 | * `browsertrix crawl logs` for printing and following logs for one or all crawlers 191 | * `browsertrix crawl watch <crawl_id>` for attaching and watching all the browsers in a given crawl. 192 | * `browsertrix crawl remove` for removing a crawl 193 | * `browsertrix crawl remove-all` for stopping and removing all crawls. 194 | 195 | See `browsertrix crawl -h` for a complete reference of available commands. 196 | 197 | ## Full Text Search 198 | 199 | Browsertrix now includes a prototype integration with Apache Solr. Text is extracted for each page, after taking a screenshot, and ingested into Solr. The extracted text (as provided via raw DOM text nodes) from all frames, 200 | as well as the title and url, are indexed in Solr using the default schema. (This is likely to evolve as well).
201 | 202 | The search is available for each collection via the pywb replay interface at: `http://localhost:8180/` 203 | 204 | The replay interface currently includes a list of pages, screenshots (if enabled), and the ability to search the collection. 205 | 206 | (Note: Solr data is stored in the `./solr` volume, and may require a permission adjustment on certain systems via `chmod a+w ./solr`) 207 | 208 | 209 | ## Browser Profiles 210 | 211 | It is often useful to prepare a browser, for example by logging into social media or other password-protected sites, 212 | in order to capture content that is not generally accessible. However, doing so during a crawl is tedious, and worse, 213 | may result in passwords being recorded to WARC. 214 | 215 | Browsertrix addresses this problem with support for browser profiles. A profile can be created by running a base 216 | Chrome browser, performing custom actions, and then 'saving' the running browser into a new 'profile' image. 217 | 218 | To create a profile: 219 | 220 | 1. Run: 221 | ```browsertrix profile create``` 222 | 223 | 2. This should start a new remote browser (Chrome 73 by default) and open it in a new window. You can now interact with the browser and log in to any sites as needed. 224 | 225 | 3. The command line should show the following message and a prompt to enter the profile name, e.g. `logged-in` 226 | 227 | ``` 228 | A new browser window should have been opened 229 | You can use the browser to log-in to accounts or otherwise prepare the browser profile 230 | (The content will not be recorded to WARC) 231 | When done, please enter a new name to save the browser profile: 232 | ``` 233 | 234 | 4. Once the name is entered the profile is saved, and you can continue browsing to make a new profile, or select 'no' and close the browser. 235 | 236 | If everything worked, running ```browsertrix profile list``` should show: 237 | 238 | ``` 239 | PROFILE BASE BROWSER 240 | logged-in chrome:73 241 | ``` 242 | 243 | 5. To use the profile, set the `profile` property in the crawl spec YAML, or simply include `--profile` in the command line: 244 | 245 | ``` 246 | browsertrix crawl create ./my_crawl.yaml --profile logged-in 247 | ``` 248 | 249 | The browsers used for the crawl will be a copy of the browser saved during profile creation. 250 | 251 | `browsertrix profile remove` can be used to remove an unneeded profile. 252 | 253 | Note: The profile functionality is brand new and subject to change. At present, it is tied to the particular browser Docker image used and extends that image. The system may switch to Docker volumes in the future. 254 | 255 | ## Testing 256 | 257 | Browsertrix includes several test suites, which are also run automatically via Travis CI. 258 | 259 | ### Docker Integration Tests 260 | 261 | Browsertrix includes a Docker-based test suite that runs crawls over content replayed from a WARC 262 | (no live web content is accessed). This test suite requires Python 3.6+. 263 | 264 | To run this test suite, run: 265 | 266 | ```bash 267 | bash ./tests/start-test-compose.sh 268 | pip install -U -r test-docker-requirements.txt 269 | py.test --headless ./tests/test_live_crawl.py 270 | bash ./tests/stop-test-compose.sh 271 | ``` 272 | 273 | The test suite does not perform any live crawling, but runs all the crawls in [tests/crawl_tests.yaml](tests/crawl_tests.yaml) in replay mode using an existing test WARC downloaded from S3.
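For quick manual checks against a running stack, the same API endpoints that back the CLI can also be exercised directly; a minimal, hypothetical pytest-style sketch (not part of the repo's test suite, and assuming the compose stack is up on the default port) might look like:

```python
# Hypothetical smoke test -- not part of the repo's suite.
# Assumes the docker-compose stack is running with the API on port 8000.
import requests

API = 'http://localhost:8000'


def test_list_crawls():
    # GET /crawls is the same endpoint used by `browsertrix crawl list`
    res = requests.get(f'{API}/crawls')
    assert res.status_code == 200
    assert 'crawls' in res.json()
```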
274 | 275 | ### Local API Tests 276 | 277 | To install and run local tests of the API (without Docker), run the following: 278 | (Python 3.7+ is required) 279 | 280 | ```bash 281 | pip install -U -r requirements.txt -r test-local-requirements.txt 282 | py.test ./tests/test_api.py 283 | ``` 284 | 285 | ## UI 286 | 287 | Browsertrix also includes a UI (still under development) which will 288 | have the same features as the CLI. 289 | 290 | To access the browsertrix UI, load `http://localhost:8000/` 291 | 292 | The frontend React app is found in `./frontend` and can be started via: 293 | 294 | ``` 295 | yarn run develop 296 | ``` 297 | 298 | (The develop server is started at `http://localhost:8001` to avoid conflict with production) 299 | 300 | To build the production bundle, run: 301 | ``` 302 | yarn run build-prod 303 | ``` 304 | 305 | This should update the production server running at `http://localhost:8000` 306 | -------------------------------------------------------------------------------- /browsertrix/__init__.py: -------------------------------------------------------------------------------- 1 | from better_exceptions import hook 2 | 3 | __version__ = '0.1.0' 4 | 5 | hook() 6 | -------------------------------------------------------------------------------- /browsertrix/api.py: -------------------------------------------------------------------------------- 1 | from fastapi import APIRouter, FastAPI 2 | from starlette.middleware.cors import ALL_METHODS, CORSMiddleware 3 | from starlette.responses import FileResponse, UJSONResponse 4 | from starlette.staticfiles import StaticFiles 5 | 6 | from .crawl import CrawlManager 7 | from .schema import * 8 | 9 | app = FastAPI(debug=True) 10 | app.add_middleware( 11 | CORSMiddleware, allow_origins=["*"], allow_methods=ALL_METHODS, allow_headers=["*"] 12 | ) 13 | crawl_man = CrawlManager() 14 | crawl_router = APIRouter() 15 | 16 | 17 | # ============================================================================ 18 | @app.post('/crawls', response_model=CreateStartResponse, response_class=UJSONResponse) 19 | async def create_crawl(new_crawl: CreateCrawlRequest): 20 | return await crawl_man.create_new(new_crawl) 21 | 22 | 23 | @app.get('/crawls', response_model=CrawlInfosResponse, response_class=UJSONResponse) 24 | async def get_all_crawls(): 25 | return await crawl_man.get_all_crawls() 26 | 27 | 28 | @crawl_router.put( 29 | '/{crawl_id}/urls', 30 | response_model=OperationSuccessResponse, 31 | response_class=UJSONResponse, 32 | ) 33 | async def queue_urls(crawl_id: str, url_list: QueueUrlsRequest): 34 | return await crawl_man.queue_crawl_urls(crawl_id, url_list.urls) 35 | 36 | 37 | @crawl_router.get( 38 | '/{crawl_id}', response_model=CrawlInfoResponse, response_class=UJSONResponse 39 | ) 40 | async def get_crawl(crawl_id: str): 41 | return await crawl_man.get_crawl_info(crawl_id) 42 | 43 | 44 | @crawl_router.get( 45 | '/{crawl_id}/urls', 46 | response_model=CrawlInfoUrlsResponse, 47 | response_class=UJSONResponse, 48 | ) 49 | async def get_crawl_urls(crawl_id: str): 50 | return await crawl_man.get_crawl_urls(crawl_id) 51 | 52 | 53 | @crawl_router.get( 54 | '/{crawl_id}/info', 55 | response_model=FullCrawlInfoResponse, 56 | response_class=UJSONResponse, 57 | ) 58 | async def get_full_crawl_info(crawl_id: str): 59 | return await crawl_man.get_full_crawl_info(crawl_id) 60 | 61 | 62 | @crawl_router.post( 63 | '/{crawl_id}/start', 64 | response_model=CreateStartResponse, 65 | response_class=UJSONResponse, 66 | ) 67 | async def 
start_crawl(crawl_id: str): 68 | return await crawl_man.start_crawl(crawl_id) 69 | 70 | 71 | @crawl_router.post( 72 | '/{crawl_id}/stop', 73 | response_model=OperationSuccessResponse, 74 | response_class=UJSONResponse, 75 | ) 76 | async def stop_crawl(crawl_id: str): 77 | return await crawl_man.stop_crawl(crawl_id) 78 | 79 | 80 | @crawl_router.get( 81 | '/{crawl_id}/done', response_model=CrawlDoneResponse, response_class=UJSONResponse 82 | ) 83 | async def is_done_crawl(crawl_id: str): 84 | return await crawl_man.is_crawl_done(crawl_id) 85 | 86 | 87 | @crawl_router.delete( 88 | '/{crawl_id}', response_model=OperationSuccessResponse, response_class=UJSONResponse 89 | ) 90 | async def delete_crawl(crawl_id: str): 91 | return await crawl_man.delete_crawl(crawl_id) 92 | 93 | 94 | @app.route('/') 95 | def ui(*args, **kwargs): 96 | return FileResponse('static/index.html') 97 | 98 | 99 | app.include_router(crawl_router, prefix='/crawl', tags=['crawl']) 100 | app.mount('/static', StaticFiles(directory='static', check_dir=True), 'static') 101 | app.add_event_handler('startup', crawl_man.startup) 102 | app.add_event_handler('shutdown', crawl_man.shutdown) 103 | -------------------------------------------------------------------------------- /browsertrix/schema.py: -------------------------------------------------------------------------------- 1 | import math 2 | from enum import Enum 3 | from typing import Any, Dict, List, Optional, Set, Union 4 | 5 | from pydantic import BaseModel, Schema, UrlStr 6 | 7 | __all__ = [ 8 | 'BrowserCookie', 9 | 'BrowserOverrides', 10 | 'CacheMode', 11 | 'CaptureMode', 12 | 'CookieSameSite', 13 | 'CrawlDoneResponse', 14 | 'CrawlInfo', 15 | 'CrawlInfoResponse', 16 | 'CrawlInfoUrlsResponse', 17 | 'CrawlInfosResponse', 18 | 'CrawlType', 19 | 'CreateCrawlRequest', 20 | 'CreateStartResponse', 21 | 'EmulatedDevice', 22 | 'EmulatedGeoLocation', 23 | 'FullCrawlInfoResponse', 24 | 'OperationSuccessResponse', 25 | 'QueueUrlsRequest', 26 | ] 27 | 28 | # ============================================================================ 29 | OptionalList = Optional[List[str]] 30 | OptionalSet = Optional[Set[str]] 31 | Number = Union[int, float] 32 | 33 | UrlStr.max_length = math.inf 34 | UrlStr.relative = True 35 | 36 | 37 | class CrawlType(str, Enum): 38 | SINGLE_PAGE = 'single-page' 39 | ALL_LINKS = 'all-links' 40 | SAME_DOMAIN = 'same-domain' 41 | CUSTOM = 'custom' 42 | 43 | 44 | class CaptureMode(str, Enum): 45 | RECORD = 'record' 46 | REPLAY = 'replay' 47 | LIVE = 'live' 48 | 49 | 50 | class CacheMode(str, Enum): 51 | ALWAYS = 'always' 52 | NEVER = 'never' 53 | DEFAULT = 'default' 54 | 55 | 56 | class CookieSameSite(str, Enum): 57 | STRICT = 'Strict' 58 | LAX = 'LAX' 59 | EXTENDED = 'Extended' 60 | NONE = 'None' 61 | 62 | 63 | class EmulatedDevice(BaseModel): 64 | width: Number 65 | height: Number 66 | deviceScaleFactor: Optional[Number] = None 67 | maxTouchPoints: Optional[Number] = None 68 | isMobile: Optional[bool] = None 69 | hasTouch: Optional[bool] = None 70 | isLandscape: Optional[bool] = None 71 | 72 | 73 | class EmulatedGeoLocation(BaseModel): 74 | latitude: Number 75 | longitude: Number 76 | 77 | 78 | class BrowserCookie(BaseModel): 79 | name: str 80 | value: str 81 | url: Optional[UrlStr] = None 82 | domain: Optional[str] = None 83 | path: Optional[str] = None 84 | secure: Optional[bool] = None 85 | httpOnly: Optional[bool] = None 86 | expires: Optional[Number] = None 87 | sameSite: Optional[CookieSameSite] = None 88 | 89 | 90 | class BrowserOverrides(BaseModel): 91 | 
user_agent: Optional[str] = None 92 | accept_language: Optional[str] = None 93 | navigator_platform: Optional[str] = None 94 | extra_headers: Optional[Dict[str, str]] = None 95 | cookies: Optional[List[BrowserCookie]] = None 96 | geo_location: Optional[EmulatedGeoLocation] = None 97 | device: Optional[EmulatedDevice] = None 98 | 99 | 100 | class BaseCreateCrawl(BaseModel): 101 | crawl_type: CrawlType = Schema( 102 | CrawlType.SINGLE_PAGE, description='What type of crawl should be launched' 103 | ) 104 | crawl_depth: Optional[int] = None 105 | num_browsers: int = Schema( 106 | 2, description='How many browsers should be used for the crawl' 107 | ) 108 | num_tabs: int = Schema(1, description='How many tabs should be used for the crawl') 109 | name: Optional[str] = Schema('', description='User friendly name for the crawl') 110 | coll: Optional[str] = Schema('live', description='Default Collection') 111 | 112 | mode: CaptureMode = Schema(CaptureMode.RECORD, description='Default Mode') 113 | 114 | screenshot_coll: Optional[str] = Schema( 115 | '', description='Collection to store screenshots, if any' 116 | ) 117 | 118 | text_coll: Optional[str] = Schema( 119 | '', description='Collection to store full-text indexes, if any' 120 | ) 121 | 122 | 123 | class CreateCrawlRequest(BaseCreateCrawl): 124 | class Config: 125 | extra = 'forbid' 126 | 127 | seed_urls: List[UrlStr] = [] 128 | scopes: List[Dict[Any, Any]] = [] 129 | 130 | cache: CacheMode = CacheMode.ALWAYS 131 | 132 | browser: Optional[str] = 'chrome:73' 133 | user_params: Dict[Any, Any] = dict() 134 | 135 | profile: Optional[str] = None 136 | 137 | ignore_extra: Optional[Dict[Any, Any]] = None 138 | 139 | behavior_max_time: int = 0 140 | headless: bool = False 141 | screenshot_target_uri: Optional[str] = None 142 | 143 | start: bool = True 144 | browser_overrides: Optional[BrowserOverrides] = None 145 | 146 | 147 | class OperationSuccessResponse(BaseModel): 148 | success: bool 149 | 150 | 151 | class CreateStartResponse(OperationSuccessResponse): 152 | id: str 153 | status: str = 'new' 154 | browsers: Optional[List[str]] 155 | 156 | 157 | class CrawlInfoResponse(BaseCreateCrawl): 158 | id: str 159 | status: str = 'new' 160 | start_time: int = 0 161 | finish_time: int = 0 162 | browsers: OptionalList 163 | tabs_done: List[Dict[Any, Any]] 164 | headless: bool = False 165 | num_queue: int = 0 166 | num_seen: int = 0 167 | num_pending: int = 0 168 | 169 | 170 | class CrawlInfosResponse(BaseModel): 171 | crawls: List[CrawlInfoResponse] 172 | 173 | 174 | class CrawlInfo(BaseModel): 175 | """ Model for validate a:{crawl_id}:info key 176 | All fields should be set in the model 177 | """ 178 | 179 | id: str 180 | name: str 181 | coll: str 182 | screenshot_coll: str 183 | text_coll: str 184 | mode: str 185 | status: str 186 | crawl_type: str 187 | crawl_depth: int 188 | num_browsers: int 189 | num_tabs: int 190 | start_time: int = 0 191 | finish_time: int = 0 192 | headless: bool = False 193 | browser_overrides: Optional[BrowserOverrides] = None 194 | 195 | 196 | class CrawlInfoUrlsResponse(BaseModel): 197 | scopes: List[Dict[Any, Any]] 198 | queue: List[Dict[Any, Any]] 199 | pending: OptionalList 200 | seen: OptionalSet 201 | 202 | 203 | class FullCrawlInfoResponse(CrawlInfo, CrawlInfoUrlsResponse): 204 | success: bool 205 | 206 | 207 | class QueueUrlsRequest(BaseModel): 208 | urls: List[str] 209 | 210 | 211 | class CrawlDoneResponse(BaseModel): 212 | done: bool 213 | -------------------------------------------------------------------------------- 
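The models above define the crawl API's request and response shapes; since `CreateCrawlRequest.Config` sets `extra = 'forbid'`, misspelled keys in a crawl spec should be rejected at validation time rather than silently ignored. A small hypothetical sketch (not part of the repo) of validating a YAML spec against these models before submitting it:

```python
# Hypothetical helper -- not part of the repo.
# Validates each crawl spec in a YAML file against the CreateCrawlRequest model.
import yaml

from browsertrix.schema import CreateCrawlRequest

with open('sample-crawls/example.yaml') as fh:
    root = yaml.safe_load(fh)

for spec in root['crawls']:
    # extra = 'forbid' in CreateCrawlRequest.Config means unknown keys
    # should raise a validation error here instead of being dropped
    crawl = CreateCrawlRequest(**spec)
    print(crawl.name, crawl.crawl_type, list(crawl.seed_urls))
```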
/browsertrix/utils.py: -------------------------------------------------------------------------------- 1 | from asyncio import AbstractEventLoop 2 | from os import environ 3 | from typing import Any, Dict, Optional, Type, Union 4 | from urllib.parse import urlsplit 5 | 6 | from aioredis import Redis, create_redis 7 | from ujson import loads as ujson_loads 8 | 9 | __all__ = ['env', 'extract_domain', 'init_redis'] 10 | 11 | 12 | async def init_redis(redis_url: str, loop: AbstractEventLoop) -> Redis: 13 | return await create_redis(redis_url, encoding='utf-8', loop=loop) 14 | 15 | 16 | def env( 17 | key: str, 18 | type_: Type[Union[str, bool, int, dict, float]] = str, 19 | default: Optional[Any] = None, 20 | ) -> Union[str, int, bool, float, Dict]: 21 | """Returns the value of the supplied env key name converting 22 | the env key's value to the specified type. 23 | 24 | If the env key does not exist the default value is returned. 25 | 26 | Boolean values for env keys are expected to be: 27 | - true: 1, true, yes, y, ok, on 28 | - false: 0, false, no, n, nok, off 29 | 30 | :param key: The name of the environment variable 31 | :param type_: What type should the the env key's value be converted to, 32 | defaults to str 33 | :param default: The default value of the env key, defaults to None 34 | :return: The value of the env key or the supplied default 35 | """ 36 | if key not in environ: 37 | return default 38 | 39 | val = environ[key] 40 | 41 | if type_ == str: 42 | return val 43 | elif type_ == bool: 44 | if val.lower() in ['1', 'true', 'yes', 'y', 'ok', 'on']: 45 | return True 46 | if val.lower() in ['0', 'false', 'no', 'n', 'nok', 'off']: 47 | return False 48 | raise ValueError( 49 | f'Invalid environment variable "{key}" (expected a boolean): "{val}"' 50 | ) 51 | elif type_ == int: 52 | try: 53 | return int(val) 54 | except ValueError: 55 | raise ValueError( 56 | f'Invalid environment variable "{key}" (expected a integer): "{val}"' 57 | ) 58 | elif type_ == float: 59 | try: 60 | return float(val) 61 | except ValueError: 62 | raise ValueError( 63 | f'Invalid environment variable "{key}" (expected a float): "{val}"' 64 | ) 65 | elif type_ == dict: 66 | return ujson_loads(val) 67 | 68 | 69 | def extract_domain(url: str) -> str: 70 | """Extracts and returns the domain, including the suffix, 71 | of the supplied URL 72 | 73 | :param url: The url to have its domain extracted from 74 | :return: The extracted domain 75 | """ 76 | extracted = urlsplit(url).netloc 77 | return extracted.replace('www.', '') 78 | -------------------------------------------------------------------------------- /browsertrix_cli/basecli.py: -------------------------------------------------------------------------------- 1 | import click 2 | import sys 3 | 4 | import requests 5 | 6 | 7 | # ============================================================================ 8 | class Settings: 9 | quiet_mode = False 10 | sesh = None 11 | 12 | server_prefix = None 13 | shepherd_prefix = None 14 | view_browsers_prefix = None 15 | 16 | 17 | settings = Settings() 18 | 19 | 20 | # ============================================================================ 21 | @click.group() 22 | @click.option( 23 | '--server', 24 | metavar='', 25 | type=str, 26 | default='http://localhost:8000', 27 | help='The Browsertrix server url', 28 | ) 29 | @click.option( 30 | '--shepherd', 31 | metavar='', 32 | type=str, 33 | default='http://localhost:9020', 34 | help='The Shepherd server url', 35 | ) 36 | @click.option( 37 | '-q', 38 | '--quiet', 39 | 
is_flag=True, 40 | default=False, 41 | type=bool, 42 | help='quiet mode: print only crawl ids if success', 43 | ) 44 | def cli(server, quiet, shepherd): 45 | settings.server_prefix = server 46 | 47 | settings.shepherd_prefix = shepherd 48 | settings.view_browsers_prefix = shepherd + '/attach/' 49 | 50 | settings.sesh = requests.session() 51 | 52 | settings.quiet_mode = quiet 53 | 54 | 55 | # ============================================================================ 56 | def is_quiet(): 57 | return settings.quiet_mode 58 | 59 | 60 | # ============================================================================ 61 | def ensure_success(res, exit=True): 62 | """ Ensure API response is successful 63 | print error and exit if not 64 | 65 | :param res: Response from requests 66 | :param exit: Exit on any error 67 | :return: parsed JSON response as dict 68 | """ 69 | if res.status_code == 200: 70 | json = res.json() 71 | return json 72 | 73 | if not is_quiet(): 74 | print('Error response from API server') 75 | print('{0}: {1}'.format(res.status_code, res.text)) 76 | 77 | if exit: 78 | sys.exit(1) 79 | 80 | 81 | # ============================================================================ 82 | def conn_error_exit(url): 83 | if not is_quiet(): 84 | print( 85 | 'Unable to connect to {0}. Is Browsertrix container running in Docker?'.format( 86 | url 87 | ) 88 | ) 89 | sys.exit(2) 90 | 91 | 92 | # ============================================================================ 93 | def sesh_get(url, prefix=None): 94 | url = (prefix or settings.server_prefix) + url 95 | try: 96 | res = settings.sesh.get(url) 97 | return ensure_success(res) 98 | except requests.exceptions.ConnectionError: 99 | conn_error_exit(url) 100 | 101 | 102 | # ============================================================================ 103 | def sesh_post(url, json=None, prefix=None): 104 | url = (prefix or settings.server_prefix) + url 105 | try: 106 | res = settings.sesh.post(url, json=json) 107 | return ensure_success(res) 108 | except requests.exceptions.ConnectionError: 109 | conn_error_exit(url) 110 | 111 | 112 | # ============================================================================ 113 | def sesh_delete(url, prefix=None): 114 | url = (prefix or settings.server_prefix) + url 115 | try: 116 | res = settings.sesh.delete(url) 117 | return ensure_success(res, exit=False) 118 | except requests.exceptions.ConnectionError: 119 | conn_error_exit(url) 120 | -------------------------------------------------------------------------------- /browsertrix_cli/crawl.py: -------------------------------------------------------------------------------- 1 | import click 2 | import datetime 3 | import docker 4 | import sys 5 | import time 6 | import yaml 7 | import webbrowser 8 | 9 | from collections import defaultdict 10 | 11 | 12 | from browsertrix_cli.basecli import ( 13 | cli, 14 | is_quiet, 15 | sesh_get, 16 | sesh_post, 17 | sesh_delete, 18 | settings, 19 | ) 20 | from browsertrix_cli.profile import get_profile_image 21 | 22 | 23 | COLUMNS = [ 24 | ('id', 'CRAWL ID', 12), 25 | ('name', 'NAME', 12), 26 | ('start_time', 'STARTED', 12), 27 | ('finish_time', 'DURATION', 12), 28 | ('status', 'STATUS', 7), 29 | ('crawl_type', 'CRAWL TYPE', 12), 30 | ('coll', 'COLL', 16), 31 | ('mode', 'MODE', 8), 32 | ('num_queue', 'TO CRAWL', 8), 33 | ('num_pending', 'PENDING', 8), 34 | ('num_seen', 'SEEN', 8), 35 | ('num_browsers', 'BROWSERS', 9), 36 | ('num_tabs', 'TABS', 3), 37 | ] 38 | 39 | 40 | # 
============================================================================ 41 | @cli.group(help='Commands for working with crawls') 42 | def crawl(): 43 | pass 44 | 45 | 46 | # ============================================================================ 47 | def format_duration(start_time, finish_time): 48 | """ Format duration of crawl 49 | 50 | :param start_time: start time of crawl 51 | :param finish_time: finish time of crawl 52 | :return: string text for time elapsed since timestr 53 | """ 54 | try: 55 | if start_time == 0: 56 | return '-' 57 | 58 | if not finish_time: 59 | finish = datetime.datetime.now() 60 | else: 61 | finish = datetime.datetime.fromtimestamp(int(finish_time)) 62 | 63 | start = datetime.datetime.fromtimestamp(int(start_time)) 64 | elapsed = finish - start 65 | return str(elapsed).split('.', 1)[0] 66 | except Exception: 67 | return start_time 68 | 69 | 70 | # ============================================================================ 71 | def print_container_log( 72 | docker_api, reqid, name='autobrowser-', follow=False, wait=False 73 | ): 74 | 75 | full_name = name + reqid 76 | while True: 77 | try: 78 | container = docker_api.containers.get(full_name) 79 | break 80 | except docker.errors.NotFound: 81 | if not wait: 82 | return False 83 | 84 | print('Waiting for Logs...') 85 | time.sleep(0.25) 86 | continue 87 | 88 | print('---- Logs for Crawl {0}: {1} ----'.format(reqid, full_name)) 89 | res = container.logs(follow=follow, stream=True) 90 | for line in res: 91 | sys.stdout.write(line.decode('utf-8')) 92 | 93 | print('-----------------------------------') 94 | print('') 95 | print('') 96 | 97 | return True 98 | 99 | 100 | # ============================================================================ 101 | def print_logs(browsers, follow=False, wait=False, all_containers=False): 102 | docker_api = docker.from_env(version='auto') 103 | 104 | if follow is None: 105 | follow = False 106 | 107 | for reqid in browsers: 108 | if all_containers: 109 | print_container_log( 110 | docker_api, reqid, wait=False, follow=False, name='browser-' 111 | ) 112 | 113 | print_container_log( 114 | docker_api, reqid, wait=False, follow=False, name='xserver-' 115 | ) 116 | 117 | print_container_log(docker_api, reqid, wait=wait, follow=follow) 118 | 119 | 120 | # ============================================================================ 121 | def open_browsers(browsers, crawl_id, tabs_done=None, num_tabs=-1): 122 | count = 1 123 | for reqid in browsers: 124 | skip = False 125 | if not tabs_done or tabs_done.get(reqid) != num_tabs: 126 | msg = 'Opening Browser {0} of {1} ({2}) for crawl {3}' 127 | else: 128 | msg = 'Skipping Finished Browser {0} of {1}, ({2}) for crawl {3}' 129 | skip = True 130 | 131 | if not is_quiet(): 132 | print(msg.format(count, len(browsers), reqid, crawl_id)) 133 | 134 | if not skip: 135 | webbrowser.open(settings.view_browsers_prefix + reqid) 136 | count += 1 137 | 138 | 139 | # ============================================================================ 140 | @crawl.command(name='list', help='List all crawls') 141 | def list_crawls(): 142 | """ List all available crawls 143 | """ 144 | res = sesh_get('/crawls') 145 | 146 | sorted_list = sorted(res['crawls'], key=lambda x: x['start_time'], reverse=True) 147 | 148 | if is_quiet(): 149 | for crawl in sorted_list: 150 | print(crawl['id']) 151 | 152 | return 153 | 154 | format_str = '{value: <{size}} ' 155 | 156 | for _, text, size in COLUMNS: 157 | sys.stdout.write(format_str.format(value=text, size=size)) 158 | 
print() 159 | 160 | for crawl in sorted_list: 161 | for field, _, size in COLUMNS: 162 | value = crawl[field] 163 | if field == 'start_time': 164 | value = format_duration(value, None) + ' ago' 165 | elif field == 'finish_time': 166 | value = format_duration(crawl['start_time'], value) 167 | 168 | sys.stdout.write(format_str.format(value=value, size=size)) 169 | print() 170 | print() 171 | 172 | 173 | # ============================================================================ 174 | @crawl.command( 175 | name='create', help='Create (and optionally start) new crawl from yaml crawl spec' 176 | ) 177 | @click.option( 178 | '--start/--no-start', 179 | default=True, 180 | help="Start/Don't start crawl immediately after creation", 181 | ) 182 | @click.option( 183 | '--browser', 184 | type=str, 185 | default=None, 186 | help='Browser Docker image to use for crawling, (overrides setting in spec)', 187 | ) 188 | @click.option( 189 | '--profile', 190 | type=str, 191 | default=None, 192 | help='Browser Profile Docker image to use for crawling (overrides "browser" option)', 193 | ) 194 | @click.option( 195 | '--coll', 196 | type=str, 197 | default=None, 198 | help='Set the collection (overrides setting in spec)', 199 | ) 200 | @click.option( 201 | '--mode', 202 | type=str, 203 | default=None, 204 | help='Set the capture mode (overrides setting in spec)', 205 | ) 206 | @click.option( 207 | '--screenshot_coll', 208 | type=str, 209 | default=None, 210 | help='Set the collection to save screenshots (overrides setting in spec)', 211 | ) 212 | @click.option( 213 | '--headless', 214 | type=bool, 215 | is_flag=True, 216 | help='Use headless mode. Browsers can not be opened for watching the crawl', 217 | ) 218 | @click.option( 219 | '--behavior-time', 220 | default=None, 221 | type=int, 222 | help='Max duration to run each in-page behavior', 223 | ) 224 | @click.option( 225 | '--watch', 226 | is_flag=True, 227 | default=False, 228 | type=bool, 229 | help='Watch all started browsers in a local browser (only if starting crawl)', 230 | ) 231 | @click.option( 232 | '--log', 233 | is_flag=True, 234 | default=False, 235 | type=bool, 236 | help='Tail the log for the browser crawler', 237 | ) 238 | @click.argument('crawl_spec_file', type=click.File('rt')) 239 | def create_crawl( 240 | crawl_spec_file, 241 | start, 242 | browser, 243 | profile, 244 | coll, 245 | mode, 246 | screenshot_coll, 247 | headless, 248 | behavior_time, 249 | watch, 250 | log, 251 | ): 252 | """ Create a new crawl! 253 | 254 | :param crawl_spec_file: YAML file with one or more crawls in 'crawls' key 255 | :param start: If true, start crawl immediately after creation 256 | :param browser: Browser Docker image to use for crawling (overrides setting in spec) 257 | :param profile: Browser Profile Docker image to use for crawling (overrides "browser" setting) 258 | :param coll: Set the collection (overrides setting in spec) 259 | :param mode: Set the capture mode (overrides setting in spec) 260 | :param screenshot_coll: Set the collection to save screenshots (overrides setting in spec) 261 | :param headless: Use headless mode. 
Browsers cannot be opened for watching the crawl 262 | :param behavior_time: Max duration (in seconds) to run each in-page behavior 263 | :param watch: Watch all started browsers in a local browser (only if starting crawl) :param log: Tail the log for the browser crawler 264 | 265 | """ 266 | root = yaml.load(crawl_spec_file, Loader=yaml.Loader) 267 | 268 | for crawl_spec in root['crawls']: 269 | if not start: 270 | msg = 'Created' 271 | else: 272 | msg = 'Created and Started' 273 | 274 | if headless:  # --headless is a flag and defaults to False, not None 275 | crawl_spec['headless'] = headless 276 | 277 | if behavior_time is not None: 278 | crawl_spec['behavior_time'] = behavior_time 279 | 280 | if profile is not None: 281 | crawl_spec['profile'] = profile 282 | 283 | if 'profile' in crawl_spec: 284 | browser = get_profile_image(crawl_spec.get('profile', ''))  # profile image overrides --browser 285 | 286 | if browser is not None: 287 | crawl_spec['browser'] = browser 288 | 289 | if coll is not None: 290 | crawl_spec['coll'] = coll 291 | 292 | if mode is not None: 293 | crawl_spec['mode'] = mode 294 | 295 | if screenshot_coll is not None: 296 | crawl_spec['screenshot_coll'] = screenshot_coll 297 | 298 | if not is_quiet(): 299 | print('Creating New Crawl, Please Wait...') 300 | 301 | res = sesh_post('/crawls', json=crawl_spec) 302 | 303 | if is_quiet(): 304 | print(res['id']) 305 | else: 306 | print('Crawl {0}: {1}'.format(msg, res['id'])) 307 | print('Status: {0}'.format(res['status'])) 308 | 309 | if watch: 310 | if not start: 311 | if not is_quiet(): 312 | print("Can't watch, crawl not started") 313 | 314 | elif headless: 315 | if not is_quiet(): 316 | print("Can't watch, crawl is running in headless mode") 317 | 318 | else: 319 | open_browsers(res['browsers'], res['id']) 320 | 321 | if log: 322 | print_logs(res['browsers'], follow=True, wait=True) 323 | 324 | 325 | # ============================================================================ 326 | @crawl.command(name='start', help='Start an existing crawl') 327 | @click.argument('crawl_id', nargs=-1) 328 | def start_crawl(crawl_id): 329 | """ Start an existing crawl 330 | 331 | :param crawl_id: list of crawl ids to start 332 | """ 333 | for id_ in crawl_id: 334 | res = sesh_post('/crawl/{0}/start'.format(id_)) 335 | 336 | if is_quiet(): 337 | print(res['id']) 338 | else: 339 | print('Started Crawl: {0}'.format(res['id'])) 340 | 341 | 342 | # ============================================================================ 343 | @crawl.command(name='info', help='Get info on an existing crawl(s)') 344 | @click.argument('crawl_id', nargs=-1) 345 | @click.option( 346 | '--urls/--no-urls', 347 | default=False, 348 | help='Get detailed info on crawl, listing all urls', 349 | ) 350 | def get_info(crawl_id, urls): 351 | """ Get info on existing crawl(s) 352 | 353 | :param crawl_id: list of crawl ids to get info on 354 | :param urls: Get detailed info on crawl, listing all urls 355 | """ 356 | for id_ in crawl_id: 357 | if urls: 358 | res = sesh_get('/crawl/{0}/info'.format(id_)) 359 | else: 360 | res = sesh_get('/crawl/{0}'.format(id_)) 361 | 362 | print(yaml.dump(res)) 363 | 364 | 365 | # ============================================================================ 366 | @crawl.command(name='watch', help='Watch crawling browsers in local browser') 367 | @click.argument('crawl_id', nargs=-1) 368 | def watch_crawl(crawl_id): 369 | """ Watch crawling browsers in local browser 370 | 371 | :param crawl_id: list of crawl ids to watch 372 | """ 373 | for id_ in crawl_id: 374 | res = sesh_get('/crawl/{0}'.format(id_)) 375 | 376 |
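# A crawl can only be watched when there is a live, non-headless browser to
# connect to: the headless flock (flocks/browsers-headless.yaml) runs no
# xserver/VNC container, and a crawl that is not in the 'running' state has no
# browsers at all, so both cases are skipped below.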
if res.get('headless'): 377 | if not is_quiet(): 378 | print("Cannot watch, crawl is running in headless mode") 379 | continue 380 | 381 | if res.get('status') != 'running': 382 | if not is_quiet(): 383 | print('Crawl not running: {0}'.format(id_)) 384 | continue 385 | 386 | browsers = res['browsers'] 387 | 388 | done_count = defaultdict(int) 389 | 390 | for info in res.get('tabs_done', []): 391 | done_count[info['id']] += 1 392 | 393 | if not browsers: 394 | if not is_quiet(): 395 | print('No Browsers') 396 | continue 397 | 398 | open_browsers(browsers, id_, done_count, res['num_tabs']) 399 | 400 | 401 | # ============================================================================ 402 | @crawl.command(name='stop', help='Stop one or more existing crawls') 403 | @click.argument('crawl_id', nargs=-1) 404 | def stop_crawl(crawl_id): 405 | """ Stop one or more existing crawls 406 | 407 | :param crawl_id: list of crawl ids to stop 408 | """ 409 | for id_ in crawl_id: 410 | res = sesh_post('/crawl/{0}/stop'.format(id_)) 411 | 412 | if not res.get('success'): 413 | print('Error stopping: {0}'.format(res)) 414 | return 415 | 416 | if is_quiet(): 417 | print(id_) 418 | else: 419 | print('Stopped Crawl: {0}'.format(id_)) 420 | 421 | 422 | # ============================================================================ 423 | @crawl.command(name='remove', help='Remove one or more existing crawls') 424 | @click.argument('crawl_id', nargs=-1) 425 | def remove_crawl(crawl_id): 426 | """ Remove one or more existing crawls 427 | 428 | :param crawl_id: list of crawl ids to remove 429 | """ 430 | for id_ in crawl_id: 431 | res = sesh_delete('/crawl/{0}'.format(id_)) 432 | 433 | if not res.get('success'): 434 | print('Error removing: {0}'.format(res)) 435 | return 436 | 437 | if is_quiet(): 438 | print(id_) 439 | else: 440 | print('Removed Crawl: {0}'.format(id_)) 441 | 442 | 443 | # ============================================================================ 444 | @crawl.command(name='remove-all', help='Stop and remove all crawls') 445 | def remove_all(): 446 | """ Stop and remove all crawls 447 | """ 448 | res = sesh_get('/crawls') 449 | 450 | crawls = res['crawls'] 451 | 452 | for crawl in crawls: 453 | id_ = crawl['id'] 454 | res = sesh_delete('/crawl/{0}'.format(id_)) 455 | if not is_quiet(): 456 | print('Removed Crawl: {0}'.format(id_)) 457 | 458 | 459 | # ============================================================================ 460 | @crawl.command(name='logs', help='View crawl logs for one or all crawlers') 461 | @click.argument('crawl_id', nargs=1) 462 | @click.option( 463 | '-b', 464 | '--browser', 465 | type=int, 466 | default=0, 467 | help='1-based index of browser to show logs for, or 0 for all (default)', 468 | ) 469 | @click.option( 470 | '-f', 471 | '--follow', 472 | type=bool, 473 | default=False, 474 | is_flag=True, 475 | help='follow crawl log in real-time', 476 | ) 477 | @click.option( 478 | '-a', 479 | '--all-containers', 480 | type=bool, 481 | default=False, 482 | is_flag=True, 483 | help='include logs from all containers, not just crawler', 484 | ) 485 | def logs(crawl_id, browser, follow, all_containers): 486 | """ View crawl logs for one or all crawlers 487 | :param crawl_id: The crawl_id to view logs for 488 | :param browser: 1-based index of browser to show logs for, or 0 for all (default) 489 | :param follow: follow crawl log in real-time (for one browser only) 490 | :param all_containers: include logs from all containers, not just crawler 491 | """ 492 | res = sesh_get('/crawl/{0}'.format(crawl_id)) 493 |
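# The -b/--browser option picks which crawler's log to print: 0 (the default)
# prints the logs of every browser in the crawl, while 1..num_browsers selects
# a single browser (only a single browser's log can usefully be followed with
# -f). Illustrative invocations, assuming the console script is exposed as
# `browsertrix` (the actual entry-point name is defined in setup.py, which is
# not shown here):
#
#   browsertrix crawl logs <crawl_id>                   # logs from all crawler browsers
#   browsertrix crawl logs <crawl_id> -b 2 -f           # follow browser #2 only
#   browsertrix crawl logs <crawl_id> --all-containers  # also include browser-/xserver- containers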
494 | num_browsers = len(res['browsers']) 495 | if browser <= 0: 496 | print_logs(res['browsers'], follow=follow, all_containers=all_containers) 497 | elif browser > num_browsers: 498 | print( 499 | 'Crawl has {0} browsers. Index must be 1 to {1}'.format( 500 | num_browsers, num_browsers 501 | ) 502 | ) 503 | else: 504 | print_logs( 505 | [res['browsers'][browser - 1]], follow=follow, all_containers=all_containers 506 | ) 507 | -------------------------------------------------------------------------------- /browsertrix_cli/main.py: -------------------------------------------------------------------------------- 1 | from browsertrix_cli.basecli import cli 2 | import browsertrix_cli.profile 3 | import browsertrix_cli.crawl 4 | 5 | 6 | # ============================================================================ 7 | if __name__ == '__main__': 8 | cli() 9 | -------------------------------------------------------------------------------- /browsertrix_cli/profile.py: -------------------------------------------------------------------------------- 1 | import click 2 | import docker 3 | import sys 4 | import time 5 | import webbrowser 6 | 7 | 8 | from browsertrix_cli.basecli import cli, is_quiet, sesh_get, settings 9 | 10 | 11 | # ============================================================================ 12 | docker_api = None 13 | 14 | PROFILE_PREFIX = 'oldwebtoday/profile:' 15 | 16 | LABEL_BROWSERPROFILE = 'wr.browserprofile' 17 | LABEL_BASEBROWSER = 'wr.basebrowser' 18 | 19 | 20 | # ============================================================================ 21 | def get_profile_image(profile): 22 | try: 23 | global docker_api 24 | if not docker_api: 25 | docker_api = docker.from_env(version='auto') 26 | 27 | image_name = PROFILE_PREFIX + profile 28 | image = docker_api.images.get(image_name) 29 | assert image.labels.get(LABEL_BROWSERPROFILE) == profile 30 | return 'profile:' + profile 31 | 32 | except (docker.errors.ImageNotFound, AssertionError): 33 | if not is_quiet(): 34 | print('Profile "{0}" not found'.format(profile)) 35 | sys.exit(1) 36 | 37 | 38 | # ============================================================================ 39 | @cli.group(help='Commands for creating/removing browser profiles') 40 | def profile(): 41 | global docker_api 42 | docker_api = docker.from_env(version='auto') 43 | 44 | 45 | # ============================================================================ 46 | @profile.command(name='list', help='List Profiles') 47 | def list_profiles(): 48 | res = docker_api.images.list(filters={'label': LABEL_BROWSERPROFILE}) 49 | 50 | format_str = '{profile: <16} {base}' 51 | if not is_quiet(): 52 | print(format_str.format(profile='PROFILE', base='BASE BROWSER')) 53 | 54 | for image in res: 55 | if not image.tags: 56 | continue 57 | 58 | if not image.tags[0].startswith(PROFILE_PREFIX): 59 | continue 60 | 61 | profile = image.tags[0][len(PROFILE_PREFIX) :] 62 | base_browser = image.labels.get(LABEL_BASEBROWSER, '(unknown)') 63 | 64 | if not is_quiet(): 65 | print(format_str.format(profile=profile, base=base_browser)) 66 | else: 67 | print(profile) 68 | 69 | if not is_quiet(): 70 | print() 71 | 72 | 73 | # ============================================================================ 74 | @profile.command(name='remove', help='Remove Profile') 75 | @click.argument('profile', type=str) 76 | def remove_profile(profile): 77 | full_tag = PROFILE_PREFIX + profile 78 | 79 | try: 80 | docker_api.images.remove(full_tag, force=True, noprune=False) 81 | if not is_quiet(): 82 |
print('Removed profile "{0}"!'.format(profile)) 83 | 84 | except docker.errors.ImageNotFound: 85 | if not is_quiet(): 86 | print('Profile "{0}" not found'.format(profile)) 87 | sys.exit(1) 88 | 89 | 90 | # ============================================================================ 91 | @profile.command(name='create', help='Create Profile') 92 | @click.option( 93 | '--browser', default='chrome:73', type=str, help='Base Browser Image to Extend' 94 | ) 95 | def create_profile(browser): 96 | res = sesh_get( 97 | '/api/request/{0}/about:blank'.format(browser), prefix=settings.shepherd_prefix 98 | ) 99 | 100 | reqid = res.get('reqid') 101 | 102 | curr_browser = None 103 | 104 | webbrowser.open(settings.view_browsers_prefix + reqid) 105 | 106 | print('A new browser window should have been opened') 107 | print( 108 | 'You can use the browser to log-in to accounts or otherwise prepare the browser profile' 109 | ) 110 | print('(The content will not be recorded to WARC)') 111 | 112 | while True: 113 | profile_name = click.prompt( 114 | 'When done, please enter a new name to save the browser profile', type=str 115 | ) 116 | 117 | if not curr_browser: 118 | curr_browser = docker_api.containers.get('browser-' + reqid) 119 | 120 | # exit_code, output = curr_browser.exec_run('/app/prep-commit.sh') 121 | exit_code, output = curr_browser.exec_run('pkill -f "/usr/bin/google-chrome"') 122 | if not is_quiet(): 123 | print('Killed Chrome to Save Profile for Commit') 124 | print('Result: {0}'.format(exit_code)) 125 | print(output.decode('utf-8')) 126 | 127 | time.sleep(1.5) 128 | 129 | conf = { 130 | 'Labels': {LABEL_BROWSERPROFILE: profile_name, LABEL_BASEBROWSER: browser} 131 | } 132 | 133 | res = curr_browser.commit( 134 | repository=PROFILE_PREFIX[:-1], 135 | tag=profile_name, 136 | message='Browser Profile', 137 | conf=conf, 138 | ) 139 | 140 | if not is_quiet(): 141 | print('Created Image: {0} ({1})'.format(res.tags[0], res.short_id)) 142 | 143 | print('The browser should have restarted to about:blank') 144 | if not click.confirm('Continue browsing to create another profile?'): 145 | break 146 | -------------------------------------------------------------------------------- /cli-requirements.txt: -------------------------------------------------------------------------------- 1 | click 2 | docker 3 | pyyaml 4 | requests 5 | -------------------------------------------------------------------------------- /dev-requirements.txt: -------------------------------------------------------------------------------- 1 | flake8 2 | flake8-bugbear 3 | flake8-mypy 4 | mypy 5 | pytest 6 | pytest-asyncio 7 | black 8 | mock 9 | requests 10 | fakeredis 11 | PyYAML 12 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.5' 2 | 3 | services: 4 | browsertrix: 5 | image: webrecorder/browsertrix 6 | build: 7 | context: . 
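    # The environment below wires the API container to the rest of the stack:
    # REDIS_URL points at the shared redis service, DEFAULT_POOL selects a
    # Shepherd browser pool (presumably defined in pool_config.yaml, which is
    # mounted into the shepherd service further down), and the screenshot/DOM
    # API URLs plus PROXY_HOST route captures through the pywb service on
    # port 8080.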
8 | environment: 9 | - REDIS_URL=redis://redis/0 10 | - DEFAULT_POOL=auto-pool 11 | - SCREENSHOT_API_URL=http://pywb:8080/api/screenshot/{coll} 12 | - EXTRACTED_RAW_DOM_API_URL=http://pywb:8080/api/dom/{coll} 13 | - PROXY_HOST=pywb 14 | 15 | depends_on: 16 | - redis 17 | - pywb 18 | 19 | ports: 20 | - 8000:8000 21 | 22 | #volumes: 23 | # - ./:/app/ 24 | 25 | pywb: 26 | build: ./pywb/ 27 | 28 | environment: 29 | - REDIS_URL=redis://redis/0 30 | 31 | volumes: 32 | - ./webarchive:/webarchive 33 | 34 | networks: 35 | - default 36 | - browsers 37 | 38 | ports: 39 | - 8180:8080 40 | 41 | depends_on: 42 | - redis 43 | 44 | solr: 45 | image: solr 46 | 47 | volumes: 48 | - ./solr:/var/solr 49 | 50 | entrypoint: 51 | - docker-entrypoint.sh 52 | - solr-precreate 53 | - browsertrix 54 | 55 | ports: 56 | - 8983:8983 57 | 58 | shepherd: 59 | image: oldwebtoday/shepherd:1.1.0-dev 60 | 61 | environment: 62 | - BROWSER_NET=browsertrix_browsers 63 | - MAIN_NET=browsertrix_default 64 | 65 | - PROXY_HOST=pywb 66 | - PROXY_PORT=8080 67 | 68 | depends_on: 69 | - redis 70 | 71 | volumes: 72 | - /var/run/docker.sock:/var/run/docker.sock 73 | - ./flocks:/app/flocks 74 | - ./pool_config.yaml:/app/pool_config.yaml 75 | 76 | ports: 77 | - 9020:9020 78 | 79 | redis: 80 | image: redis:3.2.4 81 | 82 | behaviors: 83 | image: webrecorder/behaviors 84 | 85 | ports: 86 | - 3030:3030 87 | 88 | networks: 89 | default: 90 | driver: bridge 91 | 92 | browsers: 93 | driver: bridge 94 | 95 | -------------------------------------------------------------------------------- /flocks/browsers-headless.yaml: -------------------------------------------------------------------------------- 1 | name: browsers-headless 2 | auto_remove: false 3 | 4 | volumes: 5 | tmpdir: '/tmp/.X11-unix' 6 | 7 | containers: 8 | - name: browser 9 | image: oldwebtoday/base-browser 10 | set_user_params: true 11 | external_network: '${BROWSER_NET}' 12 | 13 | environment: 14 | DISPLAY: ':99' 15 | PULSE_SERVER: '/tmp/.X11-unix/pulse-socket' 16 | 17 | PROXY_HOST: 'pywb' 18 | PROXY_PORT: '8080' 19 | PROXY_CA_URL: 'http://wsgiprox/download/pem' 20 | PROXY_CA_FILE: '/tmp/proxy-ca.pem' 21 | 22 | - name: autobrowser 23 | image: webrecorder/autobrowser 24 | 25 | external_network: '${MAIN_NET}' 26 | 27 | environment: 28 | BROWSER_HOST: 'browser' 29 | REDIS_URL: 'redis://redis:6379/0' 30 | TAB_TYPE: 'CrawlerTab' 31 | 32 | 33 | -------------------------------------------------------------------------------- /flocks/browsers.yaml: -------------------------------------------------------------------------------- 1 | name: browsers 2 | auto_remove: false 3 | 4 | volumes: 5 | tmpdir: '/tmp/.X11-unix' 6 | 7 | containers: 8 | - name: xserver 9 | image: oldwebtoday/vnc-webrtc-audio 10 | ports: 11 | vnc_port: 6080 12 | cmd_port: 6082 13 | ice_tcp_port: 10235 14 | ice_udp_port: '10235/udp' 15 | 16 | environment: 17 | IDLE_TIMEOUT: '${IDLE_TIMEOUT}' 18 | 19 | - name: browser 20 | image: oldwebtoday/base-browser 21 | image_label: wr.name 22 | set_user_params: true 23 | external_network: '${BROWSER_NET}' 24 | 25 | environment: 26 | DISPLAY: ':99' 27 | PULSE_SERVER: '/tmp/.X11-unix/pulse-socket' 28 | 29 | #default to no proxy, set by crawler 30 | #PROXY_HOST: 'pywb' 31 | PROXY_PORT: '8080' 32 | PROXY_CA_URL: 'http://wsgiprox/download/pem' 33 | PROXY_CA_FILE: '/tmp/proxy-ca.pem' 34 | 35 | - name: autobrowser 36 | image: webrecorder/autobrowser 37 | 38 | external_network: '${MAIN_NET}' 39 | 40 | environment: 41 | BROWSER_HOST: 'browser' 42 | REDIS_URL: 'redis://redis:6379/0' 43 | TAB_TYPE: 
'CrawlerTab' 44 | 45 | 46 | -------------------------------------------------------------------------------- /frontend/.eslintrc.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | extends: ['plugin:prettier/recommended', 'prettier/react'], 3 | parser: 'babel-eslint', 4 | parserOptions: { 5 | ecmaVersion: 10 6 | }, 7 | env: { 8 | browser: true, 9 | node: true 10 | } 11 | }; 12 | -------------------------------------------------------------------------------- /frontend/.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "singleQuote": true, 3 | "jsxSingleQuote": true 4 | } 5 | -------------------------------------------------------------------------------- /frontend/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "crawlmanager-frontend", 3 | "version": "1.0.0", 4 | "main": "index.js", 5 | "license": "Appache-2.0", 6 | "dependencies": { 7 | "@hot-loader/react-dom": "16.8.6", 8 | "fstream": "^1.0.12", 9 | "immutable": "^4.0.0-rc.12", 10 | "js-yaml": "^3.13.1", 11 | "plur": "^3.1.1", 12 | "prop-types": "^15.7.2", 13 | "react": "^16.8.6", 14 | "react-dom": "^16.8.6", 15 | "react-redux": "^6.0.1", 16 | "react-router": "^5.0.0", 17 | "react-router-dom": "^5.0.0", 18 | "react-table": "^6.9.2", 19 | "react-toastify": "^5.0.0-rc.3", 20 | "react-virtualized": "^9.21.0", 21 | "redux": "^4.0.1", 22 | "redux-actions": "^2.6.5", 23 | "redux-batched-actions": "^0.4.1", 24 | "redux-devtools-extension": "^2.13.8", 25 | "redux-form": "^8.1.0", 26 | "redux-immutable": "^4.0.0", 27 | "redux-promise": "^0.6.0", 28 | "redux-thunk": "^2.3.0", 29 | "uikit": "^3.0.3", 30 | "url-regex": "^4.1.1" 31 | }, 32 | "devDependencies": { 33 | "@babel/cli": "^7.4.3", 34 | "@babel/core": "^7.4.3", 35 | "@babel/plugin-proposal-class-properties": "^7.4.0", 36 | "@babel/plugin-proposal-decorators": "^7.4.0", 37 | "@babel/preset-env": "^7.4.3", 38 | "@babel/preset-react": "^7.0.0", 39 | "@types/react": "^16.8.13", 40 | "babel-eslint": "^10.0.1", 41 | "babel-loader": "^8.0.5", 42 | "babel-plugin-add-module-exports": "^1.0.0", 43 | "babel-plugin-transform-react-remove-prop-types": "^0.4.24", 44 | "babel-preset-env": "^1.7.0", 45 | "css-loader": "^2.1.1", 46 | "eslint": "^5.16.0", 47 | "eslint-config-prettier": "^4.1.0", 48 | "eslint-plugin-prettier": "^3.0.1", 49 | "eslint-plugin-react": "^7.12.4", 50 | "fastify": "^2.2.0", 51 | "fastify-static": "^2.4.0", 52 | "fastify-webpack-hmr": "^2.0.1", 53 | "file-loader": "^3.0.1", 54 | "html-webpack-plugin": "^3.2.0", 55 | "node-sass": "^4.11.0", 56 | "prettier": "^1.16.4", 57 | "react-hot-loader": "^4.8.3", 58 | "sass-loader": "^7.1.0", 59 | "style-loader": "^0.23.1", 60 | "url-loader": "^1.1.2", 61 | "webpack": "^4.29.6", 62 | "webpack-cli": "^3.3.0", 63 | "webpack-dev-middleware": "^3.6.2", 64 | "webpack-hot-middleware": "^2.24.3" 65 | }, 66 | "scripts": { 67 | "develop": "node ./webpack/development-server.js", 68 | "build-prod": "NODE_ENV=production webpack --config ./webpack/webpack.config.js" 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /frontend/public/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | HI 6 | 7 | 8 |
9 | 10 | 11 | -------------------------------------------------------------------------------- /frontend/src/actions/crawls.js: -------------------------------------------------------------------------------- 1 | import { toast } from 'react-toastify'; 2 | import { makeHTTPRequest } from './httpRequests'; 3 | import { EndpointRequests } from '../utils'; 4 | 5 | export const ActionTypes = { 6 | getAll: Symbol('crawl-get-all'), 7 | gotAll: Symbol('crawl-got-all'), 8 | gotAllInit: Symbol('crawl-got-all-init'), 9 | create: Symbol('crawl-create'), 10 | urls: Symbol('crawl-get-urls'), 11 | addURLs: Symbol('crawl-add-urls'), 12 | updateURLInfo: Symbol('crawl-update-url-info'), 13 | info: Symbol('crawl-get-info'), 14 | stop: Symbol('crawl-stop'), 15 | start: Symbol('crawl-start'), 16 | isDone: Symbol('crawl-is-done'), 17 | deleteCrawl: Symbol('crawl-delete'), 18 | updateInfo: Symbol('crawl-update-info') 19 | }; 20 | 21 | export function getAllCrawls(init = false) { 22 | const request = EndpointRequests.retrieveAllCrawls(); 23 | return makeHTTPRequest(request, { 24 | onError({ error }) { 25 | toast(`Failed to retrieve info about all crawls: ${error}`, { 26 | type: toast.TYPE.ERROR 27 | }); 28 | }, 29 | async onResponse({ response }) { 30 | if (!response.ok) { 31 | toast( 32 | `Failed to retrieve info about all crawls: Details 33 | HTTP ${response.status}`, 34 | { 35 | type: toast.TYPE.ERROR 36 | } 37 | ); 38 | return; 39 | } 40 | return { 41 | type: init ? ActionTypes.gotAllInit : ActionTypes.gotAll, 42 | payload: await response.json() 43 | }; 44 | } 45 | }); 46 | } 47 | 48 | export function getCrawlInfo(id) { 49 | const request = EndpointRequests.crawlInfo(id); 50 | return makeHTTPRequest(request, { 51 | onError({ error }) { 52 | toast(`Failed to retrieve the info for crawl - ${id}: ${error}`, { 53 | type: toast.TYPE.ERROR 54 | }); 55 | }, 56 | async onResponse({ response }) { 57 | const json = await response.json(); 58 | if (!response.ok) { 59 | toast( 60 | `Failed to get the crawl info - ${id}: Details 61 | ${json.detail}`, 62 | { 63 | type: toast.TYPE.ERROR 64 | } 65 | ); 66 | return; 67 | } 68 | return { 69 | type: ActionTypes.info, 70 | payload: json 71 | }; 72 | } 73 | }); 74 | } 75 | 76 | export function addCrawlURLs(id, urls) { 77 | const { request } = EndpointRequests.addCrawlURLs(id, urls); 78 | return makeHTTPRequest(request, { 79 | onError({ error }) { 80 | toast(`Failed to add the urls to the crawl - ${id}: ${error}`, { 81 | type: toast.TYPE.ERROR 82 | }); 83 | }, 84 | async onResponse({ response }) { 85 | const json = await response.json(); 86 | if (!response.ok) { 87 | toast( 88 | `Failed to add urls to the crawl - ${id}: Details 89 | ${json.detail}`, 90 | { 91 | type: toast.TYPE.ERROR 92 | } 93 | ); 94 | return; 95 | } 96 | return { 97 | type: ActionTypes.addURLs, 98 | payload: { 99 | id, 100 | urls 101 | } 102 | }; 103 | } 104 | }); 105 | } 106 | 107 | export function getCrawlURLs(id) { 108 | const { request } = EndpointRequests.getCrawlURLs(id); 109 | return makeHTTPRequest(request, { 110 | onError({ error }) { 111 | toast(`Failed to retrieve the crawls URLs - ${id}: ${error}`, { 112 | type: toast.TYPE.ERROR 113 | }); 114 | }, 115 | async onResponse({ response }) { 116 | const json = await response.json(); 117 | if (!response.ok) { 118 | toast( 119 | `Failed to retrieve the crawls URLs - ${id}: Details 120 | ${json.detail}`, 121 | { 122 | type: toast.TYPE.ERROR 123 | } 124 | ); 125 | return; 126 | } 127 | return { 128 | type: ActionTypes.updateURLInfo, 129 | payload: 
Object.assign( 130 | { 131 | id 132 | }, 133 | json 134 | ) 135 | }; 136 | } 137 | }); 138 | } 139 | 140 | /** 141 | * 142 | * @param {Object} [newCrawlConfig] 143 | */ 144 | export function createCrawl(newCrawlConfig) { 145 | const { body, request } = EndpointRequests.createNewCrawl(newCrawlConfig); 146 | return makeHTTPRequest(request, { 147 | onError({ error }) { 148 | toast(`Failed to create the new crawl ${error}`, { 149 | type: toast.TYPE.ERROR 150 | }); 151 | }, 152 | async onResponse({ dispatch, response }) { 153 | const json = await response.json(); 154 | if (!response.ok) { 155 | toast( 156 | `Failed to create the crawl - ${json.id}: Details 157 | ${json.detail}`, 158 | { 159 | type: toast.TYPE.ERROR 160 | } 161 | ); 162 | return; 163 | } 164 | const request = EndpointRequests.crawlInfo(json.id); 165 | const infoResponse = await fetch(request); 166 | const result = infoResponse.ok 167 | ? await infoResponse.json() 168 | : Object.assign( 169 | { 170 | id: json.id 171 | }, 172 | body, 173 | newCrawlConfig.crawlRunInfo 174 | ); 175 | 176 | console.log('newly created crawl info', result); 177 | return { 178 | type: ActionTypes.create, 179 | payload: result 180 | }; 181 | } 182 | }); 183 | } 184 | 185 | /** 186 | * 187 | * @param {string} id 188 | * @param {Object} [startConfig] 189 | */ 190 | export function startCrawl(id, startConfig) { 191 | const { body, request } = EndpointRequests.startCrawl(id, startConfig); 192 | return makeHTTPRequest(request, { 193 | onError({ error }) { 194 | toast(`Failed to start the crawl - ${id}: ${error}`, { 195 | type: toast.TYPE.ERROR 196 | }); 197 | }, 198 | async onResponse({ response }) { 199 | const json = await response.json(); 200 | if (!response.ok) { 201 | toast( 202 | `Failed to start the crawl - ${id}: Details 203 | ${json.detail}`, 204 | { 205 | type: toast.TYPE.ERROR 206 | } 207 | ); 208 | return; 209 | } 210 | const result = { 211 | id, 212 | ...body 213 | }; 214 | console.log('started crawl info', result); 215 | return { 216 | type: ActionTypes.start, 217 | payload: result 218 | }; 219 | } 220 | }); 221 | } 222 | 223 | export function stopCrawl(id) { 224 | const request = EndpointRequests.stopCrawl(id); 225 | return makeHTTPRequest(request, { 226 | onError({ error }) { 227 | toast(`Failed to remove the crawl - ${id}: ${error}`, { 228 | type: toast.TYPE.ERROR 229 | }); 230 | }, 231 | async onResponse({ dispatch, response }) { 232 | const json = await response.json(); 233 | if (!response.ok) { 234 | toast( 235 | `Failed to stop the crawl - ${id}: Details 236 | ${json.detail}`, 237 | { 238 | type: toast.TYPE.ERROR 239 | } 240 | ); 241 | return; 242 | } 243 | return { 244 | type: ActionTypes.stop, 245 | payload: { id } 246 | }; 247 | } 248 | }); 249 | } 250 | 251 | export function removeCrawl(id) { 252 | const request = EndpointRequests.removeCrawl(id); 253 | return makeHTTPRequest(request, { 254 | onError({ error }) { 255 | toast(`Failed to remove the crawl - ${id}: ${error}`, { 256 | type: toast.TYPE.ERROR 257 | }); 258 | }, 259 | async onResponse({ dispatch, response }) { 260 | const json = await response.json(); 261 | if (!response.ok) { 262 | toast( 263 | `Failed to remove the crawl - ${id}: Details 264 | ${json.detail}`, 265 | { 266 | type: toast.TYPE.ERROR 267 | } 268 | ); 269 | return; 270 | } 271 | return { 272 | type: ActionTypes.deleteCrawl, 273 | payload: { id } 274 | }; 275 | } 276 | }); 277 | } 278 | -------------------------------------------------------------------------------- /frontend/src/actions/httpRequests.js: 
-------------------------------------------------------------------------------- 1 | export const HTTPRequestAction = Symbol('http-request-maker'); 2 | 3 | export const FetchStates = { 4 | preflight: Symbol('http-request-preflight'), 5 | inflight: Symbol('http-request-inflight'), 6 | done: Symbol('http-request-done'), 7 | error: Symbol('http-request-errored') 8 | }; 9 | 10 | export function requestErrorAction({ error, payload }) { 11 | return { 12 | type: HTTPRequestAction, 13 | payload: Object.assign({ error }, payload) 14 | }; 15 | } 16 | 17 | /** 18 | * @typedef {Object} MakeHTTPRequestInit 19 | * @property {function ({dispatch: Function, response: Response}): *} onResponse 20 | * @property {function ({dispatch: Function, error: Error}): *} onError 21 | */ 22 | 23 | function requestComplete(nextAction, wasError, url) { 24 | nextAction.meta = nextAction.meta || {}; 25 | nextAction.meta.httpRequest = { 26 | url, 27 | state: wasError ? FetchStates.error : FetchStates.done 28 | }; 29 | return nextAction; 30 | } 31 | 32 | /** 33 | * 34 | * @param {Request} request 35 | * @param {MakeHTTPRequestInit} init 36 | */ 37 | export function makeHTTPRequest(request, { onResponse, onError }) { 38 | return dispatch => { 39 | const init = { 40 | type: HTTPRequestAction, 41 | meta: { 42 | httpRequest: { 43 | state: FetchStates.preflight, 44 | url: request.url 45 | } 46 | } 47 | }; 48 | if (!dispatch(init)) return; // no op, this is a duplicate request 49 | let wasError = false; 50 | dispatch( 51 | fetch(request) 52 | .then(response => onResponse({ dispatch, response })) 53 | .catch(error => { 54 | wasError = true; 55 | return onError({ dispatch, error }); 56 | }) 57 | .then(requestFinished => 58 | requestComplete( 59 | requestFinished || { type: HTTPRequestAction }, 60 | wasError, 61 | request.url 62 | ) 63 | ) 64 | ); 65 | }; 66 | } 67 | -------------------------------------------------------------------------------- /frontend/src/actions/index.js: -------------------------------------------------------------------------------- 1 | export * from './crawls'; 2 | -------------------------------------------------------------------------------- /frontend/src/components/Crawl/Control.js: -------------------------------------------------------------------------------- 1 | import React, { Component } from 'react'; 2 | import * as PropTypes from 'prop-types'; 3 | import { CrawlRecord } from '../../reducers/crawls'; 4 | 5 | export default class Control extends Component { 6 | static propTypes = { 7 | crawl: PropTypes.instanceOf(CrawlRecord).isRequired, 8 | getCrawlInfo: PropTypes.func.isRequired, 9 | startCrawl: PropTypes.func.isRequired, 10 | stopCrawl: PropTypes.func.isRequired, 11 | removeCrawl: PropTypes.func.isRequired 12 | }; 13 | 14 | startCrawl() { 15 | const { crawl } = this.props; 16 | this.props.startCrawl(crawl.startCrawlConfig()); 17 | } 18 | 19 | constructor(props) { 20 | super(props); 21 | this.startCrawl = this.startCrawl.bind(this); 22 | } 23 | 24 | render() { 25 | const { crawl, getCrawlInfo, removeCrawl, stopCrawl } = this.props; 26 | return ( 27 |
28 |
29 | 36 |
37 |
38 | 45 |
46 |
47 | 53 |
54 |
55 | 58 |
59 |
60 | ); 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /frontend/src/components/Crawl/Info.js: -------------------------------------------------------------------------------- 1 | import React, { Component } from 'react'; 2 | import * as PropTypes from 'prop-types'; 3 | import { CrawlRecord } from '../../reducers/crawls'; 4 | 5 | export default class Info extends Component { 6 | static propTypes = { 7 | crawl: PropTypes.instanceOf(CrawlRecord).isRequired 8 | }; 9 | 10 | renderQueue() { 11 | const crawlId = this.props.crawl.id; 12 | const queue = this.props.crawl.queue; 13 | const q = new Array(queue.size); 14 | for (let i = 0; i < queue.size; i++) { 15 | const qinfo = queue.get(i); 16 | q[i] = ( 17 |
  • 18 |

    19 | {qinfo.get('url')} @ depth {qinfo.get('depth')} 20 |

    21 |
  • 22 | ); 23 | } 24 | return ( 25 | <> 26 |

    Queue

    27 |
      {q}
    28 | 29 | ); 30 | } 31 | 32 | renderBrowsers() { 33 | const crawlId = this.props.crawl.id; 34 | const browsers = this.props.crawl.browsers; 35 | const b = new Array(browsers.length); 36 | for (let i = 0; i < browsers.length; i++) { 37 | b[i] = ( 38 |
  • 39 | 43 | View {browsers[i]} 44 | 45 |
  • 46 | ); 47 | } 48 | return ( 49 | <> 50 |

    View Running Crawl

    51 |
      {b}
    52 | 53 | ); 54 | } 55 | 56 | render() { 57 | const { crawl } = this.props; 58 | console.log('Viewing crawl info', crawl.toJS()); 59 | return ( 60 | <> 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 |
    TypeStatusNum BrowsersNum TabsCrawl Depth
    {crawl.crawl_type}{crawl.status || 'new'}{crawl.num_browsers}{crawl.num_tabs}{crawl.crawl_depth}
    81 | {crawl.browsers.length > 0 && this.renderBrowsers()} 82 | {crawl.queue.size > 0 && this.renderQueue()} 83 | 84 | ); 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /frontend/src/components/Crawl/index.js: -------------------------------------------------------------------------------- 1 | import React, { Component } from 'react'; 2 | import * as PropTypes from 'prop-types'; 3 | import { connect } from 'react-redux'; 4 | import { Redirect } from 'react-router-dom'; 5 | import { CrawlRecord } from '../../reducers/crawls'; 6 | import { 7 | getCrawlInfo, 8 | removeCrawl, 9 | startCrawl, 10 | stopCrawl 11 | } from '../../actions'; 12 | import Info from './Info'; 13 | import Control from './Control'; 14 | 15 | class Crawl extends Component { 16 | static propTypes = { 17 | location: PropTypes.object.isRequired, 18 | match: PropTypes.object.isRequired, 19 | crawl: PropTypes.instanceOf(CrawlRecord), 20 | getCrawlInfo: PropTypes.func.isRequired, 21 | startCrawl: PropTypes.func.isRequired, 22 | stopCrawl: PropTypes.func.isRequired, 23 | removeCrawl: PropTypes.func.isRequired 24 | }; 25 | 26 | constructor(props) { 27 | super(props); 28 | 29 | this.handle = null; 30 | } 31 | 32 | componentDidMount() { 33 | const { crawl } = this.props; 34 | 35 | if (crawl.get('status') === 'running') { 36 | this.autoUpdate(); 37 | } 38 | } 39 | 40 | componentDidUpdate(prevProps) { 41 | const { crawl } = this.props; 42 | 43 | if ( 44 | crawl && 45 | crawl.get('status') === 'running' && 46 | prevProps.crawl.get('status') !== 'running' 47 | ) { 48 | this.autoUpdate(); 49 | } else if ( 50 | !crawl || 51 | (crawl.get('status') !== 'running' && prevProps.crawl.get('status') === 'running') 52 | ) { 53 | clearInterval(this.handle); 54 | } 55 | } 56 | 57 | componentWillUnmount() { 58 | clearInterval(this.handle); 59 | } 60 | 61 | autoUpdate = () => { 62 | this.handle = setInterval(this.props.getCrawlInfo, 1000); 63 | } 64 | 65 | render() { 66 | if (this.props.crawl == null) return ; 67 | const { 68 | crawl, 69 | getCrawlInfo, 70 | removeCrawl, 71 | stopCrawl, 72 | startCrawl 73 | } = this.props; 74 | return ( 75 |
    76 |
    80 |
    81 |

    Crawl Id - {crawl.id}

    82 |
    83 |
    84 | 85 | {crawl.running ? '' : 'Not'} Running 86 | 87 |
    88 |
    89 | 96 | 97 |
    98 | ); 99 | } 100 | } 101 | 102 | const mapStateToProps = (state, ownProps) => ({ 103 | crawl: state.get('crawls').get(ownProps.match.params.crawlid) 104 | }); 105 | 106 | const mapDispatchToProps = (dispatch, ownProps) => ({ 107 | getCrawlInfo() { 108 | dispatch(getCrawlInfo(ownProps.match.params.crawlid)); 109 | }, 110 | startCrawl(config) { 111 | dispatch(startCrawl(ownProps.match.params.crawlid, config)); 112 | }, 113 | stopCrawl() { 114 | dispatch(stopCrawl(ownProps.match.params.crawlid)); 115 | }, 116 | removeCrawl() { 117 | dispatch(removeCrawl(ownProps.match.params.crawlid)); 118 | } 119 | }); 120 | 121 | const ConnectedCrawl = connect( 122 | mapStateToProps, 123 | mapDispatchToProps 124 | )(Crawl); 125 | 126 | export default ConnectedCrawl; 127 | -------------------------------------------------------------------------------- /frontend/src/components/CrawlCreator/CreationForm.js: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import FormSection from 'redux-form/lib/FormSection'; 3 | import Field from 'redux-form/lib/immutable/Field'; 4 | import FieldArray from 'redux-form/lib/immutable/FieldArray'; 5 | import reduxForm from 'redux-form/lib/immutable/reduxForm'; 6 | import { 7 | CrawlConfigInputField, 8 | CrawlConfigSelectField, 9 | URLFields 10 | } from './fields'; 11 | 12 | export function validate(values, props) { 13 | const errors = { 14 | crawlInfo: {}, 15 | }; 16 | const crawlInfo = values.get('crawlInfo'); 17 | if (!crawlInfo) { 18 | errors.crawlInfo = 'required'; 19 | } else { 20 | if (values.get('num_browsers') <= 0) { 21 | errors.crawlInfo.num_browsers = 22 | 'The number of browser to be used cannot be less than or equal to zero'; 23 | } 24 | if (values.get('num_tabs') <= 0) { 25 | errors.crawlInfo.num_tabs = 26 | 'The number of tabs to be used cannot be less than or equal to zero'; 27 | } 28 | if (values.get('crawl_depth') <= 0) { 29 | errors.crawlInfo.crawl_depth = 30 | 'The depth of crawl cannot be less than or equal to zero'; 31 | } 32 | 33 | if (values.get('behavior_max_time') <= 0) { 34 | errors.crawlInfo.behavior_max_time = 35 | 'The runtime of behaviors must be greater than zero'; 36 | } 37 | const seeds = crawlInfo.get('seed_urls'); 38 | if (!seeds || seeds.size <= 0) { 39 | errors.crawlInfo.seed_urls = 'Must include seed urls'; 40 | } 41 | } 42 | return errors; 43 | } 44 | 45 | function seedURLsRequired(value, allValues, props) { 46 | if (!value) return 'Required'; 47 | if (value.size === 0) return 'Required'; 48 | } 49 | 50 | function CrawlCreationForm({ crawlType, handleSubmit, valid, submitting }) { 51 | const submitDisabled = !valid || submitting; 52 | return ( 53 |
    54 | 55 |
    56 |
    57 |
    58 | 64 | 70 | 75 | 76 | 77 | 78 | 79 | 80 | 86 | 92 | 99 |
    100 |
      101 |
    • 102 | More Options... 103 |
      104 | 111 | 116 | 117 | 118 | 123 | 124 | 125 | 126 | 131 | 132 | 133 | 134 | 135 |
      136 |
    • 137 |
    138 |
    139 |
    140 | 145 |
    146 |
    147 |
    148 | 155 |
    156 |
    157 |
    158 | ); 159 | } 160 | 161 | export const initialValues = { 162 | crawlInfo: { 163 | crawl_type: 'single-page', 164 | num_browsers: 1, 165 | num_tabs: 1, 166 | name: 'test crawl', 167 | coll: 'test', 168 | crawl_depth: 1, 169 | headless: false, 170 | cache: 'always', 171 | browser: 'chrome:73', 172 | behavior_max_time: 60, 173 | seed_urls: [], 174 | } 175 | }; 176 | 177 | export default reduxForm({ 178 | form: 'CreateCrawl', 179 | enableReinitialize: true, 180 | destroyOnUnmount: false, 181 | initialValues, 182 | validate 183 | })(CrawlCreationForm); 184 | -------------------------------------------------------------------------------- /frontend/src/components/CrawlCreator/fields.js: -------------------------------------------------------------------------------- 1 | import React, { Component } from 'react'; 2 | import * as PropTypes from 'prop-types'; 3 | import Field from 'redux-form/lib/immutable/Field'; 4 | import urlRegx from 'url-regex'; 5 | import UIKit from 'uikit'; 6 | 7 | export function CrawlConfigSelectField({ label, input, meta, children }) { 8 | const id = input.name; 9 | return ( 10 |
    11 | 14 | 17 |
    18 | ); 19 | } 20 | 21 | export function CrawlConfigInputField({ disabled, label, min, type, input, meta, title }) { 22 | const id = `${input.name}-${type}`; 23 | const className = `uk-input ${meta.valid ? '' : 'uk-form-danger'}`; 24 | return ( 25 |
    26 | 29 | 30 |
    31 | ); 32 | } 33 | 34 | const isURLRe = urlRegx({ exact: true, strict: false }); 35 | 36 | const isURLTest = url => (isURLRe.test(url) ? null : 'Not a URL'); 37 | 38 | class URLToCrawl extends Component { 39 | static propTypes = { 40 | idx: PropTypes.number.isRequired, 41 | remove: PropTypes.func.isRequired 42 | }; 43 | 44 | constructor(props) { 45 | super(props); 46 | this.renderURL = this.renderURL.bind(this); 47 | this.remove = this.remove.bind(this); 48 | } 49 | 50 | renderURL({ input, meta }) { 51 | const className = `uk-input ${meta.valid ? '' : 'uk-form-danger'}`; 52 | return ( 53 | 63 | ); 64 | } 65 | 66 | remove() { 67 | this.props.remove(this.props.idx); 68 | } 69 | 70 | render() { 71 | return ( 72 | 85 | ); 86 | } 87 | } 88 | 89 | export class BulkURLInput extends Component { 90 | static propTypes = { 91 | addURL: PropTypes.func.isRequired 92 | }; 93 | 94 | constructor(props) { 95 | super(props); 96 | this.textAreaRef = React.createRef(); 97 | this.addURLs = this.addURLs.bind(this); 98 | this.close = this.close.bind(this); 99 | } 100 | 101 | componentWillUnmount() { 102 | UIKit.modal('#bulk-seed-input').$destroy(true); 103 | } 104 | 105 | addURLs() { 106 | const value = this.textAreaRef.current.value; 107 | if (value) { 108 | const rawValues = value.split('\n'); 109 | let added = false; 110 | for (let i = 0; i < rawValues.length; i++) { 111 | const value = rawValues[i].trim(); 112 | if (isURLRe.test(value)) { 113 | added = true; 114 | this.props.addURL(value); 115 | } 116 | } 117 | if (added) { 118 | this.close(); 119 | } 120 | } 121 | } 122 | 123 | close() { 124 | this.textAreaRef.current.value = ''; 125 | UIKit.modal('#bulk-seed-input').hide(); 126 | } 127 | 128 | render() { 129 | return ( 130 |
    131 |
    132 |
    133 |

    Enter URLs On A Single Line

    134 |
    135 |
    136 |