├── .coveragerc
├── .dockerignore
├── .flake8
├── .gitignore
├── .travis.yml
├── Dockerfile
├── LICENSE
├── NOTICE
├── README.md
├── browsertrix
│   ├── __init__.py
│   ├── api.py
│   ├── crawl.py
│   ├── schema.py
│   └── utils.py
├── browsertrix_cli
│   ├── basecli.py
│   ├── crawl.py
│   ├── main.py
│   └── profile.py
├── cli-requirements.txt
├── dev-requirements.txt
├── docker-compose.yml
├── flocks
│   ├── browsers-headless.yaml
│   └── browsers.yaml
├── frontend
│   ├── .eslintrc.js
│   ├── .prettierrc
│   ├── package.json
│   ├── public
│   │   └── index.html
│   ├── src
│   │   ├── actions
│   │   │   ├── crawls.js
│   │   │   ├── httpRequests.js
│   │   │   └── index.js
│   │   ├── components
│   │   │   ├── Crawl
│   │   │   │   ├── Control.js
│   │   │   │   ├── Info.js
│   │   │   │   └── index.js
│   │   │   ├── CrawlCreator
│   │   │   │   ├── CreationForm.js
│   │   │   │   ├── fields.js
│   │   │   │   ├── index.js
│   │   │   │   └── validate.js
│   │   │   ├── Crawls
│   │   │   │   ├── LoadingCrawls.js
│   │   │   │   ├── SelectCrawl.js
│   │   │   │   └── index.js
│   │   │   └── Header
│   │   │       ├── HeaderLink.js
│   │   │       └── index.js
│   │   ├── containers
│   │   │   └── App.js
│   │   ├── reducers
│   │   │   ├── crawls.js
│   │   │   └── index.js
│   │   ├── root.js
│   │   ├── store
│   │   │   ├── dev.js
│   │   │   ├── index.js
│   │   │   ├── middleware.js
│   │   │   └── prod.js
│   │   ├── styles
│   │   │   └── global.scss
│   │   ├── utils
│   │   │   ├── bootstrap.js
│   │   │   ├── endpoints.js
│   │   │   ├── index.js
│   │   │   └── rhlConfig.js
│   │   └── wrap-with-provider.js
│   ├── webpack
│   │   ├── development-server.js
│   │   └── webpack.config.js
│   └── yarn.lock
├── install-browsers.sh
├── mypy.ini
├── pool_config.yaml
├── pyproject.toml
├── pytest.ini
├── pywb
│   ├── Dockerfile
│   ├── config.yaml
│   ├── crawlapp.py
│   ├── run.sh
│   ├── static
│   │   └── browsertrix-logo.svg
│   ├── templates
│   │   └── fullsearch.html
│   └── uwsgi.ini
├── requirements.txt
├── sample-crawls
│   ├── custom-scopes.yaml
│   ├── emulate-mobile-browser.yaml
│   ├── example.yaml
│   ├── override-browser-http-cookies-language.yaml
│   ├── social-media-replay.yaml
│   └── social-media.yaml
├── scripts
│   ├── format.sh
│   └── lint.sh
├── setup.py
├── static
│   ├── .gitkeep
│   ├── app.js
│   ├── browsertrix-logo.svg
│   └── index.html
├── test-docker-requirements.txt
├── test-local-requirements.txt
└── tests
    ├── __init__.py
    ├── conftest.py
    ├── crawl_tests.yaml
    ├── start-test-compose.sh
    ├── stop-test-compose.sh
    ├── test-docker-compose.yml
    ├── test_api.py
    ├── test_live_crawl.py
    ├── test_pool_config.yaml
    └── utils.py
/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | source = codecov
3 | branch = True
4 | omit =
5 | */test/*
6 | */tests/*
7 |
8 |
--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | pywb
2 | build
3 | webarchive
4 | browsertrix_cli
5 |
6 | # Created by .ignore support plugin (hsz.mobi)
7 | ### Node template
8 | # Logs
9 | logs
10 | *.log
11 | npm-debug.log*
12 | yarn-debug.log*
13 | yarn-error.log*
14 |
15 | # Runtime data
16 | pids
17 | *.pid
18 | *.seed
19 | *.pid.lock
20 |
21 | # Directory for instrumented libs generated by jscoverage/JSCover
22 | lib-cov
23 |
24 | # Coverage directory used by tools like istanbul
25 | coverage
26 |
27 | # nyc test coverage
28 | .nyc_output
29 |
30 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
31 | .grunt
32 |
33 | # Bower dependency directory (https://bower.io/)
34 | bower_components
35 |
36 | # node-waf configuration
37 | .lock-wscript
38 |
39 | # Compiled binary addons (https://nodejs.org/api/addons.html)
40 | build/Release
41 |
42 | # Dependency directories
43 | node_modules/
44 | jspm_packages/
45 |
46 | # TypeScript v1 declaration files
47 | typings/
48 |
49 | # Optional npm cache directory
50 | .npm
51 |
52 | # Optional eslint cache
53 | .eslintcache
54 |
55 | # Optional REPL history
56 | .node_repl_history
57 |
58 | # Output of 'npm pack'
59 | *.tgz
60 |
61 | # Yarn Integrity file
62 | .yarn-integrity
63 |
64 | # dotenv environment variables file
65 | .env
66 |
67 | # next.js build output
68 | .next
69 | ### JetBrains template
70 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
71 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
72 |
73 | # User-specific stuff
74 | .idea/**/workspace.xml
75 | .idea/**/tasks.xml
76 | .idea/**/dictionaries
77 | .idea/**/shelf
78 |
79 | # Sensitive or high-churn files
80 | .idea/**/dataSources/
81 | .idea/**/dataSources.ids
82 | .idea/**/dataSources.local.xml
83 | .idea/**/sqlDataSources.xml
84 | .idea/**/dynamic.xml
85 | .idea/**/uiDesigner.xml
86 | .idea/**/dbnavigator.xml
87 |
88 | # Gradle
89 | .idea/**/gradle.xml
90 | .idea/**/libraries
91 |
92 | # CMake
93 | cmake-build-debug/
94 | cmake-build-release/
95 |
96 | # Mongo Explorer plugin
97 | .idea/**/mongoSettings.xml
98 |
99 | # File-based project format
100 | *.iws
101 |
102 | # IntelliJ
103 | out/
104 |
105 | # mpeltonen/sbt-idea plugin
106 | .idea_modules/
107 |
108 | # JIRA plugin
109 | atlassian-ide-plugin.xml
110 |
111 | # Cursive Clojure plugin
112 | .idea/replstate.xml
113 |
114 | # Crashlytics plugin (for Android Studio and IntelliJ)
115 | com_crashlytics_export_strings.xml
116 | crashlytics.properties
117 | crashlytics-build.properties
118 | fabric.properties
119 |
120 | # Editor-based Rest Client
121 | .idea/httpRequests
122 | ### Example user template template
123 | ### Example user template
124 |
125 | # IntelliJ project files
126 | .idea
127 | *.iml
128 | out
129 | gen### Python template
130 | # Byte-compiled / optimized / DLL files
131 | __pycache__/
132 | *.py[cod]
133 | *$py.class
134 |
135 | # C extensions
136 | *.so
137 |
138 | # Distribution / packaging
139 | .Python
140 | build/
141 | develop-eggs/
142 | dist/
143 | downloads/
144 | eggs/
145 | .eggs/
146 | lib/
147 | lib64/
148 | parts/
149 | sdist/
150 | var/
151 | wheels/
152 | *.egg-info/
153 | .installed.cfg
154 | *.egg
155 | MANIFEST
156 |
157 | # PyInstaller
158 | # Usually these files are written by a python script from a template
159 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
160 | *.manifest
161 | *.spec
162 |
163 | # Installer logs
164 | pip-log.txt
165 | pip-delete-this-directory.txt
166 |
167 | # Unit test / coverage reports
168 | htmlcov/
169 | .tox/
170 | .coverage
171 | .coverage.*
172 | .cache
173 | nosetests.xml
174 | coverage.xml
175 | *.cover
176 | .hypothesis/
177 | .pytest_cache/
178 |
179 | # Translations
180 | *.mo
181 | *.pot
182 |
183 | # Django stuff:
184 | *.log
185 | local_settings.py
186 | db.sqlite3
187 |
188 | # Flask stuff:
189 | instance/
190 | .webassets-cache
191 |
192 | # Scrapy stuff:
193 | .scrapy
194 |
195 | # Sphinx documentation
196 | docs/_build/
197 |
198 | # PyBuilder
199 | target/
200 |
201 | # Jupyter Notebook
202 | .ipynb_checkpoints
203 |
204 | # pyenv
205 | .python-version
206 |
207 | # celery beat schedule file
208 | celerybeat-schedule
209 |
210 | # SageMath parsed files
211 | *.sage.py
212 |
213 | # Environments
214 | .env
215 | .venv
216 | env/
217 | venv/
218 | ENV/
219 | env.bak/
220 | venv.bak/
221 |
222 | # Spyder project settings
223 | .spyderproject
224 | .spyproject
225 |
226 | # Rope project settings
227 | .ropeproject
228 |
229 | # mkdocs documentation
230 | /site
231 |
232 |
233 | # mypy
234 | .mypy_cache/
235 | localCompose
236 | pip-wheel-metadata
237 | scripts
238 | #tests
239 | poetry.lock
240 | pyproject.toml
241 | pytest.ini
242 | README.md
243 | mypy.ini
244 | .flake8
245 | frontend
246 | webarchive
247 |
--------------------------------------------------------------------------------
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | ignore = E203, E266, E501, W503
3 | max-line-length = 88
4 | max-complexity = 18
5 | select = B,C,E,F,W,T4,B950
6 | exclude = .git,
7 | __pycache__,
8 | .mypy_cache,
9 | venv,
10 | .venv
11 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by .ignore support plugin (hsz.mobi)
2 | ### Python template
3 | # Byte-compiled / optimized / DLL files
4 | __pycache__/
5 | *.py[cod]
6 | *$py.class
7 |
8 | # C extensions
9 | *.so
10 |
11 | # Distribution / packaging
12 | .Python
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | .hypothesis/
50 | .pytest_cache/
51 |
52 | # Translations
53 | *.mo
54 | *.pot
55 |
56 | # Django stuff:
57 | *.log
58 | local_settings.py
59 | db.sqlite3
60 |
61 | # Flask stuff:
62 | instance/
63 | .webassets-cache
64 |
65 | # Scrapy stuff:
66 | .scrapy
67 |
68 | # Sphinx documentation
69 | docs/_build/
70 |
71 | # PyBuilder
72 | target/
73 |
74 | # Jupyter Notebook
75 | .ipynb_checkpoints
76 |
77 | # pyenv
78 | .python-version
79 |
80 | # celery beat schedule file
81 | celerybeat-schedule
82 |
83 | # SageMath parsed files
84 | *.sage.py
85 |
86 | # Environments
87 | .env
88 | .venv
89 | env/
90 | venv/
91 | ENV/
92 | env.bak/
93 | venv.bak/
94 |
95 | # Spyder project settings
96 | .spyderproject
97 | .spyproject
98 |
99 | # Rope project settings
100 | .ropeproject
101 |
102 | # mkdocs documentation
103 | /site
104 |
105 | # mypy
106 | .mypy_cache/
107 | ### Example user template template
108 | ### Example user template
109 |
110 | # IntelliJ project files
111 | .idea
112 | *.iml
113 | out
114 | gen### JetBrains template
115 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
116 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
117 |
118 | # User-specific stuff
119 | .idea/**/workspace.xml
120 | .idea/**/tasks.xml
121 | .idea/**/dictionaries
122 | .idea/**/shelf
123 |
124 | # Sensitive or high-churn files
125 | .idea/**/dataSources/
126 | .idea/**/dataSources.ids
127 | .idea/**/dataSources.local.xml
128 | .idea/**/sqlDataSources.xml
129 | .idea/**/dynamic.xml
130 | .idea/**/uiDesigner.xml
131 | .idea/**/dbnavigator.xml
132 |
133 | # Gradle
134 | .idea/**/gradle.xml
135 | .idea/**/libraries
136 |
137 | # CMake
138 | cmake-build-debug/
139 | cmake-build-release/
140 |
141 | # Mongo Explorer plugin
142 | .idea/**/mongoSettings.xml
143 |
144 | # File-based project format
145 | *.iws
146 |
147 | # IntelliJ
148 | out/
149 |
150 | # mpeltonen/sbt-idea plugin
151 | .idea_modules/
152 |
153 | # JIRA plugin
154 | atlassian-ide-plugin.xml
155 |
156 | # Cursive Clojure plugin
157 | .idea/replstate.xml
158 |
159 | # Crashlytics plugin (for Android Studio and IntelliJ)
160 | com_crashlytics_export_strings.xml
161 | crashlytics.properties
162 | crashlytics-build.properties
163 | fabric.properties
164 |
165 | # Editor-based Rest Client
166 | .idea/httpRequests
167 |
168 | ### Node template
169 | # Logs
170 | logs
171 | *.log
172 | npm-debug.log*
173 | yarn-debug.log*
174 | yarn-error.log*
175 |
176 | # Runtime data
177 | pids
178 | *.pid
179 | *.seed
180 | *.pid.lock
181 |
182 | # Directory for instrumented libs generated by jscoverage/JSCover
183 | lib-cov
184 |
185 | # Coverage directory used by tools like istanbul
186 | coverage
187 |
188 | # nyc test coverage
189 | .nyc_output
190 |
191 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
192 | .grunt
193 |
194 | # Bower dependency directory (https://bower.io/)
195 | bower_components
196 |
197 | # node-waf configuration
198 | .lock-wscript
199 |
200 | # Compiled binary addons (https://nodejs.org/api/addons.html)
201 | build/Release
202 |
203 | # Dependency directories
204 | node_modules/
205 | jspm_packages/
206 |
207 | # TypeScript v1 declaration files
208 | typings/
209 |
210 | # Optional npm cache directory
211 | .npm
212 |
213 | # Optional eslint cache
214 | .eslintcache
215 |
216 | # Optional REPL history
217 | .node_repl_history
218 |
219 | # Output of 'npm pack'
220 | *.tgz
221 |
222 | # Yarn Integrity file
223 | .yarn-integrity
224 |
225 | *.tar.gz
226 | **/test-webarchive/
227 | **/webarchive/
228 |
229 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 |
3 | python:
4 | - "3.7"
5 |
6 | os:
7 | - linux
8 |
9 | dist: xenial
10 |
11 | sudo: required
12 |
13 | jobs:
14 | include:
15 | - stage: local tests
16 | script:
17 | - python setup.py install
18 | - pip install -U -r test-local-requirements.txt
19 | - py.test ./tests/test_api.py
20 |
21 | - stage: docker integration tests
22 |
23 | services:
24 | - docker
25 |
26 | env:
27 | - DOCKER_COMPOSE_VERSION=1.23.2
28 |
29 | before_install:
30 | - ./install-browsers.sh --headless
31 | - sudo rm /usr/local/bin/docker-compose
32 | - curl -L https://github.com/docker/compose/releases/download/${DOCKER_COMPOSE_VERSION}/docker-compose-`uname -s`-`uname -m` > docker-compose
33 | - chmod +x docker-compose
34 | - sudo mv docker-compose /usr/local/bin
35 |
36 | script:
37 | - bash ./tests/start-test-compose.sh
38 | - pip install -U -r test-docker-requirements.txt
39 | - py.test --headless ./tests/test_live_crawl.py
40 | - docker-compose logs
41 | - bash ./tests/stop-test-compose.sh
42 |
43 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.7.3
2 |
3 | WORKDIR /app
4 |
5 | COPY requirements.txt ./
6 |
7 | RUN pip install --no-cache-dir -r requirements.txt
8 |
9 | COPY browsertrix ./browsertrix
10 | COPY static ./static
11 |
12 | CMD uvicorn --reload --host 0.0.0.0 --port 8000 browsertrix.api:app
13 |
14 |
15 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document.
10 |
11 | "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License.
12 |
13 | "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.
14 |
15 | "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License.
16 |
17 | "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files.
18 |
19 | "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.
20 |
21 | "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below).
22 |
23 | "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof.
24 |
25 | "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."
26 |
27 | "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work.
28 |
29 | 2. Grant of Copyright License.
30 |
31 | Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form.
32 |
33 | 3. Grant of Patent License.
34 |
35 | Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed.
36 |
37 | 4. Redistribution.
38 |
39 | You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:
40 |
41 | You must give any other recipients of the Work or Derivative Works a copy of this License; and
42 | You must cause any modified files to carry prominent notices stating that You changed the files; and
43 | You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and
44 | If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License.
45 |
46 | You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License.
47 |
48 | 5. Submission of Contributions.
49 |
50 | Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions.
51 |
52 | 6. Trademarks.
53 |
54 | This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file.
55 |
56 | 7. Disclaimer of Warranty.
57 |
58 | Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License.
59 |
60 | 8. Limitation of Liability.
61 |
62 | In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.
63 |
64 | 9. Accepting Warranty or Additional Liability.
65 |
66 | While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.
67 |
68 | END OF TERMS AND CONDITIONS
69 |
--------------------------------------------------------------------------------
/NOTICE:
--------------------------------------------------------------------------------
1 | Browsertrix
2 | Copyright 2018-2020 Webrecorder Software, Rhizome, and Contributors.
3 |
4 | Distributed under the Apache License 2.0.
5 | See LICENSE for details.
6 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | **Deprecated**: The Browsertrix system is being refactored into more modular individual components. The main component, [Browsertrix Crawler](https://github.com/webrecorder/browsertrix-crawler), will soon support most of the same crawling features via an integrated Docker image that can be deployed via the command line. The UI and scheduling components will soon be reimplemented as additional components.
2 |
3 | Please see Browsertrix Crawler for latest development.
4 |
5 |
6 |
7 |
8 |
9 | [Code style: black](https://github.com/ambv/black) [Build status](https://travis-ci.org/webrecorder/browsertrix)
10 |
11 | ## High Fidelity Browser-Based Crawling Automation
12 |
13 | Browsertrix is a brand new toolset from the Webrecorder project for automating browsers to perform complex scripted behaviors
14 | as well as crawl multiple pages. (The name was originally used for an older project with similar goals).
15 |
16 | Browsertrix is a system for orchestrating Docker-based Chrome browsers, crawling processes, behavior systems, web archiving capture and replay, and full-text search.
17 |
18 | It includes the following features:
19 | * Crawling via customizable YAML-based crawl spec
20 | * High-fidelity browser-based crawlers (controlled via [webrecorder/autobrowser](https://github.com/webrecorder/autobrowser))
21 | * Execution of complex, domain-specific in-page behaviors (provided by [webrecorder/behaviors](https://github.com/webrecorder/behaviors))
22 | * Capture or replay into designated [pywb](https://github.com/webrecorder/pywb) collections
23 | * Screenshot creation of each page (optional).
24 | * Text extraction for each page and full text search via Solr (optional).
25 | * Support for customized browser profiles to minimize capture of private information.
26 |
27 | ## Getting Started
28 |
29 | ### Installing Browsertrix
30 |
31 | Browsertrix is currently designed to run with Docker and Docker Compose.
32 | The Browsertrix CLI requires local Python 3.6+.
33 |
34 | To install, run:
35 |
36 | ```bash
37 | git clone https://github.com/webrecorder/browsertrix
38 | cd browsertrix
39 | python setup.py install
40 | ./install-browsers.sh
41 | docker-compose build
42 | docker-compose up -d
43 | ```
44 |
45 | The `install-browsers.sh` script installs additional Docker images necessary for dynamic browser creation.
46 | The script can be used to update the images as well.
47 |
48 | ### Installing Browsertrix CLI
49 |
50 | The Browsertrix CLI is installed by running `python setup.py install` and includes full functionality for running crawls and creating browser profiles.
51 |
52 | Once installed, browsertrix commands are available via the `browsertrix` command.
53 |
54 | ## Creating a Crawl
55 |
56 | To create a crawl, a crawl spec should first be defined in a YAML file.
57 | An example spec, [sample-crawls/example.yaml](sample-crawls/example.yaml) might look as follows:
58 |
59 | ```yaml
60 | crawls:
61 | - name: example
62 | crawl_type: all-links
63 | num_browsers: 1
64 |
65 | coll: example
66 | mode: record
67 |
68 | seed_urls:
69 | - https://www.iana.org/
70 | ```
71 |
72 | Then, simply run `browsertrix crawl create sample-crawls/example.yaml --watch`
73 |
74 | The `--watch` param will also result in the crawling browser opening in a new browser window via a VNC connection.
75 |
76 | If started successfully, the output will be similar to:
77 | ```
78 | Crawl Created and Started: cf30281efc7a
79 | Status: running
80 | Opening Browser 1 of 1 (CKVEMACNI6YBUKLQI6UKKBLB) for crawl cf30281efc7a
81 | ```
82 |
83 | To view all running crawls, simply run `browsertrix crawl list` which should result in output similar to:
84 |
85 | ```
86 | CRAWL ID NAME STARTED DURATION STATUS CRAWL TYPE COLL MODE TO CRAWL PENDING SEEN BROWSERS TABS
87 | cf30281efc7a example 0:00:35 ago 0:00:10 running all-links example record 15 1 25 1 1
88 | ```
89 |
90 | To get more detailed info on the crawl, run `browsertrix crawl info --urls <crawl_id>` (where `<crawl_id> = cf30281efc7a` in this example)
91 |
92 | To follow the crawl log in the console window, add the `--log` option (the log followed will be from the first browser).
93 |
94 | ### Crawling Options
95 |
96 | Browsertrix supports a number of options, with a key option being the `crawl_type`, which can be:
97 |
98 | - `single-page` -- crawl only the specified seed urls
99 | - `all-links` -- crawl the seed url(s) and all links discovered until max depth is exceeded
100 | - `same-domain` -- crawl the seed url(s) and all links discovered that are on the same domain or sub-domain (up to a depth of 100)
101 | - `custom` -- Supports custom depth and scope rules!
102 |
103 | The first 3 options are designed to be a simple way to specify common options, and more may be added later.
104 |
105 | When using `custom`, the `crawl_depth` param can specify the crawl depth (hops) from each seed url.
106 |
107 | The `scopes` list can contain one or more [urlcanon MatchRules](https://github.com/iipc/urlcanon/blob/master/python/urlcanon/rules.py#L70) specifying urls that are in scope for the crawl.
108 |
109 | See [custom-scopes.yaml](sample-crawls/custom-scopes.yaml) for an example on how to use the custom option.
110 |
111 |
112 | The `coll` option specifies the pywb collection to use for crawling, and `mode` specifies `record` (default), `replay`, or
113 | `live` (direct live web connection).
114 |
115 | The `num_browsers` and `num_tabs` options allow for selecting the total number of browsers and the number of tabs per browser to use for this crawl.
116 |
117 | The seed urls for the crawl should be provided in the `seed_urls` list.
118 |
119 | The `cache` option specifies caching options for a crawl, with available options:
120 | - `always` -- Strict caching via `Cache-Control` on almost every resource to limit duplicate URLs in a single browser session (default option when omitted)
121 | - `default` -- Keep default caching for a page
122 | - `never` -- Disables all caching for all URLs.
123 |
124 | All example crawl configs demonstrating these options are available in: [sample-crawls](sample-crawls/)
125 |
126 | ### In-Page Behaviors
127 |
128 | For every page, Browsertrix runs a designated behavior before collecting outlinks, (optionally) taking screenshots,
129 | and moving on to the next page.
130 |
131 | The behaviors are served via a separate behavior API server. The current list of available behaviors is available at:
132 | https://github.com/webrecorder/behaviors/tree/master/behaviors
133 |
134 | The behaviors are built using a special library of behavior functions (preliminary docs available here:
135 | https://webrecorder.github.io/behaviors/)
136 |
137 | If no site-specific behavior is found, the default `autoscroll.js` behavior is run.
138 |
139 | The `behavior_max_time` crawl option specifies the maximum time a behavior can run (current default is 60 seconds).
140 | When crawling sites with infinite scroll, it is recommended to set the `behavior_max_time` to be much higher.
141 |
142 |
143 | ### pywb Collections and Access
144 |
145 | All data crawled is placed in the `./webarchive/collections/` directory, which corresponds to the [standard pywb directory structure conventions](https://pywb.readthedocs.io/en/latest/manual/configuring.html#directory-structure), e.g. a collection `test` would be found under `./webarchive/collections/test`.
146 |
147 | Collections are created automatically on first use and can also be managed via `wb-manager` with `webarchive` as the working directory.
148 |
149 | The running pywb instance can also be accessed via `http://localhost:8180/`
150 |
151 | ### Replay Crawling and Screenshots
152 |
153 | Currently, screenshot creation is automatically enabled when crawling in record mode and screenshots are added automatically
154 | to the same collection.
155 |
156 | Browsertrix supports crawling in replay mode, over an existing collection, which may be useful for QA processes,
157 | especially when combined with screenshot creation.
158 |
159 | By setting the `mode` and `screenshot_coll` properties for each crawl, it is possible to run Browsertrix over replay and generate screenshots into a different collection, which may be used for QA comparison.
160 |
161 | Additional screenshot options are to be added soon. (Currently, the screenshot is taken after the behavior is run but this will likely change).
162 |
163 | Crawl options can also be overridden via the command line.
164 |
165 | For example, given a crawl spec `./my_crawl.yaml`, one could first capture with:
166 | ```
167 | browsertrix crawl create ./my_crawl.yaml --screenshot_coll screenshots-capture
168 | ```
169 |
170 | and then run:
171 | ```
172 | browsertrix crawl create ./my_crawl.yaml --mode replay --screenshot_coll screenshots-qa
173 | ```
174 |
175 | By default, screenshots are saved with `urn:screenshot:` prefix.
176 | Based on the above crawls, one could then query all capture and qa screenshots in pywb via:
177 | ```
178 | http://localhost:8180/screenshots-capture/*/urn:screenshot:*
179 | http://localhost:8180/screenshots-qa/*/urn:screenshot:*
180 | ```
181 |
182 | Sample record and replay configs, [social-media.yaml](sample-crawls/social-media.yaml) and [social-media-replay.yaml](sample-crawls/social-media-replay.yaml), are also available.
183 |
184 | (Note: The screenshot functionality will likely change and additional options will be added)
185 |
186 | ### Other Crawl operations
187 |
188 | Other crawl operations include:
189 | * `browsertrix crawl stop` for stopping a crawl
190 | * `browsertrix crawl logs` for printing and following logs for one or all crawlers
191 | * `browsertrix crawl watch <crawl_id>` for attaching and watching all the browsers in a given crawl.
192 | * `browsertrix crawl remove` for removing a crawl
193 | * `browsertrix crawl remove-all` for stopping and removing all crawls.
194 |
195 | See `browsertrix crawl -h` for a complete reference of available commands.
196 |
197 | ## Full Text Search
198 |
199 | Browsertrix now includes a prototype integration with Apache Solr. Text is extracted for each page, after taking a screenshot, and ingested into Solr. The extracted text (as provided via raw DOM text nodes) from all frames,
200 | as well as the page title and URL, are indexed in Solr using the default schema. (This is likely to evolve as well.)
201 |
202 | The search is available for each collection via the pywb replay interface at: `http://localhost:8180/`
203 |
204 | The replay interface currently includes a list of pages, screenshots (if enabled), and the ability to search the collection.
205 |
206 | (Note: Solr data is stored in the `./solr` volume, and may require a permission adjustment on certain systems via `chmod a+w ./solr`)
207 |
208 |
209 | ## Browser Profiles
210 |
211 | It is often useful to prepare a browser, for example by logging into social media or other password-protected sites,
212 | in order to capture content that is not generally accessible. However, doing so during a crawl is tedious and, worse,
213 | may result in passwords being recorded to a WARC.
214 |
215 | Browsertrix addresses this problem with the support of browser profiles. A profile can be created by running a base
216 | Chrome browser, performing custom actions, and then 'saving' the running browser into a new 'profile' image.
217 |
218 | To create a profile:
219 |
220 | 1. Run:
221 | ```browsertrix profile create```
222 |
223 | 2. This should start a new remote browser (Chrome 73 by default) and open it in a new window. You can now interact with the browser and log in to any sites as needed.
224 |
225 | 3. The command line should show the following message and a prompt to enter the profile name, e.g. `logged-in`
226 |
227 | ```
228 | A new browser window should have been opened
229 | You can use the browser to log-in to accounts or otherwise prepare the browser profile
230 | (The content will not be recorded to WARC)
231 | When done, please enter a new name to save the browser profile:
232 | ```
233 |
234 | 4. Once the name is entered, the profile is saved. You can continue browsing to make another profile, or select 'no' and close the browser.
235 |
236 | If everything worked, running ```browsertrix profile list``` should show:
237 |
238 | ```
239 | PROFILE BASE BROWSER
240 | logged-in chrome:73
241 | ```
242 |
243 | 5. To use the profile, set the `profile` property in the crawl spec YAML, or simply include `--profile` in the command line:
244 |
245 | ```
246 | browsertrix crawl create ./my_crawl.yaml --profile logged-in
247 | ```
248 |
249 | The browsers used for the crawl will be a copy of the browser saved during profile creation.
250 |
251 | `browsertrix profile remove` can be used to remove an unneeded profile.
252 |
253 | Note: The profile functionality is brand new and subject to change. At present, it is tied to the particular browser Docker image used and extends that image. The system may switch to Docker volumes in the future.
254 |
255 | ## Testing
256 |
257 | Browsertrix includes several test suites, which are also run automatically via Travis CI.
258 |
259 | ### Docker Integration Tests
260 |
261 | Browsertrix includes a Docker-based test suite that runs crawls over content replayed from a WARC
262 | (no live web content is accessed). This test suite requires Python 3.6+.
263 |
264 | To run this test suite, run:
265 |
266 | ```bash
267 | bash ./tests/start-test-compose.sh
268 | pip install -U -r test-docker-requirements.txt
269 | py.test --headless ./tests/test_live_crawl.py
270 | bash ./tests/stop-test-compose.sh
271 | ```
272 |
273 | The test suite does not perform any live crawling, but runs all the crawls defined in [tests/crawl_tests.yaml](tests/crawl_tests.yaml) in replay mode using an existing test WARC downloaded from S3.
274 |
275 | ### Local API Tests
276 |
277 | To install and run local tests of the API (without Docker), run the following:
278 | (Python 3.7+ is required)
279 |
280 | ```bash
281 | pip install -U -r requirements.txt -r test-local-requirements.txt
282 | py.test ./tests/test_api.py
283 | ```
284 |
285 | ## UI
286 |
287 | Browsertrix also includes a UI (still under development) which will
288 | have the same features as the CLI.
289 |
290 | To access the browsertrix UI, load `http://localhost:8000/`
291 |
292 | The frontend React app is found in `./frontend` and can be started via:
293 |
294 | ```
295 | yarn run develop
296 | ```
297 |
298 | (The develop server is started at `http://localhost:8001` to avoid conflict with production)
299 |
300 | To build the production bundle, run:
301 | ```
302 | yarn run build-prod
303 | ```
304 |
305 | This should update the production server running at `http://localhost:8000`
306 |
--------------------------------------------------------------------------------
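The README drives crawl creation through the `browsertrix` CLI; the CLI itself is a thin client for the FastAPI service defined in `browsertrix/api.py` below. As a rough, illustrative sketch only (not part of the repository), the example crawl spec from the README could also be submitted straight to that REST API. Field names follow `CreateCrawlRequest` in `browsertrix/schema.py`, and the server URL assumes the default `docker-compose` setup on `localhost:8000`.

```python
# Illustrative only: create and poll a crawl via the HTTP API that the CLI wraps.
# Assumes the docker-compose stack is running on localhost:8000.
import requests

crawl_spec = {
    'name': 'example',
    'crawl_type': 'all-links',
    'num_browsers': 1,
    'coll': 'example',
    'mode': 'record',
    'seed_urls': ['https://www.iana.org/'],
}

# POST /crawls -> CreateStartResponse: {success, id, status, browsers}
res = requests.post('http://localhost:8000/crawls', json=crawl_spec)
res.raise_for_status()
info = res.json()
print('Crawl Created and Started:', info['id'])

# GET /crawl/{crawl_id}/done -> CrawlDoneResponse: {done}
done = requests.get('http://localhost:8000/crawl/{0}/done'.format(info['id'])).json()
print('Done yet?', done['done'])
```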
/browsertrix/__init__.py:
--------------------------------------------------------------------------------
1 | from better_exceptions import hook
2 |
3 | __version__ = '0.1.0'
4 |
5 | hook()
6 |
--------------------------------------------------------------------------------
/browsertrix/api.py:
--------------------------------------------------------------------------------
1 | from fastapi import APIRouter, FastAPI
2 | from starlette.middleware.cors import ALL_METHODS, CORSMiddleware
3 | from starlette.responses import FileResponse, UJSONResponse
4 | from starlette.staticfiles import StaticFiles
5 |
6 | from .crawl import CrawlManager
7 | from .schema import *
8 |
9 | app = FastAPI(debug=True)
10 | app.add_middleware(
11 | CORSMiddleware, allow_origins=["*"], allow_methods=ALL_METHODS, allow_headers=["*"]
12 | )
13 | crawl_man = CrawlManager()
14 | crawl_router = APIRouter()
15 |
16 |
17 | # ============================================================================
18 | @app.post('/crawls', response_model=CreateStartResponse, response_class=UJSONResponse)
19 | async def create_crawl(new_crawl: CreateCrawlRequest):
20 | return await crawl_man.create_new(new_crawl)
21 |
22 |
23 | @app.get('/crawls', response_model=CrawlInfosResponse, response_class=UJSONResponse)
24 | async def get_all_crawls():
25 | return await crawl_man.get_all_crawls()
26 |
27 |
28 | @crawl_router.put(
29 | '/{crawl_id}/urls',
30 | response_model=OperationSuccessResponse,
31 | response_class=UJSONResponse,
32 | )
33 | async def queue_urls(crawl_id: str, url_list: QueueUrlsRequest):
34 | return await crawl_man.queue_crawl_urls(crawl_id, url_list.urls)
35 |
36 |
37 | @crawl_router.get(
38 | '/{crawl_id}', response_model=CrawlInfoResponse, response_class=UJSONResponse
39 | )
40 | async def get_crawl(crawl_id: str):
41 | return await crawl_man.get_crawl_info(crawl_id)
42 |
43 |
44 | @crawl_router.get(
45 | '/{crawl_id}/urls',
46 | response_model=CrawlInfoUrlsResponse,
47 | response_class=UJSONResponse,
48 | )
49 | async def get_crawl_urls(crawl_id: str):
50 | return await crawl_man.get_crawl_urls(crawl_id)
51 |
52 |
53 | @crawl_router.get(
54 | '/{crawl_id}/info',
55 | response_model=FullCrawlInfoResponse,
56 | response_class=UJSONResponse,
57 | )
58 | async def get_full_crawl_info(crawl_id: str):
59 | return await crawl_man.get_full_crawl_info(crawl_id)
60 |
61 |
62 | @crawl_router.post(
63 | '/{crawl_id}/start',
64 | response_model=CreateStartResponse,
65 | response_class=UJSONResponse,
66 | )
67 | async def start_crawl(crawl_id: str):
68 | return await crawl_man.start_crawl(crawl_id)
69 |
70 |
71 | @crawl_router.post(
72 | '/{crawl_id}/stop',
73 | response_model=OperationSuccessResponse,
74 | response_class=UJSONResponse,
75 | )
76 | async def stop_crawl(crawl_id: str):
77 | return await crawl_man.stop_crawl(crawl_id)
78 |
79 |
80 | @crawl_router.get(
81 | '/{crawl_id}/done', response_model=CrawlDoneResponse, response_class=UJSONResponse
82 | )
83 | async def is_done_crawl(crawl_id: str):
84 | return await crawl_man.is_crawl_done(crawl_id)
85 |
86 |
87 | @crawl_router.delete(
88 | '/{crawl_id}', response_model=OperationSuccessResponse, response_class=UJSONResponse
89 | )
90 | async def delete_crawl(crawl_id: str):
91 | return await crawl_man.delete_crawl(crawl_id)
92 |
93 |
94 | @app.route('/')
95 | def ui(*args, **kwargs):
96 | return FileResponse('static/index.html')
97 |
98 |
99 | app.include_router(crawl_router, prefix='/crawl', tags=['crawl'])
100 | app.mount('/static', StaticFiles(directory='static', check_dir=True), 'static')
101 | app.add_event_handler('startup', crawl_man.startup)
102 | app.add_event_handler('shutdown', crawl_man.shutdown)
103 |
--------------------------------------------------------------------------------
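A hedged sketch of exercising this app in-process with Starlette's `TestClient`, roughly the approach a local API test such as `tests/test_api.py` might take (that test module is not shown here, so treat the details as assumptions). The startup handler connects `CrawlManager` to Redis, so a reachable Redis instance is assumed.

```python
# Sketch only: drive the FastAPI app without a running uvicorn/Docker stack.
from starlette.testclient import TestClient

from browsertrix.api import app

# Using the client as a context manager runs the startup/shutdown handlers
# (crawl_man.startup / crawl_man.shutdown), so Redis must be reachable.
with TestClient(app) as client:
    res = client.get('/crawls')
    assert res.status_code == 200
    print(res.json())  # CrawlInfosResponse: {"crawls": [...]}
```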
/browsertrix/schema.py:
--------------------------------------------------------------------------------
1 | import math
2 | from enum import Enum
3 | from typing import Any, Dict, List, Optional, Set, Union
4 |
5 | from pydantic import BaseModel, Schema, UrlStr
6 |
7 | __all__ = [
8 | 'BrowserCookie',
9 | 'BrowserOverrides',
10 | 'CacheMode',
11 | 'CaptureMode',
12 | 'CookieSameSite',
13 | 'CrawlDoneResponse',
14 | 'CrawlInfo',
15 | 'CrawlInfoResponse',
16 | 'CrawlInfoUrlsResponse',
17 | 'CrawlInfosResponse',
18 | 'CrawlType',
19 | 'CreateCrawlRequest',
20 | 'CreateStartResponse',
21 | 'EmulatedDevice',
22 | 'EmulatedGeoLocation',
23 | 'FullCrawlInfoResponse',
24 | 'OperationSuccessResponse',
25 | 'QueueUrlsRequest',
26 | ]
27 |
28 | # ============================================================================
29 | OptionalList = Optional[List[str]]
30 | OptionalSet = Optional[Set[str]]
31 | Number = Union[int, float]
32 |
33 | UrlStr.max_length = math.inf
34 | UrlStr.relative = True
35 |
36 |
37 | class CrawlType(str, Enum):
38 | SINGLE_PAGE = 'single-page'
39 | ALL_LINKS = 'all-links'
40 | SAME_DOMAIN = 'same-domain'
41 | CUSTOM = 'custom'
42 |
43 |
44 | class CaptureMode(str, Enum):
45 | RECORD = 'record'
46 | REPLAY = 'replay'
47 | LIVE = 'live'
48 |
49 |
50 | class CacheMode(str, Enum):
51 | ALWAYS = 'always'
52 | NEVER = 'never'
53 | DEFAULT = 'default'
54 |
55 |
56 | class CookieSameSite(str, Enum):
57 | STRICT = 'Strict'
58 | LAX = 'LAX'
59 | EXTENDED = 'Extended'
60 | NONE = 'None'
61 |
62 |
63 | class EmulatedDevice(BaseModel):
64 | width: Number
65 | height: Number
66 | deviceScaleFactor: Optional[Number] = None
67 | maxTouchPoints: Optional[Number] = None
68 | isMobile: Optional[bool] = None
69 | hasTouch: Optional[bool] = None
70 | isLandscape: Optional[bool] = None
71 |
72 |
73 | class EmulatedGeoLocation(BaseModel):
74 | latitude: Number
75 | longitude: Number
76 |
77 |
78 | class BrowserCookie(BaseModel):
79 | name: str
80 | value: str
81 | url: Optional[UrlStr] = None
82 | domain: Optional[str] = None
83 | path: Optional[str] = None
84 | secure: Optional[bool] = None
85 | httpOnly: Optional[bool] = None
86 | expires: Optional[Number] = None
87 | sameSite: Optional[CookieSameSite] = None
88 |
89 |
90 | class BrowserOverrides(BaseModel):
91 | user_agent: Optional[str] = None
92 | accept_language: Optional[str] = None
93 | navigator_platform: Optional[str] = None
94 | extra_headers: Optional[Dict[str, str]] = None
95 | cookies: Optional[List[BrowserCookie]] = None
96 | geo_location: Optional[EmulatedGeoLocation] = None
97 | device: Optional[EmulatedDevice] = None
98 |
99 |
100 | class BaseCreateCrawl(BaseModel):
101 | crawl_type: CrawlType = Schema(
102 | CrawlType.SINGLE_PAGE, description='What type of crawl should be launched'
103 | )
104 | crawl_depth: Optional[int] = None
105 | num_browsers: int = Schema(
106 | 2, description='How many browsers should be used for the crawl'
107 | )
108 | num_tabs: int = Schema(1, description='How many tabs should be used for the crawl')
109 | name: Optional[str] = Schema('', description='User friendly name for the crawl')
110 | coll: Optional[str] = Schema('live', description='Default Collection')
111 |
112 | mode: CaptureMode = Schema(CaptureMode.RECORD, description='Default Mode')
113 |
114 | screenshot_coll: Optional[str] = Schema(
115 | '', description='Collection to store screenshots, if any'
116 | )
117 |
118 | text_coll: Optional[str] = Schema(
119 | '', description='Collection to store full-text indexes, if any'
120 | )
121 |
122 |
123 | class CreateCrawlRequest(BaseCreateCrawl):
124 | class Config:
125 | extra = 'forbid'
126 |
127 | seed_urls: List[UrlStr] = []
128 | scopes: List[Dict[Any, Any]] = []
129 |
130 | cache: CacheMode = CacheMode.ALWAYS
131 |
132 | browser: Optional[str] = 'chrome:73'
133 | user_params: Dict[Any, Any] = dict()
134 |
135 | profile: Optional[str] = None
136 |
137 | ignore_extra: Optional[Dict[Any, Any]] = None
138 |
139 | behavior_max_time: int = 0
140 | headless: bool = False
141 | screenshot_target_uri: Optional[str] = None
142 |
143 | start: bool = True
144 | browser_overrides: Optional[BrowserOverrides] = None
145 |
146 |
147 | class OperationSuccessResponse(BaseModel):
148 | success: bool
149 |
150 |
151 | class CreateStartResponse(OperationSuccessResponse):
152 | id: str
153 | status: str = 'new'
154 | browsers: Optional[List[str]]
155 |
156 |
157 | class CrawlInfoResponse(BaseCreateCrawl):
158 | id: str
159 | status: str = 'new'
160 | start_time: int = 0
161 | finish_time: int = 0
162 | browsers: OptionalList
163 | tabs_done: List[Dict[Any, Any]]
164 | headless: bool = False
165 | num_queue: int = 0
166 | num_seen: int = 0
167 | num_pending: int = 0
168 |
169 |
170 | class CrawlInfosResponse(BaseModel):
171 | crawls: List[CrawlInfoResponse]
172 |
173 |
174 | class CrawlInfo(BaseModel):
175 | """ Model for validate a:{crawl_id}:info key
176 | All fields should be set in the model
177 | """
178 |
179 | id: str
180 | name: str
181 | coll: str
182 | screenshot_coll: str
183 | text_coll: str
184 | mode: str
185 | status: str
186 | crawl_type: str
187 | crawl_depth: int
188 | num_browsers: int
189 | num_tabs: int
190 | start_time: int = 0
191 | finish_time: int = 0
192 | headless: bool = False
193 | browser_overrides: Optional[BrowserOverrides] = None
194 |
195 |
196 | class CrawlInfoUrlsResponse(BaseModel):
197 | scopes: List[Dict[Any, Any]]
198 | queue: List[Dict[Any, Any]]
199 | pending: OptionalList
200 | seen: OptionalSet
201 |
202 |
203 | class FullCrawlInfoResponse(CrawlInfo, CrawlInfoUrlsResponse):
204 | success: bool
205 |
206 |
207 | class QueueUrlsRequest(BaseModel):
208 | urls: List[str]
209 |
210 |
211 | class CrawlDoneResponse(BaseModel):
212 | done: bool
213 |
--------------------------------------------------------------------------------
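As a quick illustration (not from the repository), the request model above can be exercised directly; this assumes the older pydantic API that matches the `Schema`/`UrlStr` usage in this module.

```python
# Illustrative usage of the crawl request model defined above.
from browsertrix.schema import CacheMode, CrawlType, CreateCrawlRequest

req = CreateCrawlRequest(
    name='example',
    crawl_type=CrawlType.ALL_LINKS,
    num_browsers=1,
    coll='example',
    mode='record',                 # coerced to CaptureMode.RECORD
    cache=CacheMode.ALWAYS,
    seed_urls=['https://www.iana.org/'],
)
print(req.dict())

# Config.extra = 'forbid' means unknown fields are rejected, e.g.
# CreateCrawlRequest(bogus_option=True) raises a ValidationError.
```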
/browsertrix/utils.py:
--------------------------------------------------------------------------------
1 | from asyncio import AbstractEventLoop
2 | from os import environ
3 | from typing import Any, Dict, Optional, Type, Union
4 | from urllib.parse import urlsplit
5 |
6 | from aioredis import Redis, create_redis
7 | from ujson import loads as ujson_loads
8 |
9 | __all__ = ['env', 'extract_domain', 'init_redis']
10 |
11 |
12 | async def init_redis(redis_url: str, loop: AbstractEventLoop) -> Redis:
13 | return await create_redis(redis_url, encoding='utf-8', loop=loop)
14 |
15 |
16 | def env(
17 | key: str,
18 | type_: Type[Union[str, bool, int, dict, float]] = str,
19 | default: Optional[Any] = None,
20 | ) -> Union[str, int, bool, float, Dict]:
21 | """Returns the value of the supplied env key name converting
22 | the env key's value to the specified type.
23 |
24 | If the env key does not exist the default value is returned.
25 |
26 | Boolean values for env keys are expected to be:
27 | - true: 1, true, yes, y, ok, on
28 | - false: 0, false, no, n, nok, off
29 |
30 | :param key: The name of the environment variable
31 |     :param type_: What type the env key's value should be converted to,
32 | defaults to str
33 | :param default: The default value of the env key, defaults to None
34 | :return: The value of the env key or the supplied default
35 | """
36 | if key not in environ:
37 | return default
38 |
39 | val = environ[key]
40 |
41 | if type_ == str:
42 | return val
43 | elif type_ == bool:
44 | if val.lower() in ['1', 'true', 'yes', 'y', 'ok', 'on']:
45 | return True
46 | if val.lower() in ['0', 'false', 'no', 'n', 'nok', 'off']:
47 | return False
48 | raise ValueError(
49 | f'Invalid environment variable "{key}" (expected a boolean): "{val}"'
50 | )
51 | elif type_ == int:
52 | try:
53 | return int(val)
54 | except ValueError:
55 | raise ValueError(
56 |                 f'Invalid environment variable "{key}" (expected an integer): "{val}"'
57 | )
58 | elif type_ == float:
59 | try:
60 | return float(val)
61 | except ValueError:
62 | raise ValueError(
63 | f'Invalid environment variable "{key}" (expected a float): "{val}"'
64 | )
65 | elif type_ == dict:
66 | return ujson_loads(val)
67 |
68 |
69 | def extract_domain(url: str) -> str:
70 | """Extracts and returns the domain, including the suffix,
71 | of the supplied URL
72 |
73 | :param url: The url to have its domain extracted from
74 | :return: The extracted domain
75 | """
76 | extracted = urlsplit(url).netloc
77 | return extracted.replace('www.', '')
78 |
--------------------------------------------------------------------------------
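A small usage sketch for these helpers (illustrative only; the environment variable names below are made up for the example):

```python
# Illustrative usage of env() and extract_domain(); the variable names here
# (CRAWL_RETRIES, HEADLESS, REDIS_URL) are hypothetical examples.
import os

from browsertrix.utils import env, extract_domain

os.environ['CRAWL_RETRIES'] = '3'
os.environ['HEADLESS'] = 'yes'

print(env('CRAWL_RETRIES', type_=int, default=1))       # -> 3
print(env('HEADLESS', type_=bool, default=False))       # -> True
print(env('REDIS_URL', default='redis://localhost/0'))  # unset -> default

print(extract_domain('https://www.iana.org/domains'))   # -> 'iana.org'
```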
/browsertrix_cli/basecli.py:
--------------------------------------------------------------------------------
1 | import click
2 | import sys
3 |
4 | import requests
5 |
6 |
7 | # ============================================================================
8 | class Settings:
9 | quiet_mode = False
10 | sesh = None
11 |
12 | server_prefix = None
13 | shepherd_prefix = None
14 | view_browsers_prefix = None
15 |
16 |
17 | settings = Settings()
18 |
19 |
20 | # ============================================================================
21 | @click.group()
22 | @click.option(
23 | '--server',
24 | metavar='',
25 | type=str,
26 | default='http://localhost:8000',
27 | help='The Browsertrix server url',
28 | )
29 | @click.option(
30 | '--shepherd',
31 | metavar='',
32 | type=str,
33 | default='http://localhost:9020',
34 | help='The Shepherd server url',
35 | )
36 | @click.option(
37 | '-q',
38 | '--quiet',
39 | is_flag=True,
40 | default=False,
41 | type=bool,
42 | help='quiet mode: print only crawl ids if success',
43 | )
44 | def cli(server, quiet, shepherd):
45 | settings.server_prefix = server
46 |
47 | settings.shepherd_prefix = shepherd
48 | settings.view_browsers_prefix = shepherd + '/attach/'
49 |
50 | settings.sesh = requests.session()
51 |
52 | settings.quiet_mode = quiet
53 |
54 |
55 | # ============================================================================
56 | def is_quiet():
57 | return settings.quiet_mode
58 |
59 |
60 | # ============================================================================
61 | def ensure_success(res, exit=True):
62 | """ Ensure API response is successful
63 | print error and exit if not
64 |
65 | :param res: Response from requests
66 | :param exit: Exit on any error
67 | :return: parsed JSON response as dict
68 | """
69 | if res.status_code == 200:
70 | json = res.json()
71 | return json
72 |
73 | if not is_quiet():
74 | print('Error response from API server')
75 | print('{0}: {1}'.format(res.status_code, res.text))
76 |
77 | if exit:
78 | sys.exit(1)
79 |
80 |
81 | # ============================================================================
82 | def conn_error_exit(url):
83 | if not is_quiet():
84 | print(
85 | 'Unable to connect to {0}. Is Browsertrix container running in Docker?'.format(
86 | url
87 | )
88 | )
89 | sys.exit(2)
90 |
91 |
92 | # ============================================================================
93 | def sesh_get(url, prefix=None):
94 | url = (prefix or settings.server_prefix) + url
95 | try:
96 | res = settings.sesh.get(url)
97 | return ensure_success(res)
98 | except requests.exceptions.ConnectionError:
99 | conn_error_exit(url)
100 |
101 |
102 | # ============================================================================
103 | def sesh_post(url, json=None, prefix=None):
104 | url = (prefix or settings.server_prefix) + url
105 | try:
106 | res = settings.sesh.post(url, json=json)
107 | return ensure_success(res)
108 | except requests.exceptions.ConnectionError:
109 | conn_error_exit(url)
110 |
111 |
112 | # ============================================================================
113 | def sesh_delete(url, prefix=None):
114 | url = (prefix or settings.server_prefix) + url
115 | try:
116 | res = settings.sesh.delete(url)
117 | return ensure_success(res, exit=False)
118 | except requests.exceptions.ConnectionError:
119 | conn_error_exit(url)
120 |
--------------------------------------------------------------------------------
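The helpers above are what the real subcommands in `browsertrix_cli/crawl.py` and `browsertrix_cli/profile.py` build on. As a hedged sketch, a hypothetical extra subcommand (not part of the CLI) could plug into the same `cli` group and session helpers like this:

```python
# Hypothetical subcommand, for illustration only.
import click

from browsertrix_cli.basecli import cli, is_quiet, sesh_get


@cli.command(name='ping', help='Check that the Browsertrix API is reachable')
def ping():
    # sesh_get() prefixes settings.server_prefix (default http://localhost:8000)
    # and exits with an error message if the API container is not running.
    res = sesh_get('/crawls')
    if not is_quiet():
        print('API reachable, {0} crawl(s) registered'.format(len(res['crawls'])))
```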
/browsertrix_cli/crawl.py:
--------------------------------------------------------------------------------
1 | import click
2 | import datetime
3 | import docker
4 | import sys
5 | import time
6 | import yaml
7 | import webbrowser
8 |
9 | from collections import defaultdict
10 |
11 |
12 | from browsertrix_cli.basecli import (
13 | cli,
14 | is_quiet,
15 | sesh_get,
16 | sesh_post,
17 | sesh_delete,
18 | settings,
19 | )
20 | from browsertrix_cli.profile import get_profile_image
21 |
22 |
23 | COLUMNS = [
24 | ('id', 'CRAWL ID', 12),
25 | ('name', 'NAME', 12),
26 | ('start_time', 'STARTED', 12),
27 | ('finish_time', 'DURATION', 12),
28 | ('status', 'STATUS', 7),
29 | ('crawl_type', 'CRAWL TYPE', 12),
30 | ('coll', 'COLL', 16),
31 | ('mode', 'MODE', 8),
32 | ('num_queue', 'TO CRAWL', 8),
33 | ('num_pending', 'PENDING', 8),
34 | ('num_seen', 'SEEN', 8),
35 | ('num_browsers', 'BROWSERS', 9),
36 | ('num_tabs', 'TABS', 3),
37 | ]
38 |
39 |
40 | # ============================================================================
41 | @cli.group(help='Commands for working with crawls')
42 | def crawl():
43 | pass
44 |
45 |
46 | # ============================================================================
47 | def format_duration(start_time, finish_time):
48 | """ Format duration of crawl
49 |
50 | :param start_time: start time of crawl
51 | :param finish_time: finish time of crawl
52 |     :return: formatted string of the elapsed time between start_time and finish_time
53 | """
54 | try:
55 | if start_time == 0:
56 | return '-'
57 |
58 | if not finish_time:
59 | finish = datetime.datetime.now()
60 | else:
61 | finish = datetime.datetime.fromtimestamp(int(finish_time))
62 |
63 | start = datetime.datetime.fromtimestamp(int(start_time))
64 | elapsed = finish - start
65 | return str(elapsed).split('.', 1)[0]
66 | except Exception:
67 | return start_time
68 |
69 |
70 | # ============================================================================
71 | def print_container_log(
72 | docker_api, reqid, name='autobrowser-', follow=False, wait=False
73 | ):
74 |
75 | full_name = name + reqid
76 | while True:
77 | try:
78 | container = docker_api.containers.get(full_name)
79 | break
80 | except docker.errors.NotFound:
81 | if not wait:
82 | return False
83 |
84 | print('Waiting for Logs...')
85 | time.sleep(0.25)
86 | continue
87 |
88 | print('---- Logs for Crawl {0}: {1} ----'.format(reqid, full_name))
89 | res = container.logs(follow=follow, stream=True)
90 | for line in res:
91 | sys.stdout.write(line.decode('utf-8'))
92 |
93 | print('-----------------------------------')
94 | print('')
95 | print('')
96 |
97 | return True
98 |
99 |
100 | # ============================================================================
101 | def print_logs(browsers, follow=False, wait=False, all_containers=False):
102 | docker_api = docker.from_env(version='auto')
103 |
104 | if follow is None:
105 | follow = False
106 |
107 | for reqid in browsers:
108 | if all_containers:
109 | print_container_log(
110 | docker_api, reqid, wait=False, follow=False, name='browser-'
111 | )
112 |
113 | print_container_log(
114 | docker_api, reqid, wait=False, follow=False, name='xserver-'
115 | )
116 |
117 | print_container_log(docker_api, reqid, wait=wait, follow=follow)
118 |
119 |
120 | # ============================================================================
121 | def open_browsers(browsers, crawl_id, tabs_done=None, num_tabs=-1):
122 | count = 1
123 | for reqid in browsers:
124 | skip = False
125 | if not tabs_done or tabs_done.get(reqid) != num_tabs:
126 | msg = 'Opening Browser {0} of {1} ({2}) for crawl {3}'
127 | else:
128 |             msg = 'Skipping Finished Browser {0} of {1} ({2}) for crawl {3}'
129 | skip = True
130 |
131 | if not is_quiet():
132 | print(msg.format(count, len(browsers), reqid, crawl_id))
133 |
134 | if not skip:
135 | webbrowser.open(settings.view_browsers_prefix + reqid)
136 | count += 1
137 |
138 |
139 | # ============================================================================
140 | @crawl.command(name='list', help='List all crawls')
141 | def list_crawls():
142 | """ List all available crawls
143 | """
144 | res = sesh_get('/crawls')
145 |
146 | sorted_list = sorted(res['crawls'], key=lambda x: x['start_time'], reverse=True)
147 |
148 | if is_quiet():
149 | for crawl in sorted_list:
150 | print(crawl['id'])
151 |
152 | return
153 |
154 | format_str = '{value: <{size}} '
155 |
156 | for _, text, size in COLUMNS:
157 | sys.stdout.write(format_str.format(value=text, size=size))
158 | print()
159 |
160 | for crawl in sorted_list:
161 | for field, _, size in COLUMNS:
162 | value = crawl[field]
163 | if field == 'start_time':
164 | value = format_duration(value, None) + ' ago'
165 | elif field == 'finish_time':
166 | value = format_duration(crawl['start_time'], value)
167 |
168 | sys.stdout.write(format_str.format(value=value, size=size))
169 | print()
170 | print()
171 |
172 |
173 | # ============================================================================
174 | @crawl.command(
175 | name='create', help='Create (and optionally start) new crawl from yaml crawl spec'
176 | )
177 | @click.option(
178 | '--start/--no-start',
179 | default=True,
180 | help="Start/Don't start crawl immediately after creation",
181 | )
182 | @click.option(
183 | '--browser',
184 | type=str,
185 | default=None,
186 |     help='Browser Docker image to use for crawling (overrides setting in spec)',
187 | )
188 | @click.option(
189 | '--profile',
190 | type=str,
191 | default=None,
192 | help='Browser Profile Docker image to use for crawling (overrides "browser" option)',
193 | )
194 | @click.option(
195 | '--coll',
196 | type=str,
197 | default=None,
198 | help='Set the collection (overrides setting in spec)',
199 | )
200 | @click.option(
201 | '--mode',
202 | type=str,
203 | default=None,
204 | help='Set the capture mode (overrides setting in spec)',
205 | )
206 | @click.option(
207 | '--screenshot_coll',
208 | type=str,
209 | default=None,
210 | help='Set the collection to save screenshots (overrides setting in spec)',
211 | )
212 | @click.option(
213 |     '--headless',
214 |     is_flag=True,
215 |     default=None,
216 |     help='Use headless mode. Browsers cannot be opened for watching the crawl',
217 | )
218 | @click.option(
219 | '--behavior-time',
220 | default=None,
221 | type=int,
222 |     help='Max duration (in seconds) to run each in-page behavior',
223 | )
224 | @click.option(
225 | '--watch',
226 | is_flag=True,
227 | default=False,
228 | type=bool,
229 | help='Watch all started browsers in a local browser (only if starting crawl)',
230 | )
231 | @click.option(
232 | '--log',
233 | is_flag=True,
234 | default=False,
235 | type=bool,
236 | help='Tail the log for the browser crawler',
237 | )
238 | @click.argument('crawl_spec_file', type=click.File('rt'))
239 | def create_crawl(
240 | crawl_spec_file,
241 | start,
242 | browser,
243 | profile,
244 | coll,
245 | mode,
246 | screenshot_coll,
247 | headless,
248 | behavior_time,
249 | watch,
250 | log,
251 | ):
252 | """ Create a new crawl!
253 |
254 | :param crawl_spec_file: YAML file with one or more crawls in 'crawls' key
255 | :param start: If true, start crawl immediately after creation
256 | :param browser: Browser Docker image to use for crawling (overrides setting in spec)
257 | :param profile: Browser Profile Docker image to use for crawling (overrides "browser" setting)
258 | :param coll: Set the collection (overrides setting in spec)
259 | :param mode: Set the capture mode (overrides setting in spec)
260 | :param screenshot_coll: Set the collection to save screenshots (overrides setting in spec)
261 |     :param headless: Use headless mode. Browsers cannot be opened for watching the crawl
262 | :param behavior_time: Max duration (in seconds) to run each in-page behavior
263 |     :param watch: Watch all started browsers in a local browser (only if starting crawl)
264 |     :param log: Tail the log for the browser crawler
265 |     """
266 | root = yaml.load(crawl_spec_file, Loader=yaml.Loader)
267 |
268 | for crawl_spec in root['crawls']:
269 | if not start:
270 | msg = 'Created'
271 | else:
272 | msg = 'Created and Started'
273 |
274 | if headless is not None:
275 | crawl_spec['headless'] = headless
276 |
277 | if behavior_time is not None:
278 | crawl_spec['behavior_time'] = behavior_time
279 |
280 | if profile is not None:
281 | crawl_spec['profile'] = profile
282 |
283 |         if 'profile' in crawl_spec:
284 |             crawl_spec['browser'] = get_profile_image(crawl_spec['profile'])
285 |
286 |         elif browser is not None:
287 |             crawl_spec['browser'] = browser
288 |
289 | if coll is not None:
290 | crawl_spec['coll'] = coll
291 |
292 | if mode is not None:
293 | crawl_spec['mode'] = mode
294 |
295 | if screenshot_coll is not None:
296 | crawl_spec['screenshot_coll'] = screenshot_coll
297 |
298 | if not is_quiet():
299 | print('Creating New Crawl, Please Wait...')
300 |
301 | res = sesh_post('/crawls', json=crawl_spec)
302 |
303 | if is_quiet():
304 | print(res['id'])
305 | else:
306 | print('Crawl {0}: {1}'.format(msg, res['id']))
307 | print('Status: {0}'.format(res['status']))
308 |
309 | if watch:
310 | if not start:
311 | if not is_quiet():
312 | print("Can't watch, crawl not started")
313 |
314 | elif headless:
315 | if not is_quiet():
316 | print("Can't watch, crawl is running in headless mode")
317 |
318 | else:
319 | open_browsers(res['browsers'], res['id'])
320 |
321 | if log:
322 | print_logs(res['browsers'], follow=True, wait=True)
323 |
324 |
325 | # ============================================================================
326 | @crawl.command(name='start', help='Start an existing crawl')
327 | @click.argument('crawl_id', nargs=-1)
328 | def start_crawl(crawl_id):
329 | """ Start an existing crawl
330 |
331 | :param crawl_id: list of crawl ids to start
332 | """
333 | for id_ in crawl_id:
334 | res = sesh_post('/crawl/{0}/start'.format(id_))
335 |
336 | if is_quiet():
337 | print(res['id'])
338 | else:
339 | print('Started Crawl: {0}'.format(res['id']))
340 |
341 |
342 | # ============================================================================
343 | @crawl.command(name='info', help='Get info on existing crawl(s)')
344 | @click.argument('crawl_id', nargs=-1)
345 | @click.option(
346 | '--urls/--no-urls',
347 | default=False,
348 | help='Get detailed info on crawl, listing all urls',
349 | )
350 | def get_info(crawl_id, urls):
351 | """ Get info on existing crawl(s)
352 |
353 | :param crawl_id: list of crawl ids to get info on
354 | :param urls: Get detailed info on crawl, listing all urls
355 | """
356 | for id_ in crawl_id:
357 | if urls:
358 | res = sesh_get('/crawl/{0}/info'.format(id_))
359 | else:
360 | res = sesh_get('/crawl/{0}'.format(id_))
361 |
362 | print(yaml.dump(res))
363 |
364 |
365 | # ============================================================================
366 | @crawl.command(name='watch', help='Watch crawling browsers in local browser')
367 | @click.argument('crawl_id', nargs=-1)
368 | def watch_crawl(crawl_id):
369 | """ Watch crawling browsers in local browser
370 |
371 | :param crawl_id: list of crawl ids to watch
372 | """
373 | for id_ in crawl_id:
374 | res = sesh_get('/crawl/{0}'.format(id_))
375 |
376 | if res.get('headless'):
377 | if not is_quiet():
378 |                 print("Cannot watch, crawl is running in headless mode")
379 | continue
380 |
381 | if res.get('status') != 'running':
382 | if not is_quiet():
383 | print('Crawl not running: {0}'.format(id_))
384 | continue
385 |
386 | browsers = res['browsers']
387 |
388 | done_count = defaultdict(int)
389 |
390 |         for info in res.get('tabs_done', []):
391 | done_count[info['id']] += 1
392 |
393 | if not browsers:
394 | if not is_quiet():
395 | print('No Browsers')
396 | continue
397 |
398 | open_browsers(browsers, id_, done_count, res['num_tabs'])
399 |
400 |
401 | # ============================================================================
402 | @crawl.command(name='stop', help='Stop one or more existing crawls')
403 | @click.argument('crawl_id', nargs=-1)
404 | def stop_crawl(crawl_id):
405 | """ Stop one or more existing crawls
406 |
407 | :param crawl_id: list of crawl ids to stop
408 | """
409 | for id_ in crawl_id:
410 | res = sesh_post('/crawl/{0}/stop'.format(id_))
411 |
412 | if not res.get('success'):
413 |             print('Error stopping {0}: {1}'.format(id_, res))
414 | return
415 |
416 | if is_quiet():
417 | print(id_)
418 | else:
419 | print('Stopped Crawl: {0}'.format(id_))
420 |
421 |
422 | # ============================================================================
423 | @crawl.command(name='remove', help='Remove one or more existing crawls')
424 | @click.argument('crawl_id', nargs=-1)
425 | def remove_crawl(crawl_id):
426 | """ Remove one or more existing crawls
427 |
428 |     :param crawl_id: list of crawl ids to remove
429 | """
430 | for id_ in crawl_id:
431 | res = sesh_delete('/crawl/{0}'.format(id_))
432 |
433 | if not res.get('success'):
434 |             print('Error removing {0}: {1}'.format(id_, res))
435 | return
436 |
437 | if is_quiet():
438 | print(id_)
439 | else:
440 | print('Removed Crawl: {0}'.format(id_))
441 |
442 |
443 | # ============================================================================
444 | @crawl.command(name='remove-all', help='Stop and remove all crawls')
445 | def remove_all():
446 | """ Stop and remove all crawls
447 | """
448 | res = sesh_get('/crawls')
449 |
450 | crawls = res['crawls']
451 |
452 | for crawl in crawls:
453 | id_ = crawl['id']
454 | res = sesh_delete('/crawl/{0}'.format(id_))
455 | if not is_quiet():
456 | print('Removed Crawl: {0}'.format(id_))
457 |
458 |
459 | # ============================================================================
460 | @crawl.command(name='logs', help='View crawl logs for one or all crawlers')
461 | @click.argument('crawl_id', nargs=1)
462 | @click.option(
463 | '-b',
464 | '--browser',
465 | type=int,
466 | default=0,
467 | help='1-based index of browser to show logs for, or 0 for all (default)',
468 | )
469 | @click.option(
470 | '-f',
471 | '--follow',
472 | type=bool,
473 | default=False,
474 | is_flag=True,
475 | help='follow crawl log in real-time',
476 | )
477 | @click.option(
478 | '-a',
479 | '--all-containers',
480 | type=bool,
481 | default=False,
482 | is_flag=True,
483 | help='include logs from all containers, not just crawler',
484 | )
485 | def logs(crawl_id, browser, follow, all_containers):
486 | """ View crawl logs for one or all crawlers
487 | :param crawl_id: The crawl_id to view logs for
488 | :param browser: 1-based index of browser to show logs for, or 0 for all (default)
489 | :param follow: follow crawl log in real-time (for one browser only)
490 | :param all_containers: include logs from all containers, not just crawler
491 | """
492 | res = sesh_get('/crawl/{0}'.format(crawl_id))
493 |
494 | num_browsers = len(res['browsers'])
495 | if browser <= 0:
496 | print_logs(res['browsers'], follow=follow, all_containers=all_containers)
497 | elif browser > num_browsers:
498 | print(
499 |             'Crawl has {0} browsers. Index must be 1 to {0}'.format(
500 |                 num_browsers
501 | )
502 | )
503 | else:
504 | print_logs(
505 | [res['browsers'][browser - 1]], follow=follow, all_containers=all_containers
506 | )
507 |
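
For orientation, a minimal sketch (not taken from the repository; the field values are illustrative) of the crawl-spec shape that create_crawl consumes: a YAML document with a top-level 'crawls' list whose entries may carry the same keys the CLI options above can override (browser, profile, coll, mode, screenshot_coll, headless, behavior_time).

import yaml

# Hypothetical spec: only the top-level 'crawls' key and the override
# field names come from create_crawl() above; the values are made up.
SPEC = """
crawls:
  - name: example-crawl
    coll: example-coll
    mode: record
    browser: chrome:73
    behavior_time: 60
"""

root = yaml.load(SPEC, Loader=yaml.Loader)
for crawl_spec in root['crawls']:
    # create_crawl() applies any CLI overrides at this point, then posts
    # each spec with sesh_post('/crawls', json=crawl_spec).
    print(crawl_spec['name'], crawl_spec['coll'], crawl_spec['browser'])
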
--------------------------------------------------------------------------------
/browsertrix_cli/main.py:
--------------------------------------------------------------------------------
1 | from browsertrix_cli.basecli import cli
2 | import browsertrix_cli.profile
3 | import browsertrix_cli.crawl
4 |
5 |
6 | # ============================================================================
7 | if __name__ == '__main__':
8 | cli()
9 |
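
main.py only needs the side-effect imports: loading browsertrix_cli.crawl and browsertrix_cli.profile registers their command groups on the shared cli object. A minimal sketch, assuming nothing beyond click's bundled test runner, of invoking that same entry point in-process (only --help is called, so no running Browsertrix API is required):

from click.testing import CliRunner

from browsertrix_cli.basecli import cli
import browsertrix_cli.crawl    # noqa: F401  registers the 'crawl' group
import browsertrix_cli.profile  # noqa: F401  registers the 'profile' group

runner = CliRunner()
result = runner.invoke(cli, ['--help'])
print(result.exit_code)  # 0 if help rendered successfully
print(result.output)     # should list the 'crawl' and 'profile' groups
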
--------------------------------------------------------------------------------
/browsertrix_cli/profile.py:
--------------------------------------------------------------------------------
1 | import click
2 | import docker
3 | import sys
4 | import time
5 | import webbrowser
6 |
7 |
8 | from browsertrix_cli.basecli import cli, is_quiet, sesh_get, settings
9 |
10 |
11 | # ============================================================================
12 | docker_api = None
13 |
14 | PROFILE_PREFIX = 'oldwebtoday/profile:'
15 |
16 | LABEL_BROWSERPROFILE = 'wr.browserprofile'
17 | LABEL_BASEBROWSER = 'wr.basebrowser'
18 |
19 |
20 | # ============================================================================
21 | def get_profile_image(profile):
22 | try:
23 | global docker_api
24 | if not docker_api:
25 | docker_api = docker.from_env(version='auto')
26 |
27 | image_name = PROFILE_PREFIX + profile
28 | image = docker_api.images.get(image_name)
29 | assert image.labels.get(LABEL_BROWSERPROFILE) == profile
30 | return 'profile:' + profile
31 |
32 | except (docker.errors.ImageNotFound, AssertionError):
33 | if not is_quiet():
34 | print('Profile "{0}" not found'.format(profile))
35 | sys.exit(1)
36 |
37 |
38 | # ============================================================================
39 | @cli.group(help='Commands for creating/removing browser profiles')
40 | def profile():
41 | global docker_api
42 | docker_api = docker.from_env(version='auto')
43 |
44 |
45 | # ============================================================================
46 | @profile.command(name='list', help='List Profiles')
47 | def list_profiles():
48 | res = docker_api.images.list(filters={'label': LABEL_BROWSERPROFILE})
49 |
50 | format_str = '{profile: <16} {base}'
51 | if not is_quiet():
52 | print(format_str.format(profile='PROFILE', base='BASE BROWSER'))
53 |
54 | for image in res:
55 | if not image.tags:
56 | continue
57 |
58 | if not image.tags[0].startswith(PROFILE_PREFIX):
59 | continue
60 |
61 | profile = image.tags[0][len(PROFILE_PREFIX) :]
62 | base_browser = image.labels.get(LABEL_BASEBROWSER, '(unknown)')
63 |
64 | if not is_quiet():
65 | print(format_str.format(profile=profile, base=base_browser))
66 | else:
67 | print(profile)
68 |
69 | if not is_quiet():
70 | print()
71 |
72 |
73 | # ============================================================================
74 | @profile.command(name='remove', help='Remove Profile')
75 | @click.argument('profile', type=str)
76 | def remove_profile(profile):
77 | full_tag = PROFILE_PREFIX + profile
78 |
79 | try:
80 | docker_api.images.remove(full_tag, force=True, noprune=False)
81 | if not is_quiet():
82 | print('Removed profile "{0}"!'.format(profile))
83 |
84 | except docker.errors.ImageNotFound:
85 | if not is_quiet():
86 | print('Profile "{0}" not found'.format(profile))
87 | sys.exit(1)
88 |
89 |
90 | # ============================================================================
91 | @profile.command(name='create', help='Create Profile')
92 | @click.option(
93 | '--browser', default='chrome:73', type=str, help='Base Browser Image to Extend'
94 | )
95 | def create_profile(browser):
96 | res = sesh_get(
97 | '/api/request/{0}/about:blank'.format(browser), prefix=settings.shepherd_prefix
98 | )
99 |
100 | reqid = res.get('reqid')
101 |
102 | curr_browser = None
103 |
104 | webbrowser.open(settings.view_browsers_prefix + reqid)
105 |
106 | print('A new browser window should have been opened')
107 | print(
108 |         'You can use the browser to log in to accounts or otherwise prepare the browser profile'
109 | )
110 | print('(The content will not be recorded to WARC)')
111 |
112 | while True:
113 | profile_name = click.prompt(
114 | 'When done, please enter a new name to save the browser profile', type=str
115 | )
116 |
117 | if not curr_browser:
118 | curr_browser = docker_api.containers.get('browser-' + reqid)
119 |
120 | # exit_code, output = curr_browser.exec_run('/app/prep-commit.sh')
121 | exit_code, output = curr_browser.exec_run('pkill -f "/usr/bin/google-chrome"')
122 | if not is_quiet():
123 | print('Killed Chrome to Save Profile for Commit')
124 | print('Result: {0}'.format(exit_code))
125 | print(output.decode('utf-8'))
126 |
127 | time.sleep(1.5)
128 |
129 | conf = {
130 | 'Labels': {LABEL_BROWSERPROFILE: profile_name, LABEL_BASEBROWSER: browser}
131 | }
132 |
133 | res = curr_browser.commit(
134 | repository=PROFILE_PREFIX[:-1],
135 | tag=profile_name,
136 | message='Browser Profile',
137 | conf=conf,
138 | )
139 |
140 | if not is_quiet():
141 | print('Created Image: {0} ({1})'.format(res.tags[0], res.short_id))
142 |
143 | print('The browser should have restarted to about:blank')
144 | if not click.confirm('Continue browsing to create another profile?'):
145 | break
146 |
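
Profiles are ordinary Docker images tagged oldwebtoday/profile:<name> and labelled wr.browserprofile=<name>; get_profile_image resolves such an image to the 'profile:<name>' string that create_crawl places in a spec's browser field. A minimal sketch of checking for a profile locally with the Docker SDK, reusing those same conventions ('my-profile' is a hypothetical name):

import docker

PROFILE_PREFIX = 'oldwebtoday/profile:'
LABEL_BROWSERPROFILE = 'wr.browserprofile'


def has_profile(name):
    """Return True if a usable profile image exists locally."""
    client = docker.from_env(version='auto')
    try:
        image = client.images.get(PROFILE_PREFIX + name)
    except docker.errors.ImageNotFound:
        return False
    # The label must echo the profile name, mirroring get_profile_image().
    return image.labels.get(LABEL_BROWSERPROFILE) == name


if __name__ == '__main__':
    print(has_profile('my-profile'))  # hypothetical profile name
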
--------------------------------------------------------------------------------
/cli-requirements.txt:
--------------------------------------------------------------------------------
1 | click
2 | docker
3 | pyyaml
4 | requests
5 |
--------------------------------------------------------------------------------
/dev-requirements.txt:
--------------------------------------------------------------------------------
1 | flake8
2 | flake8-bugbear
3 | flake8-mypy
4 | mypy
5 | pytest
6 | pytest-asyncio
7 | black
8 | mock
9 | requests
10 | fakeredis
11 | PyYAML
12 |
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '3.5'
2 |
3 | services:
4 | browsertrix:
5 | image: webrecorder/browsertrix
6 | build:
7 | context: .
8 | environment:
9 | - REDIS_URL=redis://redis/0
10 | - DEFAULT_POOL=auto-pool
11 | - SCREENSHOT_API_URL=http://pywb:8080/api/screenshot/{coll}
12 | - EXTRACTED_RAW_DOM_API_URL=http://pywb:8080/api/dom/{coll}
13 | - PROXY_HOST=pywb
14 |
15 | depends_on:
16 | - redis
17 | - pywb
18 |
19 | ports:
20 | - 8000:8000
21 |
22 | #volumes:
23 | # - ./:/app/
24 |
25 | pywb:
26 | build: ./pywb/
27 |
28 | environment:
29 | - REDIS_URL=redis://redis/0
30 |
31 | volumes:
32 | - ./webarchive:/webarchive
33 |
34 | networks:
35 | - default
36 | - browsers
37 |
38 | ports:
39 | - 8180:8080
40 |
41 | depends_on:
42 | - redis
43 |
44 | solr:
45 | image: solr
46 |
47 | volumes:
48 | - ./solr:/var/solr
49 |
50 | entrypoint:
51 | - docker-entrypoint.sh
52 | - solr-precreate
53 | - browsertrix
54 |
55 | ports:
56 | - 8983:8983
57 |
58 | shepherd:
59 | image: oldwebtoday/shepherd:1.1.0-dev
60 |
61 | environment:
62 | - BROWSER_NET=browsertrix_browsers
63 | - MAIN_NET=browsertrix_default
64 |
65 | - PROXY_HOST=pywb
66 | - PROXY_PORT=8080
67 |
68 | depends_on:
69 | - redis
70 |
71 | volumes:
72 | - /var/run/docker.sock:/var/run/docker.sock
73 | - ./flocks:/app/flocks
74 | - ./pool_config.yaml:/app/pool_config.yaml
75 |
76 | ports:
77 | - 9020:9020
78 |
79 | redis:
80 | image: redis:3.2.4
81 |
82 | behaviors:
83 | image: webrecorder/behaviors
84 |
85 | ports:
86 | - 3030:3030
87 |
88 | networks:
89 | default:
90 | driver: bridge
91 |
92 | browsers:
93 | driver: bridge
94 |
95 |
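
With this stack up (docker-compose up), the API is published on host port 8000 and pywb on 8180. A minimal sketch of listing crawls from the host with requests, assuming the default port mapping above and the /crawls endpoint used by the CLI:

import requests

# Assumes the 8000:8000 mapping declared for the browsertrix service above.
API = 'http://localhost:8000'

resp = requests.get(API + '/crawls', timeout=5)
resp.raise_for_status()
for crawl in resp.json().get('crawls', []):
    print(crawl['id'], crawl.get('status'))
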
--------------------------------------------------------------------------------
/flocks/browsers-headless.yaml:
--------------------------------------------------------------------------------
1 | name: browsers-headless
2 | auto_remove: false
3 |
4 | volumes:
5 | tmpdir: '/tmp/.X11-unix'
6 |
7 | containers:
8 | - name: browser
9 | image: oldwebtoday/base-browser
10 | set_user_params: true
11 | external_network: '${BROWSER_NET}'
12 |
13 | environment:
14 | DISPLAY: ':99'
15 | PULSE_SERVER: '/tmp/.X11-unix/pulse-socket'
16 |
17 | PROXY_HOST: 'pywb'
18 | PROXY_PORT: '8080'
19 | PROXY_CA_URL: 'http://wsgiprox/download/pem'
20 | PROXY_CA_FILE: '/tmp/proxy-ca.pem'
21 |
22 | - name: autobrowser
23 | image: webrecorder/autobrowser
24 |
25 | external_network: '${MAIN_NET}'
26 |
27 | environment:
28 | BROWSER_HOST: 'browser'
29 | REDIS_URL: 'redis://redis:6379/0'
30 | TAB_TYPE: 'CrawlerTab'
31 |
32 |
33 |
--------------------------------------------------------------------------------
/flocks/browsers.yaml:
--------------------------------------------------------------------------------
1 | name: browsers
2 | auto_remove: false
3 |
4 | volumes:
5 | tmpdir: '/tmp/.X11-unix'
6 |
7 | containers:
8 | - name: xserver
9 | image: oldwebtoday/vnc-webrtc-audio
10 | ports:
11 | vnc_port: 6080
12 | cmd_port: 6082
13 | ice_tcp_port: 10235
14 | ice_udp_port: '10235/udp'
15 |
16 | environment:
17 | IDLE_TIMEOUT: '${IDLE_TIMEOUT}'
18 |
19 | - name: browser
20 | image: oldwebtoday/base-browser
21 | image_label: wr.name
22 | set_user_params: true
23 | external_network: '${BROWSER_NET}'
24 |
25 | environment:
26 | DISPLAY: ':99'
27 | PULSE_SERVER: '/tmp/.X11-unix/pulse-socket'
28 |
29 | #default to no proxy, set by crawler
30 | #PROXY_HOST: 'pywb'
31 | PROXY_PORT: '8080'
32 | PROXY_CA_URL: 'http://wsgiprox/download/pem'
33 | PROXY_CA_FILE: '/tmp/proxy-ca.pem'
34 |
35 | - name: autobrowser
36 | image: webrecorder/autobrowser
37 |
38 | external_network: '${MAIN_NET}'
39 |
40 | environment:
41 | BROWSER_HOST: 'browser'
42 | REDIS_URL: 'redis://redis:6379/0'
43 | TAB_TYPE: 'CrawlerTab'
44 |
45 |
46 |
--------------------------------------------------------------------------------
/frontend/.eslintrc.js:
--------------------------------------------------------------------------------
1 | module.exports = {
2 | extends: ['plugin:prettier/recommended', 'prettier/react'],
3 | parser: 'babel-eslint',
4 | parserOptions: {
5 | ecmaVersion: 10
6 | },
7 | env: {
8 | browser: true,
9 | node: true
10 | }
11 | };
12 |
--------------------------------------------------------------------------------
/frontend/.prettierrc:
--------------------------------------------------------------------------------
1 | {
2 | "singleQuote": true,
3 | "jsxSingleQuote": true
4 | }
5 |
--------------------------------------------------------------------------------
/frontend/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "crawlmanager-frontend",
3 | "version": "1.0.0",
4 | "main": "index.js",
5 |   "license": "Apache-2.0",
6 | "dependencies": {
7 | "@hot-loader/react-dom": "16.8.6",
8 | "fstream": "^1.0.12",
9 | "immutable": "^4.0.0-rc.12",
10 | "js-yaml": "^3.13.1",
11 | "plur": "^3.1.1",
12 | "prop-types": "^15.7.2",
13 | "react": "^16.8.6",
14 | "react-dom": "^16.8.6",
15 | "react-redux": "^6.0.1",
16 | "react-router": "^5.0.0",
17 | "react-router-dom": "^5.0.0",
18 | "react-table": "^6.9.2",
19 | "react-toastify": "^5.0.0-rc.3",
20 | "react-virtualized": "^9.21.0",
21 | "redux": "^4.0.1",
22 | "redux-actions": "^2.6.5",
23 | "redux-batched-actions": "^0.4.1",
24 | "redux-devtools-extension": "^2.13.8",
25 | "redux-form": "^8.1.0",
26 | "redux-immutable": "^4.0.0",
27 | "redux-promise": "^0.6.0",
28 | "redux-thunk": "^2.3.0",
29 | "uikit": "^3.0.3",
30 | "url-regex": "^4.1.1"
31 | },
32 | "devDependencies": {
33 | "@babel/cli": "^7.4.3",
34 | "@babel/core": "^7.4.3",
35 | "@babel/plugin-proposal-class-properties": "^7.4.0",
36 | "@babel/plugin-proposal-decorators": "^7.4.0",
37 | "@babel/preset-env": "^7.4.3",
38 | "@babel/preset-react": "^7.0.0",
39 | "@types/react": "^16.8.13",
40 | "babel-eslint": "^10.0.1",
41 | "babel-loader": "^8.0.5",
42 | "babel-plugin-add-module-exports": "^1.0.0",
43 | "babel-plugin-transform-react-remove-prop-types": "^0.4.24",
44 | "babel-preset-env": "^1.7.0",
45 | "css-loader": "^2.1.1",
46 | "eslint": "^5.16.0",
47 | "eslint-config-prettier": "^4.1.0",
48 | "eslint-plugin-prettier": "^3.0.1",
49 | "eslint-plugin-react": "^7.12.4",
50 | "fastify": "^2.2.0",
51 | "fastify-static": "^2.4.0",
52 | "fastify-webpack-hmr": "^2.0.1",
53 | "file-loader": "^3.0.1",
54 | "html-webpack-plugin": "^3.2.0",
55 | "node-sass": "^4.11.0",
56 | "prettier": "^1.16.4",
57 | "react-hot-loader": "^4.8.3",
58 | "sass-loader": "^7.1.0",
59 | "style-loader": "^0.23.1",
60 | "url-loader": "^1.1.2",
61 | "webpack": "^4.29.6",
62 | "webpack-cli": "^3.3.0",
63 | "webpack-dev-middleware": "^3.6.2",
64 | "webpack-hot-middleware": "^2.24.3"
65 | },
66 | "scripts": {
67 | "develop": "node ./webpack/development-server.js",
68 | "build-prod": "NODE_ENV=production webpack --config ./webpack/webpack.config.js"
69 | }
70 | }
71 |
--------------------------------------------------------------------------------
/frontend/public/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | HI
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/frontend/src/actions/crawls.js:
--------------------------------------------------------------------------------
1 | import { toast } from 'react-toastify';
2 | import { makeHTTPRequest } from './httpRequests';
3 | import { EndpointRequests } from '../utils';
4 |
5 | export const ActionTypes = {
6 | getAll: Symbol('crawl-get-all'),
7 | gotAll: Symbol('crawl-got-all'),
8 | gotAllInit: Symbol('crawl-got-all-init'),
9 | create: Symbol('crawl-create'),
10 | urls: Symbol('crawl-get-urls'),
11 | addURLs: Symbol('crawl-add-urls'),
12 | updateURLInfo: Symbol('crawl-update-url-info'),
13 | info: Symbol('crawl-get-info'),
14 | stop: Symbol('crawl-stop'),
15 | start: Symbol('crawl-start'),
16 | isDone: Symbol('crawl-is-done'),
17 | deleteCrawl: Symbol('crawl-delete'),
18 | updateInfo: Symbol('crawl-update-info')
19 | };
20 |
21 | export function getAllCrawls(init = false) {
22 | const request = EndpointRequests.retrieveAllCrawls();
23 | return makeHTTPRequest(request, {
24 | onError({ error }) {
25 | toast(`Failed to retrieve info about all crawls: ${error}`, {
26 | type: toast.TYPE.ERROR
27 | });
28 | },
29 | async onResponse({ response }) {
30 | if (!response.ok) {
31 | toast(
32 | `Failed to retrieve info about all crawls: Details
33 | HTTP ${response.status}`,
34 | {
35 | type: toast.TYPE.ERROR
36 | }
37 | );
38 | return;
39 | }
40 | return {
41 | type: init ? ActionTypes.gotAllInit : ActionTypes.gotAll,
42 | payload: await response.json()
43 | };
44 | }
45 | });
46 | }
47 |
48 | export function getCrawlInfo(id) {
49 | const request = EndpointRequests.crawlInfo(id);
50 | return makeHTTPRequest(request, {
51 | onError({ error }) {
52 | toast(`Failed to retrieve the info for crawl - ${id}: ${error}`, {
53 | type: toast.TYPE.ERROR
54 | });
55 | },
56 | async onResponse({ response }) {
57 | const json = await response.json();
58 | if (!response.ok) {
59 | toast(
60 | `Failed to get the crawl info - ${id}: Details
61 | ${json.detail}`,
62 | {
63 | type: toast.TYPE.ERROR
64 | }
65 | );
66 | return;
67 | }
68 | return {
69 | type: ActionTypes.info,
70 | payload: json
71 | };
72 | }
73 | });
74 | }
75 |
76 | export function addCrawlURLs(id, urls) {
77 | const { request } = EndpointRequests.addCrawlURLs(id, urls);
78 | return makeHTTPRequest(request, {
79 | onError({ error }) {
80 | toast(`Failed to add the urls to the crawl - ${id}: ${error}`, {
81 | type: toast.TYPE.ERROR
82 | });
83 | },
84 | async onResponse({ response }) {
85 | const json = await response.json();
86 | if (!response.ok) {
87 | toast(
88 | `Failed to add urls to the crawl - ${id}: Details
89 | ${json.detail}`,
90 | {
91 | type: toast.TYPE.ERROR
92 | }
93 | );
94 | return;
95 | }
96 | return {
97 | type: ActionTypes.addURLs,
98 | payload: {
99 | id,
100 | urls
101 | }
102 | };
103 | }
104 | });
105 | }
106 |
107 | export function getCrawlURLs(id) {
108 | const { request } = EndpointRequests.getCrawlURLs(id);
109 | return makeHTTPRequest(request, {
110 | onError({ error }) {
111 |       toast(`Failed to retrieve the crawl's URLs - ${id}: ${error}`, {
112 | type: toast.TYPE.ERROR
113 | });
114 | },
115 | async onResponse({ response }) {
116 | const json = await response.json();
117 | if (!response.ok) {
118 | toast(
119 |           `Failed to retrieve the crawl's URLs - ${id}: Details
120 | ${json.detail}`,
121 | {
122 | type: toast.TYPE.ERROR
123 | }
124 | );
125 | return;
126 | }
127 | return {
128 | type: ActionTypes.updateURLInfo,
129 | payload: Object.assign(
130 | {
131 | id
132 | },
133 | json
134 | )
135 | };
136 | }
137 | });
138 | }
139 |
140 | /**
141 | *
142 | * @param {Object} [newCrawlConfig]
143 | */
144 | export function createCrawl(newCrawlConfig) {
145 | const { body, request } = EndpointRequests.createNewCrawl(newCrawlConfig);
146 | return makeHTTPRequest(request, {
147 | onError({ error }) {
148 | toast(`Failed to create the new crawl ${error}`, {
149 | type: toast.TYPE.ERROR
150 | });
151 | },
152 | async onResponse({ dispatch, response }) {
153 | const json = await response.json();
154 | if (!response.ok) {
155 | toast(
156 | `Failed to create the crawl - ${json.id}: Details
157 | ${json.detail}`,
158 | {
159 | type: toast.TYPE.ERROR
160 | }
161 | );
162 | return;
163 | }
164 | const request = EndpointRequests.crawlInfo(json.id);
165 | const infoResponse = await fetch(request);
166 | const result = infoResponse.ok
167 | ? await infoResponse.json()
168 | : Object.assign(
169 | {
170 | id: json.id
171 | },
172 | body,
173 | newCrawlConfig.crawlRunInfo
174 | );
175 |
176 | console.log('newly created crawl info', result);
177 | return {
178 | type: ActionTypes.create,
179 | payload: result
180 | };
181 | }
182 | });
183 | }
184 |
185 | /**
186 | *
187 | * @param {string} id
188 | * @param {Object} [startConfig]
189 | */
190 | export function startCrawl(id, startConfig) {
191 | const { body, request } = EndpointRequests.startCrawl(id, startConfig);
192 | return makeHTTPRequest(request, {
193 | onError({ error }) {
194 | toast(`Failed to start the crawl - ${id}: ${error}`, {
195 | type: toast.TYPE.ERROR
196 | });
197 | },
198 | async onResponse({ response }) {
199 | const json = await response.json();
200 | if (!response.ok) {
201 | toast(
202 | `Failed to start the crawl - ${id}: Details
203 | ${json.detail}`,
204 | {
205 | type: toast.TYPE.ERROR
206 | }
207 | );
208 | return;
209 | }
210 | const result = {
211 | id,
212 | ...body
213 | };
214 | console.log('started crawl info', result);
215 | return {
216 | type: ActionTypes.start,
217 | payload: result
218 | };
219 | }
220 | });
221 | }
222 |
223 | export function stopCrawl(id) {
224 | const request = EndpointRequests.stopCrawl(id);
225 | return makeHTTPRequest(request, {
226 | onError({ error }) {
227 |       toast(`Failed to stop the crawl - ${id}: ${error}`, {
228 | type: toast.TYPE.ERROR
229 | });
230 | },
231 | async onResponse({ dispatch, response }) {
232 | const json = await response.json();
233 | if (!response.ok) {
234 | toast(
235 | `Failed to stop the crawl - ${id}: Details
236 | ${json.detail}`,
237 | {
238 | type: toast.TYPE.ERROR
239 | }
240 | );
241 | return;
242 | }
243 | return {
244 | type: ActionTypes.stop,
245 | payload: { id }
246 | };
247 | }
248 | });
249 | }
250 |
251 | export function removeCrawl(id) {
252 | const request = EndpointRequests.removeCrawl(id);
253 | return makeHTTPRequest(request, {
254 | onError({ error }) {
255 | toast(`Failed to remove the crawl - ${id}: ${error}`, {
256 | type: toast.TYPE.ERROR
257 | });
258 | },
259 | async onResponse({ dispatch, response }) {
260 | const json = await response.json();
261 | if (!response.ok) {
262 | toast(
263 | `Failed to remove the crawl - ${id}: Details
264 | ${json.detail}`,
265 | {
266 | type: toast.TYPE.ERROR
267 | }
268 | );
269 | return;
270 | }
271 | return {
272 | type: ActionTypes.deleteCrawl,
273 | payload: { id }
274 | };
275 | }
276 | });
277 | }
278 |
--------------------------------------------------------------------------------
/frontend/src/actions/httpRequests.js:
--------------------------------------------------------------------------------
1 | export const HTTPRequestAction = Symbol('http-request-maker');
2 |
3 | export const FetchStates = {
4 | preflight: Symbol('http-request-preflight'),
5 | inflight: Symbol('http-request-inflight'),
6 | done: Symbol('http-request-done'),
7 | error: Symbol('http-request-errored')
8 | };
9 |
10 | export function requestErrorAction({ error, payload }) {
11 | return {
12 | type: HTTPRequestAction,
13 | payload: Object.assign({ error }, payload)
14 | };
15 | }
16 |
17 | /**
18 | * @typedef {Object} MakeHTTPRequestInit
19 | * @property {function ({dispatch: Function, response: Response}): *} onResponse
20 | * @property {function ({dispatch: Function, error: Error}): *} onError
21 | */
22 |
23 | function requestComplete(nextAction, wasError, url) {
24 | nextAction.meta = nextAction.meta || {};
25 | nextAction.meta.httpRequest = {
26 | url,
27 | state: wasError ? FetchStates.error : FetchStates.done
28 | };
29 | return nextAction;
30 | }
31 |
32 | /**
33 | *
34 | * @param {Request} request
35 | * @param {MakeHTTPRequestInit} init
36 | */
37 | export function makeHTTPRequest(request, { onResponse, onError }) {
38 | return dispatch => {
39 | const init = {
40 | type: HTTPRequestAction,
41 | meta: {
42 | httpRequest: {
43 | state: FetchStates.preflight,
44 | url: request.url
45 | }
46 | }
47 | };
48 | if (!dispatch(init)) return; // no op, this is a duplicate request
49 | let wasError = false;
50 | dispatch(
51 | fetch(request)
52 | .then(response => onResponse({ dispatch, response }))
53 | .catch(error => {
54 | wasError = true;
55 | return onError({ dispatch, error });
56 | })
57 | .then(requestFinished =>
58 | requestComplete(
59 | requestFinished || { type: HTTPRequestAction },
60 | wasError,
61 | request.url
62 | )
63 | )
64 | );
65 | };
66 | }
67 |
--------------------------------------------------------------------------------
/frontend/src/actions/index.js:
--------------------------------------------------------------------------------
1 | export * from './crawls';
2 |
--------------------------------------------------------------------------------
/frontend/src/components/Crawl/Control.js:
--------------------------------------------------------------------------------
1 | import React, { Component } from 'react';
2 | import * as PropTypes from 'prop-types';
3 | import { CrawlRecord } from '../../reducers/crawls';
4 |
5 | export default class Control extends Component {
6 | static propTypes = {
7 | crawl: PropTypes.instanceOf(CrawlRecord).isRequired,
8 | getCrawlInfo: PropTypes.func.isRequired,
9 | startCrawl: PropTypes.func.isRequired,
10 | stopCrawl: PropTypes.func.isRequired,
11 | removeCrawl: PropTypes.func.isRequired
12 | };
13 |
14 | startCrawl() {
15 | const { crawl } = this.props;
16 | this.props.startCrawl(crawl.startCrawlConfig());
17 | }
18 |
19 | constructor(props) {
20 | super(props);
21 | this.startCrawl = this.startCrawl.bind(this);
22 | }
23 |
24 | render() {
25 | const { crawl, getCrawlInfo, removeCrawl, stopCrawl } = this.props;
26 | return (
27 |