├── .coveragerc ├── .dockerignore ├── .editorconfig ├── .gitattributes ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── feature_request.md │ └── replay_issue.md ├── PULL_REQUEST_TEMPLATE.md └── workflows │ ├── ci.yaml │ ├── publish_pypi.yaml │ └── release.yaml ├── .gitignore ├── .gitmodules ├── .travis.yml ├── .travis ├── install.sh └── test.sh ├── CHANGES.rst ├── CONTRIBUTING.md ├── Dockerfile ├── INSTALL.rst ├── LICENSE ├── MANIFEST.in ├── NOTICE ├── README.rst ├── Vagrantfile ├── appveyor.disabled.yml ├── babel.ini ├── build-vue-ui.sh ├── build-wombat.sh ├── ca └── README ├── config.yaml ├── docker-compose.yaml ├── docker-entrypoint.sh ├── docs ├── Makefile ├── code │ ├── pywb.apps.rst │ ├── pywb.indexer.rst │ ├── pywb.manager.rst │ ├── pywb.recorder.rst │ ├── pywb.rewrite.rst │ ├── pywb.rst │ ├── pywb.utils.rst │ ├── pywb.warcserver.index.rst │ ├── pywb.warcserver.resource.rst │ └── pywb.warcserver.rst ├── conf.py ├── index.rst ├── make.bat └── manual │ ├── access-control.rst │ ├── apis.rst │ ├── apps.rst │ ├── architecture.rst │ ├── cdxserver_api.rst │ ├── configuring.rst │ ├── images │ ├── vue-banner.png │ └── vue-cal.png │ ├── indexing.rst │ ├── localization.rst │ ├── memento.rst │ ├── migrating-cdx.rst │ ├── outbackcdx.rst │ ├── owb-pywb-terms.rst │ ├── owb-to-pywb-config.rst │ ├── owb-to-pywb-deploy.rst │ ├── owb-to-pywb-exclusions.rst │ ├── owb-transition.rst │ ├── recorder.rst │ ├── rewriter.rst │ ├── template-guide.rst │ ├── ui-customization.rst │ ├── ui-guide.rst │ ├── usage.rst │ ├── vue-ui.rst │ └── warcserver.rst ├── extra_requirements.txt ├── pywb ├── __init__.py ├── apps │ ├── __init__.py │ ├── cli.py │ ├── frontendapp.py │ ├── live.py │ ├── rewriterapp.py │ ├── static_handler.py │ ├── test │ │ ├── test_rewriter.py │ │ └── test_wbrequestresponse.py │ ├── warcserverapp.py │ ├── wayback.py │ └── wbrequestresponse.py ├── default_config.yaml ├── indexer │ ├── __init__.py │ ├── archiveindexer.py │ ├── cdxindexer.py │ └── test │ │ └── 
test_indexing.py ├── manager │ ├── __init__.py │ ├── aclmanager.py │ ├── autoindex.py │ ├── locmanager.py │ ├── manager.py │ └── migrate.py ├── recorder │ ├── __init__.py │ ├── filters.py │ ├── multifilewarcwriter.py │ ├── recorderapp.py │ ├── redisindexer.py │ └── test │ │ ├── rec.ini │ │ ├── simplerec.py │ │ └── test_recorder.py ├── rewrite │ ├── __init__.py │ ├── content_rewriter.py │ ├── cookie_rewriter.py │ ├── cookies.py │ ├── default_rewriter.py │ ├── header_rewriter.py │ ├── html_insert_rewriter.py │ ├── html_rewriter.py │ ├── jsonp_rewriter.py │ ├── regex_rewriters.py │ ├── rewrite_amf.py │ ├── rewrite_dash.py │ ├── rewrite_hls.py │ ├── rewrite_js_workers.py │ ├── rewriteinputreq.py │ ├── templateview.py │ ├── test │ │ ├── test_content_rewriter.py │ │ ├── test_cookie_rewriter.py │ │ ├── test_header_rewriter.py │ │ ├── test_html_insert_rewriter.py │ │ ├── test_html_rewriter.py │ │ ├── test_jsonp_rewriter.py │ │ ├── test_regex_rewriters.py │ │ ├── test_url_rewriter.py │ │ └── test_wburl.py │ ├── url_rewriter.py │ └── wburl.py ├── rules.yaml ├── static │ ├── autoFetchWorker.js │ ├── calendar.svg │ ├── css │ │ ├── base.css │ │ ├── bootstrap.min.css │ │ ├── font-awesome.min.css │ │ └── query.css │ ├── flowplayer │ │ ├── flowplayer-3.2.18.swf │ │ ├── flowplayer.audio-3.2.11.swf │ │ ├── flowplayer.controls-3.2.16.swf │ │ ├── flowplayer.pseudostreaming-3.2.13.swf │ │ └── toolbox.flashembed.js │ ├── fonts │ │ ├── font-awesome │ │ │ ├── fa-brands-400.eot │ │ │ ├── fa-brands-400.svg │ │ │ ├── fa-brands-400.ttf │ │ │ ├── fa-brands-400.woff │ │ │ ├── fa-brands-400.woff2 │ │ │ ├── fa-regular-400.eot │ │ │ ├── fa-regular-400.svg │ │ │ ├── fa-regular-400.ttf │ │ │ ├── fa-regular-400.woff │ │ │ ├── fa-regular-400.woff2 │ │ │ ├── fa-solid-900.eot │ │ │ ├── fa-solid-900.svg │ │ │ ├── fa-solid-900.ttf │ │ │ ├── fa-solid-900.woff │ │ │ └── fa-solid-900.woff2 │ │ ├── glyphicons-halflings-regular.eot │ │ ├── glyphicons-halflings-regular.svg │ │ ├── 
glyphicons-halflings-regular.ttf │ │ ├── glyphicons-halflings-regular.woff │ │ └── glyphicons-halflings-regular.woff2 │ ├── js │ │ ├── bootstrap.min.js │ │ ├── jquery-latest.min.js │ │ └── url-polyfill.min.js │ ├── loadWabac.js │ ├── loading-spinner │ │ ├── loading-spinner.js │ │ └── test.html │ ├── pywb-logo-sm.png │ ├── pywb-logo.png │ ├── query.js │ ├── queryWorker.js │ ├── scroll-webkit.css │ ├── search.js │ ├── transclusions.js │ ├── vidrw.js │ ├── vue │ │ └── vueui.js │ ├── vue_banner.css │ ├── wb_frame.js │ ├── wombat.js │ ├── wombatProxyMode.js │ └── wombatWorkers.js ├── templates │ ├── banner.html │ ├── base.html │ ├── bootstrap_jquery.html │ ├── collinfo.json │ ├── custom_banner.html │ ├── error.html │ ├── footer.html │ ├── frame_insert.html │ ├── head.html │ ├── head_insert.html │ ├── header.html │ ├── index.html │ ├── instructions.html │ ├── not_found.html │ ├── proxy_cert_download.html │ ├── proxy_select.html │ ├── query.html │ ├── search.html │ └── vue_loc.html ├── utils │ ├── README.md │ ├── __init__.py │ ├── binsearch.py │ ├── canonicalize.py │ ├── format.py │ ├── geventserver.py │ ├── io.py │ ├── loaders.py │ ├── memento.py │ ├── merge.py │ ├── test │ │ ├── test_binsearch.py │ │ └── test_loaders.py │ └── wbexception.py ├── version.py ├── vueui │ ├── .eslintrc.js │ ├── package.json │ ├── rollup.config.js │ ├── src │ │ ├── App.vue │ │ ├── cdx-simulator │ │ │ ├── README.md │ │ │ ├── cdx-record-sample.json │ │ │ ├── cdx-simulator.js │ │ │ ├── pywb-vueui-cdx-simulator-config.jpg │ │ │ └── test.html │ │ ├── components │ │ │ ├── CalendarMonth.vue │ │ │ ├── CalendarYear.vue │ │ │ ├── LoadingSpinner.vue │ │ │ ├── PageRuler.vue │ │ │ ├── Timeline.vue │ │ │ ├── TimelineBreadcrumbs.vue │ │ │ ├── TimelineLinear.vue │ │ │ └── Tooltip.vue │ │ ├── i18n.js │ │ ├── index.js │ │ └── model.js │ └── yarn.lock └── warcserver │ ├── __init__.py │ ├── access_checker.py │ ├── amf.py │ ├── basewarcserver.py │ ├── handlers.py │ ├── http.py │ ├── index │ ├── __init__.py │ ├── 
aggregator.py │ ├── cdxobject.py │ ├── cdxops.py │ ├── fuzzymatcher.py │ ├── indexsource.py │ ├── query.py │ ├── test │ │ ├── __init__.py │ │ ├── test_cdxobject.py │ │ ├── test_cdxops.py │ │ ├── test_dir_agg.py │ │ ├── test_fuzzymatcher.py │ │ ├── test_indexsource.py │ │ ├── test_lazy_ops.py │ │ ├── test_memento_agg.py │ │ ├── test_redis_agg.py │ │ ├── test_timeouts.py │ │ ├── test_xmlquery_indexsource.py │ │ └── test_zipnum.py │ └── zipnum.py │ ├── inputrequest.py │ ├── resource │ ├── __init__.py │ ├── blockrecordloader.py │ ├── pathresolvers.py │ ├── resolvingloader.py │ ├── responseloader.py │ └── test │ │ ├── __init__.py │ │ ├── test_loading.py │ │ └── test_pathresolvers.py │ ├── test │ ├── __init__.py │ ├── live.ini │ ├── test_access.py │ ├── test_amf.py │ ├── test_handlers.py │ ├── test_inputreq.py │ ├── test_upstream.py │ ├── test_warcserver.py │ ├── test_warcserver_config.yaml │ └── testutils.py │ ├── upstreamindexsource.py │ └── warcserver.py ├── requirements.txt ├── run-gunicorn.sh ├── run-tests.py ├── run-uwsgi.sh ├── sample-deploy ├── docker-compose-apache.yaml ├── docker-compose-nginx.yaml ├── docker-compose-outback.yaml ├── httpd.conf ├── pywb-apache.conf ├── pywb-nginx.conf ├── run.sh └── uwsgi_subdir.ini ├── sample_archive ├── access │ ├── allow_all.aclj │ ├── allows.aclj │ ├── blocks.aclj │ ├── list1.aclj │ ├── list2.aclj │ ├── pywb.aclj │ └── single-line.aclj ├── cdx │ ├── bad.cdx │ ├── dupes.cdx │ ├── example-arc-test.cdx │ ├── example-extra.cdx │ ├── example.cdx │ ├── httpbin-resource.cdxj │ ├── iana.cdx │ ├── missing-status-text.cdxj │ ├── post-test.cdx │ └── url-agnost-example.cdx ├── cdxj │ ├── dupes.cdxj │ ├── example-no-digest.cdxj │ ├── example.cdx.gz │ ├── example.cdxj │ ├── example2.cdxj │ ├── iana.cdxj │ ├── post-test.cdxj │ └── url-agnost-example.cdxj ├── non-surt-cdx │ └── example-non-surt.cdx ├── text_content │ ├── link_headers.yaml │ ├── pathindex.txt │ ├── quickfox_repeated.compressed │ ├── sample.html │ ├── sample_dash.mpd │ ├── 
sample_hls.m3u8 │ ├── sample_no_head.html │ ├── sample_no_head_2.html │ ├── sample_unclosed_script.html │ └── toptest.js ├── waczs │ ├── invalid_example_1.wacz │ └── valid_example_1.wacz ├── warcs │ ├── bad.arc │ ├── dupes.warc.gz │ ├── example-bad.warc.gz.bad │ ├── example-extra.warc │ ├── example-url-agnostic-orig.warc.gz │ ├── example-url-agnostic-revisit.warc.gz │ ├── example-wget-1-14.warc.gz │ ├── example-wpull.warc.gz │ ├── example.arc │ ├── example.arc.gz │ ├── example.warc │ ├── example.warc.gz │ ├── example2.warc.gz │ ├── httpbin-resource.warc.gz │ ├── iana.warc.gz │ ├── missing-status-text.warc │ └── post-test.warc.gz └── zipcdx │ ├── zipnum-bad.idx │ ├── zipnum-bad.loc │ ├── zipnum-sample.cdx.gz │ ├── zipnum-sample.idx │ └── zipnum-sample.loc ├── setup.py ├── test_requirements.txt ├── tests ├── __init__.py ├── base_config_test.py ├── config_test.yaml ├── config_test_access.yaml ├── config_test_cert_req.yaml ├── config_test_loc.yaml ├── config_test_record.yaml ├── config_test_record_dedup.yaml ├── config_test_redirect_classic.yaml ├── config_test_root_coll.yaml ├── i18n-data │ ├── .gitignore │ └── l337 │ │ └── LC_MESSAGES │ │ ├── messages.mo │ │ └── messages.po ├── memento_fixture.py ├── test_acl.py ├── test_acl_manager.py ├── test_auto_colls.py ├── test_cdx_server_app.py ├── test_cert_req.py ├── test_cli.py ├── test_embargo.py ├── test_force_https.py ├── test_integration.py ├── test_live_rewriter.py ├── test_locales.py ├── test_manager.py ├── test_memento.py ├── test_prefer_header.py ├── test_prefixed_deploy.py ├── test_proxy.py ├── test_range.py ├── test_record_dedup.py ├── test_record_replay.py ├── test_redirect_classic.py ├── test_redirect_revisits.py ├── test_redirects.py ├── test_root_coll.py ├── test_socks.py └── test_zipnum_auto_dir.py ├── tests_disabled ├── fixture.py ├── live.py ├── perms_fixture.py ├── server_mock.py ├── server_thread.py ├── test_cdxserver.py ├── test_config_frames.yaml ├── test_config_memento.yaml ├── 
test_config_proxy_http_cookie.yaml ├── test_config_proxy_https_cookie.yaml ├── test_config_proxy_ip.yaml ├── test_config_proxy_ip_redis.yaml ├── test_config_proxy_no_banner.yaml ├── test_config_root_coll.yaml ├── test_live_proxy.py ├── test_memento.py ├── test_perms.py ├── test_perms_app.py ├── test_proxy_http_auth.py ├── test_proxy_http_cookie.py ├── test_proxy_http_ip.py ├── test_proxy_http_ip_redis.py ├── test_proxy_http_no_banner.py ├── test_proxy_https_cookie.py └── test_rewrite_content.py ├── tox.ini ├── update-tag.sh └── uwsgi.ini /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | concurrency = gevent 3 | source = codecov 4 | branch = True 5 | omit = 6 | */test/* 7 | */tests/* 8 | *.html 9 | *.js 10 | *.css 11 | pywb/__init__.py 12 | 13 | [report] 14 | exclude_lines = 15 | pragma: no cover 16 | if __name__ == .__main__.: 17 | def __repr__ 18 | raise NotImplementedError 19 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | build/ 2 | dist/ 3 | karma-tests/ 4 | tests_disabled/ 5 | venv/ 6 | collections/ 7 | wombat/ 8 | docs/ 9 | 10 | .cache/ 11 | .eggs/ 12 | .git/ 13 | .github/ 14 | .venv 15 | .travis 16 | .pytest_cache 17 | 18 | .coveragerc 19 | .dockerignore 20 | .editorconfig 21 | .gitattributes 22 | .gitignore 23 | .travis.yml 24 | appveyor.yml 25 | package.json 26 | run-* 27 | Vagrantfile 28 | Dockerfile 29 | 30 | **/*.egg 31 | **/*.egg-info 32 | **/__pycache__ 33 | **/*.pyc 34 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | [*.js] 2 | indent_style=space 3 | indent_size=2 4 | -------------------------------------------------------------------------------- /.gitattributes: 
-------------------------------------------------------------------------------- 1 | *.arc -text 2 | *.warc -text 3 | *.idx -text 4 | *.idxj -text 5 | *.cdx -text 6 | *.cdxj -text 7 | *.gz -text 8 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug Report 3 | about: Create a report to help us improve 4 | 5 | --- 6 | ## Describe the bug 7 | 8 | 9 | ## Steps to reproduce the bug 10 | 11 | 12 | 13 | ## Expected behavior 14 | 15 | 16 | ## Screenshots 17 | 18 | 19 | ## Environment 20 | 21 | - OS: [e.g. iOS] 22 | - Browser [e.g. chrome, safari] 23 | - Version [e.g. 22] 24 | 25 | ## Additional context 26 | 27 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature Request 3 | about: Suggest an idea to improve this project! 4 | 5 | --- 6 | 7 | ## Is your feature request related to a problem? Please describe. 8 | 9 | 10 | 11 | ## Describe the solution you'd like 12 | 13 | 14 | 15 | ## Describe alternatives you've considered 16 | 17 | 18 | 19 | ## Additional context 20 | 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/replay_issue.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Replay Issue 3 | about: Can not replay a specific web page? Tell us about it! 4 | 5 | --- 6 | ## Expected behavior 7 | 8 | 9 | 10 | ## What actually happened 11 | 12 | 1. Go to '...' 13 | 2. Click on '....' 14 | 3. Scroll down to '....' 15 | 4. See error 16 | 17 | 18 | ## Browser 19 | 20 | 21 | **Desktop (please complete the following information):** 22 | - OS: [e.g. iOS] 23 | - Browser [e.g. chrome, safari] 24 | - Version [e.g. 
22] 25 | 26 | **Smartphone (please complete the following information):** 27 | - Device: [e.g. iPhone6] 28 | - OS: [e.g. iOS8.1] 29 | - Browser [e.g. stock browser, safari] 30 | - Version [e.g. 22] 31 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | ## Description 5 | 6 | 7 | ## Motivation and Context 8 | 9 | 10 | 11 | 12 | ## Screenshots (if appropriate): 13 | 14 | ## Types of changes 15 | 16 | - [ ] Replay fix (fixes a replay specific issue) 17 | - [ ] Bug fix (non-breaking change which fixes an issue) 18 | - [ ] New feature (non-breaking change which adds functionality) 19 | - [ ] Breaking change (fix or feature that would cause existing functionality to change) 20 | 21 | ## Checklist: 22 | 23 | 24 | - [ ] My change requires a change to the documentation. 25 | - [ ] I have updated the documentation accordingly. 26 | - [ ] I have added or updated tests to cover my changes. 27 | - [ ] All new and existing tests passed. 
28 | -------------------------------------------------------------------------------- /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: [push, pull_request] 4 | 5 | concurrency: 6 | group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} 7 | cancel-in-progress: true 8 | 9 | jobs: 10 | unit-tests: 11 | timeout-minutes: 20 12 | runs-on: ubuntu-latest 13 | strategy: 14 | max-parallel: 3 15 | matrix: 16 | python-version: ['3.9', '3.10', '3.11'] 17 | 18 | steps: 19 | - name: checkout 20 | uses: actions/checkout@v2 21 | 22 | - name: Set up Python ${{ matrix.python-version }} 23 | uses: actions/setup-python@v5 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | pip install tox tox-gh-actions 31 | 32 | - name: Test with tox 33 | run: tox 34 | 35 | - name: Upload coverage to Codecov 36 | uses: codecov/codecov-action@v1 37 | 38 | -------------------------------------------------------------------------------- /.github/workflows/publish_pypi.yaml: -------------------------------------------------------------------------------- 1 | name: Publish to PYPI 2 | on: 3 | release: 4 | types: [published] 5 | 6 | jobs: 7 | pypi-release: 8 | runs-on: ubuntu-latest 9 | strategy: 10 | matrix: 11 | python-version: [3.9] 12 | 13 | steps: 14 | - name: checkout 15 | uses: actions/checkout@v1 16 | 17 | - name: Set up Python ${{ matrix.python-version }} 18 | uses: actions/setup-python@v1 19 | with: 20 | python-version: ${{ matrix.python-version }} 21 | 22 | - name: Install dependencies 23 | run: python -m pip install --upgrade pip wheel twine 24 | 25 | - name: Build Dist 26 | run: python setup.py sdist bdist_wheel --universal 27 | 28 | - name: Publish package to TestPyPI 29 | uses: pypa/gh-action-pypi-publish@master 30 | with: 31 | user: __token__ 32 | password: ${{ secrets.PYPI_API_TOKEN }} 33 
| 34 | -------------------------------------------------------------------------------- /.github/workflows/release.yaml: -------------------------------------------------------------------------------- 1 | name: Publish Docker image 2 | on: 3 | release: 4 | types: [published] 5 | 6 | jobs: 7 | push_to_registries: 8 | name: Build pywb Docker image for release and push to Dockerhub 9 | runs-on: ubuntu-latest 10 | steps: 11 | - 12 | name: Check out the repo 13 | uses: actions/checkout@v2 14 | - 15 | name: Docker meta 16 | id: meta 17 | uses: docker/metadata-action@v3 18 | with: 19 | images: webrecorder/pywb 20 | tags: | 21 | type=match,pattern=v-(.*),group=1 22 | - 23 | name: Set up QEMU 24 | uses: docker/setup-qemu-action@v1 25 | - 26 | name: Set up Docker Buildx 27 | uses: docker/setup-buildx-action@v1 28 | - 29 | name: Login to DockerHub 30 | uses: docker/login-action@v1 31 | with: 32 | username: ${{ secrets.DOCKER_USERNAME }} 33 | password: ${{ secrets.DOCKER_PASSWORD }} 34 | - 35 | name: Build and push 36 | id: docker_build 37 | uses: docker/build-push-action@v2 38 | with: 39 | context: . 
40 | push: true 41 | platforms: linux/amd64,linux/arm64 42 | tags: ${{ steps.meta.outputs.tags }} 43 | 44 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # wabac sw 2 | pywb/static/wabacSW.js 3 | 4 | *.py[cod] 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Packages 10 | *.egg 11 | *.egg-info 12 | dist 13 | build 14 | eggs 15 | .eggs 16 | parts 17 | bin 18 | var 19 | sdist 20 | develop-eggs 21 | .installed.cfg 22 | lib 23 | lib64 24 | __pycache__ 25 | .DS_Store 26 | 27 | # ignore auto-gen certs 28 | ca/pywb-ca.pem 29 | ca/certs/ 30 | proxy-certs/ 31 | collections/ 32 | 33 | # Installer logs 34 | pip-log.txt 35 | 36 | # Unit test / coverage reports 37 | .coverage 38 | .tox 39 | nosetests.xml 40 | 41 | # Translations 42 | *.mo 43 | 44 | # Mr Developer 45 | .mr.developer.cfg 46 | .project 47 | .pydevproject 48 | 49 | .vagrant 50 | 51 | # Node 52 | node_modules/ 53 | 54 | # git_hash 55 | git_hash.py 56 | 57 | # Sphinx documentation 58 | docs/_build/* 59 | 60 | # virtualenvs 61 | env/ 62 | venv/ 63 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "wombat"] 2 | path = wombat 3 | url = https://github.com/webrecorder/wombat 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | python: 4 | - "2.7" 5 | - "3.5" 6 | - "3.6" 7 | - "3.7" 8 | - "3.8" 9 | 10 | dist: xenial 11 | 12 | addons: 13 | chrome: stable 14 | sauce_connect: true 15 | 16 | env: 17 | - WR_TEST=no 18 | - WR_TEST=yes 19 | 20 | services: xvfb 21 | 22 | cache: 23 | directories: 24 | - node_modules 25 | 26 | sudo: required 27 | 28 | install: 29 | - ./.travis/install.sh 30 | 31 | before_install: 32 
| - 'if [ "$WR_TEST" = "yes" ]; then sudo sysctl kernel.unprivileged_userns_clone=1; fi' 33 | 34 | script: 35 | - ./.travis/test.sh 36 | 37 | after_success: 38 | - codecov 39 | 40 | matrix: 41 | allow_failures: 42 | - env: WR_TEST=yes 43 | - python: "2.7" 44 | 45 | exclude: 46 | - env: WR_TEST=yes 47 | python: "2.7" 48 | - env: WR_TEST=yes 49 | python: "3.5" 50 | - env: WR_TEST=yes 51 | python: "3.7" 52 | 53 | -------------------------------------------------------------------------------- /.travis/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | pip install --upgrade pip setuptools 5 | pip install 'Markupsafe<2.0.0' 6 | python setup.py -q install 7 | pip install -r extra_requirements.txt 8 | pip install coverage pytest-cov coveralls 9 | pip install codecov 10 | 11 | if [ "$WR_TEST" = "yes" ]; then 12 | git clone https://github.com/webrecorder/webrecorder-tests.git 13 | cd webrecorder-tests 14 | pip install --upgrade -r requirements.txt 15 | ./bootstrap.sh 16 | fi 17 | -------------------------------------------------------------------------------- /.travis/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | if [ "$WR_TEST" = "no" ]; then 5 | python setup.py test 6 | else 7 | cd webrecorder-tests 8 | INTRAVIS=1 pytest -m "pywbtest and chrometest" 9 | fi 10 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | ARG PYTHON=python:3.11 2 | 3 | FROM $PYTHON 4 | 5 | WORKDIR /pywb 6 | 7 | COPY requirements.txt extra_requirements.txt ./ 8 | 9 | RUN pip install --no-cache-dir -r requirements.txt -r extra_requirements.txt 10 | 11 | COPY . 
./ 12 | 13 | RUN python setup.py install \ 14 | && mv ./docker-entrypoint.sh / \ 15 | && mkdir /uwsgi && mv ./uwsgi.ini /uwsgi/ \ 16 | && mkdir /webarchive && mv ./config.yaml /webarchive/ 17 | 18 | WORKDIR /webarchive 19 | 20 | # auto init collection 21 | ENV INIT_COLLECTION '' 22 | 23 | ENV VOLUME_DIR /webarchive 24 | ENV UWSGI_MOUNT '/=/pywb/pywb/apps/wayback.py' 25 | 26 | #USER archivist 27 | COPY docker-entrypoint.sh ./ 28 | 29 | # volume and port 30 | VOLUME /webarchive 31 | EXPOSE 8080 32 | 33 | ENTRYPOINT ["/docker-entrypoint.sh"] 34 | CMD ["uwsgi", "/uwsgi/uwsgi.ini"] 35 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include *.rst 3 | include *requirements.txt 4 | include *.yaml 5 | include *.yml 6 | include .gitmodules 7 | include .dockerignore 8 | include Dockerfile 9 | include Vagrantfile 10 | include uwsgi.ini 11 | include run-tests.py 12 | include *.sh 13 | recursive-include static *.js 14 | recursive-include pywb *.ini 15 | recursive-include pywb *.md 16 | recursive-include pywb *.py 17 | recursive-include pywb *.yaml 18 | recursive-include sample_archive *.aclj 19 | recursive-include tests *.po 20 | recursive-include tests *.yaml 21 | recursive-include tests_disabled *.py 22 | recursive-include tests_disabled *.yaml 23 | recursive-include docs *.bat 24 | recursive-include docs *.py 25 | recursive-include docs *.rst 26 | recursive-include docs Makefile 27 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | pywb 2 | Copyright 2014-2020 Webrecorder Software, Rhizome, and Contributors. 3 | 4 | Distributed under the GNU General Public License v3. 5 | See LICENSE for details. 
6 | -------------------------------------------------------------------------------- /appveyor.disabled.yml: -------------------------------------------------------------------------------- 1 | environment: 2 | global: 3 | CMD_IN_ENV: "cmd /E:ON /V:ON /C obvci_appveyor_python_build_env.cmd" 4 | 5 | matrix: 6 | - PYTHON: "C:\\Python35" 7 | - PYTHON: "C:\\Python35-x64" 8 | - PYTHON: "C:\\Python36" 9 | - PYTHON: "C:\\Python36-x64" 10 | - PYTHON: "C:\\Python37" 11 | - PYTHON: "C:\\Python37-x64" 12 | - PYTHON: "C:\\Python38" 13 | - PYTHON: "C:\\Python38-x64" 14 | 15 | 16 | 17 | install: 18 | - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%" 19 | - "python -m pip install --upgrade pip" 20 | - "pip install -U setuptools" 21 | - "pip install MarkupSafe==1.1.1" 22 | - "pip install coverage pytest-cov" 23 | - "pip install cffi" 24 | - "pip install pyopenssl" 25 | - "pip install pypiwin32" 26 | - "pip install certauth boto3 youtube-dl pysocks" 27 | - "pip install codecov" 28 | - "pip install wheel" 29 | 30 | build_script: 31 | - "python setup.py install" 32 | 33 | test_script: 34 | - "python setup.py test" 35 | 36 | after_test: 37 | - rm -rf tests/coverages 38 | - if not exist dist mkdir dist 39 | - if exist .coverage (cp .coverage dist\) else (echo no .coverage) 40 | - codecov 41 | - if exist coverage.xml (cp coverage.xml dist\) else (echo no coverage.xml) 42 | 43 | -------------------------------------------------------------------------------- /babel.ini: -------------------------------------------------------------------------------- 1 | [jinja2: pywb/templates/**.html] 2 | extensions=jinja2.ext.i18n,jinja2.ext.autoescape,jinja2.ext.with_ 3 | -------------------------------------------------------------------------------- /build-vue-ui.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CURR_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) 4 | 5 | cd $CURR_DIR/pywb/vueui/ 6 | yarn install 7 | yarn run build 8 | 
-------------------------------------------------------------------------------- /build-wombat.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CURR_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) 4 | 5 | cd $CURR_DIR/wombat 6 | export OUTPUT_DIR=../pywb/static/ 7 | yarn install 8 | yarn run build-prod 9 | #cp ./dist/*.js ../pywb/static/ 10 | -------------------------------------------------------------------------------- /ca/README: -------------------------------------------------------------------------------- 1 | pywb Certificate Authority 2 | 3 | For proxy HTTPS support, the pywb-ca.pem root cert and per-host certificates will be created here 4 | -------------------------------------------------------------------------------- /config.yaml: -------------------------------------------------------------------------------- 1 | # pywb config file 2 | # ======================================== 3 | # 4 | debug: true 5 | 6 | # Uncomment to set banner colors and logo 7 | # ui: 8 | # logo: path/relative/from/static/logo.png 9 | # logo_home_url: https://example.com 10 | # navbar_background_hex: 0c49b0 11 | # navbar_color_hex: fff 12 | # navbar_light_buttons: true 13 | # disable_printing: true 14 | 15 | collections: 16 | all: $all 17 | pywb: 18 | index_paths: ./sample_archive/cdx/ 19 | archive_paths: ./sample_archive/warcs/ 20 | 21 | # Settings for each collection 22 | use_js_obj_proxy: true 23 | 24 | # Eanable Memento support 25 | enable_memento: true 26 | 27 | # Replay content in an iframe 28 | framed_replay: true 29 | 30 | # Use wabac.js-style client-side replay system for framed replay 31 | client_side_replay: false 32 | 33 | # Enable classic redirect behavior 34 | redirect_to_exact: true 35 | 36 | # Uncomment and change to set default locale 37 | # default_locale: en 38 | 39 | # Uncomment to set available locales 40 | # locales: 41 | # - en 42 | # - ru 43 | 
-------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | 3 | services: 4 | pywb: 5 | build: . 6 | ports: 7 | - 8080:8080 8 | volumes: 9 | - ./config.yaml:/webarchive/config.yaml 10 | - ./sample_archive/:/webarchive/sample_archive/ 11 | -------------------------------------------------------------------------------- /docker-entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -e 4 | 5 | # Get UID/GID from volume dir 6 | VOLUME_UID=$(stat -c '%u' $VOLUME_DIR) 7 | VOLUME_GID=$(stat -c '%g' $VOLUME_DIR) 8 | 9 | MY_UID=$(id -u) 10 | MY_GID=$(id -g) 11 | 12 | # Run as custom user 13 | if [ "$MY_GID" != "$VOLUME_GID" ] || [ "$MY_UID" != "$VOLUME_UID" ]; then 14 | # create or modify user and group to match expected uid/gid 15 | groupadd --gid $VOLUME_GID archivist || groupmod -o --gid $VOLUME_GID archivist 16 | useradd -ms /bin/bash -u $VOLUME_UID -g $VOLUME_GID archivist || usermod -o -u $VOLUME_UID archivist 17 | 18 | # initialize a collection if defined and not present 19 | if [ -n "$INIT_COLLECTION" ] && [ ! -d $VOLUME_DIR/collections/$INIT_COLLECTION ]; then 20 | su archivist -c "wb-manager init $INIT_COLLECTION" 21 | fi 22 | 23 | cmd="cd $PWD; $@" 24 | 25 | # run process as new archivist user 26 | su archivist -c "$cmd" 27 | 28 | # run as current user (root) 29 | else 30 | # initialize a collection if defined and not present 31 | if [ -n "$INIT_COLLECTION" ] && [ ! 
-d $VOLUME_DIR/collections/$INIT_COLLECTION ]; then 32 | cd $VOLUME_DIR 33 | wb-manager init $INIT_COLLECTION 34 | fi 35 | 36 | # run process directly 37 | exec $@ 38 | fi 39 | 40 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = python -msphinx 7 | SPHINXPROJ = pywb 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | apidoc: 16 | @sphinx-apidoc -f -T -o code ../pywb/ "../*test*" "../*git_hash*" 17 | 18 | .PHONY: help Makefile 19 | 20 | # Catch-all target: route all unknown targets to Sphinx using the new 21 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 22 | %: Makefile 23 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 24 | -------------------------------------------------------------------------------- /docs/code/pywb.apps.rst: -------------------------------------------------------------------------------- 1 | pywb.apps package 2 | ================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | pywb.apps.cli module 8 | -------------------- 9 | 10 | .. automodule:: pywb.apps.cli 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | pywb.apps.frontendapp module 16 | ---------------------------- 17 | 18 | .. automodule:: pywb.apps.frontendapp 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | pywb.apps.live module 24 | --------------------- 25 | 26 | .. automodule:: pywb.apps.live 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | pywb.apps.rewriterapp module 32 | ---------------------------- 33 | 34 | .. 
automodule:: pywb.apps.rewriterapp 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | pywb.apps.static\_handler module 40 | -------------------------------- 41 | 42 | .. automodule:: pywb.apps.static_handler 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | pywb.apps.warcserverapp module 48 | ------------------------------ 49 | 50 | .. automodule:: pywb.apps.warcserverapp 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | pywb.apps.wayback module 56 | ------------------------ 57 | 58 | .. automodule:: pywb.apps.wayback 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | 63 | pywb.apps.wbrequestresponse module 64 | ---------------------------------- 65 | 66 | .. automodule:: pywb.apps.wbrequestresponse 67 | :members: 68 | :undoc-members: 69 | :show-inheritance: 70 | 71 | Module contents 72 | --------------- 73 | 74 | .. automodule:: pywb.apps 75 | :members: 76 | :undoc-members: 77 | :show-inheritance: 78 | -------------------------------------------------------------------------------- /docs/code/pywb.indexer.rst: -------------------------------------------------------------------------------- 1 | pywb.indexer package 2 | ==================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | pywb.indexer.archiveindexer module 8 | ---------------------------------- 9 | 10 | .. automodule:: pywb.indexer.archiveindexer 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | pywb.indexer.cdxindexer module 16 | ------------------------------ 17 | 18 | .. automodule:: pywb.indexer.cdxindexer 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | Module contents 24 | --------------- 25 | 26 | .. 
automodule:: pywb.indexer 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | -------------------------------------------------------------------------------- /docs/code/pywb.manager.rst: -------------------------------------------------------------------------------- 1 | pywb.manager package 2 | ==================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | pywb.manager.aclmanager module 8 | ------------------------------ 9 | 10 | .. automodule:: pywb.manager.aclmanager 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | pywb.manager.autoindex module 16 | ----------------------------- 17 | 18 | .. automodule:: pywb.manager.autoindex 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | pywb.manager.locmanager module 24 | ------------------------------ 25 | 26 | .. automodule:: pywb.manager.locmanager 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | pywb.manager.manager module 32 | --------------------------- 33 | 34 | .. automodule:: pywb.manager.manager 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | pywb.manager.migrate module 40 | --------------------------- 41 | 42 | .. automodule:: pywb.manager.migrate 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | Module contents 48 | --------------- 49 | 50 | .. automodule:: pywb.manager 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | -------------------------------------------------------------------------------- /docs/code/pywb.recorder.rst: -------------------------------------------------------------------------------- 1 | pywb.recorder package 2 | ===================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | pywb.recorder.filters module 8 | ---------------------------- 9 | 10 | .. automodule:: pywb.recorder.filters 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | pywb.recorder.multifilewarcwriter module 16 | ---------------------------------------- 17 | 18 | .. 
automodule:: pywb.recorder.multifilewarcwriter 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | pywb.recorder.recorderapp module 24 | -------------------------------- 25 | 26 | .. automodule:: pywb.recorder.recorderapp 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | pywb.recorder.redisindexer module 32 | --------------------------------- 33 | 34 | .. automodule:: pywb.recorder.redisindexer 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | Module contents 40 | --------------- 41 | 42 | .. automodule:: pywb.recorder 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | -------------------------------------------------------------------------------- /docs/code/pywb.rst: -------------------------------------------------------------------------------- 1 | pywb package 2 | ============ 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | pywb.apps 11 | pywb.indexer 12 | pywb.manager 13 | pywb.recorder 14 | pywb.rewrite 15 | pywb.utils 16 | pywb.warcserver 17 | 18 | Submodules 19 | ---------- 20 | 21 | pywb.version module 22 | ------------------- 23 | 24 | .. automodule:: pywb.version 25 | :members: 26 | :undoc-members: 27 | :show-inheritance: 28 | 29 | Module contents 30 | --------------- 31 | 32 | .. automodule:: pywb 33 | :members: 34 | :undoc-members: 35 | :show-inheritance: 36 | -------------------------------------------------------------------------------- /docs/code/pywb.utils.rst: -------------------------------------------------------------------------------- 1 | pywb.utils package 2 | ================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | pywb.utils.binsearch module 8 | --------------------------- 9 | 10 | .. automodule:: pywb.utils.binsearch 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | pywb.utils.canonicalize module 16 | ------------------------------ 17 | 18 | .. 
automodule:: pywb.utils.canonicalize 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | pywb.utils.format module 24 | ------------------------ 25 | 26 | .. automodule:: pywb.utils.format 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | pywb.utils.geventserver module 32 | ------------------------------ 33 | 34 | .. automodule:: pywb.utils.geventserver 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | pywb.utils.io module 40 | -------------------- 41 | 42 | .. automodule:: pywb.utils.io 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | pywb.utils.loaders module 48 | ------------------------- 49 | 50 | .. automodule:: pywb.utils.loaders 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | pywb.utils.memento module 56 | ------------------------- 57 | 58 | .. automodule:: pywb.utils.memento 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | 63 | pywb.utils.merge module 64 | ----------------------- 65 | 66 | .. automodule:: pywb.utils.merge 67 | :members: 68 | :undoc-members: 69 | :show-inheritance: 70 | 71 | pywb.utils.wbexception module 72 | ----------------------------- 73 | 74 | .. automodule:: pywb.utils.wbexception 75 | :members: 76 | :undoc-members: 77 | :show-inheritance: 78 | 79 | Module contents 80 | --------------- 81 | 82 | .. automodule:: pywb.utils 83 | :members: 84 | :undoc-members: 85 | :show-inheritance: 86 | -------------------------------------------------------------------------------- /docs/code/pywb.warcserver.index.rst: -------------------------------------------------------------------------------- 1 | pywb.warcserver.index package 2 | ============================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | pywb.warcserver.index.aggregator module 8 | --------------------------------------- 9 | 10 | .. 
automodule:: pywb.warcserver.index.aggregator 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | pywb.warcserver.index.cdxobject module 16 | -------------------------------------- 17 | 18 | .. automodule:: pywb.warcserver.index.cdxobject 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | pywb.warcserver.index.cdxops module 24 | ----------------------------------- 25 | 26 | .. automodule:: pywb.warcserver.index.cdxops 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | pywb.warcserver.index.fuzzymatcher module 32 | ----------------------------------------- 33 | 34 | .. automodule:: pywb.warcserver.index.fuzzymatcher 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | pywb.warcserver.index.indexsource module 40 | ---------------------------------------- 41 | 42 | .. automodule:: pywb.warcserver.index.indexsource 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | pywb.warcserver.index.query module 48 | ---------------------------------- 49 | 50 | .. automodule:: pywb.warcserver.index.query 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | pywb.warcserver.index.zipnum module 56 | ----------------------------------- 57 | 58 | .. automodule:: pywb.warcserver.index.zipnum 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | 63 | Module contents 64 | --------------- 65 | 66 | .. automodule:: pywb.warcserver.index 67 | :members: 68 | :undoc-members: 69 | :show-inheritance: 70 | -------------------------------------------------------------------------------- /docs/code/pywb.warcserver.resource.rst: -------------------------------------------------------------------------------- 1 | pywb.warcserver.resource package 2 | ================================ 3 | 4 | Submodules 5 | ---------- 6 | 7 | pywb.warcserver.resource.blockrecordloader module 8 | ------------------------------------------------- 9 | 10 | .. 
automodule:: pywb.warcserver.resource.blockrecordloader 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | pywb.warcserver.resource.pathresolvers module 16 | --------------------------------------------- 17 | 18 | .. automodule:: pywb.warcserver.resource.pathresolvers 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | pywb.warcserver.resource.resolvingloader module 24 | ----------------------------------------------- 25 | 26 | .. automodule:: pywb.warcserver.resource.resolvingloader 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | pywb.warcserver.resource.responseloader module 32 | ---------------------------------------------- 33 | 34 | .. automodule:: pywb.warcserver.resource.responseloader 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | Module contents 40 | --------------- 41 | 42 | .. automodule:: pywb.warcserver.resource 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | -------------------------------------------------------------------------------- /docs/code/pywb.warcserver.rst: -------------------------------------------------------------------------------- 1 | pywb.warcserver package 2 | ======================= 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | pywb.warcserver.index 11 | pywb.warcserver.resource 12 | 13 | Submodules 14 | ---------- 15 | 16 | pywb.warcserver.access\_checker module 17 | -------------------------------------- 18 | 19 | .. automodule:: pywb.warcserver.access_checker 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | 24 | pywb.warcserver.amf module 25 | -------------------------- 26 | 27 | .. automodule:: pywb.warcserver.amf 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: 31 | 32 | pywb.warcserver.basewarcserver module 33 | ------------------------------------- 34 | 35 | .. 
automodule:: pywb.warcserver.basewarcserver 36 | :members: 37 | :undoc-members: 38 | :show-inheritance: 39 | 40 | pywb.warcserver.handlers module 41 | ------------------------------- 42 | 43 | .. automodule:: pywb.warcserver.handlers 44 | :members: 45 | :undoc-members: 46 | :show-inheritance: 47 | 48 | pywb.warcserver.http module 49 | --------------------------- 50 | 51 | .. automodule:: pywb.warcserver.http 52 | :members: 53 | :undoc-members: 54 | :show-inheritance: 55 | 56 | pywb.warcserver.inputrequest module 57 | ----------------------------------- 58 | 59 | .. automodule:: pywb.warcserver.inputrequest 60 | :members: 61 | :undoc-members: 62 | :show-inheritance: 63 | 64 | pywb.warcserver.upstreamindexsource module 65 | ------------------------------------------ 66 | 67 | .. automodule:: pywb.warcserver.upstreamindexsource 68 | :members: 69 | :undoc-members: 70 | :show-inheritance: 71 | 72 | pywb.warcserver.warcserver module 73 | --------------------------------- 74 | 75 | .. automodule:: pywb.warcserver.warcserver 76 | :members: 77 | :undoc-members: 78 | :show-inheritance: 79 | 80 | Module contents 81 | --------------- 82 | 83 | .. automodule:: pywb.warcserver 84 | :members: 85 | :undoc-members: 86 | :show-inheritance: 87 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. pywb documentation master file, created by 2 | sphinx-quickstart on Thu Sep 21 01:58:55 2017. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Webrecorder pywb documentation! 7 | ================================ 8 | 9 | The Webrecorder (:mod:`pywb`) toolkit is a full-featured, advanced web archiving capture and replay framework for python. 10 | It provides command-line tools and an extensible framework for high-fidelity web archive access and creation. 
11 | A subset of features provides the basic functionality of a "Wayback Machine". 12 | 13 | 14 | .. toctree:: 15 | :maxdepth: 3 16 | 17 | manual/usage 18 | manual/configuring 19 | manual/access-control 20 | manual/ui-customization 21 | manual/localization 22 | manual/architecture 23 | manual/apis 24 | manual/owb-transition 25 | code/pywb 26 | 27 | 28 | Indices and tables 29 | ================== 30 | 31 | * :ref:`genindex` 32 | * :ref:`modindex` 33 | * :ref:`search` 34 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=python -msphinx 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | set SPHINXPROJ=pywb 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The Sphinx module was not found. Make sure you have Sphinx installed, 20 | echo.then set the SPHINXBUILD environment variable to point to the full 21 | echo.path of the 'sphinx-build' executable. Alternatively you may add the 22 | echo.Sphinx directory to PATH. 23 | echo. 24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /docs/manual/apis.rst: -------------------------------------------------------------------------------- 1 | APIs 2 | ==== 3 | 4 | pywb supports the following APIs: 5 | 6 | .. 
toctree:: 7 | 8 | cdxserver_api 9 | memento 10 | 11 | -------------------------------------------------------------------------------- /docs/manual/architecture.rst: -------------------------------------------------------------------------------- 1 | Architecture 2 | ============ 3 | 4 | The pywb system consists of 3 distinct components: Warcserver, Recorder and Rewriter, which can be run and scaled separately. 5 | The default pywb wayback application uses Warcserver and Rewriter. If recording is enabled, the Recorder is also used. 6 | 7 | Additionally, the indexing system is used through all components, and a few command line tools encompass the pywb toolkit. 8 | 9 | 10 | .. toctree:: 11 | :maxdepth: 2 12 | 13 | warcserver 14 | recorder 15 | rewriter 16 | 17 | indexing 18 | apps 19 | 20 | -------------------------------------------------------------------------------- /docs/manual/images/vue-banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/pywb/8ea2f74517161c1fc91ca969cea6328a20f5dce6/docs/manual/images/vue-banner.png -------------------------------------------------------------------------------- /docs/manual/images/vue-cal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/pywb/8ea2f74517161c1fc91ca969cea6328a20f5dce6/docs/manual/images/vue-cal.png -------------------------------------------------------------------------------- /docs/manual/migrating-cdx.rst: -------------------------------------------------------------------------------- 1 | .. _migrating-cdx: 2 | 3 | Migrating CDX 4 | ============= 5 | 6 | If you are not using OutbackCDX, you may need to check on the format of the CDX files that you are using. 7 | 8 | Over the years, there have been many variations on the CDX (capture index) format which is used by OpenWayback and pywb to look up captures in WARC/ARC files. 
9 | 10 | When migrating CDX from OpenWayback, there are a few options. 11 | 12 | pywb currently supports: 13 | 14 | - 9 field CDX (surt-ordered) 15 | - 11 field CDX (surt-ordered) 16 | - CDXJ (surt-ordered) 17 | 18 | pywb will support the 11-field and 9-field `CDX format `_ that is also used in OpenWayback. 19 | 20 | Non-SURT ordered CDXs are not currently supported, though they may be supported in the future (see this `pending pull request `_). 21 | 22 | CDXJ Conversion 23 | --------------- 24 | 25 | The native format used by pywb is the :ref:`cdxj-index` with SURT-ordering, which uses JSON to encode the fields, allowing for more flexibility by storing most of the index in a JSON, allowing support for optional fields as needed. 26 | 27 | If your CDX are not SURT-ordered, 11 or 9 field CDX, or if there is a mix, pywb also offers a conversion utility which will convert all CDX to the pywb native CDXJ: :: 28 | 29 | wb-manager cdx-convert 30 | 31 | The converter will read the CDX files and create a corresponding .cdxj file for every cdx file. Since the conversion happens on the .cdx itself, it does not require reindexing the source WARC/ARC files and can happen fairly quickly. The converted CDXJ are guaranteed to be in the right format to work with pywb. 32 | -------------------------------------------------------------------------------- /docs/manual/owb-pywb-terms.rst: -------------------------------------------------------------------------------- 1 | OpenWayback vs pywb Terms 2 | ========================= 3 | 4 | pywb and OpenWayback use slightly different terms to describe the configuration options, as explained below. 5 | 6 | Some differences are: 7 | - The ``wayback.xml`` config file in OpenWayback is replaced with ``config.yaml`` yaml 8 | - The terms ``Access Point`` and ``Wayback Collection`` are replaced with ``Collection`` in pywb. The collection configuration represents a unique path (access point) and the data that is accessed at that path. 
9 | - The ``Resource Store`` in OpenWayback is known in pywb as the archive paths, configured under ``archive_paths`` 10 | - The ``Resource Index`` in OpenWayback is known in pywb as the index paths, configurable under ``index_paths`` 11 | - The ``Exclusions`` in OpenWayback are replaced with general :ref:`access-control` 12 | 13 | 14 | 15 | Pywb Collection Basics 16 | ---------------------- 17 | 18 | A pywb collection must consist of a minimum of three parts: the collection name, the ``index_paths`` (where to read the index), and the ``archive_paths`` (where to read the WARC files). 19 | 20 | The collection is accessed by name, so there is no distinct access point. 21 | 22 | The collections are configured in the ``config.yaml`` under the ``collections`` key: 23 | 24 | For example, a basic collection definition can be specified via: 25 | 26 | .. code:: yaml 27 | 28 | collections: 29 | wayback: 30 | index_paths: /archive/cdx/ 31 | archive_paths: /archive/storage/warcs/ 32 | 33 | 34 | Pywb also supports a convention-based directory structure. Collections created in this structure can be detected automatically 35 | and need not be specified in the ``config.yaml``. This structure is designed for smaller collections that are all stored locally in a subdirectory. 36 | 37 | See the :ref:`dir_structure` for the default pywb directory structure. 38 | 39 | However, for importing existing collections from OpenWayback, it is probably easier to specify the existing paths as shown above. 40 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /docs/manual/owb-to-pywb-exclusions.rst: -------------------------------------------------------------------------------- 1 | Migrating Exclusion Rules 2 | ========================= 3 | 4 | pywb includes a new :ref:`access-control` system, which allows granular allow/block/exclude access control rules on paths and subpaths.
5 | 6 | The rules are configured in .aclj files, and a command-line utility exists to import OpenWayback exclusions 7 | into the pywb ACLJ format. 8 | 9 | For example, given an OpenWayback exclusion list configuration for a static file: 10 | 11 | .. code:: xml 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | The exclusions file can be converted to an .aclj file by running: :: 20 | 21 | wb-manager acl importtxt /archive/exclusions.aclj /archive/exclusions.txt exclude 22 | 23 | 24 | Then, in the pywb config, specify: 25 | 26 | .. code:: yaml 27 | 28 | collections: 29 | wayback: 30 | index_paths: ... 31 | archive_paths: ... 32 | acl_paths: /archive/exclusions.aclj 33 | 34 | 35 | It is possible to specify multiple access control files, which will all be applied. 36 | 37 | Using ``block`` instead of ``exclude`` will result in pywb returning a 451 error, indicating that URLs are in the index but blocked. 38 | 39 | 40 | CLI Tool 41 | -------- 42 | 43 | After exclusions have been imported, it is recommended to use ``wb-manager acl`` command-line tool for managing exclusions: 44 | 45 | 46 | To add an exclusion, run: :: 47 | 48 | wb-manager acl add /archive/exclusions.aclj http://httpbin.org/anything/something exclude 49 | 50 | To remove an exclusion, run: :: 51 | 52 | wb-manager acl remove /archive/exclusions.aclj http://httpbin.org/anything/something 53 | 54 | 55 | For more options, see the full :ref:`access-control` documentation or run ``wb-manager acl --help``. 56 | 57 | 58 | Not Yet Supported 59 | ----------------- 60 | 61 | Some OpenWayback exclusion options are not yet supported in pywb. 
62 | The following are not yet supported in the access control system: 63 | 64 | - Exclusions/Access Control By specific date range 65 | - Regex based exclusions 66 | - Date Range Embargo on All URLs 67 | - Robots.txt-based exclusions 68 | 69 | -------------------------------------------------------------------------------- /docs/manual/owb-transition.rst: -------------------------------------------------------------------------------- 1 | .. _transition-openwayback: 2 | 3 | OpenWayback Transition Guide 4 | ============================ 5 | 6 | This guide provides guidelines for transitioning from OpenWayback to pywb, 7 | with additional recommendations. The main recommendation is to run pywb along 8 | with OutbackCDX and nginx, and this configuration is covered below, along with additional options. 9 | 10 | 11 | .. toctree:: 12 | :maxdepth: 2 13 | 14 | owb-pywb-terms 15 | outbackcdx 16 | migrating-cdx 17 | owb-to-pywb-config 18 | owb-to-pywb-exclusions 19 | owb-to-pywb-deploy 20 | 21 | 22 | -------------------------------------------------------------------------------- /docs/manual/recorder.rst: -------------------------------------------------------------------------------- 1 | .. _recorder: 2 | 3 | Recorder 4 | ======== 5 | 6 | The recorder component acts as a proxy component, intercepting requests to and responses from the :ref:`warcserver` and recording them 7 | to a WARC file on disk. 8 | 9 | The recorder uses the :class:`pywb.recorder.multifilewarcwriter.MultiFileWARCWriter` which extends the base :class:`warcio.warcwriter.WARCWriter` from :mod:`warcio` and provides support for: 10 | 11 | * appending to multiple WARC files at once 12 | 13 | * WARC 'rollover' based on maximum size or idle time 14 | 15 | * indexing (CDXJ) on write 16 | 17 | 18 | Many of the features of the Recorder are created for use with the Webrecorder project, although the core recorder is used to provide 19 | basic recording via the ``/record/`` endpoint.
(See: :ref:`recording-mode`) 20 | 21 | 22 | Deduplication Filters 23 | --------------------- 24 | 25 | The core recorder class provides for optional deduplication using the :class:`pywb.recorder.redisindexer.WritableRedisIndexer` class which requires Redis to store the index, and can be used to either: 26 | 27 | * write duplicate responses. 28 | 29 | * write ``revisit`` records. 30 | 31 | * ignore duplicates and don't write to WARC. 32 | 33 | 34 | Custom Filtering 35 | ---------------- 36 | 37 | The recorder also includes a filtering system to allow for not writing certain requests and responses. 38 | Filters include: 39 | 40 | * Skipping by regex applied to source (``Warcserver-Source-Coll`` header from Warcserver) 41 | 42 | * Skipping if ``Recorder-Skip: 1`` header is provided 43 | 44 | * Skipping if ``Range`` request header is provided 45 | 46 | * Filtering out certain HTTP headers, for example, http-only cookies 47 | 48 | The additional recorder functionality will be enhanced in a future version. 49 | 50 | For more detailed examples, please consult the tests in :mod:`pywb.recorder.test.test_recorder` 51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /docs/manual/ui-customization.rst: -------------------------------------------------------------------------------- 1 | UI Customization 2 | ================ 3 | 4 | .. 
def get_test_dir():
    """Return the path of the ``sample_archive`` test data directory.

    The path is built as ``<directory of this file>/../sample_archive``,
    using the symlink-resolved location of this file, and always ends
    with the platform path separator.
    """
    from os import path

    package_dir = path.dirname(path.realpath(__file__))
    sample_dir = path.join(package_dir, '..', 'sample_archive')
    return sample_dir + path.sep
def is_subpath(parent_path, child_path):
    """Return True if ``child_path`` is ``parent_path`` or lies beneath it.

    Both paths are made absolute (which also collapses ``..`` segments)
    before they are compared, so ``parent/../x`` is not a subpath of
    ``parent``.
    """
    parent = os.path.abspath(parent_path)
    child = os.path.abspath(child_path)
    try:
        return os.path.commonpath([parent, child]) == parent
    except ValueError:
        # commonpath() raises when the paths share no common root at all
        # (e.g. different drives on Windows): that is never a subpath.
        return False


#=================================================================
# Static Content Handler
#=================================================================
class StaticHandler(object):
    """Serve files from a static directory as binary ``WbResponse`` streams."""

    # Chunk size used when streaming a file without ``wsgi.file_wrapper``
    BLOCK_SIZE = 16384

    def __init__(self, static_path):
        """Remember the default static directory and set up the file loader."""
        mimetypes.init()

        self.static_path = static_path
        self.block_loader = LocalFileLoader()

    def __call__(self, environ, url_str):
        """Return a streaming response for the static file named by ``url_str``.

        The query string is dropped and a trailing ``/`` is mapped to
        ``index.html``. If ``environ['pywb.static_dir']`` is set and contains
        the file, it is served from there; otherwise ``self.static_path`` is
        used.

        :param dict environ: WSGI environ of the current request
        :param str url_str: requested path, relative to the static directory
        :raises NotFoundException: if the path resolves outside the static
            directory, or the file cannot be loaded
        """
        url = url_str.split('?')[0]

        if url.endswith('/'):
            url += 'index.html'

        full_path = None
        env_static_dir = environ.get('pywb.static_dir')

        if env_static_dir:
            full_path = os.path.join(env_static_dir, url)

            # Prevent path traversal
            if not is_subpath(env_static_dir, full_path):
                raise NotFoundException('Requested a static file outside of static_dir')

            # Not present in the per-request dir: fall back to the default one
            if not os.path.isfile(full_path):
                full_path = None

        if not full_path:
            full_path = os.path.join(self.static_path, url)

            # Prevent path traversal
            if not is_subpath(self.static_path, full_path):
                raise NotFoundException('Requested a static file outside of static_dir')

        try:
            data = self.block_loader.load(full_path)

            # Seek to the end to learn the size, then rewind for reading
            data.seek(0, 2)
            size = data.tell()
            data.seek(0)
            headers = [('Content-Length', str(size))]

            reader = None

            if 'wsgi.file_wrapper' in environ:
                try:
                    reader = environ['wsgi.file_wrapper'](data)
                except Exception:
                    # Best effort only: fall back to the plain chunked reader
                    # below, but let SystemExit/KeyboardInterrupt propagate.
                    pass

            if not reader:
                # Stream in fixed-size chunks instead of loading the whole
                # file into memory with a single read()
                reader = iter(lambda: data.read(self.BLOCK_SIZE), b'')

            content_type = 'application/octet-stream'

            guessed = mimetypes.guess_type(full_path)
            if guessed[0]:
                content_type = guessed[0]

            return WbResponse.bin_stream(reader,
                                         content_type=content_type,
                                         headers=headers)

        except IOError:
            raise NotFoundException('Static File Not Found: ' +
                                    url_str)
WarcServer 3 | 4 | application = WarcServer(custom_config={'collections': {'live': '$live'}}) 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /pywb/apps/wayback.py: -------------------------------------------------------------------------------- 1 | from gevent.monkey import patch_all; patch_all() 2 | from pywb.apps.frontendapp import FrontEndApp 3 | 4 | application = FrontEndApp() 5 | 6 | 7 | -------------------------------------------------------------------------------- /pywb/default_config.yaml: -------------------------------------------------------------------------------- 1 | collections_root: collections 2 | 3 | # Per-Collection Paths 4 | archive_paths: archive 5 | index_paths: indexes 6 | acl_paths: acl 7 | static_path: static 8 | 9 | default_access: allow 10 | 11 | templates_dir: templates 12 | 13 | # Template HTML 14 | banner_html: banner.html 15 | custom_banner_html: custom_banner.html 16 | head_insert_html: head_insert.html 17 | frame_insert_html: frame_insert.html 18 | 19 | base_html: base.html 20 | header_html: header.html 21 | footer_html: footer.html 22 | head_html: head.html 23 | 24 | query_html: query.html 25 | search_html: search.html 26 | not_found_html: not_found.html 27 | 28 | home_html: index.html 29 | error_html: error.html 30 | 31 | proxy_cert_download_html: proxy_cert_download.html 32 | proxy_select_html: proxy_select.html 33 | 34 | # Info JSON 35 | info_json: collinfo.json 36 | 37 | # HTML Templates List 38 | html_templates: 39 | - banner_html 40 | - custom_banner_html 41 | - head_insert_html 42 | - frame_insert_html 43 | 44 | - query_html 45 | - search_html 46 | - not_found_html 47 | 48 | - home_html 49 | 50 | - base_html 51 | - header_html 52 | - head_html 53 | - footer_html 54 | 55 | - error_html 56 | - proxy_cert_download_html 57 | - proxy_select_html 58 | 59 | # Other Settings 60 | enable_memento: true 61 | 62 | rules_config: pkg://pywb/rules.yaml 63 | 64 | 65 | 
class MigrateCDX(object):
    """Locate ``.cdx`` index files under a directory and convert them to ``.cdxj``."""

    def __init__(self, dir_):
        """Remember the root directory that will be searched for ``.cdx`` files."""
        self.cdx_dir = dir_

    def iter_cdx_files(self):
        """Yield the full path of every file ending in ``.cdx`` below ``cdx_dir``."""
        for parent, _subdirs, names in os.walk(self.cdx_dir):
            for name in names:
                if not name.endswith('.cdx'):
                    continue
                yield os.path.join(parent, name)

    def count_cdx(self):
        """Return how many ``.cdx`` files ``iter_cdx_files()`` finds."""
        return sum(1 for _path in self.iter_cdx_files())

    def convert_to_cdxj(self):
        """Rewrite each ``.cdx`` file as ``<name>.cdxj`` and delete the original.

        Output is first written to ``<name>.cdxj.tmp`` and only moved into
        place once the whole source file has been processed.
        """
        writer = CDXJ()
        for src_path in self.iter_cdx_files():
            dest_path = src_path + 'j'
            tmp_path = dest_path + '.tmp'

            print('Converting {0} -> {1}'.format(src_path, dest_path))

            with open(tmp_path, 'w+') as out, open(src_path, 'rb') as fh:
                for raw in fh:
                    # lines starting with ' CDX' are skipped
                    # (presumably the CDX format header line)
                    if raw.startswith(b' CDX'):
                        continue

                    entry = CDXObject(raw)
                    # recompute the url key from the original url
                    entry[URLKEY] = canonicalize(entry[ORIGINAL])
                    writer.write_cdx_line(out, entry, entry['filename'])

            shutil.move(tmp_path, dest_path)
            os.remove(src_path)
class HTMLInsertOnlyRewriter(StreamingRewriter):
    """Streaming rewriter that only splices ``head_insert`` into the text.

    The insert goes directly in front of the first ``<`` that opens a tag
    whose name does not start with ``html`` or ``head``. Nothing else in the
    content is modified.
    """
    NOT_HEAD_REGEX = re.compile(r'(<\s*\b)(?!(html|head))', re.I)

    XML_HEADER = re.compile(r'<\?xml.*\?>')

    def __init__(self, url_rewriter, **kwargs):
        """Set up the rewriter; the ``head_insert`` keyword argument is required."""
        super(HTMLInsertOnlyRewriter, self).__init__(url_rewriter, False)
        self.head_insert = kwargs['head_insert']

        # ``done``: no further insertion will happen
        # ``first``: the next rewrite() call sees the first chunk
        self.done = False
        self.first = True

    def rewrite(self, string):
        """Return ``string``, with ``head_insert`` added if the insertion point is in it."""
        if self.first:
            # an ajax response whose first chunk has an XML declaration is left untouched
            opts = self.url_rewriter.rewrite_opts
            if opts.get('is_ajax') and self.XML_HEADER.search(string):
                self.done = True

            self.first = False

        if self.done:
            return string

        found = self.NOT_HEAD_REGEX.search(string)
        if not found:
            return string

        pos = found.start()
        merged = string[:pos] + self.head_insert + string[pos:]
        self.done = True
        return merged

    def final_read(self):
        """Return ``head_insert`` if it was never inserted, otherwise an empty string."""
        if self.done:
            return ''
        return self.head_insert
# Experimental: not fully tested
class RewriteAMF(BufferedRewriter): #pragma: no cover
    """Buffered rewriter that decodes and re-encodes a stream with ``pyamf.remoting``."""

    def rewrite_stream(self, stream, rwinfo):
        """Return a stream holding the re-encoded remoting data.

        If ``pywb.inputdata`` is present in the rewrite options, each decoded
        body is paired with the input body at the same position: its
        ``correlationId`` is set to that input body's ``messageId`` and it is
        stored under the input body's first element.

        This is best-effort: on any exception (including a missing ``pyamf``)
        the traceback is printed and the original stream is rewound and
        returned instead.
        """
        try:
            from pyamf import remoting

            # drain the source stream completely into memory
            payload = BytesIO()
            chunk = stream.read()
            while chunk:
                payload.write(chunk)
                chunk = stream.read()

            payload.seek(0)
            decoded = remoting.decode(payload)

            # TODO: revisit this
            inputdata = rwinfo.url_rewriter.rewrite_opts.get('pywb.inputdata')

            if inputdata:
                remapped = []

                for in_body, out_body in zip(inputdata.bodies, decoded.bodies):
                    out_body[1].body.correlationId = in_body[1].body[0].messageId
                    remapped.append((in_body[0], out_body[1]))

                decoded.bodies = remapped

            return BytesIO(remoting.encode(decoded).getvalue())

        except Exception as e:
            import traceback
            traceback.print_exc()
            print(e)

            stream.seek(0)
            return stream
class RewriteHLS(BufferedRewriter):
    """Reduce an HLS playlist to a single ``#EXT-X-STREAM-INF`` variant.

    Every variant entry (the tag line plus the line following it) is removed
    except the best one that fits the limits returned by
    ``_get_adaptive_metadata()``.
    """
    # Raw strings: ``\d`` in a plain string literal is an invalid escape sequence
    EXT_INF = re.compile(r'#EXT-X-STREAM-INF:(?:.*[,])?BANDWIDTH=([\d]+)')
    EXT_RESOLUTION = re.compile(r'RESOLUTION=([\d]+)x([\d]+)')

    def rewrite_stream(self, stream, rwinfo):
        """Return a ``BytesIO`` with the filtered playlist.

        :param stream: readable stream whose content is UTF-8 playlist text
        :param rwinfo: rewrite info passed through to ``_get_adaptive_metadata()``
        :return: ``BytesIO`` positioned at the start of the rewritten playlist
        """
        max_resolution, max_bandwidth = self._get_adaptive_metadata(rwinfo)

        buff = stream.read()

        lines = buff.decode('utf-8').split('\n')
        indexes = []
        best_index = None

        best_bandwidth = 0
        best_resolution = 0

        for count, line in enumerate(lines):
            m = self.EXT_INF.match(line)
            if not m:
                continue

            indexes.append(count)
            curr_bandwidth = int(m.group(1))

            # resolution as a pixel count (width * height), 0 if not declared
            m2 = self.EXT_RESOLUTION.search(line)
            if m2:
                curr_resolution = int(m2.group(1)) * int(m2.group(2))
            else:
                curr_resolution = 0

            if max_resolution and curr_resolution:
                # a resolution limit is set and this entry declares one:
                # prefer the largest resolution within the limit
                if best_resolution < curr_resolution <= max_resolution:
                    best_resolution = curr_resolution
                    best_bandwidth = curr_bandwidth
                    best_index = count

            # otherwise choose by bandwidth
            # (assumes max_bandwidth is numeric -- TODO confirm)
            elif best_bandwidth < curr_bandwidth <= max_bandwidth:
                best_resolution = curr_resolution
                best_bandwidth = curr_bandwidth
                best_index = count

        # NOTE(review): when no variant qualifies, best_index stays None and
        # every variant entry is removed below -- confirm this is intended.
        if indexes and best_index is not None:
            indexes.remove(best_index)

        for index in reversed(indexes):
            # Remove the tag line and the line after it. Slice deletion does
            # not raise if the tag is the last line of a truncated playlist.
            del lines[index:index + 2]

        return BytesIO('\n'.join(lines).encode('utf-8'))
class JSWorkerRewriter(StreamingRewriter):
    """Rewriter for web worker and service worker scripts.

    The only rewriting performed is injecting the wombatWorkers.js
    initialization snippet in front of the script.
    """

    def __init__(self, url_rewriter, align_to_line=True, first_buff=''):
        """Initialize a new JSWorkerRewriter

        :param UrlRewriter url_rewriter: The url rewriter for this rewrite
        :param bool align_to_line: Should the response stream be aligned to line boundaries
        :param str first_buff: The first string to be added to the rewrite
        :rtype: None
        """
        super(JSWorkerRewriter, self).__init__(url_rewriter, align_to_line, first_buff)
        wb_url = self.url_rewriter.wburl
        if wb_url.mod not in WORKER_MODS:
            # not a worker modifier: keep the first_buff that was passed in
            return

        worker_script = self.url_rewriter.pywb_static_prefix + "wombatWorkers.js"
        full_prefix = self.url_rewriter.full_prefix
        # the injected snippet replaces first_buff
        self.first_buff = INJECT % (
            worker_script,
            INIT % (full_prefix, full_prefix + 'wkrf_', wb_url.url),
        )
def parse(html_text, is_ajax=False):
    """Run ``html_text`` through a fresh ``HTMLInsertOnlyRewriter``.

    Returns the rewritten text followed by whatever ``final_read()`` yields.
    ``is_ajax`` sets the ``is_ajax`` rewrite option on the url rewriter.
    """
    url_rw = UrlRewriter('20131226101010/https://example.com/some/path.html', '/web/')

    if is_ajax:
        url_rw.rewrite_opts['is_ajax'] = True

    insert_rw = HTMLInsertOnlyRewriter(url_rw, head_insert='')

    rewritten = insert_rw.rewrite(html_text)
    return rewritten + insert_rw.final_read()
-------------------------------------------------------------------------------- /pywb/static/css/query.css: -------------------------------------------------------------------------------- 1 | .auto-overflow { 2 | overflow-y: auto; 3 | } 4 | 5 | .q-display { 6 | height: 80% !important; 7 | } 8 | 9 | .q-row { 10 | height: 90% !important; 11 | } 12 | 13 | .list-group-item.list-group-item-action.active { 14 | background-color: transparent; 15 | color: #007bff; 16 | } 17 | 18 | .long-text { 19 | word-wrap: break-word; 20 | } 21 | 22 | .inherit-height { 23 | height: inherit; 24 | } 25 | 26 | .filter-list { 27 | height: 140px; 28 | max-height: 140px; 29 | overflow-y: scroll 30 | } 31 | 32 | .show-optional-bad-input { 33 | display: block; 34 | } 35 | 36 | -------------------------------------------------------------------------------- /pywb/static/flowplayer/flowplayer-3.2.18.swf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/pywb/8ea2f74517161c1fc91ca969cea6328a20f5dce6/pywb/static/flowplayer/flowplayer-3.2.18.swf -------------------------------------------------------------------------------- /pywb/static/flowplayer/flowplayer.audio-3.2.11.swf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/pywb/8ea2f74517161c1fc91ca969cea6328a20f5dce6/pywb/static/flowplayer/flowplayer.audio-3.2.11.swf -------------------------------------------------------------------------------- /pywb/static/flowplayer/flowplayer.controls-3.2.16.swf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/pywb/8ea2f74517161c1fc91ca969cea6328a20f5dce6/pywb/static/flowplayer/flowplayer.controls-3.2.16.swf -------------------------------------------------------------------------------- /pywb/static/flowplayer/flowplayer.pseudostreaming-3.2.13.swf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/pywb/8ea2f74517161c1fc91ca969cea6328a20f5dce6/pywb/static/flowplayer/flowplayer.pseudostreaming-3.2.13.swf -------------------------------------------------------------------------------- /pywb/static/fonts/font-awesome/fa-brands-400.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/pywb/8ea2f74517161c1fc91ca969cea6328a20f5dce6/pywb/static/fonts/font-awesome/fa-brands-400.eot -------------------------------------------------------------------------------- /pywb/static/fonts/font-awesome/fa-brands-400.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/pywb/8ea2f74517161c1fc91ca969cea6328a20f5dce6/pywb/static/fonts/font-awesome/fa-brands-400.ttf -------------------------------------------------------------------------------- /pywb/static/fonts/font-awesome/fa-brands-400.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/pywb/8ea2f74517161c1fc91ca969cea6328a20f5dce6/pywb/static/fonts/font-awesome/fa-brands-400.woff -------------------------------------------------------------------------------- /pywb/static/fonts/font-awesome/fa-brands-400.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/pywb/8ea2f74517161c1fc91ca969cea6328a20f5dce6/pywb/static/fonts/font-awesome/fa-brands-400.woff2 -------------------------------------------------------------------------------- /pywb/static/fonts/font-awesome/fa-regular-400.eot: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/webrecorder/pywb/8ea2f74517161c1fc91ca969cea6328a20f5dce6/pywb/static/fonts/font-awesome/fa-regular-400.eot -------------------------------------------------------------------------------- /pywb/static/fonts/font-awesome/fa-regular-400.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/pywb/8ea2f74517161c1fc91ca969cea6328a20f5dce6/pywb/static/fonts/font-awesome/fa-regular-400.ttf -------------------------------------------------------------------------------- /pywb/static/fonts/font-awesome/fa-regular-400.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/pywb/8ea2f74517161c1fc91ca969cea6328a20f5dce6/pywb/static/fonts/font-awesome/fa-regular-400.woff -------------------------------------------------------------------------------- /pywb/static/fonts/font-awesome/fa-regular-400.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/pywb/8ea2f74517161c1fc91ca969cea6328a20f5dce6/pywb/static/fonts/font-awesome/fa-regular-400.woff2 -------------------------------------------------------------------------------- /pywb/static/fonts/font-awesome/fa-solid-900.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/pywb/8ea2f74517161c1fc91ca969cea6328a20f5dce6/pywb/static/fonts/font-awesome/fa-solid-900.eot -------------------------------------------------------------------------------- /pywb/static/fonts/font-awesome/fa-solid-900.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/pywb/8ea2f74517161c1fc91ca969cea6328a20f5dce6/pywb/static/fonts/font-awesome/fa-solid-900.ttf 
-------------------------------------------------------------------------------- /pywb/static/fonts/font-awesome/fa-solid-900.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/pywb/8ea2f74517161c1fc91ca969cea6328a20f5dce6/pywb/static/fonts/font-awesome/fa-solid-900.woff -------------------------------------------------------------------------------- /pywb/static/fonts/font-awesome/fa-solid-900.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/pywb/8ea2f74517161c1fc91ca969cea6328a20f5dce6/pywb/static/fonts/font-awesome/fa-solid-900.woff2 -------------------------------------------------------------------------------- /pywb/static/fonts/glyphicons-halflings-regular.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/pywb/8ea2f74517161c1fc91ca969cea6328a20f5dce6/pywb/static/fonts/glyphicons-halflings-regular.eot -------------------------------------------------------------------------------- /pywb/static/fonts/glyphicons-halflings-regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/pywb/8ea2f74517161c1fc91ca969cea6328a20f5dce6/pywb/static/fonts/glyphicons-halflings-regular.ttf -------------------------------------------------------------------------------- /pywb/static/fonts/glyphicons-halflings-regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/pywb/8ea2f74517161c1fc91ca969cea6328a20f5dce6/pywb/static/fonts/glyphicons-halflings-regular.woff -------------------------------------------------------------------------------- /pywb/static/fonts/glyphicons-halflings-regular.woff2: 
/**
 * Sets up replay through a service worker (`sw.js`) and loads the
 * requested url into the `#replay_iframe` element.
 */
class WabacReplay
{
  /**
   * @param {string} prefix - replay prefix, used to build replay and history urls
   * @param {string} url - url to load initially
   * @param {string} ts - timestamp to load initially
   * @param {string} staticPrefix - prefix under which `sw.js` is served
   * @param {string} coll - collection name; "$root" marks the root collection
   * @param {string} swScopePrefix - service worker scope, without trailing slash
   */
  constructor(prefix, url, ts, staticPrefix, coll, swScopePrefix) {
    this.prefix = prefix;
    this.url = url;
    this.ts = ts;
    this.staticPrefix = staticPrefix;
    this.collName = coll;
    this.isRoot = coll === "$root";
    this.swScope = swScopePrefix;
    this.adblockUrl = undefined;

    this.queryParams = {"replayPrefix": ""};
    if (this.isRoot) {
      this.queryParams["root"] = "$root";
    }
  }

  /**
   * Register the service worker, send it the "addColl" message, wait for
   * its "collAdded" reply and then load the initial url.
   */
  async init() {
    const scope = this.swScope + "/";

    await navigator.serviceWorker.register(
      `${this.staticPrefix}/sw.js?` + new URLSearchParams(this.queryParams).toString(),
      { scope },
    );

    // resolves once the service worker reports that the collection was added
    const inited = new Promise((resolve) => {
      navigator.serviceWorker.addEventListener("message", (event) => {
        if (event.data.msg_type === "collAdded") {
          // the replay is ready to be loaded when this message is received
          resolve();
        }
      });
    });

    const proxyPrefix = "";

    const msg = {
      msg_type: "addColl",
      name: this.collName,
      type: "live",
      root: this.isRoot,
      file: {"sourceUrl": `proxy:${proxyPrefix}`},
      skipExisting: true,
      extraConfig: {
        prefix: proxyPrefix,
        isLive: false,
        baseUrl: this.prefix,
        baseUrlAppendReplay: true,
        archivePrefix: this.prefix,
        archiveMod: "ir_",
        adblockUrl: this.adblockUrl,
        // this key used to appear twice (false, then true); only the last
        // value took effect, so that value is the one kept
        noPostToGet: true,
      },
    };

    if (!navigator.serviceWorker.controller) {
      // no controller yet: send the message once one takes over
      navigator.serviceWorker.addEventListener("controllerchange", () => {
        navigator.serviceWorker.controller.postMessage(msg);
      });
    } else {
      navigator.serviceWorker.controller.postMessage(msg);
    }

    window.addEventListener("message", event => {
      let data = event.data;
      if (window.WBBanner) {
        window.WBBanner.onMessage(event);
      }
      // keep the address bar in sync with what the replay frame reports
      if (data.wb_type === "load" || data.wb_type === "replace-url") {
        history.replaceState({}, data.title, this.prefix + data.ts + '/' + data.url);
      }
    });

    await inited;

    this.load_url(this.url, this.ts);
  }

  // called by the Vue banner when the timeline is clicked
  load_url(url, ts) {
    const iframe = document.querySelector('#replay_iframe');
    iframe.src = `${this.prefix}${ts}mp_/${url}`;
  }
}
height: 12px; 3 | overflow: visible; 4 | width: 12px 5 | } 6 | ::-webkit-scrollbar-button { 7 | height: 0; 8 | width: 0 9 | } 10 | ::-webkit-scrollbar-track { 11 | background-clip: padding-box; 12 | border: solid transparent; 13 | border-width: 0 0 0 0px 14 | } 15 | ::-webkit-scrollbar-track:horizontal { 16 | border-width: 4px 0 0 17 | } 18 | ::-webkit-scrollbar-track:hover { 19 | background-color: rgba(0, 0, 0, .05); 20 | box-shadow: inset 1px 0 0 rgba(0, 0, 0, .1) 21 | } 22 | ::-webkit-scrollbar-track:horizontal:hover { 23 | box-shadow: inset 0 1px 0 rgba(0, 0, 0, .1) 24 | } 25 | ::-webkit-scrollbar-track:active { 26 | background-color: rgba(0, 0, 0, .05); 27 | box-shadow: inset 1px 0 0 rgba(0, 0, 0, .14), inset -1px 0 0 rgba(0, 0, 0, .07) 28 | } 29 | ::-webkit-scrollbar-track:horizontal:active { 30 | box-shadow: inset 0 1px 0 rgba(0, 0, 0, .14), inset 0 -1px 0 rgba(0, 0, 0, .07) 31 | } 32 | ::-webkit-scrollbar-thumb { 33 | background-color: rgba(0, 0, 0, .2); 34 | background-clip: padding-box; 35 | border: solid transparent; 36 | border-width: 1px 1px 1px 2px; 37 | min-height: 28px; 38 | padding: 100px 0 0; 39 | box-shadow: inset 1px 1px 0 rgba(0, 0, 0, .1), inset 0 -1px 0 rgba(0, 0, 0, .07) 40 | } 41 | ::-webkit-scrollbar-thumb:horizontal { 42 | border-width: 6px 1px 1px; 43 | padding: 0 0 0 100px; 44 | box-shadow: inset 1px 1px 0 rgba(0, 0, 0, .1), inset -1px 0 0 rgba(0, 0, 0, .07) 45 | } 46 | ::-webkit-scrollbar-thumb:hover { 47 | background-color: rgba(0, 0, 0, .4); 48 | box-shadow: inset 1px 1px 1px rgba(0, 0, 0, .25) 49 | } 50 | ::-webkit-scrollbar-thumb:active { 51 | background-color: rgba(0, 0, 0, 0.5); 52 | box-shadow: inset 1px 1px 3px rgba(0, 0, 0, 0.35) 53 | } 54 | ::-webkit-scrollbar-corner { 55 | background: transparent 56 | } 57 | 58 | /* 59 | body::-webkit-scrollbar-track-piece { 60 | background-clip: padding-box; 61 | background-color: #f5f5f5; 62 | border: solid #fff; 63 | border-width: 0 0 0 3px; 64 | box-shadow: inset 1px 0 0 rgba(0, 0, 0, 
(function() {
  var loaded = false;

  // Load the transclusions once, when the document reaches "complete"
  document.addEventListener("readystatechange", function() {
    if (document.readyState !== "complete" || loaded) {
      return;
    }
    loadTransclusions();
    loaded = true;
  });

  // Fetch the embeds description for the current page and apply it.
  // Any failure (network, invalid JSON, error while applying) is ignored.
  function loadTransclusions() {
    var embedsUrl = wbinfo.prefix + wbinfo.timestamp + "id_/urn:embeds:" + wbinfo.url;

    window.fetch(embedsUrl)
      .then(function(response) {
        return response.json();
      })
      .then(function(json) {
        addTransclusions(json);
      })
      .catch(function(err) {
      });
  }

  // Replace the first element matching json.selector (default: "object, embed")
  // with a <video> or <audio> element built from json.formats.
  function addTransclusions(json) {
    var target = document.querySelector(json.selector || "object, embed");
    if (!target) {
      console.warn("No target to add video/audio transclusions");
      return;
    }

    var parentElem = target.parentElement;

    if (!json.formats) {
      console.warn("No formats to add!");
      return;
    }

    // audio only if every format is either skipped as a source or has an audio/* mime
    var audioOnly;

    try {
      audioOnly = json.formats.every(function(fmt) {
        return fmt.skip_as_source || (fmt && fmt.mime && fmt.mime.startsWith("audio/"));
      });
    } catch (e) {
      audioOnly = false;
    }

    var media = document.createElement(audioOnly ? "audio" : "video");
    media.setAttribute("controls", "true");
    media.setAttribute("style", "width: 100%; height: 100%");

    media.oncanplaythrough = function() {
      if (!media.hasStarted) {
        media.hasStarted = true;
      }
    }

    json.formats.forEach(function(fmt) {
      if (fmt.skip_as_source) {
        return;
      }

      // the poster image is set as an attribute, not added as a source
      if (fmt.name === "png_poster") {
        media.setAttribute("poster", fmt.url);
        return;
      }

      var source = document.createElement("source");
      source.src = fmt.url;
      if (fmt.mime) {
        source.type = fmt.mime;
      }
      media.appendChild(source);
    });

    parentElem.replaceChild(media, target);
  }

})();
title %}{% endblock %} 8 | 9 | {% include 'bootstrap_jquery.html' ignore missing %} 10 | 11 | {% block head %} 12 | {% include 'head.html' ignore missing %} 13 | {% endblock %} 14 | 15 | 16 | 17 | {% block header %} 18 | {% include 'header.html' ignore missing %} 19 | {% endblock %} 20 | 21 |
22 | {% block body %} 23 | {% endblock %} 24 |
25 | 26 | {% block footer %} 27 | {% include 'footer.html' ignore missing %} 28 | {% endblock footer %} 29 | 30 | 31 | -------------------------------------------------------------------------------- /pywb/templates/bootstrap_jquery.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /pywb/templates/collinfo.json: -------------------------------------------------------------------------------- 1 | [ 2 | {% for route in routes %} 3 | {% if route | is_wb_handler %} 4 | {{ ',' if notfirst else '' }} 5 | { 6 | "id": "{{ route.path }}", 7 | "name": "{{ route.user_metadata.title if route.user_metadata.title else route.path }}", 8 | "timegate": "{{ host }}/{{route.path}}/", 9 | "timemap": "{{ host }}/{{route.path}}/timemap/*/" 10 | 11 | } 12 | {% set notfirst = true %} 13 | {% endif %} 14 | {% endfor %} 15 | ] 16 | -------------------------------------------------------------------------------- /pywb/templates/custom_banner.html: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /pywb/templates/error.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | {% block title %}{{ _('Pywb Error') }}{% endblock %} 3 | {% block body %} 4 |
5 |
6 |

{{ _('Pywb Error') }}

7 |
8 |
9 |
10 | {% if err_status == 451 %} 11 |

{% trans %}Access Blocked to {{ err_msg }}{% endtrans %}

12 | 13 | {% elif err_status == 404 and err_details == 'coll_not_found' %} 14 |

{% trans %}Collection not found: {{ err_msg }}{% endtrans %}

15 | 16 |

{{ _('See list of valid collections') }}

17 | 18 | {% elif err_status == 404 and err_details == 'static_file_not_found' %} 19 |

{% trans %}Static file not found: {{ err_msg }}{% endtrans %}

20 | 21 | {% else %} 22 | 23 |

{{ err_msg }}

24 | 25 | {% if err_details %} 26 |

{% trans %}Error Details:{% endtrans %}

27 |
{{ err_details }}
28 | {% endif %} 29 | {% endif %} 30 |
31 |
32 |
33 | {% endblock %} 34 | -------------------------------------------------------------------------------- /pywb/templates/footer.html: -------------------------------------------------------------------------------- 1 | {# place content to be added at the very end of the tag in this file below #} 2 | 3 | -------------------------------------------------------------------------------- /pywb/templates/frame_insert.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 15 | 16 | 17 | {% if client_side_replay %} 18 | 19 | {% endif %} 20 | 21 | {% autoescape false %} 22 | 23 | {{ banner_html }} 24 | 25 | {% include 'vue_loc.html' %} 26 | 27 | 28 | 29 | 30 |
31 | 48 | 49 |
50 | 51 |
52 | 53 | 69 | 70 | 71 | {% endautoescape %} 72 | 73 | -------------------------------------------------------------------------------- /pywb/templates/head.html: -------------------------------------------------------------------------------- 1 | {# place optional content to be injected into the of every page in this file below #} 2 | -------------------------------------------------------------------------------- /pywb/templates/head_insert.html: -------------------------------------------------------------------------------- 1 | {% autoescape false %} 2 | 3 | 4 | 32 | {% if env.pywb_proxy_magic %} 33 | {% set whichWombat = 'wombatProxyMode.js' %} 34 | {% else %} 35 | {% set whichWombat = 'wombat.js' %} 36 | {% endif %} 37 | {% if not wb_url.is_banner_only or (env.pywb_proxy_magic and (config.enable_auto_fetch or config.proxy.enable_wombat)) %} 38 | 39 | 51 | {% else %} 52 | 55 | {% endif %} 56 | 57 | {% if config.enable_flash_video_rewrite or config.transclusions_version == 1 %} 58 | 59 | 60 | {% elif config.transclusions_version == 2 %} 61 | 62 | 63 | {% endif %} 64 | 65 | {% if not is_framed %} 66 | 67 | {{ custom_banner_html }} 68 | 69 | {% endif %} 70 | 71 | {% endautoescape %} 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /pywb/templates/header.html: -------------------------------------------------------------------------------- 1 | {# place content to be added at the very beginning of the tag in this file below #} 2 |
3 | {% if not err_msg and locales|length > 1 and (not ui or not ui.vue_calendar_ui) %} 4 |
5 | {{ _('Language:') }} 6 |
    7 | {% for locale in locales %} 8 |
  • {{ locale }}
  • 9 | {% endfor %} 10 |
11 |
12 | {% endif %} 13 |
14 | 15 | -------------------------------------------------------------------------------- /pywb/templates/index.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | {% block body %} 3 |
4 |
5 |

{{ _('Pywb Wayback Machine') }}

6 |

{{ _('This archive contains the following collections:') }}

7 |
8 |
9 |
    10 | {% for route in routes %} 11 |
  • 12 | {{ '/' + route }} 13 | {% if all_metadata and all_metadata[route] and all_metadata[route].title %} 14 | ({{ all_metadata[route].title }}) 15 | {% endif %} 16 |
  • 17 | {% endfor %} 18 |
19 |
20 |
21 | {% endblock %} 22 | -------------------------------------------------------------------------------- /pywb/templates/not_found.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block title %}{{ _('URL Not Found') }}{% endblock %} 4 | 5 | {% block body %} 6 |
7 |
8 |

{% trans %}URL Not Found{% endtrans %}

9 |
10 |

11 | {% trans %}The url {{ url }} could not be found in this collection.{% endtrans %} 12 |

13 | {% if wbrequest and wbrequest.env.pywb_proxy_magic and url %} 14 |

15 | 16 | {{ _('Try Different Collection') }} 17 | 18 |

19 | {% endif %} 20 |
21 | {% endblock %} 22 | 23 | -------------------------------------------------------------------------------- /pywb/templates/proxy_cert_download.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block title %}Download HTTPS Certificate For PyWb Web Archive Replay{% endblock %} 4 | 5 | {% block body %} 6 | 7 |

HTTPS Certificate For PyWb Web Archive Replay

8 | {% if not available %} 9 |

Sorry, HTTPS support is not configured for this proxy. However, the proxy should work in HTTP mode.

10 | 11 | {% else %} 12 |

Download for all platforms except Windows (or Firefox on Windows):

13 |

Download Certificate (All except Windows)

14 | 15 |

(If you see the Already Installed message, then no further action is necessary and you may start browsing!)

16 | 17 |

Download for Windows platforms (except if using Firefox. For Firefox, use the above download, even on Windows):

18 |

Download Certificate (Windows Only)

19 | {% endif %} 20 | {% endblock %} 21 | -------------------------------------------------------------------------------- /pywb/templates/proxy_select.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block title %}Pywb Proxy Collection Selector{% endblock %} 4 | 5 | {% block body %} 6 |

Pywb Proxy Collection Selector

7 | {% if coll %} 8 |

9 | Current collection is: {{ coll }} 10 |

11 | {% else %} 12 |

You have attempted to load the url {{ url }}, but there are multiple collections available.

13 | {% endif %} 14 | 15 |

Please select which collection you would like to use (You will be redirected back to {{ url }}): 16 |

17 | 18 |
    19 | {% for route in routes %} 20 | {% if route.path and route | is_wb_handler %} 21 |
  • {{ route.path }}
  • 22 | {% endif %} 23 | {% endfor %} 24 |
25 | 26 |

(Once selected, you will not be prompted again; however, you can return to this page to switch collections.)

27 | {% endblock %} 28 | -------------------------------------------------------------------------------- /pywb/utils/README.md: -------------------------------------------------------------------------------- 1 | ### pywb.utils 2 | 3 | This package contains a utils used by pywb wayback tool suite. 4 | 5 | #### Modules 6 | 7 | * [binsearch.py](binsearch.py) -- Binary search implementation over text files 8 | 9 | * [loaders.py](loaders.py) -- Loading abstraction for loading via http or local file system. 10 | 11 | * [bufferedreaders.py](bufferedreaders.py) -- Buffering wrappers for file-like object, also provide gzip decompression and 12 | de-chunking facilities. 13 | 14 | * [statusandheaders.py](statusandheaders.py) -- Represent http status line + headers and parsing them out from a stream 15 | 16 | * [timeutils.py](timeutils.py) -- Utility functions for converting between standard datetime formats 14-digit timestamp 17 | 18 | -------------------------------------------------------------------------------- /pywb/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/pywb/8ea2f74517161c1fc91ca969cea6328a20f5dce6/pywb/utils/__init__.py -------------------------------------------------------------------------------- /pywb/utils/format.py: -------------------------------------------------------------------------------- 1 | from six.moves.urllib.parse import quote, parse_qsl 2 | import string 3 | 4 | 5 | # ============================================================================ 6 | class ParamFormatter(string.Formatter): 7 | def __init__(self, params, name='', prefix='param.'): 8 | self.params = params 9 | self.prefix = prefix 10 | self.name = name 11 | 12 | def get_value(self, key, args, kwargs): 13 | # First, try the named param 'param.{name}.{key}' 14 | if self.name: 15 | named_key = self.prefix + self.name + '.' 
+ key 16 | value = self.params.get(named_key) 17 | if value is not None: 18 | return value 19 | 20 | # Then, try 'param.{key}' 21 | named_key = self.prefix + key 22 | value = self.params.get(named_key) 23 | if value is not None: 24 | return value 25 | 26 | # try in extra params as just {key} 27 | value = kwargs.get(key) 28 | if value is not None: 29 | return value 30 | 31 | # try in params as just '{key}' 32 | value = self.params.get(key, '') 33 | return value 34 | 35 | 36 | # ============================================================================= 37 | def res_template(template, params, **extra_params): 38 | formatter = params.get('_formatter') 39 | if not formatter: 40 | formatter = ParamFormatter(params) 41 | 42 | url = params.get('url', '') 43 | qi = template.find('?') 44 | if qi >= 0 and template.find('{url}') > qi: 45 | url = quote(url) 46 | 47 | res = formatter.format(template, url=url, **extra_params) 48 | 49 | return res 50 | 51 | 52 | # ============================================================================= 53 | def to_bool(val): 54 | if not val: 55 | return False 56 | 57 | if isinstance(val, str): 58 | return val.lower() not in ('0', 'false', 'f', 'off') 59 | else: 60 | return bool(val) 61 | 62 | 63 | # ============================================================================= 64 | def query_to_dict(query_str, multi=None): 65 | pairlist = parse_qsl(query_str) 66 | if not multi: 67 | return dict(pairlist) 68 | 69 | obj = {} 70 | for n, v in pairlist: 71 | if n not in multi: 72 | obj[n] = v 73 | continue 74 | 75 | # make_list 76 | if n not in obj: 77 | obj[n] = v 78 | elif isinstance(obj[n], list): 79 | obj[n].append(v) 80 | else: 81 | obj[n] = [obj[n], v] 82 | 83 | return obj 84 | 85 | 86 | -------------------------------------------------------------------------------- /pywb/version.py: -------------------------------------------------------------------------------- 1 | __version__ = '2.9.0b1' 2 | 3 | if __name__ == '__main__': 4 | 
print(__version__) 5 | -------------------------------------------------------------------------------- /pywb/vueui/.eslintrc.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | "env": { 3 | "browser": true, 4 | "es2021": true 5 | }, 6 | "extends": [ 7 | "eslint:recommended", 8 | "plugin:vue/essential" 9 | ], 10 | "parserOptions": { 11 | "ecmaVersion": 12, 12 | "sourceType": "module" 13 | }, 14 | "plugins": [ 15 | "vue" 16 | ], 17 | "rules": { 18 | "no-restricted-globals": [ 19 | 2, 20 | "event", "error" 21 | ], 22 | "indent": [ 23 | "error", 24 | 2 25 | ], 26 | "linebreak-style": [ 27 | "error", 28 | "unix" 29 | ], 30 | "quotes": [ 31 | "error", 32 | "double" 33 | ], 34 | "semi": [ 35 | "error", 36 | "always" 37 | ] 38 | } 39 | }; 40 | -------------------------------------------------------------------------------- /pywb/vueui/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "typescript", 3 | "version": "0.0.0", 4 | "private": true, 5 | "scripts": { 6 | "build": "rollup -c", 7 | "lint": "eslint ./src/*.js ./src/*.vue ./src/components/*.vue" 8 | }, 9 | "dependencies": { 10 | "vue": "^2.6.11", 11 | "vue-template-compiler": "^2.6.14" 12 | }, 13 | "devDependencies": { 14 | "@rollup/plugin-node-resolve": "^13.0.4", 15 | "eslint": "^7.32.0", 16 | "eslint-plugin-vue": "^7.17.0", 17 | "rollup": "^2.10.9", 18 | "rollup-plugin-css-only": "^3.1.0", 19 | "rollup-plugin-vue": "^5.0.0" 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /pywb/vueui/rollup.config.js: -------------------------------------------------------------------------------- 1 | import vue from "rollup-plugin-vue"; 2 | import css from "rollup-plugin-css-only"; 3 | import { nodeResolve } from "@rollup/plugin-node-resolve"; 4 | 5 | export default [ 6 | { 7 | input: "src/index.js", 8 | output: { 9 | file: "../static/vue/vueui.js", 10 | 
sourcemap: "inline", 11 | name: "VueUI", 12 | format: "iife", 13 | }, 14 | plugins: [ 15 | vue({css: true, compileTemplate: true}), 16 | css(), 17 | nodeResolve({browser: true}) 18 | ], 19 | }, 20 | ]; 21 | -------------------------------------------------------------------------------- /pywb/vueui/src/cdx-simulator/README.md: -------------------------------------------------------------------------------- 1 | # How to incorporate CDX Simulator 2 | 3 | Place following code snippets in **index.js** 4 | 5 | ## Import `CDXQueryWorkerSimulator` Mock Class 6 | 7 | It is the mock class to the main javascript built-in `Worker` class: 8 | 9 | ``import { CDXQueryWorkerSimulator } from "./cdx-simulator/cdx-simulator";`` 10 | 11 | ## Initialize `queryWorker` with Mock Class 12 | 13 | Update `const queryWorker = ...` initialization in `CDXLoader` class, `loadCDX()` method 14 | 15 | ### by replacing it 16 | 17 | ``` 18 | const queryWorker = new CDXQueryWorkerSimulator(this.staticPrefix + "/queryWorker.js"); 19 | ``` 20 | 21 | ### or by adding a conditional, so you can go back and forth between simulator and real CDX-data-loader: 22 | 23 | for example with a URL-hash flag conditional: 24 | 25 | ``` 26 | const queryWorker = new (window.location.hash.indexOf('cdx_simulate') >= 0 ? 
CDXQueryWorkerSimulator : Worker)(this.staticPrefix + "/queryWorker.js"); 27 | ``` 28 | 29 | NOTE: where if the url contains '#cdx_simulate' the mock simulator will be used; using a URL hash does not interfere with the main URL parsing of the PYWB app 30 | 31 | ## Configure Simulation 32 | 33 | Add a **local** storage (from Chrome Dev Tools > Application > Local Storage) 34 | 35 | ``` 36 | {"count":5000, "yearStart":2020, "yearEnd":2022, "fetchTime":3000} 37 | ``` 38 | 39 | where `count` is the total records, yearStart and yearEnd are self-explanatory, and `fetchTime` is how long it should take 40 | 41 | ![cdx loader config](pywb-vueui-cdx-simulator-config.jpg) -------------------------------------------------------------------------------- /pywb/vueui/src/cdx-simulator/pywb-vueui-cdx-simulator-config.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/pywb/8ea2f74517161c1fc91ca969cea6328a20f5dce6/pywb/vueui/src/cdx-simulator/pywb-vueui-cdx-simulator-config.jpg -------------------------------------------------------------------------------- /pywb/vueui/src/cdx-simulator/test.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | CDX Simulator 6 | 7 | 8 | 9 | 13 | 14 | -------------------------------------------------------------------------------- /pywb/vueui/src/components/TimelineBreadcrumbs.vue: -------------------------------------------------------------------------------- 1 | 32 | 33 | 57 | 58 | 93 | -------------------------------------------------------------------------------- /pywb/vueui/src/components/TimelineLinear.vue: -------------------------------------------------------------------------------- 1 | 22 | 23 | 56 | 57 | 91 | -------------------------------------------------------------------------------- /pywb/vueui/src/components/Tooltip.vue: 
-------------------------------------------------------------------------------- 1 | 6 | 7 | 47 | 48 | -------------------------------------------------------------------------------- /pywb/vueui/src/i18n.js: -------------------------------------------------------------------------------- 1 | export class PywbI18N { 2 | static #locale = ''; // private (can only be set here) 3 | static getLocale() { // get via public static method 4 | return PywbI18N.#locale; 5 | } 6 | static firstDayOfWeek = 1; 7 | static init = (locale, config) => { 8 | if (PywbI18N.instance) { 9 | throw new Error('cannot instantiate PywbI18N twice'); 10 | } 11 | PywbI18N.#locale = locale; 12 | PywbI18N.instance = new PywbI18N(config); 13 | let intlLocale = new Intl.Locale(PywbI18N.getLocale()); 14 | if ('weekInfo' in intlLocale) PywbI18N.firstDayOfWeek = intlLocale.weekInfo.firstDay % 7; 15 | } 16 | 17 | // PywbI18N expects from the i18n string source to receive months SHORT and LONG names in the config like this: 18 | // config.jan_short, config.jan_long, ...., config._short, config._long 19 | static monthIdPrefix = {1:"jan", 2:"feb",3:"mar",4:"apr",5:"may",6:"jun",7:"jul",8:"aug",9:"sep",10:"oct",11:"nov",12:"dec"}; 20 | 21 | /** 22 | * 23 | * @type {PywbI18N|null} 24 | */ 25 | static instance = null; 26 | 27 | constructor(config) { 28 | this.config = {...config}; // make a copy of config 29 | } 30 | 31 | // can get long (default) or short month string 32 | getMonth(id, type='long') { 33 | return decodeURIComponent(this.config[PywbI18N.monthIdPrefix[id]+'_'+type]); 34 | } 35 | // can get long (default) or short day string or initial 36 | // PywbI18N expects to receive day's initials like: 37 | // config.mon_short, config.tue_long, ...., config._short, config._long 38 | getWeekDay(id, type='long') { 39 | return decodeURIComponent(this.config[id+'_'+type]) 40 | } 41 | getWeekDays(type='long') { 42 | let weekDays = ['sun', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat']; 43 | return 
weekDays.concat(weekDays).slice(PywbI18N.firstDayOfWeek, PywbI18N.firstDayOfWeek + 7).map(d => this.getWeekDay(d, type)); 44 | } 45 | getText(id, embeddedVariableStrings=null) { 46 | const translated = decodeURIComponent(this.config[id] || id); 47 | if (embeddedVariableStrings && id.indexOf('{') >= 0 && id.indexOf('}') >= 0 ) { 48 | return translated.replace(/{(\w+)}/g, (match, stringId) => embeddedVariableStrings[stringId]); 49 | } 50 | return translated 51 | } 52 | _(id, embeddedVariableStrings=null) { 53 | return this.getText(id, embeddedVariableStrings); 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /pywb/warcserver/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/pywb/8ea2f74517161c1fc91ca969cea6328a20f5dce6/pywb/warcserver/__init__.py -------------------------------------------------------------------------------- /pywb/warcserver/amf.py: -------------------------------------------------------------------------------- 1 | import json 2 | import six 3 | from pyamf.remoting import Envelope, Request 4 | from pyamf.flex.messaging import RemotingMessage 5 | 6 | 7 | class Amf: 8 | 9 | @staticmethod 10 | def get_representation(request_object, max_calls=500): 11 | 12 | max_calls = max_calls - 1 13 | 14 | if max_calls < 0: 15 | raise Exception("Amf.get_representation maximum number of calls reached") 16 | 17 | if isinstance(request_object, Envelope): 18 | # Remove order of Request 19 | bodies = [] 20 | for i in request_object.bodies: 21 | bodies.append(Amf.get_representation(i[1], max_calls)) 22 | bodies = sorted(bodies) 23 | 24 | return "{bodies}".format(bodies="[" + ",".join(bodies) + "]") 25 | 26 | elif isinstance(request_object, Request): 27 | # Remove cyclic reference 28 | target = request_object.target 29 | body = Amf.get_representation(request_object.body, max_calls) 30 | return "{body}".format(**locals()) 31 | 32 | 
elif isinstance(request_object, RemotingMessage): 33 | # Remove random properties 34 | operation = request_object.operation 35 | body = Amf.get_representation(request_object.body, max_calls) 36 | return "{body}".format(**locals()) 37 | 38 | elif isinstance(request_object, dict): 39 | return json.dumps(request_object, sort_keys=True) 40 | 41 | elif isinstance(request_object, list): 42 | bodies = [] 43 | for i in request_object: 44 | bodies.append(Amf.get_representation(i, max_calls)) 45 | return "[" + ",".join(bodies) + "]" 46 | 47 | elif isinstance(request_object, six.string_types): 48 | return request_object 49 | 50 | elif request_object is None: 51 | return "" 52 | 53 | elif isinstance(request_object, object) and hasattr(request_object, "__dict__"): 54 | classname = request_object.__class__.__name__ 55 | properties = request_object.__dict__ 56 | bodies = dict() 57 | for prop in properties: 58 | bodies[prop] = Amf.get_representation(getattr(request_object, prop), max_calls) 59 | bodies = Amf.get_representation(bodies, max_calls) 60 | 61 | return '<{classname}>{bodies}'.format(**locals()) 62 | 63 | else: 64 | return repr(request_object) 65 | -------------------------------------------------------------------------------- /pywb/warcserver/http.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import requests 4 | import six.moves.http_client 5 | from requests.adapters import DEFAULT_POOLBLOCK, HTTPAdapter 6 | from urllib3.poolmanager import PoolManager 7 | from urllib3.util.retry import Retry 8 | 9 | six.moves.http_client._MAXHEADERS = 10000 10 | six.moves.http_client._MAXLINE = 131072 11 | 12 | 13 | # ============================================================================= 14 | class PywbHttpAdapter(HTTPAdapter): 15 | """This adaptor exists exists to restore the default behavior 16 | of urllib3 < 1.25.x, which was to not verify ssl certs, 17 | until a better solution is found 18 | """ 19 | 20 | def 
__init__(self, cert_reqs='CERT_NONE', ca_cert_dir=None, **init_kwargs): 21 | self.cert_reqs = cert_reqs 22 | self.ca_cert_dir = ca_cert_dir 23 | return super(PywbHttpAdapter, self).__init__(**init_kwargs) 24 | 25 | def init_poolmanager( 26 | self, connections, maxsize, block=DEFAULT_POOLBLOCK, **pool_kwargs 27 | ): 28 | self._pool_connections = connections 29 | self._pool_maxsize = maxsize 30 | self._pool_block = block 31 | self.poolmanager = PoolManager( 32 | num_pools=connections, 33 | maxsize=maxsize, 34 | block=block, 35 | strict=True, 36 | cert_reqs=self.cert_reqs, 37 | ca_cert_dir=self.ca_cert_dir, 38 | **pool_kwargs 39 | ) 40 | 41 | def proxy_manager_for(self, proxy, **proxy_kwargs): 42 | proxy_kwargs['cert_reqs'] = self.cert_reqs 43 | proxy_kwargs['ca_cert_dir'] = self.ca_cert_dir 44 | return super(PywbHttpAdapter, self).proxy_manager_for(proxy, **proxy_kwargs) 45 | 46 | 47 | # ============================================================================= 48 | class DefaultAdapters(object): 49 | live_adapter = PywbHttpAdapter(max_retries=Retry(3)) 50 | remote_adapter = PywbHttpAdapter(max_retries=Retry(3)) 51 | 52 | 53 | requests.packages.urllib3.disable_warnings() 54 | 55 | -------------------------------------------------------------------------------- /pywb/warcserver/index/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/pywb/8ea2f74517161c1fc91ca969cea6328a20f5dce6/pywb/warcserver/index/__init__.py -------------------------------------------------------------------------------- /pywb/warcserver/index/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/pywb/8ea2f74517161c1fc91ca969cea6328a20f5dce6/pywb/warcserver/index/test/__init__.py -------------------------------------------------------------------------------- /pywb/warcserver/index/test/test_cdxobject.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from pywb.warcserver.index.cdxobject import CDXObject, IDXObject, CDXException 5 | from pytest import raises 6 | 7 | def test_empty_cdxobject(): 8 | x = CDXObject(b'') 9 | assert len(x) == 0 10 | 11 | def test_invalid_cdx_format(): 12 | with raises(CDXException): 13 | x = CDXObject(b'a b c') 14 | 15 | 16 | def _make_line(fields): 17 | line = ' '.join(['-'] * fields) 18 | x = CDXObject(line.encode('utf-8')) 19 | assert len(x) == fields 20 | assert str(x) == line 21 | 22 | def test_valid_cdx_formats(): 23 | # Currently supported cdx formats, 9, 11, 12, 14 field 24 | # See CDXObject for more details 25 | _make_line(9) 26 | _make_line(12) 27 | 28 | _make_line(11) 29 | _make_line(14) 30 | 31 | def test_unicode_url(): 32 | x = CDXObject(u'com,example,cafe)/ 123 {"url": "http://example.com/café/path"}'.encode('utf-8')) 33 | assert x['urlkey'] == 'com,example,cafe)/' 34 | assert x['timestamp'] == '123' 35 | assert x['url'] == 'http://example.com/caf%C3%A9/path' 36 | 37 | assert x.to_cdxj() == 'com,example,cafe)/ 123 {"url": "http://example.com/caf%C3%A9/path"}\n' 38 | 39 | def test_invalid_idx_format(): 40 | with raises(CDXException): 41 | x = IDXObject(b'a b c') 42 | 43 | 44 | def test_lt_le(): 45 | A = CDXObject(b'ca,example)/ 2016 {"url": "http://example.com/"}') 46 | B = CDXObject(b'com,example)/ 2015 {"url": "http://example.com/"}') 47 | C = CDXObject(b'com,example)/ 2016 {"url": "http://example.com/"}') 48 | 49 | assert A < B 50 | assert B < C 51 | assert B >= A 52 | assert C >= A 53 | assert A < C 54 | 55 | 56 | -------------------------------------------------------------------------------- /pywb/warcserver/index/test/test_lazy_ops.py: -------------------------------------------------------------------------------- 1 | from pywb.utils.wbexception import AccessException 2 | from pywb.warcserver.index.cdxops import cdx_load 3 
| from pywb.warcserver.index.query import CDXQuery 4 | 5 | from pytest import raises 6 | 7 | import six 8 | 9 | 10 | URL = 'http://example.com/' 11 | 12 | 13 | #================================================================ 14 | def raise_access_exception(cdx_iter, query): 15 | if query.url == URL: 16 | raise AccessException 17 | 18 | for cdx in cdx_iter: 19 | yield 20 | 21 | #================================================================ 22 | def lazy_cdx_load(**params): 23 | """ 24 | # Verify that an op 'short-circuits' further evaluation.. eg, a bad cdx source is not even loaded 25 | # as soon as exception is thrown 26 | 27 | Exception is thrown on first .next() access, not on the cdx_load 28 | """ 29 | params['custom_ops'] = [raise_access_exception] 30 | 31 | cdx_iter = cdx_load(['bogus ignored'], 32 | CDXQuery(params), 33 | process=True) 34 | 35 | # exception happens on first access attempt 36 | with raises(AccessException): 37 | six.next(cdx_iter) 38 | 39 | 40 | def test_no_process(): 41 | lazy_cdx_load(url=URL) 42 | 43 | def test_reverse(): 44 | lazy_cdx_load(url=URL, reverse=True) 45 | 46 | def test_closest(): 47 | lazy_cdx_load(url=URL, closest='2013') 48 | 49 | def test_limit(): 50 | lazy_cdx_load(url=URL, limit=10) 51 | 52 | def test_limit_1_reverse(): 53 | lazy_cdx_load(url=URL, limit=1, reverse=True) 54 | 55 | def test_multi_ops(): 56 | lazy_cdx_load(url=URL, 57 | resolveRevisits=True, 58 | filters=['=filename:A'], 59 | collapseTime=10, 60 | reverse=True, 61 | closest='2013', 62 | limit=5, 63 | fields='timestamp,filename', 64 | output='text') 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /pywb/warcserver/index/test/test_redis_agg.py: -------------------------------------------------------------------------------- 1 | from pywb.warcserver.index.aggregator import RedisMultiKeyIndexSource 2 | from pywb.warcserver.test.testutils import to_path, to_json_list, FakeRedisTests, BaseTestClass, 
TEST_CDX_PATH 3 | import pytest 4 | 5 | 6 | class TestRedisAgg(FakeRedisTests, BaseTestClass): 7 | @classmethod 8 | def setup_class(cls): 9 | super(TestRedisAgg, cls).setup_class() 10 | cls.add_cdx_to_redis(TEST_CDX_PATH + 'example2.cdxj', 'FOO:example:cdxj') 11 | cls.add_cdx_to_redis(TEST_CDX_PATH + 'dupes.cdxj', 'FOO:dupes:cdxj') 12 | 13 | # scan loader 14 | cls.scan_loader = RedisMultiKeyIndexSource('redis://localhost/2/{user}:{coll}:cdxj') 15 | 16 | cls.redis.sadd('FOO::list', 'dupes') 17 | cls.redis.sadd('FOO::list', 'example') 18 | 19 | cls.member_list_loader = RedisMultiKeyIndexSource('redis://localhost/2/{user}:{coll}:cdxj', 20 | member_key_templ='FOO::list') 21 | 22 | @pytest.fixture(params=['scan', 'member-list']) 23 | def indexloader(self, request): 24 | if request.param == 'scan': 25 | return self.scan_loader 26 | else: 27 | return self.member_list_loader 28 | 29 | def test_redis_agg_all(self, indexloader): 30 | res, errs = indexloader({'url': 'example.com/', 'param.user': 'FOO', 'param.coll': '*'}) 31 | 32 | exp = [ 33 | {'source': 'FOO:dupes:cdxj', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'}, 34 | {'source': 'FOO:dupes:cdxj', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'}, 35 | {'source': 'FOO:example:cdxj', 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'} 36 | ] 37 | 38 | assert(errs == {}) 39 | assert(to_json_list(res) == exp) 40 | 41 | def test_redis_agg_one(self, indexloader): 42 | res, errs = indexloader({'url': 'example.com/', 'param.user': 'FOO', 'param.coll': 'dupes'}) 43 | 44 | exp = [ 45 | {'source': 'FOO:dupes:cdxj', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'}, 46 | {'source': 'FOO:dupes:cdxj', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'}, 47 | ] 48 | 49 | assert(errs == {}) 50 | assert(to_json_list(res) == exp) 51 | 52 | def test_redis_not_found(self, indexloader): 53 | res, errs = indexloader({'url': 'example.com/'}) 54 | 55 | exp = [] 56 | 57 | assert(errs == {}) 
58 | assert(to_json_list(res) == exp) 59 | 60 | 61 | -------------------------------------------------------------------------------- /pywb/warcserver/resource/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/pywb/8ea2f74517161c1fc91ca969cea6328a20f5dce6/pywb/warcserver/resource/__init__.py -------------------------------------------------------------------------------- /pywb/warcserver/resource/blockrecordloader.py: -------------------------------------------------------------------------------- 1 | from warcio.bufferedreaders import DecompressingBufferedReader 2 | from warcio.recordloader import ArcWarcRecordLoader 3 | 4 | from pywb.utils.loaders import BlockLoader 5 | from pywb.utils.io import BUFF_SIZE 6 | 7 | 8 | #================================================================= 9 | class BlockArcWarcRecordLoader(ArcWarcRecordLoader): 10 | def __init__(self, loader=None, cookie_maker=None, block_size=BUFF_SIZE, *args, **kwargs): 11 | if not loader: 12 | loader = BlockLoader(cookie_maker=cookie_maker) 13 | 14 | self.loader = loader 15 | self.block_size = block_size 16 | super(BlockArcWarcRecordLoader, self).__init__(*args, **kwargs) 17 | 18 | def load(self, url, offset, length, no_record_parse=False): 19 | """ Load a single record from given url at offset with length 20 | and parse as either warc or arc record 21 | """ 22 | try: 23 | length = int(length) 24 | except: 25 | length = -1 26 | 27 | stream = self.loader.load(url, int(offset), length) 28 | decomp_type = 'gzip' 29 | 30 | # Create decompressing stream 31 | stream = DecompressingBufferedReader(stream=stream, 32 | decomp_type=decomp_type, 33 | block_size=self.block_size) 34 | 35 | return self.parse_record_stream(stream, no_record_parse=no_record_parse) 36 | -------------------------------------------------------------------------------- /pywb/warcserver/resource/test/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/pywb/8ea2f74517161c1fc91ca969cea6328a20f5dce6/pywb/warcserver/resource/test/__init__.py -------------------------------------------------------------------------------- /pywb/warcserver/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/pywb/8ea2f74517161c1fc91ca969cea6328a20f5dce6/pywb/warcserver/test/__init__.py -------------------------------------------------------------------------------- /pywb/warcserver/test/live.ini: -------------------------------------------------------------------------------- 1 | [uwsgi] 2 | if-not-env = PORT 3 | http-socket = :8080 4 | endif = 5 | 6 | master = true 7 | buffer-size = 65536 8 | die-on-term = true 9 | 10 | if-env = VIRTUAL_ENV 11 | venv = $(VIRTUAL_ENV) 12 | endif = 13 | 14 | gevent = 100 15 | gevent-monkey-patch = 16 | 17 | wsgi = warcserver.test.live 18 | -------------------------------------------------------------------------------- /pywb/warcserver/test/test_warcserver_config.yaml: -------------------------------------------------------------------------------- 1 | debug: true 2 | 3 | collections: 4 | 5 | # Live Index 6 | live: $live 7 | 8 | # rhizome (memento) 9 | rhiz: memento+http://webarchives.rhizome.org/all/ 10 | 11 | # rhizome (cdx) 12 | rhiz_cdx: cdx+http://webarchives.rhizome.org/all-cdx 13 | 14 | # rhizome (native wb) 15 | rhiz_wb: wb-memento+http://webarchives.rhizome.org/all/ 16 | 17 | # ia cdx 18 | ia: cdx+http://web.archive.org/cdx /web 19 | 20 | # ait cdxX 21 | ait: cdx+http://wayback.archive-it.org/cdx /all 22 | 23 | # zipnum index 24 | zip_cluster: zipnum+./local/indexes/file.idx 25 | 26 | 27 | ait_long: 28 | index: 29 | type: cdx 30 | api_url: 'http://wayback.archive-it.org/cdx?url={url}&closest={closest}&sort=closest' 31 | replay_url: 
http://wayback.archive-it.org/all/{timestamp}id_/{url} 32 | 33 | rhiz_long: 34 | index: 35 | type: memento 36 | timegate_url: http://webarchives.rhizome.org/all/{url} 37 | timemap_url: http://webarchives.rhizome.org/all/timemap/link/{url} 38 | replay_url: http://webarchives.rhizome.org/all/{timestamp}id_/{url} 39 | 40 | # many archives 41 | many: 42 | index_group: 43 | rhiz: memento+http://webarchives.rhizome.org/all/ 44 | ia: cdx+http://web.archive.org/cdx;/web 45 | apt: memento+http://arquivo.pt/wayback/ 46 | liveweb: live 47 | 48 | timeout: 10 49 | 50 | # Local Dir CDX 51 | local: 52 | index: ./local/indexes 53 | archive_paths: ./local/data 54 | 55 | local_file: 56 | index: ./local/indexes/file.cdxj 57 | archive_paths: ./local/data 58 | 59 | # Sequence 60 | many_seq: 61 | sequence: 62 | - 63 | index: ./local/indexes 64 | archive_paths: ./local/data 65 | name: local 66 | 67 | - 68 | index_group: 69 | rhiz: cdx+http://webarchives.rhizome.org/all-cdx 70 | apt: memento+http://arquivo.pt/wayback/ 71 | 72 | - 73 | index: $live 74 | name: live 75 | -------------------------------------------------------------------------------- /pywb/warcserver/upstreamindexsource.py: -------------------------------------------------------------------------------- 1 | from warcio.timeutils import timestamp_now 2 | 3 | from pywb.utils.wbexception import NotFoundException 4 | 5 | from pywb.warcserver.index.cdxobject import CDXObject 6 | from pywb.warcserver.index.indexsource import BaseIndexSource, RemoteIndexSource 7 | from pywb.warcserver.resource.responseloader import LiveWebLoader 8 | from pywb.utils.format import ParamFormatter, res_template 9 | 10 | 11 | #============================================================================= 12 | class UpstreamAggIndexSource(RemoteIndexSource): 13 | def __init__(self, base_url): 14 | api_url = base_url + '/index?url={url}' 15 | proxy_url = base_url + '/resource?url={url}&closest={timestamp}' 16 | super(UpstreamAggIndexSource, 
self).__init__(api_url, proxy_url, 'filename') 17 | 18 | def _set_load_url(self, cdx, params): 19 | super(UpstreamAggIndexSource, self)._set_load_url(cdx, params) 20 | cdx['offset'] = '0' 21 | cdx.pop('load_url', '') 22 | 23 | 24 | #============================================================================= 25 | class UpstreamMementoIndexSource(BaseIndexSource): 26 | def __init__(self, proxy_url='{url}'): 27 | self.proxy_url = proxy_url 28 | self.loader = LiveWebLoader() 29 | 30 | def load_index(self, params): 31 | cdx = CDXObject() 32 | cdx['urlkey'] = params.get('key').decode('utf-8') 33 | 34 | closest = params.get('closest') 35 | cdx['timestamp'] = closest if closest else timestamp_now() 36 | cdx['url'] = params['url'] 37 | cdx['load_url'] = res_template(self.proxy_url, params) 38 | cdx['memento_url'] = cdx['load_url'] 39 | return self._do_load(cdx, params) 40 | 41 | def _do_load(self, cdx, params): 42 | result = self.loader.load_resource(cdx, params) 43 | if not result: 44 | raise NotFoundException('Not a memento: ' + cdx['url']) 45 | 46 | cdx['_cached_result'] = result 47 | yield cdx 48 | 49 | def __str__(self): 50 | return 'upstream' 51 | 52 | @staticmethod 53 | def upstream_resource(base_url): 54 | return UpstreamMementoIndexSource(base_url + '/resource?url={url}&closest={closest}') 55 | 56 | 57 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | six 2 | warcio>=1.7.1 3 | requests 4 | redis==2.10.6 5 | jinja2>=3.1.2 6 | surt>=0.3.1 7 | brotlipy 8 | pyyaml 9 | werkzeug==2.2.3 10 | webencodings 11 | gevent==22.10.2 12 | greenlet>=2.0.2,<3.0 13 | webassets==2.0 14 | portalocker 15 | wsgiprox>=1.5.1 16 | fakeredis<1.0 17 | tldextract 18 | python-dateutil 19 | markupsafe>=2.1.1 20 | ua_parser 21 | py3AMF 22 | -------------------------------------------------------------------------------- /run-gunicorn.sh: 
"""Run the pywb test suite (tests plus module doctests) through pytest.

The process exit status is the value returned by ``pytest.main()``.
"""
import sys

import pytest

# pytest.main() expects a list of arguments; a single command-line string
# is rejected by current pytest releases.  The option is spelled out in
# full because abbreviated option names are no longer accepted either.
result = pytest.main(['-v', '--doctest-modules', 'tests/', 'pywb/'])

# sys.exit() instead of the ``exit`` helper, which is only injected by the
# ``site`` module and is not guaranteed to exist (e.g. ``python -S``).
sys.exit(result)
./pywb-apache.conf:/usr/local/apache2/conf/extra/pywb-apache.conf 26 | 27 | # optional volume to serve static assets from nginx 28 | - pywb-static:/pywb/pywb/static 29 | 30 | depends_on: 31 | - pywb 32 | 33 | volumes: 34 | pywb-static: 35 | -------------------------------------------------------------------------------- /sample-deploy/docker-compose-nginx.yaml: -------------------------------------------------------------------------------- 1 | # This example demonstrates running pywb with nginx frontend under a subpath /wayback 2 | 3 | version: '3' 4 | 5 | services: 6 | # main pywb image 7 | pywb: 8 | image: webrecorder/pywb 9 | volumes: 10 | - ../config.yaml:/webarchive/config.yaml 11 | - ../sample_archive/:/webarchive/sample_archive/ 12 | - ./uwsgi_subdir.ini:/uwsgi/uwsgi.ini 13 | 14 | # optional volume to serve static assets from nginx 15 | - pywb-static:/pywb/pywb/static 16 | 17 | nginx: 18 | image: nginx 19 | ports: 20 | - 8080:80 21 | 22 | volumes: 23 | - ./pywb-nginx.conf:/etc/nginx/conf.d/default.conf 24 | 25 | # optional volume to serve static assets from nginx 26 | - pywb-static:/pywb/pywb/static 27 | 28 | depends_on: 29 | - pywb 30 | 31 | volumes: 32 | pywb-static: 33 | -------------------------------------------------------------------------------- /sample-deploy/docker-compose-outback.yaml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | 3 | services: 4 | # outbackcdx image 5 | outbackcdx: 6 | image: nlagovau/outbackcdx 7 | ports: 8 | - 8084:8080 9 | 10 | # use cdx-indexer to index and ingest into outbackcdx 11 | ingest: 12 | image: webrecorder/pywb 13 | entrypoint: ["bash", "-c"] 14 | command: /tmp/run.sh 15 | 16 | depends_on: 17 | - outbackcdx 18 | 19 | volumes: 20 | - ../config.yaml:/webarchive/config.yaml 21 | - ./run.sh:/tmp/run.sh 22 | - ../sample_archive/:/webarchive/sample_archive/ 23 | 24 | # main pywb image 25 | pywb: 26 | image: webrecorder/pywb 27 | volumes: 28 | - 
../config.yaml:/webarchive/config.yaml 29 | - ../sample_archive/:/webarchive/sample_archive/ 30 | 31 | ports: 32 | - 8080:8080 33 | 34 | depends_on: 35 | - ingest 36 | 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /sample-deploy/pywb-apache.conf: -------------------------------------------------------------------------------- 1 | 2 | # optional: optimization to have apache serve static assets 3 | Alias /wayback/static "/pywb/pywb/static" 4 | ProxyPass /wayback/static ! 5 | 6 | 7 | Options None 8 | AllowOverride None 9 | Order allow,deny 10 | Allow from all 11 | Require all granted 12 | 13 | 14 | # required: proxy pass to pywb 15 | ProxyPass /wayback uwsgi://pywb:8081/ 16 | 17 | # optional: set custom header based on IP ranges 18 | 19 | RequestHeader set X-Pywb-ACL-User staff 20 | 21 | # ensure header is cleared if no match 22 | 23 | RequestHeader set X-Pywb-ACL-User "" 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /sample-deploy/pywb-nginx.conf: -------------------------------------------------------------------------------- 1 | # nginx config for running under /wayback/ prefix 2 | 3 | 4 | # set acl_user, defaulting to empty (any public user) 5 | geo $acl_user { 6 | # ensure user is set to empty by default 7 | default ""; 8 | 9 | # optional: add IP ranges to allow privileged access 10 | 127.0.0.1 "staff"; 11 | 192.168.0.0/24 "staff"; 12 | } 13 | 14 | 15 | 16 | server { 17 | listen 80; 18 | 19 | # optional: optimization to have nginx serve static assets 20 | location /wayback/static { 21 | alias /pywb/pywb/static; 22 | } 23 | 24 | # required: pywb with prefix 25 | location /wayback/ { 26 | resolver 127.0.0.1; 27 | 28 | uwsgi_pass pywb:8081; 29 | uwsgi_buffer_size 8k; 30 | 31 | 32 | include uwsgi_params; 33 | uwsgi_param UWSGI_SCHEME $scheme; 34 | 35 | # pass acl_user (which should be empty by default) 36 | uwsgi_param HTTP_X_PYWB_ACL_USER $acl_user; 37 | } 38 
| } 39 | 40 | -------------------------------------------------------------------------------- /sample-deploy/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cdx-indexer /webarchive/sample_archive/warcs/example.warc.gz > /tmp/index.cdx 3 | curl -X POST --data-binary @/tmp/index.cdx http://outbackcdx:8080/pywb 4 | 5 | -------------------------------------------------------------------------------- /sample-deploy/uwsgi_subdir.ini: -------------------------------------------------------------------------------- 1 | [uwsgi] 2 | if-not-env = PORT 3 | http-socket = :8080 4 | socket = :8081 5 | endif = 6 | 7 | master = true 8 | buffer-size = 65536 9 | die-on-term = true 10 | 11 | if-env = VIRTUAL_ENV 12 | venv = $(VIRTUAL_ENV) 13 | endif = 14 | 15 | gevent = 100 16 | 17 | #Not available until uwsgi 2.1 18 | #monkey-patching manually in pywb.apps.wayback 19 | #gevent-early-monkey-patch = 20 | # for uwsgi<2.1, set env when using gevent 21 | env = GEVENT_MONKEY_PATCH=1 22 | 23 | # specify config file here 24 | env = PYWB_CONFIG_FILE=config.yaml 25 | #wsgi = pywb.apps.wayback 26 | 27 | # config to run pywb from a prefix 28 | mount = /wayback=/pywb/pywb/apps/wayback.py 29 | manage-script-name = true 30 | -------------------------------------------------------------------------------- /sample_archive/access/allow_all.aclj: -------------------------------------------------------------------------------- 1 | *, - {"access": "allow", "user": "staff"} 2 | -------------------------------------------------------------------------------- /sample_archive/access/allows.aclj: -------------------------------------------------------------------------------- 1 | net,example)/test - {"access": "allow"} 2 | net,domain, - {"access": "allow"} 3 | net, - {"access": "allow"} 4 | -------------------------------------------------------------------------------- /sample_archive/access/blocks.aclj: 
-------------------------------------------------------------------------------- 1 | net,example)/abc/path - {"access": "block"} 2 | com,example)/foo - {"access": "exclude"} 3 | 4 | -------------------------------------------------------------------------------- /sample_archive/access/list1.aclj: -------------------------------------------------------------------------------- 1 | com,example, - {"access": "exclude"} 2 | com,example)/abc/page.html - {"access": "allow"} 3 | com,example)/abc/ef - {"access": "block"} 4 | com,example)/abc/cd - {"access": "block"} 5 | com,example)/abc/ab - {"access": "block"} 6 | com,example)/abc - {"access": "block"} 7 | com,exampke)/ - {"access": "allow"} 8 | com,ex)/ - {"access": "exclude"} 9 | com, - {"access": "allow"} 10 | -------------------------------------------------------------------------------- /sample_archive/access/list2.aclj: -------------------------------------------------------------------------------- 1 | org,httpbin)/ - {"access": "allow"} 2 | com,example)/ - {"access": "allow"} 3 | bo,example)/ - {"access": "exclude"} 4 | -------------------------------------------------------------------------------- /sample_archive/access/pywb.aclj: -------------------------------------------------------------------------------- 1 | org,iana)/exact/match/first/line/aclj### - {"access": "allow", "url": "https://www.iana.org/exact/match/first/line/aclj/"} 2 | org,iana)/about - {"access": "block"} 3 | org,iana)/about - {"access": "allow", "user": "staff"} 4 | org,iana)/_css/2013.1/fonts/opensans-semibold.ttf - {"access": "allow"} 5 | org,iana)/_css - {"access": "exclude"} 6 | org,iana)/### - {"access": "allow"} 7 | org,iana)/ - {"access": "exclude"} 8 | com,example)/?example=3 - {"access": "block", "user": "staff"} 9 | com,example)/?example=3 - {"access": "exclude", "user": "staff2"} 10 | org,example)/?example=1 - {"access": "block"} 11 | com,example)/?example=2 - {"access": "allow_ignore_embargo"} 12 | com,example)/?example=1 - 
{"access": "allow_ignore_embargo", "user": "staff2"} 13 | com,example)/?example=1 - {"access": "allow", "user": "staff"} 14 | com,example)/ - {"access": "allow"} 15 | -------------------------------------------------------------------------------- /sample_archive/access/single-line.aclj: -------------------------------------------------------------------------------- 1 | org,lonesome-rule)/### - {"access": "allow", "url": "https://www.lonesome-rule.org/"} 2 | -------------------------------------------------------------------------------- /sample_archive/cdx/bad.cdx: -------------------------------------------------------------------------------- 1 | CDX N b a m s k r M S V g 2 | com,example)/?example=2 20140603030351 http://example.com?example=2 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36B - - 504 2701 example-extra.warc 3 | com,example)/?example=2 20140703030321 http://example.com?example=2 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1987 0 non-existent.warc 4 | com,example)/?example=3 20140603030351 http://example.com?example=3 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36B - - 504 2701 example-extra.warc 5 | com,example)/?example=3 20140703030321 http://example.com?example=3 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1987 0 non-existent.warc 6 | -------------------------------------------------------------------------------- /sample_archive/cdx/dupes.cdx: -------------------------------------------------------------------------------- 1 | CDX N b a m s k r M S V g 2 | com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz 3 | com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz 4 | org,iana)/ 20140127171238 http://iana.org unk 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 343 1858 dupes.warc.gz 5 | org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 536 2678 
dupes.warc.gz 6 | org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140127171240 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 556 10826 dupes.warc.gz 7 | org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140127171240 http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf warc/revisit - GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7 - - 540 9793 dupes.warc.gz 8 | org,iana)/_css/2013.1/print.css 20140127171239 http://www.iana.org/_css/2013.1/print.css warc/revisit - VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4 - - 537 6684 dupes.warc.gz 9 | org,iana)/_css/2013.1/screen.css 20140127171239 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 541 4630 dupes.warc.gz 10 | org,iana)/_img/2013.1/iana-logo-homepage.png 20140127171240 http://www.iana.org/_img/2013.1/iana-logo-homepage.png warc/revisit - GCW2GM3SIMHEIQYZX25MLSRYVWUCZ7OK - - 549 8750 dupes.warc.gz 11 | org,iana)/_img/2013.1/icann-logo.svg 20140127171239 http://www.iana.org/_img/2013.1/icann-logo.svg warc/revisit - HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ - - 549 7709 dupes.warc.gz 12 | org,iana)/_js/2013.1/iana.js 20140127171239 http://www.iana.org/_js/2013.1/iana.js application/x-javascript 200 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 457 3696 dupes.warc.gz 13 | org,iana)/_js/2013.1/jquery.js 20140127171239 http://www.iana.org/_js/2013.1/jquery.js warc/revisit - AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 547 5658 dupes.warc.gz 14 | -------------------------------------------------------------------------------- /sample_archive/cdx/example-arc-test.cdx: -------------------------------------------------------------------------------- 1 | CDX N b a m s k r M S V g 2 | com,example,test,arc)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1656 151 example.arc 3 | com,example,test,gz,arc)/ 20140216050221 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 856 171 example.arc.gz 4 | 
-------------------------------------------------------------------------------- /sample_archive/cdx/example-extra.cdx: -------------------------------------------------------------------------------- 1 | CDX N b a m s k r M S V g 2 | com,example)/?example=2 20140103030321 http://example.com?example=2 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1987 0 example-extra.warc 3 | com,example)/?example=2 20140603030341 http://example.com?example=2 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 504 2701 example-extra.warc 4 | -------------------------------------------------------------------------------- /sample_archive/cdx/example.cdx: -------------------------------------------------------------------------------- 1 | CDX N b a m s k r M S V g 2 | com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz 3 | com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz 4 | org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz 5 | -------------------------------------------------------------------------------- /sample_archive/cdx/httpbin-resource.cdxj: -------------------------------------------------------------------------------- 1 | org,httpbin)/anything/resource.json 20171130220904 {"filename":"httpbin-resource.warc.gz","digest":"UQ3W6RIQVJO6ZEL55355BJODG2DMWBPH","length":"465","offset":"0","mime":"application/json","url":"http://httpbin.org/anything/resource.json"} 2 | -------------------------------------------------------------------------------- /sample_archive/cdx/missing-status-text.cdxj: -------------------------------------------------------------------------------- 1 | org,iana)/bads 20140127171238 
{"offset":"0","mime":"unk","url":"http://iana.org/bads","digest":"3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ","length":"485","status":"302","filename":"missing-status-text.warc"} 2 | -------------------------------------------------------------------------------- /sample_archive/cdx/post-test.cdx: -------------------------------------------------------------------------------- 1 | CDX N b a m s k r M S V g 2 | org,httpbin)/post?__wb_method=post&a=1&b=[]&c=3 20140610001151 http://httpbin.org/post application/json 200 M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2 - - 723 1196 post-test.warc.gz 3 | org,httpbin)/post?__wb_method=post&data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/json 200 B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ - - 723 2395 post-test.warc.gz 4 | org,httpbin)/post?__wb_method=post&foo=bar&test=abc 20140610000859 http://httpbin.org/post application/json 200 M532K5WS4GY2H4OVZO6HRPOP47A7KDWU - - 720 0 post-test.warc.gz 5 | -------------------------------------------------------------------------------- /sample_archive/cdx/url-agnost-example.cdx: -------------------------------------------------------------------------------- 1 | CDX N b a m s k r M S V g 2 | com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz 3 | org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 example-url-agnostic-orig.warc.gz 4 | -------------------------------------------------------------------------------- /sample_archive/cdxj/dupes.cdxj: -------------------------------------------------------------------------------- 1 | com,example)/ 20140127171200 {"url": "http://example.com", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "1046", "offset": "334", "filename": "dupes.warc.gz"} 2 | com,example)/ 20140127171251 {"url": "http://example.com", "mime": "warc/revisit", "digest": 
"B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "553", "offset": "11875", "filename": "dupes.warc.gz"} 3 | org,iana)/ 20140127171238 {"url": "http://iana.org", "mime": "unk", "status": "302", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "343", "offset": "1858", "filename": "dupes.warc.gz"} 4 | org,iana)/ 20140127171238 {"url": "http://www.iana.org/", "mime": "warc/revisit", "digest": "OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB", "length": "536", "offset": "2678", "filename": "dupes.warc.gz"} 5 | org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140127171240 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf", "mime": "warc/revisit", "digest": "YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW", "length": "556", "offset": "10826", "filename": "dupes.warc.gz"} 6 | org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140127171240 {"url": "http://www.iana.org/_css/2013.1/fonts/OpenSans-Regular.ttf", "mime": "warc/revisit", "digest": "GVSO2C2TMPPVZ4TXYFXAY27NYWTIEIL7", "length": "540", "offset": "9793", "filename": "dupes.warc.gz"} 7 | org,iana)/_css/2013.1/print.css 20140127171239 {"url": "http://www.iana.org/_css/2013.1/print.css", "mime": "warc/revisit", "digest": "VNBXHMUNWJQC5OWWGZ3X7GM5C7X6ZAB4", "length": "537", "offset": "6684", "filename": "dupes.warc.gz"} 8 | org,iana)/_css/2013.1/screen.css 20140127171239 {"url": "http://www.iana.org/_css/2013.1/screen.css", "mime": "warc/revisit", "digest": "BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD", "length": "541", "offset": "4630", "filename": "dupes.warc.gz"} 9 | org,iana)/_img/2013.1/iana-logo-homepage.png 20140127171240 {"url": "http://www.iana.org/_img/2013.1/iana-logo-homepage.png", "mime": "warc/revisit", "digest": "GCW2GM3SIMHEIQYZX25MLSRYVWUCZ7OK", "length": "549", "offset": "8750", "filename": "dupes.warc.gz"} 10 | org,iana)/_img/2013.1/icann-logo.svg 20140127171239 {"url": "http://www.iana.org/_img/2013.1/icann-logo.svg", "mime": "warc/revisit", "digest": "HGRZHOH73EFQQWBYWBSOIV2UU5JDTSGJ", "length": "549", "offset": "7709", 
"filename": "dupes.warc.gz"} 11 | org,iana)/_js/2013.1/iana.js 20140127171239 {"url": "http://www.iana.org/_js/2013.1/iana.js", "mime": "application/x-javascript", "status": "200", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "457", "offset": "3696", "filename": "dupes.warc.gz"} 12 | org,iana)/_js/2013.1/jquery.js 20140127171239 {"url": "http://www.iana.org/_js/2013.1/jquery.js", "mime": "warc/revisit", "digest": "AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO", "length": "547", "offset": "5658", "filename": "dupes.warc.gz"} 13 | -------------------------------------------------------------------------------- /sample_archive/cdxj/example-no-digest.cdxj: -------------------------------------------------------------------------------- 1 | com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "length": "1043", "offset": "333", "filename": "example.warc.gz"} 2 | com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "mime": "warc/revisit", "length": "553", "offset": "1864", "filename": "example.warc.gz"} 3 | org,iana)/domains/example 20140128051539 {"url": "http://www.iana.org/domains/example", "length": "577", "offset": "2907", "filename": "example.warc.gz"} 4 | -------------------------------------------------------------------------------- /sample_archive/cdxj/example.cdx.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/pywb/8ea2f74517161c1fc91ca969cea6328a20f5dce6/sample_archive/cdxj/example.cdx.gz -------------------------------------------------------------------------------- /sample_archive/cdxj/example.cdxj: -------------------------------------------------------------------------------- 1 | com,example)/?example=1 20140103030321 {"url": "http://example.com?example=1", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "1043", "offset": "333", "filename": "example.warc.gz"} 2 | com,example)/?example=1 20140103030341 {"url": 
"http://example.com?example=1", "mime": "warc/revisit", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "553", "offset": "1864", "filename": "example.warc.gz"} 3 | org,iana)/domains/example 20140128051539 {"url": "http://www.iana.org/domains/example", "digest": "JZ622UA23G5ZU6Y3XAKH4LINONUEICEG", "length": "577", "offset": "2907", "filename": "example.warc.gz"} 4 | -------------------------------------------------------------------------------- /sample_archive/cdxj/example2.cdxj: -------------------------------------------------------------------------------- 1 | com,example)/ 20160225042329 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "37cf167c2672a4a64af901d9484e75eee0e2c98a", "length": "1286", "offset": "363", "filename": "example2.warc.gz"} 2 | -------------------------------------------------------------------------------- /sample_archive/cdxj/post-test.cdxj: -------------------------------------------------------------------------------- 1 | org,httpbin)/post?__wb_method=post&foo=bar&test=abc 20140610000859 {"url": "http://httpbin.org/post", "mime": "application/json", "status": "200", "digest": "M532K5WS4GY2H4OVZO6HRPOP47A7KDWU", "length": "720", "offset": "0", "filename": "post-test.warc.gz"} 2 | org,httpbin)/post?__wb_method=post&a=1&b=[]&c=3 20140610001151 {"url": "http://httpbin.org/post", "mime": "application/json", "status": "200", "digest": "M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2", "length": "723", "offset": "1196", "filename": "post-test.warc.gz"} 3 | org,httpbin)/post?__wb_method=post&data=^&foo=bar 20140610001255 {"url": "http://httpbin.org/post?foo=bar", "mime": "application/json", "status": "200", "digest": "B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ", "length": "723", "offset": "2395", "filename": "post-test.warc.gz"} 4 | -------------------------------------------------------------------------------- /sample_archive/cdxj/url-agnost-example.cdxj: 
-------------------------------------------------------------------------------- 1 | com,example)/ 20130729195151 {"url": "http://test@example.com/", "mime": "warc/revisit", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "591", "offset": "355", "filename": "example-url-agnostic-revisit.warc.gz"} 2 | org,iana,example)/ 20130702195402 {"url": "http://example.iana.org/", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "1001", "offset": "353", "filename": "example-url-agnostic-orig.warc.gz"} 3 | -------------------------------------------------------------------------------- /sample_archive/non-surt-cdx/example-non-surt.cdx: -------------------------------------------------------------------------------- 1 | CDX N b a m s k r M S V g 2 | example.com/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz 3 | example.com/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz 4 | iana.org/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz 5 | -------------------------------------------------------------------------------- /sample_archive/text_content/pathindex.txt: -------------------------------------------------------------------------------- 1 | example.warc.gz invalid_path sample_archive/warcs/example.warc.gz 2 | iana.warc.gz sample_archive/warcs/iana.warc.gz 3 | -------------------------------------------------------------------------------- /sample_archive/text_content/quickfox_repeated.compressed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/pywb/8ea2f74517161c1fc91ca969cea6328a20f5dce6/sample_archive/text_content/quickfox_repeated.compressed 
-------------------------------------------------------------------------------- /sample_archive/text_content/sample.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Sample Page For Rewrite Test 4 | 5 | 6 | 12 | Test Content 13 | Some Link 14 | 15 | -------------------------------------------------------------------------------- /sample_archive/text_content/sample_hls.m3u8: -------------------------------------------------------------------------------- 1 | #EXTM3U 2 | #EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="WebVTT",NAME="English",DEFAULT=YES,AUTOSELECT=YES,FORCED=NO,URI="https://example.com/subtitles/" 3 | #EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=610000,RESOLUTION=640x360,CODECS="avc1.66.30, mp4a.40.2",SUBTITLES="WebVTT" 4 | http://example.com/video_1.m3u8 5 | #EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=416000,RESOLUTION=400x224,CODECS="avc1.66.30, mp4a.40.2",SUBTITLES="WebVTT" 6 | http://example.com/video_2.m3u8 7 | #EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=797000,RESOLUTION=640x360,CODECS="avc1.66.30, mp4a.40.2",SUBTITLES="WebVTT" 8 | http://example.com/video_3.m3u8 9 | #EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=1002000,RESOLUTION=640x360,CODECS="avc1.77.30, mp4a.40.2",SUBTITLES="WebVTT" 10 | http://example.com/video_4.m3u8 11 | #EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=2505000,RESOLUTION=1280x720,CODECS="avc1.77.30, mp4a.40.2",SUBTITLES="WebVTT" 12 | http://example.com/video_5.m3u8 13 | #EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=4495000,RESOLUTION=1920x1080,CODECS="avc1.640028, mp4a.40.2",SUBTITLES="WebVTT" 14 | http://example.com/video_6.m3u8 15 | #EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=38000,CODECS="mp4a.40.2",SUBTITLES="WebVTT" 16 | http://example.com/audio_0.m3u8 17 | -------------------------------------------------------------------------------- /sample_archive/text_content/sample_no_head.html: -------------------------------------------------------------------------------- 1 | 7 | Test Content 8 | Some Link 9 | 
-------------------------------------------------------------------------------- /sample_archive/text_content/sample_no_head_2.html: -------------------------------------------------------------------------------- 1 | 2 | A title 3 | Some Text 4 | -------------------------------------------------------------------------------- /sample_archive/text_content/sample_unclosed_script.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Sample Page For Rewrite Test 4 | 5 | 6 | Test Content 7 |