├── .coveragerc ├── .dockerignore ├── .gitattributes ├── .github └── workflows │ ├── build.yml │ ├── codeql.yml │ ├── comment-run.yml │ ├── dist.yml │ ├── lint.yml │ ├── release.yml │ └── test.yml ├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── docs ├── __init__.py ├── diagram.graffle ├── diagram_72.png ├── logo.psd ├── logo_stroked_400px.png └── logo_stroked_400px.psd ├── entrypoint.sh ├── evaluation.sh ├── ipwb ├── __init__.py ├── __main__.py ├── assets │ ├── admin.css │ ├── daemonController.js │ ├── favicons │ │ ├── android-chrome-192x192.png │ │ ├── apple-touch-icon.png │ │ ├── browserconfig.xml │ │ ├── favicon-16x16.png │ │ ├── favicon-32x32.png │ │ ├── favicon.ico │ │ ├── manifest.json │ │ ├── mstile-150x150.png │ │ └── safari-pinned-tab.svg │ ├── logo.png │ ├── reconstructive-banner.js │ ├── reconstructive.js │ ├── serviceWorker.js │ ├── webui.css │ └── webui.js ├── backends.py ├── error_handler.py ├── exceptions.py ├── indexer.py ├── replay.py ├── settings.py ├── templates │ ├── admin.html │ └── index.html └── util.py ├── package.json ├── release.sh ├── requirements.txt ├── samples ├── indexes │ ├── 5mementos.cdxj │ ├── 5mementos.link │ ├── froggie_badHeaderHash.cdxj │ ├── salam-home.cdxj │ ├── sample-1.cdxj │ ├── sample-2.cdxj │ ├── sample-encrypted.cdxj │ └── sample-old.cdxj └── warcs │ ├── 1memento.warc │ ├── 1memento_noContentType.warc │ ├── 2mementos.warc │ ├── 2mementos_htmlXhtml.warc │ ├── 2mementos_queryString.warc │ ├── 3mementos.warc │ ├── 4mementos.warc │ ├── 5mementos.warc │ ├── 5mementosAndFroggie.warc │ ├── HTTP204.warc │ ├── HTTP404.warc │ ├── IAH-20080430204825-00000-blackbook.warc.gz │ ├── baconIpsum.warc.gz │ ├── baconIpsum_chunked.warc │ ├── baconIpsum_chunkedWithMetadata.warc │ ├── baconIpsum_chunked_httpTrailors.warc │ ├── broken.warc │ ├── frogTest.warc │ ├── froggie.warc.gz │ ├── mkelly1.warc │ ├── mkelly2.warc │ ├── redirect.warc │ ├── redirectRelative.warc │ ├── salam-home.warc │ ├── sample-1.warc.gz │ ├── slash.warc │ └── variableSizedDates.warc ├── setup.cfg ├── setup.py ├── test-requirements.txt └── tests ├── __init__.py ├── testUtil.py ├── test_backends.py ├── test_compile_target_uri.py ├── test_daemon.py ├── test_error_handler.py ├── test_indexing.py ├── test_ipfs_client.py ├── test_memento.py ├── test_nodeToNode.py ├── test_randomized_add.py ├── test_replay.py └── test_util.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | source = ./ 4 | 5 | [report] 6 | exclude_lines = 7 | if self.debug: 8 | pragma: no cover 9 | raise NotImplementedError 10 | if __name__ == .__main__.: 11 | ignore_errors = True 12 | omit = 13 | tests/* 14 | fail_under = 0 15 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | .gitignore 3 | .gitattributes 4 | .coveragerc 5 | .travis.yml 6 | LICENSE 7 | Dockerfile 8 | docs 9 | dist 10 | build 11 | ipwb.egg-info 12 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.arc -text 2 | *.warc -text 3 | *.idx -text 4 | *.idxj -text 5 | *.cdx -text 6 | *.cdxj -text 7 | *.gz -text 8 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: 
Docker Build 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | tags: 8 | - '*' 9 | 10 | jobs: 11 | build-image: 12 | runs-on: ubuntu-latest 13 | name: Docker Image 14 | steps: 15 | - uses: actions/checkout@v4 16 | - id: imgtagger 17 | run: | 18 | imgtags=$(echo "${{ github.ref }}" | sed 's/refs\/tags\//latest,/; s/refs\/heads\///') 19 | echo "::set-output name=imgtags::$imgtags" 20 | - name: Build and Push to DockerHub 21 | uses: docker/build-push-action@v6 22 | with: 23 | username: ${{ secrets.DOCKER_USERNAME }} 24 | password: ${{ secrets.DOCKER_PASSWORD }} 25 | repository: ${{ secrets.DOCKER_REPO }} 26 | tags: "${{ steps.imgtagger.outputs.imgtags }}" 27 | -------------------------------------------------------------------------------- /.github/workflows/codeql.yml: -------------------------------------------------------------------------------- 1 | name: "CodeQL" 2 | 3 | on: 4 | push: 5 | pull_request: 6 | schedule: 7 | # Runs at 15:00 UTC on Fri 8 | - cron: '0 15 * * 5' 9 | 10 | jobs: 11 | analyze: 12 | if: github.event_name == 'schedule' || github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository 13 | name: Analyze 14 | runs-on: ubuntu-latest 15 | 16 | steps: 17 | - name: Checkout repository 18 | uses: actions/checkout@v4 19 | with: 20 | # We must fetch at least the immediate parents so that if this is 21 | # a pull request then we can checkout the head. 22 | fetch-depth: 2 23 | 24 | # If this run was triggered by a pull request event, then checkout 25 | # the head of the pull request instead of the merge commit. 26 | - run: git checkout HEAD^2 27 | if: ${{ github.event_name == 'pull_request' }} 28 | 29 | - name: Initialize CodeQL 30 | uses: github/codeql-action/init@v2 31 | with: 32 | languages: javascript, python 33 | 34 | - name: Perform CodeQL Analysis 35 | uses: github/codeql-action/analyze@v2 36 | -------------------------------------------------------------------------------- /.github/workflows/comment-run.yml: -------------------------------------------------------------------------------- 1 | name: "Comment Run" 2 | 3 | on: 4 | issue_comment: 5 | types: 6 | - created 7 | - edited 8 | 9 | jobs: 10 | comment-run: 11 | if: contains(github.event.comment.body, '@github-actions run') 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Checkout Code (Deep) 15 | uses: actions/checkout@v4 16 | with: 17 | fetch-depth: 0 18 | - name: Set up IPFS and Rub Daemon 19 | uses: oduwsdl/setup-ipfs@main 20 | with: 21 | run_daemon: true 22 | - name: Set up Python 23 | uses: actions/setup-python@v5 24 | - name: Install Test Dependencies 25 | run: pip install -r test-requirements.txt 26 | - name: Install IPWB from Source 27 | run: pip install . 
28 | - name: Execute Code in Comment 29 | uses: ibnesayeed/actions-comment-run@master 30 | with: 31 | github-token: ${{ secrets.GITHUB_TOKEN }} 32 | allowed-associations: '["OWNER", "MEMBER"]' 33 | -------------------------------------------------------------------------------- /.github/workflows/dist.yml: -------------------------------------------------------------------------------- 1 | name: Publish to PyPI 2 | 3 | on: 4 | push: 5 | tags: 6 | - '*' 7 | 8 | jobs: 9 | build-n-publish: 10 | name: Build and Publish Package 11 | runs-on: ubuntu-20.04 12 | steps: 13 | - uses: actions/checkout@v4 14 | - name: Set up Python 3.9 15 | uses: actions/setup-python@v5 16 | with: 17 | python-version: 3.9 18 | - name: Upgrade setuptools and wheel 19 | run: python -m pip install --user --upgrade setuptools wheel 20 | - name: Build a binary wheel and a source tarball 21 | run: python setup.py sdist bdist_wheel 22 | - name: Publish package to PyPI 23 | uses: pypa/gh-action-pypi-publish@release/v1.10 24 | with: 25 | password: ${{ secrets.pypi_password }} 26 | # The PyPI API token (password) was generated as per https://pypi.org/help/#apitoken 27 | # The token is stored in this GH repo under `Settings > Secrets > pypi_password` 28 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: Lint 2 | 3 | on: 4 | - push 5 | - pull_request 6 | 7 | jobs: 8 | lint: 9 | if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository 10 | runs-on: ubuntu-latest 11 | name: Py and JS 12 | steps: 13 | - uses: actions/checkout@v4 14 | - name: Lint JavaScript 15 | run: | 16 | npm install standard 17 | node_modules/.bin/standard 18 | - name: Clean up standard.js artifacts 19 | shell: bash 20 | run: | 21 | rm -rf node_modules 22 | rm package-lock.json 23 | rm package.json 24 | - name: Lint Python 25 | run: | 26 | pip install --user pycodestyle 27 | python -m pycodestyle --max-line-length=88 28 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Draft a Release 2 | 3 | on: 4 | push: 5 | tags: 6 | - '*' 7 | 8 | jobs: 9 | release: 10 | name: Prepare Release 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Checkout Code 14 | uses: actions/checkout@v4 15 | with: 16 | fetch-depth: 0 17 | - name: Extract Repo Attributes 18 | id: attrs 19 | uses: ibnesayeed/repo-attrs@master 20 | - name: Draft Release 21 | id: create_release 22 | uses: actions/create-release@v1 23 | env: 24 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 25 | with: 26 | tag_name: ${{ github.ref }} 27 | release_name: Release ${{ github.ref }} 28 | draft: true 29 | body: | 30 | ## Changes Since Last Release 31 | 32 | History between `${{ steps.attrs.outputs.tail }}` and `${{ steps.attrs.outputs.head }}` 33 | 34 | ### Pull Requests 35 | 36 | ${{ steps.attrs.outputs.prs }} 37 | 38 | ### Contributors 39 | 40 | ${{ steps.attrs.outputs.contributors }} 41 | 42 | ### Changed Files 43 | 44 | ``` 45 | ${{ steps.attrs.outputs.files }} 46 | ``` 47 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: 4 | - push 5 | - pull_request 6 | 7 | jobs: 8 | matrix-test: 9 | if: 
github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository 10 | strategy: 11 | fail-fast: false 12 | matrix: 13 | os: 14 | - ubuntu-latest 15 | - macos-latest 16 | # - windows-latest 17 | python: 18 | - "3.9" 19 | - "3.10" 20 | - "3.11" 21 | - "3.12" 22 | - "3.13" 23 | ipfs: 24 | - "0.28" 25 | - "0.29" 26 | - "0.30" 27 | - "0.31" 28 | runs-on: ${{ matrix.os }} 29 | name: ${{ matrix.os }} Py-${{ matrix.python }} IPFS-${{ matrix.ipfs }} 30 | steps: 31 | - uses: actions/checkout@v4 32 | - name: Set up Python ${{ matrix.python }} 33 | uses: actions/setup-python@v5 34 | with: 35 | python-version: ${{ matrix.python }} 36 | - name: Set up IPFS ${{ matrix.ipfs }} 37 | uses: oduwsdl/setup-ipfs@main 38 | with: 39 | ipfs_version: ${{ matrix.ipfs }} 40 | run_daemon: true 41 | - name: Install Python Dependencies 42 | shell: bash 43 | run: | 44 | pip install --upgrade pip 45 | pip install -r requirements.txt 46 | pip install -r test-requirements.txt 47 | - name: Run Tests 48 | shell: bash 49 | run: py.test -s --cov=./ 50 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | .pytest_cache 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | .idea/ 13 | env/ 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *,cover 47 | .hypothesis/ 48 | tests/samples/warcs/frogTest_* 49 | samples/warcs/frogTest_* 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | 58 | # Sphinx documentation 59 | docs/_build/ 60 | 61 | # PyBuilder 62 | target/ 63 | 64 | #Ipython Notebook 65 | .ipynb_checkpoints 66 | 67 | # Mac meta 68 | .DS_Store 69 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | ARG PYTHON_TAG=3 2 | FROM python:${PYTHON_TAG} AS base 3 | 4 | # Add some metadata 5 | LABEL app.name="InterPlanetary Wayback (IPWB)" \ 6 | app.description="A distributed and persistent archive replay system using IPFS" \ 7 | app.license="MIT License" \ 8 | app.license.url="https://github.com/oduwsdl/ipwb/blob/master/LICENSE" \ 9 | app.repo.url="https://github.com/oduwsdl/ipwb" \ 10 | app.authors="Mat Kelly <@machawk1> and Sawood Alam <@ibnesayeed>" 11 | 12 | # Add a custom entrypoint script 13 | COPY entrypoint.sh /usr/local/bin/ 14 | RUN chmod a+x /usr/local/bin/entrypoint.sh 15 | 16 | # Enable unbuffered STDOUT logging 17 | ENV PYTHONUNBUFFERED=1 18 | 19 | # Create folders for WARC, CDXJ and IPFS stores 20 | RUN mkdir -p /data/{warc,cdxj,ipfs} 21 | 22 | # Download and install IPFS 23 | ENV IPFS_PATH=/data/ipfs 24 | ARG IPFS_VERSION=v0.31.0 25 | ARG BUILDARCH 26 | RUN cd /tmp \ 27 | && wget -q https://dist.ipfs.tech/kubo/${IPFS_VERSION}/kubo_${IPFS_VERSION}_linux-$BUILDARCH.tar.gz \ 28 | && tar xvfz kubo*.tar.gz \ 29 | && mv kubo/ipfs /usr/local/bin/ipfs \ 30 | && rm -rf kubo* \ 31 | && ipfs init 32 | 33 | # Make necessary changes to prepare the environment for IPWB 34 | RUN apt update && apt install -y locales \ 35 | && rm -rf /var/lib/apt/lists/* \ 36 | && echo "en_US.UTF-8 UTF-8" > /etc/locale.gen \ 37 | && locale-gen 38 | 39 | # Install basic requirements 40 | WORKDIR /ipwb 41 | COPY requirements.txt ./ 42 | RUN pip install -r requirements.txt 43 | 44 | 45 | # Standard JS lint 46 | FROM node 47 | WORKDIR /ipwb 48 | COPY . ./ 49 | ARG SKIPTEST=false 50 | RUN $SKIPTEST || npm install -g standard 51 | RUN $SKIPTEST || standard 52 | 53 | 54 | # Testing stage 55 | FROM base AS test 56 | 57 | # Install necessary test requirements 58 | COPY test-requirements.txt ./ 59 | RUN pip install -r test-requirements.txt 60 | 61 | # Perform tests 62 | COPY . ./ 63 | ARG SKIPTEST=false 64 | RUN $SKIPTEST || pycodestyle 65 | RUN $SKIPTEST || (ipfs daemon & while ! curl -s localhost:5001 > /dev/null; do sleep 1; done && py.test -s --cov=./) 66 | 67 | 68 | # Final production image 69 | FROM base 70 | 71 | # Install IPWB from the source code 72 | COPY . 
./ 73 | RUN python setup.py install 74 | 75 | # Run ipfs daemon in background 76 | # Wait for the daemon to be ready 77 | # Runs provided command 78 | ENTRYPOINT ["entrypoint.sh"] 79 | 80 | # Index a sample WARC file and replay it 81 | CMD ["ipwb", "replay"] 82 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 ODU Web Science / Digital Libraries Research Group 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![image](https://github.com/oduwsdl/ipwb/raw/master/docs/logo_stroked_400px.png)](https://pypi.python.org/pypi/ipwb) 2 | 3 | # InterPlanetary Wayback (ipwb) 4 | 5 | **Peer-To-Peer Permanence of Web Archives** 6 | 7 | [![pypi](https://img.shields.io/pypi/v/ipwb.svg)](https://pypi.org/project/ipwb) [![codecov](https://codecov.io/gh/oduwsdl/ipwb/branch/master/graph/badge.svg)](https://codecov.io/gh/oduwsdl/ipwb) 8 | 9 | InterPlanetary Wayback (ipwb) facilitates permanence and collaboration in web archives by disseminating the contents of [WARC](http://www.iso.org/iso/catalogue_detail.htm?csnumber=44717) files into the IPFS network. [IPFS](https://ipfs.io/) is a peer-to-peer content-addressable file system that inherently allows deduplication and facilitates opt-in replication. ipwb splits the header and payload of WARC response records before disseminating into IPFS to leverage the deduplication, builds a [CDXJ index](https://github.com/oduwsdl/ORS/wiki/CDXJ) with references to the IPFS hashes that are returned, and combines the header and payload from IPFS at the time of replay. 10 | 11 | InterPlanetary Wayback primarily consists of two scripts: 12 | 13 | - **ipwb/indexer.py** - archival indexing script that takes the path to a WARC input, extracts the HTTP headers, HTTP payload (response body), and relevant parts of the WARC-response record header from the WARC specified and creates byte string representations. The indexer then pushes the byte strings into IPFS using a locally running IPFS daemon then creates a [CDXJ](https://github.com/oduwsdl/ORS/wiki/CDXJ) file with this metadata for replay.py. 
14 | - **ipwb/replay.py** - rudimentary replay script to resolve requests for archival content contained in IPFS for replay in the browser. 15 | 16 | A pictorial representation of the ipwb indexing and replay process: 17 | 18 | ![image](https://raw.githubusercontent.com/oduwsdl/ipwb/master/docs/diagram_72.png) 19 | 20 | An important aspect of archival replay systems is rewriting various resource references for proper memento reconstruction so that they are dereferenced properly from the archive from around the same datetime as of the root memento and not from the live site (in which case the resource might have changed or gone missing). Many archival replay systems perform server-side rewriting, but it has its limitations when URIs are generated using JavaScript. To handle this, we use [Service Worker](https://developer.mozilla.org/en-US/docs/Web/API/Service_Worker_API) for rerouting requests on the client-side when they are dereferenced to avoid any server-side rewiring. For this, we have implemented a separate library, [Reconstructive](https://oduwsdl.github.io/Reconstructive/), which is reusable and extendable by any archival replay system. 21 | 22 | Another important feature of archival replays is the inclusion of an archival banner in mementos. The purpose of an archival banner is to highlight that a replayed page is a memento and not a live page, to provide metadata about the memento and the archive, and to facilitate additional interactivity. Many archival banners used in different web archival replay systems are obtrusive in nature and have issues like style leakage. To eliminate both of these issues we have implemented a [Custom HTML Element](https://developer.mozilla.org/en-US/docs/Web/Web_Components/Using_custom_elements), [](https://oduwsdl.github.io/Reconstructive/docs/class/Reconstructive/reconstructive-banner.js~ReconstructiveBanner.html) as part of the [Reconstructive](https://oduwsdl.github.io/Reconstructive/) library and used in the ipwb. 23 | 24 | ## Installing 25 | 26 | InterPlanetary Wayback (ipwb) requires Python 3.9+. ipwb can also be used with Docker ([see below](#user-content-using-docker)). 27 | 28 | For conventional usage, the latest release of ipwb can be installed using pip: 29 | 30 | ``` 31 | $ pip install ipwb 32 | ``` 33 | 34 | The latest development version containing changes not yet released can be installed from source: 35 | 36 | ``` 37 | $ git clone https://github.com/oduwsdl/ipwb 38 | $ cd ipwb 39 | $ pip install ./ 40 | ``` 41 | 42 | ## Setup 43 | 44 | The InterPlanetary File System (IPFS) daemon (named "kubo", previously "go-ipfs") must be installed and running before starting ipwb. [Download kubo](https://dist.ipfs.tech/#kubo) and [take your node online](https://docs.ipfs.tech/how-to/command-line-quick-start/#take-your-node-online) to start the IPFS daemon. 
Once installed, this can be done using the command: 45 | 46 | ``` 47 | $ ipfs daemon 48 | ``` 49 | 50 | If you encounter a conflict with the default API port of 5001 when starting the daemon, running the following prior to launching the daemon will change the API port to access to one of your choosing (here, shown to be 5002): 51 | 52 | ``` 53 | $ ipfs config Addresses.API /ip4/127.0.0.1/tcp/5002 54 | ``` 55 | 56 | ## Indexing 57 | 58 | In a separate terminal session (or the same if you started the daemon in the background), instruct ipwb to push contents of a WARC file into IPFS and create an index of records: 59 | 60 | ``` 61 | $ ipwb index (path to warc or warc.gz) 62 | ``` 63 | 64 | ...for example, from the root of the ipwb repository: 65 | 66 | ``` 67 | $ ipwb index samples/warcs/salam-home.warc 68 | ``` 69 | 70 | The ipwb indexer partitions the WARC into WARC Records and extracts the WARC Response headers, HTTP response headers, and the HTTP response bodies (payloads). Relevant information is extracted from the WARC Response headers, temporary byte strings are created for the HTTP response headers and payload, and these two bytes strings are pushed into IPFS. The resulting CDXJ data is written to `STDOUT` by default but can be redirected to a file, e.g., 71 | 72 | ``` 73 | $ ipwb index (path to warc or warc.gz) >> myArchiveIndex.cdxj 74 | ``` 75 | 76 | ## Replaying 77 | 78 | An archival replay system is also included with ipwb to re-experience the content disseminated to IPFS. A CDXJ index needs to be provided and used by the ipwb replay system by specifying the path of the index file as a parameter to the replay system: 79 | 80 | ``` 81 | $ ipwb replay 82 | ``` 83 | 84 | ipwb also supports using an IPFS hash or any HTTP location as the source of the CDXJ: 85 | 86 | ``` 87 | $ ipwb replay http://myDomain/files/myIndex.cdxj 88 | $ ipwb replay QmYwAPJzv5CZsnANOTaREALhashYgPpHdWEz79ojWnPbdG 89 | ``` 90 | 91 | Once started, the replay system's web interface can be accessed through a web browser, e.g., by default. 92 | 93 | To run it under a domain name other than `localhost`, the easiest approach is to use a reverse proxy that supports HTTPS. The replay system utilizes [Service Worker](https://developer.mozilla.org/en-US/docs/Web/API/Service_Worker_API) for URL rerouting/rewriting to prevent [live leakage (zombies)](http://ws-dl.blogspot.com/2012/10/2012-10-10-zombies-in-archives.html). However, for security reason many web browsers have mandated HTTPS for the Service Worker API with only exception if the domain is `localhost`. [Caddy Server](https://caddyserver.com/) and [Traefik](https://traefik.io/) can be used as a reverse-proxy server and are very easy to setup. They come with built-in HTTPS support and manage (install and update) TLS certificates transparently and automatically from [Let's Encrypt](https://letsencrypt.org/). However, any web server proxy that has HTTPS support on the front-end will work. To make ipwb replay aware of the proxy, use `--proxy` or `-P` flag to supply the proxy URL. This way the replay will yield the supplied proxy URL as a prefix when generating various fully qualified domain name (FQDN) URIs or absolute URIs (for example, those in the TimeMap or Link header) instead of the default `http://localhost:2016`. This can be necessary when the service is running in a private network or a container, and only exposed via a reverse-proxy. 
Suppose a reverse-proxy server is running and ready to forward all traffic on the `https://ipwb.example.com` to the ipwb replay server then the replay can be started as following: 94 | 95 | ``` 96 | $ ipwb replay --proxy=https://ipwb.example.com 97 | ``` 98 | 99 | ## Using Docker 100 | 101 | A pre-built Docker image is made available that can be run as following: 102 | 103 | ``` 104 | $ docker container run -it --rm -p 2016:2016 oduwsdl/ipwb 105 | ``` 106 | 107 | The container will run an IPFS daemon, index a sample WARC file, and replay it using the newly created index. It will take a few seconds to be ready, then the replay will be accessible at with a sample archived page. 108 | 109 | To index and replay your own WARC file, bind mount your data folders inside the container using `-v` (or `--volume`) flag and run commands accordingly. The provided docker image has designated `/data` directory, inside which there are `warc`, `cdxj`, and `ipfs` folders where host folders can be mounted separately or as a single mount point at the parent `/data` directory. Assuming that the host machine has a `/path/to/data` folder under which there are `warc`, `cdxj`, and `ipfs` folders and a WARC file at `/path/to/data/warc/custom.warc.gz`. 110 | 111 | ``` 112 | $ docker container run -it --rm -v /path/to/data:/data oduwsdl/ipwb ipwb index -o /data/cdxj/custom.cdxj /data/warc/custom.warc.gz 113 | $ docker container run -it --rm -v /path/to/data:/data -p 2016:2016 oduwsdl/ipwb ipwb replay /data/cdxj/custom.cdxj 114 | ``` 115 | 116 | If the host folder structure is something other than `/some/path/{warc,cdxj,ipfs}` then these volumes need to be mounted separately. 117 | 118 | To build an image from the source, run the following command from the directory where the source code is checked out. The name of the locally built image could be anything, but we use `oduwsdl/ipwb` to be consistent with the above commands. 119 | 120 | ``` 121 | $ docker image build -t oduwsdl/ipwb . 122 | ``` 123 | 124 | By default, the image building process also performs tests, so it might take a while to build the image. It ensures that an image will not be created with failing tests. However, it is possible to skip tests by supplying a build-arg `--build-arg SKIPTEST=true` as shown below: 125 | 126 | ``` 127 | $ docker image build --build-arg SKIPTEST=true -t oduwsdl/ipwb . 128 | ``` 129 | 130 | ## Help 131 | 132 | Usage of sub-commands in ipwb can be accessed through providing the `-h` or `--help` flag, like any of the below. 133 | 134 | ``` 135 | $ ipwb -h 136 | usage: ipwb [-h] [-d DAEMON_ADDRESS] [-v] [-u] {index,replay} ... 137 | 138 | InterPlanetary Wayback (ipwb) 139 | 140 | optional arguments: 141 | -h, --help show this help message and exit 142 | -d DAEMON_ADDRESS, --daemon DAEMON_ADDRESS 143 | Multi-address of IPFS daemon (default 144 | /dns/localhost/tcp/5001/http) 145 | -v, --version Report the version of ipwb 146 | -u, --update-check Check whether an updated version of ipwb is available 147 | 148 | ipwb commands: 149 | Invoke using "ipwb ", e.g., ipwb replay 150 | 151 | {index,replay} 152 | index Index a WARC file for replay in ipwb 153 | replay Start the ipwb replay system 154 | ``` 155 | 156 | ``` 157 | $ ipwb index -h 158 | usage: ipwb [-h] [-e] [-c] [--compressFirst] [-o OUTFILE] [--debug] 159 | index [index ...] 
160 | 161 | Index a WARC file for replay in ipwb 162 | 163 | positional arguments: 164 | index Path to a WARC[.gz] file 165 | 166 | optional arguments: 167 | -h, --help show this help message and exit 168 | -e Encrypt WARC content prior to adding to IPFS 169 | -c Compress WARC content prior to adding to IPFS 170 | --compressFirst Compress data before encryption, where applicable 171 | -o OUTFILE, --outfile OUTFILE 172 | Path to an output CDXJ file, defaults to STDOUT 173 | --debug Convenience flag to help with testing and debugging 174 | ``` 175 | 176 | ``` 177 | $ ipwb replay -h 178 | usage: ipwb replay [-h] [-P []] [index] 179 | 180 | Start the ipwb relay system 181 | 182 | positional arguments: 183 | index path, URI, or multihash of file to use for replay 184 | 185 | optional arguments: 186 | -h, --help show this help message and exit 187 | -P [], --proxy [] 188 | Proxy URL 189 | ``` 190 | 191 | ## Project History 192 | 193 | This repo contains the code for integrating [WARC](http://www.iso.org/iso/catalogue_detail.htm?csnumber=44717)s and [IPFS](https://ipfs.io/) as developed at the [Archives Unleashed: Web Archive Hackathon]() in Toronto, Canada in March 2016. The project was also presented at: 194 | 195 | - The [Joint Conference on Digital Libraries 2016](http://www.jcdl2016.org/) in Newark, NJ in June 2016. 196 | - The [Web Archiving and Digital Libraries (WADL) 2016 workshop](http://fox.cs.vt.edu/wadl2016.html) in Newark, NJ in June 2016. 197 | - The [Theory and Practice on Digital Libraries (TPDL) 2016](http://www.tpdl2016.org/) in Hannover, Germany in September 2016. 198 | - The [Archives Unleashed 4.0: Web Archive Datathon](https://archivesunleashed.com/call-for-participation-au4/) in London, England in June 2017. 199 | - The [International Internet Preservation Consortium (IIPC) Web Archiving Conference (WAC) 2017](http://netpreserve.org/wac2017/) in London, England in June 2017. 200 | - The [Decentralized Web Summit 2018's](https://www.decentralizedweb.net/) IPFS Lab Day in San Francisco, CA in August 2018. 201 | 202 | ### Citing Project 203 | 204 | There are numerous publications related to this project, but the most significant and primary one was published in TPDL 2016. ([Read the PDF](https://matkelly.com/papers/2016_tpdl_ipwb.pdf)) 205 | 206 | > Mat Kelly, Sawood Alam, Michael L. Nelson, and Michele C. Weigle. __InterPlanetary Wayback: Peer-To-Peer Permanence of Web Archives__. In _Proceedings of the 20th International Conference on Theory and Practice of Digital Libraries_, pages 411–416, Hamburg, Germany, June 2016. 207 | 208 | ```bib 209 | @INPROCEEDINGS{ipwb-tpdl2016, 210 | AUTHOR = {Mat Kelly and 211 | Sawood Alam and 212 | Michael L. Nelson and 213 | Michele C. 
Weigle}, 214 | TITLE = {{InterPlanetary Wayback}: Peer-To-Peer Permanence of Web Archives}, 215 | BOOKTITLE = {Proceedings of the 20th International Conference on Theory and Practice of Digital Libraries}, 216 | PAGES = {411--416}, 217 | MONTH = {June}, 218 | YEAR = {2016}, 219 | ADDRESS = {Hamburg, Germany}, 220 | DOI = {10.1007/978-3-319-43997-6_35} 221 | } 222 | ``` 223 | 224 | # License 225 | 226 | MIT 227 | -------------------------------------------------------------------------------- /docs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oduwsdl/ipwb/91e202b31d96af0171690b2df61d47625867651f/docs/__init__.py -------------------------------------------------------------------------------- /docs/diagram.graffle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oduwsdl/ipwb/91e202b31d96af0171690b2df61d47625867651f/docs/diagram.graffle -------------------------------------------------------------------------------- /docs/diagram_72.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oduwsdl/ipwb/91e202b31d96af0171690b2df61d47625867651f/docs/diagram_72.png -------------------------------------------------------------------------------- /docs/logo.psd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oduwsdl/ipwb/91e202b31d96af0171690b2df61d47625867651f/docs/logo.psd -------------------------------------------------------------------------------- /docs/logo_stroked_400px.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oduwsdl/ipwb/91e202b31d96af0171690b2df61d47625867651f/docs/logo_stroked_400px.png -------------------------------------------------------------------------------- /docs/logo_stroked_400px.psd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oduwsdl/ipwb/91e202b31d96af0171690b2df61d47625867651f/docs/logo_stroked_400px.psd -------------------------------------------------------------------------------- /entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | if [[ ("$@" != "ipwb") && ("$@" != *" -h"*) && ("$@" != *" --help"*) ]] 6 | then 7 | # Run the IPFS daemon in background, initialize configs if necessary 8 | ipfs daemon --init --migrate & 9 | 10 | # Wait for IPFS daemon to be ready 11 | while ! curl -s localhost:5001 > /dev/null 12 | do 13 | sleep 1 14 | done 15 | fi 16 | 17 | exec "$@" 18 | -------------------------------------------------------------------------------- /evaluation.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | function cleanIPFS { 4 | rm -rf ~/.ipfs 5 | ipfs init 6 | rm /tmp/xxx 7 | } 8 | function runSizeMessage { 9 | echo "Running size(msg)" 10 | cleanIPFS 11 | 12 | SECONDS=0 13 | ipfs daemon & 14 | ipwb index mkelly2.warc 15 | #echo $! 16 | kill -SIGKILL $! 17 | 18 | echo "$SECONDS second(s) elapsed" 19 | 20 | du -hcs ~/.ipfs | tail -1 21 | } 22 | 23 | function runSizeEncryptMessage { 24 | echo "Running size(encrypt(msg))" 25 | cleanIPFS 26 | 27 | SECONDS=0 28 | ipfs daemon & 29 | ipwb index -e mkelly2.warc 30 | kill -SIGKILL $! 
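# Note: $! above is the PID of the backgrounded `ipfs daemon`; killing it stops
# the daemon before the elapsed time and IPFS repo size are reported below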
31 | 32 | echo "$SECONDS second(s) elapsed" 33 | 34 | du -hcs ~/.ipfs | tail -1 35 | } 36 | 37 | 38 | ipwb --version 39 | runSizeMessage 40 | #runSizeCompressMessage 41 | 42 | runSizeEncryptMessage 43 | 44 | #export IPFS_PATH=/path/to/ipfsrepo -------------------------------------------------------------------------------- /ipwb/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.2024.10.24.1853' 2 | -------------------------------------------------------------------------------- /ipwb/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import random # For generating a temp file for stdin 4 | # import string # For generating a temp file for stdin 5 | import sys 6 | import tempfile 7 | 8 | from multiaddr import Multiaddr 9 | from multiaddr import exceptions as multiaddr_exceptions 10 | # ipwb modules 11 | from ipwb import settings, replay, indexer, util 12 | from ipwb.error_handler import exception_logger 13 | from ipwb.__init__ import __version__ as ipwb_version 14 | 15 | 16 | @exception_logger(catch=not settings.DEBUG) 17 | def main(): 18 | check_args(sys.argv) 19 | 20 | 21 | def check_args_index(args): 22 | # args.daemon_address is always set. Either default or by CLI 23 | try: 24 | # see if it parses 25 | daemon = Multiaddr(args.daemon_address) 26 | except multiaddr_exceptions.StringParseError as e: 27 | print("Daemon address cannot be parsed") 28 | raise e 29 | settings.App.set("ipfsapi", str(daemon)) 30 | 31 | util.check_daemon_is_alive() 32 | 33 | enc_key = None 34 | compression_level = None 35 | if args.e: 36 | enc_key = '' 37 | if args.c: 38 | compression_level = 6 # Magic 6, TA-DA! 39 | 40 | indexer.index_file_at(args.warc_path, enc_key, compression_level, 41 | args.compressFirst, outfile=args.outfile, 42 | debug=args.debug) 43 | 44 | 45 | def check_args_replay(args): 46 | supplied_index_parameter = hasattr(args, 'index') and \ 47 | args.index is not None 48 | likely_piping = not sys.stdin.isatty() 49 | 50 | if not supplied_index_parameter and likely_piping: 51 | cdxj_in = ''.join(sys.stdin.readlines()) 52 | if len(cdxj_in) == 0: # Daemon was not running, so nothing was indexed 53 | print(('ERROR: The IPFS daemon must be running to pipe input from' 54 | ' the indexer to the replay system.')) 55 | sys.exit() 56 | 57 | random.seed() 58 | # Write data to temp file (sub-optimal) 59 | 60 | fh, args.index = tempfile.mkstemp(suffix='.cdxj') 61 | os.close(fh) 62 | with open(args.index, 'w') as f: 63 | f.write(cdxj_in) 64 | 65 | supplied_index_parameter = True 66 | 67 | proxy = None 68 | if hasattr(args, 'proxy') and args.proxy is not None: 69 | print(f'Proxying to {args.proxy}') 70 | proxy = args.proxy 71 | try: 72 | # see if it parses 73 | daemon = Multiaddr(args.daemon_address) 74 | except multiaddr_exceptions.StringParseError as e: 75 | print("Daemon address cannot be parsed") 76 | raise e 77 | settings.App.set("ipfsapi", str(daemon)) 78 | 79 | port = replay.IPWBREPLAY_PORT 80 | if hasattr(args, 'port') and args.port is not None: 81 | print(f'Using custom port {args.port} for replay.') 82 | port = args.port 83 | 84 | # TODO: add any other sub-arguments for replay here 85 | if supplied_index_parameter: 86 | replay.start(cdxj_file_path=args.index, proxy=proxy, port=port) 87 | else: 88 | print('ERROR: An index file must be specified if not piping, e.g.,') 89 | print(("> ipwb replay " 90 | f"{os.path.join('path', 'to', 'your', 'index.cdxj')}\n")) 91 
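        # onError is bound to replay_parser.print_help via set_defaults() in
        # check_args(), so the full replay usage is printed before exiting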
| 92 | args.onError() 93 | sys.exit() 94 | 95 | 96 | def check_args(args_in): 97 | """ 98 | Check to ensure valid arguments were passed in and provides guidance 99 | on the available options if not 100 | """ 101 | parser = argparse.ArgumentParser( 102 | description='InterPlanetary Wayback (ipwb)', prog="ipwb") 103 | subparsers = parser.add_subparsers( 104 | title="ipwb commands", 105 | description=("Invoke using \"ipwb \"" 106 | ", e.g., ipwb replay ")) 107 | 108 | index_parser = subparsers.add_parser( 109 | 'index', 110 | prog="ipwb", 111 | description="Index a WARC file for replay in ipwb", 112 | help="Index a WARC file for replay in ipwb") 113 | index_parser.add_argument( 114 | 'warc_path', 115 | help="Path to a WARC[.gz] file", 116 | metavar="index ", 117 | nargs='+', 118 | default=None) 119 | index_parser.add_argument( 120 | '-e', 121 | help="Encrypt WARC content prior to adding to IPFS", 122 | action='store_true', 123 | default=False) 124 | index_parser.add_argument( 125 | '-c', 126 | help='Compress WARC content prior to adding to IPFS', 127 | action='store_true', 128 | default=False) 129 | index_parser.add_argument( 130 | '--compressFirst', 131 | help='Compress data before encryption, where applicable', 132 | action='store_true', 133 | default=False) 134 | index_parser.add_argument( 135 | '-o', '--outfile', 136 | help='Path to an output CDXJ file, defaults to STDOUT', 137 | default=None) 138 | index_parser.add_argument( 139 | '--debug', 140 | help='Convenience flag to help with testing and debugging', 141 | action='store_true', 142 | default=False) 143 | index_parser.set_defaults(func=check_args_index) 144 | 145 | replay_parser = subparsers.add_parser( 146 | 'replay', 147 | prog="ipwb replay", 148 | description="Start the ipwb relay system", 149 | help="Start the ipwb replay system") 150 | replay_parser.add_argument( 151 | 'index', 152 | help='path, URI, or multihash of file to use for replay', 153 | nargs='?') 154 | replay_parser.add_argument( 155 | '-P', '--proxy', 156 | help='Proxy URL', 157 | metavar='', 158 | nargs='?') 159 | replay_parser.add_argument( 160 | '-p', '--port', 161 | help='Custom Port', 162 | type=int, 163 | default=util.IPWBREPLAY_PORT 164 | ) 165 | replay_parser.set_defaults(func=check_args_replay, 166 | onError=replay_parser.print_help) 167 | 168 | parser.add_argument( 169 | '-d', '--daemon', 170 | help=("Multi-address of IPFS daemon " 171 | "(default /dns/localhost/tcp/5001/http)"), 172 | default=settings.App.config("ipfsapi"), 173 | dest='daemon_address') 174 | parser.add_argument( 175 | '-v', '--version', help='Report the version of ipwb', action='version', 176 | version=f'InterPlanetary Wayback {ipwb_version}') 177 | parser.add_argument( 178 | '-u', '--update-check', 179 | action='store_true', 180 | help='Check whether an updated version of ipwb is available' 181 | ) 182 | parser.set_defaults(func=util.check_for_update) 183 | 184 | arg_count = len(args_in) 185 | cmd_list = ['index', 'replay'] 186 | base_parser_flag_list = ['-d', '--daemon', '-v', '--version', 187 | '-u', '--update-check'] 188 | 189 | # Various invocation error, used to show appropriate help 190 | cmd_error_index = arg_count == 2 and args_in[1] == 'index' 191 | cmd_error_no_command = arg_count == 1 192 | cmd_error_invalid_command = arg_count > 1 \ 193 | and args_in[1] not in cmd_list + base_parser_flag_list 194 | 195 | if cmd_error_no_command or cmd_error_invalid_command: 196 | parser.print_help() 197 | sys.exit() 198 | elif cmd_error_index: 199 | index_parser.print_help() 200 | sys.exit() 
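    # parse_args() picks up `func` from the matched sub-parser's set_defaults()
    # (or the top-level default, util.check_for_update), so the call below
    # dispatches to check_args_index or check_args_replay accordingly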
201 | 202 | results = parser.parse_args() 203 | results.func(results) 204 | 205 | return results 206 | 207 | 208 | if __name__ == "__main__": 209 | main() 210 | -------------------------------------------------------------------------------- /ipwb/assets/admin.css: -------------------------------------------------------------------------------- 1 | div#iframeWrapper {text-overflow: hidden; color: #8f1d27; height: 1.0em;} 2 | div#iframeWrapper p {} 3 | iframe#daemonStatus {border: 0; height: 50px; width: 200px; vertical-align: top;} 4 | 5 | details {font-family: Consolas, monospace; width: 100%; margin: 0 auto 0 auto; text-align: left;} 6 | details p {margin: 0;} 7 | details summary {color: #8f1d27; text-decoration: underline; font-weight: bold; text-align: center;} 8 | details summary:hover {cursor: pointer;} 9 | details label {padding: 0; color: #8f1d27; text-align: right; display: block; float: left; margin-right: 1.0em; width: 50%;} 10 | details label#daemonStatusLabel {height: 10px; margin-top: 0; vertical-align: top;} 11 | details label.twoRowLabel {height: 2.0em;} 12 | 13 | form input[type=submit] {display: block; clear: both; margin: 0.5em auto 0 auto;} 14 | h1 {font-size: 24px; margin-bottom: 0.5em; font-family: sans-serif;} 15 | h1 img {height: 2.0em;} 16 | -------------------------------------------------------------------------------- /ipwb/assets/daemonController.js: -------------------------------------------------------------------------------- 1 | let remainingTries = 0 2 | 3 | function recheckDaemonStatus () { 4 | remainingTries = 10 5 | 6 | if (document.getElementById('status').innerHTML !== 'Running') { 7 | checkDaemonStatus() 8 | } 9 | } 10 | 11 | function checkDaemonStatus () { 12 | window.fetch('/ipfsdaemon/status') 13 | .then(resp => updateUIOrResetTimer()) 14 | .catch(error => console.log('error on daemon status check', error)) 15 | } 16 | 17 | function updateUIOrResetTimer (resp) { 18 | if (remainingTries > 0) { // Stop polling after 10 sec/tries 19 | remainingTries -= 1 20 | } 21 | if (resp.indexOf('Not Running') > -1) { 22 | window.setTimeout(checkDaemonStatus, 1000) 23 | } else { 24 | document.location.reload(true) 25 | } 26 | } 27 | 28 | document.addEventListener('DOMContentLoaded', recheckDaemonStatus, false) 29 | -------------------------------------------------------------------------------- /ipwb/assets/favicons/android-chrome-192x192.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oduwsdl/ipwb/91e202b31d96af0171690b2df61d47625867651f/ipwb/assets/favicons/android-chrome-192x192.png -------------------------------------------------------------------------------- /ipwb/assets/favicons/apple-touch-icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oduwsdl/ipwb/91e202b31d96af0171690b2df61d47625867651f/ipwb/assets/favicons/apple-touch-icon.png -------------------------------------------------------------------------------- /ipwb/assets/favicons/browserconfig.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | #da532c 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /ipwb/assets/favicons/favicon-16x16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oduwsdl/ipwb/91e202b31d96af0171690b2df61d47625867651f/ipwb/assets/favicons/favicon-16x16.png 
-------------------------------------------------------------------------------- /ipwb/assets/favicons/favicon-32x32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oduwsdl/ipwb/91e202b31d96af0171690b2df61d47625867651f/ipwb/assets/favicons/favicon-32x32.png -------------------------------------------------------------------------------- /ipwb/assets/favicons/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oduwsdl/ipwb/91e202b31d96af0171690b2df61d47625867651f/ipwb/assets/favicons/favicon.ico -------------------------------------------------------------------------------- /ipwb/assets/favicons/manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "ipwb", 3 | "icons": [ 4 | { 5 | "src": "\/android-chrome-192x192.png", 6 | "sizes": "192x192", 7 | "type": "image\/png" 8 | } 9 | ], 10 | "theme_color": "#ffffff", 11 | "display": "standalone" 12 | } 13 | -------------------------------------------------------------------------------- /ipwb/assets/favicons/mstile-150x150.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oduwsdl/ipwb/91e202b31d96af0171690b2df61d47625867651f/ipwb/assets/favicons/mstile-150x150.png -------------------------------------------------------------------------------- /ipwb/assets/favicons/safari-pinned-tab.svg: -------------------------------------------------------------------------------- 1 | 2 | 4 | 7 | 8 | Created by potrace 1.11, written by Peter Selinger 2001-2013 9 | 10 | 12 | 14 | 18 | 24 | 26 | 28 | 35 | 43 | 45 | 47 | 50 | 53 | 54 | 57 | 61 | 64 | 67 | 71 | 74 | 77 | 81 | 84 | 87 | 90 | 94 | 98 | 101 | 105 | 108 | 112 | 113 | 114 | -------------------------------------------------------------------------------- /ipwb/assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oduwsdl/ipwb/91e202b31d96af0171690b2df61d47625867651f/ipwb/assets/logo.png -------------------------------------------------------------------------------- /ipwb/assets/reconstructive.js: -------------------------------------------------------------------------------- 1 | /** 2 | * [Reconstructive](https://github.com/oduwsdl/Reconstructive) is a [ServiceWorker](https://www.w3.org/TR/service-workers/) module for client-side reconstruction of composite mementos. 3 | * It reroutes embedded resource requests to their appropriate archival version without any URL rewriting. 4 | * It also provides functionality to add custom archival banners or rewrite hyperlinks on the client-side. 5 | * Use it in a ServiceWorker as illustrated below: 6 | * 7 | * ```js 8 | * importScripts('reconstructive.js'); 9 | * const rc = new Reconstructive(); 10 | * self.addEventListener('fetch', rc.reroute); 11 | * ``` 12 | * 13 | * @overview Reconstructive is a module to be used in a ServiceWorker of an archival replay. 
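 * In ipwb this module is served at /ipwbassets/reconstructive.js and loaded by serviceWorker.js via importScripts().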
14 | * @author Sawood Alam 15 | * @license MIT 16 | * @copyright ODU Web Science / Digital Libraries Research Group 2017 17 | */ 18 | class Reconstructive { 19 | constructor(config) { 20 | this.NAME = 'Reconstructive'; 21 | 22 | this.VERSION = '0.7.1'; 23 | 24 | this.id = `${this.NAME}:${this.VERSION}`; 25 | 26 | this.urimPattern = `${self.location.origin}/memento//`; 27 | 28 | this.bannerElementLocation = `${self.location.origin}/reconstructive-banner.js`; 29 | 30 | this.bannerLogoLocation = ''; 31 | 32 | this.bannerLogoHref = '/'; 33 | 34 | this.showBanner = false; 35 | 36 | this.debug = false; 37 | 38 | if (config instanceof Object) { 39 | for (const [k, v] of Object.entries(config)) { 40 | this[k] = v; 41 | } 42 | } 43 | 44 | this._regexps = { 45 | urimPattern: new RegExp(`^${this.urimPattern.replace('', '(\\d{14})').replace('', '(.*)')}$`), 46 | absoluteReference: new RegExp(`(<(iframe|a|meta).*?\\s+(src|href|content\\s*=\\s*["']?\\s*\\d+\\s*;\\s*url)\\s*=\\s*["']?)(https?:\/\/[^'"\\s]+)(.*?>)`, 'ig'), 47 | bodyEnd: new RegExp('<\/(body|html)>', 'i') 48 | }; 49 | 50 | this.exclusions = { 51 | notGet: event => event.request.method !== 'GET', 52 | bannerElement: event => this.showBanner && event.request.url.endsWith(this.bannerElementLocation), 53 | bannerLogo: event => this.showBanner && this.bannerLogoLocation && event.request.url.endsWith(this.bannerLogoLocation), 54 | localResource: event => !(this._regexps.urimPattern.test(event.request.url) || this._regexps.urimPattern.test(event.request.referrer)) 55 | }; 56 | 57 | this.debug && console.log(`${this.NAME}:${this.VERSION} initialized:`, this); 58 | 59 | this.fetchFailure = this.fetchFailure.bind(this); 60 | } 61 | 62 | shouldExclude(event) { 63 | return Object.entries(this.exclusions).some(([exclusionName, exclusionFunc]) => { 64 | if (exclusionFunc(event)) { 65 | this.debug && console.log('Exclusion found:', exclusionName, event.request.url); 66 | return true; 67 | } 68 | return false; 69 | }); 70 | } 71 | 72 | createUrim(event) { 73 | let [datetime, refUrir] = this.extractDatetimeUrir(event.request.referrer); 74 | let urir = new URL(event.request.url); 75 | 76 | if (urir.origin === self.location.origin) { 77 | let refOrigin = refUrir.match(/^(https?:\/\/)?[^\/]+/)[0]; 78 | urir = refOrigin + urir.pathname + urir.search; 79 | } else { 80 | urir = urir.href; 81 | } 82 | return this.urimPattern.replace('', datetime).replace('', urir); 83 | } 84 | 85 | extractDatetimeUrir(urim) { 86 | let [, datetime, urir] = urim.match(this._regexps.urimPattern); 87 | 88 | if (isNaN(datetime)) { 89 | return [urir, datetime]; 90 | } 91 | return [datetime, urir]; 92 | } 93 | 94 | createRequest(event) { 95 | let headers = this.cloneHeaders(event.request.headers); 96 | headers.set('X-ServiceWorker', this.id); 97 | return new Request(event.request.url, { headers: headers, redirect: 'manual' }); 98 | } 99 | 100 | cloneHeaders(original) { 101 | let headers = new Headers(); 102 | for (const [k, v] of original.entries()) { 103 | headers.append(k, v); 104 | } 105 | return headers; 106 | } 107 | 108 | localRedirect(urim) { 109 | this.debug && console.log('Locally redirecting to:', urim); 110 | return Promise.resolve(new Response(`

<h1>Locally Redirecting</h1><p>${urim}</p>
`, { 111 | status: 302, 112 | statusText: 'Found', 113 | headers: new Headers({ 114 | 'Location': urim, 115 | 'Access-Control-Allow-Origin': '*', 116 | 'Content-Type': 'text/html' 117 | }) 118 | })); 119 | } 120 | 121 | fetchSuccess(response, event) { 122 | this.debug && console.log('Fetched from server:', response); 123 | 124 | if (response.ok) { 125 | return this.rewrite(response, event); 126 | } 127 | return Promise.resolve(response); 128 | } 129 | 130 | fetchFailure(error) { 131 | this.debug && console.log(error); 132 | return new Response('

<h1>Service Unavailable</h1>
', { 133 | status: 503, 134 | statusText: 'Service Unavailable', 135 | headers: new Headers({ 136 | 'Content-Type': 'text/html' 137 | }) 138 | }); 139 | } 140 | 141 | rewrite(response, event) { 142 | if (/text\/html/i.test(response.headers.get('Content-Type'))) { 143 | let headers = this.cloneHeaders(response.headers); 144 | let init = { 145 | status: response.status, 146 | statusText: response.statusText, 147 | headers: headers 148 | }; 149 | return response.text().then(body => { 150 | const [datetime] = this.extractDatetimeUrir(response.url); 151 | 152 | body = body.replace(this._regexps.absoluteReference, `$1${this.urimPattern.replace('', datetime).replace('', '$4')}$5`); 153 | 154 | if (this.showBanner && event.request.mode === 'navigate') { 155 | const banner = this.createBanner(response, event); 156 | 157 | if (this._regexps.bodyEnd.test(body)) { 158 | body = body.replace(this._regexps.bodyEnd, banner + ''); 159 | } else { 160 | body += banner; 161 | } 162 | } 163 | return new Response(body, init); 164 | }); 165 | } 166 | return Promise.resolve(response); 167 | } 168 | 169 | createBanner(response, event) { 170 | let mementoDatetime = response.headers.get('Memento-Datetime') || ''; 171 | const [datetime, urir] = this.extractDatetimeUrir(response.url); 172 | if (!mementoDatetime) { 173 | mementoDatetime = new Date(`${datetime.slice(0, 4)}-${datetime.slice(4, 6)}-${datetime.slice(6, 8)}T${datetime.slice(8, 10)}:${datetime.slice(10, 12)}:${datetime.slice(12, 14)}Z`).toUTCString(); 174 | } 175 | 176 | let rels = {}; 177 | const links = response.headers.get('Link'); 178 | if (links) { 179 | links.replace(/[\r\n]+/g, ' ').replace(/^\W+|\W+$/g, '').split(/\W+ { 180 | let segs = l.split(/[>\s'"]*;\W*/); 181 | let href = segs.shift(); 182 | let attributes = {}; 183 | segs.forEach(s => { 184 | let [k, v] = s.split(/\W*=\W*/); 185 | attributes[k] = v; 186 | }); 187 | attributes['rel'].split(/\s+/).forEach(r => { 188 | rels[r] = { href: href, datetime: attributes['datetime'] }; 189 | }); 190 | }); 191 | } 192 | return ` 193 | 194 | 206 | 207 | `; 208 | } 209 | 210 | reroute(event) { 211 | this.debug && console.log('Rerouting requested', event); 212 | 213 | if (this.shouldExclude(event)) return; 214 | 215 | if (this._regexps.urimPattern.test(event.request.url)) { 216 | let request = this.createRequest(event); 217 | event.respondWith(fetch(request).then(response => this.fetchSuccess(response, event)).catch(this.fetchFailure)); 218 | } else { 219 | let urim = this.createUrim(event); 220 | event.respondWith(this.localRedirect(urim)); 221 | } 222 | } 223 | 224 | } 225 | -------------------------------------------------------------------------------- /ipwb/assets/serviceWorker.js: -------------------------------------------------------------------------------- 1 | /* eslint-env serviceworker */ 2 | /* global Reconstructive */ 3 | 4 | // This makes a class module available named "Reconstructive" 5 | importScripts('/ipwbassets/reconstructive.js') 6 | 7 | // Create a Reconstructive instance with optionally customized configurations 8 | // const rc = new Reconstructive({ 9 | // id: `${NAME}:${VERSION}`, 10 | // urimPattern: `${self.location.origin}/memento//`, 11 | // bannerElementLocation: `${self.location.origin}/reconstructive-banner.js`, 12 | // bannerLogoLocation: '', 13 | // bannerLogoHref: '/', 14 | // showBanner: false, 15 | // debug: false 16 | // }); 17 | const rc = new Reconstructive({ 18 | bannerElementLocation: '/ipwbassets/reconstructive-banner.js', 19 | showBanner: true, 20 | 
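  // The banner logo, like the banner script above, is served from ipwb's own /ipwbassets/ route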
bannerLogoLocation: '/ipwbassets/logo.png', 21 | debug: true 22 | }) 23 | 24 | // Add any custom exclusions or modify or delete default ones 25 | // > rc.exclusions; 26 | // < { 27 | // < notGet: function(FetchEvent) => boolean, 28 | // < bannerElement: function(FetchEvent) => boolean, 29 | // < bannerLogo: function(FetchEvent) => boolean, 30 | // < localResource: function(FetchEvent) => boolean 31 | // < } 32 | rc.exclusions.specialEndpint = function (event, config) { 33 | return ['/ipwbassets/', '/ipfsdaemon/'].some( 34 | ep => event.request.url.startsWith(self.location.origin + ep)) 35 | } 36 | 37 | // This is not necessary, but can be useful for debugging or in future 38 | self.addEventListener('install', (event) => { 39 | console.log('ServiceWorker installed') 40 | }) 41 | 42 | // This is not necessary, but can be useful for debugging or in future 43 | self.addEventListener('activate', (event) => { 44 | console.log('ServiceWorker Activated') 45 | }) 46 | 47 | self.addEventListener('fetch', (event) => { 48 | // Add any custom logic here to conditionally call the reroute method 49 | rc.reroute(event) 50 | }) 51 | -------------------------------------------------------------------------------- /ipwb/assets/webui.css: -------------------------------------------------------------------------------- 1 | #wrapper {margin: auto; text-align: center;} 2 | #submit, #url {font-size: 72px; font-family: Consolas, sans-serif;} 3 | #memCount {font-size: 30px; font-family: Consolas, sans-serif;} 4 | h1 {font-size: 72px; margin-bottom: 0.5em; font-family: sans-serif;} 5 | h1 img {height: 2.0em;} 6 | abbr:hover {cursor: help;} 7 | abbr[title] {text-decoration-skip-ink: auto; text-underline-position: under;} 8 | .hidden {display: none;} 9 | #uriList {list-style-type: none; text-align: left; margin: auto;} 10 | a#memCountListLink {border-bottom: 1px dotted black; cursor: pointer;} 11 | a#memCountListLink.activated {border-bottom: 2px solid #009900;} 12 | p.centered {text-align: center;} 13 | p.topSpace {margin-top: 1.0em;} 14 | 15 | #uris {border-bottom: 3px double #ccc; border-top: 1px solid #ccc; padding-bottom: 5px; width: 100%;} 16 | 17 | div#iframeWrapper {text-overflow: hidden; color: #8f1d27; height: 1.0em;} 18 | div#iframeWrapper p {} 19 | iframe#daemonStatus {border: 0; height: 50px; width: 200px; vertical-align: top;} 20 | 21 | html.status body {margin: 0 0 0 2px; padding: 0;} 22 | html#statusStart {color: red;} 23 | html#statusStop {color: green;} 24 | html.status button {display: inline-block; margin-left: 5px;} 25 | ul#uriList li {display: none; width: 100%; margin-bottom: 0.5em; text-overflow: ellipsis; overflow: hidden; white-space: nowrap;} 26 | ul#uriList li[data-display] {background-color: white; display: block;} 27 | ul#uriList.forceDisplay li {display: block;} 28 | 29 | h3#urisHeader {margin-bottom: 0;} 30 | h4#htmlCountHeader {margin-top: 0; margin-bottom: 1.0em; font-size: 0.8em; font-weight: normal;} 31 | details {font-family: Consolas, monospace; width: 100%; margin: 0 auto 0 auto; text-align: left;} 32 | details p {margin: 0;} 33 | details summary {color: #8f1d27; text-decoration: underline; font-weight: bold; text-align: center;} 34 | details summary:hover {cursor: pointer;} 35 | details label {padding: 0; color: #8f1d27; text-align: right; display: block; float: left; margin-right: 1.0em; width: 50%;} 36 | details label#daemonStatusLabel {height: 10px; margin-top: 0; vertical-align: top;} 37 | details label.twoRowLabel {height: 2.0em;} 38 | 39 | form input[type=submit] {display: 
block; clear: both; margin: 0.5em auto 0 auto;} 40 | 41 | span.datetime {color: #999; font-size: 0.8em; margin-right: 5px;} 42 | 43 | @media only screen and (prefers-color-scheme: dark) { 44 | body {background-color: #111;} 45 | h3 {color: #eee;} 46 | a {color: #8f1d27;} 47 | #memCount {color: #eee;} 48 | details {color: #8f1d27;} 49 | details summary {color: #ccc;} 50 | details label {color: #eee;} 51 | ul#uriList li[data-display] {background-color: transparent;} 52 | h4#htmlCountHeader {color: #eee;} 53 | a#memCountListLink.activated {border-bottom: 2px solid #8f1d27;} 54 | } 55 | -------------------------------------------------------------------------------- /ipwb/assets/webui.js: -------------------------------------------------------------------------------- 1 | /* global uris */ 2 | 3 | function handleSubmit () { // eslint-disable-line no-unused-vars 4 | const val = document.getElementById('url').value 5 | if (val) { 6 | document.location += 'memento/*/' + val 7 | } 8 | } 9 | 10 | function shortestFirst (a, b) { 11 | return a.replace(/\/+$/, '').split('/').length - b.replace(/\/+$/, '').split('/').length 12 | } 13 | 14 | function hideURIs () { 15 | document.getElementById('uris').classList.add('hidden') 16 | document.getElementById('memCountListLink').classList.remove('activated') 17 | window.localStorage.setItem('showURIs', 'false') 18 | } 19 | 20 | function splitDatetime (datetime) { 21 | return datetime.replace(/(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})/, '$1-$2-$3 $4:$5:$6') 22 | } 23 | 24 | function addURIListToDOM () { 25 | const ul = document.getElementById('uriList') 26 | const uriKeys = Object.keys(uris).sort(shortestFirst) 27 | 28 | uriKeys.forEach(urir => { 29 | uris[urir].forEach(function (memento) { 30 | const li = document.createElement('li') 31 | const a = document.createElement('a') 32 | const dt = document.createElement('span') 33 | const title = memento.title || urir 34 | 35 | a.href = 'memento/' + memento.datetime + '/' + urir 36 | a.appendChild(document.createTextNode(title)) 37 | a.title = title 38 | 39 | dt.setAttribute('class', 'datetime') 40 | dt.appendChild(document.createTextNode(splitDatetime(memento.datetime))) 41 | 42 | li.appendChild(dt) 43 | li.appendChild(a) 44 | 45 | li.setAttribute('data-mime', memento.mime) 46 | li.setAttribute('data-status', memento.status) 47 | 48 | const htmlMIMEs = ['text/html', 'application/xhtml+xml'] 49 | const mementoMIME = memento.mime.split(/\s*;/)[0].toLowerCase() 50 | const isHTML = htmlMIMEs.includes(mementoMIME) 51 | 52 | const isARedirect = memento.status[0] === '3' 53 | if (isHTML && !isARedirect) { 54 | li.setAttribute('data-display', 'default') 55 | } 56 | ul.appendChild(li) 57 | }) 58 | }) 59 | } 60 | 61 | function showURIs () { 62 | if (document.getElementById('uriList').childNodes.length === 0) { 63 | addURIListToDOM() // Prevent multiple adds of the URI list to the DOM 64 | } 65 | 66 | document.getElementById('memCountListLink').className = ['activated'] 67 | document.getElementById('uris').classList.remove('hidden') 68 | setPlurality() 69 | setShowAllButtonStatus() 70 | 71 | setUIExpandedState(uris) 72 | // Maintain visible state of URI display for future retrieval 73 | window.localStorage.setItem('showURIs', 'true') 74 | } 75 | 76 | function setUIExpandedState (urisObj) { 77 | const urisHash = calculateURIsHash(urisObj) 78 | setURIsHash(urisHash) 79 | } 80 | 81 | function calculateURIsHash (urisObj) { 82 | return getStringHashCode(JSON.stringify(urisObj)) 83 | } 84 | 85 | function getURIsHash () { 86 | 
return window.localStorage.getItem('urisHash') 87 | } 88 | 89 | function setURIsHash (hashIn) { 90 | return window.localStorage.setItem('urisHash', hashIn) 91 | } 92 | 93 | function getStringHashCode (str) { 94 | let hash = 0 95 | let i 96 | let chr 97 | if (str.length === 0) { 98 | return hash 99 | } 100 | for (i = 0; i < str.length; i++) { 101 | chr = str.charCodeAt(i) 102 | hash = ((hash << 5) - hash) + chr 103 | hash |= 0 // Convert to 32bit integer 104 | } 105 | return hash 106 | } 107 | 108 | function toggleURIDisplay () { 109 | if (window.localStorage.getItem('showURIs') === 'true') { 110 | hideURIs() 111 | } else { 112 | showURIs() 113 | } 114 | } 115 | 116 | function addEventListeners () { // eslint-disable-line no-unused-vars 117 | const target = document.getElementById('memCountListLink') 118 | target.addEventListener('click', toggleURIDisplay, false) 119 | 120 | const showAllInListingButton = document.getElementById('showEmbeddedURI') 121 | showAllInListingButton.onclick = function showAllURIs () { 122 | const uriList = document.getElementById('uriList') 123 | if (this.innerHTML === this.dataset.defaultvalue) { 124 | this.innerHTML = this.dataset.activatedvalue 125 | uriList.classList.add('forceDisplay') 126 | } else { 127 | this.innerHTML = this.dataset.defaultvalue 128 | uriList.classList.remove('forceDisplay') 129 | } 130 | } 131 | 132 | getIPFSWebUIAddress() 133 | updateServiceWorkerVersionUI() 134 | 135 | const reinstallServiceWorkerButton = document.getElementById('reinstallServiceWorker') 136 | reinstallServiceWorkerButton.onclick = reinstallServiceWorker 137 | 138 | setShowURIsVisibility() 139 | } 140 | 141 | function setShowURIsVisibility () { 142 | const previousHash = getURIsHash() + '' 143 | const newHash = calculateURIsHash(uris) + '' 144 | 145 | if (window.localStorage.getItem('showURIs') === 'true' && previousHash === newHash) { 146 | showURIs() 147 | } else { 148 | hideURIs() 149 | } 150 | } 151 | 152 | function setPlurality () { 153 | const htmlFilesPlurality = document.getElementById('htmlPages').innerHTML 154 | 155 | if (htmlFilesPlurality === '1') { 156 | document.getElementById('htmlPagesPlurality').classList.add('hidden') 157 | } 158 | } 159 | 160 | function setShowAllButtonStatus () { 161 | const urimCount = document.getElementById('memCountInt').innerHTML 162 | const htmlFilesPlurality = document.getElementById('htmlPages').innerHTML 163 | if (urimCount === htmlFilesPlurality) { 164 | document.getElementById('showEmbeddedURI').setAttribute('disabled', 'disabled') 165 | } 166 | } 167 | 168 | function assignStatusButtonHandlers () { // eslint-disable-line no-unused-vars 169 | const button = document.getElementsByTagName('button')[0] 170 | if (button.innerHTML === 'Start') { 171 | button.addEventListener('click', startIPFSDaemon) 172 | } else { 173 | button.addEventListener('click', stopIPFSDaemon) 174 | } 175 | } 176 | 177 | function startIPFSDaemon () { 178 | sendCommandToIPFSDaemon('start') 179 | this.innerHTML = 'Starting...' 180 | this.setAttribute('disabled', 'disabled') 181 | } 182 | 183 | function stopIPFSDaemon () { 184 | sendCommandToIPFSDaemon('stop') 185 | this.innerHTML = 'Stopping...' 
186 | this.setAttribute('disabled', 'disabled') 187 | } 188 | 189 | function getIPFSWebUIAddress () { 190 | const setIPFSWebUILink = function (resp) { 191 | document.getElementById('webui').setAttribute('href', 'http://' + resp) 192 | } 193 | const fail = function () { console.log('fail') } 194 | const err = function () { console.log('err') } 195 | makeAnAJAXRequest('/ipfsdaemon/webuilink', setIPFSWebUILink, fail, err) 196 | } 197 | 198 | function updateIPFSDaemonButtonUI () { 199 | window.setTimeout(function () { 200 | document.location.reload(true) 201 | }, 4000) 202 | } 203 | 204 | function sendCommandToIPFSDaemon (cmd) { 205 | const failFunction = function () { console.log('Comm w/ ipfs daemon failed.') } 206 | const errFunction = function () { console.log('Error talking to ipfs daemon.') } 207 | makeAnAJAXRequest('/ipfsdaemon/' + cmd, updateIPFSDaemonButtonUI, 208 | failFunction, errFunction) 209 | } 210 | 211 | function makeAnAJAXRequest (address, successFunction, failFunction, errorFunction) { 212 | const xmlhttp = new window.XMLHttpRequest() 213 | 214 | xmlhttp.onreadystatechange = function () { 215 | if (xmlhttp.readyState === window.XMLHttpRequest.DONE) { 216 | if (xmlhttp.status === 200) { 217 | successFunction(xmlhttp.responseText) 218 | } else if (xmlhttp.status === 400) { 219 | failFunction() 220 | } else { 221 | errorFunction() 222 | } 223 | } 224 | } 225 | 226 | xmlhttp.open('GET', address, true) 227 | xmlhttp.send() 228 | } 229 | 230 | function injectIPWBJS () { // eslint-disable-line no-unused-vars 231 | registerServiceWorker() 232 | } 233 | 234 | function getServiceWorkerVersion () { 235 | return window.fetch(document.location.href) 236 | .then(function (resp) { 237 | return Promise.resolve(resp.headers.get('Server').split('/').at(-1)) 238 | }) 239 | } 240 | 241 | function reinstallServiceWorker () { 242 | console.log('Deleting old serviceWorker') 243 | deleteServiceWorker() 244 | document.getElementById('serviceWorkerVersion').innerHTML = 'Updating...' 245 | installServiceWorker() 246 | updateServiceWorkerVersionUI() 247 | } 248 | 249 | function deleteServiceWorker () { 250 | navigator.serviceWorker.getRegistrations().then(function (registrations) { 251 | for (const registration of registrations) { 252 | registration.unregister() 253 | } 254 | }) 255 | } 256 | 257 | function updateServiceWorkerVersionUI () { 258 | getServiceWorkerVersion().then(function (resp) { 259 | console.log('updating to ...' + resp) 260 | document.getElementById('serviceWorkerVersion').innerHTML = 'ver. 
' + resp 261 | }) 262 | } 263 | 264 | function installServiceWorker () { 265 | let newInstallation = false 266 | 267 | if (navigator.serviceWorker.controller === null) { // Ideally we would use serviceWorker.getRegistration 268 | newInstallation = true 269 | } 270 | 271 | navigator.serviceWorker.register('/ipwbassets/serviceWorker.js', { scope: '/' }).then( 272 | function (registration) { 273 | console.log('ServiceWorker registration successful with scope: ', registration.scope) 274 | }).catch(function (err) { 275 | console.log('ServiceWorker registration failed: ', err) 276 | }).then(function (rr) { 277 | const dt = document.location.href.split('/')[3] 278 | const viewingMemento = dt.length === 14 && parseInt(dt, 10) + '' === dt 279 | 280 | // Reload the page with processing by the newly installed Service Worker 281 | if (newInstallation && viewingMemento) { 282 | document.location.reload() 283 | } 284 | }) 285 | } 286 | 287 | function registerServiceWorker () { 288 | if ('serviceWorker' in navigator) { 289 | window.addEventListener('load', installServiceWorker) 290 | } else { 291 | console.log('Browser does not support Service Worker.') 292 | } 293 | } 294 | 295 | function localizeNumber (numberIn) { 296 | let clientLocale = navigator.language 297 | if (navigator.languages && navigator.languages.length) { 298 | clientLocale = navigator.languages[0] 299 | } 300 | return new Intl.NumberFormat(clientLocale).format(numberIn) 301 | } 302 | 303 | function setDaemonVersion () { 304 | const daemonVersion = document.querySelector('#daemonVersion') 305 | 306 | window.fetch('/ipfsdaemon/version') 307 | .then((response) => response.text()) 308 | .then((txt) => (daemonVersion.innerHTML = txt)) 309 | } 310 | 311 | window.addEventListener('DOMContentLoaded', (event) => { 312 | const memCount = document.querySelector('#memCountInt') 313 | if (!memCount) { 314 | return // JS file called from two contexts 315 | } 316 | memCount.innerHTML = localizeNumber(memCount.innerHTML) 317 | 318 | document.querySelector('#daemonStatus').addEventListener('load', () => { setDaemonVersion() }) 319 | }) 320 | -------------------------------------------------------------------------------- /ipwb/backends.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | from typing import Optional 3 | from urllib.parse import urlparse 4 | 5 | import ipfshttpclient 6 | import requests 7 | 8 | from ipwb import util, settings 9 | 10 | 11 | @dataclasses.dataclass(frozen=True) 12 | class BackendError(Exception): 13 | backend_name: str 14 | 15 | def __str__(self): 16 | return 'Cannot load index file from {self.backend_name}.'.format( 17 | self=self, 18 | ) 19 | 20 | 21 | def format_ipfs_cid(path: str) -> Optional[str]: 22 | """Format IPFS CID properly.""" 23 | if path.startswith('Qm'): 24 | return path 25 | 26 | elif path.startswith('ipfs://'): 27 | return path.replace('ipfs://', '') 28 | 29 | 30 | def fetch_ipfs_index(path: str) -> Optional[str]: 31 | """Fetch CDXJ file content from IPFS by hash.""" 32 | ipfs_hash = format_ipfs_cid(path) 33 | 34 | if ipfs_hash is None: 35 | return None 36 | 37 | try: 38 | with ipfshttpclient.connect(settings.App.config("ipfsapi")) as client: 39 | return client.cat(path).decode('utf-8') 40 | 41 | except ipfshttpclient.exceptions.StatusError as err: 42 | raise BackendError(backend_name='ipfs') from err 43 | 44 | 45 | def fetch_web_index(path: str) -> Optional[str]: 46 | """Fetch CDXJ file content from a URL.""" 47 | scheme = urlparse(path).scheme 48 | 
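    # A bare local path (e.g. 'samples/indexes/sample-1.cdxj') has no URL
    # scheme; returning None below lets get_web_archive_index() fall through
    # to the local-disk backend instead of treating the path as a web address.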
49 | if not scheme: 50 | return None 51 | 52 | try: 53 | return requests.get(path).text 54 | 55 | except ( 56 | requests.ConnectionError, 57 | requests.HTTPError, 58 | ) as err: 59 | raise BackendError(backend_name='web') from err 60 | 61 | 62 | def fetch_local_index(path: str) -> str: 63 | """Fetch CDXJ index contents from a file on local disk.""" 64 | with open(path, 'r') as f: 65 | return f.read() 66 | 67 | 68 | def get_web_archive_index(path: str) -> str: 69 | """ 70 | Based on path, choose appropriate backend and fetch the file contents. 71 | """ 72 | 73 | # TODO right now, every backend is just a function which returns contents 74 | # of a CDXJ file as string. In the future, however, backends will be 75 | # probably represented as classes with much more sophisticated methods 76 | # of manipulating the archive index records. 77 | # TODO also, it will be possible to choose a backend and configure it; 78 | # whereas right now we choose a backend automatically based on the given 79 | # path itself. 80 | 81 | # Maybe it is an IPFS address? 82 | response = fetch_ipfs_index(path) 83 | if response is not None: 84 | return response 85 | 86 | # Or a traditional Web address? 87 | response = fetch_web_index(path) 88 | if response is not None: 89 | return response 90 | 91 | # Okay, this is probably a file on local disk 92 | response = fetch_local_index(path) 93 | if response is not None: 94 | return response 95 | 96 | raise ValueError(( 97 | f'Unknown format of index file location: {path}. Please provide ' 98 | f'a valid local path, HTTP or FTP URL, or an IPFS QmHash.' 99 | )) 100 | -------------------------------------------------------------------------------- /ipwb/error_handler.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import logging 3 | from typing import Callable 4 | 5 | logger = logging.getLogger('ipwb') 6 | 7 | 8 | def exception_logger(catch=True, exception_class=Exception): 9 | """ 10 | Decorator that catches exceptions in the function and logs them. 11 | 12 | Usage: 13 | 14 | ```python 15 | @exception_logger() 16 | def decorated_function(foo, bar): 17 | do_something 18 | ``` 19 | 20 | `exception_logger()` will catch any exception that happens in 21 | `decorated_function()` while it is being executed, and log an error using 22 | Python built in `logging` library. 23 | 24 | Unless `catch` argument is `False` - in which case, the exception will be 25 | reraised. 26 | """ 27 | def decorator(f: Callable): 28 | @functools.wraps(f) 29 | def wrapper(*args, **kwargs): 30 | try: 31 | return f(*args, **kwargs) 32 | 33 | except exception_class as err: 34 | if catch: 35 | logger.critical(str(err)) 36 | 37 | else: 38 | raise 39 | 40 | return wrapper 41 | 42 | return decorator 43 | -------------------------------------------------------------------------------- /ipwb/exceptions.py: -------------------------------------------------------------------------------- 1 | class IPFSDaemonNotAvailable(Exception): 2 | """IPFS Daemon is for some reason not available.""" 3 | -------------------------------------------------------------------------------- /ipwb/indexer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | InterPlanetary Wayback indexer 5 | 6 | This script reads a WARC file and returns a CDXJ representative of its 7 | contents. 
In doing so, it extracts all archived HTTP responses from 8 | warc-response records, separates the HTTP header from the body, pushes each 9 | into IPFS, and retains the hashes. These hashes are then used to populate the 10 | JSON block corresponding to the archived URI. 11 | """ 12 | 13 | import sys 14 | import os 15 | import json 16 | import ipfshttpclient as ipfsapi 17 | import zlib 18 | import surt 19 | import ntpath 20 | import traceback 21 | import tempfile 22 | 23 | from io import BytesIO 24 | from warcio.archiveiterator import ArchiveIterator 25 | from warcio.recordloader import ArchiveLoadFailed 26 | 27 | from requests.packages.urllib3.exceptions import NewConnectionError 28 | from ipfshttpclient.exceptions import ConnectionError 29 | # from requests.exceptions import ConnectionError 30 | 31 | from ipwb.util import iso8601_to_digits14, ipfs_client 32 | 33 | import requests 34 | import datetime 35 | 36 | from bs4 import BeautifulSoup 37 | 38 | from Crypto.Cipher import AES 39 | from Crypto.Util.Padding import pad 40 | import base64 41 | 42 | from .__init__ import __version__ as ipwb_version 43 | 44 | DEBUG = False 45 | 46 | 47 | def s2b(s): # Convert str to bytes, cross-py 48 | return bytes(s, 'utf-8') 49 | 50 | 51 | # TODO: put this method definition below index_file_at() 52 | def push_to_ipfs(hstr, payload): 53 | ipfs_retry_count = 5 # WARC->IPFS attempts before giving up 54 | retry_count = 0 55 | while retry_count < ipfs_retry_count: 56 | try: 57 | # Py 2/3 str/unicode/byte resolution 58 | if isinstance(hstr, str): 59 | hstr = s2b(hstr) 60 | if isinstance(payload, str): 61 | payload = s2b(payload) 62 | 63 | if len(payload) == 0: # py-ipfs-api issue #137 64 | return 65 | 66 | http_header_ipfs_hash = push_bytes_to_ipfs(hstr) 67 | payload_ipfs_hash = push_bytes_to_ipfs(payload) 68 | 69 | if retry_count > 0: 70 | m = f'Retrying succeeded after {retry_count} attempts' 71 | print(m) 72 | return [http_header_ipfs_hash, payload_ipfs_hash] 73 | except NewConnectionError as _: 74 | print('IPFS daemon is likely not running.') 75 | print('Run "ipfs daemon" in another terminal session.') 76 | 77 | sys.exit() 78 | except Exception as _: # TODO: Do not use bare except 79 | attempt_count = f'{retry_count + 1}/{ipfs_retry_count}' 80 | log_error(f'IPFS failed to add, retrying attempt {attempt_count}') 81 | log_error(sys.exc_info()) 82 | traceback.print_tb(sys.exc_info()[-1]) 83 | 84 | retry_count += 1 85 | 86 | return None # Process of adding to IPFS failed 87 | 88 | 89 | def encrypt(hstr, payload, encryption_key): 90 | padded_encryption_key = pad(encryption_key, AES.block_size) 91 | key = base64.b64encode(padded_encryption_key) 92 | cipher = AES.new(key, AES.MODE_CTR) 93 | 94 | hstr_bytes = base64.b64encode(cipher.encrypt(hstr)).decode('utf-8') 95 | 96 | payload_bytes = base64.b64encode(cipher.encrypt(payload)).decode('utf-8') 97 | nonce = base64.b64encode(cipher.nonce).decode('utf-8') 98 | 99 | return [hstr_bytes, payload_bytes, nonce] 100 | 101 | 102 | def create_ipfs_temp_path(): 103 | ipfs_temp_path = tempfile.gettempdir() + '/ipfs/' 104 | 105 | # Create temp path for ipwb temp files if it does not already exist 106 | if not os.path.exists(ipfs_temp_path): 107 | os.makedirs(ipfs_temp_path) 108 | 109 | 110 | def index_file_at(warc_paths, encryption_key=None, 111 | compression_level=None, encrypt_then_compress=True, 112 | quiet=False, outfile=None, debug=False): 113 | global DEBUG 114 | DEBUG = debug 115 | 116 | if type(warc_paths) is str: 117 | warc_paths = [warc_paths] 118 | 119 | for 
warc_path in warc_paths: 120 | verify_file_exists(warc_path) 121 | 122 | cdxj_lines = [] 123 | 124 | if outfile: 125 | outdir = os.path.dirname(os.path.abspath(outfile)) 126 | if not os.path.exists(outdir): 127 | try: 128 | os.makedirs(outdir) 129 | except Exception as e: 130 | log_error(e) 131 | log_error('CDXJ output directory was not created') 132 | try: 133 | output_file = open(outfile, 'a+') 134 | # Read existing non-meta lines (if any) to allow automatic merge 135 | cdxj_lines = [ln.strip() for ln in output_file if ln[:1] != '!'] 136 | except IOError as e: 137 | log_error(e) 138 | log_error('Writing generated CDXJ to STDOUT instead') 139 | outfile = None 140 | 141 | if encryption_key is not None and len(encryption_key) == 0: 142 | encryption_key = ask_user_for_encryption_key() 143 | if encryption_key == '': 144 | encryption_key = None 145 | log_error('Blank key entered, encryption disabled') 146 | 147 | encryption_and_compression_setting = { 148 | 'encrypt_THEN_compress': encrypt_then_compress, 149 | 'encryption_key': encryption_key, 150 | 'compression_level': compression_level 151 | } 152 | 153 | for warc_path in warc_paths: 154 | warc_file_full_path = warc_path 155 | 156 | try: 157 | cdxj_lines += cdx_cdxj_lines_from_file( 158 | warc_file_full_path, **encryption_and_compression_setting) 159 | except ArchiveLoadFailed: 160 | log_error(warc_path + ' is not a valid WARC file.') 161 | 162 | # De-dupe and sort, needed for CDXJ adherence 163 | cdxj_lines = list(set(cdxj_lines)) 164 | cdxj_lines.sort() 165 | 166 | # Prepend metadata 167 | cdxj_metadata_lines = generate_cdxj_metadata(cdxj_lines) 168 | cdxj_lines = cdxj_metadata_lines + cdxj_lines 169 | 170 | if quiet: 171 | return cdxj_lines 172 | 173 | if outfile: 174 | # Truncate existing CDXJ file contents (if any) before writing to it 175 | output_file.seek(0) 176 | output_file.truncate() 177 | for line in cdxj_lines: 178 | output_file.write(line + "\n") 179 | output_file.close() 180 | else: 181 | print('\n'.join(cdxj_lines)) 182 | 183 | 184 | def sanitize_cdxj_line(cdxj_line): 185 | return cdxj_line 186 | 187 | 188 | def cdx_cdxj_lines_from_file(warc_path, **enc_comp_opts): 189 | record_count = 0 190 | with open(warc_path, 'rb') as fhForCounting: 191 | record_count = 0 192 | try: 193 | for _ in ArchiveIterator(fhForCounting): 194 | record_count += 1 195 | 196 | except ArchiveLoadFailed: 197 | print('Encountered a bad WARC record.', file=sys.stderr) 198 | 199 | with open(warc_path, 'rb') as fh: 200 | cdxj_lines = [] 201 | records_processed = 0 202 | # Throws pywb.warc.recordloader.ArchiveLoadFailed if not a warc 203 | for record in ArchiveIterator(fh): 204 | msg = f'Processing WARC records in {ntpath.basename(warc_path)}' 205 | show_progress(msg, records_processed, record_count) 206 | 207 | records_processed += 1 208 | # Only consider WARC resps records from reqs for web resources 209 | ''' TODO: Change conditional to return on non-HTTP responses 210 | to reduce branch depth''' 211 | if record.rec_type != 'response' or \ 212 | record.rec_headers.get_header('Content-Type') in \ 213 | ('text/dns', 'text/whois'): 214 | continue 215 | 216 | hstr = record.http_headers.to_str().strip() 217 | 218 | try: 219 | status_code = record.http_headers.statusline.split()[0] 220 | except Exception as _: # TODO: Do not use bare except 221 | break 222 | 223 | payload = record.content_stream().read() 224 | 225 | title = None 226 | try: 227 | ctype = record.http_headers.get_header('content-type') 228 | if ctype and ctype.lower().startswith('text/html'): 229 
| title = BeautifulSoup(payload, 'html.parser').title 230 | if title is not None: 231 | title = ' '.join(title.text.split()) or None 232 | except Exception as e: 233 | print('Failed to extract title', file=sys.stderr) 234 | print(e, file=sys.stderr) 235 | 236 | http_header_ipfs_hash = '' 237 | payload_ipfs_hash = '' 238 | retry_count = 0 239 | nonce = '' 240 | 241 | if enc_comp_opts.get('encrypt_THEN_compress'): 242 | if enc_comp_opts.get('encryption_key') is not None: 243 | key = enc_comp_opts.get('encryption_key') 244 | (hstr, payload, nonce) = encrypt(hstr, payload, key) 245 | if enc_comp_opts.get('compression_level') is not None: 246 | compression_level = enc_comp_opts.get('compression_level') 247 | hstr = zlib.compress(hstr, compression_level) 248 | payload = zlib.compress(payload, compression_level) 249 | else: 250 | if enc_comp_opts.get('compression_level') is not None: 251 | compression_level = enc_comp_opts.get('compression_level') 252 | hstr = zlib.compress(hstr, compression_level) 253 | payload = zlib.compress(payload, compression_level) 254 | if enc_comp_opts.get('encryption_key') is not None: 255 | encryption_key = enc_comp_opts.get('encryption_key') 256 | (hstr, payload, nonce) = \ 257 | encrypt(hstr, payload, encryption_key) 258 | 259 | # print(f'Adding {entry.get("url")} to IPFS') 260 | ipfs_hashes = push_to_ipfs(hstr, payload) 261 | 262 | if ipfs_hashes is None: 263 | log_error('Skipping ' + 264 | record.rec_headers.get_header('WARC-Target-URI')) 265 | 266 | continue 267 | 268 | (http_header_ipfs_hash, payload_ipfs_hash) = ipfs_hashes 269 | 270 | original_uri = record.rec_headers.get_header('WARC-Target-URI') 271 | original_uri_surted = \ 272 | surt.surt(original_uri, 273 | path_strip_trailing_slash_unless_empty=False) 274 | timestamp = iso8601_to_digits14( 275 | record.rec_headers.get_header('WARC-Date')) 276 | mime = record.http_headers.get_header('content-type') 277 | obj = { 278 | 'locator': 279 | f'urn:ipfs/{http_header_ipfs_hash}/{payload_ipfs_hash}', 280 | 'status_code': status_code, 281 | 'mime_type': mime or '', 282 | 'original_uri': original_uri 283 | } 284 | if enc_comp_opts.get('encryption_key') is not None: 285 | obj['encryption_key'] = enc_comp_opts.get('encryption_key') 286 | obj['encryption_method'] = 'aes' 287 | obj['encryption_nonce'] = nonce 288 | if title is not None: 289 | obj['title'] = title 290 | 291 | obj_jSON = json.dumps(obj) 292 | 293 | cdxj_line = f'{original_uri_surted} {timestamp} {obj_jSON}' 294 | cdxj_lines.append(cdxj_line) # + '\n' 295 | return cdxj_lines 296 | 297 | 298 | def generate_cdxj_metadata(cdxj_lines=None): 299 | metadata = ['!context ["https://tools.ietf.org/html/rfc7089"]'] 300 | meta_vals = { 301 | 'generator': f'InterPlanetary Wayback {ipwb_version}', 302 | 'created_at': datetime.datetime.now().isoformat() 303 | } 304 | meta_vals = f'!meta {json.dumps(meta_vals)}' 305 | metadata.append(meta_vals) 306 | 307 | return metadata 308 | 309 | 310 | def ask_user_for_encryption_key(): 311 | if DEBUG: # Allows testing instead of requiring a user prompt 312 | return 'ipwb' 313 | 314 | output_redirected = os.fstat(0) != os.fstat(1) 315 | prompt_string = 'Enter a key for encryption: ' 316 | if output_redirected: # Prevents prompt in redir output 317 | log_error(prompt_string, end='') 318 | prompt_string = '' 319 | 320 | key = input(prompt_string) 321 | 322 | return key 323 | 324 | 325 | def verify_daemon_is_alive(host_and_port): 326 | """Ensure that the IPFS daemon is running via HTTP before proceeding""" 327 | try: 328 | 
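        # Probe the daemon's HTTP API address with a plain GET; a
        # ConnectionError here is reported as the daemon not running.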
requests.get(f'http://{host_and_port}') 329 | except ConnectionError: 330 | print(f'Daemon is not running at http://{host_and_port}') 331 | sys.exit() 332 | 333 | 334 | def verify_file_exists(warc_path): 335 | if os.path.isfile(warc_path): 336 | return 337 | log_error(f'File at {warc_path} does not exist!') 338 | sys.exit() 339 | 340 | 341 | def show_progress(msg, i, n): 342 | line = f'{msg}: {i}/{n}' 343 | print(line, file=sys.stderr, end='\r') 344 | # Clear status line, show complete msg 345 | if i == n - 1: 346 | final_msg = f'{msg} complete' 347 | space_delta = len(final_msg) - len(msg) 348 | spaces = ' ' * space_delta if space_delta > 0 else '' 349 | print(final_msg + spaces, file=sys.stderr, end='\r\n') 350 | 351 | 352 | def log_error(err_in, end='\n'): 353 | print(err_in, file=sys.stderr, end=end) 354 | 355 | 356 | def pull_from_ipfs(hash_in): 357 | return ipfs_client().cat(hash_in) 358 | 359 | 360 | def push_bytes_to_ipfs(bytes_in): 361 | """ 362 | Call the IPFS API to add the byte string to IPFS. 363 | When IPFS returns a hash, return this to the caller 364 | """ 365 | # Returns unicode in py2.7, str in py3.7 366 | try: 367 | res = ipfs_client().add_bytes(bytes_in) # bytes) 368 | except TypeError as _: 369 | print('fail') 370 | log_error('IPFS_API had an issue pushing the item to IPFS') 371 | log_error(sys.exc_info()) 372 | log_error(len(bytes_in)) 373 | traceback.print_tb(sys.exc_info()[-1]) 374 | except ipfsapi.exceptions.ConnectionError as _: 375 | print('ConnErr') 376 | log_error(sys.exc_info()) 377 | traceback.print_tb(sys.exc_info()[-1]) 378 | return 379 | 380 | # TODO: verify that the add was successful 381 | 382 | if type(res).__name__ == 'unicode': 383 | return res 384 | elif type(res).__name__ == 'str': 385 | return res 386 | 387 | log_error('NEITHER UNICODE NOR STR RETURNED.') 388 | return res[0]['Hash'] 389 | 390 | 391 | def write_file(filename, content): 392 | with open(filename, 'w') as tmp_file: 393 | tmp_file.write(content) 394 | -------------------------------------------------------------------------------- /ipwb/settings.py: -------------------------------------------------------------------------------- 1 | """Settings and configuration parameters of ipwb.""" 2 | 3 | import os 4 | from logging import config as logging_config 5 | from ipwb import util 6 | 7 | # Running in debug mode or not? 
8 | DEBUG = os.environ.get('DEBUG', False) 9 | 10 | IPFSAPI_MUTLIADDRESS = '/dns/localhost/tcp/5001/http' 11 | # or '/dns/{host}/tcp/{port}/http' 12 | # or '/ip4/{ipaddress}/tcp/{port}/http' 13 | # or '/ip6/{ipaddress}/tcp/{port}/http 14 | 15 | LOGGING = { 16 | 'version': 1, 17 | 'disable_existing_loggers': False, 18 | 'formatters': { 19 | 'standard': { 20 | 'format': '%(asctime)s [%(levelname)s] %(name)s: %(message)s' 21 | } 22 | }, 23 | 'handlers': { 24 | 'console': { 25 | 'level': 'DEBUG' if DEBUG else 'INFO', 26 | 'formatter': 'standard', 27 | 'class': 'logging.StreamHandler', 28 | }, 29 | }, 30 | 'loggers': { 31 | '': { 32 | 'handlers': ['console'], 33 | 'level': 'INFO', 34 | 'propagate': False 35 | }, 36 | } 37 | } 38 | 39 | 40 | logging_config.dictConfig(LOGGING) 41 | 42 | 43 | class App: 44 | __conf = { 45 | "ipfsapi": IPFSAPI_MUTLIADDRESS 46 | } 47 | __setters = ["ipfsapi"] 48 | 49 | @staticmethod 50 | def config(name): 51 | return App.__conf[name] 52 | 53 | @staticmethod 54 | def set(name, value): 55 | if name in App.__setters: 56 | App.__conf[name] = value 57 | else: 58 | raise NameError("Name not accepted in set() method") 59 | -------------------------------------------------------------------------------- /ipwb/templates/admin.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 11 | Admin | InterPlanetary Wayback (ipwb) 12 | 13 | 14 |

[admin.html markup not preserved; recoverable text and template expressions:]
ipwb | ADMIN
Status
IPWB
Version: {{ status.ipwb_version }}
IPFS
Endpoint: {{ status.ipfs_endpoint }} (Change | Web UI)
Daemon Version:
Index
{% for idx in indexes %}
  • {% if idx.enabled %}[Disable]{% else %}[Enable]{% endif %} {{ idx.path }} ({{ idx.urimCount }} / {{ idx.urirCount }})
{% endfor %}
Collection
{{ summary.urim_count }} mementos of {{ summary.urir_count }} resources with {{ summary.html_count }} HTML pages between {{ summary.earliest }} and {{ summary.latest }}
-------------------------------------------------------------------------------- /ipwb/templates/index.html: --------------------------------------------------------------------------------
[index.html markup not preserved; recoverable text and template expressions:]
InterPlanetary Wayback (ipwb)
ipwb
{{ summary.urim_count}} Mementos available
Options and Details...
{{ summary.index_path }}
checking...
IPFS WebUI
IPWB Help
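webui.js (above) iterates over a global `uris` object that the index page embeds. The following is a minimal sketch of the shape addURIListToDOM() expects, inferred from the fields it reads (datetime, title, mime, status); the concrete keys and values here are only illustrative, taken from the sample data:

// Hypothetical example of the structure consumed by addURIListToDOM()
const uris = {
  'http://memento.us/': [
    {
      datetime: '20130202100000', // 14-digit timestamp used to build memento/<datetime>/<urir> links
      title: 'Memento for 2/2/2013 10:00am', // falls back to the URI-R when missing
      mime: 'text/html', // only text/html and application/xhtml+xml rows get data-display
      status: '200' // 3xx statuses are treated as redirects and hidden by default
    }
  ]
}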
      62 | 63 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /ipwb/util.py: -------------------------------------------------------------------------------- 1 | import functools 2 | from os.path import expanduser 3 | 4 | import os 5 | 6 | import ipfshttpclient 7 | import requests 8 | 9 | import re 10 | # Datetime conversion to rfc1123 11 | import locale 12 | import datetime 13 | import logging 14 | import platform 15 | 16 | from enum import Enum, auto 17 | 18 | from urllib.request import urlopen 19 | from urllib.error import URLError 20 | 21 | import json 22 | from .__init__ import __version__ as ipwb_version 23 | from . import settings 24 | 25 | from ipfshttpclient.exceptions import ConnectionError, AddressError 26 | from multiaddr.exceptions import StringParseError 27 | from packaging.version import parse as parse_version 28 | 29 | from .exceptions import IPFSDaemonNotAvailable 30 | 31 | logger = logging.getLogger(__name__) 32 | 33 | 34 | IPFSAPI_MUTLIADDRESS = '/dns/localhost/tcp/5001/http' 35 | # or '/dns/{host}/tcp/{port}/http' 36 | # or '/ip4/{ipaddress}/tcp/{port}/http' 37 | # or '/ip6/{ipaddress}/tcp/{port}/http 38 | 39 | IPWBREPLAY_ADDRESS = 'localhost:2016' 40 | 41 | (IPWBREPLAY_HOST, IPWBREPLAY_PORT) = IPWBREPLAY_ADDRESS.split(':') 42 | IPWBREPLAY_PORT = int(IPWBREPLAY_PORT) 43 | 44 | INDEX_FILE = os.path.join('samples', 'indexes', 'salam-home.cdxj') 45 | 46 | 47 | class MementoMatch(Enum): 48 | WRONGKEY = auto() 49 | RIGHTKEYWRONGDATE = auto() 50 | EXACTMATCH = auto() 51 | 52 | 53 | log = logging.getLogger('werkzeug') 54 | log.setLevel(logging.ERROR) 55 | 56 | 57 | dt_pattern = re.compile(r"^(\d{4})(\d{2})?(\d{2})?(\d{2})?(\d{2})?(\d{2})?$") 58 | 59 | 60 | def create_ipfs_client(): 61 | """Create and return IPFS client.""" 62 | daemonMultiaddr = settings.App.config("ipfsapi") 63 | try: 64 | return ipfshttpclient.Client(daemonMultiaddr) 65 | except Exception as err: 66 | raise Exception('Cannot create an IPFS client.') from err 67 | 68 | 69 | @functools.lru_cache() 70 | def ipfs_client(): 71 | """ 72 | Create and cache IPFS client instance. 73 | 74 | Caching is the single difference between this and 75 | `create_ipfs_client()` above. 76 | """ 77 | return create_ipfs_client() 78 | 79 | 80 | def check_daemon_is_alive(): 81 | """Ensure that the IPFS daemon is running via HTTP before proceeding""" 82 | client = ipfs_client() 83 | daemonMultiaddr = settings.App.config("ipfsapi") 84 | 85 | try: 86 | # ConnectionError/AttributeError if IPFS daemon not running 87 | client.id() 88 | return True 89 | 90 | except ConnectionError as err: 91 | raise IPFSDaemonNotAvailable( 92 | f'Daemon is not running at: {daemonMultiaddr}', 93 | ) from err 94 | 95 | except OSError as err: 96 | raise IPFSDaemonNotAvailable( 97 | 'IPFS is likely not installed. See https://ipfs.io/docs/install/' 98 | ) from err 99 | 100 | except Exception as err: 101 | raise IPFSDaemonNotAvailable( 102 | 'Unknown error in retrieving IPFS daemon status.', 103 | ) from err 104 | 105 | 106 | def is_valid_cdxj(stringIn): # TODO: Check specific strict syntax 107 | # Also, be sure to mind the meta headers starting with @/#, etc. 
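    # For now this is a permissive stub that accepts any input. A stricter
    # check could, for example, run every non-metadata (non-'!') line through
    # is_valid_cdxj_line() below; that is only a suggestion, not current
    # ipwb behavior.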
108 | return True 109 | 110 | 111 | def is_valid_cdxj_line(cdxj_line): 112 | try: 113 | (surt_uri, datetime, jsonData) = cdxj_line.split(' ', 2) 114 | 115 | json.loads(jsonData) 116 | valid_datetime = len(datetime) == 14 117 | 118 | valid_surt = True # TODO: check valid SURT URI 119 | 120 | return valid_surt and valid_datetime 121 | except ValueError: # Not valid JSON 122 | return False 123 | except NameError: 124 | return is_cdxj_metadata_record(cdxj_line) 125 | except Exception as e: 126 | return False 127 | 128 | 129 | # Compare versions of software, <0 if a1 if b>a 130 | def compare_versions(versionA, versionB): 131 | return parse_version(versionA) < parse_version(versionB) 132 | 133 | 134 | def is_cdxj_metadata_record(cdxj_line): 135 | return cdxj_line.strip()[:1] == '!' 136 | 137 | 138 | def is_localhosty(uri): 139 | # TODO: check for these SW conditions 140 | # (*, localhost, *); (*, 127/8, *); (*, ::1/128, *) 141 | localhosts = ['localhost', '127.0.0.1'] 142 | for lh in localhosts: 143 | if lh in uri: 144 | return True 145 | return False 146 | 147 | 148 | def set_locale(): 149 | currentOS = platform.system() 150 | 151 | if currentOS == 'Darwin': 152 | new_locale = 'en_US' 153 | elif currentOS == 'Windows': 154 | new_locale = 'english' 155 | else: # Assume Linux 156 | new_locale = 'en_US.utf8' 157 | 158 | try: 159 | locale.setlocale(locale.LC_TIME, new_locale) 160 | except locale.Error: 161 | locale.setlocale(locale.LC_TIME, '') 162 | 163 | 164 | def digits14_to_rfc1123(digits14): 165 | set_locale() 166 | d = datetime.datetime.strptime(digits14, '%Y%m%d%H%M%S') 167 | return d.strftime('%a, %d %b %Y %H:%M:%S GMT') 168 | 169 | 170 | def rfc1123_to_digits14(rfc1123_datestring): 171 | set_locale() 172 | d = datetime.datetime.strptime(rfc1123_datestring, 173 | '%a, %d %b %Y %H:%M:%S %Z') 174 | 175 | # TODO: Account for conversion if TZ other than GMT not specified 176 | 177 | return d.strftime('%Y%m%d%H%M%S') 178 | 179 | 180 | def iso8601_to_digits14(iso8601DateString): 181 | set_locale() 182 | d = datetime.datetime.strptime(iso8601DateString, 183 | "%Y-%m-%dT%H:%M:%SZ") 184 | 185 | # TODO: Account for conversion if TZ other than GMT not specified 186 | 187 | return d.strftime('%Y%m%d%H%M%S') 188 | 189 | 190 | def is_rfc1123_compliant(dtstr): 191 | try: 192 | datetime.datetime.strptime(dtstr, '%a, %d %b %Y %H:%M:%S GMT') 193 | return True 194 | except ValueError as err: 195 | return False 196 | 197 | 198 | def get_rfc1123_of_now(): 199 | set_locale() 200 | d = datetime.datetime.now() 201 | return d.strftime('%a, %d %b %Y %H:%M:%S GMT') 202 | 203 | 204 | def pad_digits14(dtstr, validate=False): 205 | '''Pad datetime to make a 14-digit string and optionally validate it''' 206 | match = dt_pattern.match(dtstr) 207 | if match: 208 | Y = match.group(1) 209 | m = match.group(2) or '01' 210 | d = match.group(3) or '01' 211 | H = match.group(4) or '00' 212 | M = match.group(5) or '00' 213 | S = match.group(6) or '00' 214 | dtstr = f'{Y}{m}{d}{H}{M}{S}' 215 | if validate: 216 | datetime.datetime.strptime(dtstr, '%Y%m%d%H%M%S') 217 | return dtstr 218 | 219 | 220 | def fetch_remote_file(path): 221 | try: 222 | r = requests.get(path) 223 | return r.text 224 | 225 | except ConnectionError: 226 | raise Exception(f'File at {path} is unavailable.') 227 | 228 | except Exception as err: 229 | raise Exception( 230 | 'An unknown error occurred trying to fetch {}'.format(path) 231 | ) from err 232 | 233 | 234 | # IPFS Config manipulation from here on out. 
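# Illustrative sketch (not part of the original module): how the config
# helpers below fit together. ipwb keeps its replay settings inside the
# IPFS config JSON under an 'Ipwb' -> 'Replay' key.
def _example_replay_config_roundtrip():
    """Hypothetical usage: persist and read back the replay endpoint."""
    set_ipwb_replay_config(IPWBREPLAY_HOST, IPWBREPLAY_PORT)
    return get_ipwb_replay_config()  # e.g. ('localhost', 2016) by default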
235 | def read_ipfs_config(): 236 | ipfs_config_path = os.path.join(expanduser("~"), '.ipfs', 'config') 237 | if 'IPFS_PATH' in os.environ: 238 | ipfs_config_path = os.path.join( 239 | os.environ.get('IPFS_PATH'), 'config') 240 | 241 | try: 242 | with open(ipfs_config_path, 'r') as f: 243 | return json.load(f) 244 | 245 | except IOError as err: 246 | raise Exception( 247 | 'IPFS config not found. Have you installed ipfs and run ipfs init?' 248 | ) from err 249 | 250 | 251 | def write_ipfs_config(json_to_write): 252 | ipfs_config_path = os.path.join(expanduser("~"), '.ipfs', 'config') 253 | if 'IPFS_PATH' in os.environ: 254 | ipfs_config_path = os.path.join( 255 | os.environ.get('IPFS_PATH'), 'config') 256 | 257 | with open(ipfs_config_path, 'w') as f: 258 | f.write(json.dumps(json_to_write, indent=4, sort_keys=True)) 259 | 260 | 261 | def get_ipfsapi_host_and_port(): 262 | daemon_address = settings.App.config("ipfsapi") 263 | # format right now is "/dns/localhost/tcp/5001/http" 264 | 265 | (scheme, host, protocol, port, protocol2) = daemon_address[1:].split('/') 266 | if protocol2 == "https" and port == "443": 267 | # if https is used, rely on a 301/302 redirect response 268 | return host 269 | else: 270 | return host + ':' + port 271 | 272 | 273 | def get_ipwb_replay_config(ipfs_json=None): 274 | if not ipfs_json: 275 | ipfs_json = read_ipfs_config() 276 | port = None 277 | if ('Ipwb' in ipfs_json and 'Replay' in ipfs_json['Ipwb'] and 278 | 'Port' in ipfs_json['Ipwb']['Replay']): 279 | host = ipfs_json['Ipwb']['Replay']['Host'] 280 | port = ipfs_json['Ipwb']['Replay']['Port'] 281 | return (host, port) 282 | else: 283 | return None 284 | 285 | 286 | def set_ipwb_replay_config(Host, Port, ipfs_json=None): 287 | if not ipfs_json: 288 | ipfs_json = read_ipfs_config() 289 | ipfs_json['Ipwb'] = {} 290 | ipfs_json['Ipwb']['Replay'] = { 291 | u'Host': Host, 292 | u'Port': Port 293 | } 294 | write_ipfs_config(ipfs_json) 295 | 296 | 297 | def set_ipwb_replay_index_path(cdxj): 298 | if cdxj is None: 299 | cdxj = INDEX_FILE 300 | ipfs_json = read_ipfs_config() 301 | ipfs_json['Ipwb']['Replay']['Index'] = cdxj 302 | write_ipfs_config(ipfs_json) 303 | return 304 | 305 | 306 | def get_ipwb_replay_index_path(): 307 | ipfs_json = read_ipfs_config() 308 | if 'Ipwb' not in ipfs_json: 309 | set_ipwb_replay_config(IPWBREPLAY_HOST, IPWBREPLAY_PORT) 310 | ipfs_json = read_ipfs_config() 311 | 312 | if 'Index' in ipfs_json['Ipwb']['Replay']: 313 | return ipfs_json['Ipwb']['Replay']['Index'] 314 | else: 315 | return '' 316 | 317 | 318 | # From pywb 2.0.4 319 | def unsurt(surt): 320 | try: 321 | index = surt.index(')/') 322 | parts = surt[0:index].split(',') 323 | parts.reverse() 324 | host = '.'.join(parts) 325 | host += surt[index+1:] 326 | return host 327 | 328 | except ValueError: 329 | # May not be a valid surt 330 | return surt 331 | 332 | 333 | def get_latest_version(): 334 | try: 335 | resp = urlopen('https://pypi.org/pypi/ipwb/json') 336 | return json.loads(resp.read())['info']['version'] 337 | except Exception: 338 | return None 339 | 340 | 341 | def check_for_update(_): 342 | latest = get_latest_version() 343 | if not latest: 344 | print("Failed to check for the latest version.") 345 | return 346 | current = re.sub(r'\.0+', '.', ipwb_version) 347 | if latest == current: 348 | print(f"Installed version {current} is up to date.") 349 | else: 350 | print("The installed version of ipwb is outdated.") 351 | print(f"* Installed: {current}\n* Latest: {latest}") 352 | print("Please run `pip install --upgrade 
ipwb` to upgrade.") 353 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "standard": { 3 | "ignore": [ 4 | "ipwb/assets/reconstructive*.js" 5 | ] 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /release.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | git checkout master 4 | git pull 5 | 6 | # Update version in project 7 | PYVAR="__version__ = " 8 | VERSION_STRING=`date -u +0.%Y.%m.%d.%H%M` 9 | FILE_NAME='__init__.py' 10 | 11 | # Update ipwb version 12 | echo $PYVAR\'$VERSION_STRING\'>'ipwb/'$FILE_NAME 13 | 14 | # Push to GitHub 15 | git add 'ipwb/'$FILE_NAME 16 | git commit -m "RELEASE: Bump version to "$VERSION_STRING 17 | 18 | # Create a tag in repo 19 | git tag $VERSION_STRING 20 | git push 21 | git push origin $VERSION_STRING 22 | 23 | # The `.github/workflows/dist.yml` Workflow is triggered automatically 24 | # when the repo is tagged by running this Shell script 25 | # or manually creating a release in GitHub. 26 | # In either case, the Workflow will build the Python package 27 | # in GH's CI infrastructure and publish it to PyPI. 28 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | warcio>=1.5.3 2 | ipfshttpclient>=0.8.0a 3 | Flask>=3.0 4 | pycryptodome>=3.4.11 5 | requests>=2.19.1 6 | beautifulsoup4>=4.6.3 7 | surt>=0.3.0 8 | multiaddr >= 0.0.9 9 | packaging==23.0 10 | setuptools==75.2.0 11 | -------------------------------------------------------------------------------- /samples/indexes/5mementos.cdxj: -------------------------------------------------------------------------------- 1 | !context ["https://tools.ietf.org/html/rfc7089"] 2 | !id {"uri": "http://localhost:2016/timemap/cdxj/memento.us"} 3 | !keys ["memento_datetime_YYYYMMDDhhmmss"] 4 | !meta {"original_uri": "http://memento.us/"} 5 | !meta {"timegate_uri": "http://localhost:2016/timegate/memento.us"} 6 | !meta {"timemap_uri": {"link_format": "http://localhost:2016/timemap/link/memento.us","cdxj_format": "http://localhost:2016/timemap/cdxj/memento.us"}} 7 | 20130202100000 {"uri": "http://localhost:2016/memento/20130202100000/memento.us/", "rel": "first memento", "datetime"="Sat, 02 Feb 2013 10:00:00 GMT"} 8 | 20140114100000 {"uri": "http://localhost:2016/memento/20140114100000/memento.us/", "rel": "memento", "datetime"="Tue, 14 Jan 2014 10:00:00 GMT"} 9 | 20140115101500 {"uri": "http://localhost:2016/memento/20140115101500/memento.us/", "rel": "memento", "datetime"="Wed, 15 Jan 2014 10:15:00 GMT"} 10 | 20161231110000 {"uri": "http://localhost:2016/memento/20161231110000/memento.us/", "rel": "memento", "datetime"="Sat, 31 Dec 2016 11:00:00 GMT"} 11 | 20161231110001 {"uri": "http://localhost:2016/memento/20161231110001/memento.us/", "rel": "last memento", "datetime"="Sat, 31 Dec 2016 11:00:01 GMT"} 12 | -------------------------------------------------------------------------------- /samples/indexes/5mementos.link: -------------------------------------------------------------------------------- 1 | ; rel="original", 2 | ; rel="self timemap"; type="application/link-format", 3 | ; rel="timemap"; type="application/cdxj+ors", 4 | ; rel="timegate", 5 | ; rel="first memento"; datetime="Sat, 02 Feb 2013 10:00:00 GMT", 6 | ; rel="memento"; 
datetime="Tue, 14 Jan 2014 10:00:00 GMT", 7 | ; rel="memento"; datetime="Wed, 15 Jan 2014 10:15:00 GMT", 8 | ; rel="memento"; datetime="Sat, 31 Dec 2016 11:00:00 GMT", 9 | ; rel="last memento"; datetime="Sat, 31 Dec 2016 11:00:01 GMT" 10 | -------------------------------------------------------------------------------- /samples/indexes/froggie_badHeaderHash.cdxj: -------------------------------------------------------------------------------- 1 | !context ["https://tools.ietf.org/html/rfc7089"] 2 | !meta {"created_at": "2018-01-30T16:52:43.212606", "generator": "InterPlanetary Wayback 0.2018.01.10.0435"} 3 | com,matkelly)/froggies/frog.png 20170301192639 {"locator": "urn:ipfs/QmUeko8zM7Xanwz6F9GtRH4rLAi4Poj3DMECGsci2BRQfs/QmPhMnX74cwqx2xgj9d3N3gTra8CzafXwSbUwU8xagMfqR", "original_uri": "http://matkelly.com/froggies/frog.png", "mime_type": "image/png", "status_code": "200"} 4 | com,matkelly)/robots.txt 20170301192639 {"locator": "urn:ipfs/Qmbk3Aju7u26Pzk356a43wY9eUCScAJiLPxhvwsMoVt7Pd/QmYNB85U2txRAAdLp6wvZSPvd8AQq8UcjZJ2azhv5h6NF7", "original_uri": "http://matkelly.com/robots.txt", "mime_type": "text/plain", "status_code": "200"} 5 | edu,odu,cs)/~mkelly/semester/2017_spring/remotefroggie.html 20170301192639 {"locator": "urn:ipfs/QmPdyY7Pm66iWtGpTc7PqK11hvsnYSKMVL57G69RiNjGcm/QmNZ6mKSSAXAmXEocQj5gT4y4kdcr5D2C173ubWJ6PSKEZ", "original_uri": "http://www.cs.odu.edu/~mkelly/semester/2017_spring/remoteFroggie.html", "mime_type": "text/html", "status_code": "200"} 6 | -------------------------------------------------------------------------------- /samples/indexes/salam-home.cdxj: -------------------------------------------------------------------------------- 1 | !context ["https://tools.ietf.org/html/rfc7089"] 2 | !meta {"created_at": "2018-07-28T14:07:36.044446", "generator": "InterPlanetary Wayback 0.2018.07.27.2357"} 3 | edu,odu,cs)/~salam/ 20160305192247 {"locator": "urn:ipfs/QmNkt4JbkTemhUDmua7JW5NaTQWCrVZbk2EvUvhhPm9NJP/QmQr2uoXCbmC5c1vLngeE9HU1CHfF7BVG2z98JR6DQNFoU", "original_uri": "http://www.cs.odu.edu/~salam/", "mime_type": "text/html", "status_code": "200"} 4 | -------------------------------------------------------------------------------- /samples/indexes/sample-1.cdxj: -------------------------------------------------------------------------------- 1 | !context ["http://oduwsdl.github.io/contexts/cdxj"] 2 | !meta {"created_at": "2017-05-31T13:46:51.380204", "generator": "InterPlanetary Wayback 0.2017.05.31.1322"} 3 | com,yahoo,search)/mrss/ 20130411205500 {"locator": "urn:ipfs/Qmd7gw1C84eaDqEcuGTQRj5CA2jKkiEE7ZVCa9pfkrruRY/QmVFeyXN7T3265XMwTrn86tbovTHzJJeHnus2HFCwmbaYh", "mime_type": "text/html", "status_code": "302"} 4 | org,bitchmagazine)/blogs/feed/ 20130411205457 {"locator": "urn:ipfs/QmbRZEuSFqT214MNZg2iSifq7DCdc9PNbcpb3rDHAHdhyv/Qmb4mXnzpjKwZUgRXw2hupjnTdx6LK71UbCNFCZeR9FvmP", "mime_type": "application/rss+xml", "status_code": "200"} 5 | org,bitchmagazine)/blogs/girls-of-color-in-dystopia 20130411210315 {"locator": "urn:ipfs/QmQZPeLTuUs4wk3nLzoSawpHEHqnfyZSV3C654pDPBeRMy/QmRDZij54ghXrKoigU9miXjp4crcuoPnKZtSWL81q3Q2As", "mime_type": "text/html", "status_code": "200"} 6 | org,bitchmagazine)/blogs/model-media 20130411210514 {"locator": "urn:ipfs/QmYTEpDEQiM7hMF3wy8KthPBXsRwJS53ipU8gvy4nwcBeD/QmZYwpHzzUPVqE38GpB4YaxnZcAmUk5TKKSchhoZgDXWyK", "mime_type": "text/html", "status_code": "200"} 7 | org,bitchmagazine)/blogs/ms-opinionated 20130411210142 {"locator": "urn:ipfs/QmZpaxjzUoA5c697ZZeXSDPBJWyDdArEuSdc1V3TmNrrWU/QmeGinaBSjtfKmb7g15RH5azDQ5Yjs81iYvVzLud67pnqo", "mime_type": 
"text/html", "status_code": "200"} 8 | org,bitchmagazine)/category/blogs/b-sides 20130411205800 {"locator": "urn:ipfs/QmWbTzq7tGYz7KTGDJBhzGvkQrRa4rMVNrHSBfEmdvuZTv/QmWRewuPqpsGX8qXvXLYnRyWE4H9ZHjEw8htN1RAuRg8LF", "mime_type": "text/html", "status_code": "200"} 9 | org,bitchmagazine)/category/blogs/bitch-hq 20130411210006 {"locator": "urn:ipfs/QmRBFJraQYYZHiPNeZXisWGp61Nnp7uyBBXM3xQemW6UY5/QmemQsDUtqbXQou3LEJ64a3DWWEti6ngJJUArZ6GSQczsS", "mime_type": "text/html", "status_code": "200"} 10 | org,bitchmagazine)/category/blogs/movies 20130411205629 {"locator": "urn:ipfs/QmS8mqhodRPfPoeCBGbeopPreCeyDCYUtZdnFjpUfpg1Xu/Qmdbtg6RSb1X7RHzcKhz7qYpabe5VoihM2DRU4BBvvXa3b", "mime_type": "text/html", "status_code": "200"} 11 | org,bitchmagazine)/category/blogs/tv 20130411210227 {"locator": "urn:ipfs/QmaQGwurFNrkamKiXUKpP7J81mmmcUWCQhVomT3ThUUHJB/QmQjGW26gnfLqXRVKVGagKTaHWVt46WLWCXvDY3L19eaJF", "mime_type": "text/html", "status_code": "200"} 12 | org,bitchmagazine)/post/advice-what-am-i-supposed-to-do-about-my-friends-wedding-costs 20130411210018 {"locator": "urn:ipfs/QmTYoJb9EHK7eRqAGRSZnw3apZ9ncztg6NFLK8mQHmbeEx/QmcAsrZSvQj2974wZmkY2ssVMQvcX1GCuZ1DAwhLLE5TbG", "mime_type": "text/html", "status_code": "200"} 13 | org,bitchmagazine)/post/alien-boy-is-a-chilling-documentary-of-one-case-of-police-brutality 20130411205811 {"locator": "urn:ipfs/QmbHEWnkwizRuF1atyxw8HDQtnAGGgewBZvf8oS82XrrP7/Qmegwh5tYVPJ7E5DoU53Hw14jGwBu1SfsBScdg8THghBNB", "mime_type": "text/html", "status_code": "200"} 14 | org,bitchmagazine)/post/happy-birthday-frances-perkins-4-ways-to-celebrate 20130411210415 {"locator": "urn:ipfs/QmSgk2d76pyGe488GCTZftnPRoJeNp5TEGkNk3DoM9EZ1e/QmX7esNSMz7kXrDUNkFANCYebDmhsubhhmEi7gvkxck7L7", "mime_type": "text/html", "status_code": "200"} 15 | org,bitchmagazine)/post/how-does-a-15-year-old-nyc-student-write-race-gender-dystopia-truancy-review 20130411210239 {"locator": "urn:ipfs/QmQZjsTQgQfWxC8W1WnaBtAxYSSnFkdujeVPAsDS2B9M1U/QmQR4Y1at88DGA3SzDxYPsm5FWG65cNaDLJWyVzgeRGEfT", "mime_type": "text/html", "status_code": "200"} 16 | org,bitchmagazine)/post/kacey-musgraves-feminist-country-music-review 20130411205641 {"locator": "urn:ipfs/QmbR4BZqHEZC4zMC5A691FXYmZb2WuaJyqk9Q7uT38KzEj/QmQArdnCjcYfaHyX14fXPeH4fRfTiZktf1a33QE29vmMkD", "mime_type": "text/html", "status_code": "200"} 17 | org,bitchmagazine)/post/on-our-radar-todays-feminist-news-roundup-34 20130411210601 {"locator": "urn:ipfs/QmXMA8JmuLC5gGyXWnw3Ba4v88gjabTTtAFTuXvo21z26D/QmfEqTmD9GeqRpxA3rUXY7GfkJjPHVAov9oCCrT31Ft9bR", "mime_type": "text/html", "status_code": "200"} 18 | org,bitchmagazine)/post/on-our-radar-todays-feminist-news-roundup-35 20130411205858 {"locator": "urn:ipfs/QmfTUeYACwSjdhxSPCfkLPJEL4Q9T4u6bzx458LeUWB6vJ/Qmd1HmnBNcT4SZiau2FaWBmJ2dUSDUkr6W2qKm13w4jtkP", "mime_type": "text/html", "status_code": "200"} 19 | org,bitchmagazine)/post/rupauls-drag-race-comic-recap-the-six-best-moments-from-episode-ten 20130411210153 {"locator": "urn:ipfs/QmWojWLy1zKTwp7ghUtCMkWYjEHtr2sdaP2Vj3i7CEhBDa/Qmf3rp5pTfbLdm7cXpzchH1kSfu1CNvDKjxTg5R9c8xebv", "mime_type": "text/html", "status_code": "200"} 20 | org,bitchmagazine)/post/shes-the-original-i-am-divine-film-review-looks-at-a-drag-legend 20130411205518 {"locator": "urn:ipfs/QmdfP3tifVDHzuNhGAxeUbnjVbpAWd4Nm2AHfCDj8NRH3L/QmaYgsxFg7KKaHwDQL2kDCozhEPHyvaf49GGZuyiNXKWRT", "mime_type": "text/html", "status_code": "200"} 21 | org,bitchmagazine)/post/why-do-good-girls-need-to-go-bad-spring-breakers-feminist 20130411210438 {"locator": 
"urn:ipfs/QmTvobhz1b7coQ7jNAeZdeG4wnt3SfSZzduCxY2de5fAPj/Qmc8ypAcEo1sfSp7qCXyUL6FXgpJnvP7uRsD14etmhJSDL", "mime_type": "text/html", "status_code": "200"} 22 | org,bitchmagazine)/tag/advice 20130411210028 {"locator": "urn:ipfs/QmRRnyfwddmeheaYnb4JNN1B1KVjXFvH99D1t7CDtUbLys/QmX8Xd4SxUikCbRPRqMB6nf2yCttJbt4uqBeeSiX2N7vHs", "mime_type": "text/html", "status_code": "200"} 23 | org,bitchmagazine)/tag/b-sides-17 20130411205651 {"locator": "urn:ipfs/QmXzrmKFQyjU5wWt36K1HHBVQ7Pm8J1XkSfX5Xk1byq321/QmTXZUcrVJrpcwrht18QFJrLPmuURHdTrMvNySZTdytZ5e", "mime_type": "text/html", "status_code": "200"} 24 | org,bitchmagazine)/tag/britney-spears-5 20130411210449 {"locator": "urn:ipfs/QmXPsu5TwEDPLi1Fux8ZhuDe9iuJRNGiBYj8YRGikGoTS9/QmV3fL5V36DsXq67XzkSyPqitocjQi1SN3mQczWTk6D95i", "mime_type": "text/html", "status_code": "200"} 25 | org,bitchmagazine)/tag/comics-10 20130411210204 {"locator": "urn:ipfs/QmedTdN5A2yuECnh3BnPgbJ8JPUZQrJXvmgx1xudH4z8Gk/QmSvxn4JGb4fHQvmrHwQ9JHLTiLk5uiF6E9JhdNQfeDGk1", "mime_type": "text/html", "status_code": "200"} 26 | org,bitchmagazine)/tag/country-music 20130411205704 {"locator": "urn:ipfs/QmVJwU4Kf5KgcVdcxKhMnGxxgUuZBnkNaAy751fgeimMbQ/Qmb5qjWugTdvfgqPHr5Ve4p82vGAHg28jdAi6phsJcqCA8", "mime_type": "text/html", "status_code": "200"} 27 | org,bitchmagazine)/tag/cultural-appropriation-5 20130411210619 {"locator": "urn:ipfs/QmeL4dJuMu8dJpKCjGTK1SFnWuydNyCfD5yFHYbMD8obrs/QmebVrHc5mS8T3TpH1fQEnHsWSpw9iBv6Yu5YRyWihMvoq", "mime_type": "text/html", "status_code": "200"} 28 | org,bitchmagazine)/tag/cyberbullying 20130411210650 {"locator": "urn:ipfs/QmYqngyzz5EXwib3Y1AbytYukH2eK1Z3GBcFLD6TLBoyoK/QmQwAENGNWuWD7t7khhJpyX6zhBpnfCheuqD8ThCuCqB7k", "mime_type": "text/html", "status_code": "200"} 29 | org,bitchmagazine)/tag/divine 20130411205530 {"locator": "urn:ipfs/QmdZ9fLeAgEXysTooj34Dy5hy79y9341qnzypRNkxaVVin/QmSfY3VkGMxA66Z1W9QRdyNfzYumauNTMHSEArPkcUcjDp", "mime_type": "text/html", "status_code": "200"} 30 | org,bitchmagazine)/tag/drag 20130411205541 {"locator": "urn:ipfs/QmbJWoGrcvev7xCy6xB37WZhYX8zYiXDcbgQESykwgaxqn/QmWLfqTLa4DiV1imrmKMeudU9hWRh79Vd3xaiEFoFMmNpp", "mime_type": "text/html", "status_code": "200"} 31 | org,bitchmagazine)/tag/dsm 20130411205908 {"locator": "urn:ipfs/QmW351m8A7YBeiyT397GZKtWpx2m4GA96qiStXuXiNoAAc/QmZSbtgaHJURB5QtAnZ83PvkUDnJf9uS22kYpXpMHZf38R", "mime_type": "text/html", "status_code": "200"} 32 | org,bitchmagazine)/tag/dystopian 20130411210251 {"locator": "urn:ipfs/QmSMPyGUx7yeH5iGvstHqB9rPU7Zt5QNdD23ikVnvL9sST/QmRPAmDbP4TCndrPU6ejb9CC8di3TyY5b4w78bTEo2pbRQ", "mime_type": "text/html", "status_code": "200"} 33 | org,bitchmagazine)/tag/education-7 20130411210303 {"locator": "urn:ipfs/QmTbEQYgF9fj5p4Msh6Zid8JgkX9haW8wH8616M1EL1FUd/QmWmYpM5hqzaA3wRyoz142eo7osnEKrcPKM1mdGWZQT8Uy", "mime_type": "text/html", "status_code": "200"} 34 | org,bitchmagazine)/tag/femen 20130411205920 {"locator": "urn:ipfs/QmdktHZfzRhYSKCZTJAHh4MtCteNHWxmAmxFtmuXMtQfe9/Qmc2mJpZwNPiiW1zbDXHotnpmFeUeyE4XY4L3jsKLfDJXv", "mime_type": "text/html", "status_code": "200"} 35 | org,bitchmagazine)/tag/frances-perkins 20130411210426 {"locator": "urn:ipfs/QmNYmNf6BYZhAf4djovDJNyjY9c5tawoybPqLmhEZnVvBZ/QmYNBmDgaoTzkQT24Zzy69Cpa1jdLwQHiTh8DYucx6kGsU", "mime_type": "text/html", "status_code": "200"} 36 | org,bitchmagazine)/tag/friendship 20130411210040 {"locator": "urn:ipfs/QmWQJqhSVJfpz5CiQ6bauXziRAfreVUNYXP5Lxac5BWtXJ/QmZXz8iPkpgWDrhzcm1AwraXw5NYAgPYkN7qCL7HjttEPg", "mime_type": "text/html", "status_code": "200"} 37 | org,bitchmagazine)/tag/guns-in-school 20130411210714 {"locator": 
"urn:ipfs/QmRv67U7dAgahzkfJMNd27XC73fB1aAKqPV2eQ88RGVvZK/QmcT5NV6cZ32UCgZvez1vzYKKj1FBao1rVJm56CFpK9nAB", "mime_type": "text/html", "status_code": "200"} 38 | org,bitchmagazine)/tag/homonationalism 20130411205931 {"locator": "urn:ipfs/QmeEuGHHQKYL3CrDhWWajT3kWiXGSGe24uv7xu1CVNa5La/QmarmFSWRd3zw9PV5mNf36XH18Hn3ka2x976GWFQfW2vdh", "mime_type": "text/html", "status_code": "200"} 39 | org,bitchmagazine)/tag/immigration-reform-0 20130411205942 {"locator": "urn:ipfs/QmbB33R4K2g5ut6dCwVsxVsmexFR5kyarSftLDfJCUGFyu/QmY8WN2kwcQZWnp8BfRJsKHWxnBr2qTRWtiFoVME1XxPBr", "mime_type": "text/html", "status_code": "200"} 40 | org,bitchmagazine)/tag/islamophobia 20130411205954 {"locator": "urn:ipfs/QmNcMAgERJiRRCSuTNZW4zy9Q1r8iEFkJFJFUNb19U4fQb/QmaqJjeYSxy2eT4kPbxKf2CA2tHYx8TJmsTu7xF8rcBEjP", "mime_type": "text/html", "status_code": "200"} 41 | org,bitchmagazine)/tag/jennifer-lawrence 20130411210501 {"locator": "urn:ipfs/QmRqXfnYc5dJRk7mwhXgPtEqTcnAeU6VpJBgjyD9qeKrM4/QmPGNRxqF8xgzkphbtY1CKGofmwGNbD9bi9xcs1SxSwC71", "mime_type": "text/html", "status_code": "200"} 42 | org,bitchmagazine)/tag/john-waters 20130411205554 {"locator": "urn:ipfs/QmbMBDFmbrTSD41f9hC5v1RJbrhma1jM5mKMEtew4Yb1Xz/QmXknTU3vN2A71Ufbyz2cj3z4NizF3vaMq6bd4esxBbkG4", "mime_type": "text/html", "status_code": "200"} 43 | org,bitchmagazine)/tag/kacey-musgraves 20130411205716 {"locator": "urn:ipfs/QmcVdfG7MDYb9WN1V7q7DEYDZGnBd4KaxWeHKTeF1WtrD7/QmULm5TVoUNhJDZA2i2JE2L6y6hzMUNtMQUowGmxprsRXf", "mime_type": "text/html", "status_code": "200"} 44 | org,bitchmagazine)/tag/marriage-6 20130411210053 {"locator": "urn:ipfs/QmVgj1Dq4guw31MG4GNSbvmYRrZNHk8QqoCR2ny2myGf2V/QmaRvjJ5NqC6HR1vzWVHBVgdkxzpN7a9NY8uoL4moRhYHF", "mime_type": "text/html", "status_code": "200"} 45 | org,bitchmagazine)/tag/mental-health-5 20130411205822 {"locator": "urn:ipfs/QmaTPGMGHud6L5J7AVEY3xwjLRDWAvzKN56FieDFC3nVaM/QmWA7iRFredVWJ27eJ1BYW4XTKhLdKgvUwCzmymue2CjG1", "mime_type": "text/html", "status_code": "200"} 46 | org,bitchmagazine)/tag/money 20130411210105 {"locator": "urn:ipfs/QmeC2X9KQep4cVVWpCUdPfCvdcRuHKr7i8k4Si4wpuSj3Y/QmU5uTZxoD6ME3g5V1H9C76HHYWoAj9n4BwLVsDxduoMa4", "mime_type": "text/html", "status_code": "200"} 47 | org,bitchmagazine)/tag/ms-opinionated 20130411210117 {"locator": "urn:ipfs/QmY1GkmPH1TTdfs6tBtuEoBuKDngigGkhp6KNGQcjNvWur/Qmd2pMYRBdvkmEgR9m6StfarB4Yu4JMy6gsPjZ7kp4ewhr", "mime_type": "text/html", "status_code": "200"} 48 | org,bitchmagazine)/tag/nashville 20130411205727 {"locator": "urn:ipfs/QmenNtDgWKBucSADn6FodVG41i1MwDkmAX7sThZpDbUExu/QmcLdsDRBtMgGsS5XF5ewUgr6ZmuCS8SFtdpXQY4BQdMx5", "mime_type": "text/html", "status_code": "200"} 49 | org,bitchmagazine)/tag/new-york-city-0 20130411210327 {"locator": "urn:ipfs/QmZLNMHdCK9U3UCoGWKkLut13vFhkGBXgQfvgSkfZjP6aB/QmT32hSYXxgtBgtJ464CwPHregXAfhTmwsSGioNxd8hc4e", "mime_type": "text/html", "status_code": "200"} 50 | org,bitchmagazine)/tag/police 20130411205834 {"locator": "urn:ipfs/QmU5sjixMBhtGeQotCSuyAufF9GLcjAx8Sg8vTfct9L9Nz/QmZPEhvUaQnTagH82h2mN6yTedUuijBPZrV1cid7bMXXit", "mime_type": "text/html", "status_code": "200"} 51 | org,bitchmagazine)/tag/police-brutality 20130411205846 {"locator": "urn:ipfs/QmfHHJXjfAdPUVFwxaowBbPMwPDypuKxFMzMsgaDjjpH2t/QmZzt2p4jdHZaQirDipvcZNANWxwBn88S92uYaSNcXGUF2", "mime_type": "text/html", "status_code": "200"} 52 | org,bitchmagazine)/tag/privilege-6 20130411210725 {"locator": "urn:ipfs/QmcEQZwesEXSteWgTbviug1xzkKkeytATxqXibFxB4rLWr/QmXZrxXw4bLhVTPBTWu7mek6o9uGmWH8L3Rvn5NpDWp2jf", "mime_type": "text/html", "status_code": "200"} 53 | org,bitchmagazine)/tag/queer-10 20130411205605 
{"locator": "urn:ipfs/QmTPoFr2hiwgGVRbVYTCBueXQa9zLuhNsng8E2ycgMvML9/QmZA1NxmDSt4WuDy1ZMy5GZmBdmUBLHfXHny8TM33K783T", "mime_type": "text/html", "status_code": "200"} 54 | org,bitchmagazine)/tag/queer-artists 20130411205617 {"locator": "urn:ipfs/QmUaBE6HCAHZNjpGa58PA4NLgpVZ4z4scAZuUP31nJZQa1/QmSAitxMUgGeKhKgzdfUw75GCfwWA1CJhse1Fp9pdMjuWp", "mime_type": "text/html", "status_code": "200"} 55 | org,bitchmagazine)/tag/rape-11 20130411210737 {"locator": "urn:ipfs/QmcMKnhb3EeWxgf8QoGQ4Jnv54hJGnRnXWL4ct5X1MKm2B/QmYvyfKVjatvU4QiCPGFSNiVi9hwnFHeGZsgygRg92xp4b", "mime_type": "text/html", "status_code": "200"} 56 | org,bitchmagazine)/tag/rupauls-drag-race 20130411210216 {"locator": "urn:ipfs/QmeWbeTjFVdEo9dhDKxYHQmBsevidbDarjoJTB1ekvFyMB/QmRTGfAs9WpWcgfWTLotVLLRfvWhnxYSwsSf9rARZ87xXT", "mime_type": "text/html", "status_code": "200"} 57 | org,bitchmagazine)/tag/school 20130411210339 {"locator": "urn:ipfs/QmaRuTtbJik5AP6o4iu6bgfBxv3JPCVwyqTidSQ57z94FZ/QmVskdWmJ9eBFpVKnfvEJmeJof3L5Gx5dV2eDUVqWjpd6U", "mime_type": "text/html", "status_code": "200"} 58 | org,bitchmagazine)/tag/science-fiction-2 20130411210351 {"locator": "urn:ipfs/QmV6CYahFP1hDnoLSE3nok9jcDZwe1EygYKtLpeUupe9jY/QmUcyiNxTmAeBeKTBtTuv9yQ85wi2F13Q1jgJCaWihn4Ye", "mime_type": "text/html", "status_code": "200"} 59 | org,bitchmagazine)/tag/selena-gomez 20130411210526 {"locator": "urn:ipfs/Qmee9gtabGdudpvLSCtpmKELPQ7hLyQ7DQgpCo2BvcEiwX/Qmf93v8QkHnbG9WJdsh3KYfHFkbvKMtVbL3HEHvp8RgaZs", "mime_type": "text/html", "status_code": "200"} 60 | org,bitchmagazine)/tag/spring-breakers 20130411210537 {"locator": "urn:ipfs/QmdNmknW4QnmrWcVu1vS9LV5Zb3J7sNbeQtx2U8oNXA15R/QmXsk8RcfDsvS7JyhKod1zeW4un1aLKX8dgVVrvQTVn8AD", "mime_type": "text/html", "status_code": "200"} 61 | org,bitchmagazine)/tag/talib-kweli 20130411210750 {"locator": "urn:ipfs/Qma3wd4J5Eb7Y9hRKMjkQp6BGEfAnyd3THvUrPGfyMfLrX/QmZ8tRidCtHy5v9cBKqbcTLzFvXn57dUctgbvw595jBn88", "mime_type": "text/html", "status_code": "200"} 62 | org,bitchmagazine)/tag/teenagers 20130411210403 {"locator": "urn:ipfs/QmVPoKgpiB5onLc6QYKFY9GWyJNeNnU5XCTSU9hgbEcEPb/QmRjt3fFi8eXuS7F6iKYYz27tq8XN9osZWBK9aDMrFoLHE", "mime_type": "text/html", "status_code": "200"} 63 | org,bitchmagazine)/tag/texas-0 20130411205738 {"locator": "urn:ipfs/QmXMpqHeTkqmbdVxFAbhCLjXcJkyhgKCN8zWR3humrBWh2/QmTTREW35gmVmHrHTJFg1aSwFJYj8ot3dPY2MmeS7eVEdN", "mime_type": "text/html", "status_code": "200"} 64 | org,bitchmagazine)/tag/twerking 20130411210802 {"locator": "urn:ipfs/QmNmWFBmqx7tMj8Ziq378zSJQuw7AtiGSwef1X7sn9XUM8/QmaitjMnqud8AceDCAWLh1FdWM1PAZEosZKLtUZEsVvpuk", "mime_type": "text/html", "status_code": "200"} 65 | org,bitchmagazine)/tag/vanessa-hudgens 20130411210549 {"locator": "urn:ipfs/QmVbT893dJcgeFM6QBk8GH4XBFpuwRTSiZvN5G8DwmsXFF/QmTmCnJJoyro4ZejRhHupi7UqmG4XnmJRfPiV9Lf2VyPh9", "mime_type": "text/html", "status_code": "200"} 66 | org,bitchmagazine)/tag/weddings-0 20130411210130 {"locator": "urn:ipfs/Qme2DzAbQk7WEvKdqeH6sbMyXfc2XFMfg5jmj5ie9Q6nyr/QmdrDKsKyHTcyDgfcUBC59hEk39HNuwg1ojM4gZxFoBZEB", "mime_type": "text/html", "status_code": "200"} 67 | org,bitchmagazine)/tag/women-in-country 20130411205750 {"locator": "urn:ipfs/QmVvshFbbnphak1W2muiGW3u2Rz1Js8hmKbE9b1jTJdXJ3/QmecpTm95xdZL9XitZ7YFHsSMNXqkrvtzkyyYXda54R7Uc", "mime_type": "text/html", "status_code": "200"} 68 | -------------------------------------------------------------------------------- /samples/indexes/sample-2.cdxj: -------------------------------------------------------------------------------- 1 | edu,odu,cs)/~salam 20160305192247 {"locator": 
"urn:ipfs/QmNkt4JbkTemhUDmua7JW5NaTQWCrVZbk2EvUvhhPm9NJP/QmQr2uoXCbmC5c1vLngeE9HU1CHfF7BVG2z98JR6DQNFoU", "mime_type": "text/html", "status_code": "200"} 2 | -------------------------------------------------------------------------------- /samples/indexes/sample-encrypted.cdxj: -------------------------------------------------------------------------------- 1 | edu,odu,cs)/~salam 20160305192247 {"locator": "urn:ipfs/QmeVWGtnfuJ1QnpmtKKnyArVgEpq7v31kktEfh6c8mDiXE/QmZWKQRBNXNrVZ69LoGpMNJi5NU66gDhnGtQukWJepv7Kr", "encryption_method": "xor", "encryption_key": "radon", "mime_type": "text/html", "status_code": "200"} 2 | -------------------------------------------------------------------------------- /samples/warcs/1memento.warc: -------------------------------------------------------------------------------- 1 | WARC/1.0 2 | WARC-Type: warcinfo 3 | WARC-Date: 2017-02-18T10:00:00Z 4 | WARC-Filename: ipwb-memento.warc 5 | WARC-Record-ID: 6 | Content-Type: application/warc-fields 7 | Content-Length: 238 8 | 9 | software: Fabricated 10 | ip: 127.0.0.1 11 | hostname: localhost 12 | format: WARC File Format 1.0 13 | conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf 14 | description: SampleCrawl 15 | robots: ignore 16 | http-header-user-agent: WARCFab/1.0 17 | 18 | 19 | 20 | WARC/1.0 21 | WARC-Type: response 22 | WARC-Target-URI: http://memento.us/ 23 | WARC-Date: 2013-02-02T10:00:00Z 24 | WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 25 | WARC-Record-ID: 26 | Content-Type: application/http; msgtype=response 27 | Content-Length: 186 28 | 29 | HTTP/1.1 200 OK 30 | Server: nginx 31 | Date: Mon, 30 Jan 2017 18:39:49 GMT 32 | Content-Type: text/html 33 | Connection: close 34 | Vary: Accept-Encoding 35 | 36 | Memento for 2/2/2013 10:00am 37 | 38 | 39 | 40 | 41 | 42 | WARC/1.0 43 | WARC-Type: response 44 | WARC-Target-URI: http://someotherURI.us/ 45 | WARC-Date: 2016-12-31T11:00:00Z 46 | WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 47 | WARC-Record-ID: 48 | Content-Type: application/http; msgtype=response 49 | Content-Length: 170 50 | 51 | HTTP/1.1 200 OK 52 | Server: nginx 53 | Date: Mon, 30 Jan 2017 18:39:49 GMT 54 | Content-Type: text/html 55 | Connection: close 56 | Vary: Accept-Encoding 57 | 58 | SomeotherURI 59 | 60 | 61 | 62 | 63 | 64 | WARC/1.0 65 | WARC-Type: response 66 | WARC-Target-URI: http://anothersite.us/ 67 | WARC-Date: 2016-12-31T11:00:00Z 68 | WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 69 | WARC-Record-ID: 70 | Content-Type: application/http; msgtype=response 71 | Content-Length: 170 72 | 73 | HTTP/1.1 200 OK 74 | Server: nginx 75 | Date: Mon, 30 Jan 2017 18:39:49 GMT 76 | Content-Type: text/html 77 | Connection: close 78 | Vary: Accept-Encoding 79 | 80 | Another site 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /samples/warcs/1memento_noContentType.warc: -------------------------------------------------------------------------------- 1 | WARC/1.0 2 | WARC-Type: warcinfo 3 | WARC-Date: 2017-02-18T10:00:00Z 4 | WARC-Filename: ipwb-memento.warc 5 | WARC-Record-ID: 6 | Content-Type: application/warc-fields 7 | Content-Length: 238 8 | 9 | software: Fabricated 10 | ip: 127.0.0.1 11 | hostname: localhost 12 | format: WARC File Format 1.0 13 | conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf 14 | description: SampleCrawl 15 | robots: ignore 16 | http-header-user-agent: WARCFab/1.0 17 | 18 | 19 | 20 | WARC/1.0 21 | WARC-Type: response 22 | 
WARC-Target-URI: http://memento.us/ 23 | WARC-Date: 2013-02-02T10:00:00Z 24 | WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 25 | WARC-Record-ID: 26 | Content-Type: application/http; msgtype=response 27 | Content-Length: 215 28 | 29 | HTTP/1.1 200 OK 30 | Server: nginx 31 | Date: Mon, 27 Aug 2018 16:55:49 GMT 32 | Connection: close 33 | Vary: Accept-Encoding 34 | 35 | I have no content type!Memento for 2/2/2013 10:00am 36 | 37 | 38 | -------------------------------------------------------------------------------- /samples/warcs/2mementos.warc: -------------------------------------------------------------------------------- 1 | WARC/1.0 2 | WARC-Type: warcinfo 3 | WARC-Date: 2017-02-18T10:00:00Z 4 | WARC-Filename: ipwb-memento.warc 5 | WARC-Record-ID: 6 | Content-Type: application/warc-fields 7 | Content-Length: 238 8 | 9 | software: Fabricated 10 | ip: 127.0.0.1 11 | hostname: localhost 12 | format: WARC File Format 1.0 13 | conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf 14 | description: SampleCrawl 15 | robots: ignore 16 | http-header-user-agent: WARCFab/1.0 17 | 18 | 19 | 20 | WARC/1.0 21 | WARC-Type: response 22 | WARC-Target-URI: http://memento.us/ 23 | WARC-Date: 2013-02-02T10:00:00Z 24 | WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 25 | WARC-Record-ID: 26 | Content-Type: application/http; msgtype=response 27 | Content-Length: 186 28 | 29 | HTTP/1.1 200 OK 30 | Server: nginx 31 | Date: Mon, 30 Jan 2017 18:39:49 GMT 32 | Content-Type: text/html 33 | Connection: close 34 | Vary: Accept-Encoding 35 | 36 | Memento for 2/2/2013 10:00am 37 | 38 | 39 | 40 | WARC/1.0 41 | WARC-Type: response 42 | WARC-Target-URI: http://memento.us/ 43 | WARC-Date: 2016-12-31T11:00:00Z 44 | WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 45 | WARC-Record-ID: 46 | Content-Type: application/http; msgtype=response 47 | Content-Length: 187 48 | 49 | HTTP/1.1 200 OK 50 | Server: nginx 51 | Date: Mon, 30 Jan 2017 18:39:49 GMT 52 | Content-Type: text/html 53 | Connection: close 54 | Vary: Accept-Encoding 55 | 56 | Memento for 12/31/2016 11:00am 57 | 58 | 59 | 60 | WARC/1.0 61 | WARC-Type: response 62 | WARC-Target-URI: http://someotherURI.us/ 63 | WARC-Date: 2016-12-31T11:00:00Z 64 | WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 65 | WARC-Record-ID: 66 | Content-Type: application/http; msgtype=response 67 | Content-Length: 170 68 | 69 | HTTP/1.1 200 OK 70 | Server: nginx 71 | Date: Mon, 30 Jan 2017 18:39:49 GMT 72 | Content-Type: text/html 73 | Connection: close 74 | Vary: Accept-Encoding 75 | 76 | SomeotherURI 77 | 78 | 79 | 80 | 81 | 82 | WARC/1.0 83 | WARC-Type: response 84 | WARC-Target-URI: http://anothersite.us/ 85 | WARC-Date: 2016-12-31T11:00:00Z 86 | WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 87 | WARC-Record-ID: 88 | Content-Type: application/http; msgtype=response 89 | Content-Length: 170 90 | 91 | HTTP/1.1 200 OK 92 | Server: nginx 93 | Date: Mon, 30 Jan 2017 18:39:49 GMT 94 | Content-Type: text/html 95 | Connection: close 96 | Vary: Accept-Encoding 97 | 98 | Another site 99 | 100 | 101 | 102 | -------------------------------------------------------------------------------- /samples/warcs/2mementos_htmlXhtml.warc: -------------------------------------------------------------------------------- 1 | WARC/1.0 2 | WARC-Type: warcinfo 3 | WARC-Date: 2017-02-18T10:00:00Z 4 | WARC-Filename: ipwb-memento.warc 5 | WARC-Record-ID: 6 | Content-Type: application/warc-fields 7 | Content-Length: 238 8 | 9 | software: 
Fabricated 10 | ip: 127.0.0.1 11 | hostname: localhost 12 | format: WARC File Format 1.0 13 | conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf 14 | description: SampleCrawl 15 | robots: ignore 16 | http-header-user-agent: WARCFab/1.0 17 | 18 | 19 | 20 | WARC/1.0 21 | WARC-Type: response 22 | WARC-Target-URI: http://htmlsite.com 23 | WARC-Date: 2013-02-02T10:00:00Z 24 | WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 25 | WARC-Record-ID: 26 | Content-Type: application/http; msgtype=response 27 | Content-Length: 169 28 | 29 | HTTP/1.1 200 OK 30 | Server: nginx 31 | Date: Mon, 30 Jan 2017 18:39:49 GMT 32 | Content-Type: text/html 33 | Connection: close 34 | Vary: Accept-Encoding 35 | 36 | HTML Memento 37 | 38 | 39 | 40 | WARC/1.0 41 | WARC-Type: response 42 | WARC-Target-URI: https://xhtml-website.org 43 | WARC-Date: 2009-01-10T11:00:01Z 44 | WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 45 | WARC-Record-ID: 46 | Content-Type: application/http; msgtype=response 47 | Content-Length: 403 48 | 49 | HTTP/1.1 200 OK 50 | Server: nginx 51 | Date: Mon, 10 Jan 2009 11:00:01 GMT 52 | Content-Type: application/xhtml+xml 53 | Connection: close 54 | 55 | 57 | 58 | An XHTML Document 59 | 60 |

An XHTML Document Body
      61 | 62 | 63 | 64 | 65 | -------------------------------------------------------------------------------- /samples/warcs/2mementos_queryString.warc: -------------------------------------------------------------------------------- 1 | WARC/1.0 2 | WARC-Type: warcinfo 3 | WARC-Date: 2017-02-18T10:00:00Z 4 | WARC-Filename: ipwb-memento.warc 5 | WARC-Record-ID: 6 | Content-Type: application/warc-fields 7 | Content-Length: 238 8 | 9 | software: Fabricated 10 | ip: 127.0.0.1 11 | hostname: localhost 12 | format: WARC File Format 1.0 13 | conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf 14 | description: SampleCrawl 15 | robots: ignore 16 | http-header-user-agent: WARCFab/1.0 17 | 18 | 19 | 20 | WARC/1.0 21 | WARC-Type: response 22 | WARC-Target-URI: http://memento.us/index.php?someval=lorem&anotherval=ipsum 23 | WARC-Date: 2013-02-02T10:00:00Z 24 | WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 25 | WARC-Record-ID: 26 | Content-Type: application/http; msgtype=response 27 | Content-Length: 186 28 | 29 | HTTP/1.1 200 OK 30 | Server: nginx 31 | Date: Mon, 30 Jan 2017 18:39:49 GMT 32 | Content-Type: text/html 33 | Connection: close 34 | Vary: Accept-Encoding 35 | 36 | Memento for 2/2/2013 10:00am 37 | 38 | 39 | 40 | WARC/1.0 41 | WARC-Type: response 42 | WARC-Target-URI: http://memento.us/ 43 | WARC-Date: 2016-12-31T11:00:00Z 44 | WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 45 | WARC-Record-ID: 46 | Content-Type: application/http; msgtype=response 47 | Content-Length: 187 48 | 49 | HTTP/1.1 200 OK 50 | Server: nginx 51 | Date: Mon, 30 Jan 2017 18:39:49 GMT 52 | Content-Type: text/html 53 | Connection: close 54 | Vary: Accept-Encoding 55 | 56 | Memento for 12/31/2016 11:00am 57 | 58 | 59 | -------------------------------------------------------------------------------- /samples/warcs/3mementos.warc: -------------------------------------------------------------------------------- 1 | WARC/1.0 2 | WARC-Type: warcinfo 3 | WARC-Date: 2017-02-18T10:00:00Z 4 | WARC-Filename: ipwb-memento.warc 5 | WARC-Record-ID: 6 | Content-Type: application/warc-fields 7 | Content-Length: 238 8 | 9 | software: Fabricated 10 | ip: 127.0.0.1 11 | hostname: localhost 12 | format: WARC File Format 1.0 13 | conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf 14 | description: SampleCrawl 15 | robots: ignore 16 | http-header-user-agent: WARCFab/1.0 17 | 18 | 19 | 20 | WARC/1.0 21 | WARC-Type: response 22 | WARC-Target-URI: http://memento.us/ 23 | WARC-Date: 2014-01-15T10:15:00Z 24 | WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 25 | WARC-Record-ID: 26 | Content-Type: application/http; msgtype=response 27 | Content-Length: 186 28 | 29 | HTTP/1.1 200 OK 30 | Server: nginx 31 | Date: Mon, 30 Jan 2017 18:39:49 GMT 32 | Content-Type: text/html 33 | Connection: close 34 | Vary: Accept-Encoding 35 | 36 | Memento for 1/15/2014 10:15am 37 | 38 | 39 | 40 | WARC/1.0 41 | WARC-Type: response 42 | WARC-Target-URI: http://memento.us/ 43 | WARC-Date: 2013-02-02T10:00:00Z 44 | WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 45 | WARC-Record-ID: 46 | Content-Type: application/http; msgtype=response 47 | Content-Length: 186 48 | 49 | HTTP/1.1 200 OK 50 | Server: nginx 51 | Date: Mon, 30 Jan 2017 18:39:49 GMT 52 | Content-Type: text/html 53 | Connection: close 54 | Vary: Accept-Encoding 55 | 56 | Memento for 2/2/2013 10:00am 57 | 58 | 59 | 60 | WARC/1.0 61 | WARC-Type: response 62 | WARC-Target-URI: http://memento.us/ 63 | WARC-Date: 
2016-12-31T11:00:00Z 64 | WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 65 | WARC-Record-ID: 66 | Content-Type: application/http; msgtype=response 67 | Content-Length: 187 68 | 69 | HTTP/1.1 200 OK 70 | Server: nginx 71 | Date: Mon, 30 Jan 2017 18:39:49 GMT 72 | Content-Type: text/html 73 | Connection: close 74 | Vary: Accept-Encoding 75 | 76 | Memento for 12/31/2016 11:00am 77 | 78 | 79 | 80 | WARC/1.0 81 | WARC-Type: response 82 | WARC-Target-URI: http://someotherURI.us/ 83 | WARC-Date: 2016-12-31T11:00:00Z 84 | WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 85 | WARC-Record-ID: 86 | Content-Type: application/http; msgtype=response 87 | Content-Length: 170 88 | 89 | HTTP/1.1 200 OK 90 | Server: nginx 91 | Date: Mon, 30 Jan 2017 18:39:49 GMT 92 | Content-Type: text/html 93 | Connection: close 94 | Vary: Accept-Encoding 95 | 96 | SomeotherURI 97 | 98 | 99 | 100 | 101 | 102 | WARC/1.0 103 | WARC-Type: response 104 | WARC-Target-URI: http://anothersite.us/ 105 | WARC-Date: 2016-12-31T11:00:00Z 106 | WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 107 | WARC-Record-ID: 108 | Content-Type: application/http; msgtype=response 109 | Content-Length: 170 110 | 111 | HTTP/1.1 200 OK 112 | Server: nginx 113 | Date: Mon, 30 Jan 2017 18:39:49 GMT 114 | Content-Type: text/html 115 | Connection: close 116 | Vary: Accept-Encoding 117 | 118 | Another site 119 | 120 | 121 | 122 | -------------------------------------------------------------------------------- /samples/warcs/4mementos.warc: -------------------------------------------------------------------------------- 1 | WARC/1.0 2 | WARC-Type: warcinfo 3 | WARC-Date: 2017-02-18T10:00:00Z 4 | WARC-Filename: ipwb-memento.warc 5 | WARC-Record-ID: 6 | Content-Type: application/warc-fields 7 | Content-Length: 238 8 | 9 | software: Fabricated 10 | ip: 127.0.0.1 11 | hostname: localhost 12 | format: WARC File Format 1.0 13 | conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf 14 | description: SampleCrawl 15 | robots: ignore 16 | http-header-user-agent: WARCFab/1.0 17 | 18 | 19 | 20 | WARC/1.0 21 | WARC-Type: response 22 | WARC-Target-URI: http://memento.us/ 23 | WARC-Date: 2014-01-14T10:00:00Z 24 | WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 25 | WARC-Record-ID: 26 | Content-Type: application/http; msgtype=response 27 | Content-Length: 186 28 | 29 | HTTP/1.1 200 OK 30 | Server: nginx 31 | Date: Mon, 30 Jan 2017 18:39:49 GMT 32 | Content-Type: text/html 33 | Connection: close 34 | Vary: Accept-Encoding 35 | 36 | Memento for 1/14/2014 10:00am 37 | 38 | 39 | 40 | 41 | WARC/1.0 42 | WARC-Type: response 43 | WARC-Target-URI: http://memento.us/ 44 | WARC-Date: 2014-01-15T10:15:00Z 45 | WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 46 | WARC-Record-ID: 47 | Content-Type: application/http; msgtype=response 48 | Content-Length: 186 49 | 50 | HTTP/1.1 200 OK 51 | Server: nginx 52 | Date: Mon, 30 Jan 2017 18:39:49 GMT 53 | Content-Type: text/html 54 | Connection: close 55 | Vary: Accept-Encoding 56 | 57 | Memento for 1/15/2014 10:15am 58 | 59 | 60 | 61 | WARC/1.0 62 | WARC-Type: response 63 | WARC-Target-URI: http://memento.us/ 64 | WARC-Date: 2013-02-02T10:00:00Z 65 | WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 66 | WARC-Record-ID: 67 | Content-Type: application/http; msgtype=response 68 | Content-Length: 186 69 | 70 | HTTP/1.1 200 OK 71 | Server: nginx 72 | Date: Mon, 30 Jan 2017 18:39:49 GMT 73 | Content-Type: text/html 74 | Connection: close 75 | Vary: 
Accept-Encoding 76 | 77 | Memento for 2/2/2013 10:00am 78 | 79 | 80 | 81 | WARC/1.0 82 | WARC-Type: response 83 | WARC-Target-URI: http://memento.us/ 84 | WARC-Date: 2016-12-31T11:00:00Z 85 | WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 86 | WARC-Record-ID: 87 | Content-Type: application/http; msgtype=response 88 | Content-Length: 187 89 | 90 | HTTP/1.1 200 OK 91 | Server: nginx 92 | Date: Mon, 30 Jan 2017 18:39:49 GMT 93 | Content-Type: text/html 94 | Connection: close 95 | Vary: Accept-Encoding 96 | 97 | Memento for 12/31/2016 11:00am 98 | 99 | 100 | 101 | WARC/1.0 102 | WARC-Type: response 103 | WARC-Target-URI: http://someotherURI.us/ 104 | WARC-Date: 2016-12-31T11:00:00Z 105 | WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 106 | WARC-Record-ID: 107 | Content-Type: application/http; msgtype=response 108 | Content-Length: 170 109 | 110 | HTTP/1.1 200 OK 111 | Server: nginx 112 | Date: Mon, 30 Jan 2017 18:39:49 GMT 113 | Content-Type: text/html 114 | Connection: close 115 | Vary: Accept-Encoding 116 | 117 | SomeotherURI 118 | 119 | 120 | 121 | 122 | 123 | WARC/1.0 124 | WARC-Type: response 125 | WARC-Target-URI: http://anothersite.us/ 126 | WARC-Date: 2016-12-31T11:00:00Z 127 | WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 128 | WARC-Record-ID: 129 | Content-Type: application/http; msgtype=response 130 | Content-Length: 170 131 | 132 | HTTP/1.1 200 OK 133 | Server: nginx 134 | Date: Mon, 30 Jan 2017 18:39:49 GMT 135 | Content-Type: text/html 136 | Connection: close 137 | Vary: Accept-Encoding 138 | 139 | Another site 140 | 141 | 142 | 143 | -------------------------------------------------------------------------------- /samples/warcs/5mementos.warc: -------------------------------------------------------------------------------- 1 | WARC/1.0 2 | WARC-Type: warcinfo 3 | WARC-Date: 2017-02-18T10:00:00Z 4 | WARC-Filename: ipwb-memento.warc 5 | WARC-Record-ID: 6 | Content-Type: application/warc-fields 7 | Content-Length: 238 8 | 9 | software: Fabricated 10 | ip: 127.0.0.1 11 | hostname: localhost 12 | format: WARC File Format 1.0 13 | conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf 14 | description: SampleCrawl 15 | robots: ignore 16 | http-header-user-agent: WARCFab/1.0 17 | 18 | 19 | 20 | WARC/1.0 21 | WARC-Type: response 22 | WARC-Target-URI: http://memento.us/ 23 | WARC-Date: 2014-01-14T10:00:00Z 24 | WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 25 | WARC-Record-ID: 26 | Content-Type: application/http; msgtype=response 27 | Content-Length: 186 28 | 29 | HTTP/1.1 200 OK 30 | Server: nginx 31 | Date: Mon, 30 Jan 2017 18:39:49 GMT 32 | Content-Type: text/html 33 | Connection: close 34 | Vary: Accept-Encoding 35 | 36 | Memento for 1/14/2014 10:00am 37 | 38 | 39 | 40 | 41 | WARC/1.0 42 | WARC-Type: response 43 | WARC-Target-URI: http://memento.us/ 44 | WARC-Date: 2014-01-15T10:15:00Z 45 | WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 46 | WARC-Record-ID: 47 | Content-Type: application/http; msgtype=response 48 | Content-Length: 186 49 | 50 | HTTP/1.1 200 OK 51 | Server: nginx 52 | Date: Mon, 30 Jan 2017 18:39:49 GMT 53 | Content-Type: text/html 54 | Connection: close 55 | Vary: Accept-Encoding 56 | 57 | Memento for 1/15/2014 10:15am 58 | 59 | 60 | 61 | WARC/1.0 62 | WARC-Type: response 63 | WARC-Target-URI: http://memento.us/ 64 | WARC-Date: 2013-02-02T10:00:00Z 65 | WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 66 | WARC-Record-ID: 67 | Content-Type: application/http; 
msgtype=response 68 | Content-Length: 186 69 | 70 | HTTP/1.1 200 OK 71 | Server: nginx 72 | Date: Mon, 30 Jan 2017 18:39:49 GMT 73 | Content-Type: text/html 74 | Connection: close 75 | Vary: Accept-Encoding 76 | 77 | Memento for 2/2/2013 10:00am 78 | 79 | 80 | 81 | WARC/1.0 82 | WARC-Type: response 83 | WARC-Target-URI: http://memento.us/ 84 | WARC-Date: 2016-12-31T11:00:00Z 85 | WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 86 | WARC-Record-ID: 87 | Content-Type: application/http; msgtype=response 88 | Content-Length: 187 89 | 90 | HTTP/1.1 200 OK 91 | Server: nginx 92 | Date: Mon, 30 Jan 2017 18:39:49 GMT 93 | Content-Type: text/html 94 | Connection: close 95 | Vary: Accept-Encoding 96 | 97 | Memento for 12/31/2016 11:00am 98 | 99 | 100 | 101 | WARC/1.0 102 | WARC-Type: response 103 | WARC-Target-URI: http://memento.us/ 104 | WARC-Date: 2016-12-31T11:00:01Z 105 | WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 106 | WARC-Record-ID: 107 | Content-Type: application/http; msgtype=response 108 | Content-Length: 187 109 | 110 | HTTP/1.1 200 OK 111 | Server: nginx 112 | Date: Mon, 30 Jan 2017 18:39:49 GMT 113 | Content-Type: text/html 114 | Connection: close 115 | Vary: Accept-Encoding 116 | 117 | Memento for 12/31/2016 11:01am 118 | 119 | 120 | 121 | 122 | WARC/1.0 123 | WARC-Type: response 124 | WARC-Target-URI: http://someotherURI.us/ 125 | WARC-Date: 2016-12-31T11:00:00Z 126 | WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 127 | WARC-Record-ID: 128 | Content-Type: application/http; msgtype=response 129 | Content-Length: 170 130 | 131 | HTTP/1.1 200 OK 132 | Server: nginx 133 | Date: Mon, 30 Jan 2017 18:39:49 GMT 134 | Content-Type: text/html 135 | Connection: close 136 | Vary: Accept-Encoding 137 | 138 | SomeotherURI 139 | 140 | 141 | 142 | 143 | 144 | WARC/1.0 145 | WARC-Type: response 146 | WARC-Target-URI: http://anothersite.us/ 147 | WARC-Date: 2016-12-31T11:00:00Z 148 | WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 149 | WARC-Record-ID: 150 | Content-Type: application/http; msgtype=response 151 | Content-Length: 170 152 | 153 | HTTP/1.1 200 OK 154 | Server: nginx 155 | Date: Mon, 30 Jan 2017 18:39:49 GMT 156 | Content-Type: text/html 157 | Connection: close 158 | Vary: Accept-Encoding 159 | 160 | Another site 161 | 162 | 163 | 164 | -------------------------------------------------------------------------------- /samples/warcs/5mementosAndFroggie.warc: -------------------------------------------------------------------------------- 1 | WARC/1.0 2 | WARC-Type: warcinfo 3 | WARC-Date: 2017-02-18T10:00:00Z 4 | WARC-Filename: ipwb-memento.warc 5 | WARC-Record-ID: 6 | Content-Type: application/warc-fields 7 | Content-Length: 238 8 | 9 | software: Fabricated 10 | ip: 127.0.0.1 11 | hostname: localhost 12 | format: WARC File Format 1.0 13 | conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf 14 | description: SampleCrawl 15 | robots: ignore 16 | http-header-user-agent: WARCFab/1.0 17 | 18 | 19 | 20 | WARC/1.0 21 | WARC-Type: response 22 | WARC-Target-URI: http://memento.us/ 23 | WARC-Date: 2014-01-14T10:00:00Z 24 | WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 25 | WARC-Record-ID: 26 | Content-Type: application/http; msgtype=response 27 | Content-Length: 186 28 | 29 | HTTP/1.1 200 OK 30 | Server: nginx 31 | Date: Mon, 30 Jan 2017 18:39:49 GMT 32 | Content-Type: text/html 33 | Connection: close 34 | Vary: Accept-Encoding 35 | 36 | Memento for 1/14/2014 10:00am 37 | 38 | 39 | 40 | 41 | WARC/1.0 42 | 
WARC-Type: response 43 | WARC-Target-URI: http://memento.us/ 44 | WARC-Date: 2014-01-15T10:15:00Z 45 | WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 46 | WARC-Record-ID: 47 | Content-Type: application/http; msgtype=response 48 | Content-Length: 186 49 | 50 | HTTP/1.1 200 OK 51 | Server: nginx 52 | Date: Mon, 30 Jan 2017 18:39:49 GMT 53 | Content-Type: text/html 54 | Connection: close 55 | Vary: Accept-Encoding 56 | 57 | Memento for 1/15/2014 10:15am 58 | 59 | 60 | 61 | WARC/1.0 62 | WARC-Type: response 63 | WARC-Target-URI: http://memento.us/ 64 | WARC-Date: 2013-02-02T10:00:00Z 65 | WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 66 | WARC-Record-ID: 67 | Content-Type: application/http; msgtype=response 68 | Content-Length: 186 69 | 70 | HTTP/1.1 200 OK 71 | Server: nginx 72 | Date: Mon, 30 Jan 2017 18:39:49 GMT 73 | Content-Type: text/html 74 | Connection: close 75 | Vary: Accept-Encoding 76 | 77 | Memento for 2/2/2013 10:00am 78 | 79 | 80 | 81 | WARC/1.0 82 | WARC-Type: response 83 | WARC-Target-URI: http://memento.us/ 84 | WARC-Date: 2016-12-31T11:00:00Z 85 | WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 86 | WARC-Record-ID: 87 | Content-Type: application/http; msgtype=response 88 | Content-Length: 187 89 | 90 | HTTP/1.1 200 OK 91 | Server: nginx 92 | Date: Mon, 30 Jan 2017 18:39:49 GMT 93 | Content-Type: text/html 94 | Connection: close 95 | Vary: Accept-Encoding 96 | 97 | Memento for 12/31/2016 11:00am 98 | 99 | 100 | 101 | WARC/1.0 102 | WARC-Type: response 103 | WARC-Target-URI: http://memento.us/ 104 | WARC-Date: 2016-12-31T11:00:01Z 105 | WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 106 | WARC-Record-ID: 107 | Content-Type: application/http; msgtype=response 108 | Content-Length: 187 109 | 110 | HTTP/1.1 200 OK 111 | Server: nginx 112 | Date: Mon, 30 Jan 2017 18:39:49 GMT 113 | Content-Type: text/html 114 | Connection: close 115 | Vary: Accept-Encoding 116 | 117 | Memento for 12/31/2016 11:01am 118 | 119 | 120 | 121 | 122 | WARC/1.0 123 | WARC-Type: response 124 | WARC-Target-URI: http://someotherURI.us/ 125 | WARC-Date: 2016-12-31T11:00:00Z 126 | WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 127 | WARC-Record-ID: 128 | Content-Type: application/http; msgtype=response 129 | Content-Length: 170 130 | 131 | HTTP/1.1 200 OK 132 | Server: nginx 133 | Date: Mon, 30 Jan 2017 18:39:49 GMT 134 | Content-Type: text/html 135 | Connection: close 136 | Vary: Accept-Encoding 137 | 138 | SomeotherURI 139 | 140 | 141 | 142 | 143 | 144 | WARC/1.0 145 | WARC-Type: response 146 | WARC-Target-URI: http://anothersite.us/ 147 | WARC-Date: 2016-12-31T11:00:00Z 148 | WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 149 | WARC-Record-ID: 150 | Content-Type: application/http; msgtype=response 151 | Content-Length: 170 152 | 153 | HTTP/1.1 200 OK 154 | Server: nginx 155 | Date: Mon, 30 Jan 2017 18:39:49 GMT 156 | Content-Type: text/html 157 | Connection: close 158 | Vary: Accept-Encoding 159 | 160 | Another site 161 | 162 | 163 | 164 | 165 | 166 | WARC/1.0 167 | WARC-Type: response 168 | WARC-Record-ID: 169 | WARC-Warcinfo-ID: 170 | WARC-Target-URI: http://whensAPNGNotAPing.net 171 | WARC-Date: 2017-03-01T19:26:39Z 172 | WARC-Block-Digest: sha1:SZPTTOGV3LYYR6H7OMA7QC6YKZACNQSY 173 | Content-Type: image/png 174 | Content-Length: 154 175 | 176 | HTTP/1.1 200 OK 177 | Server: nginx 178 | Date: Mon, 30 Jan 2017 18:39:49 GMT 179 | Content-Type: image/png 180 | Connection: close 181 | Vary: Accept-Encoding 182 | 183 | Ceci n'est pas une 
PNG. 184 | -------------------------------------------------------------------------------- /samples/warcs/HTTP204.warc: -------------------------------------------------------------------------------- 1 | WARC/1.0 2 | WARC-Type: warcinfo 3 | WARC-Date: 2017-02-18T10:00:00Z 4 | WARC-Filename: HTTP204.warc 5 | WARC-Record-ID: 6 | Content-Type: application/warc-fields 7 | Content-Length: 238 8 | 9 | software: Fabricated 10 | ip: 127.0.0.1 11 | hostname: localhost 12 | format: WARC File Format 1.0 13 | conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf 14 | description: SampleCrawl 15 | robots: ignore 16 | http-header-user-agent: WARCFab/1.0 17 | 18 | 19 | 20 | WARC/1.0 21 | WARC-Type: response 22 | WARC-Record-ID: 23 | WARC-Date: 2018-07-10T20:40:46Z 24 | WARC-Target-URI: https://example.com 25 | WARC-IP-Address: 192.30.255.116 26 | Content-Type: application/http;msgtype=response 27 | Content-Length: 140 28 | 29 | HTTP/1.1 204 No Content 30 | Date: Tue, 10 Jul 2018 20:40:46 GMT 31 | Content-Type: application/octet-stream 32 | Server: GitHub.com 33 | Status: 204 No Content 34 | 35 | 36 | 37 | WARC/1.0 38 | WARC-Type: response 39 | WARC-Target-URI: http://memento.us/ 40 | WARC-Date: 2013-02-02T10:00:00Z 41 | WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 42 | WARC-Record-ID: 43 | Content-Type: application/http; msgtype=response 44 | Content-Length: 194 45 | 46 | HTTP/1.1 200 OK 47 | Server: nginx 48 | Date: Mon, 30 Jan 2017 18:39:49 GMT 49 | Content-Type: text/html 50 | Connection: close 51 | Vary: Accept-Encoding 52 | 53 | Memento for 2/2/2013 10:00am 54 | 55 | -------------------------------------------------------------------------------- /samples/warcs/HTTP404.warc: -------------------------------------------------------------------------------- 1 | WARC/1.0 2 | WARC-Type: warcinfo 3 | WARC-Date: 2017-02-18T10:00:00Z 4 | WARC-Filename: HTTP204.warc 5 | WARC-Record-ID: 6 | Content-Type: application/warc-fields 7 | Content-Length: 248 8 | 9 | software: Fabricated 10 | ip: 127.0.0.1 11 | hostname: localhost 12 | format: WARC File Format 1.0 13 | conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf 14 | description: SampleCrawl 15 | robots: ignore 16 | http-header-user-agent: WARCFab/1.0 17 | 18 | 19 | 20 | WARC/1.0 21 | WARC-Type: response 22 | WARC-Target-URI: http://memento.us/ 23 | WARC-Date: 2020-02-02T10:00:00Z 24 | WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 25 | WARC-Record-ID: 26 | Content-Type: application/http; msgtype=response 27 | Content-Length: 215 28 | 29 | HTTP/1.1 404 NOT FOUND 30 | Server: nginx 31 | Date: Mon, 20 Jan 2020 16:29:49 GMT 32 | Content-Type: text/html 33 | Connection: close 34 | 35 | The page was not found on this server. This is an archived 404. 
36 | 37 | -------------------------------------------------------------------------------- /samples/warcs/IAH-20080430204825-00000-blackbook.warc.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oduwsdl/ipwb/91e202b31d96af0171690b2df61d47625867651f/samples/warcs/IAH-20080430204825-00000-blackbook.warc.gz -------------------------------------------------------------------------------- /samples/warcs/baconIpsum.warc.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oduwsdl/ipwb/91e202b31d96af0171690b2df61d47625867651f/samples/warcs/baconIpsum.warc.gz -------------------------------------------------------------------------------- /samples/warcs/broken.warc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oduwsdl/ipwb/91e202b31d96af0171690b2df61d47625867651f/samples/warcs/broken.warc -------------------------------------------------------------------------------- /samples/warcs/frogTest.warc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oduwsdl/ipwb/91e202b31d96af0171690b2df61d47625867651f/samples/warcs/frogTest.warc -------------------------------------------------------------------------------- /samples/warcs/froggie.warc.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oduwsdl/ipwb/91e202b31d96af0171690b2df61d47625867651f/samples/warcs/froggie.warc.gz -------------------------------------------------------------------------------- /samples/warcs/redirect.warc: -------------------------------------------------------------------------------- 1 | WARC/1.0 2 | WARC-Type: warcinfo 3 | WARC-Date: 2017-02-18T10:00:00Z 4 | WARC-Filename: 301body.warc 5 | WARC-Record-ID: 6 | Content-Type: application/warc-fields 7 | Content-Length: 238 8 | 9 | software: Fabricated 10 | ip: 127.0.0.1 11 | hostname: localhost 12 | format: WARC File Format 1.0 13 | conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf 14 | description: SampleCrawl 15 | robots: ignore 16 | http-header-user-agent: WARCFab/1.0 17 | 18 | 19 | 20 | WARC/1.0 21 | WARC-Target-URI: https://example.com 22 | WARC-Date: 2018-07-27T20:31:27Z 23 | WARC-Record-ID: 24 | WARC-Type: response 25 | Content-Type: application/http; msgtype=response 26 | Content-Length: 265 27 | 28 | HTTP/1.1 302 Found 29 | Server: example.com 30 | Date: Fri, 27 Jul 2018 20:31:27 GMT 31 | Content-Type: text/html; charset=utf-8 32 | Location: https://example.com/anotherURI 33 | 34 | You are being redirected. 
35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /samples/warcs/redirectRelative.warc: -------------------------------------------------------------------------------- 1 | WARC/1.0 2 | WARC-Type: warcinfo 3 | WARC-Date: 2017-02-18T10:00:00Z 4 | WARC-Filename: 301body.warc 5 | WARC-Record-ID: 6 | Content-Type: application/warc-fields 7 | Content-Length: 238 8 | 9 | software: Fabricated 10 | ip: 127.0.0.1 11 | hostname: localhost 12 | format: WARC File Format 1.0 13 | conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf 14 | description: SampleCrawl 15 | robots: ignore 16 | http-header-user-agent: WARCFab/1.0 17 | 18 | 19 | 20 | WARC/1.0 21 | WARC-Target-URI: https://example.com 22 | WARC-Date: 2018-07-27T20:31:27Z 23 | WARC-Record-ID: 24 | WARC-Type: response 25 | Content-Type: application/http; msgtype=response 26 | Content-Length: 233 27 | 28 | HTTP/1.1 302 Found 29 | Server: example.com 30 | Date: Fri, 27 Jul 2018 20:31:27 GMT 31 | Content-Type: text/html; charset=utf-8 32 | Location: /anotherURI 33 | 34 | You are being redirected. 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /samples/warcs/salam-home.warc: -------------------------------------------------------------------------------- 1 | WARC/1.0 2 | WARC-Type: warcinfo 3 | Content-Type: application/warc-fields 4 | WARC-Date: 2016-03-05T19:22:47Z 5 | WARC-Record-ID: 6 | WARC-Filename: salam-home-2.warc.gz 7 | WARC-Block-Digest: sha1:M3NMC5WGCYZK4VPJRWJDMOTCXOEUBTYJ 8 | Content-Length: 240 9 | 10 | software: Wget/1.16.1 (linux-gnu) 11 | format: WARC File Format 1.0 12 | conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf 13 | robots: classic 14 | wget-arguments: "--warc-file=salam-home-2" "http://www.cs.odu.edu/~salam/" 15 | 16 | 17 | 18 | WARC/1.0 19 | WARC-Type: request 20 | WARC-Target-URI: http://www.cs.odu.edu/~salam/ 21 | Content-Type: application/http;msgtype=request 22 | WARC-Date: 2016-03-05T19:22:47Z 23 | WARC-Record-ID: 24 | WARC-IP-Address: 128.82.4.2 25 | WARC-Warcinfo-ID: 26 | WARC-Block-Digest: sha1:H24M7VNPPW7FX2HGT32WJOD4P765KD33 27 | Content-Length: 148 28 | 29 | GET /~salam/ HTTP/1.1 30 | User-Agent: Wget/1.16.1 (linux-gnu) 31 | Accept: */* 32 | Accept-Encoding: identity 33 | Host: www.cs.odu.edu 34 | Connection: Keep-Alive 35 | 36 | 37 | 38 | WARC/1.0 39 | WARC-Type: response 40 | WARC-Record-ID: 41 | WARC-Warcinfo-ID: 42 | WARC-Concurrent-To: 43 | WARC-Target-URI: http://www.cs.odu.edu/~salam/ 44 | WARC-Date: 2016-03-05T19:22:47Z 45 | WARC-IP-Address: 128.82.4.2 46 | WARC-Block-Digest: sha1:KJGCBURAG3CG5CJVPM5FPDVISRXETSOU 47 | WARC-Payload-Digest: sha1:EROTE27QSP4O5OYCL4GCOOAYQBXLMPJV 48 | Content-Type: application/http;msgtype=response 49 | Content-Length: 1789 50 | 51 | HTTP/1.1 200 OK 52 | Server: nginx 53 | Date: Sat, 05 Mar 2016 19:22:47 GMT 54 | Content-Type: text/html 55 | Transfer-Encoding: chunked 56 | Connection: keep-alive 57 | Vary: Accept-Encoding 58 | 59 | 646 60 | 61 | 62 | HomePage | Sawood Alam 63 | 64 | 109 | 110 | 111 |

Sawood Alam 112 | 113 | My CV (PDF) 114 | I am a Web Application programmer and Semantic Web researcher. Commonly tagged by Web, Digital Library, Web Archiving, Ruby on Rails, PHP, XHTML, CSS, JavaScript, ExtJS, Urdu, RTL and Linux.
      115 | 116 | 117 | 118 | 0 119 | 120 | 121 | 122 | WARC/1.0 123 | WARC-Type: metadata 124 | WARC-Record-ID: 125 | WARC-Warcinfo-ID: 126 | WARC-Target-URI: metadata://gnu.org/software/wget/warc/MANIFEST.txt 127 | WARC-Date: 2016-03-05T19:22:47Z 128 | WARC-Block-Digest: sha1:G6DHE4Q672Q27TUHKVMWXRAVVMA2DHF6 129 | Content-Type: text/plain 130 | Content-Length: 48 131 | 132 | 133 | 134 | 135 | WARC/1.0 136 | WARC-Type: resource 137 | WARC-Record-ID: 138 | WARC-Warcinfo-ID: 139 | WARC-Concurrent-To: 140 | WARC-Target-URI: metadata://gnu.org/software/wget/warc/wget_arguments.txt 141 | WARC-Date: 2016-03-05T19:22:47Z 142 | WARC-Block-Digest: sha1:FBDU3PKIEVTCPIMYQDFENX7H4C4I7BDE 143 | Content-Type: text/plain 144 | Content-Length: 60 145 | 146 | "--warc-file=salam-home-2" "http://www.cs.odu.edu/~salam/" 147 | 148 | 149 | WARC/1.0 150 | WARC-Type: resource 151 | WARC-Record-ID: 152 | WARC-Warcinfo-ID: 153 | WARC-Concurrent-To: 154 | WARC-Target-URI: metadata://gnu.org/software/wget/warc/wget.log 155 | WARC-Date: 2016-03-05T19:22:47Z 156 | WARC-Block-Digest: sha1:NXMJ6U6EE6D272EAYKNQNQPT5RB77ZKS 157 | Content-Type: text/plain 158 | Content-Length: 483 159 | 160 | Opening WARC file ‘salam-home-2.warc.gz’. 161 | 162 | --2016-03-05 14:22:47-- http://www.cs.odu.edu/~salam/ 163 | Resolving www.cs.odu.edu (www.cs.odu.edu)... 128.82.4.2 164 | Connecting to www.cs.odu.edu (www.cs.odu.edu)|128.82.4.2|:80... connected. 165 | HTTP request sent, awaiting response... 200 OK 166 | Length: unspecified [text/html] 167 | Saving to: ‘index.html.1’ 168 | 169 | 0K . 23.7M=0s 170 | 171 | 2016-03-05 14:22:47 (23.7 MB/s) - ‘index.html.1’ saved [1606] 172 | 173 | 174 | 175 | -------------------------------------------------------------------------------- /samples/warcs/sample-1.warc.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oduwsdl/ipwb/91e202b31d96af0171690b2df61d47625867651f/samples/warcs/sample-1.warc.gz -------------------------------------------------------------------------------- /samples/warcs/slash.warc: -------------------------------------------------------------------------------- 1 | WARC/1.0 2 | WARC-Type: warcinfo 3 | WARC-Date: 2018-08-15T23:00:00Z 4 | WARC-Filename: slash.warc 5 | WARC-Record-ID: 6 | Content-Type: application/warc-fields 7 | Content-Length: 238 8 | 9 | software: Fabricated 10 | ip: 127.0.0.1 11 | hostname: localhost 12 | format: WARC File Format 1.0 13 | conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf 14 | description: SampleCrawl 15 | robots: ignore 16 | http-header-user-agent: WARCFab/1.0 17 | 18 | 19 | 20 | WARC/1.0 21 | WARC-Type: response 22 | WARC-Target-URI: http://slash.sawood/ 23 | WARC-Date: 2018-08-15T23:00:01Z 24 | WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 25 | WARC-Record-ID: 26 | Content-Type: application/http; msgtype=response 27 | Content-Length: 178 28 | 29 | HTTP/1.1 200 OK 30 | Server: nginx 31 | Date: Mon, 30 Jan 2017 18:39:49 GMT 32 | Content-Type: text/html 33 | Connection: close 34 | Vary: Accept-Encoding 35 | 36 | Home 37 | -------------------------------------------------------------------------------- /samples/warcs/variableSizedDates.warc: -------------------------------------------------------------------------------- 1 | WARC/1.1 2 | WARC-Type: warcinfo 3 | WARC-Date: 2017-02-18T10:00:00Z 4 | WARC-Filename: variableSizedDates.warc 5 | WARC-Record-ID: 6 | Content-Type: application/warc-fields 7 | Content-Length: 253 8 | 9 | software: 
Fabricated 10 | ip: 127.0.0.1 11 | hostname: localhost 12 | format: WARC file version 1.1 13 | conformsTo: http://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/ 14 | description: SampleCrawl 15 | robots: ignore 16 | http-header-user-agent: WARCFab/1.0 17 | 18 | 19 | 20 | WARC/1.1 21 | WARC-Type: response 22 | WARC-Target-URI: http://memento.us/ 23 | WARC-Date: 2014-01 24 | WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 25 | WARC-Record-ID: 26 | Content-Type: application/http; msgtype=response 27 | Content-Length: 190 28 | 29 | HTTP/1.1 200 OK 30 | Server: nginx 31 | Date: Mon, 30 Jan 2017 18:39:49 GMT 32 | Content-Type: text/html 33 | Connection: close 34 | Vary: Accept-Encoding 35 | 36 | Memento captured in January 2014 37 | 38 | 39 | 40 | 41 | WARC/1.1 42 | WARC-Type: response 43 | WARC-Target-URI: http://memento.us/ 44 | WARC-Date: 2014-02-10T00:00:01.000000002Z 45 | WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 46 | WARC-Record-ID: 47 | Content-Type: application/http; msgtype=response 48 | Content-Length: 228 49 | 50 | HTTP/1.1 200 OK 51 | Server: nginx 52 | Date: Mon, 30 Jan 2017 18:39:49 GMT 53 | Content-Type: text/html 54 | Connection: close 55 | Vary: Accept-Encoding 56 | 57 | Memento captured February 10, 2014 at 00:01 with very high precision. 58 | 59 | 60 | 61 | WARC/1.1 62 | WARC-Type: response 63 | WARC-Target-URI: http://memento.us/ 64 | WARC-Date: 2014-02-10T00:00:01.01Z 65 | WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 66 | WARC-Record-ID: 67 | Content-Type: application/http; msgtype=response 68 | Content-Length: 223 69 | 70 | HTTP/1.1 200 OK 71 | Server: nginx 72 | Date: Mon, 30 Jan 2017 18:39:49 GMT 73 | Content-Type: text/html 74 | Connection: close 75 | Vary: Accept-Encoding 76 | 77 | Memento captured February 10, 2014 at 00:01 with high precision. 78 | 79 | 80 | 81 | WARC/1.1 82 | WARC-Type: response 83 | WARC-Target-URI: http://memento.us/ 84 | WARC-Date: 2014-02-10T00:00:01Z 85 | WARC-Payload-Digest: sha1:3KRQHQ65T23N52AOS5QLFTIMWZIOO7G5 86 | WARC-Record-ID: 87 | Content-Type: application/http; msgtype=response 88 | Content-Length: 231 89 | 90 | HTTP/1.1 200 OK 91 | Server: nginx 92 | Date: Mon, 30 Jan 2017 18:39:49 GMT 93 | Content-Type: text/html 94 | Connection: close 95 | Vary: Accept-Encoding 96 | 97 | Memento captured February 10, 2014 at 00:01 with conventional precision. 
98 | 99 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | [tool:pytest] 3 | python_files = 4 | test_*.py 5 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from setuptools import setup 4 | from ipwb import __version__ 5 | 6 | with open('README.md') as f: 7 | long_description = f.read() 8 | desc = """InterPlanetary Wayback (ipwb): Web Archive integration with IPFS""" 9 | 10 | setup( 11 | name='ipwb', 12 | version=__version__, 13 | url='https://github.com/oduwsdl/ipwb', 14 | download_url="https://github.com/oduwsdl/ipwb", 15 | author='Mat Kelly', 16 | author_email='me@matkelly.com', 17 | description=desc, 18 | packages=['ipwb'], 19 | python_requires='>=3.9', 20 | license='MIT', 21 | long_description=long_description, 22 | long_description_content_type="text/markdown", 23 | provides=[ 24 | 'ipwb' 25 | ], 26 | install_requires=[ 27 | 'warcio>=1.5.3', 28 | 'ipfshttpclient>=0.8.0a', 29 | 'Flask>=3.0', 30 | 'pycryptodome>=3.4.11', 31 | 'requests>=2.19.1', 32 | 'beautifulsoup4>=4.6.3', 33 | 'surt>=0.3.0' 34 | ], 35 | tests_require=[ 36 | 'flake8>=3.4', 37 | 'pytest>=3.6', 38 | 'pytest-cov', 39 | 'pytest-flake8' 40 | ], 41 | entry_points=""" 42 | [console_scripts] 43 | ipwb = ipwb.__main__:main 44 | """, 45 | package_data={ 46 | 'ipwb': [ 47 | 'assets/*.*', 48 | 'assets/favicons/*.*', 49 | 'templates/*.*' 50 | ] 51 | }, 52 | zip_safe=False, 53 | keywords='http web archives ipfs distributed odu wayback memento', 54 | classifiers=[ 55 | 'Development Status :: 4 - Beta', 56 | 57 | 'Environment :: Web Environment', 58 | 59 | 'Programming Language :: Python :: 3.9', 60 | 'Programming Language :: Python :: 3.10', 61 | 'Programming Language :: Python :: 3.11', 62 | 'Programming Language :: Python :: 3.12', 63 | 'Programming Language :: Python :: 3.13', 64 | 65 | 'License :: OSI Approved :: MIT License', 66 | 67 | 'Intended Audience :: Developers', 68 | 'Intended Audience :: Information Technology', 69 | 'Intended Audience :: Science/Research', 70 | 71 | 'Topic :: Internet :: WWW/HTTP', 72 | 'Topic :: System :: Archiving', 73 | 'Topic :: System :: Archiving :: Backup', 74 | 'Topic :: System :: Archiving :: Mirroring', 75 | 'Topic :: Utilities', 76 | ] 77 | ) 78 | 79 | # Publish to pypi: 80 | # rm -rf dist; python setup.py sdist bdist_wheel; twine upload dist/* 81 | -------------------------------------------------------------------------------- /test-requirements.txt: -------------------------------------------------------------------------------- 1 | flake8>=3.7.9 2 | pytest>=5.3.5 3 | pytest-cov 4 | pytest-flake8 5 | setuptools 6 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oduwsdl/ipwb/91e202b31d96af0171690b2df61d47625867651f/tests/__init__.py -------------------------------------------------------------------------------- /tests/testUtil.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import string 4 | import re 5 | import tempfile 6 | 7 | from time import sleep 8 | 9 | from ipwb import replay 10 | from ipwb import indexer 11 | from ipwb import __file__ as 
moduleLocation 12 | 13 | from multiprocessing import Process 14 | from pathlib import Path 15 | 16 | p = Process() 17 | 18 | 19 | def createUniqueWARC(): 20 | lines = [] 21 | warc_in_filename = 'frogTest.warc' 22 | warc_in_path = os.path.join( 23 | Path(os.path.dirname(__file__)).parent, 24 | 'samples', 'warcs', warc_in_filename) 25 | 26 | string_to_change = b'abcdefghijklmnopqrstuvwxz' 27 | random_string = get_random_string(len(string_to_change)) 28 | random_bytes = str.encode(random_string) 29 | 30 | with open(warc_in_path, 'rb') as warcFile: 31 | newContent = warcFile.read().replace(string_to_change, random_bytes) 32 | 33 | warc_out_filename = warc_in_filename.replace('.warc', 34 | f'_{random_string}.warc') 35 | warc_out_path = os.path.join( 36 | Path(os.path.dirname(__file__)).parent, 37 | 'samples', 'warcs', warc_out_filename) 38 | 39 | print(warc_out_path) 40 | with open(warc_out_path, 'wb') as warcFile: 41 | warcFile.write(newContent) 42 | 43 | return warc_out_path 44 | 45 | 46 | def get_random_string(n): 47 | return ''.join(random.SystemRandom().choice( 48 | string.ascii_lowercase + string.digits) for _ in range(n)) 49 | 50 | 51 | def count_cdxj_entries(cdxj_data): 52 | urim_count = 0 53 | lines = cdxj_data.strip().split('\n') 54 | for line in lines: 55 | if line[0] != '!': # Exclude metadata from count 56 | urim_count += 1 57 | return urim_count 58 | 59 | 60 | def start_replay(warc_filename): 61 | global p 62 | path_of_warc = os.path.join( 63 | Path(os.path.dirname(__file__)).parent, 64 | 'samples', 'warcs', warc_filename) 65 | 66 | fh, tempfile_path = tempfile.mkstemp(suffix='.cdxj') 67 | os.close(fh) 68 | 69 | p = Process(target=replay.start, args=[tempfile_path]) 70 | p.start() 71 | sleep(5) 72 | 73 | cdxj_list = indexer.index_file_at(path_of_warc, quiet=True) 74 | cdxj = '\n'.join(cdxj_list) 75 | 76 | with open(tempfile_path, 'w') as f: 77 | f.write(cdxj) 78 | 79 | 80 | def stop_replay(): 81 | global p 82 | p.terminate() 83 | 84 | 85 | def extract_relation_entries_from_link_timemap(tm): 86 | matches = re.findall('rel=".*?"', tm) 87 | matches = map(lambda s: s[5:-1], matches) 88 | return matches 89 | -------------------------------------------------------------------------------- /tests/test_backends.py: -------------------------------------------------------------------------------- 1 | from unittest import mock 2 | 3 | import pytest 4 | from ipfshttpclient.exceptions import StatusError 5 | 6 | from ipwb.backends import get_web_archive_index, BackendError 7 | from pathlib import Path 8 | 9 | 10 | SAMPLE_INDEX = str( 11 | Path(__file__).parent.parent / 'samples/indexes/salam-home.cdxj' 12 | ) 13 | 14 | 15 | def test_local(): 16 | assert get_web_archive_index(SAMPLE_INDEX).startswith( 17 | '!context ["https://tools.ietf.org/html/rfc7089"]' 18 | ) 19 | 20 | 21 | def test_https(): 22 | assert get_web_archive_index( 23 | 'https://raw.githubusercontent.com/oduwsdl/ipwb/master/samples/' + 24 | 'indexes/salam-home.cdxj' 25 | ).startswith('!context ["https://tools.ietf.org/html/rfc7089"]') 26 | 27 | 28 | def test_ipfs_success(): 29 | with open(SAMPLE_INDEX, 'r') as f: 30 | expected_content = f.read() 31 | 32 | connect_to_ipfs = mock.MagicMock() 33 | connect_to_ipfs.return_value.cat.return_value = expected_content 34 | 35 | with mock.patch('ipfshttpclient.connect', connect_to_ipfs): 36 | assert get_web_archive_index( 37 | 'QmReQCtRpmEhdWZVLhoE3e8bqreD8G3avGpVfcLD7r4K6W' 38 | ).startswith('!context ["https://tools.ietf.org/html/rfc7089"]') 39 | 40 | 41 | def test_ipfs_failure(): 42 | 
with pytest.raises(BackendError) as err_info: 43 | with mock.patch( 44 | 'ipfshttpclient.client.Client.cat', 45 | side_effect=StatusError(original='') 46 | ): 47 | get_web_archive_index( 48 | 'QmReQCtRpmEhdWZVLhoE3e8bqreD8G3avGpVfcLD7r4K6W', 49 | ) 50 | 51 | assert str(err_info.value) == ( 52 | 'Cannot load index file from ipfs.' 53 | ) 54 | 55 | 56 | def test_ipfs_url_success(): 57 | with open(SAMPLE_INDEX, 'r') as f: 58 | expected_content = f.read() 59 | 60 | connect_to_ipfs = mock.MagicMock() 61 | connect_to_ipfs.return_value.cat.return_value = expected_content 62 | 63 | with mock.patch('ipfshttpclient.connect', connect_to_ipfs): 64 | assert get_web_archive_index( 65 | 'ipfs://QmReQCtRpmEhdWZVLhoE3e8bqreD8G3avGpVfcLD7r4K6W' 66 | ).startswith('!context ["https://tools.ietf.org/html/rfc7089"]') 67 | -------------------------------------------------------------------------------- /tests/test_compile_target_uri.py: -------------------------------------------------------------------------------- 1 | from ipwb.replay import compile_target_uri 2 | 3 | 4 | def test_empty_query_string(): 5 | assert compile_target_uri('https://example.com', b'') == ( 6 | 'https://example.com' 7 | ) 8 | 9 | 10 | def test_unempty_query_string(): 11 | assert compile_target_uri('https://example.com', b'foo=bar&boo=baz') == ( 12 | 'https://example.com?foo=bar&boo=baz' 13 | ) 14 | -------------------------------------------------------------------------------- /tests/test_daemon.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | from ipwb import __main__ 5 | from multiaddr import exceptions as multiaddr_exceptions 6 | 7 | 8 | class dotdict(dict): 9 | """dot.notation access to dictionary attributes""" 10 | __getattr__ = dict.get 11 | __setattr__ = dict.__setitem__ 12 | __delattr__ = dict.__delitem__ 13 | 14 | 15 | daemon = None 16 | args = { 17 | "c": False, 18 | "compressFirst": False, 19 | "daemon_address": daemon, 20 | "debug": False, 21 | "e": False, 22 | "outfile": None, 23 | "update_check": False, 24 | "warc_path": ['../samples/warcs/sample-1.warc.gz'] 25 | } 26 | 27 | 28 | def test_daemon_wrong_scheme(): 29 | daemon = "/dnswrong/localhost/tcp/5001/http" 30 | args['daemon_address'] = daemon 31 | with pytest.raises(multiaddr_exceptions.StringParseError): 32 | __main__.check_args_index(dotdict(args)) 33 | __main__.check_args_replay(dotdict(args)) 34 | 35 | 36 | def test_daemon_wrong_ip(): 37 | daemon = "/ip4/256.999.478.444/tcp/5001/http" 38 | args['daemon_address'] = daemon 39 | with pytest.raises(multiaddr_exceptions.StringParseError): 40 | __main__.check_args_index(dotdict(args)) 41 | __main__.check_args_replay(dotdict(args)) 42 | 43 | 44 | def test_daemon_wrong_protocol(): 45 | daemon = "/dns/localhost/tcp/5001/httpwrong" 46 | args['daemon_address'] = daemon 47 | with pytest.raises(multiaddr_exceptions.StringParseError): 48 | __main__.check_args_index(dotdict(args)) 49 | __main__.check_args_replay(dotdict(args)) 50 | 51 | 52 | def test_daemon_wrong_format(): 53 | daemon = "localhost:5001" 54 | args['daemon_address'] = daemon 55 | with pytest.raises(multiaddr_exceptions.StringParseError): 56 | __main__.check_args_index(dotdict(args)) 57 | __main__.check_args_replay(dotdict(args)) 58 | -------------------------------------------------------------------------------- /tests/test_error_handler.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from unittest.mock import MagicMock, patch, ANY 3 | 4 | 
from ipwb.error_handler import exception_logger 5 | 6 | 7 | @exception_logger(catch=False) 8 | def reraised_error(arg): 9 | raise Exception(arg) 10 | 11 | 12 | @exception_logger() 13 | def caught_error(arg): 14 | raise Exception(arg) 15 | 16 | 17 | def test_re_raise(): 18 | with pytest.raises(Exception, match='foo'): 19 | reraised_error('foo') 20 | 21 | 22 | def test_catch(): 23 | mock_logger = MagicMock() 24 | with patch('ipwb.error_handler.logger.critical', mock_logger): 25 | caught_error('boo') 26 | 27 | mock_logger.assert_called_once_with('boo') 28 | -------------------------------------------------------------------------------- /tests/test_indexing.py: -------------------------------------------------------------------------------- 1 | # Number of entries in CDXJ == number of response records in WARC 2 | 3 | import pytest 4 | from . import testUtil as ipwb_test 5 | import os 6 | 7 | from ipwb import indexer 8 | 9 | from pathlib import Path 10 | 11 | 12 | def test_cdxj_warc_responserecord_count(): 13 | new_warc_path = ipwb_test.createUniqueWARC() 14 | # use ipwb indexer to push 15 | cdxjList = indexer.index_file_at(new_warc_path, quiet=True) 16 | cdxj = '\n'.join(cdxjList) 17 | assert ipwb_test.count_cdxj_entries(cdxj) == 2 18 | 19 | 20 | # A response record's content-length causes the payload to truncate 21 | # WARC-Response record for html should still exist in output 22 | def test_warc_ipwb_indexer_broken_warc_record(): 23 | pathOfBrokenWARC = os.path.join( 24 | Path(os.path.dirname(__file__)).parent, 25 | 'samples', 'warcs', 'broken.warc') 26 | cdxjList = indexer.index_file_at(pathOfBrokenWARC, quiet=True) 27 | cdxj = '\n'.join(cdxjList) 28 | assert ipwb_test.count_cdxj_entries(cdxj) == 1 29 | 30 | 31 | # TODO: Have unit tests for each function in indexer.py 32 | -------------------------------------------------------------------------------- /tests/test_ipfs_client.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import MagicMock, patch 2 | 3 | import pytest 4 | 5 | from ipwb.util import check_daemon_is_alive, create_ipfs_client 6 | from ipfshttpclient.exceptions import ConnectionError 7 | 8 | 9 | def test_exception(): 10 | mock_client = MagicMock() 11 | mock_client.side_effect = Exception('boo!') 12 | expected_error = 'Cannot create an IPFS client.' 13 | 14 | with patch('ipfshttpclient.Client', mock_client): 15 | with pytest.raises(Exception, match=expected_error): 16 | create_ipfs_client() 17 | 18 | 19 | def test_is_alive(): 20 | mock_client = MagicMock() 21 | 22 | with patch('ipwb.util.ipfs_client', mock_client): 23 | assert check_daemon_is_alive() is True 24 | 25 | 26 | def test_connection_error(): 27 | mock_client = MagicMock() 28 | mock_client.return_value.id.side_effect = ConnectionError('boo!') 29 | 30 | with patch('ipwb.util.ipfs_client', mock_client): 31 | with pytest.raises(Exception, match='Daemon is not running at'): 32 | check_daemon_is_alive() 33 | 34 | 35 | def test_os_error(): 36 | mock_client = MagicMock() 37 | mock_client.return_value.id.side_effect = OSError('foo!') 38 | 39 | with patch('ipwb.util.ipfs_client', mock_client): 40 | with pytest.raises(Exception, match='IPFS is likely not installed'): 41 | check_daemon_is_alive() 42 | 43 | 44 | def test_unknown_error(): 45 | mock_client = MagicMock() 46 | mock_client.return_value.id.side_effect = Exception('foo!') 47 | expected_error = 'Unknown error in retrieving IPFS daemon status.' 
48 | 49 | with patch('ipwb.util.ipfs_client', mock_client): 50 | with pytest.raises(Exception, match=expected_error): 51 | check_daemon_is_alive() 52 | -------------------------------------------------------------------------------- /tests/test_memento.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from . import testUtil as ipwb_test 4 | from ipwb import replay 5 | from ipwb import indexer 6 | from ipwb import __file__ as moduleLocation 7 | from time import sleep 8 | import os 9 | import subprocess 10 | from urllib.request import urlopen 11 | import requests 12 | import random 13 | import string 14 | import re 15 | import sys 16 | 17 | 18 | def get_urims_from_timemap_in_warc(warcFilename): 19 | ipwb_test.start_replay(warcFilename) 20 | 21 | tm_uri = 'http://localhost:2016/timemap/link/memento.us/' 22 | tm = urlopen(tm_uri).read().decode('utf-8') 23 | 24 | urims = [] 25 | for line in tm.split('\n'): 26 | is_a_memento = len(re.findall('rel=".*memento"', line)) > 0 27 | if is_a_memento: 28 | urims.append(re.findall('<(.*)>', line)[0]) 29 | 30 | ipwb_test.stop_replay() 31 | 32 | return urims 33 | 34 | 35 | def get_rels_from_urims_in_warc(warc): 36 | urims = get_urims_from_timemap_in_warc(warc) 37 | ipwb_test.start_replay(warc) 38 | 39 | # Get Link header values for each memento 40 | link_headers = [] 41 | for urim in urims: 42 | link_headers.append(urlopen(urim).info().get('Link')) 43 | ipwb_test.stop_replay() 44 | 45 | rels_for_urims = [] 46 | for link_header in link_headers: 47 | relForURIM = ipwb_test.extract_relation_entries_from_link_timemap( 48 | link_header) 49 | rels_for_urims.append(relForURIM) 50 | 51 | ipwb_test.stop_replay() 52 | return rels_for_urims 53 | 54 | 55 | @pytest.mark.parametrize("warc,lookup,acceptdatetime,status", [ 56 | ('5mementos.warc', 'timegate/memento.us', 57 | 'Thu, 31 May 2007 20:35:00 GMT', 302), 58 | ('5mementos.warc', 'timegate/memento.us', 59 | 'Thu, 31 May 2007 20:35:00', 400), 60 | ('5mementos.warc', 'timegate/memento.us', 61 | 'Thu, 31 May 2007 20:35 GMT', 400), 62 | ('5mementos.warc', 'timegate/memento.us', 63 | '20181001123636', 400) 64 | ]) 65 | def test_acceptdatetime_status(warc, lookup, acceptdatetime, status): 66 | ipwb_test.start_replay(warc) 67 | 68 | headers = {'Accept-Datetime': acceptdatetime} 69 | 70 | resp = requests.get(f'http://localhost:2016/{lookup}', 71 | allow_redirects=False, headers=headers) 72 | assert resp.status_code == status 73 | 74 | ipwb_test.stop_replay() 75 | 76 | 77 | def test_mementoRelations_one(): 78 | rels_for_urims = get_rels_from_urims_in_warc('1memento.warc') 79 | 80 | rels_for_urims = list(filter(lambda k: 'memento' in k, rels_for_urims[0])) 81 | m1_m1 = rels_for_urims[0].split(' ') 82 | 83 | onlyOneMemento = len(rels_for_urims) == 1 84 | 85 | cond_first_memento = 'first' in m1_m1 86 | cond_last_memento = 'last' in m1_m1 87 | 88 | assert onlyOneMemento and \ 89 | cond_first_memento and \ 90 | cond_last_memento 91 | 92 | 93 | def test_mementoRelations_two(): 94 | rels_for_urims = get_rels_from_urims_in_warc('2mementos.warc') 95 | 96 | cond_first_memento = False 97 | cond_last_next_memento = False 98 | cond_first_prev_memento = False 99 | cond_last_memento = False 100 | 101 | rels_for_urims1of2 = list(filter( 102 | lambda k: 'memento' in k, rels_for_urims[0])) 103 | rels_for_urims2of2 = list(filter( 104 | lambda k: 'memento' in k, rels_for_urims[1])) 105 | 106 | # mX_mY = URI-M requested, Y-th URIM-M in header 107 | m1_m1 = rels_for_urims1of2[0].split(' 
') 108 | m1_m2 = rels_for_urims1of2[1].split(' ') 109 | m2_m1 = rels_for_urims2of2[0].split(' ') 110 | m2_m2 = rels_for_urims2of2[1].split(' ') 111 | 112 | cond_first_memento = 'first' in m1_m1 113 | cond_last_next_memento = 'last' in m1_m2 and 'next' in m1_m2 114 | cond_first_prev_memento = 'first' in m2_m1 and 'prev' in m2_m1 115 | cond_last_memento = 'last' in m2_m2 116 | 117 | assert cond_first_memento and \ 118 | cond_last_next_memento and \ 119 | cond_first_prev_memento and \ 120 | cond_last_memento 121 | 122 | 123 | def test_mementoRelations_three(): 124 | rels_for_urims = get_rels_from_urims_in_warc('3mementos.warc') 125 | 126 | cond_m1m1_first_memento = False 127 | cond_m1m2_next_memento = False 128 | cond_m1m3_last_memento = False 129 | cond_m2m1_first_prev_memento = False 130 | cond_m2m2_memento = False 131 | cond_m2m3_last_next_memento = False 132 | cond_m3m1_first_memento = False 133 | cond_m3m2_prev_memento = False 134 | cond_m3m3_last_memento = False 135 | 136 | rels_for_urims1of3 = list(filter( 137 | lambda k: 'memento' in k, rels_for_urims[0])) 138 | rels_for_urims2of3 = list(filter( 139 | lambda k: 'memento' in k, rels_for_urims[1])) 140 | rels_for_urims3of3 = list(filter( 141 | lambda k: 'memento' in k, rels_for_urims[2])) 142 | 143 | # mX_mY = URI-M requested, Y-th URIM-M in header 144 | m1_m1 = rels_for_urims1of3[0].split(' ') 145 | m1_m2 = rels_for_urims1of3[1].split(' ') 146 | m1_m3 = rels_for_urims1of3[2].split(' ') 147 | m2_m1 = rels_for_urims2of3[0].split(' ') 148 | m2_m2 = rels_for_urims2of3[1].split(' ') 149 | m2_m3 = rels_for_urims2of3[2].split(' ') 150 | m3_m1 = rels_for_urims3of3[0].split(' ') 151 | m3_m2 = rels_for_urims3of3[1].split(' ') 152 | m3_m3 = rels_for_urims3of3[2].split(' ') 153 | 154 | cond_m1m1_first_memento = 'first' in m1_m1 155 | cond_m1m2_next_memento = 'next' in m1_m2 156 | cond_m1m3_last_memento = 'last' in m1_m3 157 | cond_m2m1_first_prev_memento = 'first' in m2_m1 and 'prev' in m2_m1 158 | cond_m2m2_memento = len(m2_m2) == 1 159 | cond_m2m3_last_next_memento = 'last' in m2_m3 and 'next' in m2_m3 160 | cond_m3m1_first_memento = 'first' in m3_m1 161 | cond_m3m2_prev_memento = 'prev' in m3_m2 162 | cond_m3m3_last_memento = 'last' in m3_m3 163 | 164 | assert (cond_m1m1_first_memento and 165 | cond_m1m2_next_memento and 166 | cond_m1m3_last_memento and 167 | cond_m2m1_first_prev_memento and 168 | cond_m2m2_memento and 169 | cond_m2m3_last_next_memento and 170 | cond_m3m1_first_memento and 171 | cond_m3m2_prev_memento and 172 | cond_m3m3_last_memento) 173 | 174 | 175 | def test_mementoRelations_four(): 176 | rels_for_urims = get_rels_from_urims_in_warc('4mementos.warc') 177 | 178 | cond_m1m1_first_memento = False 179 | cond_m1m2_next_memento = False 180 | cond_m1m3 = False 181 | cond_m1m4_last_memento = False 182 | cond_m2m1_first_prev_memento = False 183 | cond_m2m2_memento = False 184 | cond_m2m3_next_memento = False 185 | cond_m2m4_last_memento = False 186 | cond_m3m1_first_memento = False 187 | cond_m3m2_prev_memento = False 188 | cond_m3m3_memento = False 189 | cond_m3m4_last_next_memento = False 190 | cond_m4m1_first_memento = False 191 | cond_m4m2 = False 192 | cond_m4m3_prev_memento = False 193 | cond_m4m4_last_memento = False 194 | 195 | rels_for_urims1of4 = list(filter( 196 | lambda k: 'memento' in k, rels_for_urims[0])) 197 | rels_for_urims2of4 = list(filter( 198 | lambda k: 'memento' in k, rels_for_urims[1])) 199 | rels_for_urims3of4 = list(filter( 200 | lambda k: 'memento' in k, rels_for_urims[2])) 201 | rels_for_urims4of4 = 
list(filter( 202 | lambda k: 'memento' in k, rels_for_urims[3])) 203 | 204 | # mX_mY = URI-M requested, Y-th URIM-M in header 205 | m1_m1 = rels_for_urims1of4[0].split(' ') 206 | m1_m2 = rels_for_urims1of4[1].split(' ') 207 | # m1_m3 = rels_for_urims1of4[2].split(' ') 208 | m1_m4 = rels_for_urims1of4[2].split(' ') 209 | m2_m1 = rels_for_urims2of4[0].split(' ') 210 | m2_m2 = rels_for_urims2of4[1].split(' ') 211 | m2_m3 = rels_for_urims2of4[2].split(' ') 212 | m2_m4 = rels_for_urims2of4[3].split(' ') 213 | m3_m1 = rels_for_urims3of4[0].split(' ') 214 | m3_m2 = rels_for_urims3of4[1].split(' ') 215 | m3_m3 = rels_for_urims3of4[2].split(' ') 216 | m3_m4 = rels_for_urims3of4[3].split(' ') 217 | m4_m1 = rels_for_urims4of4[0].split(' ') 218 | # m4_m2 = rels_for_urims4of4[1].split(' ') 219 | m4_m3 = rels_for_urims4of4[1].split(' ') 220 | m4_m4 = rels_for_urims4of4[2].split(' ') 221 | 222 | cond_m1m1_first_memento = 'first' in m1_m1 223 | cond_m1m2_next_memento = 'next' in m1_m2 224 | # M3 not present 225 | cond_m1m4_last_memento = 'last' in m1_m4 226 | cond_m2m1_first_prev_memento = 'first' in m2_m1 and 'prev' in m2_m1 227 | cond_m2m2_memento = len(m2_m2) == 1 228 | cond_m2m3_next_memento = 'next' in m2_m3 229 | cond_m2m4_last_memento = 'last' in m2_m4 230 | cond_m3m1_first_memento = 'first' in m3_m1 231 | cond_m3m2_prev_memento = 'prev' in m3_m2 232 | cond_m3m3_memento = len(m3_m3) == 1 233 | cond_m3m4_last_next_memento = 'last' in m3_m4 and 'next' in m3_m4 234 | cond_m4m1_first_memento = 'first' in m4_m1 235 | # M2 not present 236 | cond_m4m3_prev_memento = 'prev' in m4_m3 237 | cond_m4m4_last_memento = 'last' in m4_m4 238 | 239 | assert (cond_m1m1_first_memento and 240 | cond_m1m2_next_memento and 241 | # cond_m1m3 and 242 | cond_m1m4_last_memento and 243 | cond_m2m1_first_prev_memento and 244 | cond_m2m2_memento and 245 | cond_m2m3_next_memento and 246 | cond_m2m4_last_memento and 247 | cond_m3m1_first_memento and 248 | cond_m3m2_prev_memento and 249 | cond_m3m3_memento and 250 | cond_m3m4_last_next_memento and 251 | cond_m4m1_first_memento and 252 | # cond_m4m2 and 253 | cond_m4m3_prev_memento and 254 | cond_m4m4_last_memento) 255 | 256 | 257 | def test_mementoRelations_five(): 258 | rels_for_urims = get_rels_from_urims_in_warc('5mementos.warc') 259 | 260 | cond_m1m1_first_memento = False 261 | cond_m1m2_next_memento = False 262 | cond_m1m3 = False 263 | cond_m1m4 = False 264 | cond_m1m5_last_memento = False 265 | cond_m2m1_first_prev_memento = False 266 | cond_m2m2_memento = False 267 | cond_m2m3_next_memento = False 268 | cond_m2m4 = False 269 | cond_m2m5_last_memento = False 270 | cond_m3m1_first_memento = False 271 | cond_m3m2_prev_memento = False 272 | cond_m3m3_memento = False 273 | cond_m3m4_next_memento = False 274 | cond_m3m5_last_memento = False 275 | cond_m4m1_first_memento = False 276 | cond_m4m2 = False 277 | cond_m4m3_prev_memento = False 278 | cond_m4m4_memento = False 279 | cond_m4m5_last_next_memento = False 280 | cond_m5m1_first_memento = False 281 | cond_m5m2 = False 282 | cond_m5m3 = False 283 | cond_m5m4_prev_memento = False 284 | cond_m5m5_last_memento = False 285 | 286 | rels_for_urims1of5 = list(filter( 287 | lambda k: 'memento' in k, rels_for_urims[0])) 288 | rels_for_urims2of5 = list(filter( 289 | lambda k: 'memento' in k, rels_for_urims[1])) 290 | rels_for_urims3of5 = list(filter( 291 | lambda k: 'memento' in k, rels_for_urims[2])) 292 | rels_for_urims4of5 = list(filter( 293 | lambda k: 'memento' in k, rels_for_urims[3])) 294 | rels_for_urims5of5 = list(filter( 295 | 
lambda k: 'memento' in k, rels_for_urims[4])) 296 | 297 | # mX_mY = URI-M requested, Y-th URIM-M in header 298 | m1_m1 = rels_for_urims1of5[0].split(' ') 299 | m1_m2 = rels_for_urims1of5[1].split(' ') 300 | # M3 not present 301 | # M4 not present 302 | m1_m5 = rels_for_urims1of5[2].split(' ')
303 | m2_m1 = rels_for_urims2of5[0].split(' ') 304 | m2_m2 = rels_for_urims2of5[1].split(' ') 305 | m2_m3 = rels_for_urims2of5[2].split(' ') 306 | # M4 not present 307 | m2_m5 = rels_for_urims2of5[3].split(' ')
308 | m3_m1 = rels_for_urims3of5[0].split(' ') 309 | m3_m2 = rels_for_urims3of5[1].split(' ') 310 | m3_m3 = rels_for_urims3of5[2].split(' ') 311 | m3_m4 = rels_for_urims3of5[3].split(' ') 312 | m3_m5 = rels_for_urims3of5[4].split(' ')
313 | m4_m1 = rels_for_urims4of5[0].split(' ') 314 | # M2 not present 315 | m4_m3 = rels_for_urims4of5[1].split(' ') 316 | m4_m4 = rels_for_urims4of5[2].split(' ') 317 | m4_m5 = rels_for_urims4of5[3].split(' ')
318 | m5_m1 = rels_for_urims5of5[0].split(' ') 319 | # M2 not present 320 | # M3 not present 321 | m5_m4 = rels_for_urims5of5[1].split(' ') 322 | m5_m5 = rels_for_urims5of5[2].split(' ') 323 |
324 | cond_m1m1_first_memento = 'first' in m1_m1 325 | cond_m1m2_next_memento = 'next' in m1_m2 326 | # M3 not present 327 | # M4 not present 328 | cond_m1m5_last_memento = 'last' in m1_m5
329 | cond_m2m1_first_prev_memento = 'first' in m2_m1 and 'prev' in m2_m1 330 | cond_m2m2_memento = len(m2_m2) == 1 331 | cond_m2m3_next_memento = 'next' in m2_m3 332 | # M4 not present 333 | cond_m2m5_last_memento = 'last' in m2_m5
334 | cond_m3m1_first_memento = 'first' in m3_m1 335 | cond_m3m2_prev_memento = 'prev' in m3_m2 336 | cond_m3m3_memento = len(m3_m3) == 1 337 | cond_m3m4_next_memento = 'next' in m3_m4 338 | cond_m3m5_last_memento = 'last' in m3_m5
339 | cond_m4m1_first_memento = 'first' in m4_m1 340 | # M2 not present 341 | cond_m4m3_prev_memento = 'prev' in m4_m3 342 | cond_m4m4_memento = len(m4_m4) == 1 343 | cond_m4m5_last_next_memento = 'last' in m4_m5 and 'next' in m4_m5
344 | cond_m5m1_first_memento = 'first' in m5_m1 345 | # M2 not present 346 | # M3 not present 347 | cond_m5m4_prev_memento = 'prev' in m5_m4 348 | cond_m5m5_last_memento = 'last' in m5_m5 349 |
350 | assert (cond_m1m1_first_memento and 351 | cond_m1m2_next_memento and 352 | # cond_m1m3 and 353 | # cond_m1m4 and 354 | cond_m1m5_last_memento and 355 | cond_m2m1_first_prev_memento and 356 | cond_m2m2_memento and 357 | cond_m2m3_next_memento and 358 | # cond_m2m4 and 359 | cond_m2m5_last_memento and
360 | cond_m3m1_first_memento and 361 | cond_m3m2_prev_memento and 362 | cond_m3m3_memento and 363 | cond_m3m4_next_memento and 364 | cond_m3m5_last_memento and 365 | cond_m4m1_first_memento and 366 | # cond_m4m2 and 367 | cond_m4m3_prev_memento and 368 | cond_m4m4_memento and 369 | cond_m4m5_last_next_memento and
370 | cond_m5m1_first_memento and 371 | # cond_m5m2 and 372 | # cond_m5m3 and 373 | cond_m5m4_prev_memento and 374 | cond_m5m5_last_memento) 375 | -------------------------------------------------------------------------------- /tests/test_nodeToNode.py: --------------------------------------------------------------------------------
1 | import pytest 2 | 3 | # Replay uniquely created WARC pushed from another ipwb instance 4 | # to IPFS using provided CDXJ 5 | 6 | 7 | @pytest.mark.skip(reason='not implemented') 8 | def test_replay_replay_another_ipwb_instance_provided_cdxj_file(): 9 | pass 10 | 11 | 12 | @pytest.mark.skip(reason='not implemented') 13 | def
test_replay_warc_another_ipwb_instance_provided_cdxj_record(): 14 | pass 15 | -------------------------------------------------------------------------------- /tests/test_randomized_add.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import sys 4 | import json 5 | 6 | from ipwb import indexer 7 | 8 | from . import testUtil as ipwb_test 9 | 10 | 11 | def is_valid_surt(surt): 12 | return True # The surt library does not yet have a way to check this 13 | 14 | 15 | def is_valid_datetime(dt): 16 | return len(dt) == 14 and dt.isdigit() 17 | 18 | 19 | def is_valid_json(jsonIn): 20 | try: 21 | j = json.loads(json.dumps(jsonIn)) 22 | except ValueError: 23 | return False 24 | return True 25 | 26 | 27 | def check_cdxj_fields(cdxjEntry): 28 | (surt, dt, json) = cdxjEntry.split(' ', 2) 29 | valid_surt = is_valid_surt(surt) 30 | valid_dt = is_valid_datetime(dt) 31 | valid_json = is_valid_json(json) 32 | 33 | return valid_surt and valid_dt and valid_json 34 | 35 | 36 | def check_ipwb_json_field_presence(jsonStr): 37 | keys = json.loads(jsonStr) 38 | return 'locator' in keys and 'mime_type' in keys and 'status_code' in keys 39 | 40 | 41 | def test_push(): 42 | """ 43 | Read WARC, manipulate content to ensure uniqueness, push to IPFS 44 | WARC should result in two CDXJ entries with three space-limited fields 45 | each: surt URI, datetime, JSON 46 | JSON should contain AT LEAST locator, mime_type, and status fields 47 | """ 48 | new_warc_path = ipwb_test.createUniqueWARC() 49 | # use ipwb indexer to push 50 | cdxj_list = indexer.index_file_at(new_warc_path, quiet=True) 51 | cdxj = '\n'.join(cdxj_list) 52 | 53 | first_entry = cdxj.split('\n')[0] 54 | first_non_metadata_entry = '' 55 | for line in cdxj.split('\n'): 56 | if line[0] != '!': 57 | first_non_metadata_entry = line 58 | break 59 | 60 | assert check_cdxj_fields(first_non_metadata_entry) 61 | first_entry_last__field = first_non_metadata_entry.split(' ', 2)[2] 62 | assert check_ipwb_json_field_presence(first_entry_last__field) 63 | -------------------------------------------------------------------------------- /tests/test_replay.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from . 
import testUtil as ipwb_test 4 | from ipwb import replay 5 | 6 | from time import sleep 7 | 8 | import requests 9 | 10 | import urllib 11 | 12 | # Successful retrieval 13 | # Accurate retrieval 14 | # Comprehensive retrieval of sub-resources 15 | 16 | 17 | @pytest.mark.parametrize("warc,lookup,has_md_header", [ 18 | ('HTTP404.warc', 'memento/20200202100000/memento.us/', True), 19 | ('HTTP404.warc', 'memento/20200202100000/memento.ca/', False), 20 | ('HTTP404.warc', 'loremipsum', False)]) 21 | def test_replay_404(warc, lookup, has_md_header): 22 | ipwb_test.start_replay(warc) 23 | 24 | resp = requests.get(f'http://localhost:2016/{lookup}', 25 | allow_redirects=False) 26 | 27 | assert resp.status_code == 404 28 | 29 | if has_md_header: 30 | assert 'Memento-Datetime' in resp.headers 31 | else: 32 | assert 'Memento-Datetime' not in resp.headers 33 | 34 | ipwb_test.stop_replay() 35 | 36 | 37 | @pytest.mark.parametrize("warc,lookup,status,location", [ 38 | ('salam-home.warc', 'memento/*/cs.odu.edu/~salam/', 302, 39 | '/memento/20160305192247/cs.odu.edu/~salam/'), 40 | ('1memento.warc', 'memento/*/memento.us', 302, 41 | '/memento/20130202100000/memento.us/'), 42 | ('2mementos.warc', 'memento/*/memento.us', 200, None), 43 | ('salam-home.warc', 'memento/*/?url=cs.odu.edu/~salam/', 301, 44 | '/memento/*/cs.odu.edu/~salam/'), 45 | ('1memento.warc', 'memento/*/?url=memento.us', 301, 46 | '/memento/*/memento.us'), 47 | ('2mementos.warc', 'memento/*/?url=memento.us', 301, 48 | '/memento/*/memento.us'), 49 | ('2mementos_queryString.warc', 50 | '/memento/20130202100000/memento.us/' + 51 | 'index.php?anotherval=ipsum&someval=lorem', 200, None), 52 | ]) 53 | def test_replay_search(warc, lookup, status, location): 54 | ipwb_test.start_replay(warc) 55 | 56 | resp = requests.get(f'http://localhost:2016/{lookup}', 57 | allow_redirects=False) 58 | assert resp.status_code == status 59 | if location is not None: # Allow for checks w/o redirects 60 | assert resp.headers.get('location') == location 61 | 62 | ipwb_test.stop_replay() 63 | 64 | 65 | def test_replay_dated_memento(): 66 | ipwb_test.start_replay('salam-home.warc') 67 | 68 | url = 'http://localhost:2016/memento/{}/cs.odu.edu/~salam/' 69 | dest = '/memento/20160305192247/cs.odu.edu/~salam/' 70 | 71 | invalid_dts = [ 72 | '18', 73 | '20181', 74 | '201800', 75 | '20180132', 76 | '2018010226', 77 | '20181301000000', 78 | '20180932000000', 79 | '20180230000000', 80 | '20180102263127', 81 | ] 82 | for dt in invalid_dts: 83 | resp = requests.get(url.format(dt), allow_redirects=False) 84 | assert resp.status_code == 400 85 | 86 | typoDts = [ 87 | 'foo', 88 | '201l', 89 | '2018010100000O', 90 | '20181126134257.123', 91 | ] 92 | for dt in typoDts: 93 | resp = requests.get(url.format(dt), allow_redirects=False) 94 | assert resp.status_code == 404 95 | 96 | valid_dts = [ 97 | '2018', 98 | '201811', 99 | '20181126', 100 | '2018112613', 101 | '201811261342', 102 | '20181126134257', 103 | ] 104 | for dt in valid_dts: 105 | resp = requests.get(url.format(dt), allow_redirects=False) 106 | assert resp.status_code == 302 107 | assert resp.headers.get('location') == dest 108 | 109 | resp = requests.get(url.format('20160305192247'), allow_redirects=False) 110 | assert resp.status_code == 200 111 | 112 | ipwb_test.stop_replay() 113 | 114 | 115 | @pytest.mark.parametrize("warc,index,tmformat,urim", [ 116 | ('5mementos.warc', '5mementos.cdxj', 'cdxj', 'memento.us'), 117 | ('5mementos.warc', '5mementos.link', 'link', 'memento.us') 118 | ]) 119 | def test_generate_timemap(warc, 
index, tmformat, urim): 120 | ipwb_test.start_replay(warc) 121 | 122 | resp = requests.get(f'http://localhost:2016/timemap/{tmformat}/{urim}', 123 | allow_redirects=False) 124 | 125 | with open(f'samples/indexes/{index}', 'r') as index: 126 | assert index.read().encode('utf-8') == resp.content 127 | 128 | ipwb_test.stop_replay() 129 | 130 | 131 | @pytest.mark.skip(reason='not implemented') 132 | def test_retrieveWARCRecord_fromIPFSHash(): 133 | pass 134 | 135 | 136 | @pytest.mark.skip(reason='not implemented') 137 | def test_retrieveWARCRecord_fromLocalCDXJFile(): 138 | pass 139 | 140 | 141 | @pytest.mark.skip(reason='not implemented') 142 | def test_retrieveWARCRecord_fromRemoteCDXJFile_ByIPFSHash(): 143 | pass 144 | 145 | 146 | @pytest.mark.skip(reason='not implemented') 147 | def test_retrieveWARCRecord_fromRemoteCDXJFile_ByHTTP(): 148 | pass 149 | 150 | 151 | @pytest.mark.skip(reason='not implemented') 152 | def test_retrieveWARCRecord_fromRemoteCDXJFile_ByHTTPS(): 153 | pass 154 | 155 | 156 | @pytest.mark.skip(reason='not implemented') 157 | def test_retrieveWARCRecord_fromRemoteCDXJFile_ByFTP(): 158 | pass 159 | 160 | 161 | @pytest.mark.skip(reason='not implemented') 162 | def test_retrieveWARCRecord_fromRemoteCDXJFile_ByBitTorrentMagnetLink(): 163 | pass 164 | 165 | 166 | @pytest.mark.skip(reason='not implemented') 167 | def test_retrieveWARCRecord_fromRemoteCDXJFile_BySMB(): 168 | pass 169 | 170 | 171 | @pytest.mark.skip(reason='not implemented') 172 | def test_accuracy_retrievedContent_vsWARC(): 173 | pass 174 | 175 | 176 | @pytest.mark.skip(reason='not implemented') 177 | def test_availability_subResources(): 178 | pass 179 | 180 | 181 | @pytest.mark.skip(reason='not implemented') 182 | def test_inclusionInWebpage_selectResources(): 183 | pass 184 | 185 | 186 | @pytest.mark.skip(reason='not implemented') 187 | def test_exclusionInWebpage_selectIrrelevantResources(): 188 | pass 189 | 190 | 191 | @pytest.mark.skip(reason='not implemented') 192 | def test_fileImport_nonCDXJ(): # Fail w/ friendly message when non-cdxj 193 | pass 194 | 195 | 196 | @pytest.mark.skip(reason='not implemented') 197 | def test_helpWithoutDaemon(): # See #244 198 | pass 199 | 200 | 201 | def test_unit_command_daemon(): 202 | replay.command_daemon('start') 203 | sleep(10) 204 | try: 205 | urllib.request.urlopen('http://localhost:5001') 206 | except urllib.error.HTTPError as e: 207 | assert e.code == 404 208 | except Exception as e: 209 | assert False 210 | 211 | 212 | @pytest.mark.parametrize("expected,input", [ 213 | (True, 'http://example.com'), 214 | (True, 'https://example.com'), 215 | (True, 'HTTP://EXAMPLE.COM'), 216 | (True, 'HTTPS://EXAMPLE.COM'), 217 | (True, 'http://example.com/'), 218 | (True, 'http://example.com/foo.bar'), 219 | (True, 'https://www.example.com/foo?a=b&c=d'), 220 | (False, ''), 221 | (False, 'foo'), 222 | (False, 'foo/bar.baz'), 223 | (False, 'foo?a=b&c=d'), 224 | (False, '/'), 225 | (False, '/foo'), 226 | (False, '/foo/bar.baz'), 227 | (False, '/foo?a=b&c=d'), 228 | (False, './'), 229 | (False, './foo'), 230 | (False, './foo/bar.baz'), 231 | (False, './foo?a=b&c=d'), 232 | (False, '../'), 233 | (False, '../foo'), 234 | (False, '../foo/bar.baz'), 235 | (False, '../foo?a=b&c=d'), 236 | (False, '../../'), 237 | (False, '../../foo'), 238 | (False, '../../foo/bar.baz'), 239 | (False, '../../foo?a=b&c=d'), 240 | (False, 'ftp://example.com'), 241 | (False, 'httpd://example.com'), 242 | (False, 'http//example.com'), 243 | (False, 'http:/example.com'), 244 | (False, 'http:example.com'), 
245 | (False, 'http.example.com'), 246 | (False, 'http-bin.com'), 247 | ]) 248 | def test_is_uri(expected, input): 249 | assert expected == bool(replay.is_uri(input)) 250 | 251 | 252 | # TODO: Have unit tests for each function in replay.py 253 | -------------------------------------------------------------------------------- /tests/test_util.py: --------------------------------------------------------------------------------
1 | import pytest 2 | 3 | from ipwb import util 4 | 5 | 6 | @pytest.mark.parametrize('expected,input', [ 7 | ('', ''), 8 | ('foo', 'foo'), 9 | ('18', '18'), 10 | ('201l', '201l'), 11 | ('2018010100000O', '2018010100000O'), 12 | ('20181', '20181'), 13 | ('20180001000000', '201800'), 14 | ('20180132000000', '20180132'), 15 | ('20180102260000', '2018010226'),
16 | ('20181301000000', '20181301000000'), 17 | ('20180932000000', '20180932000000'), 18 | ('20180230000000', '20180230000000'), 19 | ('20181126134257.123', '20181126134257.123'), 20 | ('20180101000000', '2018'), 21 | ('20181101000000', '201811'), 22 | ('20181126000000', '20181126'), 23 | ('20181126130000', '2018112613'), 24 | ('20181126134200', '201811261342'), 25 | ('20181126134257', '20181126134257'), 26 | ])
27 | def test_pad_digits14(expected, input): 28 | assert expected == util.pad_digits14(input) 29 | 30 | 31 | @pytest.mark.parametrize('input', [ 32 | '', 33 | 'foo', 34 | '18', 35 | '201l', 36 | '2018010100000O', 37 | '20181', 38 | '201800', 39 | '20180132', 40 | '2018010226', 41 | '20181301000000', 42 | '20180932000000', 43 | '20180230000000', 44 | '20180102263127', 45 | '20181126134257.123', 46 | ])
47 | def test_pad_digits14_invalid(input): 48 | with pytest.raises(ValueError): 49 | util.pad_digits14(input, validate=True) 50 | --------------------------------------------------------------------------------