├── .eslintrc.json ├── .github ├── FUNDING.yml ├── stale.yml └── workflows │ ├── Publish.yaml │ ├── PublishDockerDevImage.yaml │ ├── QA.yaml │ ├── TestWebsite.yaml │ └── Tests.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── CHANGELOG.md ├── Dockerfile ├── LICENSE ├── README.md ├── contrib ├── cleanup_log.py ├── html_convert.py └── marxists.org.py ├── docs └── software_architecture.md ├── pyproject.toml ├── src └── warc2zim │ ├── __about__.py │ ├── __init__.py │ ├── cdxj_indexer │ ├── __init__.py │ ├── bufferiter.py │ ├── main.py │ └── postquery.py │ ├── constants.py │ ├── converter.py │ ├── icon_finder.py │ ├── items.py │ ├── language.py │ ├── main.py │ ├── rewriting.py │ ├── templates │ └── head_insert.html │ └── utils.py ├── tasks.py ├── test-website ├── Caddyfile ├── Dockerfile ├── README.md ├── content │ ├── audio.html │ ├── bad-encoding.html │ ├── bad-redirections.html │ ├── base-href.html │ ├── base-href │ │ ├── image1.png │ │ └── something.html │ ├── chinese-encoding.html │ ├── content-types │ │ ├── content1.json │ │ ├── content2.json │ │ ├── image1.png │ │ ├── image2.png │ │ ├── index.html │ │ ├── script1.js │ │ ├── script2.js │ │ ├── small.pdf │ │ ├── style1.css │ │ └── style2.css │ ├── css-broken-file.css │ ├── css-broken-file.html │ ├── css-inline-broken.html │ ├── dailymotion.html │ ├── double-slash.html │ ├── facebook1.html │ ├── facebook2.html │ ├── form-get-1.html │ ├── form-get-2.html │ ├── form-get-3.html │ ├── form-get-4.html │ ├── form-get.html │ ├── full.html │ ├── href-to-folder │ │ └── index.html │ ├── http-equiv-redirect.html │ ├── http-return-codes.html │ ├── icons │ │ ├── android-chrome-192x192.png │ │ ├── android-chrome-512x512.png │ │ ├── apple-touch-icon.png │ │ ├── favicon-16x16.png │ │ ├── favicon-32x32.png │ │ ├── favicon.ico │ │ └── site.webmanifest │ ├── image-srcset.html │ ├── images │ │ ├── image1-1x.png │ │ ├── image1-2x.png │ │ ├── image1.png │ │ ├── image2-1x.png │ │ ├── image2-2x.png │ │ ├── image2.png │ │ ├── image3-high.png │ │ ├── image3-medium.png │ │ ├── image3-small.png │ │ ├── image3.png │ │ ├── image4-1.5x.png │ │ ├── image4.png │ │ ├── urlencoding1_icône-débuter-Solidarité-Numérique_1@300x.png │ │ ├── urlencoding2_ico%CC%82ne-de%CC%81buter-Solidarite%CC%81-Nume%CC%81rique_1%40300x.png │ │ ├── urlencoding3_icône-débuter-Solidarite%CC%81-Nume%CC%81rique_1@300x.png │ │ ├── urlencoding4_(_[_|_]_).png │ │ ├── urlencoding4_ؼ.png │ │ ├── urlencoding5_(_[_|_ؼ_]_).png │ │ ├── urlencoding5_(_[_|_ؼ_]_)_1.png │ │ └── urlencoding5_(_[_|_ؼ_]_)_2.png │ ├── index.html │ ├── instagram.html │ ├── internal_redirect_target.html │ ├── javascript.html │ ├── javascript │ │ ├── cont!nt.txt │ │ ├── cont?nt.txt │ │ ├── content.txt │ │ ├── contént.txt │ │ ├── cont🎁nt.txt │ │ ├── not_working.png │ │ ├── r?sources.js │ │ ├── resources.js │ │ ├── résources.js │ │ ├── script02!b.js │ │ ├── script02a.js │ │ ├── script03.js │ │ ├── script04.js │ │ ├── script05.js │ │ ├── script06a.js │ │ ├── script06b.js │ │ ├── script06c.js │ │ ├── script06d.js │ │ ├── script06e.js │ │ ├── script06f.js │ │ ├── script06g.js │ │ ├── script06h.js │ │ ├── script06i.js │ │ ├── script06j.js │ │ ├── script07.js │ │ ├── script08a.js │ │ ├── script08b.js │ │ ├── script08c.js │ │ ├── script09a.js │ │ ├── script09b.js │ │ ├── script09c.js │ │ ├── script09d.js │ │ ├── script10.js │ │ └── working.png │ ├── jàvàscrïpt.html │ ├── mediaelement.html │ ├── onxxx.html │ ├── soundcloud.html │ ├── spotify.html │ ├── tiktok.html │ ├── twitch1.html │ ├── twitch2.html │ ├── twitter1.html │ ├── twitter2.html │ ├── url-encoding.html │ ├── video.html │ ├── vimeo.html │ └── youtube.html ├── entrypoint-reverse.sh └── entrypoint.sh └── tests ├── __init__.py ├── cdxj_indexer └── test_postappend.py ├── data-special ├── icons.html └── qsl.net-encoding-alias.warc.gz ├── data ├── bad-redirections.warc.gz ├── content-resource-types.warc.gz ├── empty-file ├── example-response.warc ├── example-revisit.warc.gz ├── example-utf8.warc ├── example-with-timestamp.warc ├── http-return-codes.warc.gz ├── kiwix-with-redirects.warc.gz ├── main-entry-403.warc.gz ├── self-redirect.warc ├── single-page-test.warc ├── solidaritenum.warc.gz ├── video-vimeo.warc.gz ├── video-yt-2.warc.gz └── video-yt.warc.gz ├── encodings ├── definition.json ├── file01.js ├── file02.js ├── file03.html ├── file04.js ├── file05.js ├── file06.html ├── file07.html └── file08.js ├── test_converter.py ├── test_icon_finder.py ├── test_language.py ├── test_metadata_validation.py ├── test_rewriting.py ├── test_utils.py ├── test_warc_to_zim.py └── utils.py /.eslintrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "ignorePatterns": [ 3 | "tests/encodings/*.js" 4 | ] 5 | } 6 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: kiwix # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | custom: # https://kiwix.org/support-us/ 13 | -------------------------------------------------------------------------------- /.github/stale.yml: -------------------------------------------------------------------------------- 1 | daysUntilClose: false 2 | staleLabel: stale 3 | 4 | issues: 5 | daysUntilStale: 60 6 | markComment: > 7 | This issue has been automatically marked as stale because it has not had 8 | recent activity. It will be now be reviewed manually. Thank you 9 | for your contributions. 10 | pulls: 11 | daysUntilStale: 7 12 | markComment: > 13 | This pull request has been automatically marked as stale because it has not had 14 | recent activity. It will be now be reviewed manually. Thank you 15 | for your contributions. 16 | -------------------------------------------------------------------------------- /.github/workflows/Publish.yaml: -------------------------------------------------------------------------------- 1 | name: Publish released version 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | jobs: 8 | publish: 9 | runs-on: ubuntu-24.04 10 | permissions: 11 | id-token: write # mandatory for PyPI trusted publishing 12 | 13 | steps: 14 | - uses: actions/checkout@v4 15 | 16 | - name: Set up Python 17 | uses: actions/setup-python@v5 18 | with: 19 | python-version-file: pyproject.toml 20 | architecture: x64 21 | 22 | - name: Install dependencies (and project) 23 | run: | 24 | pip install -U pip 25 | pip install -e .[scripts] 26 | 27 | - name: Build packages 28 | run: | 29 | pip install -U pip build 30 | python -m build --sdist --wheel 31 | 32 | - name: Upload to PyPI 33 | uses: pypa/gh-action-pypi-publish@release/v1.12 34 | 35 | - name: Build and push Docker image 36 | uses: openzim/docker-publish-action@v10 37 | with: 38 | image-name: openzim/warc2zim 39 | tag-pattern: /^v([0-9.]+)$/ 40 | latest-on-tag: true 41 | restrict-to: openzim/warc2zim 42 | registries: ghcr.io 43 | credentials: 44 | GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }} 45 | GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }} 46 | repo_description: auto 47 | repo_overview: auto 48 | -------------------------------------------------------------------------------- /.github/workflows/PublishDockerDevImage.yaml: -------------------------------------------------------------------------------- 1 | name: Publish Docker dev image 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | publish: 10 | runs-on: ubuntu-24.04 11 | 12 | steps: 13 | - uses: actions/checkout@v4 14 | 15 | - name: Build and push Docker image 16 | uses: openzim/docker-publish-action@v10 17 | with: 18 | image-name: openzim/warc2zim 19 | manual-tag: dev 20 | latest-on-tag: false 21 | restrict-to: openzim/warc2zim 22 | registries: ghcr.io 23 | credentials: 24 | GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }} 25 | GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }} 26 | repo_description: auto 27 | repo_overview: auto 28 | -------------------------------------------------------------------------------- /.github/workflows/QA.yaml: -------------------------------------------------------------------------------- 1 | name: QA 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: 7 | - main 8 | 9 | jobs: 10 | check-qa: 11 | runs-on: ubuntu-24.04 12 | 13 | steps: 14 | - uses: actions/checkout@v4 15 | 16 | - name: Set up Python 17 | uses: actions/setup-python@v5 18 | with: 19 | python-version-file: pyproject.toml 20 | architecture: x64 21 | 22 | - name: Install dependencies (and project) 23 | run: | 24 | pip install -U pip 25 | pip install -e .[lint,scripts,test,check] 26 | 27 | - name: Check black formatting 28 | run: inv lint-black 29 | 30 | - name: Check ruff 31 | run: inv lint-ruff 32 | 33 | - name: Check pyright 34 | run: inv check-pyright 35 | -------------------------------------------------------------------------------- /.github/workflows/TestWebsite.yaml: -------------------------------------------------------------------------------- 1 | name: Test website 2 | 3 | on: 4 | push: 5 | tags: 6 | - test-website 7 | 8 | jobs: 9 | publish: 10 | runs-on: ubuntu-24.04 11 | 12 | steps: 13 | - uses: actions/checkout@v4 14 | 15 | - name: Build and push Docker image 16 | uses: openzim/docker-publish-action@v10 17 | with: 18 | image-name: openzim/test-website 19 | manual-tag: latest 20 | latest-on-tag: false 21 | restrict-to: openzim/warc2zim 22 | context: test-website 23 | registries: ghcr.io 24 | credentials: 25 | GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }} 26 | GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }} 27 | repo_description: auto 28 | repo_overview: auto 29 | 30 | - name: Deploy Test website changes to openzim.org 31 | uses: actions-hub/kubectl@master 32 | env: 33 | KUBE_CONFIG: ${{ secrets.TEST_KUBE_CONFIG }} 34 | with: 35 | args: rollout restart deployments test-website-deployment -n test 36 | -------------------------------------------------------------------------------- /.github/workflows/Tests.yaml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: 7 | - main 8 | 9 | jobs: 10 | run-tests: 11 | runs-on: ubuntu-24.04 12 | 13 | steps: 14 | - uses: actions/checkout@v4 15 | 16 | - name: Set up Python 17 | uses: actions/setup-python@v5 18 | with: 19 | python-version-file: pyproject.toml 20 | architecture: x64 21 | 22 | - name: Install dependencies (and project) 23 | run: | 24 | pip install -U pip 25 | pip install -e .[test,scripts] 26 | 27 | - name: Run the tests 28 | run: inv coverage --args "-vvv" 29 | 30 | - name: Upload coverage report to codecov 31 | uses: codecov/codecov-action@v4 32 | with: 33 | token: ${{ secrets.CODECOV_TOKEN }} 34 | 35 | build_python: 36 | runs-on: ubuntu-24.04 37 | steps: 38 | - uses: actions/checkout@v4 39 | 40 | - name: Set up Python 41 | uses: actions/setup-python@v5 42 | with: 43 | python-version-file: pyproject.toml 44 | architecture: x64 45 | 46 | - name: Ensure we can build Python targets 47 | run: | 48 | pip install -U pip build 49 | python3 -m build --sdist --wheel 50 | 51 | build_docker: 52 | runs-on: ubuntu-24.04 53 | steps: 54 | - uses: actions/checkout@v4 55 | 56 | - name: Ensure we can build the Docker image 57 | run: | 58 | docker build -t testimage . 59 | 60 | - name: Ensure we can start the Docker image 61 | run: | 62 | docker run --rm testimage 63 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | repos: 4 | - repo: https://github.com/pre-commit/pre-commit-hooks 5 | rev: v5.0.0 6 | hooks: 7 | - id: trailing-whitespace 8 | - id: end-of-file-fixer 9 | - repo: https://github.com/psf/black 10 | rev: "25.1.0" 11 | hooks: 12 | - id: black 13 | - repo: https://github.com/astral-sh/ruff-pre-commit 14 | rev: v0.9.6 15 | hooks: 16 | - id: ruff 17 | - repo: https://github.com/RobertCraigie/pyright-python 18 | rev: v1.1.394 19 | hooks: 20 | - id: pyright 21 | name: pyright (system) 22 | description: 'pyright static type checker' 23 | entry: pyright 24 | language: system 25 | 'types_or': [python, pyi] 26 | require_serial: true 27 | minimum_pre_commit_version: '2.9.2' 28 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## Changelog 2 | 3 | All notable changes to this project are documented in this file. 4 | 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.4.0). 7 | 8 | ## [Unreleased] 9 | 10 | ### Added 11 | 12 | - Provide default encoding aliases (#416) 13 | 14 | ### Changed 15 | 16 | - Convert aliases given in `--encoding-aliases` to lower case (#412) 17 | 18 | ## [2.2.2] - 2024-02-17 19 | 20 | ### Changed 21 | 22 | - Upgrade dependencies especially zimscraperlib 5.1.1 (#439) 23 | 24 | ## [2.2.1] - 2024-02-07 25 | 26 | ### Changed 27 | 28 | - Upgrade dependencies: Python 3.13, zimscraperlib 5.1.0 and others (#434) 29 | - Fork cdxj_indexer codebase (#428) 30 | 31 | ## [2.2.0] - 2024-01-10 32 | 33 | ### Changed 34 | 35 | - Upgrade dependencies: zimscraperlib 5.0.0, warcio 1.7.5, cdxj_index 1.4.6 and others 36 | - Use all rewriting stuff from zimscraperlib 37 | - Remove most HTML / CSS / JS rewriting logic which is now part of zimscraperlib 5 38 | - Fix wombat setup settings (especially `isSW`) (#293) 39 | 40 | ### Fixed 41 | 42 | - Stop checking main entry processability when it is already found (#424) 43 | 44 | ## [2.1.3] - 2024-11-01 45 | 46 | ### Changed 47 | 48 | - Upgrade to wombat 3.8.3 (#414) 49 | 50 | ## [2.1.2] - 2024-10-08 51 | 52 | ### Added 53 | 54 | - Enrich test website with img srcset situations (in preparation for #403) 55 | 56 | ### Changed 57 | 58 | - Upgrade dependencies, including wombat 3.8.2 (#407) 59 | 60 | ### Fixed 61 | 62 | - HTML document can be retrieved as `fetch` resource type (#405) 63 | 64 | ## [2.1.1] - 2024-09-05 65 | 66 | ### Changed 67 | 68 | - Upgrade dependencies, including wombat 3.8.0 (#386) 69 | 70 | ## [2.1.0] - 2024-08-09 71 | 72 | ### Added 73 | 74 | - New fuzzy-rule for cheatography.com (#342), der-postillon.com (#330), iranwire.com (#363) 75 | - Properly rewrite redirect target url when present in HTML tag (#237) 76 | - New `--encoding-aliases` argument to pass encoding/charset aliases (#331) 77 | - Add support for SVG favicon (#148) 78 | - Automatically index PDF content and use PDF title (#289 and #290) 79 | 80 | ### Changed 81 | 82 | - Upgrade to python-scraperlib 4.0.0 83 | - Generate fuzzy rules tests in Python and Javascript (#284) 84 | - Refactor HTML rewriter class to make it more open to change and expressive (#305) 85 | - Detect charset in document header only for HTML documents (#331) 86 | - Use `software` property from `warcinfo` record to set ZIM `Scraper` metadata (#357) 87 | - Store `ContentDate` as metadata, based on `WARC-Date` (#358) 88 | - Remove domain specific rules (#328) 89 | - Revisit retrieve_illustration logic to prefer best favicons (#352 and #369) 90 | - Upgrade dependencies (zimscraperlib 4.0.0, wombat.js 3.7.12 and others) (#376) 91 | 92 | ### Fixed 93 | 94 | - Handle case where the redirect target is bad / unsupported (#332 and #356) 95 | - Fixed WARC files handling order to follow creation order (#366) 96 | - Remove subsequent slashes in URLs, both in Python and JS (#365) 97 | - Ignore non HTTP(S) WARC records (#351) 98 | - Fix `vimeo_cdn_fix` fuzzy rule for proper operation in Javascript (#348) 99 | - Performance issue linked to new "extensible" HTML rewriting rules (#370) 100 | 101 | ## [2.0.3] - 2024-07-24 102 | 103 | ### Changed 104 | 105 | - Moved rules definition from JSON to YAML and documented update process (#216) 106 | - Upgrade to wombat.js 3.7.11 107 | 108 | ### Added 109 | 110 | - Exit with cleaner message when no entries are expected in the ZIM (#336) and when main entry is not processable (#337) 111 | - Add debug log for items whose content is empty (#344) 112 | 113 | ### Fixed 114 | 115 | - Some resources rewrite mode are still not correctly identified (#326) 116 | 117 | ## [2.0.2] - 2024-06-18 118 | 119 | ### Added 120 | 121 | - Add `--ignore-content-header-charsets` option to disable automatic retrieval of content charsets from content first bytes (#318) 122 | - Add `--content-header-bytes-length` option to specify how many first bytes to consider when searching for content charsets in header (#320) 123 | - Add `--ignore-http-header-charsets` option to disable automatic retrieval of content charsets from content HTTP `Content-Type` headers (#318) 124 | 125 | ### Changed 126 | 127 | - Simplify logic deciding content charset, stop guessing with chardet (#312) 128 | 129 | ### Fixed 130 | 131 | - Rewrite only content with mimetype `text-html` when `WARC-Resource-Type` is `html` (#313) 132 | 133 | ## [2.0.1] - 2024-06-13 134 | 135 | ### Added 136 | 137 | - Add support for multiple languages in `--lang` CLI argument (#300) 138 | 139 | ### Changed 140 | 141 | - Use the new `WARC-Resource-Type` header to decide rewrite mode (when present in WARC) (#296) 142 | - Upgrade Python dependencies + wombat.js 3.7.5 143 | 144 | ### Fixed 145 | 146 | - Drop `integrity` attribute in HTML ` 6 | 7 | 29 | {% endautoescape %} 30 | 31 | 32 | -------------------------------------------------------------------------------- /src/warc2zim/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # vim: ai ts=4 sts=4 et sw=4 nu 3 | 4 | from __future__ import annotations 5 | 6 | import re 7 | from http import HTTPStatus 8 | 9 | from bs4 import BeautifulSoup 10 | from warcio.recordloader import ArcWarcRecord 11 | 12 | from warc2zim.__about__ import __version__ 13 | 14 | ENCODING_RE = re.compile( 15 | r"(charset|encoding)=(?P['\"]?)(?P[a-wA-Z0-9_\-]+)(?P=quote)", 16 | re.ASCII, 17 | ) 18 | 19 | DEFAULT_ENCODING_ALIASES = { 20 | "ansi": "windows-1252", 21 | "65001": "utf-8", 22 | "iso-utf-8": "utf-8", 23 | "u": "utf-8", 24 | "unicode": "utf-8", 25 | "utf-8": "utf-8", 26 | "utf-08": "utf-8", 27 | "utf-f": "utf-8", 28 | "utp-8": "utf-8", 29 | "windows-8859-1": "iso-8859-1", 30 | "iso88591": "iso-8859-1", 31 | } 32 | 33 | ENCODING_ALIASES = {} 34 | 35 | 36 | def set_encoding_aliases(aliases: dict[str, str]): 37 | """Set the encoding aliases to use to decode""" 38 | ENCODING_ALIASES.clear() 39 | ENCODING_ALIASES.update({**DEFAULT_ENCODING_ALIASES, **aliases}) 40 | 41 | 42 | def get_encoding_by_alias(alias: str) -> str: 43 | """Get the encoding method for alias.""" 44 | key = alias.lower().strip() 45 | return ENCODING_ALIASES.get(key, key) 46 | 47 | 48 | def get_version(): 49 | return __version__ 50 | 51 | 52 | def get_record_url(record) -> str: 53 | """Check if record has url converted from POST/PUT, and if so, use that 54 | otherwise return the target url""" 55 | if hasattr(record, "urlkey"): 56 | return record.urlkey 57 | return str(record.rec_headers["WARC-Target-URI"]) 58 | 59 | 60 | def get_status_code(record: ArcWarcRecord) -> HTTPStatus | int | None: 61 | """Get the HTTP status of a given ArcWarcRecord 62 | 63 | Returns HTTPStatus value or None if status code is not found / supported 64 | """ 65 | if record.rec_type == "response": 66 | status_code = record.http_headers.get_statuscode() 67 | else: 68 | status_code = record.rec_headers.get_statuscode() 69 | 70 | if status_code is None or status_code.strip() == "": 71 | # null / missing http status found, ignore it 72 | return None 73 | 74 | status_code = int(status_code) 75 | 76 | try: 77 | status_code = HTTPStatus(status_code) 78 | except ValueError: 79 | # invalid http status found, ignore it (happens when bad http status is 80 | # returned, e.g 0, 306) 81 | return status_code 82 | 83 | return status_code 84 | 85 | 86 | def can_process_status_code(status_code: HTTPStatus | int | None) -> bool: 87 | """Return a boolean indicating if this status code is a processable redirect""" 88 | return isinstance(status_code, HTTPStatus) and not ( 89 | status_code.is_informational # not supposed to exist in WARC files 90 | or status_code.is_client_error 91 | or status_code.is_server_error 92 | or ( 93 | status_code.is_success 94 | and status_code 95 | not in [ 96 | HTTPStatus.OK, 97 | HTTPStatus.CREATED, 98 | HTTPStatus.ACCEPTED, 99 | HTTPStatus.NON_AUTHORITATIVE_INFORMATION, 100 | ] 101 | ) 102 | or ( 103 | status_code.is_redirection 104 | and status_code 105 | not in [ 106 | HTTPStatus.MOVED_PERMANENTLY, 107 | HTTPStatus.FOUND, 108 | HTTPStatus.TEMPORARY_REDIRECT, 109 | HTTPStatus.PERMANENT_REDIRECT, 110 | ] 111 | ) 112 | ) 113 | 114 | 115 | def status_code_is_processable_redirect(status_code: HTTPStatus | int | None) -> bool: 116 | """Return a boolean indicating if this status code is processable redirect""" 117 | return isinstance(status_code, HTTPStatus) and status_code in [ 118 | HTTPStatus.MOVED_PERMANENTLY, 119 | HTTPStatus.FOUND, 120 | HTTPStatus.TEMPORARY_REDIRECT, 121 | HTTPStatus.PERMANENT_REDIRECT, 122 | ] 123 | 124 | 125 | def get_record_content_type(record: ArcWarcRecord) -> str: 126 | if record.http_headers: 127 | # if the record has HTTP headers, use the Content-Type from those 128 | # (eg. 'response' record) 129 | content_type = record.http_headers["Content-Type"] 130 | else: 131 | # otherwise, use the Content-Type from WARC headers 132 | content_type = record.rec_headers["Content-Type"] 133 | return content_type or "" 134 | 135 | 136 | def get_record_mime_type(record: ArcWarcRecord) -> str: 137 | content_type = get_record_content_type(record) 138 | return content_type.split(";")[0] 139 | 140 | 141 | def parse_title(content): 142 | try: 143 | soup = BeautifulSoup(content, "html.parser") 144 | return soup.title.text or "" # pyright: ignore[reportOptionalMemberAccess] 145 | except Exception: 146 | return "" 147 | 148 | 149 | def get_record_encoding(record: ArcWarcRecord) -> str | None: 150 | content_type = get_record_content_type(record) 151 | if m := ENCODING_RE.search(content_type): 152 | return m.group("encoding") 153 | 154 | 155 | def to_string( 156 | input_: str | bytes, 157 | http_encoding: str | None, 158 | charsets_to_try: list[str], 159 | content_header_bytes_length: int, 160 | *, 161 | ignore_content_header_charsets: bool, 162 | ignore_http_header_charsets: bool, 163 | ) -> str: 164 | """ 165 | Decode content to string based on charset declared in content or fallback. 166 | 167 | This method tries to not be smarter than necessary. 168 | 169 | First, it tries to find an charset declaration inside the first bytes of the content 170 | (hopping that content first bytes can be losely decoded using few known encoding to 171 | something usable). If found, it is used to decode and any bad character is 172 | automatically replaced, assuming document editor is right. 173 | 174 | Second, if no charset declaration has been found in content, it uses the charset 175 | declared in HTTP `Content-Type` header. This is passed to this method as 176 | `http_encoding` argument. If present, it is used to decode and any bad character is 177 | automatically replaced, assuming web server is right. 178 | 179 | Finally, we fallback to use `charsets_to_try` argument, which is a list of charsets 180 | to try. Each charset is tried in order, but any bad character found is raising an 181 | error. If none of these charsets achieves to decode the content, an exception is 182 | raised. 183 | 184 | Returns the decoded content. 185 | 186 | """ 187 | 188 | if isinstance(input_, str): 189 | return input_ 190 | 191 | if not input_: 192 | # Empty bytes are easy to decode 193 | return "" 194 | 195 | # Search for encoding from content first bytes based on regexp 196 | if not ignore_content_header_charsets: 197 | for encoding in ["ascii", "utf-16", "utf-32"]: 198 | content_start = input_[:content_header_bytes_length].decode( 199 | encoding, errors="replace" 200 | ) 201 | if m := ENCODING_RE.search(content_start): 202 | head_encoding = m.group("encoding") 203 | return input_.decode( 204 | get_encoding_by_alias(head_encoding), 205 | errors="replace", 206 | ) 207 | 208 | # Search for encofing in HTTP `Content-Type` header 209 | if not ignore_http_header_charsets and http_encoding: 210 | return input_.decode(get_encoding_by_alias(http_encoding), errors="replace") 211 | 212 | # Try all charsets_to_try passed 213 | for charset_to_try in charsets_to_try: 214 | try: 215 | return input_.decode(get_encoding_by_alias(charset_to_try)) 216 | except (ValueError, LookupError): 217 | pass 218 | 219 | raise ValueError(f"No suitable charset found to decode content {input_[:200]}") 220 | 221 | 222 | def get_record_content(record: ArcWarcRecord) -> bytes: 223 | if hasattr(record, "buffered_stream"): 224 | stream = ( 225 | record.buffered_stream # pyright: ignore [reportGeneralTypeIssues, reportAttributeAccessIssue] 226 | ) 227 | stream.seek(0) 228 | return stream.read() 229 | else: 230 | return record.content_stream().read() 231 | -------------------------------------------------------------------------------- /tasks.py: -------------------------------------------------------------------------------- 1 | # pyright: strict, reportUntypedFunctionDecorator=false 2 | import os 3 | 4 | from invoke.context import Context 5 | from invoke.tasks import task # pyright: ignore [reportUnknownVariableType] 6 | 7 | use_pty = not os.getenv("CI", "") 8 | 9 | 10 | @task(optional=["args"], help={"args": "pytest additional arguments"}) 11 | def test(ctx: Context, args: str = ""): 12 | """run tests (without coverage)""" 13 | ctx.run(f"pytest {args}", pty=use_pty) 14 | 15 | 16 | @task(optional=["args"], help={"args": "pytest additional arguments"}) 17 | def test_cov(ctx: Context, args: str = ""): 18 | """run test vith coverage""" 19 | ctx.run(f"coverage run -m pytest {args}", pty=use_pty) 20 | 21 | 22 | @task(optional=["html"], help={"html": "flag to export html report"}) 23 | def report_cov(ctx: Context, *, html: bool = False): 24 | """report coverage""" 25 | ctx.run("coverage combine", warn=True, pty=use_pty) 26 | ctx.run("coverage report --show-missing", pty=use_pty) 27 | if html: 28 | ctx.run("coverage html", pty=use_pty) 29 | 30 | 31 | @task( 32 | optional=["args", "html"], 33 | help={ 34 | "args": "pytest additional arguments", 35 | "html": "flag to export html report", 36 | }, 37 | ) 38 | def coverage(ctx: Context, args: str = "", *, html: bool = False): 39 | """run tests and report coverage""" 40 | test_cov(ctx, args=args) 41 | report_cov(ctx, html=html) 42 | 43 | 44 | @task(optional=["args"], help={"args": "black additional arguments"}) 45 | def lint_black(ctx: Context, args: str = "."): 46 | args = args or "." # needed for hatch script 47 | ctx.run("black --version", pty=use_pty) 48 | ctx.run(f"black --check --diff {args}", pty=use_pty) 49 | 50 | 51 | @task(optional=["args"], help={"args": "ruff additional arguments"}) 52 | def lint_ruff(ctx: Context, args: str = "."): 53 | args = args or "." # needed for hatch script 54 | ctx.run("ruff --version", pty=use_pty) 55 | ctx.run(f"ruff check {args}", pty=use_pty) 56 | 57 | 58 | @task( 59 | optional=["args"], 60 | help={ 61 | "args": "linting tools (black, ruff) additional arguments, typically a path", 62 | }, 63 | ) 64 | def lintall(ctx: Context, args: str = "."): 65 | """Check linting""" 66 | args = args or "." # needed for hatch script 67 | lint_black(ctx, args) 68 | lint_ruff(ctx, args) 69 | 70 | 71 | @task(optional=["args"], help={"args": "check tools (pyright) additional arguments"}) 72 | def check_pyright(ctx: Context, args: str = ""): 73 | """check static types with pyright""" 74 | ctx.run("pyright --version") 75 | ctx.run(f"pyright {args}", pty=use_pty) 76 | 77 | 78 | @task(optional=["args"], help={"args": "check tools (pyright) additional arguments"}) 79 | def checkall(ctx: Context, args: str = ""): 80 | """check static types""" 81 | check_pyright(ctx, args) 82 | 83 | 84 | @task(optional=["args"], help={"args": "black additional arguments"}) 85 | def fix_black(ctx: Context, args: str = "."): 86 | """fix black formatting""" 87 | args = args or "." # needed for hatch script 88 | ctx.run(f"black {args}", pty=use_pty) 89 | 90 | 91 | @task(optional=["args"], help={"args": "ruff additional arguments"}) 92 | def fix_ruff(ctx: Context, args: str = "."): 93 | """fix all ruff rules""" 94 | args = args or "." # needed for hatch script 95 | ctx.run(f"ruff check --fix {args}", pty=use_pty) 96 | 97 | 98 | @task( 99 | optional=["args"], 100 | help={ 101 | "args": "linting tools (black, ruff) additional arguments, typically a path", 102 | }, 103 | ) 104 | def fixall(ctx: Context, args: str = "."): 105 | """Fix everything automatically""" 106 | args = args or "." # needed for hatch script 107 | fix_black(ctx, args) 108 | fix_ruff(ctx, args) 109 | lintall(ctx, args) 110 | -------------------------------------------------------------------------------- /test-website/Caddyfile: -------------------------------------------------------------------------------- 1 | {$SITE_ADDRESS} { 2 | header { 3 | Access-Control-Allow-Origin * 4 | } 5 | 6 | file_server { 7 | root /var/www/html 8 | } 9 | 10 | handle /dynimages { 11 | @hasPathParam query "path=*" 12 | handle @hasPathParam { 13 | rewrite * /images/{http.request.uri.query.path} 14 | } 15 | handle { 16 | respond 500 "Bad Request" 17 | } 18 | } 19 | 20 | handle /dynform-rewrite { 21 | @hasIdParam query "id=*" 22 | handle @hasIdParam { 23 | rewrite * /form-get-{http.request.uri.query.id}.html 24 | } 25 | handle { 26 | respond 500 "Bad Request" 27 | } 28 | } 29 | 30 | handle /dynform-redirect { 31 | @hasIdParam query "id=*" 32 | handle @hasIdParam { 33 | redir * /form-get-{http.request.uri.query.id}.html 302 34 | } 35 | handle { 36 | respond 500 "Bad Request" 37 | } 38 | } 39 | 40 | respond /200-response "Never mind" 200 41 | 42 | respond /201-response "Never mind" 201 43 | 44 | respond /202-response "Never mind" 202 45 | 46 | respond /204-response 204 47 | 48 | respond /206-response "Never mind" 206 49 | 50 | respond /300-response "Never mind" 300 51 | 52 | respond /303-response "Never mind" 303 53 | 54 | respond /304-response "Never mind" 304 55 | 56 | respond /305-response "Never mind" 305 57 | 58 | respond /306-response "Never mind" 306 59 | 60 | respond /400-response 400 61 | 62 | respond /401-response 401 63 | 64 | respond /402-response 402 65 | 66 | respond /403-response 403 67 | 68 | respond /404-response 404 69 | 70 | respond /500-response 500 71 | 72 | respond /501-response 501 73 | 74 | respond /502-response 502 75 | 76 | respond // "Hello you" 400 77 | 78 | respond /double-slash/test1 "Hello you" 200 79 | respond /double-slash//test1 400 80 | 81 | respond /double-slash/test2 "Hello you v1" 200 82 | respond /double-slash//test2 "Hello you v2" 200 83 | 84 | redir /301-internal-redirect-ok /internal_redirect_target.html 301 85 | redir /301-external-redirect-ok https://www.example.com 301 86 | redir /302-internal-redirect-ok /internal_redirect_target.html 302 87 | redir /302-external-redirect-ok https://www.example.com 302 88 | redir /307-internal-redirect-ok /internal_redirect_target.html 307 89 | redir /307-external-redirect-ok https://www.example.com 307 90 | redir /308-internal-redirect-ok /internal_redirect_target.html 308 91 | redir /308-external-redirect-ok https://www.example.com 308 92 | 93 | redir /301-internal-redirect-ko /internal_redirect_missing.html 301 94 | redir /301-external-redirect-ko https://www.example.invalid 301 95 | redir /302-internal-redirect-ko /internal_redirect_missing.html 302 96 | redir /302-external-redirect-ko https://www.example.invalid 302 97 | redir /307-internal-redirect-ko /internal_redirect_missing.html 307 98 | redir /307-external-redirect-ko https://www.example.invalid 307 99 | redir /308-internal-redirect-ko /internal_redirect_missing.html 308 100 | redir /308-external-redirect-ko https://www.example.invalid 308 101 | 102 | redir /bad-redir-loop-A /bad-redir-loop-B 307 103 | redir /bad-redir-loop-B /bad-redir-loop-C 307 104 | redir /bad-redir-loop-C /bad-redir-loop-D 307 105 | redir /bad-redir-loop-D /bad-redir-loop-B 307 106 | redir /bad-redir-target-A https://I%20mNotAhostname 307 107 | redir /bad-redir-target-B intent://example.com/path#Intent;scheme=http;package=com.example.myapp;component=com.example.myapp/.MainActivity;end 307 108 | 109 | header /content-types/script1.js Content-Type application/javascript 110 | header /content-types/script2.js Content-Type text/javascript 111 | header /content-types/style1.css Content-Type text/css 112 | header /content-types/style2.css Content-Type idontexist 113 | header /content-types/image1.png Content-Type idontexist 114 | header /content-types/image2.png Content-Type text/html 115 | header /content-types/content2.json Content-Type idontexist 116 | 117 | } 118 | -------------------------------------------------------------------------------- /test-website/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM caddy:2.6.1-alpine 2 | LABEL org.opencontainers.image.source=https://github.com/openzim/warc2zim 3 | 4 | COPY Caddyfile /etc/caddy/Caddyfile 5 | 6 | COPY content/ /var/www/html/ 7 | 8 | RUN apk add --no-cache bash 9 | 10 | COPY entrypoint.sh /root/entrypoint.sh 11 | 12 | ENTRYPOINT [ "/root/entrypoint.sh" ] 13 | 14 | CMD [ "caddy", "run", "--config", "/etc/caddy/Caddyfile", "--adapter", "caddyfile"] 15 | -------------------------------------------------------------------------------- /test-website/README.md: -------------------------------------------------------------------------------- 1 | # Test website 2 | 3 | This is a test website for zimit / warc2zim tests. 4 | 5 | It contains all kind of situations we currently cope with: 6 | - a youtube player 7 | - a vimeo player 8 | - an X/twitter post 9 | - an X/twitter video 10 | - a facebook video 11 | - a facebook post 12 | - an instagram embed 13 | 14 | - invalid inline CSS 15 | - invalid rule in a CSS file 16 | 17 | 18 | ## How to use locally 19 | 20 | Build the docker image 21 | 22 | ``` 23 | docker build -t test-website . 24 | ``` 25 | 26 | Start the test website with appropriate environment variables. 27 | 28 | ``` 29 | docker run -p 8888:80 --rm --name test-website -e SITE_ADDRESS="test-website.local.oviles.info:80, xn--jdacents-v0aqb.local.oviles.info:80" -e STANDARD_NETLOC="http:\/\/test-website.local.oviles.info:8888" -e NOT_STANDARD_NETLOC_NOT_ENCODED="http:\/\/jédéacçents.local.oviles.info:8888" -e NOT_STANDARD_NETLOC_PUNNY_ENCODED="http:\/\/xn--jdacents-v0aqb.local.oviles.info:8888" test-website 30 | ``` 31 | 32 | In the example above, the trick is that we have the following DNS records in place : `local.oviles.info A 127.0.0.1` and `*.local.oviles.info CNAME local.oviles.info`, meaning that any request to local.oviles.info or one of its subdomain will resolve to localhost IP 127.0.0.1 ; we use local ports 8080 for HTTP and 8443 for HTTPS. 33 | 34 | You can then open https://test-website.local.oviles.info:8888 in your favorite browser and run manual tests on this website (which uses the other one as sub-site for few resources on special domains with special characters). 35 | 36 | If you wanna develop the test-website locally, you might want as well to mount the `content` folder inside the container 37 | 38 | ``` 39 | docker run -v $PWD/content:/var/www/html -p 8888:80 --rm --name test-website -e SITE_ADDRESS="test-website.local.oviles.info:80, xn--jdacents-v0aqb.local.oviles.info:80" -e STANDARD_NETLOC="http:\/\/test-website.local.oviles.info:8888" -e NOT_STANDARD_NETLOC_NOT_ENCODED="http:\/\/jédéacçents.local.oviles.info:8888" -e NOT_STANDARD_NETLOC_PUNNY_ENCODED="http:\/\/xn--jdacents-v0aqb.local.oviles.info:8888" test-website 40 | ``` 41 | 42 | This will have the adverse effect that local files will be modified as well by the `entrypoint.sh` to replace placeholders by environment variables value. And it means that you have to use "real" netloc from the environment in your modifications for test. 43 | 44 | Once done, there is a utility script at `entrypoint-reverse.sh` which can be used to reverse these modifications once you are about to commit to Github (this will break the test-website inside Docker container, but you will be able to commit with proper modifications and then just restart the container to reapply needed modification). 45 | 46 | ## Environments variables needed in Docker image 47 | 48 | |Environment variable | Usage | Comment | Sample value | 49 | |--|--|--|--| 50 | | `SITE_ADDRESS` | Caddyfile | The site address for Caddy operation ; should contain a standard and a not-standard punny-encoded hostnames for proper testing | `test-website.local.oviles.info:80, xn--jdacents-v0aqb.local.oviles.info:80` | 51 | | `STANDARD_NETLOC` | sed in HTML/JS/... files ^1 | The URL to the standard netloc (no special characters) | `http:\/\/test-website.local.oviles.info:8888` | 52 | | `NOT_STANDARD_NETLOC_NOT_ENCODED` | sed in HTML/JS/... files ^1 | The URL to the not standard netloc (with special characters) but not encoded | `http:\/\/jédéacçents.local.oviles.info:8888` | 53 | | `NOT_STANDARD_NETLOC_PUNNY_ENCODED` | sed in HTML/JS/... files ^1 | The URL to the not standard netloc (with special characters) punny encoded | `http:\/\/xn--jdacents-v0aqb.local.oviles.info:8888` | 54 | 55 | 1. variables that will be used by `sed` as replacement have to be be escaped for proper sed operations ; since they will be interpreted by the browser, they should contain the "user-visible" FQDN and port 56 | -------------------------------------------------------------------------------- /test-website/content/audio.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Test website 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 |

Audio tag

17 | 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /test-website/content/bad-encoding.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/test-website/content/bad-encoding.html -------------------------------------------------------------------------------- /test-website/content/bad-redirections.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Test website 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 |

Bad redirections

17 | 18 |

Links below are indefinitely redirecting and hence not working

19 | 20 |

Redirect to self through loop

21 | 22 |

Redirect to inner-loop

23 | 24 |

Links below is targeting something which is not working

25 | 26 |

Redirect to silly HTTP URL

27 | 28 |

Redirect to an intent (not working inside ZIM)

29 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /test-website/content/base-href.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Test website 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 |

Base href

20 | 21 | You should check that icon (meaning favicon) / webmanifest are properly loaded as well 22 | 23 |

Something on relative link

24 |

Something on absolute link

25 |

A picture with relative link:

26 |

Another picture with relative link:

27 |

A picture with absolute link:

28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /test-website/content/base-href/image1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/test-website/content/base-href/image1.png -------------------------------------------------------------------------------- /test-website/content/base-href/something.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Test website 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 |

I'm located here

17 | 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /test-website/content/chinese-encoding.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/test-website/content/chinese-encoding.html -------------------------------------------------------------------------------- /test-website/content/content-types/content1.json: -------------------------------------------------------------------------------- 1 | { 2 | "property": "I'm OK" 3 | } 4 | -------------------------------------------------------------------------------- /test-website/content/content-types/content2.json: -------------------------------------------------------------------------------- 1 | { 2 | "property": "I'm OK" 3 | } 4 | -------------------------------------------------------------------------------- /test-website/content/content-types/image1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/test-website/content/content-types/image1.png -------------------------------------------------------------------------------- /test-website/content/content-types/image2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/test-website/content/content-types/image2.png -------------------------------------------------------------------------------- /test-website/content/content-types/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Test website 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 22 | 23 | 24 | 25 | 26 |

Tests of records with some missing or bad content-type headers

27 | 28 |

29 | An image should be displayed here: 30 | 31 |

32 |

33 | And mostly same image should be displayed here: 34 | 35 |

36 |

Open a PDF

37 |

This text must be in green.

38 |

This text must also be in green.

39 |

Something should be loaded here from JS: I'm not OK

40 |

Something also should be loaded here from JS: I'm not OK

41 | 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /test-website/content/content-types/script1.js: -------------------------------------------------------------------------------- 1 | async function fetchContent1() { 2 | const response = await fetch('./content1.json') 3 | const json = await response.json(); 4 | const span = document.getElementById("span01"); 5 | span.innerHTML=json.property; 6 | span.classList.add('green'); 7 | } 8 | fetchContent1() 9 | -------------------------------------------------------------------------------- /test-website/content/content-types/script2.js: -------------------------------------------------------------------------------- 1 | async function fetchContent2() { 2 | const response = await fetch('./content2.json') 3 | const json = await response.json(); 4 | const span = document.getElementById("span02"); 5 | span.innerHTML=json.property; 6 | span.classList.add('green'); 7 | } 8 | fetchContent2() 9 | -------------------------------------------------------------------------------- /test-website/content/content-types/small.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/test-website/content/content-types/small.pdf -------------------------------------------------------------------------------- /test-website/content/content-types/style1.css: -------------------------------------------------------------------------------- 1 | .green1 { 2 | color: green; 3 | } 4 | -------------------------------------------------------------------------------- /test-website/content/content-types/style2.css: -------------------------------------------------------------------------------- 1 | .green2 { 2 | color: green; 3 | } 4 | -------------------------------------------------------------------------------- /test-website/content/css-broken-file.css: -------------------------------------------------------------------------------- 1 | 2 | .great { 3 | not_exits_prop: red; /* property does not exist */ 4 | background-image: url('http://www.mediaelementjs.com/images/big_buck_bunny.jpg'); 5 | width: 150px; 6 | font-size: ; /* missing value */ 7 | display= 12; /* equal sign */ 8 | height: 100px; 9 | margin: 2rem; 10 | padding: 1rem; 11 | font-weight: bolder; 12 | color: white; 13 | text-align: center; 14 | } 15 | 16 | "broken 17 | -------------------------------------------------------------------------------- /test-website/content/css-broken-file.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Test website 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 |

Broken CSS file

18 | 19 |
20 | CSS file is broken but background image is displayed 21 |
22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /test-website/content/css-inline-broken.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Test website 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 |

Broken inline CSS

17 | 18 |
20 | CSS not broken, background image displayed 21 |
22 |
24 | CSS broken, background image displayed as well 25 |
26 | 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /test-website/content/dailymotion.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Test website 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 |

Dailymotion

17 | 18 |
19 | 22 |
23 | 24 | 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /test-website/content/double-slash.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Test website 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 |

Double slash in URLs

17 | 18 | .// 19 | ./double-slash//test1 20 | ./double-slash/test1 21 | ./double-slash/test2 22 | ./double-slash//test2 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /test-website/content/facebook1.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Test website 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 |

Facebook video embed from Feb. 2024

18 | 19 | 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /test-website/content/facebook2.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Test website 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 |

Facebook post embed from Feb. 2024

17 | 18 | 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /test-website/content/form-get-1.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Test website 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 |

Form get subpage 1

17 | 18 |

Congratulations, you found subpage 1

19 | 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /test-website/content/form-get-2.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Test website 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 |

Form get subpage 2

17 | 18 |

Congratulations, you found subpage 2

19 | 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /test-website/content/form-get-3.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Test website 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 |

Form get subpage 3

17 | 18 |

Congratulations, you found subpage 3

19 | 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /test-website/content/form-get-4.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Test website 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 |

Form get subpage 4

17 | 18 |

Congratulations, you found subpage 4

19 | 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /test-website/content/form-get.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Test website 6 | 11 | 17 | 23 | 24 | 25 | 26 | 27 | 28 |

Form GET

29 | 30 |

This page showcase a situation where someone used a form with a combobox to enable navigation on the website.

31 | 32 |

Nota: this has been seen in the wild on https://chopin.lib.uchicago.edu/ (open any title scores and the combobox will appear in top right corner) and on https://medecine-integree.com/ (for which we - Kiwix - do not have any rights to copy but have been approached by a user).

33 | 34 |

Rewrite

35 |

Server configuration will intercept the queryparameter and rewrite internal to proper HTML file, invisible to the end user.

36 |
37 | 40 | 45 | 46 |
47 | 48 |

Redirect

49 |

Server configuration will intercept the queryparameter and redirect with a 302 to proper URL, hence visible to the end user.

50 |
51 | 54 | 59 | 60 |
61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /test-website/content/href-to-folder/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Test website 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 |

Tests of links to folder instead of document

17 | 18 |

./

19 |

../href-to-folder/

20 |

../../href-to-folder/ (too deep)

21 |

#section1

22 |

#section2

23 |

...

24 |

...

25 |

...

26 |

...

27 |

...

28 |

...

29 |

...

30 |

...

31 |

...

32 |

...

33 |

...

34 |

...

35 |

...

36 |

...

37 |

...

38 |

Section1

39 |

...

40 |

...

41 |

...

42 |

...

43 |

...

44 |

...

45 |

...

46 |

...

47 |

...

48 |

...

49 |

Section2

50 |

...

51 |

...

52 |

...

53 |

...

54 |

...

55 |

...

56 |

...

57 |

...

58 |

...

59 |

...

60 | 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /test-website/content/http-equiv-redirect.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Test website 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 |

Redirect with http-equiv meta directive

18 | 19 |

You should be redirected to home page in 3 seconds

20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /test-website/content/http-return-codes.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Test website 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 |

Various redirects

17 | 18 |

200 response

19 |

201 response

20 |

202 response

21 |

204 response

22 |

206 response

23 |

300 response

24 |

303 response

25 |

304 response

26 |

305 response

27 |

306 response

28 |

400 response

29 |

401 response

30 |

402 response

31 |

403 response

32 |

404 response

33 |

500 response

34 |

501 response

35 |

502 response

36 |

301 Internal redirect OK

37 |

301 External redirect OK

38 |

302 Internal redirect OK

39 |

302 External redirect OK

40 |

307 Internal redirect OK

41 |

307 External redirect OK

42 |

308 Internal redirect OK

43 |

308 External redirect OK

44 |

301 Internal redirect KO

45 |

301 External redirect KO

46 |

302 Internal redirect KO

47 |

302 External redirect KO

48 |

307 Internal redirect KO

49 |

307 External redirect KO

50 |

308 Internal redirect KO

51 |

308 External redirect KO

52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /test-website/content/icons/android-chrome-192x192.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/test-website/content/icons/android-chrome-192x192.png -------------------------------------------------------------------------------- /test-website/content/icons/android-chrome-512x512.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/test-website/content/icons/android-chrome-512x512.png -------------------------------------------------------------------------------- /test-website/content/icons/apple-touch-icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/test-website/content/icons/apple-touch-icon.png -------------------------------------------------------------------------------- /test-website/content/icons/favicon-16x16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/test-website/content/icons/favicon-16x16.png -------------------------------------------------------------------------------- /test-website/content/icons/favicon-32x32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/test-website/content/icons/favicon-32x32.png -------------------------------------------------------------------------------- /test-website/content/icons/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/test-website/content/icons/favicon.ico -------------------------------------------------------------------------------- /test-website/content/icons/site.webmanifest: -------------------------------------------------------------------------------- 1 | { 2 | "name": "RFG", 3 | "short_name": "RFG", 4 | "icons": [ 5 | { 6 | "src": "./android-chrome-192x192.png", 7 | "sizes": "192x192", 8 | "type": "image/png" 9 | }, 10 | { 11 | "src": "./android-chrome-512x512.png", 12 | "sizes": "512x512", 13 | "type": "image/png" 14 | } 15 | ], 16 | "theme_color": "#377dba", 17 | "background_color": "#377dba", 18 | "display": "standalone" 19 | } 20 | -------------------------------------------------------------------------------- /test-website/content/image-srcset.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Test website 6 | 11 | 17 | 23 | 24 | 25 | 26 | 27 | 28 |

29 | This page contains tests around varying images based on screen / device. 30 |

31 | 32 |

Image srcset

33 | 34 |

35 | An image should be displayed below at all screen sizes and pixel ratios. 36 |

37 | 38 | an image 47 | 48 |

49 | Another image should be displayed below at all screen sizes and pixel 50 | ratios. 51 |

52 | 53 | an image 58 | 59 |

Picture sources - with srcset pixel ratio

60 | 61 |

62 | An image should be displayed below at all screen sizes and pixel ratios. 63 |

64 | 65 | 66 | 74 | an image 75 | 76 | 77 |

Picture sources - with media queries

78 | 79 |

80 | An image should be displayed below at all screen sizes and pixel ratios. 81 |

82 | 83 | 84 | 89 | 94 | 99 | an image 100 | 101 | 102 | 103 | -------------------------------------------------------------------------------- /test-website/content/images/image1-1x.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/test-website/content/images/image1-1x.png -------------------------------------------------------------------------------- /test-website/content/images/image1-2x.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/test-website/content/images/image1-2x.png -------------------------------------------------------------------------------- /test-website/content/images/image1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/test-website/content/images/image1.png -------------------------------------------------------------------------------- /test-website/content/images/image2-1x.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/test-website/content/images/image2-1x.png -------------------------------------------------------------------------------- /test-website/content/images/image2-2x.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/test-website/content/images/image2-2x.png -------------------------------------------------------------------------------- /test-website/content/images/image2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/test-website/content/images/image2.png -------------------------------------------------------------------------------- /test-website/content/images/image3-high.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/test-website/content/images/image3-high.png -------------------------------------------------------------------------------- /test-website/content/images/image3-medium.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/test-website/content/images/image3-medium.png -------------------------------------------------------------------------------- /test-website/content/images/image3-small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/test-website/content/images/image3-small.png -------------------------------------------------------------------------------- /test-website/content/images/image3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/test-website/content/images/image3.png -------------------------------------------------------------------------------- /test-website/content/images/image4-1.5x.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/test-website/content/images/image4-1.5x.png -------------------------------------------------------------------------------- /test-website/content/images/image4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/test-website/content/images/image4.png -------------------------------------------------------------------------------- /test-website/content/images/urlencoding1_icône-débuter-Solidarité-Numérique_1@300x.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/test-website/content/images/urlencoding1_icône-débuter-Solidarité-Numérique_1@300x.png -------------------------------------------------------------------------------- /test-website/content/images/urlencoding2_ico%CC%82ne-de%CC%81buter-Solidarite%CC%81-Nume%CC%81rique_1%40300x.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/test-website/content/images/urlencoding2_ico%CC%82ne-de%CC%81buter-Solidarite%CC%81-Nume%CC%81rique_1%40300x.png -------------------------------------------------------------------------------- /test-website/content/images/urlencoding3_icône-débuter-Solidarite%CC%81-Nume%CC%81rique_1@300x.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/test-website/content/images/urlencoding3_icône-débuter-Solidarite%CC%81-Nume%CC%81rique_1@300x.png -------------------------------------------------------------------------------- /test-website/content/images/urlencoding4_(_[_|_]_).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/test-website/content/images/urlencoding4_(_[_|_]_).png -------------------------------------------------------------------------------- /test-website/content/images/urlencoding4_ؼ.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/test-website/content/images/urlencoding4_ؼ.png -------------------------------------------------------------------------------- /test-website/content/images/urlencoding5_(_[_|_ؼ_]_).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/test-website/content/images/urlencoding5_(_[_|_ؼ_]_).png -------------------------------------------------------------------------------- /test-website/content/images/urlencoding5_(_[_|_ؼ_]_)_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/test-website/content/images/urlencoding5_(_[_|_ؼ_]_)_1.png -------------------------------------------------------------------------------- /test-website/content/images/urlencoding5_(_[_|_ؼ_]_)_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/test-website/content/images/urlencoding5_(_[_|_ؼ_]_)_2.png -------------------------------------------------------------------------------- /test-website/content/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Test website 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 |

This is a test website for zimit/warc2zim.

17 | 18 |

Caution! Opening any of the subpages will likely activate all kind of trackers from the source websites 19 |

20 | 21 | 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /test-website/content/instagram.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Test website 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 |

This page contains Instagram test(s).

17 | 18 |

Instagram embed from Feb. 2024

19 | 20 |
24 |
28 |
29 |
31 |
32 |
33 |
35 |
36 |
37 |
38 |
39 |
40 |
43 | 44 | 45 | 46 | 48 | 49 | 50 | 51 | 52 |
53 |
54 |
56 | View this post on Instagram
57 |
58 |
59 |
60 |
61 |
63 |
64 |
66 |
67 |
69 |
70 |
71 |
72 |
73 |
75 |
76 |
77 |
78 |
80 |
81 |
83 |
84 |
86 |
87 |
88 |
89 |
90 |
92 |
93 |
94 |
95 |
96 |

98 | A post shared by Wikipedia (@wikipedia)

101 |
102 |
103 | 104 | 105 | 106 | 107 | 108 | -------------------------------------------------------------------------------- /test-website/content/internal_redirect_target.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Test website 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 |

You've been redirect to me

17 | 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /test-website/content/javascript/cont!nt.txt: -------------------------------------------------------------------------------- 1 | This is working OK 2 | -------------------------------------------------------------------------------- /test-website/content/javascript/cont?nt.txt: -------------------------------------------------------------------------------- 1 | This is working OK 2 | -------------------------------------------------------------------------------- /test-website/content/javascript/content.txt: -------------------------------------------------------------------------------- 1 | This is working OK 2 | -------------------------------------------------------------------------------- /test-website/content/javascript/contént.txt: -------------------------------------------------------------------------------- 1 | This is working OK 2 | -------------------------------------------------------------------------------- /test-website/content/javascript/cont🎁nt.txt: -------------------------------------------------------------------------------- 1 | This is working OK 2 | -------------------------------------------------------------------------------- /test-website/content/javascript/not_working.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/test-website/content/javascript/not_working.png -------------------------------------------------------------------------------- /test-website/content/javascript/r?sources.js: -------------------------------------------------------------------------------- 1 | const okValue = 'This is working OK'; 2 | const otherValue = 'This is another value'; 3 | 4 | const okClass = 'green'; 5 | 6 | export {okClass, otherValue}; 7 | export default okValue; 8 | -------------------------------------------------------------------------------- /test-website/content/javascript/resources.js: -------------------------------------------------------------------------------- 1 | const okValue = 'This is working OK'; 2 | const otherValue = 'This is another value'; 3 | 4 | const okClass = 'green'; 5 | 6 | export {okClass, otherValue}; 7 | export default okValue; 8 | -------------------------------------------------------------------------------- /test-website/content/javascript/résources.js: -------------------------------------------------------------------------------- 1 | const okValue = 'This is working OK'; 2 | const otherValue = 'This is another value'; 3 | 4 | const okClass = 'green'; 5 | 6 | export {okClass, otherValue}; 7 | export default okValue; 8 | -------------------------------------------------------------------------------- /test-website/content/javascript/script02!b.js: -------------------------------------------------------------------------------- 1 | const span02b = document.getElementById('span02b'); 2 | span02b.innerHTML = 'This is working OK'; 3 | span02b.classList.add('green'); 4 | -------------------------------------------------------------------------------- /test-website/content/javascript/script02a.js: -------------------------------------------------------------------------------- 1 | const span02a = document.getElementById('span02a'); 2 | span02a.innerHTML = 'This is working OK'; 3 | span02a.classList.add('green'); 4 | -------------------------------------------------------------------------------- /test-website/content/javascript/script03.js: -------------------------------------------------------------------------------- 1 | const span03 = document.getElementById("span03"); 2 | span03.innerHTML="This is working OK"; 3 | span03.classList.add("green"); 4 | -------------------------------------------------------------------------------- /test-website/content/javascript/script04.js: -------------------------------------------------------------------------------- 1 | window.onload = function() { 2 | const span04 = document.getElementById("span04"); 3 | span04.innerHTML="This is working OK"; 4 | span04.classList.add("green"); 5 | } 6 | -------------------------------------------------------------------------------- /test-website/content/javascript/script05.js: -------------------------------------------------------------------------------- 1 | const span05 = document.getElementById("span05"); 2 | span05.innerHTML="This is working OK"; 3 | span05.classList.add("green"); 4 | -------------------------------------------------------------------------------- /test-website/content/javascript/script06a.js: -------------------------------------------------------------------------------- 1 | import okValue from './resources.js'; 2 | import { okClass } from './resources.js'; 3 | 4 | const span = document.getElementById("span06a"); 5 | span.innerHTML=okValue; 6 | span.classList.add(okClass); 7 | -------------------------------------------------------------------------------- /test-website/content/javascript/script06b.js: -------------------------------------------------------------------------------- 1 | import okValue from './resources.js'; 2 | import { otherValue, okClass as myClass } from './resources.js'; 3 | 4 | const span = document.getElementById("span06b"); 5 | span.innerHTML=okValue; 6 | span.classList.add(myClass); 7 | -------------------------------------------------------------------------------- /test-website/content/javascript/script06c.js: -------------------------------------------------------------------------------- 1 | import okValue from '/javascript/resources.js'; 2 | import { okClass } from '/javascript/resources.js'; 3 | 4 | const span = document.getElementById("span06c"); 5 | span.innerHTML=okValue; 6 | span.classList.add(okClass); 7 | -------------------------------------------------------------------------------- /test-website/content/javascript/script06d.js: -------------------------------------------------------------------------------- 1 | import okValue from '../javascript/resources.js'; 2 | import { okClass } from '../javascript/resources.js'; 3 | 4 | const span = document.getElementById("span06d"); 5 | span.innerHTML=okValue; 6 | span.classList.add(okClass); 7 | -------------------------------------------------------------------------------- /test-website/content/javascript/script06e.js: -------------------------------------------------------------------------------- 1 | import okValue from './résources.js'; 2 | import { okClass } from './résources.js'; 3 | 4 | const span = document.getElementById("span06e"); 5 | span.innerHTML=okValue; 6 | span.classList.add(okClass); 7 | -------------------------------------------------------------------------------- /test-website/content/javascript/script06f.js: -------------------------------------------------------------------------------- 1 | import okValue from './resources.js?query=value'; 2 | import { okClass } from './resources.js?query=value'; 3 | 4 | const span = document.getElementById("span06f"); 5 | span.innerHTML=okValue; 6 | span.classList.add(okClass); 7 | -------------------------------------------------------------------------------- /test-website/content/javascript/script06g.js: -------------------------------------------------------------------------------- 1 | import okValue from './resources.js?query=valu%3De'; 2 | import { okClass } from './resources.js?query=valu%3De'; 3 | 4 | const span = document.getElementById("span06g"); 5 | span.innerHTML=okValue; 6 | span.classList.add(okClass); 7 | -------------------------------------------------------------------------------- /test-website/content/javascript/script06h.js: -------------------------------------------------------------------------------- 1 | import okValue from './resources.js?query=valu%25e'; 2 | import { okClass } from './resources.js?query=valu%25e'; 3 | 4 | const span = document.getElementById("span06h"); 5 | span.innerHTML=okValue; 6 | span.classList.add(okClass); 7 | -------------------------------------------------------------------------------- /test-website/content/javascript/script06i.js: -------------------------------------------------------------------------------- 1 | import okValue from './r%3Fsources.js'; 2 | import { okClass } from './r%3Fsources.js'; 3 | 4 | const span = document.getElementById("span06i"); 5 | span.innerHTML=okValue; 6 | span.classList.add(okClass); 7 | -------------------------------------------------------------------------------- /test-website/content/javascript/script06j.js: -------------------------------------------------------------------------------- 1 | import okValue from './r%3Fsources.js?query=value'; 2 | import { okClass } from './r%3Fsources.js?query=value'; 3 | 4 | const span = document.getElementById("span06j"); 5 | span.innerHTML=okValue; 6 | span.classList.add(okClass); 7 | -------------------------------------------------------------------------------- /test-website/content/javascript/script07.js: -------------------------------------------------------------------------------- 1 | import okValue from './resources.js'; 2 | import { okClass } from './resources.js'; 3 | 4 | const span = document.getElementById("span07"); 5 | span.innerHTML=okValue; 6 | span.classList.add(okClass); 7 | -------------------------------------------------------------------------------- /test-website/content/javascript/script08a.js: -------------------------------------------------------------------------------- 1 | import okValue from 'resources'; 2 | import { okClass } from 'resources'; 3 | 4 | const span = document.getElementById("span08a"); 5 | span.innerHTML=okValue; 6 | span.classList.add(okClass); 7 | -------------------------------------------------------------------------------- /test-website/content/javascript/script08b.js: -------------------------------------------------------------------------------- 1 | import okValue from 'javascript/resources'; 2 | import { okClass } from 'javascript/resources'; 3 | 4 | const span = document.getElementById("span08b"); 5 | span.innerHTML=okValue; 6 | span.classList.add(okClass); 7 | -------------------------------------------------------------------------------- /test-website/content/javascript/script08c.js: -------------------------------------------------------------------------------- 1 | import okValue from '../javascript/resources'; 2 | import { okClass } from '../javascript/resources'; 3 | 4 | const span = document.getElementById("span08c"); 5 | span.innerHTML=okValue; 6 | span.classList.add(okClass); 7 | -------------------------------------------------------------------------------- /test-website/content/javascript/script09a.js: -------------------------------------------------------------------------------- 1 | import okValue from 'https://example.com/resources'; 2 | import { okClass } from 'https://example.com/resources'; 3 | 4 | const span = document.getElementById("span09a"); 5 | span.innerHTML=okValue; 6 | span.classList.add(okClass); 7 | -------------------------------------------------------------------------------- /test-website/content/javascript/script09b.js: -------------------------------------------------------------------------------- 1 | import okValue from 'https://standard_netloc/resources'; 2 | import { okClass } from 'https://standard_netloc/resources'; 3 | 4 | const span = document.getElementById("span09b"); 5 | span.innerHTML=okValue; 6 | span.classList.add(okClass); 7 | -------------------------------------------------------------------------------- /test-website/content/javascript/script09c.js: -------------------------------------------------------------------------------- 1 | import okValue from 'https://not_standard_netloc_punny_encoded/resources'; 2 | import { okClass } from 'https://not_standard_netloc_punny_encoded/resources'; 3 | 4 | const span = document.getElementById("span09c"); 5 | span.innerHTML=okValue; 6 | span.classList.add(okClass); 7 | -------------------------------------------------------------------------------- /test-website/content/javascript/script09d.js: -------------------------------------------------------------------------------- 1 | import okValue from 'https://not_standard_netloc_not_encoded/resources'; 2 | import { okClass } from 'https://not_standard_netloc_not_encoded/resources'; 3 | 4 | const span = document.getElementById("span09d"); 5 | span.innerHTML=okValue; 6 | span.classList.add(okClass); 7 | -------------------------------------------------------------------------------- /test-website/content/javascript/script10.js: -------------------------------------------------------------------------------- 1 | const img10 = document.getElementById('img10'); 2 | const origSrc = img10.getAttribute('src') 3 | const newSrc = origSrc.replace('not_working', 'working') 4 | console.debug('Replacing ' + origSrc + ' with ' + newSrc) 5 | img10.src = newSrc; -------------------------------------------------------------------------------- /test-website/content/javascript/working.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/test-website/content/javascript/working.png -------------------------------------------------------------------------------- /test-website/content/jàvàscrïpt.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Test website 7 | 8 | 9 | 10 | 11 | 12 | 20 | 21 | 22 | 23 | 24 |

Javascript acting on file with special chars in its path

25 | 26 |

On every lines below, you should see written "This is working OK"

27 | 28 |

29 | Test: This is not working 30 |

31 | 32 | 43 | 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /test-website/content/mediaelement.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Test website 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 |

MediaElement.js player

17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 |
27 | 39 |
40 | 41 | 49 | 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /test-website/content/onxxx.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Test website 7 | 8 | 9 | 10 | 11 | 12 | 20 | 21 | 22 | 23 | 24 |

onxxx HTML events

25 | 26 |

When clicking images below, it should redirect your browser to the another page

27 | 28 |

Relative (this site home page):

29 | 30 |

Absolute internal 1 (this site home page):

31 | 32 |

Absolute internal 2 (another site home page):

33 | 34 |

Absolute external (kiwix homepage):

35 | 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /test-website/content/soundcloud.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Test website 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 |

Soundcloud

17 | 18 | 20 |
22 | IAM · Je danse le Mia
26 | 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /test-website/content/spotify.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Test website 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 |

Spotify

17 | 18 | 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /test-website/content/tiktok.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Test website 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 |

TikTok

17 | 18 |
20 |
@elienv_ 21 | Répondre à @lordv_ Possible de trouver la page philosophie à partir de n’importe quel article ? #tiktokacademie #wikipedia #challenge ♬ Time Time - Trei Degete 28 |
29 |
30 | 31 | 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /test-website/content/twitch1.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Test website 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 |

Twitch iframe

17 | 18 | 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /test-website/content/twitch2.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Test website 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 |

Twitch JS

17 | 18 | 19 |
20 | 21 | 22 | 23 | 24 | 25 | 30 | 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /test-website/content/twitter1.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Test website 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 |

Twitter post embed from Feb. 2024

17 | 18 | 24 | 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /test-website/content/twitter2.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Test website 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 |

Twitter video embed from Feb. 2024

17 | 18 | 24 | 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /test-website/content/url-encoding.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Test website 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 |

This page contains URL encoding test(s).

17 | 18 |

Image with non-encoded chars in name with non-encoded special chars in link

19 | 20 |

An image should be displayed below

21 | 22 | 23 |

Image with non-encoded chars in name and already encoded special chars

24 | 25 |

An image should be displayed below

26 | 27 | 28 |

Image with non-encoded chars in name and a mix of non-encoded and already encoded special chars

29 | 30 |

An image should be displayed below

31 | 32 | 33 |

Image with non-encoded chars in name with non-encoded special chars in link and a query parameter and fragment with reserved chars

34 | 35 |

An image should be displayed below as well

36 | 37 | 38 |
39 | 40 |

Image with already encoded chars in name with non-encoded special chars in link

41 | 42 |

This is not working on source website, so should not work in ZIM

43 | 44 | 45 |

Image with already encoded chars in name and already encoded special chars in link

46 | 47 |

This is not working on source website, so should not work in ZIM

48 | 49 | 50 |

Image with already encoded chars in name and a mix of non-encoded and already encoded special chars in link

51 | 52 |

This is not working on source website, so should not work in ZIM

53 | 54 | 55 |
56 | 57 |

Image with a mix of non-encoded and already encoded special chars in name with non-encoded special chars in link

58 | 59 |

This is not working on source website, so should not work in ZIM

60 | 61 | 62 |

Image with a mix of non-encoded and already encoded special chars in name and already encoded special chars in link

63 | 64 |

This is not working on source website, so should not work in ZIM

65 | 66 | 67 |

Image with a mix of non-encoded and already encoded special chars in name and a mix of non-encoded and already encoded special chars in link

68 | 69 |

This is not working on source website, so should not work in ZIM

70 | 71 | 72 |
73 | 74 |

Image with many weird characters

75 | 76 |

An image should be displayed below

77 | 78 | 79 |

Image with UTF-8 characters

80 | 81 |

An image should be displayed below

82 | 83 | 84 |
85 | 86 |

Hostname with UTF-8 chars and simple image

87 | 88 |

An image should be displayed below

89 | 90 | 91 |

Hostname with UTF-8 chars and image with UTF-8 chars

92 | 93 |

An image should be displayed below

94 | 95 | 96 |

Hostname with UTF-8 chars and query parameter 1

97 | 98 |

An image should be displayed below with a 1 in the right

99 | 100 | 101 |

Hostname with UTF-8 chars and query parameter 2

102 | 103 |

An image should be displayed below with a 2 in the right

104 | 105 | 106 | 107 | 108 | 109 | -------------------------------------------------------------------------------- /test-website/content/video.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Test website 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 |

Video tag

17 | 18 | 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /test-website/content/vimeo.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Test website 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 |

This page contains Vimeo test(s).

17 | 18 |

Vimeo video

19 | 20 | 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /test-website/content/youtube.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Test website 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 |

This page contains Youtube test(s).

17 | 18 |

Youtube video

19 | 20 | 24 | 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /test-website/entrypoint-reverse.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Reverse the modifications done by entrypoint.sh ; usefull for local development when local directory is mounted inside the container 4 | # for rapid tests but the modifications have to be reversed so that proper stuff is commited in Github 5 | 6 | find /var/www/html -type f -exec sed -i 's/http:\/\/jédéacçents.local.oviles.info:8888/https:\/\/not_standard_netloc_not_encoded/g' {} \; 7 | find /var/www/html -type f -exec sed -i 's/http:\/\/xn--jdacents-v0aqb.local.oviles.info:8888/https:\/\/not_standard_netloc_punny_encoded/g' {} \; 8 | find /var/www/html -type f -exec sed -i 's/http:\/\/test-website.local.oviles.info:8888/https:\/\/standard_netloc/g' {} \; 9 | -------------------------------------------------------------------------------- /test-website/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Replace all occurences of hostnames in html files 4 | 5 | find /var/www/html -type f -exec sed -i "s/https:\/\/not_standard_netloc_not_encoded/$NOT_STANDARD_NETLOC_NOT_ENCODED/g" {} \; 6 | find /var/www/html -type f -exec sed -i "s/https:\/\/not_standard_netloc_punny_encoded/$NOT_STANDARD_NETLOC_PUNNY_ENCODED/g" {} \; 7 | find /var/www/html -type f -exec sed -i "s/https:\/\/standard_netloc/$STANDARD_NETLOC/g" {} \; 8 | 9 | caddy run --config /etc/caddy/Caddyfile --adapter caddyfile 10 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/tests/__init__.py -------------------------------------------------------------------------------- /tests/cdxj_indexer/test_postappend.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | 3 | from warc2zim.cdxj_indexer.postquery import append_method_query 4 | 5 | 6 | # ============================================================================ 7 | class MethodQueryCanonicalizer: 8 | def __init__(self, method, content_type, req_len, req_stream): 9 | self.method = method 10 | self.content_type = content_type 11 | self.req_len = req_len 12 | self.req_stream = req_stream 13 | 14 | def append_query(self, url): 15 | self.req_stream.seek(0) 16 | query_only, full_string = append_method_query( 17 | self.method, self.content_type, self.req_len, self.req_stream, url 18 | ) 19 | return url + full_string 20 | 21 | 22 | # ============================================================================ 23 | class TestPostQueryExtract: 24 | @classmethod 25 | def setup_class(cls): 26 | cls.post_data = b"foo=bar&dir=%2Fbaz" 27 | cls.binary_post_data = ( 28 | b"\x816l`L\xa04P\x0e\xe0r\x02\xb5\x89\x19\x00fP\xdb\x0e\xb0\x02," 29 | ) 30 | 31 | def test_post_extract_1(self): 32 | mq = MethodQueryCanonicalizer( 33 | "POST", 34 | "application/x-www-form-urlencoded", 35 | len(self.post_data), 36 | BytesIO(self.post_data), 37 | ) 38 | 39 | assert ( 40 | mq.append_query("http://example.com/") 41 | == "http://example.com/?__wb_method=POST&foo=bar&dir=/baz" 42 | ) 43 | 44 | assert ( 45 | mq.append_query("http://example.com/?123=ABC") 46 | == "http://example.com/?123=ABC&__wb_method=POST&foo=bar&dir=/baz" 47 | ) 48 | 49 | def test_post_extract_json(self): 50 | post_data = b'{"a": "b", "c": {"a": 2}, "d": "e"}' 51 | mq = MethodQueryCanonicalizer( 52 | "POST", "application/json", len(post_data), BytesIO(post_data) 53 | ) 54 | 55 | assert ( 56 | mq.append_query("http://example.com/") 57 | == "http://example.com/?__wb_method=POST&a=b&a.2_=2&d=e" 58 | ) 59 | 60 | def test_post_extract_json_top_list(self): 61 | post_data = ( 62 | b'[{"a": "b", "c": {"a": 2}}, {"d": "e"}, "ignored", false, null, 0]' 63 | ) 64 | mq = MethodQueryCanonicalizer( 65 | "POST", "application/json", len(post_data), BytesIO(post_data) 66 | ) 67 | 68 | assert ( 69 | mq.append_query("http://example.com/") 70 | == "http://example.com/?__wb_method=POST&a=b&a.2_=2&d=e" 71 | ) 72 | 73 | def test_post_extract_json_lines(self): 74 | post_data = b'{"a": "b"}\n{"c": {"a": 2}, "d": "e"}' 75 | mq = MethodQueryCanonicalizer( 76 | "POST", "application/json", len(post_data), BytesIO(post_data) 77 | ) 78 | 79 | assert ( 80 | mq.append_query("http://example.com/") 81 | == "http://example.com/?__wb_method=POST&a=b&a.2_=2&d=e" 82 | ) 83 | 84 | def test_put_extract_method(self): 85 | mq = MethodQueryCanonicalizer( 86 | "PUT", 87 | "application/x-www-form-urlencoded", 88 | len(self.post_data), 89 | BytesIO(self.post_data), 90 | ) 91 | 92 | assert ( 93 | mq.append_query("http://example.com/") 94 | == "http://example.com/?__wb_method=PUT&foo=bar&dir=/baz" 95 | ) 96 | 97 | def test_post_extract_non_form_data_1(self): 98 | mq = MethodQueryCanonicalizer( 99 | "POST", 100 | "application/octet-stream", 101 | len(self.post_data), 102 | BytesIO(self.post_data), 103 | ) 104 | 105 | # base64 encoded data 106 | assert ( 107 | mq.append_query("http://example.com/") 108 | == "http://example.com/?__wb_method=POST&__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6" 109 | ) 110 | 111 | def test_post_extract_non_form_data_2(self): 112 | mq = MethodQueryCanonicalizer( 113 | "POST", "text/plain", len(self.post_data), BytesIO(self.post_data) 114 | ) 115 | 116 | # base64 encoded data 117 | assert ( 118 | mq.append_query("http://example.com/pathbar?id=123") 119 | == "http://example.com/pathbar?id=123&__wb_method=POST&__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6" 120 | ) 121 | 122 | def test_post_extract_length_invalid_ignore(self): 123 | mq = MethodQueryCanonicalizer( 124 | "POST", "application/x-www-form-urlencoded", 0, BytesIO(self.post_data) 125 | ) 126 | 127 | assert ( 128 | mq.append_query("http://example.com/") 129 | == "http://example.com/?__wb_method=POST" 130 | ) 131 | 132 | mq = MethodQueryCanonicalizer( 133 | "POST", "application/x-www-form-urlencoded", "abc", BytesIO(self.post_data) 134 | ) 135 | 136 | assert ( 137 | mq.append_query("http://example.com/") 138 | == "http://example.com/?__wb_method=POST" 139 | ) 140 | 141 | def test_post_extract_length_too_short(self): 142 | mq = MethodQueryCanonicalizer( 143 | "POST", 144 | "application/x-www-form-urlencoded", 145 | len(self.post_data) - 4, 146 | BytesIO(self.post_data), 147 | ) 148 | 149 | assert ( 150 | mq.append_query("http://example.com/") 151 | == "http://example.com/?__wb_method=POST&foo=bar&dir=%2" 152 | ) 153 | 154 | def test_post_extract_length_too_long(self): 155 | mq = MethodQueryCanonicalizer( 156 | "POST", 157 | "application/x-www-form-urlencoded", 158 | len(self.post_data) + 4, 159 | BytesIO(self.post_data), 160 | ) 161 | 162 | assert ( 163 | mq.append_query("http://example.com/") 164 | == "http://example.com/?__wb_method=POST&foo=bar&dir=/baz" 165 | ) 166 | 167 | def test_post_extract_malformed_form_data(self): 168 | mq = MethodQueryCanonicalizer( 169 | "POST", 170 | "application/x-www-form-urlencoded", 171 | len(self.binary_post_data), 172 | BytesIO(self.binary_post_data), 173 | ) 174 | 175 | # base64 encoded data 176 | assert ( 177 | mq.append_query("http://example.com/") 178 | == "http://example.com/?__wb_method=POST&__wb_post_data=gTZsYEygNFAO4HICtYkZAGZQ2w6wAiw=" 179 | ) 180 | 181 | def test_post_extract_no_boundary_in_multipart_form_mimetype(self): 182 | mq = MethodQueryCanonicalizer( 183 | "POST", "multipart/form-data", len(self.post_data), BytesIO(self.post_data) 184 | ) 185 | 186 | assert ( 187 | mq.append_query("http://example.com/") 188 | == "http://example.com/?__wb_method=POST&__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6" 189 | ) 190 | 191 | def test_options(self): 192 | mq = MethodQueryCanonicalizer("OPTIONS", "", 0, BytesIO()) 193 | assert ( 194 | mq.append_query("http://example.com/") 195 | == "http://example.com/?__wb_method=OPTIONS" 196 | ) 197 | 198 | def test_head(self): 199 | mq = MethodQueryCanonicalizer("HEAD", "", 0, BytesIO()) 200 | assert ( 201 | mq.append_query("http://example.com/") 202 | == "http://example.com/?__wb_method=HEAD" 203 | ) 204 | -------------------------------------------------------------------------------- /tests/data-special/qsl.net-encoding-alias.warc.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/tests/data-special/qsl.net-encoding-alias.warc.gz -------------------------------------------------------------------------------- /tests/data/bad-redirections.warc.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/tests/data/bad-redirections.warc.gz -------------------------------------------------------------------------------- /tests/data/content-resource-types.warc.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/tests/data/content-resource-types.warc.gz -------------------------------------------------------------------------------- /tests/data/empty-file: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/tests/data/empty-file -------------------------------------------------------------------------------- /tests/data/example-response.warc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/tests/data/example-response.warc -------------------------------------------------------------------------------- /tests/data/example-revisit.warc.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/tests/data/example-revisit.warc.gz -------------------------------------------------------------------------------- /tests/data/example-utf8.warc: -------------------------------------------------------------------------------- 1 | WARC/1.0 2 | WARC-IP-Address: 3.220.112.94 3 | WARC-Type: response 4 | WARC-Record-ID: 5 | WARC-Target-URI: https://httpbin.org/anything/utf8=%E2%9C%93?query=test&a=b&1=%E2%9C%93 6 | WARC-Date: 2020-07-09T20:41:58Z 7 | WARC-Payload-Digest: sha1:APPCHQ3MPXOSGO3CDWKQD2TIOJGCLD7B 8 | WARC-Block-Digest: sha1:6X2SE5VSJI4EQHXZ7C5AWTU5H2HXKXMW 9 | Content-Type: application/http; msgtype=response 10 | Content-Length: 818 11 | 12 | HTTP/1.1 200 OK 13 | Date: Thu, 09 Jul 2020 20:41:58 GMT 14 | Content-Type: application/json 15 | Content-Length: 588 16 | Connection: keep-alive 17 | Server: gunicorn/19.9.0 18 | Access-Control-Allow-Origin: * 19 | Access-Control-Allow-Credentials: true 20 | 21 | { 22 | "args": { 23 | "1": "\u2713", 24 | "a": "b", 25 | "query": "test" 26 | }, 27 | "data": "", 28 | "files": {}, 29 | "form": {}, 30 | "headers": { 31 | "Accept": "*/*", 32 | "Accept-Encoding": "gzip, deflate", 33 | "Host": "httpbin.org", 34 | "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36", 35 | "X-Amzn-Trace-Id": "Root=1-5f078116-58ed90b44cbd1e86b92d2c19" 36 | }, 37 | "json": null, 38 | "method": "GET", 39 | "origin": "161.35.72.192", 40 | "url": "https://httpbin.org/anything/utf8=\u2713?query=test&a=b&1=\u2713" 41 | } 42 | 43 | 44 | WARC/1.0 45 | WARC-IP-Address: 3.220.112.94 46 | WARC-Type: request 47 | WARC-Record-ID: 48 | WARC-Target-URI: https://httpbin.org/anything/utf8=%E2%9C%93?query=test&a=b&1=%E2%9C%93 49 | WARC-Date: 2020-07-09T20:41:58Z 50 | WARC-Payload-Digest: sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ 51 | WARC-Concurrent-To: 52 | WARC-Block-Digest: sha1:CJ2JE5XZQ6K2BY2ILND5LBBKSWJWFG44 53 | Content-Type: application/http; msgtype=request 54 | Content-Length: 291 55 | 56 | GET /anything/utf8=%E2%9C%93?query=test&a=b&1=%E2%9C%93 HTTP/1.1 57 | Host: httpbin.org 58 | User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36 59 | Accept-Encoding: gzip, deflate 60 | Accept: */* 61 | Connection: keep-alive 62 | 63 | 64 | 65 | WARC/1.0 66 | WARC-IP-Address: 3.220.112.94 67 | WARC-Type: response 68 | WARC-Record-ID: 69 | WARC-Target-URI: https://httpbin.org/anything/utf8=%E2%9C%93?query=test&a=b&1=%E2%9C%93 70 | WARC-Date: 2020-07-12T04:55:45Z 71 | WARC-Payload-Digest: sha1:3C2SYPEZC3KIGXFSCKVWEC3NE3WIQOAI 72 | WARC-Block-Digest: sha1:VFMTQFAU7C4XNASY5LAXE2JH3PEDJNHW 73 | Content-Type: application/http; msgtype=response 74 | Content-Length: 818 75 | 76 | HTTP/1.1 200 OK 77 | Date: Sun, 12 Jul 2020 04:55:45 GMT 78 | Content-Type: application/json 79 | Content-Length: 588 80 | Connection: keep-alive 81 | Server: gunicorn/19.9.0 82 | Access-Control-Allow-Origin: * 83 | Access-Control-Allow-Credentials: true 84 | 85 | { 86 | "args": { 87 | "1": "\u2713", 88 | "a": "b", 89 | "query": "test" 90 | }, 91 | "data": "", 92 | "files": {}, 93 | "form": {}, 94 | "headers": { 95 | "Accept": "*/*", 96 | "Accept-Encoding": "gzip, deflate", 97 | "Host": "httpbin.org", 98 | "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36", 99 | "X-Amzn-Trace-Id": "Root=1-5f0a97d1-68b1d51c705ee29c610f8d2b" 100 | }, 101 | "json": null, 102 | "method": "GET", 103 | "origin": "161.35.72.192", 104 | "url": "https://httpbin.org/anything/utf8=\u2713?query=test&a=b&1=\u2713" 105 | } 106 | 107 | 108 | WARC/1.0 109 | WARC-IP-Address: 3.220.112.94 110 | WARC-Type: request 111 | WARC-Record-ID: 112 | WARC-Target-URI: https://httpbin.org/anything/utf8=%E2%9C%93?query=test&a=b&1=%E2%9C%93 113 | WARC-Date: 2020-07-12T04:55:45Z 114 | WARC-Payload-Digest: sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ 115 | WARC-Concurrent-To: 116 | WARC-Block-Digest: sha1:CJ2JE5XZQ6K2BY2ILND5LBBKSWJWFG44 117 | Content-Type: application/http; msgtype=request 118 | Content-Length: 291 119 | 120 | GET /anything/utf8=%E2%9C%93?query=test&a=b&1=%E2%9C%93 HTTP/1.1 121 | Host: httpbin.org 122 | User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36 123 | Accept-Encoding: gzip, deflate 124 | Accept: */* 125 | Connection: keep-alive 126 | 127 | 128 | 129 | WARC/1.0 130 | WARC-IP-Address: 54.236.246.173 131 | WARC-Type: response 132 | WARC-Record-ID: 133 | WARC-Target-URI: https://httpbin.org/anything/Saint-Exup%C3%A9ry?foo=bar 134 | WARC-Date: 2020-07-12T04:55:45Z 135 | WARC-Payload-Digest: sha1:AH45AISVSL5WPLO7KFHMYYQUGHU3QM7I 136 | WARC-Block-Digest: sha1:LAYKJWASNW67SCJNBSPRCLHA57WR5B3M 137 | Content-Type: application/http; msgtype=response 138 | Content-Length: 771 139 | 140 | HTTP/1.1 200 OK 141 | Date: Sun, 12 Jul 2020 04:55:45 GMT 142 | Content-Type: application/json 143 | Content-Length: 541 144 | Connection: keep-alive 145 | Server: gunicorn/19.9.0 146 | Access-Control-Allow-Origin: * 147 | Access-Control-Allow-Credentials: true 148 | 149 | { 150 | "args": { 151 | "foo": "bar" 152 | }, 153 | "data": "", 154 | "files": {}, 155 | "form": {}, 156 | "headers": { 157 | "Accept": "*/*", 158 | "Accept-Encoding": "gzip, deflate", 159 | "Host": "httpbin.org", 160 | "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36", 161 | "X-Amzn-Trace-Id": "Root=1-5f0a97d1-3e7c0264c617439a850ca5a5" 162 | }, 163 | "json": null, 164 | "method": "GET", 165 | "origin": "161.35.72.192", 166 | "url": "https://httpbin.org/anything/Saint-Exup\u00e9ry?foo=bar" 167 | } 168 | 169 | 170 | WARC/1.0 171 | WARC-IP-Address: 54.236.246.173 172 | WARC-Type: request 173 | WARC-Record-ID: 174 | WARC-Target-URI: https://httpbin.org/anything/Saint-Exup%C3%A9ry?foo=bar 175 | WARC-Date: 2020-07-12T04:55:45Z 176 | WARC-Payload-Digest: sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ 177 | WARC-Concurrent-To: 178 | WARC-Block-Digest: sha1:PZFUDJ3ZKH5JZQ66M7E47TD2GP4ZZABR 179 | Content-Type: application/http; msgtype=request 180 | Content-Length: 276 181 | 182 | GET /anything/Saint-Exup%C3%A9ry?foo=bar HTTP/1.1 183 | Host: httpbin.org 184 | User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36 185 | Accept-Encoding: gzip, deflate 186 | Accept: */* 187 | Connection: keep-alive 188 | 189 | 190 | 191 | WARC/1.0 192 | WARC-IP-Address: 54.236.246.173 193 | WARC-Type: response 194 | WARC-Record-ID: 195 | WARC-Target-URI: https://httpbin.org/anything/test?foo=%C3%A9 196 | WARC-Date: 2020-07-12T04:55:46Z 197 | WARC-Payload-Digest: sha1:UWV3XF55NFX2YU23BUEY52H6LK54LV7G 198 | WARC-Block-Digest: sha1:EDHXZSWGEJUKE6WMJZYKZHC2SLQEM5U5 199 | Content-Type: application/http; msgtype=response 200 | Content-Length: 763 201 | 202 | HTTP/1.1 200 OK 203 | Date: Sun, 12 Jul 2020 04:55:45 GMT 204 | Content-Type: application/json 205 | Content-Length: 533 206 | Connection: keep-alive 207 | Server: gunicorn/19.9.0 208 | Access-Control-Allow-Origin: * 209 | Access-Control-Allow-Credentials: true 210 | 211 | { 212 | "args": { 213 | "foo": "\u00e9" 214 | }, 215 | "data": "", 216 | "files": {}, 217 | "form": {}, 218 | "headers": { 219 | "Accept": "*/*", 220 | "Accept-Encoding": "gzip, deflate", 221 | "Host": "httpbin.org", 222 | "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36", 223 | "X-Amzn-Trace-Id": "Root=1-5f0a97d1-61e09380e5ba93c0c33af8c0" 224 | }, 225 | "json": null, 226 | "method": "GET", 227 | "origin": "161.35.72.192", 228 | "url": "https://httpbin.org/anything/test?foo=\u00e9" 229 | } 230 | 231 | 232 | WARC/1.0 233 | WARC-IP-Address: 54.236.246.173 234 | WARC-Type: request 235 | WARC-Record-ID: 236 | WARC-Target-URI: https://httpbin.org/anything/test?foo=%C3%A9 237 | WARC-Date: 2020-07-12T04:55:46Z 238 | WARC-Payload-Digest: sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ 239 | WARC-Concurrent-To: 240 | WARC-Block-Digest: sha1:R5RVON3BIZ6XHBHC2CNZACZXOQU2TKSA 241 | Content-Type: application/http; msgtype=request 242 | Content-Length: 265 243 | 244 | GET /anything/test?foo=%C3%A9 HTTP/1.1 245 | Host: httpbin.org 246 | User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36 247 | Accept-Encoding: gzip, deflate 248 | Accept: */* 249 | Connection: keep-alive 250 | 251 | 252 | 253 | -------------------------------------------------------------------------------- /tests/data/example-with-timestamp.warc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/tests/data/example-with-timestamp.warc -------------------------------------------------------------------------------- /tests/data/http-return-codes.warc.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/tests/data/http-return-codes.warc.gz -------------------------------------------------------------------------------- /tests/data/kiwix-with-redirects.warc.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/tests/data/kiwix-with-redirects.warc.gz -------------------------------------------------------------------------------- /tests/data/main-entry-403.warc.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/tests/data/main-entry-403.warc.gz -------------------------------------------------------------------------------- /tests/data/self-redirect.warc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/tests/data/self-redirect.warc -------------------------------------------------------------------------------- /tests/data/single-page-test.warc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/tests/data/single-page-test.warc -------------------------------------------------------------------------------- /tests/data/solidaritenum.warc.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/tests/data/solidaritenum.warc.gz -------------------------------------------------------------------------------- /tests/data/video-vimeo.warc.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/tests/data/video-vimeo.warc.gz -------------------------------------------------------------------------------- /tests/data/video-yt-2.warc.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/tests/data/video-yt-2.warc.gz -------------------------------------------------------------------------------- /tests/data/video-yt.warc.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/tests/data/video-yt.warc.gz -------------------------------------------------------------------------------- /tests/encodings/definition.json: -------------------------------------------------------------------------------- 1 | { 2 | "files": [ 3 | { 4 | "filename": "file01.js", 5 | "source": "https://www.marxists.org/espanol/menu.js", 6 | "date": "2024-06", 7 | "probable_charset": "ISO-8859-1", 8 | "expected_strings": [ 9 | "Afanásiev, Víktor", 10 | "Andrópov, Yuri", 11 | "Amaguaña, Tránsito", 12 | "Cunhal, Álvaro", 13 | "De la Cruz, Juana Inés", 14 | "Faure, Sèbastien" 15 | ] 16 | }, 17 | { 18 | "filename": "file02.js", 19 | "source": "https://www.cloudflare.com/vendor/onetrust/scripttemplates/202308.2.0/otBannerSdk.js", 20 | "date": "2024-06", 21 | "probable_charset": "UTF-8", 22 | "expected_strings": [ 23 | "_Container:\"#ot-ven-lst\",P_Ven_Bx:\"ot-ven-box\",P_Ven_Name:\".ot-ven-name\"", 24 | "ist,IabType:e.IabType,InactiveText:e.InactiveText,IsConsentLoggingEnabled:e.IsConsentLoggingEnabl", 25 | "0;\\n transition: visibility 0s \"+e+\"ms, opacity \"+e+\"ms linear;\\n \",!0);var", 26 | "r.prototype.escapeRegExp=function(e){return e.replace(/[-/\\\\^$*+?.()|[\\]{}]/g,\"\\\\$&\")}" 27 | ] 28 | }, 29 | { 30 | "filename": "file03.html", 31 | "source": "https://www.solidarite-numerique.fr/tutoriels/comprendre-les-cookies/?thematique=internet", 32 | "date": "2024-06", 33 | "probable_charset": "UTF-8", 34 | "contains_bad_chars": true, 35 | "expected_strings": [ 36 | "Vous souhaitez changer de navigateur et utiliser Firefox ? Ce tutoriel vous détaille la procédure d'installation et la configuration pour une premi�..." 37 | ] 38 | }, 39 | { 40 | "filename": "file04.js", 41 | "source": "https://static.mailerlite.com/js/w/ml_jQuery.inputmask.bundle.min.js?v3.3.1", 42 | "date": "2024-06", 43 | "probable_charset": "ascii", 44 | "expected_strings": [ 45 | "1,this.isOptional=b||!1,this.isQuantifier=c||!1,this.isAlterna", 46 | "is;if(na=!1,g.clearMaskOnLostFocus&&document.activeElement!==b){var c=x().slice(),d=b.inputmask._v" 47 | ] 48 | }, 49 | { 50 | "filename": "file05.js", 51 | "source": "https://static.sketchfab.com/static/builds/web/dist/ac0f732c4fc1a30c77920d75c1a9be83-v2.js", 52 | "date": "2024-06", 53 | "probable_charset": "ascii", 54 | "expected_strings": [ 55 | "isTickUsed||(this._isTickUsed=!0,this._schedule(this.drainQueues))},s.prototype._reset=function(){this._is" 56 | ] 57 | }, 58 | { 59 | "filename": "file06.html", 60 | "source": "https://website.test.openzim.org/chinese-encoding.html", 61 | "date": "2024-06", 62 | "known_charset": "gb2312", 63 | "expected_strings": [ 64 | "simplified chinese characters: 汉字" 65 | ] 66 | }, 67 | { 68 | "filename": "file07.html", 69 | "source": "https://website.test.openzim.org/chinese-encoding.html without header", 70 | "date": "2024-06", 71 | "known_charset": "gb2312", 72 | "http_charset": "gb2312", 73 | "expected_strings": [ 74 | "simplified chinese characters: 汉字" 75 | ] 76 | }, 77 | { 78 | "filename": "file08.js", 79 | "source": "https://community.mozilla.org/wp-content/plugins/events-manager/includes/js/events-manager.min.js?ver=6.4.1", 80 | "date": "2024-06", 81 | "probable_charset": "UTF-8", 82 | "expected_strings": [ 83 | "t Array]\"===Object.prototype.toString.call(e)},s={a:\"[aḀḁĂăÂâǍǎȺⱥȦȧẠạÄäÀàÁáĀāÃãÅåąĄÃąĄ]\",b:\"[b␢β" 84 | ] 85 | } 86 | ] 87 | } 88 | -------------------------------------------------------------------------------- /tests/encodings/file01.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/tests/encodings/file01.js -------------------------------------------------------------------------------- /tests/encodings/file03.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/tests/encodings/file03.html -------------------------------------------------------------------------------- /tests/encodings/file06.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/tests/encodings/file06.html -------------------------------------------------------------------------------- /tests/encodings/file07.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openzim/warc2zim/270f5dbaaed6455ccb1dc05e4f10cf460f41b992/tests/encodings/file07.html -------------------------------------------------------------------------------- /tests/test_converter.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | 3 | import pytest 4 | 5 | from warc2zim.converter import Converter 6 | from warc2zim.main import _create_arguments_parser 7 | 8 | 9 | @pytest.mark.parametrize( 10 | "inputs, warc_files", 11 | [ 12 | pytest.param([], [], id="empty_array"), 13 | pytest.param(["foo.warc.gz"], ["foo.warc.gz"], id="one_file"), 14 | pytest.param( 15 | [ 16 | "rec-f9c30d949953-20240724035746176-0.warc.gz", 17 | "rec-f9c30d949953-20240724045846176-0.warc.gz", 18 | ], 19 | None, # no change 20 | id="two_already_sorted", 21 | ), 22 | pytest.param( 23 | [ 24 | "rec-f9c30d949953-20240724045846176-0.warc.gz", 25 | "rec-f9c30d949953-20240724035746176-0.warc.gz", 26 | ], 27 | [ 28 | "rec-f9c30d949953-20240724035746176-0.warc.gz", 29 | "rec-f9c30d949953-20240724045846176-0.warc.gz", 30 | ], 31 | id="two_not_sorted", 32 | ), 33 | pytest.param( 34 | [ 35 | "aaaa/rec-f9c30d949953-20240724045846176-0.warc.gz", 36 | "bbb/rec-f9c30d949953-20240724035746176-0.warc.gz", 37 | ], 38 | [ 39 | "bbb/rec-f9c30d949953-20240724035746176-0.warc.gz", 40 | "aaaa/rec-f9c30d949953-20240724045846176-0.warc.gz", 41 | ], 42 | id="two_not_sorted_in_random_unsorted_dirs", 43 | ), 44 | ], 45 | ) 46 | def test_sort_warc_files(inputs, warc_files): 47 | parser = _create_arguments_parser() 48 | tmpdir = tempfile.mkdtemp() 49 | args = parser.parse_args(["--name", "foo", "--output", tmpdir]) 50 | args.inputs = inputs 51 | conv = Converter(args) 52 | assert conv.warc_files == (warc_files if warc_files else inputs) 53 | -------------------------------------------------------------------------------- /tests/test_icon_finder.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | 5 | from warc2zim.icon_finder import Icon, get_sorted_icons, icons_in_html 6 | 7 | 8 | @pytest.mark.parametrize( 9 | "html, expected", 10 | [ 11 | pytest.param( 12 | """""", 13 | set(), 14 | id="other_rel", 15 | ), 16 | pytest.param( 17 | """""", 18 | {"https://somewhere/favicon.ico"}, 19 | id="simple_icon", 20 | ), 21 | pytest.param( 22 | """""", 23 | {"https://somewhere/favicon.ico"}, 24 | id="simple_icon", 25 | ), 26 | pytest.param( 27 | """""", 28 | set(), 29 | id="icon_href_missing", 30 | ), 31 | pytest.param( 32 | """""", 33 | {"https://somewhere/favicon.ico"}, 34 | id="simple_shortcut_icon", 35 | ), 36 | pytest.param( 37 | """ 38 | """, 39 | {"https://somewhere/favicon.ico"}, 40 | id="no_duplicates", 41 | ), 42 | pytest.param( 43 | """ 44 | """, 45 | {"https://somewhere/favicon2.ico", "https://somewhere/favicon1.ico"}, 46 | id="sort_by_size", 47 | ), 48 | pytest.param( 49 | Path("tests/data-special/icons.html").read_text(), 50 | { 51 | "https://womenshistory.si.edu//sites/default/themes/si_sawhm/favicons/android-chrome-192x192.png", 52 | "https://womenshistory.si.edu//sites/default/themes/si_sawhm/favicons/favicon-96x96.png", 53 | "https://womenshistory.si.edu//sites/default/themes/si_sawhm/favicons/favicon-32x32.png", 54 | "https://womenshistory.si.edu//sites/default/themes/si_sawhm/favicons/favicon.ico", 55 | "https://womenshistory.si.edu//sites/default/themes/si_sawhm/favicons/favicon-16x16.png", 56 | }, 57 | id="real_life", 58 | ), 59 | pytest.param( 60 | """""", 61 | {"https://somewhere/favicon.ico"}, 62 | id="bad_sizes_1", 63 | ), 64 | pytest.param( 65 | """""", 66 | {"https://somewhere/favicon.ico"}, 67 | id="bad_sizes_2", 68 | ), 69 | ], 70 | ) 71 | def test_icons_in_html(html, expected): 72 | assert icons_in_html(html) == expected 73 | 74 | 75 | @pytest.mark.parametrize( 76 | "unsorted, expected", 77 | [ 78 | pytest.param([], [], id="empty"), 79 | pytest.param( 80 | [Icon("url1", 12, 12, b"", None)], 81 | [Icon("url1", 12, 12, b"", None)], 82 | id="one_item", 83 | ), 84 | pytest.param( 85 | [Icon("url3", 12, 12, b"", None), Icon("url2", 96, 96, b"", None)], 86 | [Icon("url2", 96, 96, b"", None), Icon("url3", 12, 12, b"", None)], 87 | id="two_items_with_size1", 88 | ), 89 | pytest.param( 90 | [Icon("url3", 128, 128, b"", None), Icon("url2", 96, 96, b"", None)], 91 | [Icon("url3", 128, 128, b"", None), Icon("url2", 96, 96, b"", None)], 92 | id="two_items_with_size2", 93 | ), 94 | pytest.param( 95 | [Icon("url2", 96, 96, b"", None), Icon("url3", 128, 128, b"", None)], 96 | [Icon("url3", 128, 128, b"", None), Icon("url2", 96, 96, b"", None)], 97 | id="two_items_with_size3", 98 | ), 99 | pytest.param( 100 | [Icon("url3", 12, 12, b"", None), Icon("url2", 26, 26, b"", None)], 101 | [Icon("url2", 26, 26, b"", None), Icon("url3", 12, 12, b"", None)], 102 | id="two_items_with_size4", 103 | ), 104 | pytest.param( 105 | [Icon("url2", 26, 26, b"", None), Icon("url3", 12, 12, b"", None)], 106 | [Icon("url2", 26, 26, b"", None), Icon("url3", 12, 12, b"", None)], 107 | id="two_items_with_size5", 108 | ), 109 | pytest.param( 110 | [Icon("url2", 48, 48, b"", None), Icon("url3", 12, 12, b"", None)], 111 | [Icon("url2", 48, 48, b"", None), Icon("url3", 12, 12, b"", None)], 112 | id="two_items_with_size6", 113 | ), 114 | pytest.param( 115 | [Icon("url2", 48, 48, b"", None), Icon("url3", 96, 96, b"", None)], 116 | [Icon("url2", 48, 48, b"", None), Icon("url3", 96, 96, b"", None)], 117 | id="two_items_with_size7", 118 | ), 119 | pytest.param( 120 | [Icon("url3", 12, 12, b"", None), Icon("url2", 48, 48, b"", None)], 121 | [Icon("url2", 48, 48, b"", None), Icon("url3", 12, 12, b"", None)], 122 | id="two_items_with_size8", 123 | ), 124 | pytest.param( 125 | [Icon("url3", 96, 96, b"", None), Icon("url2", 48, 48, b"", None)], 126 | [Icon("url2", 48, 48, b"", None), Icon("url3", 96, 96, b"", None)], 127 | id="two_items_with_size9", 128 | ), 129 | pytest.param( 130 | [Icon("url2", 48, 48, b"", None), Icon("url3", 48, 48, b"", None)], 131 | [Icon("url2", 48, 48, b"", None), Icon("url3", 48, 48, b"", None)], 132 | id="two_items_with_size10", 133 | ), 134 | pytest.param( 135 | [Icon("url2", 96, 96, b"", None), Icon("url3", 96, 96, b"", None)], 136 | [Icon("url2", 96, 96, b"", None), Icon("url3", 96, 96, b"", None)], 137 | id="two_items_with_size11", 138 | ), 139 | pytest.param( 140 | [Icon("url3", 32, 32, b"", None), Icon("url2", 96, 96, b"", None)], 141 | [Icon("url2", 96, 96, b"", None), Icon("url3", 32, 32, b"", None)], 142 | id="two_items_with_size12", 143 | ), 144 | pytest.param( 145 | [Icon("url2", 96, 96, b"", None), Icon("url3", 32, 32, b"", None)], 146 | [Icon("url2", 96, 96, b"", None), Icon("url3", 32, 32, b"", None)], 147 | id="two_items_with_size13", 148 | ), 149 | pytest.param( 150 | [Icon("url2", 26, 26, b"", None), Icon("url3", 26, 26, b"", None)], 151 | [Icon("url2", 26, 26, b"", None), Icon("url3", 26, 26, b"", None)], 152 | id="two_items_with_size14", 153 | ), 154 | ], 155 | ) 156 | def test_get_sorted_icons(unsorted, expected): 157 | assert get_sorted_icons(unsorted) == expected 158 | if len(unsorted) == 2: 159 | if unsorted[0] == expected[1]: 160 | assert unsorted[0] < unsorted[1] 161 | assert unsorted[1] > unsorted[0] 162 | -------------------------------------------------------------------------------- /tests/test_language.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from warc2zim.language import parse_language 4 | 5 | 6 | @pytest.mark.parametrize( 7 | "input_lang, expected_lang", 8 | [ 9 | pytest.param("en", "eng", id="english_2_chars"), 10 | pytest.param("eng", "eng", id="english_3_chars"), 11 | pytest.param("English", "eng", id="english_full_1"), 12 | pytest.param("zh", "zho", id="chinese_2_chars"), 13 | pytest.param("zh-hans", "zho", id="chinese_variant"), 14 | pytest.param("zho", "zho", id="chinese_3_chars"), 15 | pytest.param("Chinese", "zho", id="chinese_full_1"), 16 | pytest.param("chinEse", "zho", id="chinese_full_2"), 17 | pytest.param("patois", "eng", id="unrecognized_bad_name"), 18 | pytest.param("unknown,fra,unknown", "fra", id="ignore_unknown"), 19 | pytest.param("eng,fra", "eng,fra", id="two_langs_1"), 20 | pytest.param("fra,eng", "fra,eng", id="two_langs_2"), # order must be preserved 21 | pytest.param(" eng , fra ", "eng,fra", id="two_langs_spaces"), 22 | pytest.param("eng,fra,English", "eng,fra", id="duplicates"), 23 | pytest.param("eng;fra", "eng", id="unrecognized_bad_separator"), 24 | ], 25 | ) 26 | def test_parse_language(input_lang, expected_lang): 27 | assert parse_language(input_lang) == expected_lang 28 | -------------------------------------------------------------------------------- /tests/test_metadata_validation.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from warc2zim.main import main 4 | 5 | 6 | @pytest.mark.parametrize( 7 | "title, is_valid", 8 | [ 9 | pytest.param("A title", True, id="a_valid_title"), 10 | pytest.param("A very very very very long title", False, id="an_invalid_title"), 11 | ], 12 | ) 13 | def test_title_validation(title, is_valid): 14 | args = ["--name", "test", "--title", title, "--output", "./"] 15 | if is_valid: 16 | assert main(args) == 100 17 | else: 18 | with pytest.raises(ValueError, match="Title value is too long"): 19 | main(args) 20 | 21 | 22 | @pytest.mark.parametrize( 23 | "description, is_valid", 24 | [ 25 | pytest.param("A description", True, id="a_valid_description"), 26 | pytest.param( 27 | "A " + "".join(["very " for i in range(20)]) + "long description", 28 | False, 29 | id="an_invalid_description", 30 | ), 31 | ], 32 | ) 33 | def test_description_validation(description, is_valid): 34 | args = ["--name", "test", "--description", description, "--output", "./"] 35 | if is_valid: 36 | assert main(args) == 100 37 | else: 38 | with pytest.raises(ValueError, match="Description value is too long"): 39 | main(args) 40 | 41 | 42 | @pytest.mark.parametrize( 43 | "long_description, is_valid", 44 | [ 45 | pytest.param("A long description", True, id="a_valid_long_description"), 46 | pytest.param( 47 | "A " + "".join(["very " for i in range(800)]) + "long description", 48 | False, 49 | id="an_invalid_long_description", 50 | ), 51 | ], 52 | ) 53 | def test_long_description_validation(long_description, is_valid): 54 | args = [ 55 | "--name", 56 | "test", 57 | "--long-description", 58 | long_description, 59 | "--output", 60 | "./", 61 | ] 62 | if is_valid: 63 | assert main(args) == 100 64 | else: 65 | with pytest.raises(ValueError, match="LongDescription value is too long"): 66 | main(args) 67 | 68 | 69 | @pytest.mark.parametrize( 70 | "tags, is_valid", 71 | [ 72 | pytest.param(["tag1", "tag2"], True, id="valid_tags"), 73 | # NOTA: there is no tests for invalid tags, since it is not currently possible 74 | ], 75 | ) 76 | def test_tags_validation(tags, is_valid): 77 | args = ["--name", "test", "--tags", ";".join(tags), "--output", "./"] 78 | if is_valid: 79 | assert main(args) == 100 80 | -------------------------------------------------------------------------------- /tests/test_rewriting.py: -------------------------------------------------------------------------------- 1 | import io 2 | 3 | import pytest 4 | from jinja2 import Template 5 | from warcio import StatusAndHeaders 6 | from warcio.recordloader import ArcWarcRecord 7 | from zimscraperlib.rewriting.url_rewriting import ZimPath 8 | 9 | from warc2zim.rewriting import Rewriter 10 | 11 | 12 | @pytest.fixture(scope="module") 13 | def rewrite_generator(): 14 | """A fixture which return a generator for a generic rewriter""" 15 | 16 | def generate_and_call( 17 | content: bytes = b"dummy", content_type: str = "text/html; charset=UTF-8" 18 | ): 19 | rec_headers = StatusAndHeaders( 20 | "WARC/1.1", 21 | headers=[("WARC-Target-URI", "http://www.example.com")], 22 | ) 23 | http_headers = StatusAndHeaders( 24 | "HTTP/1.1 200 OK", 25 | headers=[("Content-Type", content_type)], 26 | ) 27 | return Rewriter( 28 | ZimPath("www.example.com"), 29 | ArcWarcRecord( 30 | "warc", # format = warc 31 | "response", # rec_type = response 32 | rec_headers, 33 | io.BytesIO(content), 34 | http_headers, 35 | "application/http; msgtype=response", 36 | content.__len__(), 37 | ), 38 | set(), 39 | set(), 40 | set(), 41 | ["UTF-8", "ISO-8859-1"], 42 | 1024, 43 | ignore_http_header_charsets=False, 44 | ignore_content_header_charsets=False, 45 | ).rewrite(Template(""), Template("")) 46 | 47 | yield generate_and_call 48 | 49 | 50 | @pytest.mark.parametrize( 51 | "content_str, encoding, content_type", 52 | [ 53 | pytest.param("Bérénice", "UTF-8", "text/html", id="html_content_utf8_auto"), 54 | pytest.param("Bérénice", "UTF-8", "text/css", id="js_content_utf8_auto"), 55 | pytest.param( 56 | "Bérénice", "UTF-8", "text/javascript", id="css_content_utf8_auto" 57 | ), 58 | pytest.param( 59 | "Bérénice", "UTF-8", "youdontknowme", id="unknown_content_utf8_auto" 60 | ), 61 | pytest.param("Bérénice", "ISO-8859-1", "text/html", id="html_content_iso_auto"), 62 | pytest.param("Bérénice", "ISO-8859-1", "text/css", id="js_content_iso_auto"), 63 | pytest.param( 64 | "Bérénice", "ISO-8859-1", "text/javascript", id="css_content_iso_auto" 65 | ), 66 | pytest.param( 67 | "Bérénice", "ISO-8859-1", "youdontknowme", id="unknown_content_iso_auto" 68 | ), 69 | pytest.param( 70 | "Bérénice", 71 | "UTF-8", 72 | "text/html; charset=UTF-8", 73 | id="html_content_utf8_declared", 74 | ), 75 | pytest.param( 76 | "Bérénice", 77 | "UTF-8", 78 | "text/css; charset=UTF-8", 79 | id="js_content_utf8_declared", 80 | ), 81 | pytest.param( 82 | "Bérénice", 83 | "UTF-8", 84 | "text/javascript; charset=UTF-8", 85 | id="css_content_utf8_declared", 86 | ), 87 | pytest.param( 88 | "Bérénice", 89 | "UTF-8", 90 | "youdontknowme; charset=UTF-8", 91 | id="unknown_content_utf8_declared", 92 | ), 93 | pytest.param( 94 | "Bérénice", 95 | "ISO-8859-1", 96 | "text/html; charset=ISO-8859-1", 97 | id="html_content_iso_declared", 98 | ), 99 | pytest.param( 100 | "Bérénice", 101 | "ISO-8859-1", 102 | "text/css; charset=ISO-8859-1", 103 | id="js_content_iso_declared", 104 | ), 105 | pytest.param( 106 | "Bérénice", 107 | "ISO-8859-1", 108 | "text/javascript; charset=ISO-8859-1", 109 | id="css_content_iso_declared", 110 | ), 111 | pytest.param( 112 | "Bérénice", 113 | "ISO-8859-1", 114 | "youdontknowme; charset=ISO-8859-1", 115 | id="unknown_content_iso_declared", 116 | ), 117 | ], 118 | ) 119 | def test_generic_rewriting_encoding_handling( 120 | rewrite_generator, content_str, encoding, content_type 121 | ): 122 | """Test handling of encoding in various content types""" 123 | content_bytes = content_str.encode(encoding) 124 | (_, content) = rewrite_generator(content=content_bytes, content_type=content_type) 125 | if isinstance(content, bytes): 126 | # we return original bytes if content is not rewriten 127 | assert content == content_bytes 128 | else: 129 | assert content == content_str 130 | -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | 4 | @dataclass 5 | class ContentForTests: 6 | input_: str | bytes 7 | expected: str | bytes = "" 8 | article_url: str = "kiwix.org" 9 | 10 | def __post_init__(self): 11 | if not self.expected: 12 | self.expected = self.input_ 13 | 14 | @property 15 | def input_str(self) -> str: 16 | if isinstance(self.input_, str): 17 | return self.input_ 18 | raise ValueError("Input value is not a str.") 19 | 20 | @property 21 | def input_bytes(self) -> bytes: 22 | if isinstance(self.input_, bytes): 23 | return self.input_ 24 | raise ValueError("Input value is not a bytes.") 25 | 26 | @property 27 | def expected_str(self) -> str: 28 | if isinstance(self.expected, str): 29 | return self.expected 30 | raise ValueError("Expected value is not a str.") 31 | 32 | @property 33 | def expected_bytes(self) -> bytes: 34 | if isinstance(self.expected, bytes): 35 | return self.expected 36 | raise ValueError("Expected value is not a bytes.") 37 | --------------------------------------------------------------------------------