├── .github
    ├── FUNDING.yml
    ├── stale.yml
    └── workflows
    │   ├── DailyTests.yaml
    │   ├── Publish.yml
    │   ├── PublishDockerDevImage.yaml
    │   ├── QA.yaml
    │   ├── Tests.yaml
    │   └── update-zim-offliner-definition.yaml
├── .gitignore
├── .pre-commit-config.yaml
├── CHANGELOG.md
├── Dockerfile
├── LICENSE
├── README.md
├── offliner-definition.json
├── pyproject.toml
├── src
    └── zimit
    │   ├── __about__.py
    │   ├── constants.py
    │   ├── utils.py
    │   └── zimit.py
├── tasks.py
├── tests-daily
    ├── Dockerfile
    └── daily.py
├── tests-integration
    ├── README.md
    └── integration.py
└── tests
    └── test_dummy.py


/.github/FUNDING.yml:
--------------------------------------------------------------------------------
 1 | # These are supported funding model platforms
 2 | 
 3 | github: kiwix # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
 4 | patreon: # Replace with a single Patreon username
 5 | open_collective: # Replace with a single Open Collective username
 6 | ko_fi: # Replace with a single Ko-fi username
 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
 9 | liberapay: # Replace with a single Liberapay username
10 | issuehunt: # Replace with a single IssueHunt username
11 | otechie: # Replace with a single Otechie username
12 | custom: # https://kiwix.org/support-us/
13 | 


--------------------------------------------------------------------------------
/.github/stale.yml:
--------------------------------------------------------------------------------
 1 | daysUntilClose: false
 2 | staleLabel: stale
 3 | 
 4 | issues:
 5 |   daysUntilStale: 60
 6 |   markComment: >
 7 |     This issue has been automatically marked as stale because it has not had
 8 |     recent activity. It will now be reviewed manually. Thank you
 9 |     for your contributions.
10 | pulls:
11 |    daysUntilStale: 7
12 |    markComment: >
13 |      This pull request has been automatically marked as stale because it has not had
14 |      recent activity. It will now be reviewed manually. Thank you
15 |      for your contributions.
16 | 


--------------------------------------------------------------------------------
/.github/workflows/DailyTests.yaml:
--------------------------------------------------------------------------------
 1 | name: DailyTests
 2 | 
 3 | on:
 4 |   schedule:
 5 |     - cron: "0 4 * * *"
 6 |   workflow_dispatch:
 7 | 
 8 | 
 9 | jobs:
10 |   run-daily-tests:
11 |     runs-on: ubuntu-22.04
12 | 
13 |     steps:
14 |       - name: checkout
15 |         uses: actions/checkout@v4
16 | 
17 |       - name: build zimit image
18 |         run: docker build -t local-zimit .
19 | 
20 |       - name: run crawl of test website
21 |         run: docker run -v $PWD/output:/output local-zimit zimit --seeds https://website.test.openzim.org/ --name tests_eng_test-website --zim-file tests_eng_test-website.zim
22 | 
23 |       - name: archive ZIM
24 |         uses: actions/upload-artifact@v4
25 |         with:
26 |           name: tests_eng_test-website.zim
27 |           path: output/tests_eng_test-website.zim
28 |           retention-days: 30
29 | 
30 |       - name: build tests-daily Docker image
31 |         run: docker build -t local-tests-daily tests-daily
32 | 
33 |       - name: run integration test suite
34 |         run: docker run -e SKIP_YOUTUBE_TEST="True" -v $PWD/tests-daily/daily.py:/app/daily.py -v $PWD/output:/output local-tests-daily bash -c "cd /app && pytest -v --log-level=INFO --log-format='%(levelname)s - %(message)s' daily.py"
35 | 


--------------------------------------------------------------------------------
/.github/workflows/Publish.yml:
--------------------------------------------------------------------------------
 1 | name: Publish released version
 2 | 
 3 | on:
 4 |   release:
 5 |     types: [published]
 6 | 
 7 | jobs:
 8 |   publish-amd64:
 9 |     runs-on: ubuntu-24.04
10 |     name: "Publish for AMD64"
11 | 
12 |     steps:
13 |       - uses: actions/checkout@v4
14 | 
15 |       - name: Build and push Docker image
16 |         uses: openzim/docker-publish-action@v10
17 |         with:
18 |           image-name: openzim/zimit
19 |           tag-pattern: /^v([0-9.]+)$/
20 |           latest-on-tag: true
21 |           restrict-to: openzim/zimit
22 |           registries: ghcr.io
23 |           credentials: |
24 |             GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }}
25 |             GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }}
26 |           repo_description: auto
27 |           repo_overview: auto
28 |           platforms: |
29 |             linux/amd64
30 | 
31 |   # Disabled for now, see https://github.com/openzim/zimit/issues/463
32 |   # publish-arm64:
33 |   #   runs-on: ubuntu-24.04
34 |   #   name: "Publish for ARM64"
35 |   #
36 |   #   steps:
37 |   #     - uses: actions/checkout@v4
38 |   #
39 |   #     - name: Build and push Docker image
40 |   #       uses: openzim/docker-publish-action@v10
41 |   #       with:
42 |   #         image-name: openzim/zimit
43 |   #         tag-pattern: /^v([0-9.]+)$/
44 |   #         latest-on-tag: true
45 |   #         restrict-to: openzim/zimit
46 |   #         registries: ghcr.io
47 |   #         credentials: |
48 |   #           GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }}
49 |   #           GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }}
50 |   #         repo_description: auto
51 |   #         repo_overview: auto
52 |   #         platforms: |
53 |   #           linux/arm64
54 | 


--------------------------------------------------------------------------------
/.github/workflows/PublishDockerDevImage.yaml:
--------------------------------------------------------------------------------
 1 | name: Publish Docker dev image
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - main
 7 |   workflow_dispatch:
 8 | 
 9 | jobs:
10 |   publish-amd64:
11 |     runs-on: ubuntu-24.04
12 |     name: "Publish for AMD64"
13 | 
14 |     steps:
15 |       - uses: actions/checkout@v4
16 | 
17 |       - name: Build and push Docker image
18 |         uses: openzim/docker-publish-action@v10
19 |         with:
20 |           image-name: openzim/zimit
21 |           manual-tag: dev
22 |           latest-on-tag: false
23 |           restrict-to: openzim/zimit
24 |           registries: ghcr.io
25 |           credentials: |
26 |             GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }}
27 |             GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }}
28 |           repo_description: auto
29 |           repo_overview: auto
30 |           platforms: |
31 |             linux/amd64
32 | 
33 |   # Disabled for now, see https://github.com/openzim/zimit/issues/463
34 |   # publish-arm64:
35 |   #   runs-on: ubuntu-24.04-arm
36 |   #   name: "Publish for ARM64"
37 |   #
38 |   #   steps:
39 |   #     - uses: actions/checkout@v4
40 |   #
41 |   #     - name: Build and push Docker image
42 |   #       uses: openzim/docker-publish-action@v10
43 |   #       with:
44 |   #         image-name: openzim/zimit
45 |   #         manual-tag: dev
46 |   #         latest-on-tag: false
47 |   #         restrict-to: openzim/zimit
48 |   #         registries: ghcr.io
49 |   #         credentials: |
50 |   #           GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }}
51 |   #           GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }}
52 |   #         repo_description: auto
53 |   #         repo_overview: auto
54 |   #         platforms: |
55 |   #           linux/arm64
56 | 


--------------------------------------------------------------------------------
/.github/workflows/QA.yaml:
--------------------------------------------------------------------------------
 1 | name: QA
 2 | 
 3 | on:
 4 |   pull_request:
 5 |   push:
 6 |     branches:
 7 |       - main
 8 | 
 9 | jobs:
10 |   check-qa:
11 |     runs-on: ubuntu-22.04
12 | 
13 |     steps:
14 |       - uses: actions/checkout@v4
15 | 
16 |       - name: Set up Python
17 |         uses: actions/setup-python@v5
18 |         with:
19 |           python-version-file: pyproject.toml
20 |           architecture: x64
21 | 
22 |       - name: Install dependencies (and project)
23 |         run: |
24 |           pip install -U pip
25 |           pip install -e .[lint,scripts,test,check]
26 | 
27 |       - name: Check black formatting
28 |         run: inv lint-black
29 | 
30 |       - name: Check ruff
31 |         run: inv lint-ruff
32 | 
33 |       - name: Check pyright
34 |         run: inv check-pyright
35 | 


--------------------------------------------------------------------------------
/.github/workflows/Tests.yaml:
--------------------------------------------------------------------------------
 1 | name: Tests
 2 | 
 3 | on:
 4 |   pull_request:
 5 |   push:
 6 |     branches:
 7 |       - main
 8 | 
 9 | jobs:
10 |   run-tests:
11 |     runs-on: ubuntu-22.04
12 | 
13 |     steps:
14 |       - uses: actions/checkout@v4
15 | 
16 |       - name: Set up Python
17 |         uses: actions/setup-python@v5
18 |         with:
19 |           python-version-file: pyproject.toml
20 |           architecture: x64
21 | 
22 |       - name: Install dependencies (and project)
23 |         run: |
24 |           pip install -U pip
25 |           pip install -e .[test,scripts]
26 | 
27 |       - name: Run the tests
28 |         run: inv coverage --args "-vvv"
29 | 
30 |       - name: Upload coverage report to codecov
31 |         uses: codecov/codecov-action@v4
32 |         with:
33 |           token: ${{ secrets.CODECOV_TOKEN }}
34 | 
35 |   build_python:
36 |     runs-on: ubuntu-22.04
37 |     steps:
38 |       - uses: actions/checkout@v4
39 | 
40 |       - name: Set up Python
41 |         uses: actions/setup-python@v5
42 |         with:
43 |           python-version-file: pyproject.toml
44 |           architecture: x64
45 | 
46 |       - name: Ensure we can build Python targets
47 |         run: |
48 |           pip install -U pip build
49 |           python3 -m build --sdist --wheel
50 | 
51 |   # this job replaces the standard "build_docker" job since it builds the docker image
52 |   run-integration-tests:
53 |     runs-on: ubuntu-22.04
54 | 
55 |     steps:
56 |       - name: checkout
57 |         uses: actions/checkout@v4
58 | 
59 |       - name: build image
60 |         run: docker build -t local-zimit .
61 | 
62 |       - name: ensure help display without issue
63 |         run: docker run -v $PWD/output:/output local-zimit zimit --help
64 | 
65 |       - name: run crawl with soft size limit
66 |         run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --sizeSoftLimit 8192 --name tests_en_sizesoftlimit --zim-file tests_en_sizesoftlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats_sizesoftlimit.json
67 | 
68 |       - name: run crawl with hard size limit
69 |         run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --sizeHardLimit 8192 --name tests_en_sizehardlimit --zim-file tests_en_sizehardlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats_sizehardlimit.json || true
70 | 
71 |       - name: run crawl with soft time limit
72 |         run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --timeSoftLimit 1 --name tests_en_timesoftlimit --zim-file tests_en_timesoftlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats_timesoftlimit.json
73 | 
74 |       - name: run crawl with hard time limit
75 |         run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --timeHardLimit 1 --name tests_en_timehardlimit --zim-file tests_en_timehardlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats_timehardlimit.json || true
76 | 
77 |       - name: run standard crawl
78 |         run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/http-return-codes.html --name tests_en_onepage --zim-file tests_en_onepage.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats.json --statsFilename /output/crawl.json --warc2zim-progress-file /output/warc2zim.json --keep
79 | 
80 |       - name: run integration test suite
81 |         run: docker run -v $PWD/tests-integration/integration.py:/app/integration.py -v $PWD/output:/output local-zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v /app/integration.py"
82 | 


--------------------------------------------------------------------------------
/.github/workflows/update-zim-offliner-definition.yaml:
--------------------------------------------------------------------------------
 1 | name: Update ZIMFarm Definitions
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [main]
 6 |     paths:
 7 |       - "offliner-definition.json"
 8 |   release:
 9 |     types: [published]
10 | 
11 |   workflow_dispatch:
12 |     inputs:
13 |       version:
14 |         description: "Version to publish"
15 |         required: false
16 |         default: "dev"
17 | 
18 | jobs:
19 |   prepare-json:
20 |     runs-on: ubuntu-24.04
21 |     outputs:
22 |       offliner_definition_b64: ${{ steps.read-json.outputs.offliner_definition_b64 }}
23 |     steps:
24 |       - name: Checkout repository
25 |         uses: actions/checkout@v4
26 |         with:
27 |           fetch-depth: 0
28 | 
29 |       - id: read-json
30 |         run: |
31 |           if [ ! -f "offliner-definition.json" ]; then
32 |             echo "File not found!" >&2
33 |             exit 1
34 |           fi
35 |           json_b64=$(base64 -w0 <<< "$(jq -c . offliner-definition.json)")
36 |           echo "offliner_definition_b64=$json_b64" >> $GITHUB_OUTPUT
37 |   call-workflow:
38 |     needs: prepare-json
39 |     uses: openzim/overview/.github/workflows/update-zimfarm-offliner-definition.yaml@main
40 |     with:
41 |       version: ${{ github.event_name == 'release' && github.event.release.tag_name || (github.event.inputs.version || 'dev') }}
42 |       offliner: zimit
43 |       offliner_definition_b64: ${{ needs.prepare-json.outputs.offliner_definition_b64 }}
44 |     secrets:
45 |       zimfarm_ci_secret: ${{ secrets.ZIMFARM_CI_SECRET }}
46 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Created by https://www.toptal.com/developers/gitignore/api/linux,macos,python,visualstudiocode,intellij
  2 | # Edit at https://www.toptal.com/developers/gitignore?templates=linux,macos,python,visualstudiocode,intellij
  3 | 
  4 | ### Intellij ###
  5 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
  6 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
  7 | 
  8 | # User-specific stuff
  9 | .idea/**/workspace.xml
 10 | .idea/**/tasks.xml
 11 | .idea/**/usage.statistics.xml
 12 | .idea/**/dictionaries
 13 | .idea/**/shelf
 14 | 
 15 | # AWS User-specific
 16 | .idea/**/aws.xml
 17 | 
 18 | # Generated files
 19 | .idea/**/contentModel.xml
 20 | 
 21 | # Sensitive or high-churn files
 22 | .idea/**/dataSources/
 23 | .idea/**/dataSources.ids
 24 | .idea/**/dataSources.local.xml
 25 | .idea/**/sqlDataSources.xml
 26 | .idea/**/dynamic.xml
 27 | .idea/**/uiDesigner.xml
 28 | .idea/**/dbnavigator.xml
 29 | 
 30 | # Gradle
 31 | .idea/**/gradle.xml
 32 | .idea/**/libraries
 33 | 
 34 | # Gradle and Maven with auto-import
 35 | # When using Gradle or Maven with auto-import, you should exclude module files,
 36 | # since they will be recreated, and may cause churn.  Uncomment if using
 37 | # auto-import.
 38 | # .idea/artifacts
 39 | # .idea/compiler.xml
 40 | # .idea/jarRepositories.xml
 41 | # .idea/modules.xml
 42 | # .idea/*.iml
 43 | # .idea/modules
 44 | # *.iml
 45 | # *.ipr
 46 | 
 47 | # CMake
 48 | cmake-build-*/
 49 | 
 50 | # Mongo Explorer plugin
 51 | .idea/**/mongoSettings.xml
 52 | 
 53 | # File-based project format
 54 | *.iws
 55 | 
 56 | # IntelliJ
 57 | out/
 58 | 
 59 | # mpeltonen/sbt-idea plugin
 60 | .idea_modules/
 61 | 
 62 | # JIRA plugin
 63 | atlassian-ide-plugin.xml
 64 | 
 65 | # Cursive Clojure plugin
 66 | .idea/replstate.xml
 67 | 
 68 | # SonarLint plugin
 69 | .idea/sonarlint/
 70 | 
 71 | # Crashlytics plugin (for Android Studio and IntelliJ)
 72 | com_crashlytics_export_strings.xml
 73 | crashlytics.properties
 74 | crashlytics-build.properties
 75 | fabric.properties
 76 | 
 77 | # Editor-based Rest Client
 78 | .idea/httpRequests
 79 | 
 80 | # Android studio 3.1+ serialized cache file
 81 | .idea/caches/build_file_checksums.ser
 82 | 
 83 | ### Intellij Patch ###
 84 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721
 85 | 
 86 | # *.iml
 87 | # modules.xml
 88 | # .idea/misc.xml
 89 | # *.ipr
 90 | 
 91 | # Sonarlint plugin
 92 | # https://plugins.jetbrains.com/plugin/7973-sonarlint
 93 | .idea/**/sonarlint/
 94 | 
 95 | # SonarQube Plugin
 96 | # https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin
 97 | .idea/**/sonarIssues.xml
 98 | 
 99 | # Markdown Navigator plugin
100 | # https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced
101 | .idea/**/markdown-navigator.xml
102 | .idea/**/markdown-navigator-enh.xml
103 | .idea/**/markdown-navigator/
104 | 
105 | # Cache file creation bug
106 | # See https://youtrack.jetbrains.com/issue/JBR-2257
107 | .idea/$CACHE_FILE$
108 | 
109 | # CodeStream plugin
110 | # https://plugins.jetbrains.com/plugin/12206-codestream
111 | .idea/codestream.xml
112 | 
113 | # Azure Toolkit for IntelliJ plugin
114 | # https://plugins.jetbrains.com/plugin/8053-azure-toolkit-for-intellij
115 | .idea/**/azureSettings.xml
116 | 
117 | ### Linux ###
118 | *~
119 | 
120 | # temporary files which can be created if a process still has a handle open of a deleted file
121 | .fuse_hidden*
122 | 
123 | # KDE directory preferences
124 | .directory
125 | 
126 | # Linux trash folder which might appear on any partition or disk
127 | .Trash-*
128 | 
129 | # .nfs files are created when an open file is removed but is still being accessed
130 | .nfs*
131 | 
132 | ### macOS ###
133 | # General
134 | .DS_Store
135 | .AppleDouble
136 | .LSOverride
137 | 
138 | # Icon must end with two \r
139 | Icon
140 | 
141 | 
142 | # Thumbnails
143 | ._*
144 | 
145 | # Files that might appear in the root of a volume
146 | .DocumentRevisions-V100
147 | .fseventsd
148 | .Spotlight-V100
149 | .TemporaryItems
150 | .Trashes
151 | .VolumeIcon.icns
152 | .com.apple.timemachine.donotpresent
153 | 
154 | # Directories potentially created on remote AFP share
155 | .AppleDB
156 | .AppleDesktop
157 | Network Trash Folder
158 | Temporary Items
159 | .apdisk
160 | 
161 | ### macOS Patch ###
162 | # iCloud generated files
163 | *.icloud
164 | 
165 | ### Python ###
166 | # Byte-compiled / optimized / DLL files
167 | __pycache__/
168 | *.py[cod]
169 | *$py.class
170 | 
171 | # C extensions
172 | *.so
173 | 
174 | # Distribution / packaging
175 | .Python
176 | build/
177 | develop-eggs/
178 | dist/
179 | downloads/
180 | eggs/
181 | .eggs/
182 | lib/
183 | lib64/
184 | parts/
185 | sdist/
186 | var/
187 | wheels/
188 | share/python-wheels/
189 | *.egg-info/
190 | .installed.cfg
191 | *.egg
192 | MANIFEST
193 | 
194 | # PyInstaller
195 | #  Usually these files are written by a python script from a template
196 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
197 | *.manifest
198 | *.spec
199 | 
200 | # Installer logs
201 | pip-log.txt
202 | pip-delete-this-directory.txt
203 | 
204 | # Unit test / coverage reports
205 | htmlcov/
206 | .tox/
207 | .nox/
208 | .coverage
209 | .coverage.*
210 | .cache
211 | nosetests.xml
212 | coverage.xml
213 | *.cover
214 | *.py,cover
215 | .hypothesis/
216 | .pytest_cache/
217 | cover/
218 | 
219 | # Translations
220 | *.mo
221 | *.pot
222 | 
223 | # Django stuff:
224 | *.log
225 | local_settings.py
226 | db.sqlite3
227 | db.sqlite3-journal
228 | 
229 | # Flask stuff:
230 | instance/
231 | .webassets-cache
232 | 
233 | # Scrapy stuff:
234 | .scrapy
235 | 
236 | # Sphinx documentation
237 | docs/_build/
238 | 
239 | # PyBuilder
240 | .pybuilder/
241 | target/
242 | 
243 | # Jupyter Notebook
244 | .ipynb_checkpoints
245 | 
246 | # IPython
247 | profile_default/
248 | ipython_config.py
249 | 
250 | # pyenv
251 | #   For a library or package, you might want to ignore these files since the code is
252 | #   intended to run in multiple environments; otherwise, check them in:
253 | # .python-version
254 | 
255 | # pipenv
256 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
257 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
258 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
259 | #   install all needed dependencies.
260 | #Pipfile.lock
261 | 
262 | # poetry
263 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
264 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
265 | #   commonly ignored for libraries.
266 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
267 | #poetry.lock
268 | 
269 | # pdm
270 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
271 | #pdm.lock
272 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
273 | #   in version control.
274 | #   https://pdm.fming.dev/#use-with-ide
275 | .pdm.toml
276 | 
277 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
278 | __pypackages__/
279 | 
280 | # Celery stuff
281 | celerybeat-schedule
282 | celerybeat.pid
283 | 
284 | # SageMath parsed files
285 | *.sage.py
286 | 
287 | # Environments
288 | .env
289 | .venv
290 | env/
291 | venv/
292 | ENV/
293 | env.bak/
294 | venv.bak/
295 | 
296 | # Spyder project settings
297 | .spyderproject
298 | .spyproject
299 | 
300 | # Rope project settings
301 | .ropeproject
302 | 
303 | # mkdocs documentation
304 | /site
305 | 
306 | # mypy
307 | .mypy_cache/
308 | .dmypy.json
309 | dmypy.json
310 | 
311 | # Pyre type checker
312 | .pyre/
313 | 
314 | # pytype static type analyzer
315 | .pytype/
316 | 
317 | # Cython debug symbols
318 | cython_debug/
319 | 
320 | # PyCharm
321 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
322 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
323 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
324 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
325 | #.idea/
326 | 
327 | ### Python Patch ###
328 | # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
329 | poetry.toml
330 | 
331 | # ruff
332 | .ruff_cache/
333 | 
334 | # LSP config files
335 | pyrightconfig.json
336 | 
337 | ### VisualStudioCode ###
338 | .vscode/*
339 | !.vscode/settings.json
340 | !.vscode/tasks.json
341 | !.vscode/launch.json
342 | !.vscode/extensions.json
343 | !.vscode/*.code-snippets
344 | 
345 | # Local History for Visual Studio Code
346 | .history/
347 | 
348 | # Built Visual Studio Code Extensions
349 | *.vsix
350 | 
351 | ### VisualStudioCode Patch ###
352 | # Ignore all local history of files
353 | .history
354 | .ionide
355 | 
356 | # End of https://www.toptal.com/developers/gitignore/api/linux,macos,python,visualstudiocode,intellij
357 | 
358 | # output dir
359 | output
360 | 
361 | # ignore all vscode files: they are editor-specific and not maintained by openzim
362 | .vscode
363 | 


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | # See https://pre-commit.com for more information
 2 | # See https://pre-commit.com/hooks.html for more hooks
 3 | repos:
 4 | - repo: https://github.com/pre-commit/pre-commit-hooks
 5 |   rev: v5.0.0
 6 |   hooks:
 7 |   -   id: trailing-whitespace
 8 |   -   id: end-of-file-fixer
 9 | - repo: https://github.com/psf/black
10 |   rev: "25.1.0"
11 |   hooks:
12 |   -   id: black
13 | - repo: https://github.com/astral-sh/ruff-pre-commit
14 |   rev: v0.9.4
15 |   hooks:
16 |   - id: ruff
17 | - repo: https://github.com/RobertCraigie/pyright-python
18 |   rev: v1.1.393
19 |   hooks:
20 |   - id: pyright
21 |     name: pyright (system)
22 |     description: 'pyright static type checker'
23 |     entry: pyright
24 |     language: system
25 |     'types_or': [python, pyi]
26 |     require_serial: true
27 |     minimum_pre_commit_version: '2.9.2'
28 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
  1 | ## Changelog
  2 | 
  3 | All notable changes to this project are documented in this file.
  4 | 
  5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
  6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0).
  7 | 
  8 | ## [Unreleased]
  9 | 
 10 | ## [3.0.5] - 2025-04-11
 11 | 
 12 | ### Changed
 13 | 
 14 | - Upgrade to browsertrix crawler 1.6.0 (#493)
 15 | 
 16 | ## [3.0.4] - 2025-04-04
 17 | 
 18 | ### Changed
 19 | 
 20 | - Upgrade to browsertrix crawler 1.5.10 (#491)
 21 | 
 22 | ## [3.0.3] - 2025-02-28
 23 | 
 24 | ### Changed
 25 | 
 26 | - Upgrade to browsertrix crawler 1.5.7 (#483)
 27 | 
 28 | ## [3.0.2] - 2025-02-27
 29 | 
 30 | ### Changed
 31 | 
 32 | - Upgrade to browsertrix crawler 1.5.6 (#482)
 33 | 
 34 | ## [3.0.1] - 2025-02-24
 35 | 
 36 | ### Changed
 37 | 
 38 | - Upgrade to browsertrix crawler 1.5.4 (#476)
 39 | 
 40 | ## [3.0.0] - 2025-02-17
 41 | 
 42 | ### Changed
 43 | 
 44 | - Change solution to report partial ZIM to the Zimfarm and other clients (#304)
 45 | - Keep temporary folder when crawler or warc2zim fails, even if not asked for (#468)
 46 | - Add many missing Browsertrix Crawler arguments ; drop default overrides by zimit ; drop `--noMobileDevice` setting (not needed anymore) (#433)
 47 | - Document all Browsertrix Crawler default arguments values (#416)
 48 | - Use preferred Browsertrix Crawler arguments names: (part of #471)
 49 |   - `--seeds` instead of `--url`
 50 |   - `--seedFile` instead of `--urlFile`
 51 |   - `--pageLimit` instead of `--limit`
 52 |   - `--pageLoadTimeout` instead of `--timeout`
 53 |   - `--scopeIncludeRx` instead of `--include`
 54 |   - `--scopeExcludeRx` instead of `--exclude`
 55 |   - `--pageExtraDelay` instead of `--delay`
 56 | - Remove confusion between zimit, warc2zim and crawler stats filenames (part of #471)
 57 |   - `--statsFilename` is now the crawler stats file (since it is the same name, just like other arguments)
 58 |   - `--zimit-progress-file` is now the zimit stats location
 59 |   - `--warc2zim-progress-file` is the warc2zim stats location
 60 |   - all are optional values; if not set but needed, temporary files are used
 61 | 
 62 | ### Fixed
 63 | 
 64 | - Do not create the ZIM when crawl is incomplete (#444)
 65 | 
 66 | ## [2.1.8] - 2025-02-07
 67 | 
 68 | ### Changed
 69 | 
 70 | - Upgrade to browsertrix crawler 1.5.1, Python 3.13 and others (#462 + #464)
 71 | 
 72 | ## [2.1.7] - 2025-01-10
 73 | 
 74 | ### Changed
 75 | 
 76 | - Upgrade to browsertrix crawler 1.4.2 (#450)
 77 | - Upgrade to warc2zim 2.2.0
 78 | 
 79 | ## [2.1.6] - 2024-11-07
 80 | 
 81 | ### Changed
 82 | 
 83 | - Upgrade to browsertrix crawler 1.3.5 (#426)
 84 | 
 85 | ## [2.1.5] - 2024-11-01
 86 | 
 87 | ### Changed
 88 | 
 89 | - Upgrade to browsertrix crawler 1.3.4 and warc2zim 2.1.3 (#424)
 90 | 
 91 | ## [2.1.4] - 2024-10-11
 92 | 
 93 | ### Changed
 94 | 
 95 | - Upgrade to browsertrix crawler 1.3.3 (#411)
 96 | 
 97 | ## [2.1.3] - 2024-10-08
 98 | 
 99 | ### Changed
100 | 
101 | - Upgrade to browsertrix crawler 1.3.2, warc2zim 2.1.2 and other dependencies (#406)
102 | 
103 | ### Fixed
104 | 
105 | - Fix help (#393)
106 | 
107 | ## [2.1.2] - 2024-09-09
108 | 
109 | ### Changed
110 | 
111 | - Upgrade to browsertrix crawler 1.3.0-beta.1 (#387) (fixes "Ziming a website with huge assets (e.g. PDFs) is failing to proceed" - #380)
112 | 
113 | ## [2.1.1] - 2024-09-05
114 | 
115 | ### Added
116 | 
117 | - Add support for uncompressed tar archive in --warcs (#369)
118 | 
119 | ### Changed
120 | 
121 | - Upgrade to browsertrix crawler 1.3.0-beta.0 (#379), including upgrade to Ubuntu Noble (#307)
122 | 
123 | ### Fixed
124 | 
125 | - Stream files downloads to not exhaust memory (#373)
126 | - Fix documentation on `--diskUtilization` setting (#375)
127 | 
128 | ## [2.1.0] - 2024-08-09
129 | 
130 | ### Added
131 | 
132 | - Add `--custom-behaviors` argument to support path/HTTP(S) URL custom behaviors to pass to the crawler (#313)
133 | - Add daily automated end-to-end tests of a page with Youtube player (#330)
134 | - Add `--warcs` option to directly process WARC files (#301)
135 | 
136 | ### Changed
137 | 
138 | - Make it clear that `--profile` argument can be an HTTP(S) URL (and not only a path) (#288)
139 | - Fix README imprecisions + add back warc2zim availability in docker image (#314)
140 | - Enhance integration test to assert final content of the ZIM (#287)
141 | - Stop fetching and passing browsertrix crawler version as scraperSuffix to warc2zim (#354)
142 | - Do not log number of WARC files found (#357)
143 | - Upgrade dependencies (warc2zim 2.1.0)
144 | 
145 | ### Fixed
146 | 
147 | - Sort WARC directories found by modification time (#366)
148 | 
149 | ## [2.0.6] - 2024-08-02
150 | 
151 | ### Changed
152 | 
153 | - Upgraded Browsertrix Crawler to 1.2.6
154 | 
155 | ## [2.0.5] - 2024-07-24
156 | 
157 | ### Changed
158 | 
159 | - Upgraded Browsertrix Crawler to 1.2.5
160 | - Upgraded warc2zim to 2.0.3
161 | 
162 | ## [2.0.4] - 2024-07-15
163 | 
164 | ### Changed
165 | 
166 | - Upgraded Browsertrix Crawler to 1.2.4 (fixes retrieve automatically the assets present in a data-xxx tag #316)
167 | 
168 | ## [2.0.3] - 2024-06-24
169 | 
170 | ### Changed
171 | 
172 | - Upgraded Browsertrix Crawler to 1.2.0 (fixes Youtube videos issue #323)
173 | 
174 | ## [2.0.2] - 2024-06-18
175 | 
176 | ### Changed
177 | 
178 | - Upgrade dependencies (mainly warc2zim 2.0.2)
179 | 
180 | 
181 | ## [2.0.1] - 2024-06-13
182 | 
183 | ### Changed
184 | 
185 | - Upgrade dependencies (especially warc2zim 2.0.1 and browsertrix crawler 1.2.0-beta.0) (#318)
186 | 
187 | ### Fixed
188 | 
189 | - Crawler is not correctly checking disk size / usage (#305)
190 | 
191 | ## [2.0.0] - 2024-06-04
192 | 
193 | ### Added
194 | 
195 | - New `--version` flag to display Zimit version (#234)
196 | - New `--logging` flag to adjust Browsertrix Crawler logging (#273)
197 | - Use new `--scraper-suffix` flag of warc2zim to enhance ZIM "Scraper" metadata (#275)
198 | - New `--noMobileDevice` CLI argument
199 | - Publish Docker image for `linux/arm64` (in addition to `linux/amd64`) (#178)
200 | 
201 | ### Changed
202 | 
203 | - **Use `warc2zim` version 2**, which no longer requires a Service Worker (#193)
204 | - Upgraded Browsertrix Crawler to 1.1.3
205 | - Adopt Python bootstrap conventions
206 | - Upgrade to Python 3.12 + upgrade dependencies
207 | - Removed handling of redirects by zimit, they are handled by browsertrix crawler and detected properly by warc2zim (#284)
208 | - Drop initial check of URL in Python (#256)
209 | - `--userAgent` CLI argument overrides again the `--userAgentSuffix` and `--adminEmail` values
210 | - `--userAgent` CLI argument is not mandatory anymore
211 | 
212 | ### Fixed
213 | 
214 | - Fix support for Youtube videos (#291)
215 | - Fix crawler `--waitUntil` values (#289)
216 | 
217 | ## [1.6.3] - 2024-01-18
218 | 
219 | ### Changed
220 | 
221 | - Adapt to new `warc2zim` code structure
222 | - Using browsertrix-crawler 0.12.4
223 | - Using warc2zim 1.5.5
224 | 
225 | ### Added
226 | 
227 | - New `--build` parameter (optional) to specify the directory holding Browsertrix files; if not set, `--output`
228 | directory is used; zimit creates one subdir of this folder per invocation to isolate datasets; subdir is kept only
229 | if `--keep` is set.
230 | 
231 | ### Fixed
232 | 
233 | - `--collection` parameter was not working (#252)
234 | 
235 | ## [1.6.2] - 2023-11-17
236 | 
237 | ### Changed
238 | 
239 | - Using browsertrix-crawler 0.12.3
240 | 
241 | ### Fixed
242 | 
243 | - Fix logic passing args to crawler to support value '0' (#245)
244 | - Fix documentation about Chrome and headless (#248)
245 | 
246 | ## [1.6.1] - 2023-11-06
247 | 
248 | ### Changed
249 | 
250 | - Using browsertrix-crawler 0.12.1
251 | 
252 | ## [1.6.0] - 2023-11-02
253 | 
254 | ### Changed
255 | 
256 | - Scraper fails for all HTTP error codes returned when checking URL at startup (#223)
257 | - User-Agent now has a default value (#228)
258 | - Manipulation of spaces with UA suffix and adminEmail has been modified
259 | - Same User-Agent is used for check_url (Python) and Browsertrix crawler (#227)
260 | - Using browsertrix-crawler 0.12.0
261 | 
262 | ## [1.5.3] - 2023-10-02
263 | 
264 | ### Changed
265 | 
266 | - Using browsertrix-crawler 0.11.2
267 | 
268 | ## [1.5.2] - 2023-09-19
269 | 
270 | ### Changed
271 | 
272 | - Using browsertrix-crawler 0.11.1
273 | 
274 | ## [1.5.1] - 2023-09-18
275 | 
276 | ### Changed
277 | 
278 | - Using browsertrix-crawler 0.11.0
279 | - Scraper stat file is not created empty (#211)
280 | - Crawler statistics are not available anymore (#213)
281 | - Using warc2zim 1.5.4
282 | 
283 | ## [1.5.0] - 2023-08-23
284 | 
285 | ### Added
286 | 
287 | - `--long-description` param
288 | 
289 | ## [1.4.1] - 2023-08-23
290 | 
291 | ### Changed
292 | 
293 | - Using browsertrix-crawler 0.10.4
294 | - Using warc2zim 1.5.3
295 | 
296 | ## [1.4.0] - 2023-08-02
297 | 
298 | ### Added
299 | 
300 | - `--title` to set ZIM title
301 | - `--description` to set ZIM description
302 | - New crawler options: `--maxPageLimit`, `--delay`, `--diskUtilization`
303 | - `--zim-lang` param to set warc2zim's `--lang` (ISO-639-3)
304 | 
305 | ### Changed
306 | 
307 | - Using browsertrix-crawler 0.10.2
308 | - Default and accepted values for `--waitUntil` from crawler's update
309 | - Using warc2zim 1.5.2
310 | - Disabled Chrome updates to prevent incidental inclusion of update data in WARC/ZIM (#172)
311 | - `--failOnFailedSeed` used unconditionally
312 | - `--lang` now passed to crawler (ISO-639-1)
313 | 
314 | ### Removed
315 | 
316 | - `--newContext` from crawler's update
317 | 
318 | ## [1.3.1] - 2023-02-06
319 | 
320 | ### Changed
321 | 
322 | - Using browsertrix-crawler 0.8.0
323 | - Using warc2zim version 1.5.1 with wabac.js 2.15.2
324 | 
325 | ## [1.3.0] - 2023-02-02
326 | 
327 | ### Added
328 | 
329 | - Initial url check normalizes homepage redirects to standard ports – 80/443 (#137)
330 | 
331 | ### Changed
332 | 
333 | - Using warc2zim version 1.5.0 with scope conflict fix and videos fix
334 | - Using browsertrix-crawler 0.8.0-beta.1
335 | - Fixed `--allowHashUrls` being a boolean param
336 | - Increased `check_url` timeout (12s to connect, 27s to read) instead of 10s
337 | 
338 | ## [1.2.0] - 2022-06-21
339 | 
340 | ### Added
341 | 
342 | - `--urlFile` browsertrix crawler parameter
343 | - `--depth` browsertrix crawler parameter
344 | - `--extraHops` parameter
345 | - `--collection` browsertrix crawler parameter
346 | - `--allowHashUrls` browsertrix crawler parameter
347 | - `--userAgentSuffix` browsertrix crawler parameter
348 | - `--behaviors` parameter
349 | - `--behaviorTimeout` browsertrix crawler parameter
350 | - `--profile` browsertrix crawler parameter
351 | - `--sizeLimit` browsertrix crawler parameter
352 | - `--timeLimit` browsertrix crawler parameter
353 | - `--healthCheckPort` parameter
354 | - `--overwrite` parameter
355 | 
356 | ### Changed
357 | 
358 | - using browsertrix-crawler `0.6.0` and warc2zim `1.4.2`
359 | - default WARC location after crawl changed
360 | from `collections/capture-*/archive/` to `collections/crawl-*/archive/`
361 | 
362 | ### Removed
363 | 
364 | - `--scroll` browsertrix crawler parameter (see `--behaviors`)
365 | - `--scope` browsertrix crawler parameter (see `--scopeType`, `--include` and `--exclude`)
366 | 
367 | 
368 | ## [1.1.5]
369 | 
370 | - using crawler 0.3.2 and warc2zim 1.3.6
371 | 
372 | ## [1.1.4]
373 | 
374 | - Defaults to `load,networkidle0` for waitUntil param (same as crawler)
375 | - Allows setting combinations of values for waitUntil param
376 | - Updated warc2zim to 1.3.5
377 | - Updated browsertrix-crawler to 0.3.1
378 | - Warc to zim now written to `{temp_root_dir}/collections/capture-*/archive/` where
379 |   `capture-*` is dynamic and includes the datetime. (from browsertrix-crawler)
380 | 
381 | ## [1.1.3]
382 | 
383 | - allows same first-level-domain redirects
384 | - fixed redirects to URL in scope
385 | - updated crawler to 0.2.0
386 | - `statsFilename` now informs whether limit was hit or not
387 | 
388 | ## [1.1.2]
389 | 
390 | - added support for --custom-css
391 | - added domains block list (default)
392 | 
393 | ## [1.1.1]
394 | 
395 | - updated browsertrix-crawler to 0.1.4
396 |   - autofetcher script to be injected by defaultDriver to capture srcsets + URLs in dynamically added stylesheets
397 | 
398 | ## [1.0]
399 | 
400 | - initial version using browsertrix-crawler:0.1.3 and warc2zim:1.3.3
401 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Zimit image: Browsertrix Crawler base plus a Python 3.13 venv (/app/zimit)
 2 | # providing the zimit and warc2zim commands, and a hosts-file blocklist.
 3 | FROM webrecorder/browsertrix-crawler:1.6.0
 4 | LABEL org.opencontainers.image.source=https://github.com/openzim/zimit
 5 | 
 6 | # add deadsnakes ppa for latest Python on Ubuntu
 7 | RUN add-apt-repository ppa:deadsnakes/ppa -y
 8 | 
 9 | RUN apt-get update \
10 |  && apt-get install -qqy --no-install-recommends \
11 |       libmagic1 \
12 |       python3.13-venv \
13 |  && rm -rf /var/lib/apt/lists/* \
14 |  # python setup (in venv not to conflict with browsertrix)
15 |  && python3.13 -m venv /app/zimit \
16 |  # placeholder (default output location)
17 |  && mkdir -p /output \
18 |  # disable chrome upgrade
19 |  && printf "repo_add_once=\"false\"\nrepo_reenable_on_distupgrade=\"false\"\n" > /etc/default/google-chrome \
20 |  # download list of bad domains to filter-out. intentionally run post-install
21 |  # so it's not cached in earlier layers (url stays same but content updated).
22 |  # --fail: abort the build on an HTTP error instead of saving the error page,
23 |  # which would otherwise end up in /etc/blocklist.txt (and then /etc/hosts).
24 |  && mkdir -p /tmp/ads \
25 |  && cd /tmp/ads \
26 |  && curl --fail -L -O https://hosts.anudeep.me/mirror/adservers.txt \
27 |  && curl --fail -L -O https://hosts.anudeep.me/mirror/CoinMiner.txt \
28 |  && curl --fail -L -O https://hosts.anudeep.me/mirror/facebook.txt \
29 |  && cat ./*.txt > /etc/blocklist.txt \
30 |  && rm ./*.txt \
31 |  # entrypoint appends the blocklist to /etc/hosts at container start,
32 |  # then exec's the requested command
33 |  && printf '#!/bin/sh\ncat /etc/blocklist.txt >> /etc/hosts\nexec "$@"' > /usr/local/bin/entrypoint.sh \
34 |  && chmod +x /usr/local/bin/entrypoint.sh
35 | 
36 | # Copy pyproject.toml and its dependencies
37 | COPY pyproject.toml README.md /src/
38 | COPY src/zimit/__about__.py /src/src/zimit/__about__.py
39 | 
40 | # Install Python dependencies
41 | RUN . /app/zimit/bin/activate && python -m pip install --no-cache-dir /src
42 | 
43 | # Copy code + associated artifacts
44 | COPY src /src/src
45 | COPY *.md /src/
46 | 
47 | # Install + cleanup
48 | # (symlinks make the venv's zimit and warc2zim scripts available in /usr/bin)
49 | RUN . /app/zimit/bin/activate && python -m pip install --no-cache-dir /src \
50 |  && ln -s /app/zimit/bin/zimit /usr/bin/zimit \
51 |  && ln -s /app/zimit/bin/warc2zim /usr/bin/warc2zim \
52 |  && chmod +x /usr/bin/zimit \
53 |  && rm -rf /src
54 | 
55 | # every command is run through entrypoint.sh; the default command is `zimit --help`
56 | ENTRYPOINT ["entrypoint.sh"]
57 | CMD ["zimit", "--help"]
58 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                     GNU GENERAL PUBLIC LICENSE
  2 |                        Version 3, 29 June 2007
  3 | 
  4 |  Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
  5 |  Everyone is permitted to copy and distribute verbatim copies
  6 |  of this license document, but changing it is not allowed.
  7 | 
  8 |                             Preamble
  9 | 
 10 |   The GNU General Public License is a free, copyleft license for
 11 | software and other kinds of works.
 12 | 
 13 |   The licenses for most software and other practical works are designed
 14 | to take away your freedom to share and change the works.  By contrast,
 15 | the GNU General Public License is intended to guarantee your freedom to
 16 | share and change all versions of a program--to make sure it remains free
 17 | software for all its users.  We, the Free Software Foundation, use the
 18 | GNU General Public License for most of our software; it applies also to
 19 | any other work released this way by its authors.  You can apply it to
 20 | your programs, too.
 21 | 
 22 |   When we speak of free software, we are referring to freedom, not
 23 | price.  Our General Public Licenses are designed to make sure that you
 24 | have the freedom to distribute copies of free software (and charge for
 25 | them if you wish), that you receive source code or can get it if you
 26 | want it, that you can change the software or use pieces of it in new
 27 | free programs, and that you know you can do these things.
 28 | 
 29 |   To protect your rights, we need to prevent others from denying you
 30 | these rights or asking you to surrender the rights.  Therefore, you have
 31 | certain responsibilities if you distribute copies of the software, or if
 32 | you modify it: responsibilities to respect the freedom of others.
 33 | 
 34 |   For example, if you distribute copies of such a program, whether
 35 | gratis or for a fee, you must pass on to the recipients the same
 36 | freedoms that you received.  You must make sure that they, too, receive
 37 | or can get the source code.  And you must show them these terms so they
 38 | know their rights.
 39 | 
 40 |   Developers that use the GNU GPL protect your rights with two steps:
 41 | (1) assert copyright on the software, and (2) offer you this License
 42 | giving you legal permission to copy, distribute and/or modify it.
 43 | 
 44 |   For the developers' and authors' protection, the GPL clearly explains
 45 | that there is no warranty for this free software.  For both users' and
 46 | authors' sake, the GPL requires that modified versions be marked as
 47 | changed, so that their problems will not be attributed erroneously to
 48 | authors of previous versions.
 49 | 
 50 |   Some devices are designed to deny users access to install or run
 51 | modified versions of the software inside them, although the manufacturer
 52 | can do so.  This is fundamentally incompatible with the aim of
 53 | protecting users' freedom to change the software.  The systematic
 54 | pattern of such abuse occurs in the area of products for individuals to
 55 | use, which is precisely where it is most unacceptable.  Therefore, we
 56 | have designed this version of the GPL to prohibit the practice for those
 57 | products.  If such problems arise substantially in other domains, we
 58 | stand ready to extend this provision to those domains in future versions
 59 | of the GPL, as needed to protect the freedom of users.
 60 | 
 61 |   Finally, every program is threatened constantly by software patents.
 62 | States should not allow patents to restrict development and use of
 63 | software on general-purpose computers, but in those that do, we wish to
 64 | avoid the special danger that patents applied to a free program could
 65 | make it effectively proprietary.  To prevent this, the GPL assures that
 66 | patents cannot be used to render the program non-free.
 67 | 
 68 |   The precise terms and conditions for copying, distribution and
 69 | modification follow.
 70 | 
 71 |                        TERMS AND CONDITIONS
 72 | 
 73 |   0. Definitions.
 74 | 
 75 |   "This License" refers to version 3 of the GNU General Public License.
 76 | 
 77 |   "Copyright" also means copyright-like laws that apply to other kinds of
 78 | works, such as semiconductor masks.
 79 | 
 80 |   "The Program" refers to any copyrightable work licensed under this
 81 | License.  Each licensee is addressed as "you".  "Licensees" and
 82 | "recipients" may be individuals or organizations.
 83 | 
 84 |   To "modify" a work means to copy from or adapt all or part of the work
 85 | in a fashion requiring copyright permission, other than the making of an
 86 | exact copy.  The resulting work is called a "modified version" of the
 87 | earlier work or a work "based on" the earlier work.
 88 | 
 89 |   A "covered work" means either the unmodified Program or a work based
 90 | on the Program.
 91 | 
 92 |   To "propagate" a work means to do anything with it that, without
 93 | permission, would make you directly or secondarily liable for
 94 | infringement under applicable copyright law, except executing it on a
 95 | computer or modifying a private copy.  Propagation includes copying,
 96 | distribution (with or without modification), making available to the
 97 | public, and in some countries other activities as well.
 98 | 
 99 |   To "convey" a work means any kind of propagation that enables other
100 | parties to make or receive copies.  Mere interaction with a user through
101 | a computer network, with no transfer of a copy, is not conveying.
102 | 
103 |   An interactive user interface displays "Appropriate Legal Notices"
104 | to the extent that it includes a convenient and prominently visible
105 | feature that (1) displays an appropriate copyright notice, and (2)
106 | tells the user that there is no warranty for the work (except to the
107 | extent that warranties are provided), that licensees may convey the
108 | work under this License, and how to view a copy of this License.  If
109 | the interface presents a list of user commands or options, such as a
110 | menu, a prominent item in the list meets this criterion.
111 | 
112 |   1. Source Code.
113 | 
114 |   The "source code" for a work means the preferred form of the work
115 | for making modifications to it.  "Object code" means any non-source
116 | form of a work.
117 | 
118 |   A "Standard Interface" means an interface that either is an official
119 | standard defined by a recognized standards body, or, in the case of
120 | interfaces specified for a particular programming language, one that
121 | is widely used among developers working in that language.
122 | 
123 |   The "System Libraries" of an executable work include anything, other
124 | than the work as a whole, that (a) is included in the normal form of
125 | packaging a Major Component, but which is not part of that Major
126 | Component, and (b) serves only to enable use of the work with that
127 | Major Component, or to implement a Standard Interface for which an
128 | implementation is available to the public in source code form.  A
129 | "Major Component", in this context, means a major essential component
130 | (kernel, window system, and so on) of the specific operating system
131 | (if any) on which the executable work runs, or a compiler used to
132 | produce the work, or an object code interpreter used to run it.
133 | 
134 |   The "Corresponding Source" for a work in object code form means all
135 | the source code needed to generate, install, and (for an executable
136 | work) run the object code and to modify the work, including scripts to
137 | control those activities.  However, it does not include the work's
138 | System Libraries, or general-purpose tools or generally available free
139 | programs which are used unmodified in performing those activities but
140 | which are not part of the work.  For example, Corresponding Source
141 | includes interface definition files associated with source files for
142 | the work, and the source code for shared libraries and dynamically
143 | linked subprograms that the work is specifically designed to require,
144 | such as by intimate data communication or control flow between those
145 | subprograms and other parts of the work.
146 | 
147 |   The Corresponding Source need not include anything that users
148 | can regenerate automatically from other parts of the Corresponding
149 | Source.
150 | 
151 |   The Corresponding Source for a work in source code form is that
152 | same work.
153 | 
154 |   2. Basic Permissions.
155 | 
156 |   All rights granted under this License are granted for the term of
157 | copyright on the Program, and are irrevocable provided the stated
158 | conditions are met.  This License explicitly affirms your unlimited
159 | permission to run the unmodified Program.  The output from running a
160 | covered work is covered by this License only if the output, given its
161 | content, constitutes a covered work.  This License acknowledges your
162 | rights of fair use or other equivalent, as provided by copyright law.
163 | 
164 |   You may make, run and propagate covered works that you do not
165 | convey, without conditions so long as your license otherwise remains
166 | in force.  You may convey covered works to others for the sole purpose
167 | of having them make modifications exclusively for you, or provide you
168 | with facilities for running those works, provided that you comply with
169 | the terms of this License in conveying all material for which you do
170 | not control copyright.  Those thus making or running the covered works
171 | for you must do so exclusively on your behalf, under your direction
172 | and control, on terms that prohibit them from making any copies of
173 | your copyrighted material outside their relationship with you.
174 | 
175 |   Conveying under any other circumstances is permitted solely under
176 | the conditions stated below.  Sublicensing is not allowed; section 10
177 | makes it unnecessary.
178 | 
179 |   3. Protecting Users' Legal Rights From Anti-Circumvention Law.
180 | 
181 |   No covered work shall be deemed part of an effective technological
182 | measure under any applicable law fulfilling obligations under article
183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or
184 | similar laws prohibiting or restricting circumvention of such
185 | measures.
186 | 
187 |   When you convey a covered work, you waive any legal power to forbid
188 | circumvention of technological measures to the extent such circumvention
189 | is effected by exercising rights under this License with respect to
190 | the covered work, and you disclaim any intention to limit operation or
191 | modification of the work as a means of enforcing, against the work's
192 | users, your or third parties' legal rights to forbid circumvention of
193 | technological measures.
194 | 
195 |   4. Conveying Verbatim Copies.
196 | 
197 |   You may convey verbatim copies of the Program's source code as you
198 | receive it, in any medium, provided that you conspicuously and
199 | appropriately publish on each copy an appropriate copyright notice;
200 | keep intact all notices stating that this License and any
201 | non-permissive terms added in accord with section 7 apply to the code;
202 | keep intact all notices of the absence of any warranty; and give all
203 | recipients a copy of this License along with the Program.
204 | 
205 |   You may charge any price or no price for each copy that you convey,
206 | and you may offer support or warranty protection for a fee.
207 | 
208 |   5. Conveying Modified Source Versions.
209 | 
210 |   You may convey a work based on the Program, or the modifications to
211 | produce it from the Program, in the form of source code under the
212 | terms of section 4, provided that you also meet all of these conditions:
213 | 
214 |     a) The work must carry prominent notices stating that you modified
215 |     it, and giving a relevant date.
216 | 
217 |     b) The work must carry prominent notices stating that it is
218 |     released under this License and any conditions added under section
219 |     7.  This requirement modifies the requirement in section 4 to
220 |     "keep intact all notices".
221 | 
222 |     c) You must license the entire work, as a whole, under this
223 |     License to anyone who comes into possession of a copy.  This
224 |     License will therefore apply, along with any applicable section 7
225 |     additional terms, to the whole of the work, and all its parts,
226 |     regardless of how they are packaged.  This License gives no
227 |     permission to license the work in any other way, but it does not
228 |     invalidate such permission if you have separately received it.
229 | 
230 |     d) If the work has interactive user interfaces, each must display
231 |     Appropriate Legal Notices; however, if the Program has interactive
232 |     interfaces that do not display Appropriate Legal Notices, your
233 |     work need not make them do so.
234 | 
235 |   A compilation of a covered work with other separate and independent
236 | works, which are not by their nature extensions of the covered work,
237 | and which are not combined with it such as to form a larger program,
238 | in or on a volume of a storage or distribution medium, is called an
239 | "aggregate" if the compilation and its resulting copyright are not
240 | used to limit the access or legal rights of the compilation's users
241 | beyond what the individual works permit.  Inclusion of a covered work
242 | in an aggregate does not cause this License to apply to the other
243 | parts of the aggregate.
244 | 
245 |   6. Conveying Non-Source Forms.
246 | 
247 |   You may convey a covered work in object code form under the terms
248 | of sections 4 and 5, provided that you also convey the
249 | machine-readable Corresponding Source under the terms of this License,
250 | in one of these ways:
251 | 
252 |     a) Convey the object code in, or embodied in, a physical product
253 |     (including a physical distribution medium), accompanied by the
254 |     Corresponding Source fixed on a durable physical medium
255 |     customarily used for software interchange.
256 | 
257 |     b) Convey the object code in, or embodied in, a physical product
258 |     (including a physical distribution medium), accompanied by a
259 |     written offer, valid for at least three years and valid for as
260 |     long as you offer spare parts or customer support for that product
261 |     model, to give anyone who possesses the object code either (1) a
262 |     copy of the Corresponding Source for all the software in the
263 |     product that is covered by this License, on a durable physical
264 |     medium customarily used for software interchange, for a price no
265 |     more than your reasonable cost of physically performing this
266 |     conveying of source, or (2) access to copy the
267 |     Corresponding Source from a network server at no charge.
268 | 
269 |     c) Convey individual copies of the object code with a copy of the
270 |     written offer to provide the Corresponding Source.  This
271 |     alternative is allowed only occasionally and noncommercially, and
272 |     only if you received the object code with such an offer, in accord
273 |     with subsection 6b.
274 | 
275 |     d) Convey the object code by offering access from a designated
276 |     place (gratis or for a charge), and offer equivalent access to the
277 |     Corresponding Source in the same way through the same place at no
278 |     further charge.  You need not require recipients to copy the
279 |     Corresponding Source along with the object code.  If the place to
280 |     copy the object code is a network server, the Corresponding Source
281 |     may be on a different server (operated by you or a third party)
282 |     that supports equivalent copying facilities, provided you maintain
283 |     clear directions next to the object code saying where to find the
284 |     Corresponding Source.  Regardless of what server hosts the
285 |     Corresponding Source, you remain obligated to ensure that it is
286 |     available for as long as needed to satisfy these requirements.
287 | 
288 |     e) Convey the object code using peer-to-peer transmission, provided
289 |     you inform other peers where the object code and Corresponding
290 |     Source of the work are being offered to the general public at no
291 |     charge under subsection 6d.
292 | 
293 |   A separable portion of the object code, whose source code is excluded
294 | from the Corresponding Source as a System Library, need not be
295 | included in conveying the object code work.
296 | 
297 |   A "User Product" is either (1) a "consumer product", which means any
298 | tangible personal property which is normally used for personal, family,
299 | or household purposes, or (2) anything designed or sold for incorporation
300 | into a dwelling.  In determining whether a product is a consumer product,
301 | doubtful cases shall be resolved in favor of coverage.  For a particular
302 | product received by a particular user, "normally used" refers to a
303 | typical or common use of that class of product, regardless of the status
304 | of the particular user or of the way in which the particular user
305 | actually uses, or expects or is expected to use, the product.  A product
306 | is a consumer product regardless of whether the product has substantial
307 | commercial, industrial or non-consumer uses, unless such uses represent
308 | the only significant mode of use of the product.
309 | 
310 |   "Installation Information" for a User Product means any methods,
311 | procedures, authorization keys, or other information required to install
312 | and execute modified versions of a covered work in that User Product from
313 | a modified version of its Corresponding Source.  The information must
314 | suffice to ensure that the continued functioning of the modified object
315 | code is in no case prevented or interfered with solely because
316 | modification has been made.
317 | 
318 |   If you convey an object code work under this section in, or with, or
319 | specifically for use in, a User Product, and the conveying occurs as
320 | part of a transaction in which the right of possession and use of the
321 | User Product is transferred to the recipient in perpetuity or for a
322 | fixed term (regardless of how the transaction is characterized), the
323 | Corresponding Source conveyed under this section must be accompanied
324 | by the Installation Information.  But this requirement does not apply
325 | if neither you nor any third party retains the ability to install
326 | modified object code on the User Product (for example, the work has
327 | been installed in ROM).
328 | 
329 |   The requirement to provide Installation Information does not include a
330 | requirement to continue to provide support service, warranty, or updates
331 | for a work that has been modified or installed by the recipient, or for
332 | the User Product in which it has been modified or installed.  Access to a
333 | network may be denied when the modification itself materially and
334 | adversely affects the operation of the network or violates the rules and
335 | protocols for communication across the network.
336 | 
337 |   Corresponding Source conveyed, and Installation Information provided,
338 | in accord with this section must be in a format that is publicly
339 | documented (and with an implementation available to the public in
340 | source code form), and must require no special password or key for
341 | unpacking, reading or copying.
342 | 
343 |   7. Additional Terms.
344 | 
345 |   "Additional permissions" are terms that supplement the terms of this
346 | License by making exceptions from one or more of its conditions.
347 | Additional permissions that are applicable to the entire Program shall
348 | be treated as though they were included in this License, to the extent
349 | that they are valid under applicable law.  If additional permissions
350 | apply only to part of the Program, that part may be used separately
351 | under those permissions, but the entire Program remains governed by
352 | this License without regard to the additional permissions.
353 | 
354 |   When you convey a copy of a covered work, you may at your option
355 | remove any additional permissions from that copy, or from any part of
356 | it.  (Additional permissions may be written to require their own
357 | removal in certain cases when you modify the work.)  You may place
358 | additional permissions on material, added by you to a covered work,
359 | for which you have or can give appropriate copyright permission.
360 | 
361 |   Notwithstanding any other provision of this License, for material you
362 | add to a covered work, you may (if authorized by the copyright holders of
363 | that material) supplement the terms of this License with terms:
364 | 
365 |     a) Disclaiming warranty or limiting liability differently from the
366 |     terms of sections 15 and 16 of this License; or
367 | 
368 |     b) Requiring preservation of specified reasonable legal notices or
369 |     author attributions in that material or in the Appropriate Legal
370 |     Notices displayed by works containing it; or
371 | 
372 |     c) Prohibiting misrepresentation of the origin of that material, or
373 |     requiring that modified versions of such material be marked in
374 |     reasonable ways as different from the original version; or
375 | 
376 |     d) Limiting the use for publicity purposes of names of licensors or
377 |     authors of the material; or
378 | 
379 |     e) Declining to grant rights under trademark law for use of some
380 |     trade names, trademarks, or service marks; or
381 | 
382 |     f) Requiring indemnification of licensors and authors of that
383 |     material by anyone who conveys the material (or modified versions of
384 |     it) with contractual assumptions of liability to the recipient, for
385 |     any liability that these contractual assumptions directly impose on
386 |     those licensors and authors.
387 | 
388 |   All other non-permissive additional terms are considered "further
389 | restrictions" within the meaning of section 10.  If the Program as you
390 | received it, or any part of it, contains a notice stating that it is
391 | governed by this License along with a term that is a further
392 | restriction, you may remove that term.  If a license document contains
393 | a further restriction but permits relicensing or conveying under this
394 | License, you may add to a covered work material governed by the terms
395 | of that license document, provided that the further restriction does
396 | not survive such relicensing or conveying.
397 | 
398 |   If you add terms to a covered work in accord with this section, you
399 | must place, in the relevant source files, a statement of the
400 | additional terms that apply to those files, or a notice indicating
401 | where to find the applicable terms.
402 | 
403 |   Additional terms, permissive or non-permissive, may be stated in the
404 | form of a separately written license, or stated as exceptions;
405 | the above requirements apply either way.
406 | 
407 |   8. Termination.
408 | 
409 |   You may not propagate or modify a covered work except as expressly
410 | provided under this License.  Any attempt otherwise to propagate or
411 | modify it is void, and will automatically terminate your rights under
412 | this License (including any patent licenses granted under the third
413 | paragraph of section 11).
414 | 
415 |   However, if you cease all violation of this License, then your
416 | license from a particular copyright holder is reinstated (a)
417 | provisionally, unless and until the copyright holder explicitly and
418 | finally terminates your license, and (b) permanently, if the copyright
419 | holder fails to notify you of the violation by some reasonable means
420 | prior to 60 days after the cessation.
421 | 
422 |   Moreover, your license from a particular copyright holder is
423 | reinstated permanently if the copyright holder notifies you of the
424 | violation by some reasonable means, this is the first time you have
425 | received notice of violation of this License (for any work) from that
426 | copyright holder, and you cure the violation prior to 30 days after
427 | your receipt of the notice.
428 | 
429 |   Termination of your rights under this section does not terminate the
430 | licenses of parties who have received copies or rights from you under
431 | this License.  If your rights have been terminated and not permanently
432 | reinstated, you do not qualify to receive new licenses for the same
433 | material under section 10.
434 | 
435 |   9. Acceptance Not Required for Having Copies.
436 | 
437 |   You are not required to accept this License in order to receive or
438 | run a copy of the Program.  Ancillary propagation of a covered work
439 | occurring solely as a consequence of using peer-to-peer transmission
440 | to receive a copy likewise does not require acceptance.  However,
441 | nothing other than this License grants you permission to propagate or
442 | modify any covered work.  These actions infringe copyright if you do
443 | not accept this License.  Therefore, by modifying or propagating a
444 | covered work, you indicate your acceptance of this License to do so.
445 | 
446 |   10. Automatic Licensing of Downstream Recipients.
447 | 
448 |   Each time you convey a covered work, the recipient automatically
449 | receives a license from the original licensors, to run, modify and
450 | propagate that work, subject to this License.  You are not responsible
451 | for enforcing compliance by third parties with this License.
452 | 
453 |   An "entity transaction" is a transaction transferring control of an
454 | organization, or substantially all assets of one, or subdividing an
455 | organization, or merging organizations.  If propagation of a covered
456 | work results from an entity transaction, each party to that
457 | transaction who receives a copy of the work also receives whatever
458 | licenses to the work the party's predecessor in interest had or could
459 | give under the previous paragraph, plus a right to possession of the
460 | Corresponding Source of the work from the predecessor in interest, if
461 | the predecessor has it or can get it with reasonable efforts.
462 | 
463 |   You may not impose any further restrictions on the exercise of the
464 | rights granted or affirmed under this License.  For example, you may
465 | not impose a license fee, royalty, or other charge for exercise of
466 | rights granted under this License, and you may not initiate litigation
467 | (including a cross-claim or counterclaim in a lawsuit) alleging that
468 | any patent claim is infringed by making, using, selling, offering for
469 | sale, or importing the Program or any portion of it.
470 | 
471 |   11. Patents.
472 | 
473 |   A "contributor" is a copyright holder who authorizes use under this
474 | License of the Program or a work on which the Program is based.  The
475 | work thus licensed is called the contributor's "contributor version".
476 | 
477 |   A contributor's "essential patent claims" are all patent claims
478 | owned or controlled by the contributor, whether already acquired or
479 | hereafter acquired, that would be infringed by some manner, permitted
480 | by this License, of making, using, or selling its contributor version,
481 | but do not include claims that would be infringed only as a
482 | consequence of further modification of the contributor version.  For
483 | purposes of this definition, "control" includes the right to grant
484 | patent sublicenses in a manner consistent with the requirements of
485 | this License.
486 | 
487 |   Each contributor grants you a non-exclusive, worldwide, royalty-free
488 | patent license under the contributor's essential patent claims, to
489 | make, use, sell, offer for sale, import and otherwise run, modify and
490 | propagate the contents of its contributor version.
491 | 
492 |   In the following three paragraphs, a "patent license" is any express
493 | agreement or commitment, however denominated, not to enforce a patent
494 | (such as an express permission to practice a patent or covenant not to
495 | sue for patent infringement).  To "grant" such a patent license to a
496 | party means to make such an agreement or commitment not to enforce a
497 | patent against the party.
498 | 
499 |   If you convey a covered work, knowingly relying on a patent license,
500 | and the Corresponding Source of the work is not available for anyone
501 | to copy, free of charge and under the terms of this License, through a
502 | publicly available network server or other readily accessible means,
503 | then you must either (1) cause the Corresponding Source to be so
504 | available, or (2) arrange to deprive yourself of the benefit of the
505 | patent license for this particular work, or (3) arrange, in a manner
506 | consistent with the requirements of this License, to extend the patent
507 | license to downstream recipients.  "Knowingly relying" means you have
508 | actual knowledge that, but for the patent license, your conveying the
509 | covered work in a country, or your recipient's use of the covered work
510 | in a country, would infringe one or more identifiable patents in that
511 | country that you have reason to believe are valid.
512 | 
513 |   If, pursuant to or in connection with a single transaction or
514 | arrangement, you convey, or propagate by procuring conveyance of, a
515 | covered work, and grant a patent license to some of the parties
516 | receiving the covered work authorizing them to use, propagate, modify
517 | or convey a specific copy of the covered work, then the patent license
518 | you grant is automatically extended to all recipients of the covered
519 | work and works based on it.
520 | 
521 |   A patent license is "discriminatory" if it does not include within
522 | the scope of its coverage, prohibits the exercise of, or is
523 | conditioned on the non-exercise of one or more of the rights that are
524 | specifically granted under this License.  You may not convey a covered
525 | work if you are a party to an arrangement with a third party that is
526 | in the business of distributing software, under which you make payment
527 | to the third party based on the extent of your activity of conveying
528 | the work, and under which the third party grants, to any of the
529 | parties who would receive the covered work from you, a discriminatory
530 | patent license (a) in connection with copies of the covered work
531 | conveyed by you (or copies made from those copies), or (b) primarily
532 | for and in connection with specific products or compilations that
533 | contain the covered work, unless you entered into that arrangement,
534 | or that patent license was granted, prior to 28 March 2007.
535 | 
536 |   Nothing in this License shall be construed as excluding or limiting
537 | any implied license or other defenses to infringement that may
538 | otherwise be available to you under applicable patent law.
539 | 
540 |   12. No Surrender of Others' Freedom.
541 | 
542 |   If conditions are imposed on you (whether by court order, agreement or
543 | otherwise) that contradict the conditions of this License, they do not
544 | excuse you from the conditions of this License.  If you cannot convey a
545 | covered work so as to satisfy simultaneously your obligations under this
546 | License and any other pertinent obligations, then as a consequence you may
547 | not convey it at all.  For example, if you agree to terms that obligate you
548 | to collect a royalty for further conveying from those to whom you convey
549 | the Program, the only way you could satisfy both those terms and this
550 | License would be to refrain entirely from conveying the Program.
551 | 
552 |   13. Use with the GNU Affero General Public License.
553 | 
554 |   Notwithstanding any other provision of this License, you have
555 | permission to link or combine any covered work with a work licensed
556 | under version 3 of the GNU Affero General Public License into a single
557 | combined work, and to convey the resulting work.  The terms of this
558 | License will continue to apply to the part which is the covered work,
559 | but the special requirements of the GNU Affero General Public License,
560 | section 13, concerning interaction through a network will apply to the
561 | combination as such.
562 | 
563 |   14. Revised Versions of this License.
564 | 
565 |   The Free Software Foundation may publish revised and/or new versions of
566 | the GNU General Public License from time to time.  Such new versions will
567 | be similar in spirit to the present version, but may differ in detail to
568 | address new problems or concerns.
569 | 
570 |   Each version is given a distinguishing version number.  If the
571 | Program specifies that a certain numbered version of the GNU General
572 | Public License "or any later version" applies to it, you have the
573 | option of following the terms and conditions either of that numbered
574 | version or of any later version published by the Free Software
575 | Foundation.  If the Program does not specify a version number of the
576 | GNU General Public License, you may choose any version ever published
577 | by the Free Software Foundation.
578 | 
579 |   If the Program specifies that a proxy can decide which future
580 | versions of the GNU General Public License can be used, that proxy's
581 | public statement of acceptance of a version permanently authorizes you
582 | to choose that version for the Program.
583 | 
584 |   Later license versions may give you additional or different
585 | permissions.  However, no additional obligations are imposed on any
586 | author or copyright holder as a result of your choosing to follow a
587 | later version.
588 | 
589 |   15. Disclaimer of Warranty.
590 | 
591 |   THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
592 | APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
596 | PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
597 | IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
599 | 
600 |   16. Limitation of Liability.
601 | 
602 |   IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
610 | SUCH DAMAGES.
611 | 
612 |   17. Interpretation of Sections 15 and 16.
613 | 
614 |   If the disclaimer of warranty and limitation of liability provided
615 | above cannot be given local legal effect according to their terms,
616 | reviewing courts shall apply local law that most closely approximates
617 | an absolute waiver of all civil liability in connection with the
618 | Program, unless a warranty or assumption of liability accompanies a
619 | copy of the Program in return for a fee.
620 | 
621 |                      END OF TERMS AND CONDITIONS
622 | 
623 |             How to Apply These Terms to Your New Programs
624 | 
625 |   If you develop a new program, and you want it to be of the greatest
626 | possible use to the public, the best way to achieve this is to make it
627 | free software which everyone can redistribute and change under these terms.
628 | 
629 |   To do so, attach the following notices to the program.  It is safest
630 | to attach them to the start of each source file to most effectively
631 | state the exclusion of warranty; and each file should have at least
632 | the "copyright" line and a pointer to where the full notice is found.
633 | 
634 |     <one line to give the program's name and a brief idea of what it does.>
635 |     Copyright (C) <year>  <name of author>
636 | 
637 |     This program is free software: you can redistribute it and/or modify
638 |     it under the terms of the GNU General Public License as published by
639 |     the Free Software Foundation, either version 3 of the License, or
640 |     (at your option) any later version.
641 | 
642 |     This program is distributed in the hope that it will be useful,
643 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
644 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
645 |     GNU General Public License for more details.
646 | 
647 |     You should have received a copy of the GNU General Public License
648 |     along with this program.  If not, see <https://www.gnu.org/licenses/>.
649 | 
650 | Also add information on how to contact you by electronic and paper mail.
651 | 
652 |   If the program does terminal interaction, make it output a short
653 | notice like this when it starts in an interactive mode:
654 | 
655 |     <program>  Copyright (C) <year>  <name of author>
656 |     This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
657 |     This is free software, and you are welcome to redistribute it
658 |     under certain conditions; type `show c' for details.
659 | 
660 | The hypothetical commands `show w' and `show c' should show the appropriate
661 | parts of the General Public License.  Of course, your program's commands
662 | might be different; for a GUI interface, you would use an "about box".
663 | 
664 |   You should also get your employer (if you work as a programmer) or school,
665 | if any, to sign a "copyright disclaimer" for the program, if necessary.
666 | For more information on this, and how to apply and follow the GNU GPL, see
667 | <https://www.gnu.org/licenses/>.
668 | 
669 |   The GNU General Public License does not permit incorporating your program
670 | into proprietary programs.  If your program is a subroutine library, you
671 | may consider it more useful to permit linking proprietary applications with
672 | the library.  If this is what you want to do, use the GNU Lesser General
673 | Public License instead of this License.  But first, please read
674 | <https://www.gnu.org/licenses/why-not-lgpl.html>.
675 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | Zimit
 2 | =====
 3 | 
 4 | Zimit is a scraper that creates a [ZIM file](https://en.wikipedia.org/wiki/ZIM_(file_format)) from any website.
 5 | 
 6 | [![CodeFactor](https://www.codefactor.io/repository/github/openzim/zimit/badge)](https://www.codefactor.io/repository/github/openzim/zimit)
 7 | [![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0)
 8 | [![Docker](https://ghcr-badge.egpl.dev/openzim/zimit/latest_tag?label=docker)](https://ghcr.io/openzim/zimit)
 9 | 
10 | Zimit adheres to openZIM's [Contribution Guidelines](https://github.com/openzim/overview/wiki/Contributing).
11 | 
12 | Zimit has implemented openZIM's [Python bootstrap, conventions and policies](https://github.com/openzim/_python-bootstrap/blob/main/docs/Policy.md) **v1.0.1**.
13 | 
14 | Capabilities and known limitations
15 | --------------------
16 | 
17 | While we would like to support as many websites as possible, making an offline archive of any website with a versatile tool obviously has some limitations.
18 | 
19 | Most capabilities and known limitations are documented in [warc2zim README](https://github.com/openzim/warc2zim/blob/main/README.md). There are also some limitations in Browsertrix Crawler (used to fetch the website) and wombat (used to properly replay dynamic web requests), but these are not (yet?) clearly documented.
20 | 
21 | Technical background
22 | --------------------
23 | 
24 | Zimit runs a fully automated browser-based crawl of a website property and produces a ZIM of the crawled content. Zimit runs in a Docker container.
25 | 
26 | The system:
27 | - runs a website crawl with [Browsertrix Crawler](https://github.com/webrecorder/browsertrix-crawler), which produces WARC files
28 | - converts the crawled WARC files to a single ZIM using [warc2zim](https://github.com/openzim/warc2zim)
29 | 
30 | The `zimit.py` is the entrypoint for the system.
31 | 
32 | After the crawl is done, warc2zim is used to write a ZIM to the `/output` directory, which should be mounted as a volume so that the ZIM is not lost when the container stops.
33 | 
34 | Using the `--keep` flag, the crawled WARCs and a few other artifacts will also be kept in a temp directory inside `/output`.
35 | 
36 | Usage
37 | -----
38 | 
39 | `zimit` is intended to be run in Docker. Docker image is published at https://github.com/orgs/openzim/packages/container/package/zimit.
40 | 
41 | The image accepts the following parameters, **as well as any of the [Browsertrix crawler](https://crawler.docs.browsertrix.com/user-guide/cli-options/) and [warc2zim](https://github.com/openzim/warc2zim) ones**:
42 | 
43 | - Required: `--seeds URL` - the URL to start crawling from; multiple URLs can be separated by a comma (even if **usually not needed**, these are just the **seeds** of the crawl); the first seed URL is used as the ZIM homepage
44 | - Required: `--name` - Name of ZIM file
45 | - `--output` - output directory (defaults to `/output`)
46 | - `--pageLimit U` - Limit capture to at most U URLs
47 | - `--scopeExcludeRx <regex>` - skip URLs that match the regex from crawling. Can be specified multiple times. An example is `--scopeExcludeRx="(\?q=|signup-landing\?|\?cid=)"`, where URLs that contain either `?q=` or `signup-landing?` or `?cid=` will be excluded.
48 | - `--workers N` - number of crawl workers to be run in parallel
49 | - `--waitUntil` - Puppeteer setting for how long to wait for page load. See [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options). The default is `load`, but for static sites, `--waitUntil domcontentloaded` may be used to speed up the crawl (to avoid waiting for ads to load for example).
50 | - `--keep` - in case of failure, WARC files and other temporary files (which are stored as a subfolder of output directory) are always kept, otherwise they are automatically deleted. Use this flag to always keep WARC files, even in case of success.
51 | 
52 | Example command:
53 | 
54 | ```bash
55 | docker run ghcr.io/openzim/zimit zimit --help
56 | docker run ghcr.io/openzim/zimit warc2zim --help
57 | docker run  -v /output:/output ghcr.io/openzim/zimit zimit --seeds URL --name myzimfile
58 | ```
59 | 
60 | **Note**: Image automatically filters out a large number of ads by using the 3 blocklists from [anudeepND](https://github.com/anudeepND/blacklist). If you don't want this filtering, disable the image's entrypoint in your container (`docker run --entrypoint="" ghcr.io/openzim/zimit ...`).
61 | 
62 | To re-build the Docker image locally run:
63 | 
64 | ```bash
65 | docker build -t ghcr.io/openzim/zimit .
66 | ```
67 | 
68 | FAQ
69 | ---
70 | 
71 | The Zimit contributors' team maintains [a page with the most Frequently Asked Questions](https://github.com/openzim/zimit/wiki/Frequently-Asked-Questions).
72 | 
73 | Nota bene
74 | ---------
75 | 
76 | While Zimit 1.x relied on a Service Worker to display the ZIM content, this is no longer the case
77 | since Zimit 2.x, which does not have any special requirements anymore.
78 | 
79 | It should also be noted that a first version of a generic HTTP scraper was created in 2016 during
80 | the [Wikimania Esino Lario
81 | Hackathon](https://wikimania2016.wikimedia.org/wiki/Programme/Kiwix-dedicated_Hackathon).
82 | 
83 | That version is now considered outdated and [archived in `2016`
84 | branch](https://github.com/openzim/zimit/tree/2016).
85 | 
86 | License
87 | -------
88 | 
89 | [GPLv3](https://www.gnu.org/licenses/gpl-3.0) or later, see
90 | [LICENSE](LICENSE) for more details.
91 | 


--------------------------------------------------------------------------------
/offliner-definition.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "offliner_id": "zimit",
  3 |   "stdOutput": true,
  4 |   "stdStats": "zimit-progress-file",
  5 |   "flags": {
  6 |     "seeds": {
  7 |       "type": "string",
  8 |       "required": false,
  9 |       "title": "Seeds",
  10 |       "description": "The seed URL(s) to start crawling from. Multiple seed URLs must be separated by a comma (usually not needed, these are just the crawl seeds). First seed URL is used as ZIM homepage"
 11 |     },
 12 |     "seed_file": {
 13 |       "type": "string",
 14 |       "required": false,
 15 |       "title": "Seed File",
 16 |       "description": "If set, read a list of seed urls, one per line. HTTPS URL to an online file."
 17 |     },
 18 |     "lang": {
 19 |       "type": "string",
 20 |       "required": false,
 21 |       "title": "Browser Language",
 22 |       "description": "If set, sets the language used by the browser, should be ISO 639 language[-country] code, e.g. `en` or `en-GB`"
 23 |     },
 24 |     "title": {
 25 |       "type": "string",
 26 |       "required": false,
 27 |       "title": "Title",
 28 |       "description": "Custom title for your ZIM. Defaults to title of main page",
 29 |       "minLength": 1,
 30 |       "maxLength": 30
 31 |     },
 32 |     "description": {
 33 |       "type": "string",
 34 |       "required": false,
 35 |       "title": "Description",
 36 |       "description": "Description for ZIM",
 37 |       "minLength": 1,
 38 |       "maxLength": 80
 39 |     },
 40 |     "favicon": {
 41 |       "type": "url",
 42 |       "required": false,
 43 |       "title": "Illustration",
 44 |       "description": "URL for Illustration. "
 45 |     },
 46 |     "tags": {
 47 |       "type": "string",
 48 |       "required": false,
 49 |       "title": "ZIM Tags",
 50 |       "description": "Single string with individual tags separated by a semicolon."
 51 |     },
 52 |     "creator": {
 53 |       "type": "string",
 54 |       "required": false,
 55 |       "title": "Creator",
 56 |       "description": "Name of content creator"
 57 |     },
 58 |     "publisher": {
 59 |       "type": "string",
 60 |       "required": false,
 61 |       "title": "Publisher",
 62 |       "isPublisher": true,
 63 |       "description": "Custom publisher name (ZIM metadata). openZIM otherwise"
 64 |     },
 65 |     "source": {
 66 |       "type": "string",
 67 |       "required": false,
 68 |       "title": "Source",
 69 |       "description": "Source name/URL of content"
 70 |     },
 71 |     "workers": {
 72 |       "type": "integer",
 73 |       "required": false,
 74 |       "title": "Workers",
 75 |       "description": "The number of workers to run in parallel. Defaults to 1",
 76 |       "min": 1
 77 |     },
 78 |     "wait_until": {
 79 |       "type": "string",
 80 |       "required": false,
 81 |       "title": "WaitUntil",
 82 |       "description": "Puppeteer page.goto() condition to wait for before continuing. One of load, domcontentloaded, networkidle0 or networkidle2, or a comma-separated combination of those. Default is load,networkidle2"
 83 |     },
 84 |     "extra_hops": {
 85 |       "type": "integer",
 86 |       "required": false,
 87 |       "title": "Extra Hops",
 88 |       "description": "Number of extra 'hops' to follow, beyond the current scope. Default is 0",
 89 |       "min": 0
 90 |     },
 91 |     "page_limit": {
 92 |       "type": "integer",
 93 |       "required": false,
 94 |       "title": "Page Limit",
 95 |       "description": "Limit crawl to this number of pages. Default is 0 (no-limit).",
 96 |       "min": 0
 97 |     },
 98 |     "max_page_limit": {
 99 |       "type": "integer",
100 |       "required": false,
101 |       "title": "Max Page Limit",
102 |       "description": "Maximum pages to crawl, overriding pageLimit if both are set. Default is 0 (no-limit)",
103 |       "min": 0
104 |     },
105 |     "page_load_timeout": {
106 |       "type": "integer",
107 |       "required": false,
108 |       "title": "Page Load Timeout",
109 |       "description": "Timeout for each page to load (in seconds). Default is 90",
110 |       "min": 0
111 |     },
112 |     "scope_type": {
113 |       "type": "string-enum",
114 |       "required": false,
115 |       "title": "Scope Type",
 116 |       "description": "A predefined scope of the crawl. For more customization, use 'custom' and set scopeIncludeRx/scopeExcludeRx regexes. Default is custom if scopeIncludeRx is set, prefix otherwise.",
117 |       "choices": [
118 |         {
119 |           "title": "Page",
120 |           "value": "page"
121 |         },
122 |         {
123 |           "title": "Page SPA",
124 |           "value": "page-spa"
125 |         },
126 |         {
127 |           "title": "Prefix",
128 |           "value": "prefix"
129 |         },
130 |         {
131 |           "title": "Host",
132 |           "value": "host"
133 |         },
134 |         {
135 |           "title": "Domain",
136 |           "value": "domain"
137 |         },
138 |         {
139 |           "title": "Any",
140 |           "value": "any"
141 |         },
142 |         {
143 |           "title": "Custom",
144 |           "value": "custom"
145 |         }
146 |       ]
147 |     },
148 |     "scope_include_rx": {
149 |       "type": "string",
150 |       "required": false,
151 |       "title": "Scope Include Regex",
152 |       "description": "Regex of page URLs that should be included in the crawl (defaults to the immediate directory of seed)"
153 |     },
154 |     "scope_exclude_rx": {
155 |       "type": "string",
156 |       "required": false,
157 |       "title": "Scope Exclude Regex",
158 |       "description": "Regex of page URLs that should be excluded from the crawl"
159 |     },
160 |     "allow_hash_urls": {
161 |       "type": "boolean",
162 |       "required": false,
163 |       "title": "Allow Hashtag URLs",
164 |       "description": "Allow Hashtag URLs, useful for single-page-application crawling or when different hashtags load dynamic content"
165 |     },
166 |     "mobile_device": {
167 |       "type": "string-enum",
168 |       "required": false,
169 |       "title": "As device",
 170 |       "description": "Device to crawl as. See Puppeteer's Device.ts for a list",
171 |       "choices": [
172 |         {
173 |           "title": "Blackberry Playbook",
174 |           "value": "Blackberry PlayBook"
175 |         },
176 |         {
177 |           "title": "Blackberry Playbook Landscape",
178 |           "value": "Blackberry PlayBook landscape"
179 |         },
180 |         {
181 |           "title": "Blackberry Z30",
182 |           "value": "BlackBerry Z30"
183 |         },
184 |         {
185 |           "title": "Blackberry Z30 Landscape",
186 |           "value": "BlackBerry Z30 landscape"
187 |         },
188 |         {
189 |           "title": "Galaxy Note 3",
190 |           "value": "Galaxy Note 3"
191 |         },
192 |         {
193 |           "title": "Galaxy Note 3 Landscape",
194 |           "value": "Galaxy Note 3 landscape"
195 |         },
196 |         {
197 |           "title": "Galaxy Note II",
198 |           "value": "Galaxy Note II"
199 |         },
200 |         {
201 |           "title": "Galaxy Note II Landscape",
202 |           "value": "Galaxy Note II landscape"
203 |         },
204 |         {
205 |           "title": "Galaxy S III",
206 |           "value": "Galaxy S III"
207 |         },
208 |         {
209 |           "title": "Galaxy S III Landscape",
210 |           "value": "Galaxy S III landscape"
211 |         },
212 |         {
213 |           "title": "Galaxy S5",
214 |           "value": "Galaxy S5"
215 |         },
216 |         {
217 |           "title": "Galaxy S5 Landscape",
218 |           "value": "Galaxy S5 landscape"
219 |         },
220 |         {
221 |           "title": "Galaxy S8",
222 |           "value": "Galaxy S8"
223 |         },
224 |         {
225 |           "title": "Galaxy S8 Landscape",
226 |           "value": "Galaxy S8 landscape"
227 |         },
228 |         {
229 |           "title": "Galaxy S9 Plus",
230 |           "value": "Galaxy S9+"
231 |         },
232 |         {
233 |           "title": "Galaxy S9 Plus Landscape",
234 |           "value": "Galaxy S9+ landscape"
235 |         },
236 |         {
237 |           "title": "Galaxy Tab S4",
238 |           "value": "Galaxy Tab S4"
239 |         },
240 |         {
241 |           "title": "Galaxy Tab S4 Landscape",
242 |           "value": "Galaxy Tab S4 landscape"
243 |         },
244 |         {
245 |           "title": "iPad",
246 |           "value": "iPad"
247 |         },
248 |         {
249 |           "title": "iPad Landscape",
250 |           "value": "iPad landscape"
251 |         },
252 |         {
253 |           "title": "iPad Gen 6",
254 |           "value": "iPad (gen 6)"
255 |         },
256 |         {
257 |           "title": "iPad Gen 6 Landscape",
258 |           "value": "iPad (gen 6) landscape"
259 |         },
260 |         {
261 |           "title": "iPad Gen 7",
262 |           "value": "iPad (gen 7)"
263 |         },
264 |         {
265 |           "title": "iPad Gen 7 Landscape",
266 |           "value": "iPad (gen 7) landscape"
267 |         },
268 |         {
269 |           "title": "iPad Mini",
270 |           "value": "iPad Mini"
271 |         },
272 |         {
273 |           "title": "iPad Mini Landscape",
274 |           "value": "iPad Mini landscape"
275 |         },
276 |         {
277 |           "title": "iPad Pro",
278 |           "value": "iPad Pro"
279 |         },
280 |         {
281 |           "title": "iPad Pro Landscape",
282 |           "value": "iPad Pro landscape"
283 |         },
284 |         {
285 |           "title": "iPad Pro 11",
286 |           "value": "iPad Pro 11"
287 |         },
288 |         {
289 |           "title": "iPad Pro 11 Landscape",
290 |           "value": "iPad Pro 11 landscape"
291 |         },
292 |         {
293 |           "title": "iPhone 4",
294 |           "value": "iPhone 4"
295 |         },
296 |         {
297 |           "title": "iPhone 4 Landscape",
298 |           "value": "iPhone 4 landscape"
299 |         },
300 |         {
301 |           "title": "iPhone 5",
302 |           "value": "iPhone 5"
303 |         },
304 |         {
305 |           "title": "iPhone 5 Landscape",
306 |           "value": "iPhone 5 landscape"
307 |         },
308 |         {
309 |           "title": "iPhone 6",
310 |           "value": "iPhone 6"
311 |         },
312 |         {
313 |           "title": "iPhone 6 Landscape",
314 |           "value": "iPhone 6 landscape"
315 |         },
316 |         {
317 |           "title": "iPhone 6 Plus",
318 |           "value": "iPhone 6 Plus"
319 |         },
320 |         {
321 |           "title": "iPhone 6 Plus Landscape",
322 |           "value": "iPhone 6 Plus landscape"
323 |         },
324 |         {
325 |           "title": "iPhone 7",
326 |           "value": "iPhone 7"
327 |         },
328 |         {
329 |           "title": "iPhone 7 Landscape",
330 |           "value": "iPhone 7 landscape"
331 |         },
332 |         {
333 |           "title": "iPhone 7 Plus",
334 |           "value": "iPhone 7 Plus"
335 |         },
336 |         {
337 |           "title": "iPhone 7 Plus Landscape",
338 |           "value": "iPhone 7 Plus landscape"
339 |         },
340 |         {
341 |           "title": "iPhone 8",
342 |           "value": "iPhone 8"
343 |         },
344 |         {
345 |           "title": "iPhone 8 Landscape",
346 |           "value": "iPhone 8 landscape"
347 |         },
348 |         {
349 |           "title": "iPhone 8 Plus",
350 |           "value": "iPhone 8 Plus"
351 |         },
352 |         {
353 |           "title": "iPhone 8 Plus Landscape",
354 |           "value": "iPhone 8 Plus landscape"
355 |         },
356 |         {
357 |           "title": "iPhone SE",
358 |           "value": "iPhone SE"
359 |         },
360 |         {
361 |           "title": "iPhone SE Landscape",
362 |           "value": "iPhone SE landscape"
363 |         },
364 |         {
365 |           "title": "iPhone X",
366 |           "value": "iPhone X"
367 |         },
368 |         {
369 |           "title": "iPhone X Landscape",
370 |           "value": "iPhone X landscape"
371 |         },
372 |         {
373 |           "title": "iPhone XR",
374 |           "value": "iPhone XR"
375 |         },
376 |         {
377 |           "title": "iPhone XR Landscape",
378 |           "value": "iPhone XR landscape"
379 |         },
380 |         {
381 |           "title": "iPhone 11",
382 |           "value": "iPhone 11"
383 |         },
384 |         {
385 |           "title": "iPhone 11 Landscape",
386 |           "value": "iPhone 11 landscape"
387 |         },
388 |         {
389 |           "title": "iPhone 11 Pro",
390 |           "value": "iPhone 11 Pro"
391 |         },
392 |         {
393 |           "title": "iPhone 11 Pro Landscape",
394 |           "value": "iPhone 11 Pro landscape"
395 |         },
396 |         {
397 |           "title": "iPhone 11 Pro Max",
398 |           "value": "iPhone 11 Pro Max"
399 |         },
400 |         {
401 |           "title": "iPhone 11 Pro Max Landscape",
402 |           "value": "iPhone 11 Pro Max landscape"
403 |         },
404 |         {
405 |           "title": "iPhone 12",
406 |           "value": "iPhone 12"
407 |         },
408 |         {
409 |           "title": "iPhone 12 Landscape",
410 |           "value": "iPhone 12 landscape"
411 |         },
412 |         {
413 |           "title": "iPhone 12 Pro",
414 |           "value": "iPhone 12 Pro"
415 |         },
416 |         {
417 |           "title": "iPhone 12 Pro Landscape",
418 |           "value": "iPhone 12 Pro landscape"
419 |         },
420 |         {
421 |           "title": "iPhone 12 Pro Max",
422 |           "value": "iPhone 12 Pro Max"
423 |         },
424 |         {
425 |           "title": "iPhone 12 Pro Max Landscape",
426 |           "value": "iPhone 12 Pro Max landscape"
427 |         },
428 |         {
429 |           "title": "iPhone 12 Mini",
430 |           "value": "iPhone 12 Mini"
431 |         },
432 |         {
433 |           "title": "iPhone 12 Mini Landscape",
434 |           "value": "iPhone 12 Mini landscape"
435 |         },
436 |         {
437 |           "title": "iPhone 13",
438 |           "value": "iPhone 13"
439 |         },
440 |         {
441 |           "title": "iPhone 13 Landscape",
442 |           "value": "iPhone 13 landscape"
443 |         },
444 |         {
445 |           "title": "iPhone 13 Pro",
446 |           "value": "iPhone 13 Pro"
447 |         },
448 |         {
449 |           "title": "iPhone 13 Pro Landscape",
450 |           "value": "iPhone 13 Pro landscape"
451 |         },
452 |         {
453 |           "title": "iPhone 13 Pro Max",
454 |           "value": "iPhone 13 Pro Max"
455 |         },
456 |         {
457 |           "title": "iPhone 13 Pro Max Landscape",
458 |           "value": "iPhone 13 Pro Max landscape"
459 |         },
460 |         {
461 |           "title": "iPhone 13 Mini",
462 |           "value": "iPhone 13 Mini"
463 |         },
464 |         {
465 |           "title": "iPhone 13 Mini Landscape",
466 |           "value": "iPhone 13 Mini landscape"
467 |         },
468 |         {
469 |           "title": "Jio Phone 2",
470 |           "value": "JioPhone 2"
471 |         },
472 |         {
473 |           "title": "Jio Phone 2 Landscape",
474 |           "value": "JioPhone 2 landscape"
475 |         },
476 |         {
477 |           "title": "Kindle Fire HDX",
478 |           "value": "Kindle Fire HDX"
479 |         },
480 |         {
481 |           "title": "Kindle Fire HDX Landscape",
482 |           "value": "Kindle Fire HDX landscape"
483 |         },
484 |         {
485 |           "title": "LG Optimus L70",
486 |           "value": "LG Optimus L70"
487 |         },
488 |         {
489 |           "title": "LG Optimus L70 Landscape",
490 |           "value": "LG Optimus L70 landscape"
491 |         },
492 |         {
493 |           "title": "Microsoft Lumia 550",
494 |           "value": "Microsoft Lumia 550"
495 |         },
496 |         {
497 |           "title": "Microsoft Lumia 950",
498 |           "value": "Microsoft Lumia 950"
499 |         },
500 |         {
501 |           "title": "Microsoft Lumia 950 Landscape",
502 |           "value": "Microsoft Lumia 950 landscape"
503 |         },
504 |         {
505 |           "title": "Nexus 10",
506 |           "value": "Nexus 10"
507 |         },
508 |         {
509 |           "title": "Nexus 10 Landscape",
510 |           "value": "Nexus 10 landscape"
511 |         },
512 |         {
513 |           "title": "Nexus 4",
514 |           "value": "Nexus 4"
515 |         },
516 |         {
517 |           "title": "Nexus 4 Landscape",
518 |           "value": "Nexus 4 landscape"
519 |         },
520 |         {
521 |           "title": "Nexus 5",
522 |           "value": "Nexus 5"
523 |         },
524 |         {
525 |           "title": "Nexus 5 Landscape",
526 |           "value": "Nexus 5 landscape"
527 |         },
528 |         {
529 |           "title": "Nexus 5X",
530 |           "value": "Nexus 5X"
531 |         },
532 |         {
533 |           "title": "Nexus 5X Landscape",
534 |           "value": "Nexus 5X landscape"
535 |         },
536 |         {
537 |           "title": "Nexus 6",
538 |           "value": "Nexus 6"
539 |         },
540 |         {
541 |           "title": "Nexus 6 Landscape",
542 |           "value": "Nexus 6 landscape"
543 |         },
544 |         {
545 |           "title": "Nexus 6P",
546 |           "value": "Nexus 6P"
547 |         },
548 |         {
549 |           "title": "Nexus 6P Landscape",
550 |           "value": "Nexus 6P landscape"
551 |         },
552 |         {
553 |           "title": "Nexus 7",
554 |           "value": "Nexus 7"
555 |         },
556 |         {
557 |           "title": "Nexus 7 Landscape",
558 |           "value": "Nexus 7 landscape"
559 |         },
560 |         {
561 |           "title": "Nokia Lumia 520",
562 |           "value": "Nokia Lumia 520"
563 |         },
564 |         {
565 |           "title": "Nokia Lumia 520 Landscape",
566 |           "value": "Nokia Lumia 520 landscape"
567 |         },
568 |         {
569 |           "title": "Nokia N9",
570 |           "value": "Nokia N9"
571 |         },
572 |         {
573 |           "title": "Nokia N9 Landscape",
574 |           "value": "Nokia N9 landscape"
575 |         },
576 |         {
577 |           "title": "Pixel 2",
578 |           "value": "Pixel 2"
579 |         },
580 |         {
581 |           "title": "Pixel 2 Landscape",
582 |           "value": "Pixel 2 landscape"
583 |         },
584 |         {
585 |           "title": "Pixel 2 XL",
586 |           "value": "Pixel 2 XL"
587 |         },
588 |         {
589 |           "title": "Pixel 2 XL Landscape",
590 |           "value": "Pixel 2 XL landscape"
591 |         },
592 |         {
593 |           "title": "Pixel 3",
594 |           "value": "Pixel 3"
595 |         },
596 |         {
597 |           "title": "Pixel 3 Landscape",
598 |           "value": "Pixel 3 landscape"
599 |         },
600 |         {
601 |           "title": "Pixel 4",
602 |           "value": "Pixel 4"
603 |         },
604 |         {
605 |           "title": "Pixel 4 Landscape",
606 |           "value": "Pixel 4 landscape"
607 |         },
608 |         {
609 |           "title": "Pixel 4A 5G",
610 |           "value": "Pixel 4a (5G)"
611 |         },
612 |         {
613 |           "title": "Pixel 4A 5G Landscape",
614 |           "value": "Pixel 4a (5G) landscape"
615 |         },
616 |         {
617 |           "title": "Pixel 5",
618 |           "value": "Pixel 5"
619 |         },
620 |         {
621 |           "title": "Pixel 5 Landscape",
622 |           "value": "Pixel 5 landscape"
623 |         },
624 |         {
625 |           "title": "Moto G4",
626 |           "value": "Moto G4"
627 |         },
628 |         {
629 |           "title": "Moto G4 Landscape",
630 |           "value": "Moto G4 landscape"
631 |         }
632 |       ]
633 |     },
634 |     "select_links": {
635 |       "type": "string",
636 |       "required": false,
637 |       "title": "Select Links",
638 |       "description": "One or more selectors for extracting links, in the format [css selector]->[property to use],[css selector]->@[attribute to use]"
639 |     },
640 |     "click_selector": {
641 |       "type": "string",
642 |       "required": false,
643 |       "title": "Click Selector",
644 |       "description": "Selector for elements to click when using the autoclick behavior. Default is 'a'"
645 |     },
646 |     "block_rules": {
647 |       "type": "string",
648 |       "required": false,
649 |       "title": "Block Rules",
650 |       "description": "Additional rules for blocking certain URLs from being loaded, by URL regex and optionally via text match in an iframe"
651 |     },
652 |     "block_message": {
653 |       "type": "string",
654 |       "required": false,
655 |       "title": "Block Message",
656 |       "description": "If specified, when a URL is blocked, a record with this error message is added instead"
657 |     },
658 |     "block_ads": {
659 |       "type": "boolean",
660 |       "required": false,
661 |       "title": "Block Ads",
662 |       "description": "If set, block advertisements from being loaded (based on Steven Black's blocklist). Note that some bad domains are also blocked by zimit configuration even if this option is not set."
663 |     },
664 |     "ad_block_message": {
665 |       "type": "string",
666 |       "required": false,
667 |       "title": "Ads Block Message",
668 |       "description": "If specified, when an ad is blocked, a record with this error message is added instead"
669 |     },
670 |     "user_agent": {
671 |       "type": "string",
672 |       "required": false,
673 |       "title": "User Agent",
674 |       "description": "Override user-agent with specified string"
675 |     },
676 |     "user_agent_suffix": {
677 |       "type": "string",
678 |       "required": false,
679 |       "title": "User Agent Suffix",
680 |       "description": "Append suffix to existing browser user-agent. Defaults to +Zimit"
681 |     },
682 |     "use_sitemap": {
683 |       "type": "string",
684 |       "required": false,
685 |       "title": "Sitemap URL",
686 |       "description": "If set, use this URL as sitemap to get additional URLs for the crawl (usually at /sitemap.xml)"
687 |     },
688 |     "sitemap_from_date": {
689 |       "type": "string",
690 |       "required": false,
691 |       "title": "Sitemap From Date",
692 |       "description": "If set, filter URLs from sitemaps to those greater than or equal to (>=) provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)"
693 |     },
694 |     "sitemap_to_date": {
695 |       "type": "string",
696 |       "required": false,
697 |       "title": "Sitemap To Date",
698 |       "description": "If set, filter URLs from sitemaps to those less than or equal to (<=) provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)"
699 |     },
700 |     "behavior_timeout": {
701 |       "type": "integer",
702 |       "required": false,
703 |       "title": "Behavior Timeout",
704 |       "description": "If >0, timeout (in seconds) for the in-page behaviors run on each page. If 0, a behavior can run until it finishes. Default is 90.",
705 |       "min": 0
706 |     },
707 |     "post_load_delay": {
708 |       "type": "integer",
709 |       "required": false,
710 |       "title": "Post Load Delay",
711 |       "description": "If >0, amount of time to sleep (in seconds) after page has loaded, before taking screenshots / getting text / running behaviors. Default is 0.",
712 |       "min": 0
713 |     },
714 |     "page_extra_delay": {
715 |       "type": "integer",
716 |       "required": false,
717 |       "title": "Page Extra Delay",
718 |       "description": "If >0, amount of time to sleep (in seconds) after behaviors before moving on to next page. Default is 0.",
719 |       "min": 0
720 |     },
721 |     "dedup_policy": {
722 |       "type": "string-enum",
723 |       "required": false,
724 |       "title": "Dedup Policy",
725 |       "description": "Deduplication policy. One of skip, revisit or keep. Default is skip",
726 |       "choices": [
727 |         {
728 |           "title": "Skip",
729 |           "value": "skip"
730 |         },
731 |         {
732 |           "title": "Revisit",
733 |           "value": "revisit"
734 |         },
735 |         {
736 |           "title": "Keep",
737 |           "value": "keep"
738 |         }
739 |       ]
740 |     },
741 |     "screenshot": {
742 |       "type": "string",
743 |       "required": false,
744 |       "title": "Screenshot",
745 |       "description": "Screenshot options for crawler. One of view, thumbnail, fullPage, fullPageFinal or a comma-separated combination of those."
746 |     },
747 |     "size_soft_limit": {
748 |       "type": "integer",
749 |       "required": false,
750 |       "title": "Size Soft Limit",
751 |       "description": "If set, save crawl state and stop crawl if WARC size exceeds this value. ZIM will still be created.",
752 |       "min": 0
753 |     },
754 |     "size_hard_limit": {
755 |       "type": "integer",
756 |       "required": false,
757 |       "title": "Size Hard Limit",
758 |       "description": "If set, exit crawler and fail the scraper immediately if WARC size exceeds this value",
759 |       "min": 0
760 |     },
761 |     "disk_utilization": {
762 |       "type": "integer",
763 |       "required": false,
764 |       "title": "Disk Utilization",
765 |       "description": "Save state and exit if disk utilization exceeds this percentage value. Default (if not set) is 90%. Set to 0 to disable disk utilization check.",
766 |       "min": 0
767 |     },
768 |     "time_soft_limit": {
769 |       "type": "integer",
770 |       "required": false,
771 |       "title": "Time Soft Limit",
772 |       "description": "If set, save crawl state and stop crawl if WARC(s) creation takes longer than this value, in seconds. ZIM will still be created.",
773 |       "min": 0
774 |     },
775 |     "time_hard_limit": {
776 |       "type": "integer",
777 |       "required": false,
778 |       "title": "Time Hard Limit",
779 |       "description": "If set, exit crawler and fail the scraper immediately if WARC(s) creation takes longer than this value, in seconds",
780 |       "min": 0
781 |     },
782 |     "net_idle_wait": {
783 |       "type": "integer",
784 |       "required": false,
785 |       "title": "Net Idle Wait",
786 |       "description": "If set, wait for network idle after page load and after behaviors are done (in seconds). If -1 (default), determine based on scope."
787 |     },
788 |     "origin_override": {
789 |       "type": "string",
790 |       "required": false,
791 |       "title": "Origin Override",
792 |       "description": "If set, will redirect requests from each origin in key to origin in the value, eg. https://host:port=http://alt-host:alt-port."
793 |     },
794 |     "max_page_retries": {
795 |       "type": "integer",
796 |       "required": false,
797 |       "title": "Max Page Retries",
798 |       "description": "If set, number of times to retry a page that failed to load before page is considered to have failed. Default is 2.",
799 |       "min": 0
800 |     },
801 |     "fail_on_failed_seed": {
802 |       "type": "boolean",
803 |       "required": false,
804 |       "title": "Fail on failed seed",
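805 |       "description": "If set, crawler will fail with exit code 1 if any seed fails. When combined with --failOnInvalidStatus, will result in crawl failing with exit code 1 if any seed has a 4xx/5xx response"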
806 |     },
807 |     "fail_on_invalid_status": {
808 |       "type": "boolean",
809 |       "required": false,
810 |       "title": "Fail on invalid status",
811 |       "description": "If set, will treat pages with 4xx or 5xx response as failures. When combined with --failOnFailedLimit or --failOnFailedSeed may result in crawl failing due to non-200 responses"
812 |     },
813 |     "fail_on_failed_limit": {
814 |       "type": "integer",
815 |       "required": false,
816 |       "title": "Fail on failed - Limit",
817 |       "description": "If set, save state and exit if number of failed pages exceeds this value.",
818 |       "min": 0
819 |     },
820 |     "warcs": {
821 |       "type": "string",
822 |       "required": false,
823 |       "title": "WARC files",
824 |       "description": "Comma-separated list of WARC files to use as input."
825 |     },
826 |     "verbose": {
827 |       "type": "boolean",
828 |       "required": false,
829 |       "title": "Verbose mode",
830 |       "description": "Whether to display additional logs"
831 |     },
832 |     "keep": {
833 |       "type": "boolean",
834 |       "required": false,
835 |       "title": "Keep",
836 |       "description": "Should be True. Developer option: must be True if we want to keep the WARC files for artifacts archiving.",
837 |       "default": true
838 |     },
839 |     "output": {
840 |       "type": "string",
841 |       "required": false,
842 |       "title": "Output folder",
843 |       "description": "Output folder for ZIM file(s). Leave it as `/output`",
844 |       "pattern": "^/output$"
845 |     },
846 |     "admin_email": {
847 |       "type": "email",
848 |       "required": false,
849 |       "title": "Admin Email",
850 |       "description": "Admin Email for crawler: used in UserAgent so website admin can contact us",
851 |       "default": "contact+zimfarm@kiwix.org"
852 |     },
853 |     "profile": {
854 |       "type": "string",
855 |       "required": false,
856 |       "title": "Browser profile",
857 |       "description": "Path or HTTP(S) URL to tar.gz file which contains the browser profile directory for Browsertrix crawler."
858 |     },
859 |     "behaviors": {
860 |       "type": "string",
861 |       "required": false,
862 |       "title": "Behaviors",
863 |       "description": "Which background behaviors to enable on each page. Defaults to autoplay,autofetch,siteSpecific."
864 |     },
865 |     "depth": {
866 |       "type": "integer",
867 |       "required": false,
868 |       "title": "Depth",
869 |       "description": "The depth of the crawl for all seeds. Default is -1 (infinite).",
870 |       "min": -1
871 |     },
872 |     "zim_lang": {
873 |       "type": "string",
874 |       "required": false,
875 |       "title": "ZIM Language",
876 |       "description": "Language metadata of ZIM (warc2zim --lang param). ISO-639-3 code. Retrieved from homepage if found, fallback to `eng`",
877 |       "alias": "zim-lang",
878 |       "customValidator": "language_code"
879 |     },
880 |     "long_description": {
881 |       "type": "string",
882 |       "required": false,
883 |       "title": "Long description",
884 |       "description": "Optional long description for your ZIM",
885 |       "minLength": 1,
886 |       "maxLength": 4000,
887 |       "alias": "long-description"
888 |     },
889 |     "custom_css": {
890 |       "type": "url",
891 |       "required": false,
892 |       "title": "Custom CSS",
893 |       "description": "URL to a CSS file to inject into pages",
894 |       "alias": "custom-css"
895 |     },
896 |     "charsets_to_try": {
897 |       "type": "string",
898 |       "required": false,
899 |       "title": "Charsets to try",
900 |       "description": "List of charsets to try to decode content when charset is not found",
901 |       "alias": "charsets-to-try"
902 |     },
903 |     "ignore_content_header_charsets": {
904 |       "type": "boolean",
905 |       "required": false,
906 |       "title": "Ignore Content Header Charsets",
907 |       "description": "Ignore the charsets specified in content headers - first bytes - typically because they are wrong.",
908 |       "alias": "ignore-content-header-charsets"
909 |     },
910 |     "content_header_bytes_length": {
911 |       "type": "integer",
912 |       "required": false,
913 |       "title": "Content Header Bytes Length",
914 |       "description": "How many bytes to consider when searching for content charsets in header (default is 1024).",
915 |       "alias": "content-header-bytes-length",
916 |       "min": 0
917 |     },
918 |     "ignore_http_header_charsets": {
919 |       "type": "boolean",
920 |       "required": false,
921 |       "title": "Ignore HTTP Header Charsets",
922 |       "description": "Ignore the charsets specified in HTTP `Content-Type` headers, typically because they are wrong.",
923 |       "alias": "ignore-http-header-charsets"
924 |     },
925 |     "encoding_aliases": {
926 |       "type": "string",
927 |       "required": false,
928 |       "title": "Encoding Aliases",
929 |       "description": "List of encoding/charset aliases to decode WARC content. Aliases are used when the encoding specified in upstream server exists in Python under a different name. This parameter is single string, multiple values are separated by a comma, like in alias1=encoding1,alias2=encoding2.",
930 |       "alias": "encoding-aliases"
931 |     },
932 |     "custom_behaviors": {
933 |       "type": "string",
934 |       "required": false,
935 |       "title": "Custom Behaviors",
936 |       "description": "JS code for custom behaviors to customize crawler. Single string with individual JS files URL/path separated by a comma.",
937 |       "alias": "custom-behaviours"
938 |     },
939 |     "zimit_progress_file": {
940 |       "type": "string",
941 |       "required": false,
942 |       "title": "Zimit Progress File",
943 |       "description": "Scraping progress file. Leave it as `/output/task_progress.json`",
944 |       "alias": "zimit-progress-file",
945 |       "pattern": "^/output/task_progress\\.json$"
946 |     },
947 |     "replay_viewer_source": {
948 |       "type": "url",
949 |       "required": false,
950 |       "title": "Replay Viewer Source",
951 |       "description": "URL from which to load the ReplayWeb.page replay viewer",
952 |       "alias": "replay-viewer-source"
953 |     },
954 |     "zim_file": {
955 |       "type": "string",
956 |       "required": false,
957 |       "title": "ZIM filename",
958 |       "description": "ZIM file name (based on --name if not provided). Include {period} to insert date period dynamically",
959 |       "alias": "zim-file",
960 |       "pattern": "^([a-z0-9\\-\\.]+_)([a-z\\-]+_)([a-z0-9\\-\\.]+_)([a-z0-9\\-\\.]+_|)([\\d]{4}-[\\d]{2}|\\{period\\}).zim$",
961 |       "relaxedPattern": "^[A-Za-z0-9._-]+$"
962 |     },
963 |     "name": {
964 |       "type": "string",
965 |       "required": true,
966 |       "title": "ZIM name",
967 |       "description": "Name of the ZIM.",
968 |       "alias": "name",
969 |       "pattern": "^([a-z0-9\\-\\.]+_)([a-z\\-]+_)([a-z0-9\\-\\.]+)$",
970 |       "relaxedPattern": "^[A-Za-z0-9._-]+$"
971 |     }
972 |   }
973 | }
974 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
  1 | [build-system]
  2 | requires = ["hatchling", "hatch-openzim"]
  3 | build-backend = "hatchling.build"
  4 | 
  5 | [project]
  6 | name = "zimit"
  7 | requires-python = ">=3.13,<3.14"
  8 | description = "Make ZIM file from any website through crawling"
  9 | readme = "README.md"
 10 | dependencies = [
 11 |   "requests==2.32.3",
 12 |   "inotify==0.2.10",
 13 |   "tld==0.13",
 14 |   "warc2zim @ git+https://github.com/openzim/warc2zim@main",
 15 | ]
 16 | dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]
 17 | 
 18 | [tool.hatch.metadata.hooks.openzim-metadata]
 19 | kind = "scraper"
 20 | 
 21 | [tool.hatch.metadata]
 22 | allow-direct-references = true  # to be removed once we use a released warc2zim version
 23 | 
 24 | [project.optional-dependencies]
 25 | scripts = [
 26 |   "invoke==2.2.0",
 27 | ]
 28 | lint = [
 29 |   "black==25.1.0",
 30 |   "ruff==0.9.4",
 31 | ]
 32 | check = [
 33 |   "pyright==1.1.393",
 34 | ]
 35 | test = [
 36 |   "pytest==8.3.4",
 37 |   "coverage==7.6.10",
 38 | ]
 39 | dev = [
 40 |   "pre-commit==4.1.0",
 41 |   "debugpy==1.8.12",
 42 |   "selenium==4.28.1", # used in daily tests, convenient for dev purpose (autocompletion)
 43 |   "zimit[scripts]",
 44 |   "zimit[lint]",
 45 |   "zimit[test]",
 46 |   "zimit[check]",
 47 | ]
 48 | 
 49 | [project.scripts]
 50 | zimit = "zimit:zimit.zimit"
 51 | 
 52 | [tool.hatch.version]
 53 | path = "src/zimit/__about__.py"
 54 | 
 55 | [tool.hatch.build]
 56 | exclude = [
 57 |   "/.github",
 58 | ]
 59 | 
 60 | [tool.hatch.build.targets.wheel]
 61 | packages = ["src/zimit"]
 62 | 
 63 | [tool.hatch.envs.default]
 64 | features = ["dev"]
 65 | 
 66 | [tool.hatch.envs.test]
 67 | features = ["scripts", "test"]
 68 | 
 69 | [tool.hatch.envs.test.scripts]
 70 | run = "inv test --args '{args}'"
 71 | run-cov = "inv test-cov --args '{args}'"
 72 | report-cov = "inv report-cov"
 73 | coverage = "inv coverage --args '{args}'"
 74 | html = "inv coverage --html --args '{args}'"
 75 | 
 76 | [tool.hatch.envs.lint]
 77 | template = "lint"
 78 | skip-install = false
 79 | features = ["scripts", "lint"]
 80 | 
 81 | [tool.hatch.envs.lint.scripts]
 82 | black = "inv lint-black --args '{args}'"
 83 | ruff = "inv lint-ruff --args '{args}'"
 84 | all = "inv lintall --args '{args}'"
 85 | fix-black = "inv fix-black --args '{args}'"
 86 | fix-ruff = "inv fix-ruff --args '{args}'"
 87 | fixall = "inv fixall --args '{args}'"
 88 | 
 89 | [tool.hatch.envs.check]
 90 | features = ["scripts", "check"]
 91 | 
 92 | [tool.hatch.envs.check.scripts]
 93 | pyright = "inv check-pyright --args '{args}'"
 94 | all = "inv checkall --args '{args}'"
 95 | 
 96 | [tool.black]
 97 | line-length = 88
 98 | target-version = ['py313']
 99 | 
100 | [tool.ruff]
101 | target-version = "py313"
102 | line-length = 88
103 | src = ["src"]
104 | 
105 | [tool.ruff.lint]
106 | select = [
107 |   "A",  # flake8-builtins
108 |   # "ANN",  # flake8-annotations
109 |   "ARG",  # flake8-unused-arguments
110 |   # "ASYNC",  # flake8-async
111 |   "B",  # flake8-bugbear
112 |   # "BLE",  # flake8-blind-except
113 |   "C4",  # flake8-comprehensions
114 |   "C90",  # mccabe
115 |   # "COM",  # flake8-commas
116 |   # "D",  # pydocstyle
117 |   # "DJ",  # flake8-django
118 |   "DTZ",  # flake8-datetimez
119 |   "E",  # pycodestyle (default)
120 |   "EM",  # flake8-errmsg
121 |   # "ERA",  # eradicate
122 |   # "EXE",  # flake8-executable
123 |   "F",  # Pyflakes (default)
124 |   # "FA",  # flake8-future-annotations
125 |   "FBT",  # flake8-boolean-trap
126 |   # "FLY",  # flynt
127 |   # "G",  # flake8-logging-format
128 |   "I",  # isort
129 |   "ICN",  # flake8-import-conventions
130 |   # "INP",  # flake8-no-pep420
131 |   # "INT",  # flake8-gettext
132 |   "ISC",  # flake8-implicit-str-concat
133 |   "N",  # pep8-naming
134 |   # "NPY",  # NumPy-specific rules
135 |   # "PD",  # pandas-vet
136 |   # "PGH",  # pygrep-hooks
137 |   # "PIE",  # flake8-pie
138 |   # "PL",  # Pylint
139 |   "PLC",  # Pylint: Convention
140 |   "PLE",  # Pylint: Error
141 |   "PLR",  # Pylint: Refactor
142 |   "PLW",  # Pylint: Warning
143 |   # "PT",  # flake8-pytest-style
144 |   # "PTH",  # flake8-use-pathlib
145 |   # "PYI",  # flake8-pyi
146 |   "Q",  # flake8-quotes
147 |   # "RET",  # flake8-return
148 |   # "RSE",  # flake8-raise
149 |   "RUF",  # Ruff-specific rules
150 |   "S",  # flake8-bandit
151 |   # "SIM",  # flake8-simplify
152 |   # "SLF",  # flake8-self
153 |   "T10",  # flake8-debugger
154 |   "T20",  # flake8-print
155 |   # "TCH",  # flake8-type-checking
156 |   # "TD",  # flake8-todos
157 |   "TID",  # flake8-tidy-imports
158 |   # "TRY",  # tryceratops
159 |   "UP",  # pyupgrade
160 |   "W",  # pycodestyle
161 |   "YTT",  # flake8-2020
162 | ]
163 | ignore = [
164 |   # Allow non-abstract empty methods in abstract base classes
165 |   "B027",
166 |   # Disable flake8-errmsg since we consider it bloats the code and provides limited value
167 |   "EM",
168 |   # Allow boolean positional values in function calls, like `dict.get(... True)`
169 |   "FBT003",
170 |   # Ignore checks for possible passwords
171 |   "S105", "S106", "S107",
172 |   # Ignore warnings on subprocess.run / popen
173 |   "S603",
174 |   # Ignore complexity
175 |   "C901", "PLR0911", "PLR0912", "PLR0913", "PLR0915",
176 | ]
177 | unfixable = [
178 |   # Don't touch unused imports
179 |   "F401",
180 | ]
181 | 
182 | [tool.ruff.lint.isort]
183 | known-first-party = ["zimit"]
184 | 
185 | [tool.ruff.lint.flake8-bugbear]
186 | # add exceptions to B008 for fastapi.
187 | extend-immutable-calls = ["fastapi.Depends", "fastapi.Query"]
188 | 
189 | [tool.ruff.lint.flake8-tidy-imports]
190 | ban-relative-imports = "all"
191 | 
192 | [tool.ruff.lint.per-file-ignores]
193 | # Tests can use magic values, assertions, and relative imports
194 | "tests**/**/*" = ["PLR2004", "S101", "TID252"]
195 | 
196 | [tool.pytest.ini_options]
197 | minversion = "7.3"
198 | testpaths = ["tests"]
199 | pythonpath = [".", "src"]
200 | 
201 | [tool.coverage.paths]
202 | zimit = ["src/zimit"]
203 | tests = ["tests"]
204 | 
205 | [tool.coverage.run]
206 | source_pkgs = ["zimit"]
207 | branch = true
208 | parallel = true
209 | omit = [
210 |   "src/zimit/__about__.py",
211 | ]
212 | 
213 | [tool.coverage.report]
214 | exclude_lines = [
215 |   "no cov",
216 |   "if __name__ == .__main__.:",
217 |   "if TYPE_CHECKING:",
218 | ]
219 | 
220 | [tool.pyright]
221 | include = ["src", "tests", "tasks.py"]
222 | exclude = [".env/**", ".venv/**"]
223 | extraPaths = ["src"]
224 | pythonVersion = "3.13"
225 | typeCheckingMode="basic"
226 | 


--------------------------------------------------------------------------------
/src/zimit/__about__.py:
--------------------------------------------------------------------------------
1 | __version__ = "3.0.6-dev0"
2 | 


--------------------------------------------------------------------------------
/src/zimit/constants.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | 
 3 | from zimscraperlib.logging import getLogger
 4 | 
 5 | EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2
 6 | EXIT_CODE_CRAWLER_SIZE_LIMIT_HIT = 14
 7 | EXIT_CODE_CRAWLER_TIME_LIMIT_HIT = 15
 8 | NORMAL_WARC2ZIM_EXIT_CODE = 100
 9 | REQUESTS_TIMEOUT = 10  # seconds; used as `timeout` for requests.get in utils.py
10 | 
11 | logger = getLogger(name="zimit", level=logging.INFO)
12 | 


--------------------------------------------------------------------------------
/src/zimit/utils.py:
--------------------------------------------------------------------------------
 1 | from pathlib import Path
 2 | 
 3 | import requests
 4 | 
 5 | from zimit.constants import REQUESTS_TIMEOUT
 6 | 
 7 | 
 8 | def download_file(url: str, fpath: Path):
 9 |     """Download file from url to fpath with streaming"""
10 |     with requests.get(url, timeout=REQUESTS_TIMEOUT, stream=True) as resp:
11 |         resp.raise_for_status()
12 |         with open(fpath, "wb") as f:
13 |             for chunk in resp.iter_content(chunk_size=8192):
14 |                 f.write(chunk)
15 | 


--------------------------------------------------------------------------------
/src/zimit/zimit.py:
--------------------------------------------------------------------------------
   1 | """
   2 | Main zimit run script
   3 | This script validates arguments with warc2zim, checks permissions
   4 | and then calls the Node based driver
   5 | """
   6 | 
   7 | import atexit
   8 | import json
   9 | import re
  10 | import shutil
  11 | import signal
  12 | import subprocess
  13 | import sys
  14 | import tarfile
  15 | import tempfile
  16 | import urllib.parse
  17 | from argparse import ArgumentParser
  18 | from multiprocessing import Process
  19 | from pathlib import Path
  20 | 
  21 | import inotify
  22 | import inotify.adapters
  23 | from warc2zim.main import main as warc2zim
  24 | from zimscraperlib.uri import rebuild_uri
  25 | 
  26 | from zimit.__about__ import __version__
  27 | from zimit.constants import (
  28 |     EXIT_CODE_CRAWLER_SIZE_LIMIT_HIT,
  29 |     EXIT_CODE_CRAWLER_TIME_LIMIT_HIT,
  30 |     EXIT_CODE_WARC2ZIM_CHECK_FAILED,
  31 |     NORMAL_WARC2ZIM_EXIT_CODE,
  32 |     logger,
  33 | )
  34 | from zimit.utils import download_file
  35 | 
  36 | # Temporary working directory of the current run: assigned by run() and deleted
  37 | # by cleanup(); stays None until run() has created it
  38 | temp_root_dir: Path | None = None
  37 | 
  38 | 
  39 | class ProgressFileWatcher:
  40 |     """Merge crawler and warc2zim progress files into one zimit progress file
  41 | 
  42 |     A child process watches both input JSON files with inotify and, every time one
  43 |     of them is modified, rewrites the zimit file as a `done` / `total` JSON object.
  44 |     Crawl is counted as 90% of the overall workload and warc2zim as the last 10%.
  45 |     """
  46 | 
  47 |     def __init__(
  48 |         self, crawl_stats_path: Path, warc2zim_stats_path, zimit_stats_path: Path
  49 |     ):
  50 |         """Store the three paths and create both input files if missing
  51 | 
  52 |         `warc2zim_stats_path` is not annotated but `.touch()` is called on it below,
  53 |         so it has to be a `Path` like the two others.
  54 |         """
  55 |         self.crawl_stats_path = crawl_stats_path
  56 |         self.warc2zim_stats_path = warc2zim_stats_path
  57 |         self.zimit_stats_path = zimit_stats_path
  58 | 
  59 |         # touch both watched files so inotify is not unhappy on add_watch ; the
  60 |         # zimit file is only ever written to, hence not touched here
  61 |         self.crawl_stats_path.touch()
  62 |         self.warc2zim_stats_path.touch()
  63 |         # watcher child process, set by watch()
  64 |         self.process = None
  65 | 
  66 |     def stop(self):
  67 |         """Stop the watcher process ; no-op if watch() has not been called"""
  68 |         if not self.process:
  69 |             return
  70 |         # wait at most 0.1s for the process to end, then terminate it in any case
  71 |         self.process.join(0.1)
  72 |         self.process.terminate()
  73 | 
  74 |     def watch(self):
  75 |         """Start inotify_watcher in a daemon child process
  76 | 
  77 |         Paths are handed over to the child process as plain strings.
  78 |         """
  79 |         self.process = Process(
  80 |             target=self.inotify_watcher,
  81 |             args=(
  82 |                 str(self.crawl_stats_path),
  83 |                 str(self.warc2zim_stats_path),
  84 |                 str(self.zimit_stats_path),
  85 |             ),
  86 |         )
  87 |         self.process.daemon = True
  88 |         self.process.start()
  89 | 
  90 |     def inotify_watcher(self, crawl_fpath: str, warc2zim_fpath: str, zimit_fpath: str):
  91 |         """Process target: turn every input file modification into zimit progress
  92 | 
  93 |         Loops over the inotify events of both input files for as long as the event
  94 |         generator yields. On each event, the modified file is parsed as JSON,
  95 |         converted with the function matching its path and the result is dumped to
  96 |         `zimit_fpath`. Any error while reading/converting skips this update.
  97 |         """
  98 |         ino = inotify.adapters.Inotify()
  99 |         ino.add_watch(crawl_fpath, inotify.constants.IN_MODIFY)  # pyright: ignore
 100 |         ino.add_watch(warc2zim_fpath, inotify.constants.IN_MODIFY)  # pyright: ignore
 101 | 
 102 |         def crawl_conv(data):
 103 |             # we consider crawl to be 90% of the workload so the crawler total is
 104 |             # 90% of the overall total: total = crawl_total / 0.9
 105 |             return {
 106 |                 "done": data["crawled"],
 107 |                 "total": int(data["total"] / 0.9),
 108 |             }
 109 | 
 110 |         def warc2zim_conv(data):
 111 |             # we consider warc2zim to be the last 10% of the workload so
 112 |             # done = 90% of total + 10% of total scaled by written / total
 113 |             # (a total of 0 raises ZeroDivisionError, ignored by the caller below)
 114 |             return {
 115 |                 "done": int(
 116 |                     data["total"]
 117 |                     * (0.9 + (float(data["written"]) / data["total"]) / 10)
 118 |                 ),
 119 |                 "total": data["total"],
 120 |             }
 121 | 
 122 |         # only the third item of each event is used: it is matched against the
 123 |         # two watched paths to select the converter
 124 |         for _, _, fpath, _ in ino.event_gen(yield_nones=False):  # pyright: ignore
 125 |             func = {crawl_fpath: crawl_conv, warc2zim_fpath: warc2zim_conv}.get(fpath)
 126 |             if not func:
 127 |                 continue
 128 |             # open input and output separately so as to not clear output on error
 129 |             with open(fpath) as ifh:
 130 |                 try:
 131 |                     out = func(json.load(ifh))
 132 |                 except Exception:  # nosec # noqa: S112
 133 |                     # simply ignore progress update should an error arise
 134 |                     # might be malformed input for instance
 135 |                     continue
 136 |                 if not out:
 137 |                     continue
 138 |                 with open(zimit_fpath, "w") as ofh:
 139 |                     json.dump(out, ofh)
 109 | 
 110 | 
 111 | def cleanup():
 112 |     if not temp_root_dir:
 113 |         logger.warning("Temporary root dir not already set, cannot clean this up")
 114 |         return
 115 |     logger.info("")
 116 |     logger.info("----------")
 117 |     logger.info(f"Cleanup, removing temp dir: {temp_root_dir}")
 118 |     shutil.rmtree(temp_root_dir)
 119 | 
 120 | 
 121 | def cancel_cleanup():
 122 |     logger.info(
 123 |         f"Temporary files have been kept in {temp_root_dir}, please clean them"
 124 |         " up manually once you don't need them anymore"
 125 |     )
 126 |     atexit.unregister(cleanup)
 127 | 
 128 | 
 129 | def run(raw_args):
 130 |     parser = ArgumentParser(
 131 |         description="Run a browser-based crawl on the specified URL and convert to ZIM"
 132 |     )
 133 | 
 134 |     parser.add_argument(
 135 |         "--seeds",
 136 |         help="The seed URL(s) to start crawling from. Multile seed URL must be "
 137 |         "separated by a comma (usually not needed, these are just the crawl seeds). "
 138 |         "First seed URL is used as ZIM homepage",
 139 |     )
 140 | 
 141 |     parser.add_argument("--title", help="WARC and ZIM title")
 142 |     parser.add_argument("--description", help="WARC and ZIM description")
 143 |     parser.add_argument("--long-description", help="ZIM long description metadata")
 144 | 
 145 |     parser.add_argument(
 146 |         "--seedFile",
 147 |         help="If set, read a list of seed urls, one per line. Can be a local file or "
 148 |         "the HTTP(s) URL to an online file.",
 149 |     )
 150 | 
 151 |     parser.add_argument(
 152 |         "-w", "--workers", type=int, help="Number of parallel workers. Default is 1."
 153 |     )
 154 | 
 155 |     parser.add_argument(
 156 |         "--crawlId",
 157 |         help="A user provided ID for this crawl or crawl configuration (can also be "
 158 |         "set via CRAWL_ID env var, defaults to machine hostname)",
 159 |     )
 160 | 
 161 |     parser.add_argument(
 162 |         "--waitUntil",
 163 |         help="Puppeteer page.goto() condition to wait for before continuing. One of "
 164 |         "load, domcontentloaded, networkidle0 or networkidle2, or a "
 165 |         "comma-separated combination of those. Default is load,networkidle2",
 166 |     )
 167 | 
 168 |     parser.add_argument(
 169 |         "--depth",
 170 |         help="The depth of the crawl for all seeds. Default is -1 (infinite).",
 171 |         type=int,
 172 |     )
 173 | 
 174 |     parser.add_argument(
 175 |         "--extraHops",
 176 |         help="Number of extra 'hops' to follow, beyond the current scope. "
 177 |         "Default is 0.",
 178 |         type=int,
 179 |     )
 180 | 
 181 |     parser.add_argument(
 182 |         "--pageLimit",
 183 |         help="Limit crawl to this number of pages. Default is 0 (no limit).",
 184 |         type=int,
 185 |     )
 186 | 
 187 |     parser.add_argument(
 188 |         "--maxPageLimit",
 189 |         help="Maximum pages to crawl, overriding pageLimit if both are set. Default is "
 190 |         "0 (no limit)",
 191 |         type=int,
 192 |     )
 193 | 
 194 |     parser.add_argument(
 195 |         "--pageLoadTimeout",
 196 |         help="Timeout for each page to load (in seconds). Default is 90 secs.",
 197 |         type=int,
 198 |     )
 199 | 
 200 |     parser.add_argument(
 201 |         "--scopeType",
 202 |         help="A predfined scope of the crawl. For more customization, "
 203 |         "use 'custom' and set scopeIncludeRx/scopeExcludeRx regexes. Default is custom"
 204 |         "if scopeIncludeRx is set, prefix otherwise.",
 205 |         choices=["page", "page-spa", "prefix", "host", "domain", "any", "custom"],
 206 |     )
 207 | 
 208 |     parser.add_argument(
 209 |         "--scopeIncludeRx",
 210 |         help="Regex of page URLs that should be included in the crawl (defaults to "
 211 |         "the immediate directory of URL)",
 212 |     )
 213 | 
 214 |     parser.add_argument(
 215 |         "--scopeExcludeRx",
 216 |         help="Regex of page URLs that should be excluded from the crawl",
 217 |     )
 218 | 
 219 |     parser.add_argument(
 220 |         "--allowHashUrls",
 221 |         help="Allow Hashtag URLs, useful for single-page-application crawling or "
 222 |         "when different hashtags load dynamic content",
 223 |         action="store_true",
 224 |     )
 225 | 
 226 |     parser.add_argument(
 227 |         "--selectLinks",
 228 |         help="One or more selectors for extracting links, in the format "
 229 |         "[css selector]->[property to use],[css selector]->@[attribute to use]",
 230 |     )
 231 | 
 232 |     parser.add_argument(
 233 |         "--clickSelector",
 234 |         help="Selector for elements to click when using the autoclick behavior. Default"
 235 |         " is 'a'",
 236 |     )
 237 | 
 238 |     parser.add_argument(
 239 |         "--blockRules",
 240 |         help="Additional rules for blocking certain URLs from being loaded, by URL "
 241 |         "regex and optionally via text match in an iframe",
 242 |     )
 243 | 
 244 |     parser.add_argument(
 245 |         "--blockMessage",
 246 |         help="If specified, when a URL is blocked, a record with this error message is"
 247 |         " added instead",
 248 |     )
 249 | 
 250 |     parser.add_argument(
 251 |         "--blockAds",
 252 |         help="If set, block advertisements from being loaded (based on Stephen Black's"
 253 |         " blocklist). Note that some bad domains are also blocked by zimit"
 254 |         " configuration even if this option is not set.",
 255 |     )
 256 | 
 257 |     parser.add_argument(
 258 |         "--adBlockMessage",
 259 |         help="If specified, when an ad is blocked, a record with this error message is"
 260 |         " added instead",
 261 |     )
 262 | 
 263 |     parser.add_argument(
 264 |         "--collection",
 265 |         help="Collection name to crawl to (replay will be accessible "
 266 |         "under this name in pywb preview). Default is crawl-@ts.",
 267 |     )
 268 | 
 269 |     parser.add_argument(
 270 |         "--headless",
 271 |         help="Run in headless mode, otherwise start xvfb",
 272 |         action="store_true",
 273 |     )
 274 | 
 275 |     parser.add_argument(
 276 |         "--driver",
 277 |         help="Custom driver for the crawler, if any",
 278 |     )
 279 | 
 280 |     parser.add_argument(
 281 |         "--generateCDX",
 282 |         help="If set, generate index (CDXJ) for use with pywb after crawl is done",
 283 |         action="store_true",
 284 |     )
 285 | 
 286 |     parser.add_argument(
 287 |         "--combineWARC",
 288 |         help="If set, combine the warcs",
 289 |         action="store_true",
 290 |     )
 291 | 
 292 |     parser.add_argument(
 293 |         "--rolloverSize",
 294 |         help="If set, declare the rollover size. Default is 1000000000.",
 295 |         type=int,
 296 |     )
 297 | 
 298 |     parser.add_argument(
 299 |         "--generateWACZ",
 300 |         help="If set, generate WACZ on disk",
 301 |         action="store_true",
 302 |     )
 303 | 
 304 |     parser.add_argument(
 305 |         "--logging",
 306 |         help="Crawler logging configuration",
 307 |     )
 308 | 
 309 |     parser.add_argument(
 310 |         "--logLevel",
 311 |         help="Comma-separated list of log levels to include in logs",
 312 |     )
 313 | 
 314 |     parser.add_argument(
 315 |         "--logContext",
 316 |         help="Comma-separated list of contexts to include in logs",
 317 |         choices=[
 318 |             "general",
 319 |             "worker",
 320 |             "recorder",
 321 |             "recorderNetwork",
 322 |             "writer",
 323 |             "state",
 324 |             "redis",
 325 |             "storage",
 326 |             "text",
 327 |             "exclusion",
 328 |             "screenshots",
 329 |             "screencast",
 330 |             "originOverride",
 331 |             "healthcheck",
 332 |             "browser",
 333 |             "blocking",
 334 |             "behavior",
 335 |             "behaviorScript",
 336 |             "jsError",
 337 |             "fetch",
 338 |             "pageStatus",
 339 |             "memoryStatus",
 340 |             "crawlStatus",
 341 |             "links",
 342 |             "sitemap",
 343 |             "wacz",
 344 |             "replay",
 345 |             "proxy",
 346 |         ],
 347 |     )
 348 | 
 349 |     parser.add_argument(
 350 |         "--logExcludeContext",
 351 |         help="Comma-separated list of contexts to NOT include in logs. Default is "
 352 |         "recorderNetwork,jsError,screencast",
 353 |         choices=[
 354 |             "general",
 355 |             "worker",
 356 |             "recorder",
 357 |             "recorderNetwork",
 358 |             "writer",
 359 |             "state",
 360 |             "redis",
 361 |             "storage",
 362 |             "text",
 363 |             "exclusion",
 364 |             "screenshots",
 365 |             "screencast",
 366 |             "originOverride",
 367 |             "healthcheck",
 368 |             "browser",
 369 |             "blocking",
 370 |             "behavior",
 371 |             "behaviorScript",
 372 |             "jsError",
 373 |             "fetch",
 374 |             "pageStatus",
 375 |             "memoryStatus",
 376 |             "crawlStatus",
 377 |             "links",
 378 |             "sitemap",
 379 |             "wacz",
 380 |             "replay",
 381 |             "proxy",
 382 |         ],
 383 |     )
 384 | 
 385 |     parser.add_argument(
 386 |         "--text",
 387 |         help="Extract initial (default) or final text to pages.jsonl or WARC resource"
 388 |         " record(s)",
 389 |     )
 390 | 
 391 |     # cwd is manipulated directly by zimit, based on --output / --build, we do not want
 392 |     # to expose this setting
 393 | 
 394 |     parser.add_argument(
 395 |         "--mobileDevice",
 396 |         help="Emulate mobile device by name from "
 397 |         "https://github.com/puppeteer/puppeteer/blob/"
 398 |         "main/packages/puppeteer-core/src/common/Device.ts",
 399 |     )
 400 | 
 401 |     parser.add_argument(
 402 |         "--userAgent",
 403 |         help="Override default user-agent with specified value ; --userAgentSuffix and "
 404 |         "--adminEmail have no effect when this is set",
 405 |     )
 406 | 
 407 |     parser.add_argument(
 408 |         "--userAgentSuffix",
 409 |         help="Append suffix to existing browser user-agent "
 410 |         "(ex: +MyCrawler, info@example.com)",
 411 |         default="+Zimit",
 412 |     )
 413 | 
 414 |     parser.add_argument(
 415 |         "--useSitemap",
 416 |         help="If set, use the URL as sitemap to get additional URLs for the crawl "
 417 |         "(usually /sitemap.xml)",
 418 |     )
 419 | 
 420 |     parser.add_argument(
 421 |         "--sitemapFromDate",
 422 |         help="If set, filter URLs from sitemaps to those greater than or equal to (>=)"
 423 |         " provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)",
 424 |     )
 425 | 
 426 |     parser.add_argument(
 427 |         "--sitemapToDate",
 428 |         help="If set, filter URLs from sitemaps to those less than or equal to (<=) "
 429 |         "provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)",
 430 |     )
 431 | 
 432 |     parser.add_argument(
 433 |         "--statsFilename",
 434 |         help="If set, output crawl stats as JSON to this file. Relative filename "
 435 |         "resolves to output directory, see --output.",
 436 |     )
 437 | 
 438 |     parser.add_argument(
 439 |         "--zimit-progress-file",
 440 |         help="If set, output zimit stats as JSON to this file. Forces the creation of"
 441 |         "crawler and warc2zim stats as well. If --statsFilename and/or "
 442 |         "--warc2zim-progress-file are not set, default temporary files will be used. "
 443 |         "Relative filename resolves to output directory, see --output.",
 444 |     )
 445 | 
 446 |     parser.add_argument(
 447 |         "--warc2zim-progress-file",
 448 |         help="If set, output warc2zim stats as JSON to this file. Relative filename "
 449 |         "resolves to output directory, see --output.",
 450 |     )
 451 | 
 452 |     parser.add_argument(
 453 |         "--behaviors",
 454 |         help="Which background behaviors to enable on each page. Default is autoplay,"
 455 |         "autofetch,autoscroll,siteSpecific",
 456 |     )
 457 | 
 458 |     parser.add_argument(
 459 |         "--behaviorTimeout",
 460 |         help="If >0, timeout (in seconds) for in-page behavior will run on each page. "
 461 |         "If 0, a behavior can run until finish. Default is 90.",
 462 |         type=int,
 463 |     )
 464 | 
 465 |     parser.add_argument(
 466 |         "--postLoadDelay",
 467 |         help="If >0, amount of time to sleep (in seconds) after page has loaded, before"
 468 |         " taking screenshots / getting text / running behaviors. Default is 0.",
 469 |         type=int,
 470 |     )
 471 | 
 472 |     parser.add_argument(
 473 |         "--pageExtraDelay",
 474 |         help="If >0, amount of time to sleep (in seconds) after behaviors "
 475 |         "before moving on to next page. Default is 0.",
 476 |         type=int,
 477 |     )
 478 | 
 479 |     parser.add_argument(
 480 |         "--dedupPolicy",
 481 |         help="Deduplication policy. Default is skip",
 482 |         choices=["skip", "revisit", "keep"],
 483 |     )
 484 | 
 485 |     parser.add_argument(
 486 |         "--profile",
 487 |         help="Path or HTTP(S) URL to tar.gz file which contains the browser profile "
 488 |         "directory",
 489 |     )
 490 | 
 491 |     parser.add_argument(
 492 |         "--screenshot",
 493 |         help="Screenshot options for crawler. One of view, thumbnail, fullPage, "
 494 |         "fullPageFinal or a comma-separated combination of those.",
 495 |     )
 496 | 
 497 |     parser.add_argument(
 498 |         "--screencastPort",
 499 |         help="If set to a non-zero value, starts an HTTP server with screencast "
 500 |         "accessible on this port.",
 501 |         type=int,
 502 |     )
 503 | 
 504 |     parser.add_argument(
 505 |         "--screencastRedis",
 506 |         help="If set, will use the state store redis pubsub for screencasting",
 507 |         action="store_true",
 508 |     )
 509 | 
 510 |     parser.add_argument(
 511 |         "--warcInfo",
 512 |         help="Optional fields added to the warcinfo record in combined WARCs",
 513 |     )
 514 | 
 515 |     parser.add_argument(
 516 |         "--saveState",
 517 |         help="If the crawl state should be serialized to the crawls/ directory. "
 518 |         "Defaults to 'partial', only saved when crawl is interrupted",
 519 |         choices=["never", "partial", "always"],
 520 |     )
 521 | 
 522 |     parser.add_argument(
 523 |         "--saveStateInterval",
 524 |         help="If save state is set to 'always', also save state during the crawl at "
 525 |         "this interval (in seconds). Default to 300.",
 526 |         type=int,
 527 |     )
 528 | 
 529 |     parser.add_argument(
 530 |         "--saveStateHistory",
 531 |         help="Number of save states to keep during the duration of a crawl. "
 532 |         "Default to 5.",
 533 |         type=int,
 534 |     )
 535 | 
 536 |     size_group = parser.add_mutually_exclusive_group()
 537 |     size_group.add_argument(
 538 |         "--sizeSoftLimit",
 539 |         help="If set, save crawl state and stop crawl if WARC size exceeds this value. "
 540 |         "ZIM will still be created.",
 541 |         type=int,
 542 |     )
 543 |     size_group.add_argument(
 544 |         "--sizeHardLimit",
 545 |         help="If set, exit crawler and fail the scraper immediately if WARC size "
 546 |         "exceeds this value",
 547 |         type=int,
 548 |     )
 549 | 
 550 |     parser.add_argument(
 551 |         "--diskUtilization",
 552 |         help="Save state and exit if disk utilization exceeds this percentage value."
 553 |         " Default (if not set) is 90%%. Set to 0 to disable disk utilization check.",
 554 |         type=int,
 555 |         default=90,
 556 |     )
 557 | 
 558 |     time_group = parser.add_mutually_exclusive_group()
 559 |     time_group.add_argument(
 560 |         "--timeSoftLimit",
 561 |         help="If set, save crawl state and stop crawl if WARC WARC(s) creation takes "
 562 |         "longer than this value, in seconds. ZIM will still be created.",
 563 |         type=int,
 564 |     )
 565 |     time_group.add_argument(
 566 |         "--timeHardLimit",
 567 |         help="If set, exit crawler and fail the scraper immediately if WARC(s) creation"
 568 |         " takes longer than this value, in seconds",
 569 |         type=int,
 570 |     )
 571 | 
 572 |     parser.add_argument(
 573 |         "--healthCheckPort",
 574 |         help="port to run healthcheck on",
 575 |         type=int,
 576 |     )
 577 | 
 578 |     parser.add_argument(
 579 |         "--overwrite",
 580 |         help="overwrite current crawl data: if set, existing collection directory "
 581 |         "will be deleted before crawl is started",
 582 |         action="store_true",
 583 |     )
 584 | 
 585 |     parser.add_argument(
 586 |         "--waitOnDone",
 587 |         help="if set, wait for interrupt signal when finished instead of exiting",
 588 |         action="store_true",
 589 |     )
 590 | 
 591 |     parser.add_argument(
 592 |         "--restartsOnError",
 593 |         help="if set, assume will be restarted if interrupted, don't run post-crawl "
 594 |         "processes on interrupt",
 595 |         action="store_true",
 596 |     )
 597 | 
 598 |     parser.add_argument(
 599 |         "--netIdleWait",
 600 |         help="If set, wait for network idle after page load and after behaviors are "
 601 |         "done (in seconds). if -1 (default), determine based on scope.",
 602 |         type=int,
 603 |     )
 604 | 
 605 |     parser.add_argument(
 606 |         "--lang",
 607 |         help="if set, sets the language used by the browser, should be ISO 639 "
 608 |         "language[-country] code",
 609 |     )
 610 | 
 611 |     parser.add_argument(
 612 |         "--originOverride",
 613 |         help="if set, will redirect requests from each origin in key to origin in the "
 614 |         "value, eg. --originOverride https://host:port=http://alt-host:alt-port",
 615 |     )
 616 | 
 617 |     parser.add_argument(
 618 |         "--logErrorsToRedis",
 619 |         help="If set, write error messages to redis",
 620 |         action="store_true",
 621 |     )
 622 | 
 623 |     parser.add_argument(
 624 |         "--writePagesToRedis",
 625 |         help="If set, write page objects to redis",
 626 |         action="store_true",
 627 |     )
 628 | 
 629 |     parser.add_argument(
 630 |         "--maxPageRetries",
 631 |         help="If set, number of times to retry a page that failed to load before page"
 632 |         " is considered to have failed. Default is 2.",
 633 |         type=int,
 634 |     )
 635 | 
 636 |     parser.add_argument(
 637 |         "--failOnFailedSeed",
 638 |         help="If set, crawler will fail with exit code 1 if any seed fails. When "
 639 |         "combined with --failOnInvalidStatus, will result in crawl failing with exit "
 640 |         "code 1 if any seed has a 4xx/5xx response",
 641 |         action="store_true",
 642 |     )
 643 | 
 644 |     parser.add_argument(
 645 |         "--failOnFailedLimit",
 646 |         help="If set, save state and exit if number of failed pages exceeds this value",
 647 |         action="store_true",
 648 |     )
 649 | 
 650 |     parser.add_argument(
 651 |         "--failOnInvalidStatus",
 652 |         help="If set, will treat pages with 4xx or 5xx response as failures. When "
 653 |         "combined with --failOnFailedLimit or --failOnFailedSeed may result in crawl "
 654 |         "failing due to non-200 responses",
 655 |         action="store_true",
 656 |     )
 657 | 
 658 |     # customBehaviors not included because it has special handling
 659 |     # debugAccessRedis not included due to custom redis engine in zimit
 660 | 
 661 |     parser.add_argument(
 662 |         "--debugAccessBrowser",
 663 |         help="if set, allow debugging browser on port 9222 via CDP",
 664 |         action="store_true",
 665 |     )
 666 | 
 667 |     parser.add_argument(
 668 |         "--warcPrefix",
 669 |         help="prefix for WARC files generated, including WARCs added to WACZ",
 670 |     )
 671 | 
 672 |     parser.add_argument(
 673 |         "--serviceWorker",
 674 |         help="service worker handling: disabled, enabled or disabled-if-profile. "
 675 |         "Default: disabled.",
 676 |     )
 677 | 
 678 |     parser.add_argument(
 679 |         "--proxyServer",
 680 |         help="if set, will use specified proxy server. Takes precedence over any env "
 681 |         "var proxy settings",
 682 |     )
 683 | 
 684 |     parser.add_argument(
 685 |         "--dryRun",
 686 |         help="If true, no archive data is written to disk, only pages and logs (and "
 687 |         "optionally saved state).",
 688 |         action="store_true",
 689 |     )
 690 | 
 691 |     parser.add_argument(
 692 |         "--qaSource",
 693 |         help="Required for QA mode. Path to the source WACZ or multi WACZ file for QA",
 694 |     )
 695 | 
 696 |     parser.add_argument(
 697 |         "--qaDebugImageDiff",
 698 |         help="if specified, will write crawl.png, replay.png and diff.png for each "
 699 |         "page where they're different",
 700 |         action="store_true",
 701 |     )
 702 | 
 703 |     parser.add_argument(
 704 |         "--sshProxyPrivateKeyFile",
 705 |         help="path to SSH private key for SOCKS5 over SSH proxy connection",
 706 |     )
 707 | 
 708 |     parser.add_argument(
 709 |         "--sshProxyKnownHostsFile",
 710 |         help="path to SSH known hosts file for SOCKS5 over SSH proxy connection",
 711 |     )
 712 | 
 713 |     parser.add_argument(
 714 |         "--keep",
 715 |         help="In case of failure, WARC files and other temporary files (which are "
 716 |         "stored as a subfolder of output directory) are always kept, otherwise "
 717 |         "they are automatically deleted. Use this flag to always keep WARC files, "
 718 |         "even in case of success.",
 719 |         action="store_true",
 720 |     )
 721 | 
 722 |     parser.add_argument(
 723 |         "--output",
 724 |         help="Output directory for ZIM. Default to /output.",
 725 |         default="/output",
 726 |     )
 727 | 
 728 |     parser.add_argument(
 729 |         "--build",
 730 |         help="Build directory for WARC files (if not set, output directory is used)",
 731 |     )
 732 | 
 733 |     parser.add_argument("--adminEmail", help="Admin Email for Zimit crawler")
 734 | 
 735 |     parser.add_argument(
 736 |         "--custom-css",
 737 |         help="[warc2zim] Custom CSS file URL/path to inject into all articles",
 738 |     )
 739 | 
 740 |     parser.add_argument(
 741 |         "--config",
 742 |         help="Path to YAML config file. If set, browsertrix-crawler will use this file"
 743 |         "to configure the crawling behaviour if not set via argument.",
 744 |     )
 745 | 
 746 |     parser.add_argument(
 747 |         "--version",
 748 |         help="Display scraper version and exit",
 749 |         action="version",
 750 |         version=f"Zimit {__version__}",
 751 |     )
 752 | 
 753 |     parser.add_argument(
 754 |         "--zim-lang",
 755 |         help="Language metadata of ZIM "
 756 |         "(warc2zim --lang param). ISO-639-3 code. "
 757 |         "Retrieved from homepage if found, fallback to `eng`",
 758 |     )
 759 | 
 760 |     parser.add_argument(
 761 |         "--custom-behaviors",
 762 |         help="JS code for custom behaviors to customize crawler. Single string with "
 763 |         "individual JS files URL/path separated by a comma",
 764 |     )
 765 | 
 766 |     parser.add_argument(
 767 |         "--warcs",
 768 |         help="Directly convert WARC archives to ZIM, by-passing the crawling phase. "
 769 |         "This argument must contain the path or HTTP(S) URL to either warc.gz files or"
 770 |         "to a tar or tar.gz containing the warc.gz files. Single value with individual "
 771 |         "path/URLs separated by comma",
 772 |     )
 773 | 
 774 |     parser.add_argument(
 775 |         "--acceptable-crawler-exit-codes",
 776 |         help="Non-zero crawler exit codes to consider as acceptable to continue with "
 777 |         " conversion of WARC to ZIM. Flag partialZim will be set in statsFilename (if "
 778 |         " used). Single value with individual error codes separated by comma",
 779 |     )
 780 | 
 781 |     # by design, all unknown args are for warc2zim ; known one are either for crawler
 782 |     # or shared
 783 |     known_args, warc2zim_args = parser.parse_known_args(raw_args)
 784 | 
 785 |     # pass a scraper suffix to warc2zim so that both zimit and warc2zim versions are
 786 |     # associated with the ZIM ; make it a CSV for easier parsing
 787 |     warc2zim_args.append("--scraper-suffix")
 788 |     warc2zim_args.append(f"zimit {__version__}")
 789 | 
 790 |     # pass url and output to warc2zim also
 791 |     if known_args.output:
 792 |         warc2zim_args.append("--output")
 793 |         warc2zim_args.append(known_args.output)
 794 | 
 795 |     user_agent_suffix = known_args.userAgentSuffix
 796 |     if known_args.adminEmail:
 797 |         user_agent_suffix += f" {known_args.adminEmail}"
 798 | 
 799 |     # make temp dir for this crawl
 800 |     global temp_root_dir  # noqa: PLW0603
 801 |     if known_args.build:
 802 |         temp_root_dir = Path(tempfile.mkdtemp(dir=known_args.build, prefix=".tmp"))
 803 |     else:
 804 |         temp_root_dir = Path(tempfile.mkdtemp(dir=known_args.output, prefix=".tmp"))
 805 | 
 806 |     seeds = []
 807 |     if known_args.seeds:
 808 |         seeds += [get_cleaned_url(url) for url in known_args.seeds.split(",")]
 809 |     if known_args.seedFile:
 810 |         if re.match(r"^https?\://", known_args.seedFile):
 811 |             with tempfile.NamedTemporaryFile(
 812 |                 dir=temp_root_dir,
 813 |                 prefix="seeds_",
 814 |                 suffix=".txt",
 815 |                 delete_on_close=True,
 816 |             ) as filename:
 817 |                 seed_file = Path(filename.name)
 818 |                 download_file(known_args.seedFile, seed_file)
 819 |                 seeds += [
 820 |                     get_cleaned_url(url) for url in seed_file.read_text().splitlines()
 821 |                 ]
 822 |         else:
 823 |             seeds += [
 824 |                 get_cleaned_url(url)
 825 |                 for url in Path(known_args.seedFile).read_text().splitlines()
 826 |             ]
 827 |     warc2zim_args.append("--url")
 828 |     warc2zim_args.append(seeds[0])
 829 | 
 830 |     if known_args.custom_css:
 831 |         warc2zim_args += ["--custom-css", known_args.custom_css]
 832 | 
 833 |     if known_args.title:
 834 |         warc2zim_args.append("--title")
 835 |         warc2zim_args.append(known_args.title)
 836 | 
 837 |     if known_args.description:
 838 |         warc2zim_args.append("--description")
 839 |         warc2zim_args.append(known_args.description)
 840 | 
 841 |     if known_args.long_description:
 842 |         warc2zim_args.append("--long-description")
 843 |         warc2zim_args.append(known_args.long_description)
 844 | 
 845 |     if known_args.zim_lang:
 846 |         warc2zim_args.append("--lang")
 847 |         warc2zim_args.append(known_args.zim_lang)
 848 | 
 849 |     logger.info("----------")
 850 |     logger.info("Testing warc2zim args")
 851 |     logger.info("Running: warc2zim " + " ".join(warc2zim_args))
 852 |     res = warc2zim(warc2zim_args)
 853 |     if res != NORMAL_WARC2ZIM_EXIT_CODE:
 854 |         logger.info("Exiting, invalid warc2zim params")
 855 |         return EXIT_CODE_WARC2ZIM_CHECK_FAILED
 856 | 
 857 |     if not known_args.keep:
 858 |         atexit.register(cleanup)
 859 | 
 860 |     # copy / download custom behaviors to one single folder and configure crawler
 861 |     if known_args.custom_behaviors:
 862 |         behaviors_dir = temp_root_dir / "custom-behaviors"
 863 |         behaviors_dir.mkdir()
 864 |         for custom_behavior in [
 865 |             custom_behavior.strip()
 866 |             for custom_behavior in known_args.custom_behaviors.split(",")
 867 |         ]:
 868 |             behaviors_file = tempfile.NamedTemporaryFile(
 869 |                 dir=behaviors_dir,
 870 |                 prefix="behavior_",
 871 |                 suffix=".js",
 872 |                 delete_on_close=False,
 873 |             )
 874 |             if re.match(r"^https?\://", custom_behavior):
 875 |                 logger.info(
 876 |                     f"Downloading browser profile from {custom_behavior} "
 877 |                     f"to {behaviors_file.name}"
 878 |                 )
 879 |                 download_file(custom_behavior, Path(behaviors_file.name))
 880 |             else:
 881 |                 logger.info(
 882 |                     f"Copying browser profile from {custom_behavior} "
 883 |                     f"to {behaviors_file.name}"
 884 |                 )
 885 |                 shutil.copy(custom_behavior, behaviors_file.name)
 886 |         known_args.customBehaviors = str(behaviors_dir)
 887 |     else:
 888 |         known_args.customBehaviors = None
 889 | 
 890 |     crawler_args = get_crawler_cmd_line(known_args)
 891 |     for seed in seeds:
 892 |         crawler_args.append("--seeds")
 893 |         crawler_args.append(seed)
 894 | 
 895 |     crawler_args.append("--userAgentSuffix")
 896 |     crawler_args.append(user_agent_suffix)
 897 | 
 898 |     crawler_args.append("--cwd")
 899 |     crawler_args.append(str(temp_root_dir))
 900 | 
 901 |     output_dir = Path(known_args.output)
 902 |     warc2zim_stats_file = (
 903 |         Path(known_args.warc2zim_progress_file)
 904 |         if known_args.warc2zim_progress_file
 905 |         else temp_root_dir / "warc2zim.json"
 906 |     )
 907 |     if not warc2zim_stats_file.is_absolute():
 908 |         warc2zim_stats_file = output_dir / warc2zim_stats_file
 909 |         warc2zim_stats_file.parent.mkdir(parents=True, exist_ok=True)
 910 |     warc2zim_stats_file.unlink(missing_ok=True)
 911 | 
 912 |     crawler_stats_file = (
 913 |         Path(known_args.statsFilename)
 914 |         if known_args.statsFilename
 915 |         else temp_root_dir / "crawl.json"
 916 |     )
 917 |     if not crawler_stats_file.is_absolute():
 918 |         crawler_stats_file = output_dir / crawler_stats_file
 919 |         crawler_stats_file.parent.mkdir(parents=True, exist_ok=True)
 920 |     crawler_stats_file.unlink(missing_ok=True)
 921 | 
 922 |     zimit_stats_file = (
 923 |         Path(known_args.zimit_progress_file)
 924 |         if known_args.zimit_progress_file
 925 |         else temp_root_dir / "stats.json"
 926 |     )
 927 |     if not zimit_stats_file.is_absolute():
 928 |         zimit_stats_file = output_dir / zimit_stats_file
 929 |         zimit_stats_file.parent.mkdir(parents=True, exist_ok=True)
 930 |     zimit_stats_file.unlink(missing_ok=True)
 931 | 
 932 |     if known_args.zimit_progress_file:
 933 |         # setup inotify crawler progress watcher
 934 |         watcher = ProgressFileWatcher(
 935 |             zimit_stats_path=zimit_stats_file,
 936 |             crawl_stats_path=crawler_stats_file,
 937 |             warc2zim_stats_path=warc2zim_stats_file,
 938 |         )
 939 |         logger.info(
 940 |             f"Writing zimit progress to {watcher.zimit_stats_path}, crawler progress to"
 941 |             f" {watcher.crawl_stats_path} and warc2zim progress to "
 942 |             f"{watcher.warc2zim_stats_path}"
 943 |         )
 944 |         # update crawler command
 945 |         crawler_args.append("--statsFilename")
 946 |         crawler_args.append(str(crawler_stats_file))
 947 |         # update warc2zim command
 948 |         warc2zim_args.append("-v")
 949 |         warc2zim_args.append("--progress-file")
 950 |         warc2zim_args.append(str(warc2zim_stats_file))
 951 |         watcher.watch()
 952 |     else:
 953 |         if known_args.statsFilename:
 954 |             logger.info(f"Writing crawler progress to {crawler_stats_file}")
 955 |             crawler_args.append("--statsFilename")
 956 |             crawler_args.append(str(crawler_stats_file))
 957 |         if known_args.warc2zim_progress_file:
 958 |             logger.info(f"Writing warc2zim progress to {warc2zim_stats_file}")
 959 |             warc2zim_args.append("-v")
 960 |             warc2zim_args.append("--progress-file")
 961 |             warc2zim_args.append(str(warc2zim_stats_file))
 962 | 
 963 |     cmd_line = " ".join(crawler_args)
 964 | 
 965 |     logger.info("")
 966 |     logger.info("----------")
 967 |     logger.info(
 968 |         f"Output to tempdir: {temp_root_dir} - "
 969 |         f"{'will keep' if known_args.keep else 'will delete'}"
 970 |     )
 971 | 
 972 |     partial_zim = False
 973 | 
 974 |     # if warc files are passed, do not run browsertrix crawler but fetch the files if
 975 |     # they are provided as an HTTP URL + extract the archive if it is a tar.gz
 976 |     warc_files: list[Path] = []
 977 |     if known_args.warcs:
 978 |         for warc_location in [
 979 |             warc_location.strip() for warc_location in known_args.warcs.split(",")
 980 |         ]:
 981 |             suffix = "".join(Path(urllib.parse.urlparse(warc_location).path).suffixes)
 982 |             if suffix not in {".tar", ".tar.gz", ".warc", ".warc.gz"}:
 983 |                 raise Exception(f"Unsupported file at {warc_location}")
 984 | 
 985 |             filename = tempfile.NamedTemporaryFile(
 986 |                 dir=temp_root_dir,
 987 |                 prefix="warc_",
 988 |                 suffix=suffix,
 989 |                 delete_on_close=False,
 990 |             )
 991 | 
 992 |             if not re.match(r"^https?\://", warc_location):
 993 |                 # warc_location is not a URL, so it is a path, simply add it to the list
 994 |                 if not Path(warc_location).exists():
 995 |                     raise Exception(f"Impossible to find file at {warc_location}")
 996 | 
 997 |                 # if it is a plain warc or warc.gz, simply add it to the list
 998 |                 if suffix in {".warc", ".warc.gz"}:
 999 |                     warc_files.append(Path(warc_location))
1000 |                     continue
1001 | 
1002 |                 # otherwise extract tar.gz but do not delete it afterwards
1003 |                 extract_path = temp_root_dir / f"{filename.name}_files"
1004 |                 logger.info(
1005 |                     f"Extracting WARC(s) from {warc_location} to {extract_path}"
1006 |                 )
1007 |                 with tarfile.open(warc_location, "r") as fh:
1008 |                     # Extract all the contents to the specified directory
1009 |                     fh.extractall(path=extract_path, filter="data")
1010 |                 warc_files.append(Path(extract_path))
1011 |                 continue
1012 | 
1013 |             # warc_location is a URL, let's download it to a temp name to avoid name
1014 |             # collisions
1015 |             warc_file = Path(filename.name)
1016 |             logger.info(f"Downloading WARC(s) from {warc_location} to {warc_file}")
1017 |             download_file(warc_location, warc_file)
1018 | 
1019 |             # if it is a plain warc or warc.gz, simply add it to the list
1020 |             if suffix in {".warc", ".warc.gz"}:
1021 |                 warc_files.append(warc_file)
1022 |                 continue
1023 | 
1024 |             # otherwise extract tar.gz and delete it afterwards
1025 |             extract_path = temp_root_dir / f"{filename.name}_files"
1026 |             logger.info(f"Extracting WARC(s) from {warc_file} to {extract_path}")
1027 |             with tarfile.open(warc_file, "r") as fh:
1028 |                 # Extract all the contents to the specified directory
1029 |                 fh.extractall(path=extract_path, filter="data")
1030 |             logger.info(f"Deleting archive at {warc_file}")
1031 |             warc_file.unlink()
1032 |             warc_files.append(Path(extract_path))
1033 | 
1034 |     else:
1035 | 
1036 |         logger.info(f"Running browsertrix-crawler crawl: {cmd_line}")
1037 |         crawl = subprocess.run(crawler_args, check=False)
1038 |         if (
1039 |             crawl.returncode == EXIT_CODE_CRAWLER_SIZE_LIMIT_HIT
1040 |             and known_args.sizeSoftLimit
1041 |         ):
1042 |             logger.info(
1043 |                 "Crawl size soft limit hit. Continuing with warc2zim conversion."
1044 |             )
1045 |             if known_args.zimit_progress_file:
1046 |                 partial_zim = True
1047 |         elif (
1048 |             crawl.returncode == EXIT_CODE_CRAWLER_TIME_LIMIT_HIT
1049 |             and known_args.timeSoftLimit
1050 |         ):
1051 |             logger.info(
1052 |                 "Crawl time soft limit hit. Continuing with warc2zim conversion."
1053 |             )
1054 |             if known_args.zimit_progress_file:
1055 |                 partial_zim = True
1056 |         elif crawl.returncode != 0:
1057 |             logger.error(
1058 |                 f"Crawl returned an error: {crawl.returncode}, scraper exiting"
1059 |             )
1060 |             cancel_cleanup()
1061 |             return crawl.returncode
1062 | 
1063 |         if known_args.collection:
1064 |             warc_files = [
1065 |                 temp_root_dir.joinpath(f"collections/{known_args.collection}/archive/")
1066 |             ]
1067 | 
1068 |         else:
1069 |             warc_dirs = sorted(
1070 |                 temp_root_dir.rglob("collections/crawl-*/archive/"),
1071 |                 key=lambda path: path.lstat().st_mtime,
1072 |             )
1073 |             if len(warc_dirs) == 0:
1074 |                 raise RuntimeError(
1075 |                     "Failed to find directory where WARC files have been created"
1076 |                 )
1077 |             elif len(warc_dirs) > 1:
1078 |                 logger.info(
1079 |                     "Found many WARC files directories, only most recently modified one"
1080 |                     " will be used"
1081 |                 )
1082 |                 for directory in warc_dirs:
1083 |                     logger.info(f"- {directory}")
1084 |             warc_files = [warc_dirs[-1]]
1085 | 
1086 |     logger.info("")
1087 |     logger.info("----------")
1088 |     logger.info(
1089 |         f"Processing WARC files in/at "
1090 |         f'{" ".join(str(warc_file) for warc_file in warc_files)}'
1091 |     )
1092 |     warc2zim_args.extend(str(warc_file) for warc_file in warc_files)
1093 | 
1094 |     logger.info(f"Calling warc2zim with these args: {warc2zim_args}")
1095 | 
1096 |     warc2zim_exit_code = warc2zim(warc2zim_args)
1097 | 
1098 |     if known_args.zimit_progress_file:
1099 |         stats_content = json.loads(zimit_stats_file.read_bytes())
1100 |         stats_content["partialZim"] = partial_zim
1101 |         zimit_stats_file.write_text(json.dumps(stats_content))
1102 | 
1103 |     # also call cancel_cleanup when --keep, even if it is not supposed to be registered,
1104 |     # so that we will display temporary files location just like in other situations
1105 |     if warc2zim_exit_code or known_args.keep:
1106 |         cancel_cleanup()
1107 | 
1108 |     return warc2zim_exit_code
1109 | 
1110 | 
def get_cleaned_url(url: str):
    """Return `url` without its explicit port when it is the scheme's default one

    Browsers drop `:443` on https and `:80` on http URLs, so the same is done here.
    Any other URL is returned as re-serialized by urllib.
    """
    default_ports = {"https": 443, "http": 80}
    parts = urllib.parse.urlparse(url)

    # port is only inspected for http(s) URLs
    scheme = parts.scheme
    if scheme in default_ports and parts.port == default_ports[scheme]:
        parts = rebuild_uri(parts, port="")

    return parts.geturl()
1121 | 
1122 | 
def get_crawler_cmd_line(args):
    """Build the command line for Browsertrix crawler

    Every option listed below is read from `args` and forwarded as `--<option>`:
    - options whose value is None or False are omitted
    - options whose value is True are forwarded as a bare flag
    - any other value is forwarded as `--<option>` followed by `str(value)`
    - userAgent is stripped, and omitted when empty
    - soft/hard size and time limits are forwarded as --sizeLimit / --timeLimit

    Each option is listed only once so that no flag is emitted twice for a single
    attribute of `args`.

    Returns the list of command arguments, starting with `crawl`.
    """
    forwarded_options = [
        "title",
        "description",
        "workers",
        "crawlId",
        "waitUntil",
        "depth",
        "extraHops",
        "pageLimit",
        "maxPageLimit",
        "pageLoadTimeout",
        "scopeType",
        "scopeIncludeRx",
        "scopeExcludeRx",
        "collection",
        "allowHashUrls",
        "selectLinks",
        "clickSelector",
        "blockRules",
        "blockMessage",
        "blockAds",
        "adBlockMessage",
        "headless",
        "driver",
        "generateCDX",
        "combineWARC",
        "rolloverSize",
        "generateWACZ",
        "logging",
        "logLevel",
        "logContext",
        "logExcludeContext",
        "text",
        "mobileDevice",
        "userAgent",
        # userAgentSuffix (manipulated),
        "useSitemap",
        "sitemapFromDate",
        "sitemapToDate",
        # statsFilename (manipulated),
        "behaviors",
        "behaviorTimeout",
        "postLoadDelay",
        "pageExtraDelay",
        "dedupPolicy",
        "profile",
        "screenshot",
        "screencastPort",
        "screencastRedis",
        "warcInfo",
        "saveState",
        "saveStateInterval",
        "saveStateHistory",
        "sizeSoftLimit",
        "sizeHardLimit",
        "diskUtilization",
        "timeSoftLimit",
        "timeHardLimit",
        "healthCheckPort",
        "overwrite",
        "waitOnDone",
        "restartsOnError",
        "netIdleWait",
        "lang",
        "originOverride",
        "logErrorsToRedis",
        "writePagesToRedis",
        "maxPageRetries",
        "failOnFailedSeed",
        "failOnFailedLimit",
        "failOnInvalidStatus",
        "debugAccessBrowser",
        "warcPrefix",
        "serviceWorker",
        "proxyServer",
        "dryRun",
        "qaSource",
        "qaDebugImageDiff",
        "sshProxyPrivateKeyFile",
        "sshProxyKnownHostsFile",
        "customBehaviors",
        "config",
    ]

    # options forwarded to the crawler under a different flag name
    # NOTE(review): if both the soft and hard variant of a limit are set, the flag
    # is emitted twice - presumably prevented by earlier validation, to confirm
    flag_overrides = {
        "sizeSoftLimit": "sizeLimit",
        "sizeHardLimit": "sizeLimit",
        "timeSoftLimit": "timeLimit",
        "timeHardLimit": "timeLimit",
    }

    node_cmd = ["crawl"]
    for arg in forwarded_options:
        value = getattr(args, arg)
        if arg == "userAgent":
            # - strip leading whitespace which are not allowed on some websites
            # - strip trailing whitespace which are either not allowed if no suffix is
            # used, or duplicate with the automatically added one if a suffix is there
            # - value is None when userAgent is not passed
            if value:
                value = value.strip()
            if not value:
                # ignore empty userAgent arg and keep crawler default value if empty
                continue
        if value is None or value is False:
            continue
        node_cmd.append("--" + flag_overrides.get(arg, arg))
        # boolean options are bare flags, all others carry their value
        if not isinstance(value, bool):
            node_cmd.append(str(value))

    return node_cmd
1235 | 
1236 | 
def sigint_handler(*args):  # noqa: ARG001
    """Signal handler: log that zimit is stopping, then exit with code 3"""
    # the notice is surrounded by empty log lines so that it stands out
    for message in ("", "", "SIGINT/SIGTERM received, stopping zimit", "", ""):
        logger.info(message)
    sys.exit(3)
1244 | 
1245 | 
def zimit():
    """Run zimit with the process command-line arguments and exit with run() result"""
    exit_code = run(sys.argv[1:])
    sys.exit(exit_code)
1248 | 
1249 | 
# installed at import time: both SIGINT and SIGTERM are routed to sigint_handler
signal.signal(signal.SIGINT, sigint_handler)
signal.signal(signal.SIGTERM, sigint_handler)


if __name__ == "__main__":
    zimit()
1256 | 


--------------------------------------------------------------------------------
/tasks.py:
--------------------------------------------------------------------------------
  1 | # pyright: strict, reportUntypedFunctionDecorator=false
  2 | import os
  3 | 
  4 | from invoke.context import Context
  5 | from invoke.tasks import task  # pyright: ignore [reportUnknownVariableType]
  6 | 
# value passed as `pty=` to ctx.run: False when the CI environment variable is set to
# a non-empty value, True otherwise
use_pty = not os.getenv("CI", "")
  8 | 
  9 | 
 10 | @task(optional=["args"], help={"args": "pytest additional arguments"})
 11 | def test(ctx: Context, args: str = ""):
 12 |     """run tests (without coverage)"""
 13 |     ctx.run(f"pytest {args}", pty=use_pty)
 14 | 
 15 | 
 16 | @task(optional=["args"], help={"args": "pytest additional arguments"})
 17 | def test_cov(ctx: Context, args: str = ""):
 18 |     """run test vith coverage"""
 19 |     ctx.run(f"coverage run -m pytest {args}", pty=use_pty)
 20 | 
 21 | 
 22 | @task(optional=["html"], help={"html": "flag to export html report"})
 23 | def report_cov(ctx: Context, *, html: bool = False):
 24 |     """report coverage"""
 25 |     ctx.run("coverage combine", warn=True, pty=use_pty)
 26 |     ctx.run("coverage report --show-missing", pty=use_pty)
 27 |     if html:
 28 |         ctx.run("coverage html", pty=use_pty)
 29 | 
 30 | 
 31 | @task(
 32 |     optional=["args", "html"],
 33 |     help={
 34 |         "args": "pytest additional arguments",
 35 |         "html": "flag to export html report",
 36 |     },
 37 | )
 38 | def coverage(ctx: Context, args: str = "", *, html: bool = False):
 39 |     """run tests and report coverage"""
 40 |     test_cov(ctx, args=args)
 41 |     report_cov(ctx, html=html)
 42 | 
 43 | 
 44 | @task(optional=["args"], help={"args": "black additional arguments"})
 45 | def lint_black(ctx: Context, args: str = "."):
 46 |     args = args or "."  # needed for hatch script
 47 |     ctx.run("black --version", pty=use_pty)
 48 |     ctx.run(f"black --check --diff {args}", pty=use_pty)
 49 | 
 50 | 
 51 | @task(optional=["args"], help={"args": "ruff additional arguments"})
 52 | def lint_ruff(ctx: Context, args: str = "."):
 53 |     args = args or "."  # needed for hatch script
 54 |     ctx.run("ruff --version", pty=use_pty)
 55 |     ctx.run(f"ruff check {args}", pty=use_pty)
 56 | 
 57 | 
 58 | @task(
 59 |     optional=["args"],
 60 |     help={
 61 |         "args": "linting tools (black, ruff) additional arguments, typically a path",
 62 |     },
 63 | )
 64 | def lintall(ctx: Context, args: str = "."):
 65 |     """Check linting"""
 66 |     args = args or "."  # needed for hatch script
 67 |     lint_black(ctx, args)
 68 |     lint_ruff(ctx, args)
 69 | 
 70 | 
 71 | @task(optional=["args"], help={"args": "check tools (pyright) additional arguments"})
 72 | def check_pyright(ctx: Context, args: str = ""):
 73 |     """check static types with pyright"""
 74 |     ctx.run("pyright --version")
 75 |     ctx.run(f"pyright {args}", pty=use_pty)
 76 | 
 77 | 
 78 | @task(optional=["args"], help={"args": "check tools (pyright) additional arguments"})
 79 | def checkall(ctx: Context, args: str = ""):
 80 |     """check static types"""
 81 |     check_pyright(ctx, args)
 82 | 
 83 | 
 84 | @task(optional=["args"], help={"args": "black additional arguments"})
 85 | def fix_black(ctx: Context, args: str = "."):
 86 |     """fix black formatting"""
 87 |     args = args or "."  # needed for hatch script
 88 |     ctx.run(f"black {args}", pty=use_pty)
 89 | 
 90 | 
 91 | @task(optional=["args"], help={"args": "ruff additional arguments"})
 92 | def fix_ruff(ctx: Context, args: str = "."):
 93 |     """fix all ruff rules"""
 94 |     args = args or "."  # needed for hatch script
 95 |     ctx.run(f"ruff check --fix {args}", pty=use_pty)
 96 | 
 97 | 
 98 | @task(
 99 |     optional=["args"],
100 |     help={
101 |         "args": "linting tools (black, ruff) additional arguments, typically a path",
102 |     },
103 | )
104 | def fixall(ctx: Context, args: str = "."):
105 |     """Fix everything automatically"""
106 |     args = args or "."  # needed for hatch script
107 |     fix_black(ctx, args)
108 |     fix_ruff(ctx, args)
109 |     lintall(ctx, args)
110 | 


--------------------------------------------------------------------------------
/tests-daily/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Let's extract kiwix-tools as usual on alpine temporary build container
 2 | FROM alpine:3.21 as kiwix-serve
 3 | LABEL org.opencontainers.image.source https://github.com/openzim/kiwix-tools
 4 | 
 5 | # TARGETPLATFORM is injected by docker build
 6 | ARG TARGETPLATFORM
 7 | ARG KIWIX_TOOLS_VERSION
 8 | 
 9 | RUN set -e && \
10 |     # default (no KIWIX_TOOLS_VERSION set) to today's nightly
11 |     if [ -z "$KIWIX_TOOLS_VERSION" ] ; then KIWIX_TOOLS_VERSION=$(date +"%Y-%m-%d") ; fi && \
12 |     apk --no-cache add dumb-init curl && \
13 |     echo "TARGETPLATFORM: $TARGETPLATFORM" && \
14 |     if [ "$TARGETPLATFORM" = "linux/386" ]; then ARCH="i586"; \
15 |     # linux/arm64/v8 points to linux/arm64
16 |     elif [ "$TARGETPLATFORM" = "linux/arm64/v8" \
17 |         -o "$TARGETPLATFORM" = "linux/arm64" ]; then ARCH="aarch64"; \
18 |     # linux/arm translates to linux/arm/v7
19 |     elif [ "$TARGETPLATFORM" = "linux/arm/v7" ]; then ARCH="armv8"; \
20 |     elif [ "$TARGETPLATFORM" = "linux/arm/v6" ]; then ARCH="armv6"; \
21 |     elif [ "$TARGETPLATFORM" = "linux/amd64/v3" \
22 |         -o "$TARGETPLATFORM" = "linux/amd64/v2" \
23 |         -o "$TARGETPLATFORM" = "linux/amd64" ]; then ARCH="x86_64"; \
24 |     # we dont suppot any other arch so let it fail
25 |     else ARCH="unknown"; fi && \
26 |     # download requested kiwix-tools version
27 |     url="http://mirror.download.kiwix.org/nightly/$KIWIX_TOOLS_VERSION/kiwix-tools_linux-$ARCH-$KIWIX_TOOLS_VERSION.tar.gz" && \
28 |     echo "URL: $url" && \
29 |     mkdir /kiwix-serve && \
30 |     curl -k -L $url | tar -xz -C /kiwix-serve --strip-components 1
31 | 
# Build real "workload" container
FROM python:3.13-slim-bookworm

# Add kiwix-serve
COPY --from=kiwix-serve /kiwix-serve /usr/local/bin

# Update apt + install dependencies + install Google Chrome dependencies + clean-up apt lists
RUN apt-get update -y && \
    apt-get install -qqy wget xvfb unzip jq && \
    apt-get install -qqy libxss1 libappindicator1 libgconf-2-4 \
    fonts-liberation libasound2 libnspr4 libnss3 libx11-xcb1 libxtst6 lsb-release xdg-utils \
    libgbm1 libnss3 libatk-bridge2.0-0 libgtk-3-0 libx11-xcb1 libxcb-dri3-0 && \
    rm -rf /var/lib/apt/lists/*

# Fetch the latest version numbers and URLs for Chrome and ChromeDriver
RUN wget -q -O /tmp/versions.json https://googlechromelabs.github.io/chrome-for-testing/last-known-good-versions-with-downloads.json

# Install chrome
RUN CHROME_URL=$(jq -r '.channels.Stable.downloads.chrome[] | select(.platform=="linux64") | .url' /tmp/versions.json) && \
    wget -q --continue -O /tmp/chrome-linux64.zip $CHROME_URL && \
    unzip /tmp/chrome-linux64.zip -d /opt/chrome

RUN chmod +x /opt/chrome/chrome-linux64/chrome

# Install chromedriver
RUN CHROMEDRIVER_URL=$(jq -r '.channels.Stable.downloads.chromedriver[] | select(.platform=="linux64") | .url' /tmp/versions.json) && \
    wget -q --continue -O /tmp/chromedriver-linux64.zip $CHROMEDRIVER_URL && \
    unzip /tmp/chromedriver-linux64.zip -d /opt/chromedriver && \
    chmod +x /opt/chromedriver/chromedriver-linux64/chromedriver

# Set up Chromedriver Environment variables
ENV CHROMEDRIVER_DIR=/opt/chromedriver
ENV PATH=$CHROMEDRIVER_DIR:$PATH

# Clean up
RUN rm /tmp/chrome-linux64.zip /tmp/chromedriver-linux64.zip /tmp/versions.json

# Update pip, install selenium, create work directory
RUN \
   python -m pip install --no-cache-dir -U \
     pip \
     selenium==4.28.1 \
     pytest==8.3.4 \
&& mkdir -p /work
76 | 


--------------------------------------------------------------------------------
/tests-daily/daily.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import os
  3 | import subprocess
  4 | from time import sleep
  5 | 
  6 | import pytest
  7 | from selenium import webdriver
  8 | from selenium.webdriver.chrome.options import Options
  9 | from selenium.webdriver.chrome.service import Service as ChromeService
 10 | from selenium.webdriver.common.by import By
 11 | from selenium.webdriver.support import expected_conditions
 12 | from selenium.webdriver.support.ui import WebDriverWait
 13 | 
# seconds to sleep after spawning kiwix-serve before checking it is still running
KIWIX_SERVE_START_SLEEP = 1

# name of the ZIM: used for the file served from /output and in content URLs
ZIM_NAME = "tests_eng_test-website"
# path, inside the ZIM content, of the page opened by the video test
YOUTUBE_VIDEO_PATH = "youtube.fuzzy.replayweb.page/embed/g5skcrNXdDM"

# video test is skipped only when SKIP_YOUTUBE_TEST env var is "true" (any case)
SKIP_YOUTUBE_TEST = os.getenv("SKIP_YOUTUBE_TEST", "False").lower() == "true"

# seconds to wait after playback has started before checking it is still playing
CHECK_VIDEO_IS_PLAYING_AFTER_SECS = 30

logger = logging.getLogger(__name__)
 24 | 
 25 | 
 26 | @pytest.fixture(scope="module")
 27 | def chrome_driver():
 28 |     """Start chrome and setup chrome driver / selenium"""
 29 | 
 30 |     logger.info("Starting Chrome")
 31 |     chrome_options = Options()
 32 |     chrome_options.add_argument("--headless")
 33 |     chrome_options.add_argument("--no-sandbox")
 34 |     # Other options of interest:
 35 |     # --disable-dev-shm-usage (not needed anymore with recent chrome versions)
 36 |     # --disable-gpu (important for some versions of Chrome)
 37 |     # --remote-debugging-port=9222 (should you need to remote debug)
 38 | 
 39 |     # Set path to Chrome binary
 40 |     chrome_options.binary_location = "/opt/chrome/chrome-linux64/chrome"
 41 | 
 42 |     # Set path to ChromeDriver
 43 |     chrome_service = ChromeService(
 44 |         executable_path="/opt/chromedriver/chromedriver-linux64/chromedriver"
 45 |     )
 46 | 
 47 |     # Set up driver
 48 |     driver = webdriver.Chrome(service=chrome_service, options=chrome_options)
 49 | 
 50 |     yield driver
 51 | 
 52 |     # Cleanup
 53 |     logger.info("Quitting Chrome")
 54 |     driver.quit()
 55 | 
 56 | 
 57 | @pytest.fixture(scope="module")
 58 | def kiwix_serve():
 59 |     """Start  kiwix-serve with given ZIM"""
 60 | 
 61 |     logger.info("Starting kiwix-serve")
 62 |     process = subprocess.Popen(
 63 |         [
 64 |             "/usr/bin/env",
 65 |             "/usr/local/bin/kiwix-serve",
 66 |             f"/output/{ZIM_NAME}.zim",
 67 |         ]
 68 |     )
 69 | 
 70 |     logger.info(
 71 |         f"Waiting {KIWIX_SERVE_START_SLEEP} secs to be 'sure' that kiwix-serve is ready"
 72 |     )
 73 |     sleep(KIWIX_SERVE_START_SLEEP)
 74 | 
 75 |     if process.poll() is not None:
 76 |         raise Exception("kiwix-serve has terminated too early")
 77 | 
 78 |     yield process
 79 | 
 80 |     # Cleanup
 81 |     logger.info("Quitting kiwix-serve")
 82 |     process.terminate()
 83 | 
 84 | 
 85 | @pytest.mark.skipif(SKIP_YOUTUBE_TEST, reason="Youtube test disabled by environment")
 86 | def test_youtube_video(chrome_driver, kiwix_serve):  # noqa: ARG001
 87 |     """Test that youtube video loads, and still plays after a while"""
 88 | 
 89 |     chrome_driver.get(f"http://localhost:80/content/{ZIM_NAME}/{YOUTUBE_VIDEO_PATH}")
 90 | 
 91 |     if chrome_driver.title == "Content not found":
 92 |         raise Exception("Wrong URL, kiwix-serve said that content is not found")
 93 | 
 94 |     button = WebDriverWait(chrome_driver, 1).until(
 95 |         expected_conditions.presence_of_element_located(
 96 |             (By.XPATH, "//button[@title='Play']")
 97 |         )
 98 |     )
 99 | 
100 |     logger.info("Play button found in page")
101 | 
102 |     button.click()
103 | 
104 |     video = WebDriverWait(chrome_driver, 1).until(
105 |         expected_conditions.presence_of_element_located((By.TAG_NAME, "video"))
106 |     )
107 | 
108 |     logger.info("Video found in page")
109 | 
110 |     # arguments[0] is the video tag passed to execute_script
111 |     if not chrome_driver.execute_script("return arguments[0].paused === false", video):
112 |         raise Exception("Video is not playing, failed to start probably")
113 | 
114 |     logger.info("Video is playing")
115 | 
116 |     logger.info(
117 |         f"Waiting {CHECK_VIDEO_IS_PLAYING_AFTER_SECS} secs to check video is still "
118 |         "playing"
119 |     )
120 |     sleep(CHECK_VIDEO_IS_PLAYING_AFTER_SECS)
121 | 
122 |     # arguments[0] is the video tag passed to execute_script
123 |     if not chrome_driver.execute_script("return arguments[0].paused === false", video):
124 |         raise Exception(
125 |             "Video is not playing anymore after "
126 |             f"{CHECK_VIDEO_IS_PLAYING_AFTER_SECS} secs"
127 |         )
128 |     logger.info("Video is still playing")
129 | 


--------------------------------------------------------------------------------
/tests-integration/README.md:
--------------------------------------------------------------------------------
1 | These are integration tests, meant to be run inside the CI (because we need to first perform a zimit run on a given website and then check its output)
2 | 


--------------------------------------------------------------------------------
/tests-integration/integration.py:
--------------------------------------------------------------------------------
  1 | import glob
  2 | import json
  3 | import os
  4 | from pathlib import Path
  5 | 
  6 | import pytest
  7 | from warcio import ArchiveIterator
  8 | from zimscraperlib.zim import Archive
  9 | 
 10 | 
 11 | @pytest.mark.parametrize(
 12 |     "filename",
 13 |     [
 14 |         pytest.param("/output/tests_en_onepage.zim", id="onepage"),
 15 |         pytest.param("/output/tests_en_sizesoftlimit.zim", id="sizesoftlimit"),
 16 |         pytest.param("/output/tests_en_timesoftlimit.zim", id="timesoftlimit"),
 17 |     ],
 18 | )
 19 | def test_zim_created(filename):
 20 |     """Ensure ZIM file exists"""
 21 |     assert os.path.isfile(filename)
 22 | 
 23 | 
 24 | @pytest.mark.parametrize(
 25 |     "filename",
 26 |     [
 27 |         pytest.param("/output/tests_en_sizehardlimit.zim", id="sizehardlimit"),
 28 |         pytest.param("/output/tests_en_timehardlimit.zim", id="timehardlimit"),
 29 |     ],
 30 | )
 31 | def test_zim_not_created(filename):
 32 |     """Ensure nothing exists at the ZIM path of the hard-limit runs"""
 33 |     assert not os.path.exists(filename)
 34 | 
 35 | 
 36 | def test_zim_main_page():
 37 |     """Main page specified, http://website.test.openzim.org/http-return-codes.html,
 38 |     was a redirect to https
 39 |     Ensure main page is the redirected page"""
 40 | 
 41 |     main_entry = Archive(Path("/output/tests_en_onepage.zim")).main_entry
 42 |     assert main_entry.is_redirect
 43 |     assert (
 44 |         main_entry.get_redirect_entry().path
 45 |         == "website.test.openzim.org/http-return-codes.html"
 46 |     )
 47 | 
 48 | 
 49 | def test_zim_scraper():
 50 |     """Check content of scraper metadata"""
 51 | 
 52 |     zim_fh = Archive(Path("/output/tests_en_onepage.zim"))
 53 |     scraper = zim_fh.get_text_metadata("Scraper")
 54 |     assert "zimit " in scraper
 55 |     assert "warc2zim " in scraper
 56 |     assert "Browsertrix-Crawler " in scraper
 57 | 
 58 | 
 59 | def test_files_list():
 60 |     """Check that expected files are present in the ZIM at proper path"""
 61 |     zim_fh = Archive(Path("/output/tests_en_onepage.zim"))
 62 |     for expected_entry in [
 63 |         "_zim_static/__wb_module_decl.js",
 64 |         "_zim_static/wombat.js",
 65 |         "_zim_static/wombatSetup.js",
 66 |         "website.test.openzim.org/http-return-codes.html",
 67 |         "website.test.openzim.org/200-response",
 68 |         "website.test.openzim.org/201-response",
 69 |         "website.test.openzim.org/202-response",
 70 |         "website.test.openzim.org/301-external-redirect-ok",
 71 |         "website.test.openzim.org/301-internal-redirect-ok",
 72 |         "website.test.openzim.org/302-external-redirect-ok",
 73 |         "website.test.openzim.org/302-internal-redirect-ok",
 74 |         "website.test.openzim.org/307-external-redirect-ok",
 75 |         "website.test.openzim.org/307-internal-redirect-ok",
 76 |         "website.test.openzim.org/308-external-redirect-ok",
 77 |         "website.test.openzim.org/308-internal-redirect-ok",
 78 |         "website.test.openzim.org/http-return-codes.html",
 79 |         "website.test.openzim.org/icons/favicon.ico",
 80 |         "website.test.openzim.org/icons/site.webmanifest",
 81 |         "website.test.openzim.org/internal_redirect_target.html",
 82 |         "www.example.com/",
 83 |     ]:
 84 |         assert zim_fh.get_content(expected_entry)
 85 | 
 86 | 
 87 | def test_user_agent():
 88 |     """Test that mobile user agent was used
 89 | 
 90 |     Check is done in WARC request records with custom Zimit and email suffix
 91 |     """
 92 | 
 93 |     found = False
 94 |     for warc in glob.glob("/output/.tmp*/collections/crawl-*/archive/*.warc.gz"):
 95 |         with open(warc, "rb") as fh:
 96 |             for record in ArchiveIterator(fh):
 97 |                 if record.rec_type == "request":
 98 |                     print(record.http_headers)  # noqa: T201
 99 |                     ua = record.http_headers.get_header("User-Agent")
100 |                     if ua:
101 |                         assert "Mozilla" in ua
102 |                         assert ua.endswith(" +Zimit test@example.com")
103 |                         found = True
104 | 
105 |     # should find at least one
106 |     assert found
107 | 
108 | 
109 | def test_stats_output_standard():
110 |     assert json.loads(Path("/output/crawl.json").read_bytes()) == {
111 |         "crawled": 17,
112 |         "pending": 0,
113 |         "pendingPages": [],
114 |         "total": 35,
115 |         "failed": 18,
116 |         "limit": {"max": 0, "hit": False},
117 |     }
118 | 
119 |     assert json.loads(Path("/output/warc2zim.json").read_bytes()) == {
120 |         "written": 8,
121 |         "total": 8,
122 |     }
123 | 
124 |     assert json.loads(Path("/output/stats.json").read_bytes()) == {
125 |         "done": 8,
126 |         "total": 8,
127 |         "partialZim": False,
128 |     }
129 | 
130 | 
131 | @pytest.mark.parametrize(
132 |     "filename",
133 |     [
134 |         pytest.param("/output/stats_sizesoftlimit.json", id="sizesoftlimit"),
135 |         pytest.param("/output/stats_timesoftlimit.json", id="timesoftlimit"),
136 |     ],
137 | )
138 | def test_stats_output_softlimit(filename):
139 |     file = Path(filename)
140 |     assert file.exists
141 |     content = json.loads(file.read_bytes())
142 |     assert "done" in content
143 |     assert "total" in content
144 |     assert "partialZim" in content
145 |     assert content["partialZim"]
146 | 


--------------------------------------------------------------------------------
/tests/test_dummy.py:
--------------------------------------------------------------------------------
1 | from zimit.zimit import NORMAL_WARC2ZIM_EXIT_CODE
2 | 
3 | 
4 | def test_something_exists():
5 |     """Dummy test, only here so that a coverage report gets produced"""
6 |     assert NORMAL_WARC2ZIM_EXIT_CODE
7 | 


--------------------------------------------------------------------------------