├── .github
├── FUNDING.yml
├── stale.yml
└── workflows
│ ├── DailyTests.yaml
│ ├── Publish.yml
│ ├── PublishDockerDevImage.yaml
│ ├── QA.yaml
│ ├── Tests.yaml
│ └── update-zim-offliner-definition.yaml
├── .gitignore
├── .pre-commit-config.yaml
├── CHANGELOG.md
├── Dockerfile
├── LICENSE
├── README.md
├── offliner-definition.json
├── pyproject.toml
├── src
└── zimit
│ ├── __about__.py
│ ├── constants.py
│ ├── utils.py
│ └── zimit.py
├── tasks.py
├── tests-daily
├── Dockerfile
└── daily.py
├── tests-integration
├── README.md
└── integration.py
└── tests
└── test_dummy.py
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 |
3 | github: kiwix # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
4 | patreon: # Replace with a single Patreon username
5 | open_collective: # Replace with a single Open Collective username
6 | ko_fi: # Replace with a single Ko-fi username
7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
9 | liberapay: # Replace with a single Liberapay username
10 | issuehunt: # Replace with a single IssueHunt username
11 | otechie: # Replace with a single Otechie username
12 | custom: # https://kiwix.org/support-us/
13 |
--------------------------------------------------------------------------------
/.github/stale.yml:
--------------------------------------------------------------------------------
1 | daysUntilClose: false
2 | staleLabel: stale
3 |
4 | issues:
5 | daysUntilStale: 60
6 | markComment: >
7 | This issue has been automatically marked as stale because it has not had
8 | recent activity. It will now be reviewed manually. Thank you
9 | for your contributions.
10 | pulls:
11 | daysUntilStale: 7
12 | markComment: >
13 | This pull request has been automatically marked as stale because it has not had
14 | recent activity. It will now be reviewed manually. Thank you
15 | for your contributions.
16 |
--------------------------------------------------------------------------------
/.github/workflows/DailyTests.yaml:
--------------------------------------------------------------------------------
1 | name: DailyTests
2 |
3 | on:
4 | schedule:
5 | - cron: "0 4 * * *"
6 | workflow_dispatch:
7 |
8 |
9 | jobs:
10 | run-daily-tests:
11 | runs-on: ubuntu-22.04
12 |
13 | steps:
14 | - name: checkout
15 | uses: actions/checkout@v4
16 |
17 | - name: build zimit image
18 | run: docker build -t local-zimit .
19 |
20 | - name: run crawl of test website
21 | run: docker run -v $PWD/output:/output local-zimit zimit --seeds https://website.test.openzim.org/ --name tests_eng_test-website --zim-file tests_eng_test-website.zim
22 |
23 | - name: archive ZIM
24 | uses: actions/upload-artifact@v4
25 | with:
26 | name: tests_eng_test-website.zim
27 | path: output/tests_eng_test-website.zim
28 | retention-days: 30
29 |
30 | - name: build tests-daily Docker image
31 | run: docker build -t local-tests-daily tests-daily
32 |
33 | - name: run integration test suite
34 | run: docker run -e SKIP_YOUTUBE_TEST="True" -v $PWD/tests-daily/daily.py:/app/daily.py -v $PWD/output:/output local-tests-daily bash -c "cd /app && pytest -v --log-level=INFO --log-format='%(levelname)s - %(message)s' daily.py"
35 |
--------------------------------------------------------------------------------
/.github/workflows/Publish.yml:
--------------------------------------------------------------------------------
1 | name: Publish released version
2 |
3 | on:
4 | release:
5 | types: [published]
6 |
7 | jobs:
8 | publish-amd64:
9 | runs-on: ubuntu-24.04
10 | name: "Publish for AMD64"
11 |
12 | steps:
13 | - uses: actions/checkout@v4
14 |
15 | - name: Build and push Docker image
16 | uses: openzim/docker-publish-action@v10
17 | with:
18 | image-name: openzim/zimit
19 | tag-pattern: /^v([0-9.]+)$/
20 | latest-on-tag: true
21 | restrict-to: openzim/zimit
22 | registries: ghcr.io
23 | credentials: |
24 | GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }}
25 | GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }}
26 | repo_description: auto
27 | repo_overview: auto
28 | platforms: |
29 | linux/amd64
30 |
31 | # Disabled for now, see https://github.com/openzim/zimit/issues/463
32 | # publish-arm64:
33 | # runs-on: ubuntu-24.04
34 | # name: "Publish for ARM64"
35 | #
36 | # steps:
37 | # - uses: actions/checkout@v4
38 | #
39 | # - name: Build and push Docker image
40 | # uses: openzim/docker-publish-action@v10
41 | # with:
42 | # image-name: openzim/zimit
43 | # tag-pattern: /^v([0-9.]+)$/
44 | # latest-on-tag: true
45 | # restrict-to: openzim/zimit
46 | # registries: ghcr.io
47 | # credentials: |
48 | # GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }}
49 | # GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }}
50 | # repo_description: auto
51 | # repo_overview: auto
52 | # platforms: |
53 | # linux/arm64
54 |
--------------------------------------------------------------------------------
/.github/workflows/PublishDockerDevImage.yaml:
--------------------------------------------------------------------------------
1 | name: Publish Docker dev image
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | workflow_dispatch:
8 |
9 | jobs:
10 | publish-amd64:
11 | runs-on: ubuntu-24.04
12 | name: "Publish for AMD64"
13 |
14 | steps:
15 | - uses: actions/checkout@v4
16 |
17 | - name: Build and push Docker image
18 | uses: openzim/docker-publish-action@v10
19 | with:
20 | image-name: openzim/zimit
21 | manual-tag: dev
22 | latest-on-tag: false
23 | restrict-to: openzim/zimit
24 | registries: ghcr.io
25 | credentials: |
26 | GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }}
27 | GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }}
28 | repo_description: auto
29 | repo_overview: auto
30 | platforms: |
31 | linux/amd64
32 |
33 | # Disabled for now, see https://github.com/openzim/zimit/issues/463
34 | # publish-arm64:
35 | # runs-on: ubuntu-24.04-arm
36 | # name: "Publish for ARM64"
37 | #
38 | # steps:
39 | # - uses: actions/checkout@v4
40 | #
41 | # - name: Build and push Docker image
42 | # uses: openzim/docker-publish-action@v10
43 | # with:
44 | # image-name: openzim/zimit
45 | # manual-tag: dev
46 | # latest-on-tag: false
47 | # restrict-to: openzim/zimit
48 | # registries: ghcr.io
49 | # credentials: |
50 | # GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }}
51 | # GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }}
52 | # repo_description: auto
53 | # repo_overview: auto
54 | # platforms: |
55 | # linux/arm64
56 |
--------------------------------------------------------------------------------
/.github/workflows/QA.yaml:
--------------------------------------------------------------------------------
1 | name: QA
2 |
3 | on:
4 | pull_request:
5 | push:
6 | branches:
7 | - main
8 |
9 | jobs:
10 | check-qa:
11 | runs-on: ubuntu-22.04
12 |
13 | steps:
14 | - uses: actions/checkout@v4
15 |
16 | - name: Set up Python
17 | uses: actions/setup-python@v5
18 | with:
19 | python-version-file: pyproject.toml
20 | architecture: x64
21 |
22 | - name: Install dependencies (and project)
23 | run: |
24 | pip install -U pip
25 | pip install -e .[lint,scripts,test,check]
26 |
27 | - name: Check black formatting
28 | run: inv lint-black
29 |
30 | - name: Check ruff
31 | run: inv lint-ruff
32 |
33 | - name: Check pyright
34 | run: inv check-pyright
35 |
--------------------------------------------------------------------------------
/.github/workflows/Tests.yaml:
--------------------------------------------------------------------------------
1 | name: Tests
2 |
3 | on:
4 | pull_request:
5 | push:
6 | branches:
7 | - main
8 |
9 | jobs:
10 | run-tests:
11 | runs-on: ubuntu-22.04
12 |
13 | steps:
14 | - uses: actions/checkout@v4
15 |
16 | - name: Set up Python
17 | uses: actions/setup-python@v5
18 | with:
19 | python-version-file: pyproject.toml
20 | architecture: x64
21 |
22 | - name: Install dependencies (and project)
23 | run: |
24 | pip install -U pip
25 | pip install -e .[test,scripts]
26 |
27 | - name: Run the tests
28 | run: inv coverage --args "-vvv"
29 |
30 | - name: Upload coverage report to codecov
31 | uses: codecov/codecov-action@v4
32 | with:
33 | token: ${{ secrets.CODECOV_TOKEN }}
34 |
35 | build_python:
36 | runs-on: ubuntu-22.04
37 | steps:
38 | - uses: actions/checkout@v4
39 |
40 | - name: Set up Python
41 | uses: actions/setup-python@v5
42 | with:
43 | python-version-file: pyproject.toml
44 | architecture: x64
45 |
46 | - name: Ensure we can build Python targets
47 | run: |
48 | pip install -U pip build
49 | python3 -m build --sdist --wheel
50 |
51 | # this job replaces the standard "build_docker" job since it builds the docker image
52 | run-integration-tests:
53 | runs-on: ubuntu-22.04
54 |
55 | steps:
56 | - name: checkout
57 | uses: actions/checkout@v4
58 |
59 | - name: build image
60 | run: docker build -t local-zimit .
61 |
62 | - name: ensure help display without issue
63 | run: docker run -v $PWD/output:/output local-zimit zimit --help
64 |
65 | - name: run crawl with soft size limit
66 | run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --sizeSoftLimit 8192 --name tests_en_sizesoftlimit --zim-file tests_en_sizesoftlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats_sizesoftlimit.json
67 |
68 | - name: run crawl with hard size limit
69 | run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --sizeHardLimit 8192 --name tests_en_sizehardlimit --zim-file tests_en_sizehardlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats_sizehardlimit.json || true
70 |
71 | - name: run crawl with soft time limit
72 | run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --timeSoftLimit 1 --name tests_en_timesoftlimit --zim-file tests_en_timesoftlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats_timesoftlimit.json
73 |
74 | - name: run crawl with hard time limit
75 | run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --timeHardLimit 1 --name tests_en_timehardlimit --zim-file tests_en_timehardlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats_timehardlimit.json || true
76 |
77 | - name: run standard crawl
78 | run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/http-return-codes.html --name tests_en_onepage --zim-file tests_en_onepage.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats.json --statsFilename /output/crawl.json --warc2zim-progress-file /output/warc2zim.json --keep
79 |
80 | - name: run integration test suite
81 | run: docker run -v $PWD/tests-integration/integration.py:/app/integration.py -v $PWD/output:/output local-zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v /app/integration.py"
82 |
--------------------------------------------------------------------------------
/.github/workflows/update-zim-offliner-definition.yaml:
--------------------------------------------------------------------------------
1 | name: Update ZIMFarm Definitions
2 |
3 | on:
4 | push:
5 | branches: [main]
6 | paths:
7 | - "offliner-definition.json"
8 | release:
9 | types: [published]
10 |
11 | workflow_dispatch:
12 | inputs:
13 | version:
14 | description: "Version to publish"
15 | required: false
16 | default: "dev"
17 |
18 | jobs:
19 | prepare-json:
20 | runs-on: ubuntu-24.04
21 | outputs:
22 | offliner_definition_b64: ${{ steps.read-json.outputs.offliner_definition_b64 }}
23 | steps:
24 | - name: Checkout repository
25 | uses: actions/checkout@v4
26 | with:
27 | fetch-depth: 0
28 |
29 | - id: read-json
30 | run: |
31 | if [ ! -f "offliner-definition.json" ]; then
32 | echo "File not found!" >&2
33 | exit 1
34 | fi
35 | json_b64=$(base64 -w0 <<< "$(jq -c . offliner-definition.json)")
36 | echo "offliner_definition_b64=$json_b64" >> $GITHUB_OUTPUT
37 | call-workflow:
38 | needs: prepare-json
39 | uses: openzim/overview/.github/workflows/update-zimfarm-offliner-definition.yaml@main
40 | with:
41 | version: ${{ github.event_name == 'release' && github.event.release.tag_name || (github.event.inputs.version || 'dev') }}
42 | offliner: zimit
43 | offliner_definition_b64: ${{ needs.prepare-json.outputs.offliner_definition_b64 }}
44 | secrets:
45 | zimfarm_ci_secret: ${{ secrets.ZIMFARM_CI_SECRET }}
46 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by https://www.toptal.com/developers/gitignore/api/linux,macos,python,visualstudiocode,intellij
2 | # Edit at https://www.toptal.com/developers/gitignore?templates=linux,macos,python,visualstudiocode,intellij
3 |
4 | ### Intellij ###
5 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
6 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
7 |
8 | # User-specific stuff
9 | .idea/**/workspace.xml
10 | .idea/**/tasks.xml
11 | .idea/**/usage.statistics.xml
12 | .idea/**/dictionaries
13 | .idea/**/shelf
14 |
15 | # AWS User-specific
16 | .idea/**/aws.xml
17 |
18 | # Generated files
19 | .idea/**/contentModel.xml
20 |
21 | # Sensitive or high-churn files
22 | .idea/**/dataSources/
23 | .idea/**/dataSources.ids
24 | .idea/**/dataSources.local.xml
25 | .idea/**/sqlDataSources.xml
26 | .idea/**/dynamic.xml
27 | .idea/**/uiDesigner.xml
28 | .idea/**/dbnavigator.xml
29 |
30 | # Gradle
31 | .idea/**/gradle.xml
32 | .idea/**/libraries
33 |
34 | # Gradle and Maven with auto-import
35 | # When using Gradle or Maven with auto-import, you should exclude module files,
36 | # since they will be recreated, and may cause churn. Uncomment if using
37 | # auto-import.
38 | # .idea/artifacts
39 | # .idea/compiler.xml
40 | # .idea/jarRepositories.xml
41 | # .idea/modules.xml
42 | # .idea/*.iml
43 | # .idea/modules
44 | # *.iml
45 | # *.ipr
46 |
47 | # CMake
48 | cmake-build-*/
49 |
50 | # Mongo Explorer plugin
51 | .idea/**/mongoSettings.xml
52 |
53 | # File-based project format
54 | *.iws
55 |
56 | # IntelliJ
57 | out/
58 |
59 | # mpeltonen/sbt-idea plugin
60 | .idea_modules/
61 |
62 | # JIRA plugin
63 | atlassian-ide-plugin.xml
64 |
65 | # Cursive Clojure plugin
66 | .idea/replstate.xml
67 |
68 | # SonarLint plugin
69 | .idea/sonarlint/
70 |
71 | # Crashlytics plugin (for Android Studio and IntelliJ)
72 | com_crashlytics_export_strings.xml
73 | crashlytics.properties
74 | crashlytics-build.properties
75 | fabric.properties
76 |
77 | # Editor-based Rest Client
78 | .idea/httpRequests
79 |
80 | # Android studio 3.1+ serialized cache file
81 | .idea/caches/build_file_checksums.ser
82 |
83 | ### Intellij Patch ###
84 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721
85 |
86 | # *.iml
87 | # modules.xml
88 | # .idea/misc.xml
89 | # *.ipr
90 |
91 | # Sonarlint plugin
92 | # https://plugins.jetbrains.com/plugin/7973-sonarlint
93 | .idea/**/sonarlint/
94 |
95 | # SonarQube Plugin
96 | # https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin
97 | .idea/**/sonarIssues.xml
98 |
99 | # Markdown Navigator plugin
100 | # https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced
101 | .idea/**/markdown-navigator.xml
102 | .idea/**/markdown-navigator-enh.xml
103 | .idea/**/markdown-navigator/
104 |
105 | # Cache file creation bug
106 | # See https://youtrack.jetbrains.com/issue/JBR-2257
107 | .idea/$CACHE_FILE$
108 |
109 | # CodeStream plugin
110 | # https://plugins.jetbrains.com/plugin/12206-codestream
111 | .idea/codestream.xml
112 |
113 | # Azure Toolkit for IntelliJ plugin
114 | # https://plugins.jetbrains.com/plugin/8053-azure-toolkit-for-intellij
115 | .idea/**/azureSettings.xml
116 |
117 | ### Linux ###
118 | *~
119 |
120 | # temporary files which can be created if a process still has a handle open of a deleted file
121 | .fuse_hidden*
122 |
123 | # KDE directory preferences
124 | .directory
125 |
126 | # Linux trash folder which might appear on any partition or disk
127 | .Trash-*
128 |
129 | # .nfs files are created when an open file is removed but is still being accessed
130 | .nfs*
131 |
132 | ### macOS ###
133 | # General
134 | .DS_Store
135 | .AppleDouble
136 | .LSOverride
137 |
138 | # Icon must end with two \r
139 | Icon
140 |
141 |
142 | # Thumbnails
143 | ._*
144 |
145 | # Files that might appear in the root of a volume
146 | .DocumentRevisions-V100
147 | .fseventsd
148 | .Spotlight-V100
149 | .TemporaryItems
150 | .Trashes
151 | .VolumeIcon.icns
152 | .com.apple.timemachine.donotpresent
153 |
154 | # Directories potentially created on remote AFP share
155 | .AppleDB
156 | .AppleDesktop
157 | Network Trash Folder
158 | Temporary Items
159 | .apdisk
160 |
161 | ### macOS Patch ###
162 | # iCloud generated files
163 | *.icloud
164 |
165 | ### Python ###
166 | # Byte-compiled / optimized / DLL files
167 | __pycache__/
168 | *.py[cod]
169 | *$py.class
170 |
171 | # C extensions
172 | *.so
173 |
174 | # Distribution / packaging
175 | .Python
176 | build/
177 | develop-eggs/
178 | dist/
179 | downloads/
180 | eggs/
181 | .eggs/
182 | lib/
183 | lib64/
184 | parts/
185 | sdist/
186 | var/
187 | wheels/
188 | share/python-wheels/
189 | *.egg-info/
190 | .installed.cfg
191 | *.egg
192 | MANIFEST
193 |
194 | # PyInstaller
195 | # Usually these files are written by a python script from a template
196 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
197 | *.manifest
198 | *.spec
199 |
200 | # Installer logs
201 | pip-log.txt
202 | pip-delete-this-directory.txt
203 |
204 | # Unit test / coverage reports
205 | htmlcov/
206 | .tox/
207 | .nox/
208 | .coverage
209 | .coverage.*
210 | .cache
211 | nosetests.xml
212 | coverage.xml
213 | *.cover
214 | *.py,cover
215 | .hypothesis/
216 | .pytest_cache/
217 | cover/
218 |
219 | # Translations
220 | *.mo
221 | *.pot
222 |
223 | # Django stuff:
224 | *.log
225 | local_settings.py
226 | db.sqlite3
227 | db.sqlite3-journal
228 |
229 | # Flask stuff:
230 | instance/
231 | .webassets-cache
232 |
233 | # Scrapy stuff:
234 | .scrapy
235 |
236 | # Sphinx documentation
237 | docs/_build/
238 |
239 | # PyBuilder
240 | .pybuilder/
241 | target/
242 |
243 | # Jupyter Notebook
244 | .ipynb_checkpoints
245 |
246 | # IPython
247 | profile_default/
248 | ipython_config.py
249 |
250 | # pyenv
251 | # For a library or package, you might want to ignore these files since the code is
252 | # intended to run in multiple environments; otherwise, check them in:
253 | # .python-version
254 |
255 | # pipenv
256 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
257 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
258 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
259 | # install all needed dependencies.
260 | #Pipfile.lock
261 |
262 | # poetry
263 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
264 | # This is especially recommended for binary packages to ensure reproducibility, and is more
265 | # commonly ignored for libraries.
266 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
267 | #poetry.lock
268 |
269 | # pdm
270 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
271 | #pdm.lock
272 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
273 | # in version control.
274 | # https://pdm.fming.dev/#use-with-ide
275 | .pdm.toml
276 |
277 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
278 | __pypackages__/
279 |
280 | # Celery stuff
281 | celerybeat-schedule
282 | celerybeat.pid
283 |
284 | # SageMath parsed files
285 | *.sage.py
286 |
287 | # Environments
288 | .env
289 | .venv
290 | env/
291 | venv/
292 | ENV/
293 | env.bak/
294 | venv.bak/
295 |
296 | # Spyder project settings
297 | .spyderproject
298 | .spyproject
299 |
300 | # Rope project settings
301 | .ropeproject
302 |
303 | # mkdocs documentation
304 | /site
305 |
306 | # mypy
307 | .mypy_cache/
308 | .dmypy.json
309 | dmypy.json
310 |
311 | # Pyre type checker
312 | .pyre/
313 |
314 | # pytype static type analyzer
315 | .pytype/
316 |
317 | # Cython debug symbols
318 | cython_debug/
319 |
320 | # PyCharm
321 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
322 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
323 | # and can be added to the global gitignore or merged into this file. For a more nuclear
324 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
325 | #.idea/
326 |
327 | ### Python Patch ###
328 | # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
329 | poetry.toml
330 |
331 | # ruff
332 | .ruff_cache/
333 |
334 | # LSP config files
335 | pyrightconfig.json
336 |
337 | ### VisualStudioCode ###
338 | .vscode/*
339 | !.vscode/settings.json
340 | !.vscode/tasks.json
341 | !.vscode/launch.json
342 | !.vscode/extensions.json
343 | !.vscode/*.code-snippets
344 |
345 | # Local History for Visual Studio Code
346 | .history/
347 |
348 | # Built Visual Studio Code Extensions
349 | *.vsix
350 |
351 | ### VisualStudioCode Patch ###
352 | # Ignore all local history of files
353 | .history
354 | .ionide
355 |
356 | # End of https://www.toptal.com/developers/gitignore/api/linux,macos,python,visualstudiocode,intellij
357 |
358 | # output dir
359 | output
360 |
361 | # ignore all vscode, this is editor-specific, not maintained by openzim
362 | .vscode
363 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | # See https://pre-commit.com for more information
2 | # See https://pre-commit.com/hooks.html for more hooks
3 | repos:
4 | - repo: https://github.com/pre-commit/pre-commit-hooks
5 | rev: v5.0.0
6 | hooks:
7 | - id: trailing-whitespace
8 | - id: end-of-file-fixer
9 | - repo: https://github.com/psf/black
10 | rev: "25.1.0"
11 | hooks:
12 | - id: black
13 | - repo: https://github.com/astral-sh/ruff-pre-commit
14 | rev: v0.9.4
15 | hooks:
16 | - id: ruff
17 | - repo: https://github.com/RobertCraigie/pyright-python
18 | rev: v1.1.393
19 | hooks:
20 | - id: pyright
21 | name: pyright (system)
22 | description: 'pyright static type checker'
23 | entry: pyright
24 | language: system
25 | 'types_or': [python, pyi]
26 | require_serial: true
27 | minimum_pre_commit_version: '2.9.2'
28 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | ## Changelog
2 |
3 | All notable changes to this project are documented in this file.
4 |
5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0).
7 |
8 | ## [Unreleased]
9 |
10 | ## [3.0.5] - 2025-04-11
11 |
12 | ### Changed
13 |
14 | - Upgrade to browsertrix crawler 1.6.0 (#493)
15 |
16 | ## [3.0.4] - 2025-04-04
17 |
18 | ### Changed
19 |
20 | - Upgrade to browsertrix crawler 1.5.10 (#491)
21 |
22 | ## [3.0.3] - 2025-02-28
23 |
24 | ### Changed
25 |
26 | - Upgrade to browsertrix crawler 1.5.7 (#483)
27 |
28 | ## [3.0.2] - 2025-02-27
29 |
30 | ### Changed
31 |
32 | - Upgrade to browsertrix crawler 1.5.6 (#482)
33 |
34 | ## [3.0.1] - 2025-02-24
35 |
36 | ### Changed
37 |
38 | - Upgrade to browsertrix crawler 1.5.4 (#476)
39 |
40 | ## [3.0.0] - 2025-02-17
41 |
42 | ### Changed
43 |
44 | - Change solution to report partial ZIM to the Zimfarm and other clients (#304)
45 | - Keep temporary folder when crawler or warc2zim fails, even if not asked for (#468)
46 | - Add many missing Browsertrix Crawler arguments ; drop default overrides by zimit ; drop `--noMobileDevice` setting (not needed anymore) (#433)
47 | - Document all Browsertrix Crawler default arguments values (#416)
48 | - Use preferred Browsertrix Crawler arguments names: (part of #471)
49 | - `--seeds` instead of `--url`
50 | - `--seedFile` instead of `--urlFile`
51 | - `--pageLimit` instead of `--limit`
52 | - `--pageLoadTimeout` instead of `--timeout`
53 | - `--scopeIncludeRx` instead of `--include`
54 | - `--scopeExcludeRx` instead of `--exclude`
55 | - `--pageExtraDelay` instead of `--delay`
56 | - Remove confusion between zimit, warc2zim and crawler stats filenames (part of #471)
57 | - `--statsFilename` is now the crawler stats file (since it is the same name, just like other arguments)
58 | - `--zimit-progress-file` is now the zimit stats location
59 | - `--warc2zim-progress-file` is the warc2zim stats location
60 | - all are optional values, if not set and needed temporary files are used
61 |
62 | ### Fixed
63 |
64 | - Do not create the ZIM when crawl is incomplete (#444)
65 |
66 | ## [2.1.8] - 2025-02-07
67 |
68 | ### Changed
69 |
70 | - Upgrade to browsertrix crawler 1.5.1, Python 3.13 and others (#462 + #464)
71 |
72 | ## [2.1.7] - 2025-01-10
73 |
74 | ### Changed
75 |
76 | - Upgrade to browsertrix crawler 1.4.2 (#450)
77 | - Upgrade to warc2zim 2.2.0
78 |
79 | ## [2.1.6] - 2024-11-07
80 |
81 | ### Changed
82 |
83 | - Upgrade to browsertrix crawler 1.3.5 (#426)
84 |
85 | ## [2.1.5] - 2024-11-01
86 |
87 | ### Changed
88 |
89 | - Upgrade to browsertrix crawler 1.3.4 and warc2zim 2.1.3 (#424)
90 |
91 | ## [2.1.4] - 2024-10-11
92 |
93 | ### Changed
94 |
95 | - Upgrade to browsertrix crawler 1.3.3 (#411)
96 |
97 | ## [2.1.3] - 2024-10-08
98 |
99 | ### Changed
100 |
101 | - Upgrade to browsertrix crawler 1.3.2, warc2zim 2.1.2 and other dependencies (#406)
102 |
103 | ### Fixed
104 |
105 | - Fix help (#393)
106 |
107 | ## [2.1.2] - 2024-09-09
108 |
109 | ### Changed
110 |
111 | - Upgrade to browsertrix crawler 1.3.0-beta.1 (#387) (fixes "Ziming a website with huge assets (e.g. PDFs) is failing to proceed" - #380)
112 |
113 | ## [2.1.1] - 2024-09-05
114 |
115 | ### Added
116 |
117 | - Add support for uncompressed tar archive in --warcs (#369)
118 |
119 | ### Changed
120 |
121 | - Upgrade to browsertrix crawler 1.3.0-beta.0 (#379), including upgrade to Ubuntu Noble (#307)
122 |
123 | ### Fixed
124 |
125 | - Stream files downloads to not exhaust memory (#373)
126 | - Fix documentation on `--diskUtilization` setting (#375)
127 |
128 | ## [2.1.0] - 2024-08-09
129 |
130 | ### Added
131 |
132 | - Add `--custom-behaviors` argument to support path/HTTP(S) URL custom behaviors to pass to the crawler (#313)
133 | - Add daily automated end-to-end tests of a page with Youtube player (#330)
134 | - Add `--warcs` option to directly process WARC files (#301)
135 |
136 | ### Changed
137 |
138 | - Make it clear that `--profile` argument can be an HTTP(S) URL (and not only a path) (#288)
139 | - Fix README imprecisions + add back warc2zim availability in docker image (#314)
140 | - Enhance integration test to assert final content of the ZIM (#287)
141 | - Stop fetching and passing browsertrix crawler version as scraperSuffix to warc2zim (#354)
142 | - Do not log number of WARC files found (#357)
143 | - Upgrade dependencies (warc2zim 2.1.0)
144 |
145 | ### Fixed
146 |
147 | - Sort WARC directories found by modification time (#366)
148 |
149 | ## [2.0.6] - 2024-08-02
150 |
151 | ### Changed
152 |
153 | - Upgraded Browsertrix Crawler to 1.2.6
154 |
155 | ## [2.0.5] - 2024-07-24
156 |
157 | ### Changed
158 |
159 | - Upgraded Browsertrix Crawler to 1.2.5
160 | - Upgraded warc2zim to 2.0.3
161 |
162 | ## [2.0.4] - 2024-07-15
163 |
164 | ### Changed
165 |
166 | - Upgraded Browsertrix Crawler to 1.2.4 (fixes retrieve automatically the assets present in a data-xxx tag #316)
167 |
168 | ## [2.0.3] - 2024-06-24
169 |
170 | ### Changed
171 |
172 | - Upgraded Browsertrix Crawler to 1.2.0 (fixes Youtube videos issue #323)
173 |
174 | ## [2.0.2] - 2024-06-18
175 |
176 | ### Changed
177 |
178 | - Upgrade dependencies (mainly warc2zim 2.0.2)
179 |
180 |
181 | ## [2.0.1] - 2024-06-13
182 |
183 | ### Changed
184 |
185 | - Upgrade dependencies (especially warc2zim 2.0.1 and browsertrix crawler 1.2.0-beta.0) (#318)
186 |
187 | ### Fixed
188 |
189 | - Crawler is not correctly checking disk size / usage (#305)
190 |
191 | ## [2.0.0] - 2024-06-04
192 |
193 | ### Added
194 |
195 | - New `--version` flag to display Zimit version (#234)
196 | - New `--logging` flag to adjust Browsertrix Crawler logging (#273)
197 | - Use new `--scraper-suffix` flag of warc2zim to enhance ZIM "Scraper" metadata (#275)
198 | - New `--noMobileDevice` CLI argument
199 | - Publish Docker image for `linux/arm64` (in addition to `linux/amd64`) (#178)
200 |
201 | ### Changed
202 |
203 | - **Use `warc2zim` version 2**, which works without Service Worker anymore (#193)
204 | - Upgraded Browsertrix Crawler to 1.1.3
205 | - Adopt Python bootstrap conventions
206 | - Upgrade to Python 3.12 + upgrade dependencies
207 | - Removed handling of redirects by zimit, they are handled by browsertrix crawler and detected properly by warc2zim (#284)
208 | - Drop initial check of URL in Python (#256)
209 | - `--userAgent` CLI argument overrides again the `--userAgentSuffix` and `--adminEmail` values
210 | - `--userAgent` CLI argument is not mandatory anymore
211 |
212 | ### Fixed
213 |
214 | - Fix support for Youtube videos (#291)
215 | - Fix crawler `--waitUntil` values (#289)
216 |
217 | ## [1.6.3] - 2024-01-18
218 |
219 | ### Changed
220 |
221 | - Adapt to new `warc2zim` code structure
222 | - Using browsertrix-crawler 0.12.4
223 | - Using warc2zim 1.5.5
224 |
225 | ### Added
226 |
227 | - New `--build` parameter (optional) to specify the directory holding Browsertrix files ; if not set, `--output`
228 | directory is used ; zimit creates one subdir of this folder per invocation to isolate datasets ; subdir is kept only
229 | if `--keep` is set.
230 |
231 | ### Fixed
232 |
233 | - `--collection` parameter was not working (#252)
234 |
235 | ## [1.6.2] - 2023-11-17
236 |
237 | ### Changed
238 |
239 | - Using browsertrix-crawler 0.12.3
240 |
241 | ### Fixed
242 |
243 | - Fix logic passing args to crawler to support value '0' (#245)
244 | - Fix documentation about Chrome and headless (#248)
245 |
246 | ## [1.6.1] - 2023-11-06
247 |
248 | ### Changed
249 |
250 | - Using browsertrix-crawler 0.12.1
251 |
252 | ## [1.6.0] - 2023-11-02
253 |
254 | ### Changed
255 |
256 | - Scraper fails for all HTTP error codes returned when checking URL at startup (#223)
257 | - User-Agent now has a default value (#228)
258 | - Manipulation of spaces with UA suffix and adminEmail has been modified
259 | - Same User-Agent is used for check_url (Python) and Browsertrix crawler (#227)
260 | - Using browsertrix-crawler 0.12.0
261 |
262 | ## [1.5.3] - 2023-10-02
263 |
264 | ### Changed
265 |
266 | - Using browsertrix-crawler 0.11.2
267 |
268 | ## [1.5.2] - 2023-09-19
269 |
270 | ### Changed
271 |
272 | - Using browsertrix-crawler 0.11.1
273 |
274 | ## [1.5.1] - 2023-09-18
275 |
276 | ### Changed
277 |
278 | - Using browsertrix-crawler 0.11.0
279 | - Scraper stat file is not created empty (#211)
280 | - Crawler statistics are not available anymore (#213)
281 | - Using warc2zim 1.5.4
282 |
283 | ## [1.5.0] - 2023-08-23
284 |
285 | ### Added
286 |
287 | - `--long-description` param
288 |
289 | ## [1.4.1] - 2023-08-23
290 |
291 | ### Changed
292 |
293 | - Using browsertrix-crawler 0.10.4
294 | - Using warc2zim 1.5.3
295 |
296 | ## [1.4.0] - 2023-08-02
297 |
298 | ### Added
299 |
300 | - `--title` to set ZIM title
301 | - `--description` to set ZIM description
302 | - New crawler options: `--maxPageLimit`, `--delay`, `--diskUtilization`
303 | - `--zim-lang` param to set warc2zim's `--lang` (ISO-639-3)
304 |
305 | ### Changed
306 |
307 | - Using browsertrix-crawler 0.10.2
308 | - Default and accepted values for `--waitUntil` from crawler's update
309 | - Using warc2zim 1.5.2
310 | - Disabled Chrome updates to prevent incidental inclusion of update data in WARC/ZIM (#172)
311 | - `--failOnFailedSeed` used unconditionally
312 | - `--lang` now passed to crawler (ISO-639-1)
313 |
314 | ### Removed
315 |
316 | - `--newContext` from crawler's update
317 |
318 | ## [1.3.1] - 2023-02-06
319 |
320 | ### Changed
321 |
322 | - Using browsertrix-crawler 0.8.0
323 | - Using warc2zim version 1.5.1 with wabac.js 2.15.2
324 |
325 | ## [1.3.0] - 2023-02-02
326 |
327 | ### Added
328 |
329 | - Initial url check normalizes homepage redirects to standard ports – 80/443 (#137)
330 |
331 | ### Changed
332 |
333 | - Using warc2zim version 1.5.0 with scope conflict fix and videos fix
334 | - Using browsertrix-crawler 0.8.0-beta.1
335 | - Fixed `--allowHashUrls` being a boolean param
336 | - Increased `check_url` timeout (12s to connect, 27s to read) instead of 10s
337 |
338 | ## [1.2.0] - 2022-06-21
339 |
340 | ### Added
341 |
342 | - `--urlFile` browsertrix crawler parameter
343 | - `--depth` browsertrix crawler parameter
344 | - `--extraHops` parameter
345 | - `--collection` browsertrix crawler parameter
346 | - `--allowHashUrls` browsertrix crawler parameter
347 | - `--userAgentSuffix` browsertrix crawler parameter
348 | - `--behaviors` parameter
349 | - `--behaviorTimeout` browsertrix crawler parameter
350 | - `--profile` browsertrix crawler parameter
351 | - `--sizeLimit` browsertrix crawler parameter
352 | - `--timeLimit` browsertrix crawler parameter
353 | - `--healthCheckPort` parameter
354 | - `--overwrite` parameter
355 |
356 | ### Changed
357 |
358 | - using browsertrix-crawler `0.6.0` and warc2zim `1.4.2`
359 | - default WARC location after crawl changed
360 | from `collections/capture-*/archive/` to `collections/crawl-*/archive/`
361 |
362 | ### Removed
363 |
364 | - `--scroll` browsertrix crawler parameter (see `--behaviors`)
365 | - `--scope` browsertrix crawler parameter (see `--scopeType`, `--include` and `--exclude`)
366 |
367 |
368 | ## [1.1.5]
369 |
370 | - using crawler 0.3.2 and warc2zim 1.3.6
371 |
372 | ## [1.1.4]
373 |
374 | - Defaults to `load,networkidle0` for waitUntil param (same as crawler)
375 | - Allows setting combinations of values for waitUntil param
376 | - Updated warc2zim to 1.3.5
377 | - Updated browsertrix-crawler to 0.3.1
378 | - Warc to zim now written to `{temp_root_dir}/collections/capture-*/archive/` where
379 | `capture-*` is dynamic and includes the datetime. (from browsertrix-crawler)
380 |
381 | ## [1.1.3]
382 |
383 | - allows same first-level-domain redirects
384 | - fixed redirects to URL in scope
385 | - updated crawler to 0.2.0
386 | - `statsFilename` now informs whether limit was hit or not
387 |
388 | ## [1.1.2]
389 |
390 | - added support for --custom-css
391 | - added domains block list (default)
392 |
393 | ## [1.1.1]
394 |
395 | - updated browsertrix-crawler to 0.1.4
396 | - autofetcher script to be injected by defaultDriver to capture srcsets + URLs in dynamically added stylesheets
397 |
398 | ## [1.0]
399 |
400 | - initial version using browsertrix-crawler:0.1.3 and warc2zim:1.3.3
401 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM webrecorder/browsertrix-crawler:1.6.0
2 | LABEL org.opencontainers.image.source=https://github.com/openzim/zimit
3 |
4 | # add deadsnakes ppa for latest Python on Ubuntu
5 | RUN add-apt-repository ppa:deadsnakes/ppa -y
6 |
7 | RUN apt-get update \
8 | && apt-get install -qqy --no-install-recommends \
9 | libmagic1 \
10 | python3.13-venv \
11 | && rm -rf /var/lib/apt/lists/* \
12 | # python setup (in venv not to conflict with browsertrix)
13 | && python3.13 -m venv /app/zimit \
14 | # placeholder (default output location)
15 | && mkdir -p /output \
16 | # disable chrome upgrade
17 | && printf "repo_add_once=\"false\"\nrepo_reenable_on_distupgrade=\"false\"\n" > /etc/default/google-chrome \
18 | # download list of bad domains to filter-out. intentionally run post-install \
19 | # so it's not cached in earlier layers (url stays same but content updated) \
20 | && mkdir -p /tmp/ads \
21 | && cd /tmp/ads \
22 | && curl -L -O https://hosts.anudeep.me/mirror/adservers.txt \
23 | && curl -L -O https://hosts.anudeep.me/mirror/CoinMiner.txt \
24 | && curl -L -O https://hosts.anudeep.me/mirror/facebook.txt \
25 | && cat ./*.txt > /etc/blocklist.txt \
26 | && rm ./*.txt \
27 | # entrypoint appends the blocklist to /etc/hosts at runtime, then execs the CMD
28 | && printf '#!/bin/sh\ncat /etc/blocklist.txt >> /etc/hosts\nexec "$@"' > /usr/local/bin/entrypoint.sh \
29 | && chmod +x /usr/local/bin/entrypoint.sh
30 |
31 | # Copy pyproject.toml and its dependencies
32 | COPY pyproject.toml README.md /src/
33 | COPY src/zimit/__about__.py /src/src/zimit/__about__.py
34 |
35 | # Install Python dependencies
36 | RUN . /app/zimit/bin/activate && python -m pip install --no-cache-dir /src
37 |
38 | # Copy code + associated artifacts
39 | COPY src /src/src
40 | COPY *.md /src/
41 |
42 | # Install + cleanup
43 | RUN . /app/zimit/bin/activate && python -m pip install --no-cache-dir /src \
44 | && ln -s /app/zimit/bin/zimit /usr/bin/zimit \
45 | && ln -s /app/zimit/bin/warc2zim /usr/bin/warc2zim \
46 | && chmod +x /usr/bin/zimit \
47 | && rm -rf /src
48 |
49 | ENTRYPOINT ["entrypoint.sh"]
50 | CMD ["zimit", "--help"]
50 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
4 | Copyright (C) 2007 Free Software Foundation, Inc.
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 | Preamble
9 |
10 | The GNU General Public License is a free, copyleft license for
11 | software and other kinds of works.
12 |
13 | The licenses for most software and other practical works are designed
14 | to take away your freedom to share and change the works. By contrast,
15 | the GNU General Public License is intended to guarantee your freedom to
16 | share and change all versions of a program--to make sure it remains free
17 | software for all its users. We, the Free Software Foundation, use the
18 | GNU General Public License for most of our software; it applies also to
19 | any other work released this way by its authors. You can apply it to
20 | your programs, too.
21 |
22 | When we speak of free software, we are referring to freedom, not
23 | price. Our General Public Licenses are designed to make sure that you
24 | have the freedom to distribute copies of free software (and charge for
25 | them if you wish), that you receive source code or can get it if you
26 | want it, that you can change the software or use pieces of it in new
27 | free programs, and that you know you can do these things.
28 |
29 | To protect your rights, we need to prevent others from denying you
30 | these rights or asking you to surrender the rights. Therefore, you have
31 | certain responsibilities if you distribute copies of the software, or if
32 | you modify it: responsibilities to respect the freedom of others.
33 |
34 | For example, if you distribute copies of such a program, whether
35 | gratis or for a fee, you must pass on to the recipients the same
36 | freedoms that you received. You must make sure that they, too, receive
37 | or can get the source code. And you must show them these terms so they
38 | know their rights.
39 |
40 | Developers that use the GNU GPL protect your rights with two steps:
41 | (1) assert copyright on the software, and (2) offer you this License
42 | giving you legal permission to copy, distribute and/or modify it.
43 |
44 | For the developers' and authors' protection, the GPL clearly explains
45 | that there is no warranty for this free software. For both users' and
46 | authors' sake, the GPL requires that modified versions be marked as
47 | changed, so that their problems will not be attributed erroneously to
48 | authors of previous versions.
49 |
50 | Some devices are designed to deny users access to install or run
51 | modified versions of the software inside them, although the manufacturer
52 | can do so. This is fundamentally incompatible with the aim of
53 | protecting users' freedom to change the software. The systematic
54 | pattern of such abuse occurs in the area of products for individuals to
55 | use, which is precisely where it is most unacceptable. Therefore, we
56 | have designed this version of the GPL to prohibit the practice for those
57 | products. If such problems arise substantially in other domains, we
58 | stand ready to extend this provision to those domains in future versions
59 | of the GPL, as needed to protect the freedom of users.
60 |
61 | Finally, every program is threatened constantly by software patents.
62 | States should not allow patents to restrict development and use of
63 | software on general-purpose computers, but in those that do, we wish to
64 | avoid the special danger that patents applied to a free program could
65 | make it effectively proprietary. To prevent this, the GPL assures that
66 | patents cannot be used to render the program non-free.
67 |
68 | The precise terms and conditions for copying, distribution and
69 | modification follow.
70 |
71 | TERMS AND CONDITIONS
72 |
73 | 0. Definitions.
74 |
75 | "This License" refers to version 3 of the GNU General Public License.
76 |
77 | "Copyright" also means copyright-like laws that apply to other kinds of
78 | works, such as semiconductor masks.
79 |
80 | "The Program" refers to any copyrightable work licensed under this
81 | License. Each licensee is addressed as "you". "Licensees" and
82 | "recipients" may be individuals or organizations.
83 |
84 | To "modify" a work means to copy from or adapt all or part of the work
85 | in a fashion requiring copyright permission, other than the making of an
86 | exact copy. The resulting work is called a "modified version" of the
87 | earlier work or a work "based on" the earlier work.
88 |
89 | A "covered work" means either the unmodified Program or a work based
90 | on the Program.
91 |
92 | To "propagate" a work means to do anything with it that, without
93 | permission, would make you directly or secondarily liable for
94 | infringement under applicable copyright law, except executing it on a
95 | computer or modifying a private copy. Propagation includes copying,
96 | distribution (with or without modification), making available to the
97 | public, and in some countries other activities as well.
98 |
99 | To "convey" a work means any kind of propagation that enables other
100 | parties to make or receive copies. Mere interaction with a user through
101 | a computer network, with no transfer of a copy, is not conveying.
102 |
103 | An interactive user interface displays "Appropriate Legal Notices"
104 | to the extent that it includes a convenient and prominently visible
105 | feature that (1) displays an appropriate copyright notice, and (2)
106 | tells the user that there is no warranty for the work (except to the
107 | extent that warranties are provided), that licensees may convey the
108 | work under this License, and how to view a copy of this License. If
109 | the interface presents a list of user commands or options, such as a
110 | menu, a prominent item in the list meets this criterion.
111 |
112 | 1. Source Code.
113 |
114 | The "source code" for a work means the preferred form of the work
115 | for making modifications to it. "Object code" means any non-source
116 | form of a work.
117 |
118 | A "Standard Interface" means an interface that either is an official
119 | standard defined by a recognized standards body, or, in the case of
120 | interfaces specified for a particular programming language, one that
121 | is widely used among developers working in that language.
122 |
123 | The "System Libraries" of an executable work include anything, other
124 | than the work as a whole, that (a) is included in the normal form of
125 | packaging a Major Component, but which is not part of that Major
126 | Component, and (b) serves only to enable use of the work with that
127 | Major Component, or to implement a Standard Interface for which an
128 | implementation is available to the public in source code form. A
129 | "Major Component", in this context, means a major essential component
130 | (kernel, window system, and so on) of the specific operating system
131 | (if any) on which the executable work runs, or a compiler used to
132 | produce the work, or an object code interpreter used to run it.
133 |
134 | The "Corresponding Source" for a work in object code form means all
135 | the source code needed to generate, install, and (for an executable
136 | work) run the object code and to modify the work, including scripts to
137 | control those activities. However, it does not include the work's
138 | System Libraries, or general-purpose tools or generally available free
139 | programs which are used unmodified in performing those activities but
140 | which are not part of the work. For example, Corresponding Source
141 | includes interface definition files associated with source files for
142 | the work, and the source code for shared libraries and dynamically
143 | linked subprograms that the work is specifically designed to require,
144 | such as by intimate data communication or control flow between those
145 | subprograms and other parts of the work.
146 |
147 | The Corresponding Source need not include anything that users
148 | can regenerate automatically from other parts of the Corresponding
149 | Source.
150 |
151 | The Corresponding Source for a work in source code form is that
152 | same work.
153 |
154 | 2. Basic Permissions.
155 |
156 | All rights granted under this License are granted for the term of
157 | copyright on the Program, and are irrevocable provided the stated
158 | conditions are met. This License explicitly affirms your unlimited
159 | permission to run the unmodified Program. The output from running a
160 | covered work is covered by this License only if the output, given its
161 | content, constitutes a covered work. This License acknowledges your
162 | rights of fair use or other equivalent, as provided by copyright law.
163 |
164 | You may make, run and propagate covered works that you do not
165 | convey, without conditions so long as your license otherwise remains
166 | in force. You may convey covered works to others for the sole purpose
167 | of having them make modifications exclusively for you, or provide you
168 | with facilities for running those works, provided that you comply with
169 | the terms of this License in conveying all material for which you do
170 | not control copyright. Those thus making or running the covered works
171 | for you must do so exclusively on your behalf, under your direction
172 | and control, on terms that prohibit them from making any copies of
173 | your copyrighted material outside their relationship with you.
174 |
175 | Conveying under any other circumstances is permitted solely under
176 | the conditions stated below. Sublicensing is not allowed; section 10
177 | makes it unnecessary.
178 |
179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
180 |
181 | No covered work shall be deemed part of an effective technological
182 | measure under any applicable law fulfilling obligations under article
183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or
184 | similar laws prohibiting or restricting circumvention of such
185 | measures.
186 |
187 | When you convey a covered work, you waive any legal power to forbid
188 | circumvention of technological measures to the extent such circumvention
189 | is effected by exercising rights under this License with respect to
190 | the covered work, and you disclaim any intention to limit operation or
191 | modification of the work as a means of enforcing, against the work's
192 | users, your or third parties' legal rights to forbid circumvention of
193 | technological measures.
194 |
195 | 4. Conveying Verbatim Copies.
196 |
197 | You may convey verbatim copies of the Program's source code as you
198 | receive it, in any medium, provided that you conspicuously and
199 | appropriately publish on each copy an appropriate copyright notice;
200 | keep intact all notices stating that this License and any
201 | non-permissive terms added in accord with section 7 apply to the code;
202 | keep intact all notices of the absence of any warranty; and give all
203 | recipients a copy of this License along with the Program.
204 |
205 | You may charge any price or no price for each copy that you convey,
206 | and you may offer support or warranty protection for a fee.
207 |
208 | 5. Conveying Modified Source Versions.
209 |
210 | You may convey a work based on the Program, or the modifications to
211 | produce it from the Program, in the form of source code under the
212 | terms of section 4, provided that you also meet all of these conditions:
213 |
214 | a) The work must carry prominent notices stating that you modified
215 | it, and giving a relevant date.
216 |
217 | b) The work must carry prominent notices stating that it is
218 | released under this License and any conditions added under section
219 | 7. This requirement modifies the requirement in section 4 to
220 | "keep intact all notices".
221 |
222 | c) You must license the entire work, as a whole, under this
223 | License to anyone who comes into possession of a copy. This
224 | License will therefore apply, along with any applicable section 7
225 | additional terms, to the whole of the work, and all its parts,
226 | regardless of how they are packaged. This License gives no
227 | permission to license the work in any other way, but it does not
228 | invalidate such permission if you have separately received it.
229 |
230 | d) If the work has interactive user interfaces, each must display
231 | Appropriate Legal Notices; however, if the Program has interactive
232 | interfaces that do not display Appropriate Legal Notices, your
233 | work need not make them do so.
234 |
235 | A compilation of a covered work with other separate and independent
236 | works, which are not by their nature extensions of the covered work,
237 | and which are not combined with it such as to form a larger program,
238 | in or on a volume of a storage or distribution medium, is called an
239 | "aggregate" if the compilation and its resulting copyright are not
240 | used to limit the access or legal rights of the compilation's users
241 | beyond what the individual works permit. Inclusion of a covered work
242 | in an aggregate does not cause this License to apply to the other
243 | parts of the aggregate.
244 |
245 | 6. Conveying Non-Source Forms.
246 |
247 | You may convey a covered work in object code form under the terms
248 | of sections 4 and 5, provided that you also convey the
249 | machine-readable Corresponding Source under the terms of this License,
250 | in one of these ways:
251 |
252 | a) Convey the object code in, or embodied in, a physical product
253 | (including a physical distribution medium), accompanied by the
254 | Corresponding Source fixed on a durable physical medium
255 | customarily used for software interchange.
256 |
257 | b) Convey the object code in, or embodied in, a physical product
258 | (including a physical distribution medium), accompanied by a
259 | written offer, valid for at least three years and valid for as
260 | long as you offer spare parts or customer support for that product
261 | model, to give anyone who possesses the object code either (1) a
262 | copy of the Corresponding Source for all the software in the
263 | product that is covered by this License, on a durable physical
264 | medium customarily used for software interchange, for a price no
265 | more than your reasonable cost of physically performing this
266 | conveying of source, or (2) access to copy the
267 | Corresponding Source from a network server at no charge.
268 |
269 | c) Convey individual copies of the object code with a copy of the
270 | written offer to provide the Corresponding Source. This
271 | alternative is allowed only occasionally and noncommercially, and
272 | only if you received the object code with such an offer, in accord
273 | with subsection 6b.
274 |
275 | d) Convey the object code by offering access from a designated
276 | place (gratis or for a charge), and offer equivalent access to the
277 | Corresponding Source in the same way through the same place at no
278 | further charge. You need not require recipients to copy the
279 | Corresponding Source along with the object code. If the place to
280 | copy the object code is a network server, the Corresponding Source
281 | may be on a different server (operated by you or a third party)
282 | that supports equivalent copying facilities, provided you maintain
283 | clear directions next to the object code saying where to find the
284 | Corresponding Source. Regardless of what server hosts the
285 | Corresponding Source, you remain obligated to ensure that it is
286 | available for as long as needed to satisfy these requirements.
287 |
288 | e) Convey the object code using peer-to-peer transmission, provided
289 | you inform other peers where the object code and Corresponding
290 | Source of the work are being offered to the general public at no
291 | charge under subsection 6d.
292 |
293 | A separable portion of the object code, whose source code is excluded
294 | from the Corresponding Source as a System Library, need not be
295 | included in conveying the object code work.
296 |
297 | A "User Product" is either (1) a "consumer product", which means any
298 | tangible personal property which is normally used for personal, family,
299 | or household purposes, or (2) anything designed or sold for incorporation
300 | into a dwelling. In determining whether a product is a consumer product,
301 | doubtful cases shall be resolved in favor of coverage. For a particular
302 | product received by a particular user, "normally used" refers to a
303 | typical or common use of that class of product, regardless of the status
304 | of the particular user or of the way in which the particular user
305 | actually uses, or expects or is expected to use, the product. A product
306 | is a consumer product regardless of whether the product has substantial
307 | commercial, industrial or non-consumer uses, unless such uses represent
308 | the only significant mode of use of the product.
309 |
310 | "Installation Information" for a User Product means any methods,
311 | procedures, authorization keys, or other information required to install
312 | and execute modified versions of a covered work in that User Product from
313 | a modified version of its Corresponding Source. The information must
314 | suffice to ensure that the continued functioning of the modified object
315 | code is in no case prevented or interfered with solely because
316 | modification has been made.
317 |
318 | If you convey an object code work under this section in, or with, or
319 | specifically for use in, a User Product, and the conveying occurs as
320 | part of a transaction in which the right of possession and use of the
321 | User Product is transferred to the recipient in perpetuity or for a
322 | fixed term (regardless of how the transaction is characterized), the
323 | Corresponding Source conveyed under this section must be accompanied
324 | by the Installation Information. But this requirement does not apply
325 | if neither you nor any third party retains the ability to install
326 | modified object code on the User Product (for example, the work has
327 | been installed in ROM).
328 |
329 | The requirement to provide Installation Information does not include a
330 | requirement to continue to provide support service, warranty, or updates
331 | for a work that has been modified or installed by the recipient, or for
332 | the User Product in which it has been modified or installed. Access to a
333 | network may be denied when the modification itself materially and
334 | adversely affects the operation of the network or violates the rules and
335 | protocols for communication across the network.
336 |
337 | Corresponding Source conveyed, and Installation Information provided,
338 | in accord with this section must be in a format that is publicly
339 | documented (and with an implementation available to the public in
340 | source code form), and must require no special password or key for
341 | unpacking, reading or copying.
342 |
343 | 7. Additional Terms.
344 |
345 | "Additional permissions" are terms that supplement the terms of this
346 | License by making exceptions from one or more of its conditions.
347 | Additional permissions that are applicable to the entire Program shall
348 | be treated as though they were included in this License, to the extent
349 | that they are valid under applicable law. If additional permissions
350 | apply only to part of the Program, that part may be used separately
351 | under those permissions, but the entire Program remains governed by
352 | this License without regard to the additional permissions.
353 |
354 | When you convey a copy of a covered work, you may at your option
355 | remove any additional permissions from that copy, or from any part of
356 | it. (Additional permissions may be written to require their own
357 | removal in certain cases when you modify the work.) You may place
358 | additional permissions on material, added by you to a covered work,
359 | for which you have or can give appropriate copyright permission.
360 |
361 | Notwithstanding any other provision of this License, for material you
362 | add to a covered work, you may (if authorized by the copyright holders of
363 | that material) supplement the terms of this License with terms:
364 |
365 | a) Disclaiming warranty or limiting liability differently from the
366 | terms of sections 15 and 16 of this License; or
367 |
368 | b) Requiring preservation of specified reasonable legal notices or
369 | author attributions in that material or in the Appropriate Legal
370 | Notices displayed by works containing it; or
371 |
372 | c) Prohibiting misrepresentation of the origin of that material, or
373 | requiring that modified versions of such material be marked in
374 | reasonable ways as different from the original version; or
375 |
376 | d) Limiting the use for publicity purposes of names of licensors or
377 | authors of the material; or
378 |
379 | e) Declining to grant rights under trademark law for use of some
380 | trade names, trademarks, or service marks; or
381 |
382 | f) Requiring indemnification of licensors and authors of that
383 | material by anyone who conveys the material (or modified versions of
384 | it) with contractual assumptions of liability to the recipient, for
385 | any liability that these contractual assumptions directly impose on
386 | those licensors and authors.
387 |
388 | All other non-permissive additional terms are considered "further
389 | restrictions" within the meaning of section 10. If the Program as you
390 | received it, or any part of it, contains a notice stating that it is
391 | governed by this License along with a term that is a further
392 | restriction, you may remove that term. If a license document contains
393 | a further restriction but permits relicensing or conveying under this
394 | License, you may add to a covered work material governed by the terms
395 | of that license document, provided that the further restriction does
396 | not survive such relicensing or conveying.
397 |
398 | If you add terms to a covered work in accord with this section, you
399 | must place, in the relevant source files, a statement of the
400 | additional terms that apply to those files, or a notice indicating
401 | where to find the applicable terms.
402 |
403 | Additional terms, permissive or non-permissive, may be stated in the
404 | form of a separately written license, or stated as exceptions;
405 | the above requirements apply either way.
406 |
407 | 8. Termination.
408 |
409 | You may not propagate or modify a covered work except as expressly
410 | provided under this License. Any attempt otherwise to propagate or
411 | modify it is void, and will automatically terminate your rights under
412 | this License (including any patent licenses granted under the third
413 | paragraph of section 11).
414 |
415 | However, if you cease all violation of this License, then your
416 | license from a particular copyright holder is reinstated (a)
417 | provisionally, unless and until the copyright holder explicitly and
418 | finally terminates your license, and (b) permanently, if the copyright
419 | holder fails to notify you of the violation by some reasonable means
420 | prior to 60 days after the cessation.
421 |
422 | Moreover, your license from a particular copyright holder is
423 | reinstated permanently if the copyright holder notifies you of the
424 | violation by some reasonable means, this is the first time you have
425 | received notice of violation of this License (for any work) from that
426 | copyright holder, and you cure the violation prior to 30 days after
427 | your receipt of the notice.
428 |
429 | Termination of your rights under this section does not terminate the
430 | licenses of parties who have received copies or rights from you under
431 | this License. If your rights have been terminated and not permanently
432 | reinstated, you do not qualify to receive new licenses for the same
433 | material under section 10.
434 |
435 | 9. Acceptance Not Required for Having Copies.
436 |
437 | You are not required to accept this License in order to receive or
438 | run a copy of the Program. Ancillary propagation of a covered work
439 | occurring solely as a consequence of using peer-to-peer transmission
440 | to receive a copy likewise does not require acceptance. However,
441 | nothing other than this License grants you permission to propagate or
442 | modify any covered work. These actions infringe copyright if you do
443 | not accept this License. Therefore, by modifying or propagating a
444 | covered work, you indicate your acceptance of this License to do so.
445 |
446 | 10. Automatic Licensing of Downstream Recipients.
447 |
448 | Each time you convey a covered work, the recipient automatically
449 | receives a license from the original licensors, to run, modify and
450 | propagate that work, subject to this License. You are not responsible
451 | for enforcing compliance by third parties with this License.
452 |
453 | An "entity transaction" is a transaction transferring control of an
454 | organization, or substantially all assets of one, or subdividing an
455 | organization, or merging organizations. If propagation of a covered
456 | work results from an entity transaction, each party to that
457 | transaction who receives a copy of the work also receives whatever
458 | licenses to the work the party's predecessor in interest had or could
459 | give under the previous paragraph, plus a right to possession of the
460 | Corresponding Source of the work from the predecessor in interest, if
461 | the predecessor has it or can get it with reasonable efforts.
462 |
463 | You may not impose any further restrictions on the exercise of the
464 | rights granted or affirmed under this License. For example, you may
465 | not impose a license fee, royalty, or other charge for exercise of
466 | rights granted under this License, and you may not initiate litigation
467 | (including a cross-claim or counterclaim in a lawsuit) alleging that
468 | any patent claim is infringed by making, using, selling, offering for
469 | sale, or importing the Program or any portion of it.
470 |
471 | 11. Patents.
472 |
473 | A "contributor" is a copyright holder who authorizes use under this
474 | License of the Program or a work on which the Program is based. The
475 | work thus licensed is called the contributor's "contributor version".
476 |
477 | A contributor's "essential patent claims" are all patent claims
478 | owned or controlled by the contributor, whether already acquired or
479 | hereafter acquired, that would be infringed by some manner, permitted
480 | by this License, of making, using, or selling its contributor version,
481 | but do not include claims that would be infringed only as a
482 | consequence of further modification of the contributor version. For
483 | purposes of this definition, "control" includes the right to grant
484 | patent sublicenses in a manner consistent with the requirements of
485 | this License.
486 |
487 | Each contributor grants you a non-exclusive, worldwide, royalty-free
488 | patent license under the contributor's essential patent claims, to
489 | make, use, sell, offer for sale, import and otherwise run, modify and
490 | propagate the contents of its contributor version.
491 |
492 | In the following three paragraphs, a "patent license" is any express
493 | agreement or commitment, however denominated, not to enforce a patent
494 | (such as an express permission to practice a patent or covenant not to
495 | sue for patent infringement). To "grant" such a patent license to a
496 | party means to make such an agreement or commitment not to enforce a
497 | patent against the party.
498 |
499 | If you convey a covered work, knowingly relying on a patent license,
500 | and the Corresponding Source of the work is not available for anyone
501 | to copy, free of charge and under the terms of this License, through a
502 | publicly available network server or other readily accessible means,
503 | then you must either (1) cause the Corresponding Source to be so
504 | available, or (2) arrange to deprive yourself of the benefit of the
505 | patent license for this particular work, or (3) arrange, in a manner
506 | consistent with the requirements of this License, to extend the patent
507 | license to downstream recipients. "Knowingly relying" means you have
508 | actual knowledge that, but for the patent license, your conveying the
509 | covered work in a country, or your recipient's use of the covered work
510 | in a country, would infringe one or more identifiable patents in that
511 | country that you have reason to believe are valid.
512 |
513 | If, pursuant to or in connection with a single transaction or
514 | arrangement, you convey, or propagate by procuring conveyance of, a
515 | covered work, and grant a patent license to some of the parties
516 | receiving the covered work authorizing them to use, propagate, modify
517 | or convey a specific copy of the covered work, then the patent license
518 | you grant is automatically extended to all recipients of the covered
519 | work and works based on it.
520 |
521 | A patent license is "discriminatory" if it does not include within
522 | the scope of its coverage, prohibits the exercise of, or is
523 | conditioned on the non-exercise of one or more of the rights that are
524 | specifically granted under this License. You may not convey a covered
525 | work if you are a party to an arrangement with a third party that is
526 | in the business of distributing software, under which you make payment
527 | to the third party based on the extent of your activity of conveying
528 | the work, and under which the third party grants, to any of the
529 | parties who would receive the covered work from you, a discriminatory
530 | patent license (a) in connection with copies of the covered work
531 | conveyed by you (or copies made from those copies), or (b) primarily
532 | for and in connection with specific products or compilations that
533 | contain the covered work, unless you entered into that arrangement,
534 | or that patent license was granted, prior to 28 March 2007.
535 |
536 | Nothing in this License shall be construed as excluding or limiting
537 | any implied license or other defenses to infringement that may
538 | otherwise be available to you under applicable patent law.
539 |
540 | 12. No Surrender of Others' Freedom.
541 |
542 | If conditions are imposed on you (whether by court order, agreement or
543 | otherwise) that contradict the conditions of this License, they do not
544 | excuse you from the conditions of this License. If you cannot convey a
545 | covered work so as to satisfy simultaneously your obligations under this
546 | License and any other pertinent obligations, then as a consequence you may
547 | not convey it at all. For example, if you agree to terms that obligate you
548 | to collect a royalty for further conveying from those to whom you convey
549 | the Program, the only way you could satisfy both those terms and this
550 | License would be to refrain entirely from conveying the Program.
551 |
552 | 13. Use with the GNU Affero General Public License.
553 |
554 | Notwithstanding any other provision of this License, you have
555 | permission to link or combine any covered work with a work licensed
556 | under version 3 of the GNU Affero General Public License into a single
557 | combined work, and to convey the resulting work. The terms of this
558 | License will continue to apply to the part which is the covered work,
559 | but the special requirements of the GNU Affero General Public License,
560 | section 13, concerning interaction through a network will apply to the
561 | combination as such.
562 |
563 | 14. Revised Versions of this License.
564 |
565 | The Free Software Foundation may publish revised and/or new versions of
566 | the GNU General Public License from time to time. Such new versions will
567 | be similar in spirit to the present version, but may differ in detail to
568 | address new problems or concerns.
569 |
570 | Each version is given a distinguishing version number. If the
571 | Program specifies that a certain numbered version of the GNU General
572 | Public License "or any later version" applies to it, you have the
573 | option of following the terms and conditions either of that numbered
574 | version or of any later version published by the Free Software
575 | Foundation. If the Program does not specify a version number of the
576 | GNU General Public License, you may choose any version ever published
577 | by the Free Software Foundation.
578 |
579 | If the Program specifies that a proxy can decide which future
580 | versions of the GNU General Public License can be used, that proxy's
581 | public statement of acceptance of a version permanently authorizes you
582 | to choose that version for the Program.
583 |
584 | Later license versions may give you additional or different
585 | permissions. However, no additional obligations are imposed on any
586 | author or copyright holder as a result of your choosing to follow a
587 | later version.
588 |
589 | 15. Disclaimer of Warranty.
590 |
591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
599 |
600 | 16. Limitation of Liability.
601 |
602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
610 | SUCH DAMAGES.
611 |
612 | 17. Interpretation of Sections 15 and 16.
613 |
614 | If the disclaimer of warranty and limitation of liability provided
615 | above cannot be given local legal effect according to their terms,
616 | reviewing courts shall apply local law that most closely approximates
617 | an absolute waiver of all civil liability in connection with the
618 | Program, unless a warranty or assumption of liability accompanies a
619 | copy of the Program in return for a fee.
620 |
621 | END OF TERMS AND CONDITIONS
622 |
623 | How to Apply These Terms to Your New Programs
624 |
625 | If you develop a new program, and you want it to be of the greatest
626 | possible use to the public, the best way to achieve this is to make it
627 | free software which everyone can redistribute and change under these terms.
628 |
629 | To do so, attach the following notices to the program. It is safest
630 | to attach them to the start of each source file to most effectively
631 | state the exclusion of warranty; and each file should have at least
632 | the "copyright" line and a pointer to where the full notice is found.
633 |
634 |
635 |     Copyright (C) <year>  <name of author>
636 |
637 | This program is free software: you can redistribute it and/or modify
638 | it under the terms of the GNU General Public License as published by
639 | the Free Software Foundation, either version 3 of the License, or
640 | (at your option) any later version.
641 |
642 | This program is distributed in the hope that it will be useful,
643 | but WITHOUT ANY WARRANTY; without even the implied warranty of
644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
645 | GNU General Public License for more details.
646 |
647 | You should have received a copy of the GNU General Public License
648 |     along with this program.  If not, see <https://www.gnu.org/licenses/>.
649 |
650 | Also add information on how to contact you by electronic and paper mail.
651 |
652 | If the program does terminal interaction, make it output a short
653 | notice like this when it starts in an interactive mode:
654 |
655 |     <program>  Copyright (C) <year>  <name of author>
656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
657 | This is free software, and you are welcome to redistribute it
658 | under certain conditions; type `show c' for details.
659 |
660 | The hypothetical commands `show w' and `show c' should show the appropriate
661 | parts of the General Public License. Of course, your program's commands
662 | might be different; for a GUI interface, you would use an "about box".
663 |
664 | You should also get your employer (if you work as a programmer) or school,
665 | if any, to sign a "copyright disclaimer" for the program, if necessary.
666 | For more information on this, and how to apply and follow the GNU GPL, see
667 | <https://www.gnu.org/licenses/>.
668 |
669 | The GNU General Public License does not permit incorporating your program
670 | into proprietary programs. If your program is a subroutine library, you
671 | may consider it more useful to permit linking proprietary applications with
672 | the library. If this is what you want to do, use the GNU Lesser General
673 | Public License instead of this License. But first, please read
674 | <https://www.gnu.org/licenses/why-not-lgpl.html>.
675 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Zimit
2 | =====
3 |
4 | Zimit is a scraper that allows you to create a [ZIM file](https://en.wikipedia.org/wiki/ZIM_(file_format)) from any website.
5 |
6 | [](https://www.codefactor.io/repository/github/openzim/zimit)
7 | [](https://www.gnu.org/licenses/gpl-3.0)
8 | [](https://ghcr.io/openzim/zimit)
9 |
10 | Zimit adheres to openZIM's [Contribution Guidelines](https://github.com/openzim/overview/wiki/Contributing).
11 |
12 | Zimit has implemented openZIM's [Python bootstrap, conventions and policies](https://github.com/openzim/_python-bootstrap/blob/main/docs/Policy.md) **v1.0.1**.
13 |
14 | Capabilities and known limitations
15 | --------------------
16 |
17 | While we would like to support as many websites as possible, making an offline archive of any website with a versatile tool obviously has some limitations.
18 |
19 | Most capabilities and known limitations are documented in [warc2zim README](https://github.com/openzim/warc2zim/blob/main/README.md). There are also some limitations in Browsertrix Crawler (used to fetch the website) and wombat (used to properly replay dynamic web requests), but these are not (yet?) clearly documented.
20 |
21 | Technical background
22 | --------------------
23 |
24 | Zimit runs a fully automated browser-based crawl of a website property and produces a ZIM of the crawled content. Zimit runs in a Docker container.
25 |
26 | The system:
27 | - runs a website crawl with [Browsertrix Crawler](https://github.com/webrecorder/browsertrix-crawler), which produces WARC files
28 | - converts the crawled WARC files to a single ZIM using [warc2zim](https://github.com/openzim/warc2zim)
29 |
30 | The `zimit.py` is the entrypoint for the system.
31 |
32 | After the crawl is done, warc2zim is used to write a ZIM to the `/output` directory, which should be mounted as a volume so as not to lose the ZIM created when the container stops.
33 |
34 | Using the `--keep` flag, the crawled WARCs and a few other artifacts will also be kept in a temp directory inside `/output`
35 |
36 | Usage
37 | -----
38 |
39 | `zimit` is intended to be run in Docker. Docker image is published at https://github.com/orgs/openzim/packages/container/package/zimit.
40 |
41 | The image accepts the following parameters, **as well as any of the [Browsertrix crawler](https://crawler.docs.browsertrix.com/user-guide/cli-options/) and [warc2zim](https://github.com/openzim/warc2zim) ones**:
42 |
43 | - Required: `--seeds URL` - the url to start crawling from ; multiple URLs can be separated by a comma (even if **usually not needed**, these are just the **seeds** of the crawl) ; first seed URL is used as ZIM homepage
44 | - Required: `--name` - Name of ZIM file
45 | - `--output` - output directory (defaults to `/output`)
46 | - `--pageLimit U` - Limit capture to at most U URLs
47 | - `--scopeExcludeRx ` - skip URLs that match the regex from crawling. Can be specified multiple times. An example is `--scopeExcludeRx="(\?q=|signup-landing\?|\?cid=)"`, where URLs that contain either `?q=` or `signup-landing?` or `?cid=` will be excluded.
48 | - `--workers N` - number of crawl workers to be run in parallel
49 | - `--waitUntil` - Puppeteer setting for how long to wait for page load. See [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options). The default is `load`, but for static sites, `--waitUntil domcontentloaded` may be used to speed up the crawl (to avoid waiting for ads to load for example).
50 | - `--keep` - in case of failure, WARC files and other temporary files (which are stored as a subfolder of output directory) are always kept, otherwise they are automatically deleted. Use this flag to always keep WARC files, even in case of success.
51 |
52 | Example command:
53 |
54 | ```bash
55 | docker run ghcr.io/openzim/zimit zimit --help
56 | docker run ghcr.io/openzim/zimit warc2zim --help
57 | docker run -v /output:/output ghcr.io/openzim/zimit zimit --seeds URL --name myzimfile
58 | ```
59 |
60 | **Note**: Image automatically filters out a large number of ads by using the 3 blocklists from [anudeepND](https://github.com/anudeepND/blacklist). If you don't want this filtering, disable the image's entrypoint in your container (`docker run --entrypoint="" ghcr.io/openzim/zimit ...`).
61 |
62 | To re-build the Docker image locally run:
63 |
64 | ```bash
65 | docker build -t ghcr.io/openzim/zimit .
66 | ```
67 |
68 | FAQ
69 | ---
70 |
71 | The Zimit contributor's team maintains [a page with most Frequently Asked Questions](https://github.com/openzim/zimit/wiki/Frequently-Asked-Questions).
72 |
73 | Nota bene
74 | ---------
75 |
76 | While Zimit 1.x relied on a Service Worker to display the ZIM content, this is not anymore the case
77 | since Zimit 2.x which does not have any special requirements anymore.
78 |
79 | It should also be noted that a first version of a generic HTTP scraper was created in 2016 during
80 | the [Wikimania Esino Lario
81 | Hackathon](https://wikimania2016.wikimedia.org/wiki/Programme/Kiwix-dedicated_Hackathon).
82 |
83 | That version is now considered outdated and [archived in `2016`
84 | branch](https://github.com/openzim/zimit/tree/2016).
85 |
86 | License
87 | -------
88 |
89 | [GPLv3](https://www.gnu.org/licenses/gpl-3.0) or later, see
90 | [LICENSE](LICENSE) for more details.
91 |
--------------------------------------------------------------------------------
/offliner-definition.json:
--------------------------------------------------------------------------------
1 | {
2 | "offliner_id": "zimit",
3 | "stdOutput": true,
4 | "stdStats": "zimit-progress-file",
5 | "flags": {
6 | "seeds": {
7 | "type": "string",
8 | "required": false,
9 | "title": "Seeds",
10 |       "description": "The seed URL(s) to start crawling from. Multiple seed URLs must be separated by a comma (usually not needed, these are just the crawl seeds). First seed URL is used as ZIM homepage"
11 | },
12 | "seed_file": {
13 | "type": "string",
14 | "required": false,
15 | "title": "Seed File",
16 | "description": "If set, read a list of seed urls, one per line. HTTPS URL to an online file."
17 | },
18 | "lang": {
19 | "type": "string",
20 | "required": false,
21 | "title": "Browser Language",
22 | "description": "If set, sets the language used by the browser, should be ISO 639 language[-country] code, e.g. `en` or `en-GB`"
23 | },
24 | "title": {
25 | "type": "string",
26 | "required": false,
27 | "title": "Title",
28 | "description": "Custom title for your ZIM. Defaults to title of main page",
29 | "minLength": 1,
30 | "maxLength": 30
31 | },
32 | "description": {
33 | "type": "string",
34 | "required": false,
35 | "title": "Description",
36 | "description": "Description for ZIM",
37 | "minLength": 1,
38 | "maxLength": 80
39 | },
40 | "favicon": {
41 | "type": "url",
42 | "required": false,
43 | "title": "Illustration",
44 | "description": "URL for Illustration. "
45 | },
46 | "tags": {
47 | "type": "string",
48 | "required": false,
49 | "title": "ZIM Tags",
50 | "description": "Single string with individual tags separated by a semicolon."
51 | },
52 | "creator": {
53 | "type": "string",
54 | "required": false,
55 | "title": "Creator",
56 | "description": "Name of content creator"
57 | },
58 | "publisher": {
59 | "type": "string",
60 | "required": false,
61 | "title": "Publisher",
62 | "isPublisher": true,
63 | "description": "Custom publisher name (ZIM metadata). openZIM otherwise"
64 | },
65 | "source": {
66 | "type": "string",
67 | "required": false,
68 | "title": "Source",
69 | "description": "Source name/URL of content"
70 | },
71 | "workers": {
72 | "type": "integer",
73 | "required": false,
74 | "title": "Workers",
75 | "description": "The number of workers to run in parallel. Defaults to 1",
76 | "min": 1
77 | },
78 | "wait_until": {
79 | "type": "string",
80 | "required": false,
81 | "title": "WaitUntil",
82 | "description": "Puppeteer page.goto() condition to wait for before continuing. One of load, domcontentloaded, networkidle0 or networkidle2, or a comma-separated combination of those. Default is load,networkidle2"
83 | },
84 | "extra_hops": {
85 | "type": "integer",
86 | "required": false,
87 | "title": "Extra Hops",
88 | "description": "Number of extra 'hops' to follow, beyond the current scope. Default is 0",
89 | "min": 0
90 | },
91 | "page_limit": {
92 | "type": "integer",
93 | "required": false,
94 | "title": "Page Limit",
95 | "description": "Limit crawl to this number of pages. Default is 0 (no-limit).",
96 | "min": 0
97 | },
98 | "max_page_limit": {
99 | "type": "integer",
100 | "required": false,
101 | "title": "Max Page Limit",
102 | "description": "Maximum pages to crawl, overriding pageLimit if both are set. Default is 0 (no-limit)",
103 | "min": 0
104 | },
105 | "page_load_timeout": {
106 | "type": "integer",
107 | "required": false,
108 | "title": "Page Load Timeout",
109 | "description": "Timeout for each page to load (in seconds). Default is 90",
110 | "min": 0
111 | },
112 | "scope_type": {
113 | "type": "string-enum",
114 | "required": false,
115 | "title": "Scope Type",
116 |       "description": "A predefined scope of the crawl. For more customization, use 'custom' and set scopeIncludeRx/scopeExcludeRx regexes. Default is custom if scopeIncludeRx is set, prefix otherwise.",
117 | "choices": [
118 | {
119 | "title": "Page",
120 | "value": "page"
121 | },
122 | {
123 | "title": "Page SPA",
124 | "value": "page-spa"
125 | },
126 | {
127 | "title": "Prefix",
128 | "value": "prefix"
129 | },
130 | {
131 | "title": "Host",
132 | "value": "host"
133 | },
134 | {
135 | "title": "Domain",
136 | "value": "domain"
137 | },
138 | {
139 | "title": "Any",
140 | "value": "any"
141 | },
142 | {
143 | "title": "Custom",
144 | "value": "custom"
145 | }
146 | ]
147 | },
148 | "scope_include_rx": {
149 | "type": "string",
150 | "required": false,
151 | "title": "Scope Include Regex",
152 | "description": "Regex of page URLs that should be included in the crawl (defaults to the immediate directory of seed)"
153 | },
154 | "scope_exclude_rx": {
155 | "type": "string",
156 | "required": false,
157 | "title": "Scope Exclude Regex",
158 | "description": "Regex of page URLs that should be excluded from the crawl"
159 | },
160 | "allow_hash_urls": {
161 | "type": "boolean",
162 | "required": false,
163 | "title": "Allow Hashtag URLs",
164 | "description": "Allow Hashtag URLs, useful for single-page-application crawling or when different hashtags load dynamic content"
165 | },
166 | "mobile_device": {
167 | "type": "string-enum",
168 | "required": false,
169 | "title": "As device",
170 |       "description": "Device to crawl as. See Puppeteer's Device.ts for a list",
171 | "choices": [
172 | {
173 | "title": "Blackberry Playbook",
174 | "value": "Blackberry PlayBook"
175 | },
176 | {
177 | "title": "Blackberry Playbook Landscape",
178 | "value": "Blackberry PlayBook landscape"
179 | },
180 | {
181 | "title": "Blackberry Z30",
182 | "value": "BlackBerry Z30"
183 | },
184 | {
185 | "title": "Blackberry Z30 Landscape",
186 | "value": "BlackBerry Z30 landscape"
187 | },
188 | {
189 | "title": "Galaxy Note 3",
190 | "value": "Galaxy Note 3"
191 | },
192 | {
193 | "title": "Galaxy Note 3 Landscape",
194 | "value": "Galaxy Note 3 landscape"
195 | },
196 | {
197 | "title": "Galaxy Note II",
198 | "value": "Galaxy Note II"
199 | },
200 | {
201 | "title": "Galaxy Note II Landscape",
202 | "value": "Galaxy Note II landscape"
203 | },
204 | {
205 | "title": "Galaxy S III",
206 | "value": "Galaxy S III"
207 | },
208 | {
209 | "title": "Galaxy S III Landscape",
210 | "value": "Galaxy S III landscape"
211 | },
212 | {
213 | "title": "Galaxy S5",
214 | "value": "Galaxy S5"
215 | },
216 | {
217 | "title": "Galaxy S5 Landscape",
218 | "value": "Galaxy S5 landscape"
219 | },
220 | {
221 | "title": "Galaxy S8",
222 | "value": "Galaxy S8"
223 | },
224 | {
225 | "title": "Galaxy S8 Landscape",
226 | "value": "Galaxy S8 landscape"
227 | },
228 | {
229 | "title": "Galaxy S9 Plus",
230 | "value": "Galaxy S9+"
231 | },
232 | {
233 | "title": "Galaxy S9 Plus Landscape",
234 | "value": "Galaxy S9+ landscape"
235 | },
236 | {
237 | "title": "Galaxy Tab S4",
238 | "value": "Galaxy Tab S4"
239 | },
240 | {
241 | "title": "Galaxy Tab S4 Landscape",
242 | "value": "Galaxy Tab S4 landscape"
243 | },
244 | {
245 | "title": "iPad",
246 | "value": "iPad"
247 | },
248 | {
249 | "title": "iPad Landscape",
250 | "value": "iPad landscape"
251 | },
252 | {
253 | "title": "iPad Gen 6",
254 | "value": "iPad (gen 6)"
255 | },
256 | {
257 | "title": "iPad Gen 6 Landscape",
258 | "value": "iPad (gen 6) landscape"
259 | },
260 | {
261 | "title": "iPad Gen 7",
262 | "value": "iPad (gen 7)"
263 | },
264 | {
265 | "title": "iPad Gen 7 Landscape",
266 | "value": "iPad (gen 7) landscape"
267 | },
268 | {
269 | "title": "iPad Mini",
270 | "value": "iPad Mini"
271 | },
272 | {
273 | "title": "iPad Mini Landscape",
274 | "value": "iPad Mini landscape"
275 | },
276 | {
277 | "title": "iPad Pro",
278 | "value": "iPad Pro"
279 | },
280 | {
281 | "title": "iPad Pro Landscape",
282 | "value": "iPad Pro landscape"
283 | },
284 | {
285 | "title": "iPad Pro 11",
286 | "value": "iPad Pro 11"
287 | },
288 | {
289 | "title": "iPad Pro 11 Landscape",
290 | "value": "iPad Pro 11 landscape"
291 | },
292 | {
293 | "title": "iPhone 4",
294 | "value": "iPhone 4"
295 | },
296 | {
297 | "title": "iPhone 4 Landscape",
298 | "value": "iPhone 4 landscape"
299 | },
300 | {
301 | "title": "iPhone 5",
302 | "value": "iPhone 5"
303 | },
304 | {
305 | "title": "iPhone 5 Landscape",
306 | "value": "iPhone 5 landscape"
307 | },
308 | {
309 | "title": "iPhone 6",
310 | "value": "iPhone 6"
311 | },
312 | {
313 | "title": "iPhone 6 Landscape",
314 | "value": "iPhone 6 landscape"
315 | },
316 | {
317 | "title": "iPhone 6 Plus",
318 | "value": "iPhone 6 Plus"
319 | },
320 | {
321 | "title": "iPhone 6 Plus Landscape",
322 | "value": "iPhone 6 Plus landscape"
323 | },
324 | {
325 | "title": "iPhone 7",
326 | "value": "iPhone 7"
327 | },
328 | {
329 | "title": "iPhone 7 Landscape",
330 | "value": "iPhone 7 landscape"
331 | },
332 | {
333 | "title": "iPhone 7 Plus",
334 | "value": "iPhone 7 Plus"
335 | },
336 | {
337 | "title": "iPhone 7 Plus Landscape",
338 | "value": "iPhone 7 Plus landscape"
339 | },
340 | {
341 | "title": "iPhone 8",
342 | "value": "iPhone 8"
343 | },
344 | {
345 | "title": "iPhone 8 Landscape",
346 | "value": "iPhone 8 landscape"
347 | },
348 | {
349 | "title": "iPhone 8 Plus",
350 | "value": "iPhone 8 Plus"
351 | },
352 | {
353 | "title": "iPhone 8 Plus Landscape",
354 | "value": "iPhone 8 Plus landscape"
355 | },
356 | {
357 | "title": "iPhone SE",
358 | "value": "iPhone SE"
359 | },
360 | {
361 | "title": "iPhone SE Landscape",
362 | "value": "iPhone SE landscape"
363 | },
364 | {
365 | "title": "iPhone X",
366 | "value": "iPhone X"
367 | },
368 | {
369 | "title": "iPhone X Landscape",
370 | "value": "iPhone X landscape"
371 | },
372 | {
373 | "title": "iPhone XR",
374 | "value": "iPhone XR"
375 | },
376 | {
377 | "title": "iPhone XR Landscape",
378 | "value": "iPhone XR landscape"
379 | },
380 | {
381 | "title": "iPhone 11",
382 | "value": "iPhone 11"
383 | },
384 | {
385 | "title": "iPhone 11 Landscape",
386 | "value": "iPhone 11 landscape"
387 | },
388 | {
389 | "title": "iPhone 11 Pro",
390 | "value": "iPhone 11 Pro"
391 | },
392 | {
393 | "title": "iPhone 11 Pro Landscape",
394 | "value": "iPhone 11 Pro landscape"
395 | },
396 | {
397 | "title": "iPhone 11 Pro Max",
398 | "value": "iPhone 11 Pro Max"
399 | },
400 | {
401 | "title": "iPhone 11 Pro Max Landscape",
402 | "value": "iPhone 11 Pro Max landscape"
403 | },
404 | {
405 | "title": "iPhone 12",
406 | "value": "iPhone 12"
407 | },
408 | {
409 | "title": "iPhone 12 Landscape",
410 | "value": "iPhone 12 landscape"
411 | },
412 | {
413 | "title": "iPhone 12 Pro",
414 | "value": "iPhone 12 Pro"
415 | },
416 | {
417 | "title": "iPhone 12 Pro Landscape",
418 | "value": "iPhone 12 Pro landscape"
419 | },
420 | {
421 | "title": "iPhone 12 Pro Max",
422 | "value": "iPhone 12 Pro Max"
423 | },
424 | {
425 | "title": "iPhone 12 Pro Max Landscape",
426 | "value": "iPhone 12 Pro Max landscape"
427 | },
428 | {
429 | "title": "iPhone 12 Mini",
430 | "value": "iPhone 12 Mini"
431 | },
432 | {
433 | "title": "iPhone 12 Mini Landscape",
434 | "value": "iPhone 12 Mini landscape"
435 | },
436 | {
437 | "title": "iPhone 13",
438 | "value": "iPhone 13"
439 | },
440 | {
441 | "title": "iPhone 13 Landscape",
442 | "value": "iPhone 13 landscape"
443 | },
444 | {
445 | "title": "iPhone 13 Pro",
446 | "value": "iPhone 13 Pro"
447 | },
448 | {
449 | "title": "iPhone 13 Pro Landscape",
450 | "value": "iPhone 13 Pro landscape"
451 | },
452 | {
453 | "title": "iPhone 13 Pro Max",
454 | "value": "iPhone 13 Pro Max"
455 | },
456 | {
457 | "title": "iPhone 13 Pro Max Landscape",
458 | "value": "iPhone 13 Pro Max landscape"
459 | },
460 | {
461 | "title": "iPhone 13 Mini",
462 | "value": "iPhone 13 Mini"
463 | },
464 | {
465 | "title": "iPhone 13 Mini Landscape",
466 | "value": "iPhone 13 Mini landscape"
467 | },
468 | {
469 | "title": "Jio Phone 2",
470 | "value": "JioPhone 2"
471 | },
472 | {
473 | "title": "Jio Phone 2 Landscape",
474 | "value": "JioPhone 2 landscape"
475 | },
476 | {
477 | "title": "Kindle Fire HDX",
478 | "value": "Kindle Fire HDX"
479 | },
480 | {
481 | "title": "Kindle Fire HDX Landscape",
482 | "value": "Kindle Fire HDX landscape"
483 | },
484 | {
485 | "title": "LG Optimus L70",
486 | "value": "LG Optimus L70"
487 | },
488 | {
489 | "title": "LG Optimus L70 Landscape",
490 | "value": "LG Optimus L70 landscape"
491 | },
492 | {
493 | "title": "Microsoft Lumia 550",
494 | "value": "Microsoft Lumia 550"
495 | },
496 | {
497 | "title": "Microsoft Lumia 950",
498 | "value": "Microsoft Lumia 950"
499 | },
500 | {
501 | "title": "Microsoft Lumia 950 Landscape",
502 | "value": "Microsoft Lumia 950 landscape"
503 | },
504 | {
505 | "title": "Nexus 10",
506 | "value": "Nexus 10"
507 | },
508 | {
509 | "title": "Nexus 10 Landscape",
510 | "value": "Nexus 10 landscape"
511 | },
512 | {
513 | "title": "Nexus 4",
514 | "value": "Nexus 4"
515 | },
516 | {
517 | "title": "Nexus 4 Landscape",
518 | "value": "Nexus 4 landscape"
519 | },
520 | {
521 | "title": "Nexus 5",
522 | "value": "Nexus 5"
523 | },
524 | {
525 | "title": "Nexus 5 Landscape",
526 | "value": "Nexus 5 landscape"
527 | },
528 | {
529 | "title": "Nexus 5X",
530 | "value": "Nexus 5X"
531 | },
532 | {
533 | "title": "Nexus 5X Landscape",
534 | "value": "Nexus 5X landscape"
535 | },
536 | {
537 | "title": "Nexus 6",
538 | "value": "Nexus 6"
539 | },
540 | {
541 | "title": "Nexus 6 Landscape",
542 | "value": "Nexus 6 landscape"
543 | },
544 | {
545 | "title": "Nexus 6P",
546 | "value": "Nexus 6P"
547 | },
548 | {
549 | "title": "Nexus 6P Landscape",
550 | "value": "Nexus 6P landscape"
551 | },
552 | {
553 | "title": "Nexus 7",
554 | "value": "Nexus 7"
555 | },
556 | {
557 | "title": "Nexus 7 Landscape",
558 | "value": "Nexus 7 landscape"
559 | },
560 | {
561 | "title": "Nokia Lumia 520",
562 | "value": "Nokia Lumia 520"
563 | },
564 | {
565 | "title": "Nokia Lumia 520 Landscape",
566 | "value": "Nokia Lumia 520 landscape"
567 | },
568 | {
569 | "title": "Nokia N9",
570 | "value": "Nokia N9"
571 | },
572 | {
573 | "title": "Nokia N9 Landscape",
574 | "value": "Nokia N9 landscape"
575 | },
576 | {
577 | "title": "Pixel 2",
578 | "value": "Pixel 2"
579 | },
580 | {
581 | "title": "Pixel 2 Landscape",
582 | "value": "Pixel 2 landscape"
583 | },
584 | {
585 | "title": "Pixel 2 XL",
586 | "value": "Pixel 2 XL"
587 | },
588 | {
589 | "title": "Pixel 2 XL Landscape",
590 | "value": "Pixel 2 XL landscape"
591 | },
592 | {
593 | "title": "Pixel 3",
594 | "value": "Pixel 3"
595 | },
596 | {
597 | "title": "Pixel 3 Landscape",
598 | "value": "Pixel 3 landscape"
599 | },
600 | {
601 | "title": "Pixel 4",
602 | "value": "Pixel 4"
603 | },
604 | {
605 | "title": "Pixel 4 Landscape",
606 | "value": "Pixel 4 landscape"
607 | },
608 | {
609 | "title": "Pixel 4A 5G",
610 | "value": "Pixel 4a (5G)"
611 | },
612 | {
613 | "title": "Pixel 4A 5G Landscape",
614 | "value": "Pixel 4a (5G) landscape"
615 | },
616 | {
617 | "title": "Pixel 5",
618 | "value": "Pixel 5"
619 | },
620 | {
621 | "title": "Pixel 5 Landscape",
622 | "value": "Pixel 5 landscape"
623 | },
624 | {
625 | "title": "Moto G4",
626 | "value": "Moto G4"
627 | },
628 | {
629 | "title": "Moto G4 Landscape",
630 | "value": "Moto G4 landscape"
631 | }
632 | ]
633 | },
634 | "select_links": {
635 | "type": "string",
636 | "required": false,
637 | "title": "Select Links",
638 | "description": "One or more selectors for extracting links, in the format [css selector]->[property to use],[css selector]->@[attribute to use]"
639 | },
640 | "click_selector": {
641 | "type": "string",
642 | "required": false,
643 | "title": "Click Selector",
644 | "description": "Selector for elements to click when using the autoclick behavior. Default is 'a'"
645 | },
646 | "block_rules": {
647 | "type": "string",
648 | "required": false,
649 | "title": "Block Rules",
650 | "description": "Additional rules for blocking certain URLs from being loaded, by URL regex and optionally via text match in an iframe"
651 | },
652 | "block_message": {
653 | "type": "string",
654 | "required": false,
655 | "title": "Block Message",
656 | "description": "If specified, when a URL is blocked, a record with this error message is added instead"
657 | },
658 | "block_ads": {
659 | "type": "boolean",
660 | "required": false,
661 | "title": "Block Ads",
662 | "description": "If set, block advertisements from being loaded (based on Stephen Black's blocklist). Note that some bad domains are also blocked by zimit configuration even if this option is not set."
663 | },
664 | "ad_block_message": {
665 | "type": "string",
666 | "required": false,
667 | "title": "Ads Block Message",
668 | "description": "If specified, when an ad is blocked, a record with this error message is added instead"
669 | },
670 | "user_agent": {
671 | "type": "string",
672 | "required": false,
673 | "title": "User Agent",
674 | "description": "Override user-agent with specified"
675 | },
676 | "user_agent_suffix": {
677 | "type": "string",
678 | "required": false,
679 | "title": "User Agent Suffix",
680 | "description": "Append suffix to existing browser user-agent. Defaults to +Zimit"
681 | },
682 | "use_sitemap": {
683 | "type": "string",
684 | "required": false,
685 | "title": "Sitemap URL",
686 | "description": "Use as sitemap to get additional URLs for the crawl (usually at /sitemap.xml)"
687 | },
688 | "sitemap_from_date": {
689 | "type": "string",
690 | "required": false,
691 | "title": "Sitemap From Date",
692 | "description": "If set, filter URLs from sitemaps to those greater than or equal to (>=) provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)"
693 | },
694 | "sitemap_to_date": {
695 | "type": "string",
696 | "required": false,
697 | "title": "Sitemap To Date",
698 | "description": "If set, filter URLs from sitemaps to those less than or equal to (<=) provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)"
699 | },
700 | "behavior_timeout": {
701 | "type": "integer",
702 | "required": false,
703 | "title": "Behavior Timeout",
704 | "description": "If >0, timeout (in seconds) for in-page behavior will run on each page. If 0, a behavior can run until finish. Default is 90.",
705 | "min": 0
706 | },
707 | "post_load_delay": {
708 | "type": "integer",
709 | "required": false,
710 | "title": "Post Load Delay",
711 | "description": "If >0, amount of time to sleep (in seconds) after page has loaded, before taking screenshots / getting text / running behaviors. Default is 0.",
712 | "min": 0
713 | },
714 | "page_extra_delay": {
715 | "type": "integer",
716 | "required": false,
717 | "title": "Page Extra Delay",
718 | "description": "If >0, amount of time to sleep (in seconds) after behaviors before moving on to next page. Default is 0.",
719 | "min": 0
720 | },
721 | "dedup_policy": {
722 | "type": "string-enum",
723 | "required": false,
724 | "title": "Dedup Policy",
725 | "description": "Deduplication policy. One of skip, revisit or keep. Default is skip",
726 | "choices": [
727 | {
728 | "title": "Skip",
729 | "value": "skip"
730 | },
731 | {
732 | "title": "Revisit",
733 | "value": "revisit"
734 | },
735 | {
736 | "title": "Keep",
737 | "value": "keep"
738 | }
739 | ]
740 | },
741 | "screenshot": {
742 | "type": "string",
743 | "required": false,
744 | "title": "Screenshot",
745 | "description": "Screenshot options for crawler. One of view, thumbnail, fullPage, fullPageFinal or a comma-separated combination of those."
746 | },
747 | "size_soft_limit": {
748 | "type": "integer",
749 | "required": false,
750 | "title": "Size Soft Limit",
751 | "description": "If set, save crawl state and stop crawl if WARC size exceeds this value. ZIM will still be created.",
752 | "min": 0
753 | },
754 | "size_hard_limit": {
755 | "type": "integer",
756 | "required": false,
757 | "title": "Size Hard Limit",
758 | "description": "If set, exit crawler and fail the scraper immediately if WARC size exceeds this value",
759 | "min": 0
760 | },
761 | "disk_utilization": {
762 | "type": "integer",
763 | "required": false,
764 | "title": "Disk Utilization",
765 | "description": "Save state and exit if disk utilization exceeds this percentage value. Default (if not set) is 90%. Set to 0 to disable disk utilization check.",
766 | "min": 0
767 | },
768 | "time_soft_limit": {
769 | "type": "integer",
770 | "required": false,
771 | "title": "Time Soft Limit",
772 | "description": "If set, save crawl state and stop crawl if WARC(s) creation takes longer than this value, in seconds. ZIM will still be created.",
773 | "min": 0
774 | },
775 | "time_hard_limit": {
776 | "type": "integer",
777 | "required": false,
778 | "title": "Time Hard Limit",
779 | "description": "If set, exit crawler and fail the scraper immediately if WARC(s) creation takes longer than this value, in seconds",
780 | "min": 0
781 | },
782 | "net_idle_wait": {
783 | "type": "integer",
784 | "required": false,
785 | "title": "Net Idle Wait",
786 | "description": "If set, wait for network idle after page load and after behaviors are done (in seconds). If -1 (default), determine based on scope."
787 | },
788 | "origin_override": {
789 | "type": "string",
790 | "required": false,
791 | "title": "Origin Override",
792 | "description": "If set, will redirect requests from each origin in key to origin in the value, eg. https://host:port=http://alt-host:alt-port."
793 | },
794 | "max_page_retries": {
795 | "type": "integer",
796 | "required": false,
797 | "title": "Max Page Retries",
798 | "description": "If set, number of times to retry a page that failed to load before page is considered to have failed. Default is 2.",
799 | "min": 0
800 | },
801 | "fail_on_failed_seed": {
802 | "type": "boolean",
803 | "required": false,
804 | "title": "Fail on failed seed",
805 | "description": "Whether to display additional logs"
806 | },
807 | "fail_on_invalid_status": {
808 | "type": "boolean",
809 | "required": false,
810 | "title": "Fail on invalid status",
811 | "description": "If set, will treat pages with 4xx or 5xx response as failures. When combined with --failOnFailedLimit or --failOnFailedSeed may result in crawl failing due to non-200 responses"
812 | },
813 | "fail_on_failed_limit": {
814 | "type": "integer",
815 | "required": false,
816 | "title": "Fail on failed - Limit",
817 | "description": "If set, save state and exit if number of failed pages exceeds this value.",
818 | "min": 0
819 | },
820 | "warcs": {
821 | "type": "string",
822 | "required": false,
823 | "title": "WARC files",
824 | "description": "Comma-separated list of WARC files to use as input."
825 | },
826 | "verbose": {
827 | "type": "boolean",
828 | "required": false,
829 | "title": "Verbose mode",
830 | "description": "Whether to display additional logs"
831 | },
832 | "keep": {
833 | "type": "boolean",
834 | "required": false,
835 | "title": "Keep",
836 | "description": "Should be True. Developer option: must be True if we want to keep the WARC files for artifacts archiving.",
837 | "default": true
838 | },
839 | "output": {
840 | "type": "string",
841 | "required": false,
842 | "title": "Output folder",
843 | "description": "Output folder for ZIM file(s). Leave it as `/output`",
844 | "pattern": "^/output$"
845 | },
846 | "admin_email": {
847 | "type": "email",
848 | "required": false,
849 | "title": "Admin Email",
850 | "description": "Admin Email for crawler: used in UserAgent so website admin can contact us",
851 | "default": "contact+zimfarm@kiwix.org"
852 | },
853 | "profile": {
854 | "type": "string",
855 | "required": false,
856 | "title": "Browser profile",
857 | "description": "Path or HTTP(S) URL to tar.gz file which contains the browser profile directory for Browsertrix crawler."
858 | },
859 | "behaviors": {
860 | "type": "string",
861 | "required": false,
862 | "title": "Behaviors",
863 | "description": "Which background behaviors to enable on each page. Defaults to autoplay,autofetch,siteSpecific."
864 | },
865 | "depth": {
866 | "type": "integer",
867 | "required": false,
868 | "title": "Depth",
869 | "description": "The depth of the crawl for all seeds. Default is -1 (infinite).",
870 | "min": -1
871 | },
872 | "zim_lang": {
873 | "type": "string",
874 | "required": false,
875 | "title": "ZIM Language",
876 | "description": "Language metadata of ZIM (warc2zim --lang param). ISO-639-3 code. Retrieved from homepage if found, fallback to `eng`",
877 | "alias": "zim-lang",
878 | "customValidator": "language_code"
879 | },
880 | "long_description": {
881 | "type": "string",
882 | "required": false,
883 | "title": "Long description",
884 | "description": "Optional long description for your ZIM",
885 | "minLength": 1,
886 | "maxLength": 4000,
887 | "alias": "long-description"
888 | },
889 | "custom_css": {
890 | "type": "url",
891 | "required": false,
892 | "title": "Custom CSS",
893 | "description": "URL to a CSS file to inject into pages",
894 | "alias": "custom-css"
895 | },
896 | "charsets_to_try": {
897 | "type": "string",
898 | "required": false,
899 | "title": "Charsets to try",
900 | "description": "List of charsets to try decode content when charset is not found",
901 | "alias": "charsets-to-try"
902 | },
903 | "ignore_content_header_charsets": {
904 | "type": "boolean",
905 | "required": false,
906 | "title": "Ignore Content Header Charsets",
907 | "description": "Ignore the charsets specified in content headers - first bytes - typically because they are wrong.",
908 | "alias": "ignore-content-header-charsets"
909 | },
910 | "content_header_bytes_length": {
911 | "type": "integer",
912 | "required": false,
913 | "title": "Content Header Bytes Length",
914 | "description": "How many bytes to consider when searching for content charsets in header (default is 1024).",
915 | "alias": "content-header-bytes-length",
916 | "min": 0
917 | },
918 | "ignore_http_header_charsets": {
919 | "type": "boolean",
920 | "required": false,
921 | "title": "Ignore HTTP Header Charsets",
922 | "description": "Ignore the charsets specified in HTTP `Content-Type` headers, typically because they are wrong.",
923 | "alias": "ignore-http-header-charsets"
924 | },
925 | "encoding_aliases": {
926 | "type": "string",
927 | "required": false,
928 | "title": "Encoding Aliases",
929 | "description": "List of encoding/charset aliases to decode WARC content. Aliases are used when the encoding specified in upstream server exists in Python under a different name. This parameter is single string, multiple values are separated by a comma, like in alias1=encoding1,alias2=encoding2.",
930 | "alias": "encoding-aliases"
931 | },
932 | "custom_behaviors": {
933 | "type": "string",
934 | "required": false,
935 | "title": "Custom Behaviors",
936 | "description": "JS code for custom behaviors to customize crawler. Single string with individual JS files URL/path separated by a comma.",
937 | "alias": "custom-behaviours"
938 | },
939 | "zimit_progress_file": {
940 | "type": "string",
941 | "required": false,
942 | "title": "Zimit Progress File",
943 | "description": "Scraping progress file. Leave it as `/output/task_progress.json`",
944 | "alias": "zimit-progress-file",
945 | "pattern": "^/output/task_progress\\.json$"
946 | },
947 | "replay_viewer_source": {
948 | "type": "url",
949 | "required": false,
950 | "title": "Replay Viewer Source",
951 | "description": "URL from which to load the ReplayWeb.page replay viewer from",
952 | "alias": "replay-viewer-source"
953 | },
954 | "zim_file": {
955 | "type": "string",
956 | "required": false,
957 | "title": "ZIM filename",
958 | "description": "ZIM file name (based on --name if not provided). Include {period} to insert date period dynamically",
959 | "alias": "zim-file",
960 | "pattern": "^([a-z0-9\\-\\.]+_)([a-z\\-]+_)([a-z0-9\\-\\.]+_)([a-z0-9\\-\\.]+_|)([\\d]{4}-[\\d]{2}|\\{period\\}).zim$",
961 | "relaxedPattern": "^[A-Za-z0-9._-]+$"
962 | },
963 | "name": {
964 | "type": "string",
965 | "required": true,
966 | "title": "ZIM name",
967 | "description": "Name of the ZIM.",
968 | "alias": "name",
969 | "pattern": "^([a-z0-9\\-\\.]+_)([a-z\\-]+_)([a-z0-9\\-\\.]+)$",
970 | "relaxedPattern": "^[A-Za-z0-9._-]+$"
971 | }
972 | }
973 | }
974 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["hatchling", "hatch-openzim"]
3 | build-backend = "hatchling.build"
4 |
5 | [project]
6 | name = "zimit"
7 | requires-python = ">=3.13,<3.14"
8 | description = "Make ZIM file from any website through crawling"
9 | readme = "README.md"
10 | dependencies = [
11 | "requests==2.32.3",
12 | "inotify==0.2.10",
13 | "tld==0.13",
14 | "warc2zim @ git+https://github.com/openzim/warc2zim@main",
15 | ]
16 | dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]
17 |
18 | [tool.hatch.metadata.hooks.openzim-metadata]
19 | kind = "scraper"
20 |
21 | [tool.hatch.metadata]
22 | allow-direct-references = true # to be removed once we use a released warc2zim version
23 |
24 | [project.optional-dependencies]
25 | scripts = [
26 | "invoke==2.2.0",
27 | ]
28 | lint = [
29 | "black==25.1.0",
30 | "ruff==0.9.4",
31 | ]
32 | check = [
33 | "pyright==1.1.393",
34 | ]
35 | test = [
36 | "pytest==8.3.4",
37 | "coverage==7.6.10",
38 | ]
39 | dev = [
40 | "pre-commit==4.1.0",
41 | "debugpy==1.8.12",
42 | "selenium==4.28.1", # used in daily tests, convenient for dev purpose (autocompletion)
43 | "zimit[scripts]",
44 | "zimit[lint]",
45 | "zimit[test]",
46 | "zimit[check]",
47 | ]
48 |
49 | [project.scripts]
50 | zimit = "zimit:zimit.zimit"
51 |
52 | [tool.hatch.version]
53 | path = "src/zimit/__about__.py"
54 |
55 | [tool.hatch.build]
56 | exclude = [
57 | "/.github",
58 | ]
59 |
60 | [tool.hatch.build.targets.wheel]
61 | packages = ["src/zimit"]
62 |
63 | [tool.hatch.envs.default]
64 | features = ["dev"]
65 |
66 | [tool.hatch.envs.test]
67 | features = ["scripts", "test"]
68 |
69 | [tool.hatch.envs.test.scripts]
70 | run = "inv test --args '{args}'"
71 | run-cov = "inv test-cov --args '{args}'"
72 | report-cov = "inv report-cov"
73 | coverage = "inv coverage --args '{args}'"
74 | html = "inv coverage --html --args '{args}'"
75 |
76 | [tool.hatch.envs.lint]
77 | template = "lint"
78 | skip-install = false
79 | features = ["scripts", "lint"]
80 |
81 | [tool.hatch.envs.lint.scripts]
82 | black = "inv lint-black --args '{args}'"
83 | ruff = "inv lint-ruff --args '{args}'"
84 | all = "inv lintall --args '{args}'"
85 | fix-black = "inv fix-black --args '{args}'"
86 | fix-ruff = "inv fix-ruff --args '{args}'"
87 | fixall = "inv fixall --args '{args}'"
88 |
89 | [tool.hatch.envs.check]
90 | features = ["scripts", "check"]
91 |
92 | [tool.hatch.envs.check.scripts]
93 | pyright = "inv check-pyright --args '{args}'"
94 | all = "inv checkall --args '{args}'"
95 |
96 | [tool.black]
97 | line-length = 88
98 | target-version = ['py313']
99 |
100 | [tool.ruff]
101 | target-version = "py313"
102 | line-length = 88
103 | src = ["src"]
104 |
105 | [tool.ruff.lint]
106 | select = [
107 | "A", # flake8-builtins
108 | # "ANN", # flake8-annotations
109 | "ARG", # flake8-unused-arguments
110 | # "ASYNC", # flake8-async
111 | "B", # flake8-bugbear
112 | # "BLE", # flake8-blind-except
113 | "C4", # flake8-comprehensions
114 | "C90", # mccabe
115 | # "COM", # flake8-commas
116 | # "D", # pydocstyle
117 | # "DJ", # flake8-django
118 | "DTZ", # flake8-datetimez
119 | "E", # pycodestyle (default)
120 | "EM", # flake8-errmsg
121 | # "ERA", # eradicate
122 | # "EXE", # flake8-executable
123 | "F", # Pyflakes (default)
124 | # "FA", # flake8-future-annotations
125 | "FBT", # flake8-boolean-trap
126 | # "FLY", # flynt
127 | # "G", # flake8-logging-format
128 | "I", # isort
129 | "ICN", # flake8-import-conventions
130 | # "INP", # flake8-no-pep420
131 | # "INT", # flake8-gettext
132 | "ISC", # flake8-implicit-str-concat
133 | "N", # pep8-naming
134 | # "NPY", # NumPy-specific rules
135 | # "PD", # pandas-vet
136 | # "PGH", # pygrep-hooks
137 | # "PIE", # flake8-pie
138 | # "PL", # Pylint
139 | "PLC", # Pylint: Convention
140 | "PLE", # Pylint: Error
141 | "PLR", # Pylint: Refactor
142 | "PLW", # Pylint: Warning
143 | # "PT", # flake8-pytest-style
144 | # "PTH", # flake8-use-pathlib
145 | # "PYI", # flake8-pyi
146 | "Q", # flake8-quotes
147 | # "RET", # flake8-return
148 | # "RSE", # flake8-raise
149 | "RUF", # Ruff-specific rules
150 | "S", # flake8-bandit
151 | # "SIM", # flake8-simplify
152 | # "SLF", # flake8-self
153 | "T10", # flake8-debugger
154 | "T20", # flake8-print
155 | # "TCH", # flake8-type-checking
156 | # "TD", # flake8-todos
157 | "TID", # flake8-tidy-imports
158 | # "TRY", # tryceratops
159 | "UP", # pyupgrade
160 | "W", # pycodestyle
161 | "YTT", # flake8-2020
162 | ]
163 | ignore = [
164 | # Allow non-abstract empty methods in abstract base classes
165 | "B027",
166 | # Remove flake8-errmsg since we consider they bloat the code and provide limited value
167 | "EM",
168 | # Allow boolean positional values in function calls, like `dict.get(... True)`
169 | "FBT003",
170 | # Ignore checks for possible passwords
171 | "S105", "S106", "S107",
172 | # Ignore warnings on subprocess.run / popen
173 | "S603",
174 | # Ignore complexity
175 | "C901", "PLR0911", "PLR0912", "PLR0913", "PLR0915",
176 | ]
177 | unfixable = [
178 | # Don't touch unused imports
179 | "F401",
180 | ]
181 |
182 | [tool.ruff.lint.isort]
183 | known-first-party = ["zimit"]
184 |
185 | [tool.ruff.lint.flake8-bugbear]
186 | # add exceptions to B008 for fastapi.
187 | extend-immutable-calls = ["fastapi.Depends", "fastapi.Query"]
188 |
189 | [tool.ruff.lint.flake8-tidy-imports]
190 | ban-relative-imports = "all"
191 |
192 | [tool.ruff.lint.per-file-ignores]
193 | # Tests can use magic values, assertions, and relative imports
194 | "tests**/**/*" = ["PLR2004", "S101", "TID252"]
195 |
196 | [tool.pytest.ini_options]
197 | minversion = "7.3"
198 | testpaths = ["tests"]
199 | pythonpath = [".", "src"]
200 |
201 | [tool.coverage.paths]
202 | zimit = ["src/zimit"]
203 | tests = ["tests"]
204 |
205 | [tool.coverage.run]
206 | source_pkgs = ["zimit"]
207 | branch = true
208 | parallel = true
209 | omit = [
210 | "src/zimit/__about__.py",
211 | ]
212 |
213 | [tool.coverage.report]
214 | exclude_lines = [
215 | "no cov",
216 | "if __name__ == .__main__.:",
217 | "if TYPE_CHECKING:",
218 | ]
219 |
220 | [tool.pyright]
221 | include = ["src", "tests", "tasks.py"]
222 | exclude = [".env/**", ".venv/**"]
223 | extraPaths = ["src"]
224 | pythonVersion = "3.13"
225 | typeCheckingMode="basic"
226 |
--------------------------------------------------------------------------------
/src/zimit/__about__.py:
--------------------------------------------------------------------------------
1 | __version__ = "3.0.6-dev0"
2 |
--------------------------------------------------------------------------------
/src/zimit/constants.py:
--------------------------------------------------------------------------------
import logging

from zimscraperlib.logging import getLogger

# Exit code zimit returns when warc2zim's check of the arguments fails
# (presumably a pre-crawl validation step — confirm in zimit.py)
EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2
# Exit codes for a crawler that stopped because it hit a configured limit;
# NOTE(review): values look like they mirror browsertrix crawler's own exit
# codes — confirm against the crawler version in use
EXIT_CODE_CRAWLER_SIZE_LIMIT_HIT = 14
EXIT_CODE_CRAWLER_TIME_LIMIT_HIT = 15
# Exit code treated as a "normal" warc2zim termination — TODO confirm semantics
NORMAL_WARC2ZIM_EXIT_CODE = 100
# Timeout (in seconds) applied to outgoing HTTP requests (see utils.download_file)
REQUESTS_TIMEOUT = 10

# Scraper-wide shared logger
logger = getLogger(name="zimit", level=logging.INFO)
12 |
--------------------------------------------------------------------------------
/src/zimit/utils.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | import requests
4 |
5 | from zimit.constants import REQUESTS_TIMEOUT
6 |
7 |
def download_file(url: str, fpath: Path):
    """Download file from url to fpath with streaming"""
    with requests.get(url, timeout=REQUESTS_TIMEOUT, stream=True) as response:
        # surface HTTP errors (4xx/5xx) to the caller instead of writing
        # an error page to disk
        response.raise_for_status()
        with fpath.open("wb") as dest:
            for block in response.iter_content(chunk_size=8192):
                dest.write(block)
15 |
--------------------------------------------------------------------------------
/src/zimit/zimit.py:
--------------------------------------------------------------------------------
1 | """
2 | Main zimit run script
3 | This script validates arguments with warc2zim, checks permissions
4 | and then calls the Node based driver
5 | """
6 |
7 | import atexit
8 | import json
9 | import re
10 | import shutil
11 | import signal
12 | import subprocess
13 | import sys
14 | import tarfile
15 | import tempfile
16 | import urllib.parse
17 | from argparse import ArgumentParser
18 | from multiprocessing import Process
19 | from pathlib import Path
20 |
21 | import inotify
22 | import inotify.adapters
23 | from warc2zim.main import main as warc2zim
24 | from zimscraperlib.uri import rebuild_uri
25 |
26 | from zimit.__about__ import __version__
27 | from zimit.constants import (
28 | EXIT_CODE_CRAWLER_SIZE_LIMIT_HIT,
29 | EXIT_CODE_CRAWLER_TIME_LIMIT_HIT,
30 | EXIT_CODE_WARC2ZIM_CHECK_FAILED,
31 | NORMAL_WARC2ZIM_EXIT_CODE,
32 | logger,
33 | )
34 | from zimit.utils import download_file
35 |
36 | temp_root_dir: Path | None = None
37 |
38 |
class ProgressFileWatcher:
    """Merge crawler and warc2zim progress stats into one zimit progress file.

    A daemon subprocess inotify-watches the two stats files; on every
    modification it rewrites `zimit_stats_path` with a combined
    {"done": ..., "total": ...} JSON payload.
    """

    def __init__(
        self,
        crawl_stats_path: Path,
        warc2zim_stats_path: Path,
        zimit_stats_path: Path,
    ):
        # stats file written by the crawler
        self.crawl_stats_path = crawl_stats_path
        # stats file written by warc2zim
        self.warc2zim_stats_path = warc2zim_stats_path
        # combined output file written by the watcher subprocess
        self.zimit_stats_path = zimit_stats_path

        # touch them all so inotify is not unhappy on add_watch
        self.crawl_stats_path.touch()
        self.warc2zim_stats_path.touch()
        # watcher subprocess; created by watch()
        self.process = None

    def stop(self):
        """Terminate the watcher subprocess, if it was started."""
        if not self.process:
            return
        # give the watcher a brief chance to exit on its own, then kill it
        self.process.join(0.1)
        self.process.terminate()

    def watch(self):
        """Start the inotify watcher in a daemon subprocess."""
        self.process = Process(
            target=self.inotify_watcher,
            args=(
                str(self.crawl_stats_path),
                str(self.warc2zim_stats_path),
                str(self.zimit_stats_path),
            ),
        )
        # daemon: do not keep zimit alive because of the watcher on exit
        self.process.daemon = True
        self.process.start()

    def inotify_watcher(self, crawl_fpath: str, warc2zim_fpath: str, zimit_fpath: str):
        """Blocking loop: translate stats-file updates into zimit progress JSON.

        Runs in the subprocess started by watch(); paths are passed as str
        because they cross the process boundary.
        """
        ino = inotify.adapters.Inotify()
        ino.add_watch(crawl_fpath, inotify.constants.IN_MODIFY)  # pyright: ignore
        ino.add_watch(warc2zim_fpath, inotify.constants.IN_MODIFY)  # pyright: ignore

        def crawl_conv(data):
            # we consider crawl to be 90% of the workload so total = craw_total * 90%
            return {
                "done": data["crawled"],
                "total": int(data["total"] / 0.9),
            }

        def warc2zim_conv(data):
            # we consider warc2zim to be 10% of the workload so
            # warc2zim_total = 10% and total = 90 + warc2zim_total * 10%
            return {
                "done": int(
                    data["total"]
                    * (0.9 + (float(data["written"]) / data["total"]) / 10)
                ),
                "total": data["total"],
            }

        # event_gen yields one event per modification of either watched file
        for _, _, fpath, _ in ino.event_gen(yield_nones=False):  # pyright: ignore
            # pick the converter matching the file that changed
            func = {crawl_fpath: crawl_conv, warc2zim_fpath: warc2zim_conv}.get(fpath)
            if not func:
                continue
            # open input and output separately as to not clear output on error
            with open(fpath) as ifh:
                try:
                    out = func(json.load(ifh))
                except Exception:  # nosec # noqa: S112
                    # simply ignore progress update should an error arise
                    # might be malformed input for instance
                    continue
                if not out:
                    continue
                with open(zimit_fpath, "w") as ofh:
                    json.dump(out, ofh)
109 |
110 |
def cleanup():
    """Remove the temporary working directory, if one was set.

    Registered as an atexit handler (cf. cancel_cleanup's atexit.unregister),
    so it must not raise: guard against the directory being unset or already
    removed.
    """
    if not temp_root_dir:
        logger.warning("Temporary root dir not already set, cannot clean this up")
        return
    logger.info("")
    logger.info("----------")
    logger.info(f"Cleanup, removing temp dir: {temp_root_dir}")
    # the directory may already be gone (e.g. cleanup triggered twice, once by
    # a signal handler and once by atexit); ignore_errors keeps the exit hook
    # from raising in that case
    shutil.rmtree(temp_root_dir, ignore_errors=True)
119 |
120 |
def cancel_cleanup():
    """Disable the atexit cleanup so temp files survive for inspection."""
    message = (
        f"Temporary files have been kept in {temp_root_dir}, please clean them"
        " up manually once you don't need them anymore"
    )
    logger.info(message)
    atexit.unregister(cleanup)
127 |
128 |
129 | def run(raw_args):
130 | parser = ArgumentParser(
131 | description="Run a browser-based crawl on the specified URL and convert to ZIM"
132 | )
133 |
134 | parser.add_argument(
135 | "--seeds",
136 | help="The seed URL(s) to start crawling from. Multile seed URL must be "
137 | "separated by a comma (usually not needed, these are just the crawl seeds). "
138 | "First seed URL is used as ZIM homepage",
139 | )
140 |
141 | parser.add_argument("--title", help="WARC and ZIM title")
142 | parser.add_argument("--description", help="WARC and ZIM description")
143 | parser.add_argument("--long-description", help="ZIM long description metadata")
144 |
145 | parser.add_argument(
146 | "--seedFile",
147 | help="If set, read a list of seed urls, one per line. Can be a local file or "
148 | "the HTTP(s) URL to an online file.",
149 | )
150 |
151 | parser.add_argument(
152 | "-w", "--workers", type=int, help="Number of parallel workers. Default is 1."
153 | )
154 |
155 | parser.add_argument(
156 | "--crawlId",
157 | help="A user provided ID for this crawl or crawl configuration (can also be "
158 | "set via CRAWL_ID env var, defaults to machine hostname)",
159 | )
160 |
161 | parser.add_argument(
162 | "--waitUntil",
163 | help="Puppeteer page.goto() condition to wait for before continuing. One of "
164 | "load, domcontentloaded, networkidle0 or networkidle2, or a "
165 | "comma-separated combination of those. Default is load,networkidle2",
166 | )
167 |
168 | parser.add_argument(
169 | "--depth",
170 | help="The depth of the crawl for all seeds. Default is -1 (infinite).",
171 | type=int,
172 | )
173 |
174 | parser.add_argument(
175 | "--extraHops",
176 | help="Number of extra 'hops' to follow, beyond the current scope. "
177 | "Default is 0.",
178 | type=int,
179 | )
180 |
181 | parser.add_argument(
182 | "--pageLimit",
183 | help="Limit crawl to this number of pages. Default is 0 (no limit).",
184 | type=int,
185 | )
186 |
187 | parser.add_argument(
188 | "--maxPageLimit",
189 | help="Maximum pages to crawl, overriding pageLimit if both are set. Default is "
190 | "0 (no limit)",
191 | type=int,
192 | )
193 |
194 | parser.add_argument(
195 | "--pageLoadTimeout",
196 | help="Timeout for each page to load (in seconds). Default is 90 secs.",
197 | type=int,
198 | )
199 |
200 | parser.add_argument(
201 | "--scopeType",
202 | help="A predfined scope of the crawl. For more customization, "
203 | "use 'custom' and set scopeIncludeRx/scopeExcludeRx regexes. Default is custom"
204 | "if scopeIncludeRx is set, prefix otherwise.",
205 | choices=["page", "page-spa", "prefix", "host", "domain", "any", "custom"],
206 | )
207 |
208 | parser.add_argument(
209 | "--scopeIncludeRx",
210 | help="Regex of page URLs that should be included in the crawl (defaults to "
211 | "the immediate directory of URL)",
212 | )
213 |
214 | parser.add_argument(
215 | "--scopeExcludeRx",
216 | help="Regex of page URLs that should be excluded from the crawl",
217 | )
218 |
219 | parser.add_argument(
220 | "--allowHashUrls",
221 | help="Allow Hashtag URLs, useful for single-page-application crawling or "
222 | "when different hashtags load dynamic content",
223 | action="store_true",
224 | )
225 |
226 | parser.add_argument(
227 | "--selectLinks",
228 | help="One or more selectors for extracting links, in the format "
229 | "[css selector]->[property to use],[css selector]->@[attribute to use]",
230 | )
231 |
232 | parser.add_argument(
233 | "--clickSelector",
234 | help="Selector for elements to click when using the autoclick behavior. Default"
235 | " is 'a'",
236 | )
237 |
238 | parser.add_argument(
239 | "--blockRules",
240 | help="Additional rules for blocking certain URLs from being loaded, by URL "
241 | "regex and optionally via text match in an iframe",
242 | )
243 |
244 | parser.add_argument(
245 | "--blockMessage",
246 | help="If specified, when a URL is blocked, a record with this error message is"
247 | " added instead",
248 | )
249 |
250 | parser.add_argument(
251 | "--blockAds",
252 | help="If set, block advertisements from being loaded (based on Stephen Black's"
253 | " blocklist). Note that some bad domains are also blocked by zimit"
254 | " configuration even if this option is not set.",
255 | )
256 |
257 | parser.add_argument(
258 | "--adBlockMessage",
259 | help="If specified, when an ad is blocked, a record with this error message is"
260 | " added instead",
261 | )
262 |
263 | parser.add_argument(
264 | "--collection",
265 | help="Collection name to crawl to (replay will be accessible "
266 | "under this name in pywb preview). Default is crawl-@ts.",
267 | )
268 |
269 | parser.add_argument(
270 | "--headless",
271 | help="Run in headless mode, otherwise start xvfb",
272 | action="store_true",
273 | )
274 |
275 | parser.add_argument(
276 | "--driver",
277 | help="Custom driver for the crawler, if any",
278 | )
279 |
280 | parser.add_argument(
281 | "--generateCDX",
282 | help="If set, generate index (CDXJ) for use with pywb after crawl is done",
283 | action="store_true",
284 | )
285 |
286 | parser.add_argument(
287 | "--combineWARC",
288 | help="If set, combine the warcs",
289 | action="store_true",
290 | )
291 |
292 | parser.add_argument(
293 | "--rolloverSize",
294 | help="If set, declare the rollover size. Default is 1000000000.",
295 | type=int,
296 | )
297 |
298 | parser.add_argument(
299 | "--generateWACZ",
300 | help="If set, generate WACZ on disk",
301 | action="store_true",
302 | )
303 |
304 | parser.add_argument(
305 | "--logging",
306 | help="Crawler logging configuration",
307 | )
308 |
309 | parser.add_argument(
310 | "--logLevel",
311 | help="Comma-separated list of log levels to include in logs",
312 | )
313 |
314 | parser.add_argument(
315 | "--logContext",
316 | help="Comma-separated list of contexts to include in logs",
317 | choices=[
318 | "general",
319 | "worker",
320 | "recorder",
321 | "recorderNetwork",
322 | "writer",
323 | "state",
324 | "redis",
325 | "storage",
326 | "text",
327 | "exclusion",
328 | "screenshots",
329 | "screencast",
330 | "originOverride",
331 | "healthcheck",
332 | "browser",
333 | "blocking",
334 | "behavior",
335 | "behaviorScript",
336 | "jsError",
337 | "fetch",
338 | "pageStatus",
339 | "memoryStatus",
340 | "crawlStatus",
341 | "links",
342 | "sitemap",
343 | "wacz",
344 | "replay",
345 | "proxy",
346 | ],
347 | )
348 |
349 | parser.add_argument(
350 | "--logExcludeContext",
351 | help="Comma-separated list of contexts to NOT include in logs. Default is "
352 | "recorderNetwork,jsError,screencast",
353 | choices=[
354 | "general",
355 | "worker",
356 | "recorder",
357 | "recorderNetwork",
358 | "writer",
359 | "state",
360 | "redis",
361 | "storage",
362 | "text",
363 | "exclusion",
364 | "screenshots",
365 | "screencast",
366 | "originOverride",
367 | "healthcheck",
368 | "browser",
369 | "blocking",
370 | "behavior",
371 | "behaviorScript",
372 | "jsError",
373 | "fetch",
374 | "pageStatus",
375 | "memoryStatus",
376 | "crawlStatus",
377 | "links",
378 | "sitemap",
379 | "wacz",
380 | "replay",
381 | "proxy",
382 | ],
383 | )
384 |
385 | parser.add_argument(
386 | "--text",
387 | help="Extract initial (default) or final text to pages.jsonl or WARC resource"
388 | " record(s)",
389 | )
390 |
391 | # cwd is manipulated directly by zimit, based on --output / --build, we do not want
392 | # to expose this setting
393 |
394 | parser.add_argument(
395 | "--mobileDevice",
396 | help="Emulate mobile device by name from "
397 | "https://github.com/puppeteer/puppeteer/blob/"
398 | "main/packages/puppeteer-core/src/common/Device.ts",
399 | )
400 |
401 | parser.add_argument(
402 | "--userAgent",
403 | help="Override default user-agent with specified value ; --userAgentSuffix and "
404 | "--adminEmail have no effect when this is set",
405 | )
406 |
407 | parser.add_argument(
408 | "--userAgentSuffix",
409 | help="Append suffix to existing browser user-agent "
410 | "(ex: +MyCrawler, info@example.com)",
411 | default="+Zimit",
412 | )
413 |
414 | parser.add_argument(
415 | "--useSitemap",
416 | help="If set, use the URL as sitemap to get additional URLs for the crawl "
417 | "(usually /sitemap.xml)",
418 | )
419 |
420 | parser.add_argument(
421 | "--sitemapFromDate",
422 | help="If set, filter URLs from sitemaps to those greater than or equal to (>=)"
423 | " provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)",
424 | )
425 |
426 | parser.add_argument(
427 | "--sitemapToDate",
428 | help="If set, filter URLs from sitemaps to those less than or equal to (<=) "
429 | "provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)",
430 | )
431 |
432 | parser.add_argument(
433 | "--statsFilename",
434 | help="If set, output crawl stats as JSON to this file. Relative filename "
435 | "resolves to output directory, see --output.",
436 | )
437 |
438 | parser.add_argument(
439 | "--zimit-progress-file",
440 | help="If set, output zimit stats as JSON to this file. Forces the creation of"
441 | "crawler and warc2zim stats as well. If --statsFilename and/or "
442 | "--warc2zim-progress-file are not set, default temporary files will be used. "
443 | "Relative filename resolves to output directory, see --output.",
444 | )
445 |
446 | parser.add_argument(
447 | "--warc2zim-progress-file",
448 | help="If set, output warc2zim stats as JSON to this file. Relative filename "
449 | "resolves to output directory, see --output.",
450 | )
451 |
452 | parser.add_argument(
453 | "--behaviors",
454 | help="Which background behaviors to enable on each page. Default is autoplay,"
455 | "autofetch,autoscroll,siteSpecific",
456 | )
457 |
458 | parser.add_argument(
459 | "--behaviorTimeout",
460 | help="If >0, timeout (in seconds) for in-page behavior will run on each page. "
461 | "If 0, a behavior can run until finish. Default is 90.",
462 | type=int,
463 | )
464 |
465 | parser.add_argument(
466 | "--postLoadDelay",
467 | help="If >0, amount of time to sleep (in seconds) after page has loaded, before"
468 | " taking screenshots / getting text / running behaviors. Default is 0.",
469 | type=int,
470 | )
471 |
472 | parser.add_argument(
473 | "--pageExtraDelay",
474 | help="If >0, amount of time to sleep (in seconds) after behaviors "
475 | "before moving on to next page. Default is 0.",
476 | type=int,
477 | )
478 |
479 | parser.add_argument(
480 | "--dedupPolicy",
481 | help="Deduplication policy. Default is skip",
482 | choices=["skip", "revisit", "keep"],
483 | )
484 |
485 | parser.add_argument(
486 | "--profile",
487 | help="Path or HTTP(S) URL to tar.gz file which contains the browser profile "
488 | "directory",
489 | )
490 |
491 | parser.add_argument(
492 | "--screenshot",
493 | help="Screenshot options for crawler. One of view, thumbnail, fullPage, "
494 | "fullPageFinal or a comma-separated combination of those.",
495 | )
496 |
497 | parser.add_argument(
498 | "--screencastPort",
499 | help="If set to a non-zero value, starts an HTTP server with screencast "
500 | "accessible on this port.",
501 | type=int,
502 | )
503 |
504 | parser.add_argument(
505 | "--screencastRedis",
506 | help="If set, will use the state store redis pubsub for screencasting",
507 | action="store_true",
508 | )
509 |
510 | parser.add_argument(
511 | "--warcInfo",
512 | help="Optional fields added to the warcinfo record in combined WARCs",
513 | )
514 |
515 | parser.add_argument(
516 | "--saveState",
517 | help="If the crawl state should be serialized to the crawls/ directory. "
518 | "Defaults to 'partial', only saved when crawl is interrupted",
519 | choices=["never", "partial", "always"],
520 | )
521 |
522 | parser.add_argument(
523 | "--saveStateInterval",
524 | help="If save state is set to 'always', also save state during the crawl at "
525 | "this interval (in seconds). Default to 300.",
526 | type=int,
527 | )
528 |
529 | parser.add_argument(
530 | "--saveStateHistory",
531 | help="Number of save states to keep during the duration of a crawl. "
532 | "Default to 5.",
533 | type=int,
534 | )
535 |
536 | size_group = parser.add_mutually_exclusive_group()
537 | size_group.add_argument(
538 | "--sizeSoftLimit",
539 | help="If set, save crawl state and stop crawl if WARC size exceeds this value. "
540 | "ZIM will still be created.",
541 | type=int,
542 | )
543 | size_group.add_argument(
544 | "--sizeHardLimit",
545 | help="If set, exit crawler and fail the scraper immediately if WARC size "
546 | "exceeds this value",
547 | type=int,
548 | )
549 |
550 | parser.add_argument(
551 | "--diskUtilization",
552 | help="Save state and exit if disk utilization exceeds this percentage value."
553 | " Default (if not set) is 90%%. Set to 0 to disable disk utilization check.",
554 | type=int,
555 | default=90,
556 | )
557 |
558 | time_group = parser.add_mutually_exclusive_group()
559 | time_group.add_argument(
560 | "--timeSoftLimit",
561 | help="If set, save crawl state and stop crawl if WARC WARC(s) creation takes "
562 | "longer than this value, in seconds. ZIM will still be created.",
563 | type=int,
564 | )
565 | time_group.add_argument(
566 | "--timeHardLimit",
567 | help="If set, exit crawler and fail the scraper immediately if WARC(s) creation"
568 | " takes longer than this value, in seconds",
569 | type=int,
570 | )
571 |
572 | parser.add_argument(
573 | "--healthCheckPort",
574 | help="port to run healthcheck on",
575 | type=int,
576 | )
577 |
578 | parser.add_argument(
579 | "--overwrite",
580 | help="overwrite current crawl data: if set, existing collection directory "
581 | "will be deleted before crawl is started",
582 | action="store_true",
583 | )
584 |
585 | parser.add_argument(
586 | "--waitOnDone",
587 | help="if set, wait for interrupt signal when finished instead of exiting",
588 | action="store_true",
589 | )
590 |
591 | parser.add_argument(
592 | "--restartsOnError",
593 | help="if set, assume will be restarted if interrupted, don't run post-crawl "
594 | "processes on interrupt",
595 | action="store_true",
596 | )
597 |
598 | parser.add_argument(
599 | "--netIdleWait",
600 | help="If set, wait for network idle after page load and after behaviors are "
601 | "done (in seconds). if -1 (default), determine based on scope.",
602 | type=int,
603 | )
604 |
605 | parser.add_argument(
606 | "--lang",
607 | help="if set, sets the language used by the browser, should be ISO 639 "
608 | "language[-country] code",
609 | )
610 |
611 | parser.add_argument(
612 | "--originOverride",
613 | help="if set, will redirect requests from each origin in key to origin in the "
614 | "value, eg. --originOverride https://host:port=http://alt-host:alt-port",
615 | )
616 |
617 | parser.add_argument(
618 | "--logErrorsToRedis",
619 | help="If set, write error messages to redis",
620 | action="store_true",
621 | )
622 |
623 | parser.add_argument(
624 | "--writePagesToRedis",
625 | help="If set, write page objects to redis",
626 | action="store_true",
627 | )
628 |
629 | parser.add_argument(
630 | "--maxPageRetries",
631 | help="If set, number of times to retry a page that failed to load before page"
632 | " is considered to have failed. Default is 2.",
633 | type=int,
634 | )
635 |
636 | parser.add_argument(
637 | "--failOnFailedSeed",
638 | help="If set, crawler will fail with exit code 1 if any seed fails. When "
639 | "combined with --failOnInvalidStatus, will result in crawl failing with exit "
640 | "code 1 if any seed has a 4xx/5xx response",
641 | action="store_true",
642 | )
643 |
644 | parser.add_argument(
645 | "--failOnFailedLimit",
646 | help="If set, save state and exit if number of failed pages exceeds this value",
647 | action="store_true",
648 | )
649 |
650 | parser.add_argument(
651 | "--failOnInvalidStatus",
652 | help="If set, will treat pages with 4xx or 5xx response as failures. When "
653 | "combined with --failOnFailedLimit or --failOnFailedSeed may result in crawl "
654 | "failing due to non-200 responses",
655 | action="store_true",
656 | )
657 |
658 | # customBehaviors not included because it has special handling
659 | # debugAccessRedis not included due to custom redis engine in zimit
660 |
661 | parser.add_argument(
662 | "--debugAccessBrowser",
663 | help="if set, allow debugging browser on port 9222 via CDP",
664 | action="store_true",
665 | )
666 |
667 | parser.add_argument(
668 | "--warcPrefix",
669 | help="prefix for WARC files generated, including WARCs added to WACZ",
670 | )
671 |
672 | parser.add_argument(
673 | "--serviceWorker",
674 | help="service worker handling: disabled, enabled or disabled-if-profile. "
675 | "Default: disabled.",
676 | )
677 |
678 | parser.add_argument(
679 | "--proxyServer",
680 | help="if set, will use specified proxy server. Takes precedence over any env "
681 | "var proxy settings",
682 | )
683 |
684 | parser.add_argument(
685 | "--dryRun",
686 | help="If true, no archive data is written to disk, only pages and logs (and "
687 | "optionally saved state).",
688 | action="store_true",
689 | )
690 |
691 | parser.add_argument(
692 | "--qaSource",
693 | help="Required for QA mode. Path to the source WACZ or multi WACZ file for QA",
694 | )
695 |
696 | parser.add_argument(
697 | "--qaDebugImageDiff",
698 | help="if specified, will write crawl.png, replay.png and diff.png for each "
699 | "page where they're different",
700 | action="store_true",
701 | )
702 |
703 | parser.add_argument(
704 | "--sshProxyPrivateKeyFile",
705 | help="path to SSH private key for SOCKS5 over SSH proxy connection",
706 | )
707 |
708 | parser.add_argument(
709 | "--sshProxyKnownHostsFile",
710 | help="path to SSH known hosts file for SOCKS5 over SSH proxy connection",
711 | )
712 |
713 | parser.add_argument(
714 | "--keep",
715 | help="In case of failure, WARC files and other temporary files (which are "
716 | "stored as a subfolder of output directory) are always kept, otherwise "
717 | "they are automatically deleted. Use this flag to always keep WARC files, "
718 | "even in case of success.",
719 | action="store_true",
720 | )
721 |
722 | parser.add_argument(
723 | "--output",
724 | help="Output directory for ZIM. Default to /output.",
725 | default="/output",
726 | )
727 |
728 | parser.add_argument(
729 | "--build",
730 | help="Build directory for WARC files (if not set, output directory is used)",
731 | )
732 |
733 | parser.add_argument("--adminEmail", help="Admin Email for Zimit crawler")
734 |
735 | parser.add_argument(
736 | "--custom-css",
737 | help="[warc2zim] Custom CSS file URL/path to inject into all articles",
738 | )
739 |
740 | parser.add_argument(
741 | "--config",
742 | help="Path to YAML config file. If set, browsertrix-crawler will use this file"
743 | "to configure the crawling behaviour if not set via argument.",
744 | )
745 |
746 | parser.add_argument(
747 | "--version",
748 | help="Display scraper version and exit",
749 | action="version",
750 | version=f"Zimit {__version__}",
751 | )
752 |
753 | parser.add_argument(
754 | "--zim-lang",
755 | help="Language metadata of ZIM "
756 | "(warc2zim --lang param). ISO-639-3 code. "
757 | "Retrieved from homepage if found, fallback to `eng`",
758 | )
759 |
760 | parser.add_argument(
761 | "--custom-behaviors",
762 | help="JS code for custom behaviors to customize crawler. Single string with "
763 | "individual JS files URL/path separated by a comma",
764 | )
765 |
766 | parser.add_argument(
767 | "--warcs",
768 | help="Directly convert WARC archives to ZIM, by-passing the crawling phase. "
769 | "This argument must contain the path or HTTP(S) URL to either warc.gz files or"
770 | "to a tar or tar.gz containing the warc.gz files. Single value with individual "
771 | "path/URLs separated by comma",
772 | )
773 |
774 | parser.add_argument(
775 | "--acceptable-crawler-exit-codes",
776 | help="Non-zero crawler exit codes to consider as acceptable to continue with "
777 | " conversion of WARC to ZIM. Flag partialZim will be set in statsFilename (if "
778 | " used). Single value with individual error codes separated by comma",
779 | )
780 |
781 | # by design, all unknown args are for warc2zim ; known one are either for crawler
782 | # or shared
783 | known_args, warc2zim_args = parser.parse_known_args(raw_args)
784 |
785 | # pass a scraper suffix to warc2zim so that both zimit and warc2zim versions are
786 | # associated with the ZIM ; make it a CSV for easier parsing
787 | warc2zim_args.append("--scraper-suffix")
788 | warc2zim_args.append(f"zimit {__version__}")
789 |
790 | # pass url and output to warc2zim also
791 | if known_args.output:
792 | warc2zim_args.append("--output")
793 | warc2zim_args.append(known_args.output)
794 |
795 | user_agent_suffix = known_args.userAgentSuffix
796 | if known_args.adminEmail:
797 | user_agent_suffix += f" {known_args.adminEmail}"
798 |
799 | # make temp dir for this crawl
800 | global temp_root_dir # noqa: PLW0603
801 | if known_args.build:
802 | temp_root_dir = Path(tempfile.mkdtemp(dir=known_args.build, prefix=".tmp"))
803 | else:
804 | temp_root_dir = Path(tempfile.mkdtemp(dir=known_args.output, prefix=".tmp"))
805 |
806 | seeds = []
807 | if known_args.seeds:
808 | seeds += [get_cleaned_url(url) for url in known_args.seeds.split(",")]
809 | if known_args.seedFile:
810 | if re.match(r"^https?\://", known_args.seedFile):
811 | with tempfile.NamedTemporaryFile(
812 | dir=temp_root_dir,
813 | prefix="seeds_",
814 | suffix=".txt",
815 | delete_on_close=True,
816 | ) as filename:
817 | seed_file = Path(filename.name)
818 | download_file(known_args.seedFile, seed_file)
819 | seeds += [
820 | get_cleaned_url(url) for url in seed_file.read_text().splitlines()
821 | ]
822 | else:
823 | seeds += [
824 | get_cleaned_url(url)
825 | for url in Path(known_args.seedFile).read_text().splitlines()
826 | ]
827 | warc2zim_args.append("--url")
828 | warc2zim_args.append(seeds[0])
829 |
830 | if known_args.custom_css:
831 | warc2zim_args += ["--custom-css", known_args.custom_css]
832 |
833 | if known_args.title:
834 | warc2zim_args.append("--title")
835 | warc2zim_args.append(known_args.title)
836 |
837 | if known_args.description:
838 | warc2zim_args.append("--description")
839 | warc2zim_args.append(known_args.description)
840 |
841 | if known_args.long_description:
842 | warc2zim_args.append("--long-description")
843 | warc2zim_args.append(known_args.long_description)
844 |
845 | if known_args.zim_lang:
846 | warc2zim_args.append("--lang")
847 | warc2zim_args.append(known_args.zim_lang)
848 |
849 | logger.info("----------")
850 | logger.info("Testing warc2zim args")
851 | logger.info("Running: warc2zim " + " ".join(warc2zim_args))
852 | res = warc2zim(warc2zim_args)
853 | if res != NORMAL_WARC2ZIM_EXIT_CODE:
854 | logger.info("Exiting, invalid warc2zim params")
855 | return EXIT_CODE_WARC2ZIM_CHECK_FAILED
856 |
857 | if not known_args.keep:
858 | atexit.register(cleanup)
859 |
860 | # copy / download custom behaviors to one single folder and configure crawler
861 | if known_args.custom_behaviors:
862 | behaviors_dir = temp_root_dir / "custom-behaviors"
863 | behaviors_dir.mkdir()
864 | for custom_behavior in [
865 | custom_behavior.strip()
866 | for custom_behavior in known_args.custom_behaviors.split(",")
867 | ]:
868 | behaviors_file = tempfile.NamedTemporaryFile(
869 | dir=behaviors_dir,
870 | prefix="behavior_",
871 | suffix=".js",
872 | delete_on_close=False,
873 | )
874 | if re.match(r"^https?\://", custom_behavior):
875 | logger.info(
876 | f"Downloading browser profile from {custom_behavior} "
877 | f"to {behaviors_file.name}"
878 | )
879 | download_file(custom_behavior, Path(behaviors_file.name))
880 | else:
881 | logger.info(
882 | f"Copying browser profile from {custom_behavior} "
883 | f"to {behaviors_file.name}"
884 | )
885 | shutil.copy(custom_behavior, behaviors_file.name)
886 | known_args.customBehaviors = str(behaviors_dir)
887 | else:
888 | known_args.customBehaviors = None
889 |
890 | crawler_args = get_crawler_cmd_line(known_args)
891 | for seed in seeds:
892 | crawler_args.append("--seeds")
893 | crawler_args.append(seed)
894 |
895 | crawler_args.append("--userAgentSuffix")
896 | crawler_args.append(user_agent_suffix)
897 |
898 | crawler_args.append("--cwd")
899 | crawler_args.append(str(temp_root_dir))
900 |
901 | output_dir = Path(known_args.output)
902 | warc2zim_stats_file = (
903 | Path(known_args.warc2zim_progress_file)
904 | if known_args.warc2zim_progress_file
905 | else temp_root_dir / "warc2zim.json"
906 | )
907 | if not warc2zim_stats_file.is_absolute():
908 | warc2zim_stats_file = output_dir / warc2zim_stats_file
909 | warc2zim_stats_file.parent.mkdir(parents=True, exist_ok=True)
910 | warc2zim_stats_file.unlink(missing_ok=True)
911 |
912 | crawler_stats_file = (
913 | Path(known_args.statsFilename)
914 | if known_args.statsFilename
915 | else temp_root_dir / "crawl.json"
916 | )
917 | if not crawler_stats_file.is_absolute():
918 | crawler_stats_file = output_dir / crawler_stats_file
919 | crawler_stats_file.parent.mkdir(parents=True, exist_ok=True)
920 | crawler_stats_file.unlink(missing_ok=True)
921 |
922 | zimit_stats_file = (
923 | Path(known_args.zimit_progress_file)
924 | if known_args.zimit_progress_file
925 | else temp_root_dir / "stats.json"
926 | )
927 | if not zimit_stats_file.is_absolute():
928 | zimit_stats_file = output_dir / zimit_stats_file
929 | zimit_stats_file.parent.mkdir(parents=True, exist_ok=True)
930 | zimit_stats_file.unlink(missing_ok=True)
931 |
932 | if known_args.zimit_progress_file:
933 | # setup inotify crawler progress watcher
934 | watcher = ProgressFileWatcher(
935 | zimit_stats_path=zimit_stats_file,
936 | crawl_stats_path=crawler_stats_file,
937 | warc2zim_stats_path=warc2zim_stats_file,
938 | )
939 | logger.info(
940 | f"Writing zimit progress to {watcher.zimit_stats_path}, crawler progress to"
941 | f" {watcher.crawl_stats_path} and warc2zim progress to "
942 | f"{watcher.warc2zim_stats_path}"
943 | )
944 | # update crawler command
945 | crawler_args.append("--statsFilename")
946 | crawler_args.append(str(crawler_stats_file))
947 | # update warc2zim command
948 | warc2zim_args.append("-v")
949 | warc2zim_args.append("--progress-file")
950 | warc2zim_args.append(str(warc2zim_stats_file))
951 | watcher.watch()
952 | else:
953 | if known_args.statsFilename:
954 | logger.info(f"Writing crawler progress to {crawler_stats_file}")
955 | crawler_args.append("--statsFilename")
956 | crawler_args.append(str(crawler_stats_file))
957 | if known_args.warc2zim_progress_file:
958 | logger.info(f"Writing warc2zim progress to {warc2zim_stats_file}")
959 | warc2zim_args.append("-v")
960 | warc2zim_args.append("--progress-file")
961 | warc2zim_args.append(str(warc2zim_stats_file))
962 |
963 | cmd_line = " ".join(crawler_args)
964 |
965 | logger.info("")
966 | logger.info("----------")
967 | logger.info(
968 | f"Output to tempdir: {temp_root_dir} - "
969 | f"{'will keep' if known_args.keep else 'will delete'}"
970 | )
971 |
972 | partial_zim = False
973 |
974 | # if warc files are passed, do not run browsertrix crawler but fetch the files if
975 | # they are provided as an HTTP URL + extract the archive if it is a tar.gz
976 | warc_files: list[Path] = []
977 | if known_args.warcs:
978 | for warc_location in [
979 | warc_location.strip() for warc_location in known_args.warcs.split(",")
980 | ]:
981 | suffix = "".join(Path(urllib.parse.urlparse(warc_location).path).suffixes)
982 | if suffix not in {".tar", ".tar.gz", ".warc", ".warc.gz"}:
983 | raise Exception(f"Unsupported file at {warc_location}")
984 |
985 | filename = tempfile.NamedTemporaryFile(
986 | dir=temp_root_dir,
987 | prefix="warc_",
988 | suffix=suffix,
989 | delete_on_close=False,
990 | )
991 |
992 | if not re.match(r"^https?\://", warc_location):
993 | # warc_location is not a URL, so it is a path, simply add it to the list
994 | if not Path(warc_location).exists():
995 | raise Exception(f"Impossible to find file at {warc_location}")
996 |
997 | # if it is a plain warc or warc.gz, simply add it to the list
998 | if suffix in {".warc", ".warc.gz"}:
999 | warc_files.append(Path(warc_location))
1000 | continue
1001 |
1002 | # otherwise extract tar.gz but do not delete it afterwards
1003 | extract_path = temp_root_dir / f"{filename.name}_files"
1004 | logger.info(
1005 | f"Extracting WARC(s) from {warc_location} to {extract_path}"
1006 | )
1007 | with tarfile.open(warc_location, "r") as fh:
1008 | # Extract all the contents to the specified directory
1009 | fh.extractall(path=extract_path, filter="data")
1010 | warc_files.append(Path(extract_path))
1011 | continue
1012 |
1013 | # warc_location is a URL, let's download it to a temp name to avoid name
1014 | # collisions
1015 | warc_file = Path(filename.name)
1016 | logger.info(f"Downloading WARC(s) from {warc_location} to {warc_file}")
1017 | download_file(warc_location, warc_file)
1018 |
1019 | # if it is a plain warc or warc.gz, simply add it to the list
1020 | if suffix in {".warc", ".warc.gz"}:
1021 | warc_files.append(warc_file)
1022 | continue
1023 |
1024 | # otherwise extract tar.gz and delete it afterwards
1025 | extract_path = temp_root_dir / f"{filename.name}_files"
1026 | logger.info(f"Extracting WARC(s) from {warc_file} to {extract_path}")
1027 | with tarfile.open(warc_file, "r") as fh:
1028 | # Extract all the contents to the specified directory
1029 | fh.extractall(path=extract_path, filter="data")
1030 | logger.info(f"Deleting archive at {warc_file}")
1031 | warc_file.unlink()
1032 | warc_files.append(Path(extract_path))
1033 |
1034 | else:
1035 |
1036 | logger.info(f"Running browsertrix-crawler crawl: {cmd_line}")
1037 | crawl = subprocess.run(crawler_args, check=False)
1038 | if (
1039 | crawl.returncode == EXIT_CODE_CRAWLER_SIZE_LIMIT_HIT
1040 | and known_args.sizeSoftLimit
1041 | ):
1042 | logger.info(
1043 | "Crawl size soft limit hit. Continuing with warc2zim conversion."
1044 | )
1045 | if known_args.zimit_progress_file:
1046 | partial_zim = True
1047 | elif (
1048 | crawl.returncode == EXIT_CODE_CRAWLER_TIME_LIMIT_HIT
1049 | and known_args.timeSoftLimit
1050 | ):
1051 | logger.info(
1052 | "Crawl time soft limit hit. Continuing with warc2zim conversion."
1053 | )
1054 | if known_args.zimit_progress_file:
1055 | partial_zim = True
1056 | elif crawl.returncode != 0:
1057 | logger.error(
1058 | f"Crawl returned an error: {crawl.returncode}, scraper exiting"
1059 | )
1060 | cancel_cleanup()
1061 | return crawl.returncode
1062 |
1063 | if known_args.collection:
1064 | warc_files = [
1065 | temp_root_dir.joinpath(f"collections/{known_args.collection}/archive/")
1066 | ]
1067 |
1068 | else:
1069 | warc_dirs = sorted(
1070 | temp_root_dir.rglob("collections/crawl-*/archive/"),
1071 | key=lambda path: path.lstat().st_mtime,
1072 | )
1073 | if len(warc_dirs) == 0:
1074 | raise RuntimeError(
1075 | "Failed to find directory where WARC files have been created"
1076 | )
1077 | elif len(warc_dirs) > 1:
1078 | logger.info(
1079 | "Found many WARC files directories, only most recently modified one"
1080 | " will be used"
1081 | )
1082 | for directory in warc_dirs:
1083 | logger.info(f"- {directory}")
1084 | warc_files = [warc_dirs[-1]]
1085 |
1086 | logger.info("")
1087 | logger.info("----------")
1088 | logger.info(
1089 | f"Processing WARC files in/at "
1090 | f'{" ".join(str(warc_file) for warc_file in warc_files)}'
1091 | )
1092 | warc2zim_args.extend(str(warc_file) for warc_file in warc_files)
1093 |
1094 | logger.info(f"Calling warc2zim with these args: {warc2zim_args}")
1095 |
1096 | warc2zim_exit_code = warc2zim(warc2zim_args)
1097 |
1098 | if known_args.zimit_progress_file:
1099 | stats_content = json.loads(zimit_stats_file.read_bytes())
1100 | stats_content["partialZim"] = partial_zim
1101 | zimit_stats_file.write_text(json.dumps(stats_content))
1102 |
1103 | # also call cancel_cleanup when --keep, even if it is not supposed to be registered,
1104 | # so that we will display temporary files location just like in other situations
1105 | if warc2zim_exit_code or known_args.keep:
1106 | cancel_cleanup()
1107 |
1108 | return warc2zim_exit_code
1109 |
1110 |
def get_cleaned_url(url: str):
    """Normalize a URL by removing an explicit default port.

    Browsers drop `:80` on http and `:443` on https URIs; do the same so
    seeds match what the crawler/replayer will actually record.
    """
    parsed = urllib.parse.urlparse(url)

    # default port per scheme; anything else is kept verbatim
    default_ports = {"http": 80, "https": 443}
    if parsed.port is not None and parsed.port == default_ports.get(parsed.scheme):
        parsed = rebuild_uri(parsed, port="")

    return parsed.geturl()
1121 |
1122 |
def get_crawler_cmd_line(args):
    """Build the command line for Browsertrix crawler.

    Most zimit arguments are passed through verbatim. Two pairs are mapped to
    a single crawler flag: sizeSoftLimit/sizeHardLimit both become
    `--sizeLimit` and timeSoftLimit/timeHardLimit both become `--timeLimit`
    (zimit only distinguishes soft/hard to decide how to treat the crawler
    exit code). Returns the command as a list of strings starting with
    `crawl`.
    """
    node_cmd = ["crawl"]
    # NOTE: "collection" used to appear twice in this list, emitting the
    # --collection flag twice; the duplicate has been removed.
    for arg in [
        "title",
        "description",
        "workers",
        "crawlId",
        "waitUntil",
        "depth",
        "extraHops",
        "pageLimit",
        "maxPageLimit",
        "pageLoadTimeout",
        "scopeType",
        "scopeIncludeRx",
        "scopeExcludeRx",
        "collection",
        "allowHashUrls",
        "selectLinks",
        "clickSelector",
        "blockRules",
        "blockMessage",
        "blockAds",
        "adBlockMessage",
        "headless",
        "driver",
        "generateCDX",
        "combineWARC",
        "rolloverSize",
        "generateWACZ",
        "logging",
        "logLevel",
        "logContext",
        "logExcludeContext",
        "text",
        "mobileDevice",
        "userAgent",
        # userAgentSuffix (manipulated),
        "useSitemap",
        "sitemapFromDate",
        "sitemapToDate",
        # statsFilename (manipulated),
        "behaviors",
        "behaviorTimeout",
        "postLoadDelay",
        "pageExtraDelay",
        "dedupPolicy",
        "profile",
        "screenshot",
        "screencastPort",
        "screencastRedis",
        "warcInfo",
        "saveState",
        "saveStateInterval",
        "saveStateHistory",
        "sizeSoftLimit",
        "sizeHardLimit",
        "diskUtilization",
        "timeSoftLimit",
        "timeHardLimit",
        "healthCheckPort",
        "overwrite",
        "waitOnDone",
        "restartsOnError",
        "netIdleWait",
        "lang",
        "originOverride",
        "logErrorsToRedis",
        "writePagesToRedis",
        "maxPageRetries",
        "failOnFailedSeed",
        "failOnFailedLimit",
        "failOnInvalidStatus",
        "debugAccessBrowser",
        "warcPrefix",
        "serviceWorker",
        "proxyServer",
        "dryRun",
        "qaSource",
        "qaDebugImageDiff",
        "sshProxyPrivateKeyFile",
        "sshProxyKnownHostsFile",
        "customBehaviors",
        "config",
    ]:
        value = getattr(args, arg)
        if arg == "userAgent":
            # - strip leading whitespace which are not allowed on some websites
            # - strip trailing whitespace which are either not allowed if no suffix is
            #   used, or duplicate with the automatically added one if a suffix is there
            # - value is None when userAgent is not passed
            if value:
                value = value.strip()
            if not value:
                # ignore empty userAgent arg and keep crawler default value if empty
                continue
        # unset options and disabled boolean flags are simply omitted
        if value is None or (isinstance(value, bool) and value is False):
            continue
        # soft/hard variants collapse onto the crawler's single limit flag
        if arg in ("sizeSoftLimit", "sizeHardLimit"):
            flag = "sizeLimit"
        elif arg in ("timeSoftLimit", "timeHardLimit"):
            flag = "timeLimit"
        else:
            flag = arg
        node_cmd.append(f"--{flag}")
        # an enabled boolean is a bare flag; everything else carries a value
        if not isinstance(value, bool):
            node_cmd.append(str(value))

    return node_cmd
1235 |
1236 |
def sigint_handler(*args):  # noqa: ARG001
    """Log a shutdown banner and exit with code 3 on SIGINT/SIGTERM."""
    banner = ("", "", "SIGINT/SIGTERM received, stopping zimit", "", "")
    for line in banner:
        logger.info(line)
    sys.exit(3)
1244 |
1245 |
def zimit():
    """Console entry point: run the scraper and propagate its exit code."""
    exit_code = run(sys.argv[1:])
    sys.exit(exit_code)
1248 |
1249 |
# Install handlers at import time so both SIGINT (Ctrl-C) and SIGTERM
# (e.g. container stop) go through the same graceful shutdown path
signal.signal(signal.SIGINT, sigint_handler)
signal.signal(signal.SIGTERM, sigint_handler)


if __name__ == "__main__":
    zimit()
1256 |
--------------------------------------------------------------------------------
/tasks.py:
--------------------------------------------------------------------------------
1 | # pyright: strict, reportUntypedFunctionDecorator=false
2 | import os
3 |
4 | from invoke.context import Context
5 | from invoke.tasks import task # pyright: ignore [reportUnknownVariableType]
6 |
# pseudo-terminal output is nicer locally but is not available on CI runners
use_pty = not os.getenv("CI", "")
8 |
9 |
@task(optional=["args"], help={"args": "pytest additional arguments"})
def test(ctx: Context, args: str = ""):
    """Run the test suite once, without collecting coverage."""
    command = f"pytest {args}"
    ctx.run(command, pty=use_pty)
14 |
15 |
@task(optional=["args"], help={"args": "pytest additional arguments"})
def test_cov(ctx: Context, args: str = ""):
    """run test with coverage"""
    ctx.run(f"coverage run -m pytest {args}", pty=use_pty)
20 |
21 |
@task(optional=["html"], help={"html": "flag to export html report"})
def report_cov(ctx: Context, *, html: bool = False):
    """Merge coverage data files and display the coverage report."""
    # (command, tolerate failure): combining may fail when there is a single
    # data file, which is fine
    steps = [
        ("coverage combine", True),
        ("coverage report --show-missing", False),
    ]
    if html:
        steps.append(("coverage html", False))
    for command, tolerate_failure in steps:
        ctx.run(command, warn=tolerate_failure, pty=use_pty)
29 |
30 |
@task(
    optional=["args", "html"],
    help={
        "args": "pytest additional arguments",
        "html": "flag to export html report",
    },
)
def coverage(ctx: Context, args: str = "", *, html: bool = False):
    """Run the whole test suite under coverage, then report the outcome."""
    test_cov(ctx, args=args)
    report_cov(ctx, html=html)
42 |
43 |
@task(optional=["args"], help={"args": "black additional arguments"})
def lint_black(ctx: Context, args: str = "."):
    """Verify black formatting without modifying any file."""
    if not args:
        args = "."  # needed for hatch script
    ctx.run("black --version", pty=use_pty)
    ctx.run(f"black --check --diff {args}", pty=use_pty)
49 |
50 |
@task(optional=["args"], help={"args": "ruff additional arguments"})
def lint_ruff(ctx: Context, args: str = "."):
    """Verify ruff rules pass without applying any fix."""
    if not args:
        args = "."  # needed for hatch script
    ctx.run("ruff --version", pty=use_pty)
    ctx.run(f"ruff check {args}", pty=use_pty)
56 |
57 |
@task(
    optional=["args"],
    help={
        "args": "linting tools (black, ruff) additional arguments, typically a path",
    },
)
def lintall(ctx: Context, args: str = "."):
    """Run every lint check (black, then ruff)."""
    if not args:
        args = "."  # needed for hatch script
    lint_black(ctx, args)
    lint_ruff(ctx, args)
69 |
70 |
@task(optional=["args"], help={"args": "check tools (pyright) additional arguments"})
def check_pyright(ctx: Context, args: str = ""):
    """check static types with pyright"""
    # pass pty like every other task's version call, for consistent output
    ctx.run("pyright --version", pty=use_pty)
    ctx.run(f"pyright {args}", pty=use_pty)
76 |
77 |
@task(optional=["args"], help={"args": "check tools (pyright) additional arguments"})
def checkall(ctx: Context, args: str = ""):
    """check static types"""
    # single checker for now; kept as a separate task for symmetry with lintall
    check_pyright(ctx, args)
82 |
83 |
@task(optional=["args"], help={"args": "black additional arguments"})
def fix_black(ctx: Context, args: str = "."):
    """fix black formatting"""
    args = args or "."  # needed for hatch script
    command = f"black {args}"
    ctx.run(command, pty=use_pty)
89 |
90 |
@task(optional=["args"], help={"args": "ruff additional arguments"})
def fix_ruff(ctx: Context, args: str = "."):
    """fix all ruff rules"""
    args = args or "."  # needed for hatch script
    command = f"ruff check --fix {args}"
    ctx.run(command, pty=use_pty)
96 |
97 |
@task(
    optional=["args"],
    help={
        "args": "linting tools (black, ruff) additional arguments, typically a path",
    },
)
def fixall(ctx: Context, args: str = "."):
    """Fix everything automatically"""
    args = args or "."  # needed for hatch script
    # apply auto-fixes first, then re-lint to surface anything left over
    fix_black(ctx, args)
    fix_ruff(ctx, args)
    lintall(ctx, args)
110 |
--------------------------------------------------------------------------------
/tests-daily/Dockerfile:
--------------------------------------------------------------------------------
# Let's extract kiwix-tools as usual on alpine temporary build container
# NOTE: use uppercase AS and key=value LABEL/ENV forms; the lowercase/space
# separated variants are deprecated by Docker and emit build warnings
FROM alpine:3.21 AS kiwix-serve
LABEL org.opencontainers.image.source=https://github.com/openzim/kiwix-tools

# TARGETPLATFORM is injected by docker build
ARG TARGETPLATFORM
ARG KIWIX_TOOLS_VERSION

RUN set -e && \
    # default (no KIWIX_TOOLS_VERSION set) to today's nightly
    if [ -z "$KIWIX_TOOLS_VERSION" ] ; then KIWIX_TOOLS_VERSION=$(date +"%Y-%m-%d") ; fi && \
    apk --no-cache add dumb-init curl && \
    echo "TARGETPLATFORM: $TARGETPLATFORM" && \
    if [ "$TARGETPLATFORM" = "linux/386" ]; then ARCH="i586"; \
    # linux/arm64/v8 points to linux/arm64
    elif [ "$TARGETPLATFORM" = "linux/arm64/v8" \
           -o "$TARGETPLATFORM" = "linux/arm64" ]; then ARCH="aarch64"; \
    # linux/arm translates to linux/arm/v7
    elif [ "$TARGETPLATFORM" = "linux/arm/v7" ]; then ARCH="armv8"; \
    elif [ "$TARGETPLATFORM" = "linux/arm/v6" ]; then ARCH="armv6"; \
    elif [ "$TARGETPLATFORM" = "linux/amd64/v3" \
           -o "$TARGETPLATFORM" = "linux/amd64/v2" \
           -o "$TARGETPLATFORM" = "linux/amd64" ]; then ARCH="x86_64"; \
    # we don't support any other arch so let it fail
    else ARCH="unknown"; fi && \
    # download requested kiwix-tools version
    url="http://mirror.download.kiwix.org/nightly/$KIWIX_TOOLS_VERSION/kiwix-tools_linux-$ARCH-$KIWIX_TOOLS_VERSION.tar.gz" && \
    echo "URL: $url" && \
    mkdir /kiwix-serve && \
    curl -k -L $url | tar -xz -C /kiwix-serve --strip-components 1

# Build real "workload" container
FROM python:3.13-slim-bookworm

# Add kiwix-serve
COPY --from=kiwix-serve /kiwix-serve /usr/local/bin

# Update apt + install dependencies + install Google Chrome dependencies + clean-up apt lists
RUN apt-get update -y && \
    apt-get install -qqy wget xvfb unzip jq && \
    apt-get install -qqy libxss1 libappindicator1 libgconf-2-4 \
    fonts-liberation libasound2 libnspr4 libnss3 libx11-xcb1 libxtst6 lsb-release xdg-utils \
    libgbm1 libnss3 libatk-bridge2.0-0 libgtk-3-0 libx11-xcb1 libxcb-dri3-0 && \
    rm -rf /var/lib/apt/lists/*

# Fetch the latest version numbers and URLs for Chrome and ChromeDriver
RUN wget -q -O /tmp/versions.json https://googlechromelabs.github.io/chrome-for-testing/last-known-good-versions-with-downloads.json

# Install chrome
RUN CHROME_URL=$(jq -r '.channels.Stable.downloads.chrome[] | select(.platform=="linux64") | .url' /tmp/versions.json) && \
    wget -q --continue -O /tmp/chrome-linux64.zip $CHROME_URL && \
    unzip /tmp/chrome-linux64.zip -d /opt/chrome

RUN chmod +x /opt/chrome/chrome-linux64/chrome

# Install chromedriver
RUN CHROMEDRIVER_URL=$(jq -r '.channels.Stable.downloads.chromedriver[] | select(.platform=="linux64") | .url' /tmp/versions.json) && \
    wget -q --continue -O /tmp/chromedriver-linux64.zip $CHROMEDRIVER_URL && \
    unzip /tmp/chromedriver-linux64.zip -d /opt/chromedriver && \
    chmod +x /opt/chromedriver/chromedriver-linux64/chromedriver

# Set up Chromedriver Environment variables
ENV CHROMEDRIVER_DIR=/opt/chromedriver
ENV PATH=$CHROMEDRIVER_DIR:$PATH

# Clean up
RUN rm /tmp/chrome-linux64.zip /tmp/chromedriver-linux64.zip /tmp/versions.json

# Update pip, install selenium, create work directory
RUN \
    python -m pip install --no-cache-dir -U \
      pip \
      selenium==4.28.1 \
      pytest==8.3.4 \
    && mkdir -p /work
76 |
--------------------------------------------------------------------------------
/tests-daily/daily.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import subprocess
4 | from time import sleep
5 |
6 | import pytest
7 | from selenium import webdriver
8 | from selenium.webdriver.chrome.options import Options
9 | from selenium.webdriver.chrome.service import Service as ChromeService
10 | from selenium.webdriver.common.by import By
11 | from selenium.webdriver.support import expected_conditions
12 | from selenium.webdriver.support.ui import WebDriverWait
13 |
# secs to wait after launching kiwix-serve before assuming it is ready
KIWIX_SERVE_START_SLEEP = 1

# name of the ZIM (without extension) expected under /output
ZIM_NAME = "tests_eng_test-website"
# in-ZIM path of a youtube embed captured through the fuzzy-matching rules
YOUTUBE_VIDEO_PATH = "youtube.fuzzy.replayweb.page/embed/g5skcrNXdDM"

# set SKIP_YOUTUBE_TEST=true in the environment to skip the youtube check
SKIP_YOUTUBE_TEST = os.getenv("SKIP_YOUTUBE_TEST", "False").lower() == "true"

# secs to let the video play before re-checking it is still playing
CHECK_VIDEO_IS_PLAYING_AFTER_SECS = 30

logger = logging.getLogger(__name__)
24 |
25 |
@pytest.fixture(scope="module")
def chrome_driver():
    """Start chrome and setup chrome driver / selenium"""

    logger.info("Starting Chrome")
    options = Options()
    # headless + no-sandbox are required to run inside the container
    for flag in ("--headless", "--no-sandbox"):
        options.add_argument(flag)
    # Other options of interest:
    # --disable-dev-shm-usage (not needed anymore with recent chrome versions)
    # --disable-gpu (important for some versions of Chrome)
    # --remote-debugging-port=9222 (should you need to remote debug)

    # Chrome binary and ChromeDriver locations baked into the image
    options.binary_location = "/opt/chrome/chrome-linux64/chrome"
    service = ChromeService(
        executable_path="/opt/chromedriver/chromedriver-linux64/chromedriver"
    )

    driver = webdriver.Chrome(service=service, options=options)

    yield driver

    # Cleanup
    logger.info("Quitting Chrome")
    driver.quit()
55 |
56 |
@pytest.fixture(scope="module")
def kiwix_serve():
    """Start kiwix-serve with given ZIM"""

    logger.info("Starting kiwix-serve")
    command = [
        "/usr/bin/env",
        "/usr/local/bin/kiwix-serve",
        f"/output/{ZIM_NAME}.zim",
    ]
    process = subprocess.Popen(command)

    logger.info(
        f"Waiting {KIWIX_SERVE_START_SLEEP} secs to be 'sure' that kiwix-serve is ready"
    )
    sleep(KIWIX_SERVE_START_SLEEP)

    # poll() returns the exit code once the process has died
    if process.poll() is not None:
        raise Exception("kiwix-serve has terminated too early")

    yield process

    # Cleanup
    logger.info("Quitting kiwix-serve")
    process.terminate()
83 |
84 |
@pytest.mark.skipif(SKIP_YOUTUBE_TEST, reason="Youtube test disabled by environment")
def test_youtube_video(chrome_driver, kiwix_serve):  # noqa: ARG001
    """Test that youtube video loads, and still plays after a while"""

    def is_playing(element):
        # arguments[0] is the video tag passed to execute_script
        return chrome_driver.execute_script(
            "return arguments[0].paused === false", element
        )

    chrome_driver.get(f"http://localhost:80/content/{ZIM_NAME}/{YOUTUBE_VIDEO_PATH}")

    if chrome_driver.title == "Content not found":
        raise Exception("Wrong URL, kiwix-serve said that content is not found")

    play_button = WebDriverWait(chrome_driver, 1).until(
        expected_conditions.presence_of_element_located(
            (By.XPATH, "//button[@title='Play']")
        )
    )
    logger.info("Play button found in page")
    play_button.click()

    video = WebDriverWait(chrome_driver, 1).until(
        expected_conditions.presence_of_element_located((By.TAG_NAME, "video"))
    )
    logger.info("Video found in page")

    if not is_playing(video):
        raise Exception("Video is not playing, failed to start probably")
    logger.info("Video is playing")

    logger.info(
        f"Waiting {CHECK_VIDEO_IS_PLAYING_AFTER_SECS} secs to check video is still "
        "playing"
    )
    sleep(CHECK_VIDEO_IS_PLAYING_AFTER_SECS)

    if not is_playing(video):
        raise Exception(
            "Video is not playing anymore after "
            f"{CHECK_VIDEO_IS_PLAYING_AFTER_SECS} secs"
        )
    logger.info("Video is still playing")
129 |
--------------------------------------------------------------------------------
/tests-integration/README.md:
--------------------------------------------------------------------------------
1 | These are integration tests, meant to be run inside the CI (because we need to first perform a zimit run on a given website and then check its output).
2 |
--------------------------------------------------------------------------------
/tests-integration/integration.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import json
3 | import os
4 | from pathlib import Path
5 |
6 | import pytest
7 | from warcio import ArchiveIterator
8 | from zimscraperlib.zim import Archive
9 |
10 |
11 | @pytest.mark.parametrize(
12 | "filename",
13 | [
14 | pytest.param("/output/tests_en_onepage.zim", id="onepage"),
15 | pytest.param("/output/tests_en_sizesoftlimit.zim", id="sizesoftlimit"),
16 | pytest.param("/output/tests_en_timesoftlimit.zim", id="timesoftlimit"),
17 | ],
18 | )
19 | def test_zim_created(filename):
20 | """Ensure ZIM file exists"""
21 | assert os.path.isfile(filename)
22 |
23 |
@pytest.mark.parametrize(
    "filename",
    [
        pytest.param("/output/tests_en_sizehardlimit.zim", id="sizehardlimit"),
        pytest.param("/output/tests_en_timehardlimit.zim", id="timehardlimit"),
    ],
)
def test_zim_not_created(filename):
    """Ensure ZIM file does not exists"""
    # Path.exists() is equivalent to os.path.exists()
    assert not Path(filename).exists()
34 |
35 |
def test_zim_main_page():
    """Main page specified, http://website.test.openzim.org/http-return-codes.html,
    was a redirect to https
    Ensure main page is the redirected page"""

    main_entry = Archive(Path("/output/tests_en_onepage.zim")).main_entry
    assert main_entry.is_redirect
    redirect_path = main_entry.get_redirect_entry().path
    assert redirect_path == "website.test.openzim.org/http-return-codes.html"
47 |
48 |
def test_zim_scraper():
    """Check content of scraper metadata"""

    scraper = Archive(Path("/output/tests_en_onepage.zim")).get_text_metadata("Scraper")
    # every tool in the pipeline must be credited (trailing space = version follows)
    for tool in ("zimit ", "warc2zim ", "Browsertrix-Crawler "):
        assert tool in scraper
57 |
58 |
def test_files_list():
    """Check that expected files are present in the ZIM at proper path"""
    zim_fh = Archive(Path("/output/tests_en_onepage.zim"))
    # static replay helpers + every crawled page/resource we expect
    expected_entries = (
        "_zim_static/__wb_module_decl.js",
        "_zim_static/wombat.js",
        "_zim_static/wombatSetup.js",
        "website.test.openzim.org/http-return-codes.html",
        "website.test.openzim.org/200-response",
        "website.test.openzim.org/201-response",
        "website.test.openzim.org/202-response",
        "website.test.openzim.org/301-external-redirect-ok",
        "website.test.openzim.org/301-internal-redirect-ok",
        "website.test.openzim.org/302-external-redirect-ok",
        "website.test.openzim.org/302-internal-redirect-ok",
        "website.test.openzim.org/307-external-redirect-ok",
        "website.test.openzim.org/307-internal-redirect-ok",
        "website.test.openzim.org/308-external-redirect-ok",
        "website.test.openzim.org/308-internal-redirect-ok",
        "website.test.openzim.org/http-return-codes.html",
        "website.test.openzim.org/icons/favicon.ico",
        "website.test.openzim.org/icons/site.webmanifest",
        "website.test.openzim.org/internal_redirect_target.html",
        "www.example.com/",
    )
    for entry in expected_entries:
        # get_content raises / returns falsy when the entry is absent
        assert zim_fh.get_content(entry)
85 |
86 |
def test_user_agent():
    """Test that mobile user agent was used

    Check is done in WARC request records with custom Zimit and email suffix
    """

    found = False
    for warc in glob.glob("/output/.tmp*/collections/crawl-*/archive/*.warc.gz"):
        with open(warc, "rb") as fh:
            for record in ArchiveIterator(fh):
                # only request records carry the User-Agent we sent
                if record.rec_type != "request":
                    continue
                print(record.http_headers)  # noqa: T201
                ua = record.http_headers.get_header("User-Agent")
                if not ua:
                    continue
                assert "Mozilla" in ua
                assert ua.endswith(" +Zimit test@example.com")
                found = True

    # should find at least one
    assert found
107 |
108 |
def test_stats_output_standard():
    """Check the stats files emitted by crawler, warc2zim and zimit itself"""
    expectations = {
        "/output/crawl.json": {
            "crawled": 17,
            "pending": 0,
            "pendingPages": [],
            "total": 35,
            "failed": 18,
            "limit": {"max": 0, "hit": False},
        },
        "/output/warc2zim.json": {
            "written": 8,
            "total": 8,
        },
        "/output/stats.json": {
            "done": 8,
            "total": 8,
            "partialZim": False,
        },
    }
    for path, expected in expectations.items():
        assert json.loads(Path(path).read_bytes()) == expected
129 |
130 |
131 | @pytest.mark.parametrize(
132 | "filename",
133 | [
134 | pytest.param("/output/stats_sizesoftlimit.json", id="sizesoftlimit"),
135 | pytest.param("/output/stats_timesoftlimit.json", id="timesoftlimit"),
136 | ],
137 | )
138 | def test_stats_output_softlimit(filename):
139 | file = Path(filename)
140 | assert file.exists
141 | content = json.loads(file.read_bytes())
142 | assert "done" in content
143 | assert "total" in content
144 | assert "partialZim" in content
145 | assert content["partialZim"]
146 |
--------------------------------------------------------------------------------
/tests/test_dummy.py:
--------------------------------------------------------------------------------
1 | from zimit.zimit import NORMAL_WARC2ZIM_EXIT_CODE
2 |
3 |
# dummy test, just to have coverage report done
def test_something_exists():
    """Ensure the imported constant is defined and truthy (smoke test only)."""
    assert NORMAL_WARC2ZIM_EXIT_CODE
7 |
--------------------------------------------------------------------------------