├── tests ├── __init__.py ├── mock_server │ ├── __init__.py │ └── templates │ │ ├── shift_jis.html │ │ ├── malformed.html │ │ ├── example-single.jsonl │ │ ├── example.json.bad │ │ ├── example.atom │ │ ├── example.jsonl │ │ ├── example.json │ │ ├── example.rss │ │ └── example.com.html ├── tags_migration │ └── index.sqlite3 ├── test_util.py ├── conftest.py ├── fixtures.py └── test_update.py ├── archivebox ├── core │ ├── actors.py │ ├── migrations │ │ ├── __init__.py │ │ ├── 0056_remove_tag_uuid.py │ │ ├── 0057_rename_id_tag_old_id.py │ │ ├── 0065_remove_snapshottag_old_tag.py │ │ ├── 0038_rename_uuid_snapshot_id.py │ │ ├── 0042_remove_archiveresult_snapshot_old.py │ │ ├── 0053_remove_snapshottag_snapshot_old.py │ │ ├── 0033_rename_id_archiveresult_old_id.py │ │ ├── 0019_auto_20210401_0654.py │ │ ├── 0010_auto_20210216_1055.py │ │ ├── 0030_alter_archiveresult_uuid.py │ │ ├── 0037_rename_id_snapshot_old_id.py │ │ ├── 0002_auto_20200625_1521.py │ │ ├── 0016_auto_20210218_1204.py │ │ ├── 0039_rename_snapshot_archiveresult_snapshot_old.py │ │ ├── 0060_alter_tag_id.py │ │ ├── 0055_alter_tag_slug.py │ │ ├── 0068_alter_archiveresult_options.py │ │ ├── 0009_auto_20210216_1038.py │ │ ├── 0028_alter_archiveresult_uuid.py │ │ ├── 0013_auto_20210218_0729.py │ │ ├── 0014_auto_20210218_0729.py │ │ ├── 0015_auto_20210218_0730.py │ │ ├── 0008_auto_20210105_1421.py │ │ ├── 0017_auto_20210219_0211.py │ │ ├── 0029_alter_archiveresult_id.py │ │ ├── 0035_remove_archiveresult_uuid_archiveresult_id.py │ │ ├── 0025_alter_archiveresult_uuid.py │ │ ├── 0054_alter_snapshot_timestamp.py │ │ ├── 0004_auto_20200713_1552.py │ │ ├── 0074_alter_snapshot_downloaded_at.py │ │ ├── 0045_alter_snapshot_old_id.py │ │ ├── 0062_alter_snapshottag_old_tag.py │ │ ├── 0067_alter_snapshottag_tag.py │ │ ├── 0058_alter_tag_old_id.py │ │ ├── 0061_rename_tag_snapshottag_old_tag_and_more.py │ │ ├── 0050_alter_snapshottag_snapshot_old.py │ │ ├── 0049_rename_snapshot_snapshottag_snapshot_old_and_more.py │ │ ├── 
0018_auto_20210327_0952.py │ │ ├── 0043_alter_archiveresult_snapshot_alter_snapshot_id_and_more.py │ │ ├── 0032_alter_archiveresult_id.py │ │ ├── 0072_rename_added_snapshot_bookmarked_at_and_more.py │ │ ├── 0012_auto_20210216_1425.py │ │ ├── 0020_auto_20210410_1031.py │ │ ├── 0021_auto_20220914_0934.py │ │ ├── 0022_auto_20231023_2008.py │ │ ├── 0041_alter_archiveresult_snapshot_and_more.py │ │ ├── 0047_alter_snapshottag_unique_together_and_more.py │ │ ├── 0048_alter_archiveresult_snapshot_and_more.py │ │ ├── 0064_alter_snapshottag_unique_together_and_more.py │ │ ├── 0005_auto_20200728_0326.py │ │ ├── 0052_alter_snapshottag_unique_together_and_more.py │ │ ├── 0036_alter_archiveresult_id_alter_archiveresult_old_id.py │ │ ├── 0011_auto_20210216_1331.py │ │ ├── 0001_initial.py │ │ ├── 0046_alter_archiveresult_snapshot_alter_snapshot_id_and_more.py │ │ ├── 0031_alter_archiveresult_id_alter_archiveresult_uuid_and_more.py │ │ ├── 0066_alter_snapshottag_tag_alter_tag_id_alter_tag_old_id.py │ │ ├── 0003_auto_20200630_1034.py │ │ ├── 0073_rename_created_archiveresult_created_at_and_more.py │ │ └── 0069_alter_archiveresult_created_alter_snapshot_added_and_more.py │ ├── templatetags │ │ ├── __init__.py │ │ └── core_tags.py │ ├── tests.py │ ├── apps.py │ ├── wsgi.py │ ├── admin.py │ ├── management │ │ └── commands │ │ │ └── archivebox.py │ ├── __init__.py │ └── asgi.py ├── tags │ ├── __init__.py │ ├── migrations │ │ └── __init__.py │ └── apps.py ├── README.md ├── personas │ ├── __init__.py │ ├── migrations │ │ └── __init__.py │ ├── admin.py │ ├── tests.py │ ├── views.py │ └── apps.py ├── pkgs │ ├── abx │ │ ├── README.md │ │ └── pyproject.toml │ ├── abx-plugin-chrome │ │ ├── README.md │ │ ├── pyproject.toml │ │ └── abx_plugin_chrome │ │ │ ├── extractors.py │ │ │ └── __init__.py │ ├── abx-plugin-curl │ │ ├── README.md │ │ ├── abx_plugin_curl │ │ │ ├── __init__.py │ │ │ ├── binaries.py │ │ │ └── config.py │ │ └── pyproject.toml │ ├── abx-plugin-git │ │ ├── README.md │ │ ├── 
abx_plugin_git │ │ │ ├── extractors.py │ │ │ ├── binaries.py │ │ │ ├── __init__.py │ │ │ └── config.py │ │ └── pyproject.toml │ ├── abx-plugin-npm │ │ ├── README.md │ │ ├── abx_plugin_npm │ │ │ ├── config.py │ │ │ ├── __init__.py │ │ │ └── binproviders.py │ │ └── pyproject.toml │ ├── abx-plugin-pip │ │ ├── README.md │ │ ├── abx_plugin_pip │ │ │ ├── .plugin_order │ │ │ ├── config.py │ │ │ └── __init__.py │ │ └── pyproject.toml │ ├── abx-plugin-pocket │ │ ├── README.md │ │ ├── abx_plugin_pocket │ │ │ ├── __init__.py │ │ │ └── config.py │ │ └── pyproject.toml │ ├── abx-plugin-title │ │ ├── README.md │ │ ├── abx_plugin_title │ │ │ ├── extractors.py │ │ │ └── __init__.py │ │ └── pyproject.toml │ ├── abx-plugin-wget │ │ ├── README.md │ │ ├── pyproject.toml │ │ └── abx_plugin_wget │ │ │ ├── binaries.py │ │ │ ├── __init__.py │ │ │ └── extractors.py │ ├── abx-plugin-ytdlp │ │ ├── README.md │ │ ├── pyproject.toml │ │ └── abx_plugin_ytdlp │ │ │ └── __init__.py │ ├── abx-spec-abx-pkg │ │ ├── README.md │ │ └── pyproject.toml │ ├── abx-spec-config │ │ ├── README.md │ │ └── pyproject.toml │ ├── abx-spec-django │ │ ├── README.md │ │ └── pyproject.toml │ ├── abx-plugin-favicon │ │ ├── README.md │ │ ├── abx_plugin_favicon │ │ │ ├── config.py │ │ │ ├── models.py │ │ │ ├── extractors.py │ │ │ ├── __init__.py │ │ │ └── actors.py │ │ └── pyproject.toml │ ├── abx-plugin-htmltotext │ │ ├── README.md │ │ ├── abx_plugin_htmltotext │ │ │ ├── config.py │ │ │ └── __init__.py │ │ └── pyproject.toml │ ├── abx-plugin-ldap-auth │ │ ├── README.md │ │ └── pyproject.toml │ ├── abx-plugin-mercury │ │ ├── README.md │ │ ├── pyproject.toml │ │ └── abx_plugin_mercury │ │ │ ├── extractors.py │ │ │ ├── __init__.py │ │ │ ├── config.py │ │ │ └── binaries.py │ ├── abx-plugin-playwright │ │ ├── README.md │ │ ├── abx_plugin_playwright │ │ │ ├── config.py │ │ │ ├── __init__.py │ │ │ └── binaries.py │ │ └── pyproject.toml │ ├── abx-plugin-puppeteer │ │ ├── README.md │ │ ├── pyproject.toml │ │ └── 
abx_plugin_puppeteer │ │ │ ├── config.py │ │ │ ├── binaries.py │ │ │ └── __init__.py │ ├── abx-plugin-readability │ │ ├── README.md │ │ ├── pyproject.toml │ │ └── abx_plugin_readability │ │ │ ├── extractors.py │ │ │ ├── config.py │ │ │ ├── __init__.py │ │ │ └── binaries.py │ ├── abx-plugin-readwise │ │ ├── README.md │ │ ├── pyproject.toml │ │ └── abx_plugin_readwise.py │ ├── abx-plugin-singlefile │ │ ├── README.md │ │ ├── abx_plugin_singlefile │ │ │ ├── models.py │ │ │ ├── extractors.py │ │ │ ├── __init__.py │ │ │ ├── config.py │ │ │ └── actors.py │ │ └── pyproject.toml │ ├── abx-spec-archivebox │ │ ├── README.md │ │ ├── pyproject.toml │ │ └── abx_spec_archivebox │ │ │ ├── effects.py │ │ │ ├── __init__.py │ │ │ └── events.py │ ├── abx-spec-extractor │ │ ├── README.md │ │ └── pyproject.toml │ ├── abx-spec-searchbackend │ │ ├── README.md │ │ ├── pyproject.toml │ │ └── abx_spec_searchbackend.py │ ├── abx-plugin-archivedotorg │ │ ├── README.md │ │ ├── abx_plugin_archivedotorg │ │ │ ├── config.py │ │ │ └── __init__.py │ │ └── pyproject.toml │ ├── abx-plugin-ripgrep-search │ │ ├── README.md │ │ ├── pyproject.toml │ │ └── abx_plugin_ripgrep_search │ │ │ ├── binaries.py │ │ │ ├── __init__.py │ │ │ └── config.py │ ├── abx-plugin-sonic-search │ │ ├── README.md │ │ ├── pyproject.toml │ │ └── abx_plugin_sonic_search │ │ │ ├── __init__.py │ │ │ └── binaries.py │ ├── abx-plugin-sqlitefts-search │ │ ├── README.md │ │ ├── pyproject.toml │ │ └── abx_plugin_sqlitefts_search │ │ │ └── __init__.py │ └── abx-plugin-default-binproviders │ │ ├── README.md │ │ ├── pyproject.toml │ │ └── abx_plugin_default_binproviders.py ├── static ├── api │ ├── migrations │ │ ├── __init__.py │ │ ├── 0002_alter_apitoken_options.py │ │ ├── 0005_remove_apitoken_uuid_remove_outboundwebhook_uuid_and_more.py │ │ ├── 0004_alter_apitoken_id_alter_apitoken_uuid.py │ │ ├── 0007_alter_apitoken_created_by.py │ │ ├── 0006_remove_outboundwebhook_uuid_apitoken_id_and_more.py │ │ ├── 0001_initial.py │ │ └── 
0009_rename_created_apitoken_created_at_and_more.py │ ├── __init__.py │ ├── apps.py │ ├── urls.py │ ├── tests.py │ └── admin.py ├── crawls │ ├── migrations │ │ └── __init__.py │ ├── tests.py │ ├── views.py │ ├── apps.py │ └── __init__.py ├── machine │ ├── migrations │ │ └── __init__.py │ ├── __init__.py │ └── apps.py ├── workers │ ├── migrations │ │ └── __init__.py │ ├── apps.py │ ├── __init__.py │ ├── management │ │ └── commands │ │ │ └── orchestrator.py │ ├── views.py │ ├── admin.py │ └── tests.py ├── base_models │ ├── migrations │ │ └── __init__.py │ ├── __init__.py │ └── apps.py ├── templates │ ├── admin │ │ ├── actions_as_select.html │ │ └── app_index.html │ ├── static │ │ ├── robots.txt │ │ ├── archive.png │ │ ├── external.png │ │ ├── favicon.ico │ │ ├── sort_asc.png │ │ ├── sort_both.png │ │ ├── sort_desc.png │ │ └── spinner.gif │ └── core │ │ ├── minimal_index.html │ │ └── navigation.html ├── misc │ ├── __init__.py │ ├── debugging.py │ └── paginators.py ├── mypy.ini ├── .flake8 ├── __main__.py ├── cli │ ├── archivebox_shell.py │ ├── archivebox_worker.py │ ├── archivebox_manage.py │ └── archivebox_extract.py ├── parsers │ ├── generic_jsonl.py │ ├── url_list.py │ ├── generic_rss.py │ ├── medium_rss.py │ ├── netscape_html.py │ └── pinboard_rss.py ├── index │ └── csv.py ├── search │ └── admin.py └── config │ └── __init__.py ├── website ├── CNAME ├── README.md ├── icon.png ├── _config.yml └── assets │ ├── README.md │ └── css │ └── style.scss ├── .gitmodules ├── .github ├── FUNDING.yml ├── PULL_REQUEST_TEMPLATE.md ├── .readthedocs.yaml ├── dependabot.yml ├── ISSUE_TEMPLATE │ └── config.yml ├── workflows │ └── lint.yml └── CONTRIBUTING.md ├── etc ├── uwsgi.ini ├── package.json ├── README.md ├── crontabs │ └── archivebox ├── fly.toml ├── archivebox.service ├── sonic.cfg └── nginx.conf ├── bin ├── test.sh ├── release_pip.sh ├── lint.sh ├── build_pip.sh ├── build.sh ├── release_docs.sh ├── release_git.sh ├── build_docs.sh ├── release.sh └── build_git.sh ├── 
.dockerignore ├── .gitignore └── LICENSE /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /archivebox/core/actors.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /archivebox/tags/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /website/CNAME: -------------------------------------------------------------------------------- 1 | archivebox.io -------------------------------------------------------------------------------- /archivebox/README.md: -------------------------------------------------------------------------------- 1 | ../README.md -------------------------------------------------------------------------------- /archivebox/personas/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /archivebox/static: -------------------------------------------------------------------------------- 1 | templates/static -------------------------------------------------------------------------------- /tests/mock_server/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /website/README.md: -------------------------------------------------------------------------------- 1 | ../README.md 
-------------------------------------------------------------------------------- /archivebox/api/migrations/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /archivebox/core/migrations/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /archivebox/tags/migrations/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /archivebox/core/templatetags/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /archivebox/crawls/migrations/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /archivebox/machine/migrations/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /archivebox/personas/migrations/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-chrome/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-curl/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/archivebox/pkgs/abx-plugin-git/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-npm/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-pip/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-pocket/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-title/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-wget/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-ytdlp/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-spec-abx-pkg/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-spec-config/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-spec-django/README.md: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /archivebox/workers/migrations/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /archivebox/base_models/migrations/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-favicon/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-htmltotext/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-ldap-auth/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-mercury/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-playwright/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-puppeteer/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-readability/README.md: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-readwise/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-singlefile/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-spec-archivebox/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-spec-extractor/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-spec-searchbackend/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-archivedotorg/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-ripgrep-search/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-sonic-search/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-sqlitefts-search/README.md: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /archivebox/templates/admin/actions_as_select.html: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-default-binproviders/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /archivebox/api/__init__.py: -------------------------------------------------------------------------------- 1 | __package__ = 'archivebox.api' 2 | -------------------------------------------------------------------------------- /archivebox/misc/__init__.py: -------------------------------------------------------------------------------- 1 | __package__ = 'archivebox.misc' 2 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-pip/abx_plugin_pip/.plugin_order: -------------------------------------------------------------------------------- 1 | 400 2 | -------------------------------------------------------------------------------- /archivebox/machine/__init__.py: -------------------------------------------------------------------------------- 1 | __package__ = 'archivebox.machine' 2 | -------------------------------------------------------------------------------- /archivebox/templates/static/robots.txt: -------------------------------------------------------------------------------- 1 | User-agent: * 2 | Disallow: / 3 | -------------------------------------------------------------------------------- /archivebox/base_models/__init__.py: -------------------------------------------------------------------------------- 1 | __package__ = 'archivebox.base_models' 2 | 
-------------------------------------------------------------------------------- /archivebox/mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | plugins = 3 | mypy_django_plugin.main 4 | -------------------------------------------------------------------------------- /website/icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/localnerve/ArchiveBox/dev/website/icon.png -------------------------------------------------------------------------------- /archivebox/core/tests.py: -------------------------------------------------------------------------------- 1 | #from django.test import TestCase 2 | 3 | # Create your tests here. 4 | -------------------------------------------------------------------------------- /archivebox/crawls/tests.py: -------------------------------------------------------------------------------- 1 | from django.test import TestCase 2 | 3 | # Create your tests here. 4 | -------------------------------------------------------------------------------- /archivebox/crawls/views.py: -------------------------------------------------------------------------------- 1 | from django.shortcuts import render 2 | 3 | # Create your views here. 4 | -------------------------------------------------------------------------------- /archivebox/personas/admin.py: -------------------------------------------------------------------------------- 1 | from django.contrib import admin 2 | 3 | # Register your models here. 4 | -------------------------------------------------------------------------------- /archivebox/personas/tests.py: -------------------------------------------------------------------------------- 1 | from django.test import TestCase 2 | 3 | # Create your tests here. 
4 | -------------------------------------------------------------------------------- /archivebox/personas/views.py: -------------------------------------------------------------------------------- 1 | from django.shortcuts import render 2 | 3 | # Create your views here. 4 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "docs"] 2 | path = docs 3 | url = https://github.com/ArchiveBox/ArchiveBox.wiki.git 4 | -------------------------------------------------------------------------------- /tests/tags_migration/index.sqlite3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/localnerve/ArchiveBox/dev/tests/tags_migration/index.sqlite3 -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: ["ArchiveBox", "pirate"] 2 | custom: ["https://donate.archivebox.io", "https://swag.archivebox.io"] 3 | -------------------------------------------------------------------------------- /archivebox/templates/static/archive.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/localnerve/ArchiveBox/dev/archivebox/templates/static/archive.png -------------------------------------------------------------------------------- /archivebox/templates/static/external.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/localnerve/ArchiveBox/dev/archivebox/templates/static/external.png -------------------------------------------------------------------------------- /archivebox/templates/static/favicon.ico: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/localnerve/ArchiveBox/dev/archivebox/templates/static/favicon.ico -------------------------------------------------------------------------------- /archivebox/templates/static/sort_asc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/localnerve/ArchiveBox/dev/archivebox/templates/static/sort_asc.png -------------------------------------------------------------------------------- /archivebox/templates/static/sort_both.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/localnerve/ArchiveBox/dev/archivebox/templates/static/sort_both.png -------------------------------------------------------------------------------- /archivebox/templates/static/sort_desc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/localnerve/ArchiveBox/dev/archivebox/templates/static/sort_desc.png -------------------------------------------------------------------------------- /archivebox/templates/static/spinner.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/localnerve/ArchiveBox/dev/archivebox/templates/static/spinner.gif -------------------------------------------------------------------------------- /tests/mock_server/templates/shift_jis.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/localnerve/ArchiveBox/dev/tests/mock_server/templates/shift_jis.html -------------------------------------------------------------------------------- /website/_config.yml: -------------------------------------------------------------------------------- 1 | production_url: https://archivebox.io 2 | theme: jekyll-theme-merlot 3 | # Github Pages static site settings for https://archivebox.io 4 | 
-------------------------------------------------------------------------------- /tests/mock_server/templates/malformed.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | malformed document 7 | 8 | 9 | -------------------------------------------------------------------------------- /archivebox/crawls/apps.py: -------------------------------------------------------------------------------- 1 | from django.apps import AppConfig 2 | 3 | 4 | class CrawlsConfig(AppConfig): 5 | default_auto_field = "django.db.models.BigAutoField" 6 | name = "crawls" 7 | -------------------------------------------------------------------------------- /archivebox/tags/apps.py: -------------------------------------------------------------------------------- 1 | from django.apps import AppConfig 2 | 3 | 4 | class TagsConfig(AppConfig): 5 | default_auto_field = 'django.db.models.BigAutoField' 6 | 7 | name = 'tags' 8 | -------------------------------------------------------------------------------- /archivebox/personas/apps.py: -------------------------------------------------------------------------------- 1 | from django.apps import AppConfig 2 | 3 | 4 | class SessionsConfig(AppConfig): 5 | default_auto_field = "django.db.models.BigAutoField" 6 | name = "personas" 7 | -------------------------------------------------------------------------------- /archivebox/workers/apps.py: -------------------------------------------------------------------------------- 1 | from django.apps import AppConfig 2 | 3 | 4 | class WorkersConfig(AppConfig): 5 | default_auto_field = 'django.db.models.BigAutoField' 6 | name = 'workers' 7 | 8 | -------------------------------------------------------------------------------- /archivebox/base_models/apps.py: -------------------------------------------------------------------------------- 1 | # from django.apps import AppConfig 2 | 3 | 4 | # class AbidUtilsConfig(AppConfig): 5 | # default_auto_field = 
'django.db.models.BigAutoField' 6 | 7 | # name = 'base_models' 8 | -------------------------------------------------------------------------------- /website/assets/README.md: -------------------------------------------------------------------------------- 1 | # assets/ 2 | 3 | This folder contains assets used by the Jekyll Static Site Generator for ArchiveBox.io. 4 | 5 | It cannot be moved or renamed or the custom CSS on ArchiveBox.io will break. 6 | -------------------------------------------------------------------------------- /archivebox/workers/__init__.py: -------------------------------------------------------------------------------- 1 | __package__ = 'archivebox.workers' 2 | __order__ = 100 3 | 4 | import abx 5 | 6 | @abx.hookimpl 7 | def register_admin(admin_site): 8 | from workers.admin import register_admin 9 | register_admin(admin_site) 10 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-playwright/abx_plugin_playwright/config.py: -------------------------------------------------------------------------------- 1 | from abx_spec_config import BaseConfigSet 2 | 3 | class PlaywrightConfigs(BaseConfigSet): 4 | PLAYWRIGHT_BINARY: str = 'playwright' 5 | 6 | 7 | PLAYWRIGHT_CONFIG = PlaywrightConfigs() 8 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-htmltotext/abx_plugin_htmltotext/config.py: -------------------------------------------------------------------------------- 1 | from abx_spec_config.base_configset import BaseConfigSet 2 | 3 | 4 | class HtmltotextConfig(BaseConfigSet): 5 | SAVE_HTMLTOTEXT: bool = True 6 | 7 | 8 | HTMLTOTEXT_CONFIG = HtmltotextConfig() 9 | -------------------------------------------------------------------------------- /archivebox/crawls/__init__.py: -------------------------------------------------------------------------------- 1 | __package__ = 'archivebox.crawls' 2 | __order__ = 100 3 | 4 | import abx 5 
| 6 | 7 | @abx.hookimpl 8 | def register_admin(admin_site): 9 | from .admin import register_admin as register_crawls_admin 10 | register_crawls_admin(admin_site) 11 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-archivedotorg/abx_plugin_archivedotorg/config.py: -------------------------------------------------------------------------------- 1 | from abx_spec_config.base_configset import BaseConfigSet 2 | 3 | 4 | class ArchivedotorgConfig(BaseConfigSet): 5 | SAVE_ARCHIVE_DOT_ORG: bool = True 6 | 7 | 8 | ARCHIVEDOTORG_CONFIG = ArchivedotorgConfig() 9 | -------------------------------------------------------------------------------- /archivebox/.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = D100,D101,D102,D103,D104,D105,D202,D203,D205,D400,E131,E241,E252,E266,E272,E701,E731,W293,W503,W291,W391 3 | select = F,E9,W 4 | max-line-length = 130 5 | max-complexity = 10 6 | exclude = migrations,tests,node_modules,vendor,venv,.venv,.venv2,.docker-venv,data,data* 7 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-title/abx_plugin_title/extractors.py: -------------------------------------------------------------------------------- 1 | __package__ = 'abx_plugin_title' 2 | 3 | from abx_spec_extractor import BaseExtractor, ExtractorName 4 | 5 | 6 | 7 | class TitleExtractor(BaseExtractor): 8 | name: ExtractorName = 'title' 9 | 10 | TITLE_EXTRACTOR = TitleExtractor() 11 | -------------------------------------------------------------------------------- /archivebox/api/apps.py: -------------------------------------------------------------------------------- 1 | __package__ = 'archivebox.api' 2 | 3 | from django.apps import AppConfig 4 | 5 | import abx 6 | 7 | 8 | class APIConfig(AppConfig): 9 | name = 'api' 10 | 11 | 12 | @abx.hookimpl 13 | def register_admin(admin_site): 14 | from api.admin 
import register_admin 15 | register_admin(admin_site) 16 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-favicon/abx_plugin_favicon/config.py: -------------------------------------------------------------------------------- 1 | from abx_spec_config.base_configset import BaseConfigSet 2 | 3 | 4 | class FaviconConfig(BaseConfigSet): 5 | SAVE_FAVICON: bool = True 6 | 7 | FAVICON_PROVIDER: str = 'https://www.google.com/s2/favicons?domain={}' 8 | 9 | 10 | FAVICON_CONFIG = FaviconConfig() 11 | -------------------------------------------------------------------------------- /tests/mock_server/templates/example-single.jsonl: -------------------------------------------------------------------------------- 1 | {"href":"http://127.0.0.1:8080/static/example.com.html","description":"Example","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"Tag1 Tag2","trap":"http://www.example.com/should-not-exist"} 2 | -------------------------------------------------------------------------------- /etc/uwsgi.ini: -------------------------------------------------------------------------------- 1 | [uwsgi] 2 | socket = 127.0.0.1:3031 3 | chdir = ../ 4 | http = 0.0.0.0:8001 5 | env = DATA_DIR=./data 6 | wsgi-file = archivebox/core/wsgi.py 7 | processes = 4 8 | threads = 1 9 | stats = 127.0.0.1:9191 10 | static-map /static=./archivebox/templates/static 11 | harakiri = 172800 12 | post-buffering = 1 13 | disable-logging = True 14 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-curl/abx_plugin_curl/__init__.py: -------------------------------------------------------------------------------- 1 | import abx 2 | 3 | 4 | @abx.hookimpl 5 | def get_CONFIG(): 6 | from .config import CURL_CONFIG 7 | 8 | return { 9 | 'curl': CURL_CONFIG 10 | } 11 | 12 | @abx.hookimpl 13 
| def get_BINARIES(): 14 | from .binaries import CURL_BINARY 15 | 16 | return { 17 | 'curl': CURL_BINARY, 18 | } 19 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "abx" 3 | version = "0.1.0" 4 | description = "The common shared interfaces for the ABX ArchiveBox plugin ecosystem." 5 | readme = "README.md" 6 | requires-python = ">=3.10" 7 | dependencies = [ 8 | "pluggy>=1.5.0", 9 | "django>=5.1.1,<6.0", 10 | ] 11 | 12 | [build-system] 13 | requires = ["hatchling"] 14 | build-backend = "hatchling.build" 15 | -------------------------------------------------------------------------------- /etc/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "archivebox", 3 | "version": "0.0.1", 4 | "repository": "github:ArchiveBox/ArchiveBox", 5 | "license": "MIT", 6 | "dependencies": { 7 | "@postlight/parser": "^2.2.3", 8 | "readability-extractor": "github:ArchiveBox/readability-extractor", 9 | "single-file-cli": "^1.1.54", 10 | "puppeteer": "^23.5.0", 11 | "@puppeteer/browsers": "^2.4.0" 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /tests/mock_server/templates/example.json.bad: -------------------------------------------------------------------------------- 1 | this line would cause problems but --parser=json will actually skip it 2 | [{"href":"http://127.0.0.1:8080/static/example.com.html","description":"Example","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"Tag1 Tag2","trap":"http://www.example.com/should-not-exist"}] 3 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0056_remove_tag_uuid.py: 
-------------------------------------------------------------------------------- 1 | # Generated by Django 5.0.6 on 2024-08-20 03:25 2 | 3 | from django.db import migrations 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('core', '0055_alter_tag_slug'), 10 | ] 11 | 12 | operations = [ 13 | migrations.RemoveField( 14 | model_name='tag', 15 | name='uuid', 16 | ), 17 | ] 18 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-pocket/abx_plugin_pocket/__init__.py: -------------------------------------------------------------------------------- 1 | __package__ = 'abx_plugin_pocket' 2 | __label__ = 'Pocket' 3 | 4 | import abx 5 | 6 | 7 | @abx.hookimpl 8 | def get_CONFIG(): 9 | from .config import POCKET_CONFIG 10 | 11 | return { 12 | 'POCKET_CONFIG': POCKET_CONFIG 13 | } 14 | 15 | @abx.hookimpl 16 | def ready(): 17 | from .config import POCKET_CONFIG 18 | POCKET_CONFIG.validate() 19 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-title/abx_plugin_title/__init__.py: -------------------------------------------------------------------------------- 1 | import abx 2 | 3 | # @abx.hookimpl 4 | # def get_CONFIG(): 5 | # from .config import TITLE_EXTRACTOR_CONFIG 6 | 7 | # return { 8 | # 'title_extractor': TITLE_EXTRACTOR_CONFIG 9 | # } 10 | 11 | 12 | @abx.hookimpl 13 | def get_EXTRACTORS(): 14 | from .extractors import TITLE_EXTRACTOR 15 | return { 16 | 'title': TITLE_EXTRACTOR, 17 | } 18 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-favicon/abx_plugin_favicon/models.py: -------------------------------------------------------------------------------- 1 | # from django.db import models 2 | 3 | # from core.models import ArchiveResult 4 | 5 | # class FaviconResultManager(models.Manager): 6 | # def get_queryset(self): 7 | # return 
super().get_queryset().filter(extractor='favicon') 8 | 9 | 10 | # class FaviconResult(ArchiveResult): 11 | # objects = FaviconResultManager() 12 | 13 | # class Meta: 14 | # proxy = True 15 | -------------------------------------------------------------------------------- /archivebox/machine/apps.py: -------------------------------------------------------------------------------- 1 | __package__ = 'archivebox.machine' 2 | 3 | from django.apps import AppConfig 4 | 5 | import abx 6 | 7 | 8 | class MachineConfig(AppConfig): 9 | default_auto_field = 'django.db.models.BigAutoField' 10 | 11 | name = 'machine' 12 | verbose_name = 'Machine Info' 13 | 14 | 15 | @abx.hookimpl 16 | def register_admin(admin_site): 17 | from machine.admin import register_admin 18 | register_admin(admin_site) 19 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-pocket/abx_plugin_pocket/config.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | from pydantic import Field 3 | 4 | from abx_spec_config import BaseConfigSet 5 | 6 | 7 | class PocketConfig(BaseConfigSet): 8 | POCKET_CONSUMER_KEY: str | None = Field(default=None) 9 | POCKET_ACCESS_TOKENS: Dict[str, str] = Field(default=dict) # {: , ...} 10 | 11 | 12 | POCKET_CONFIG = PocketConfig() 13 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-spec-django/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "abx-spec-django" 3 | version = "0.1.0" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.10" 7 | dependencies = [ 8 | "abx>=0.1.0", 9 | "django>=5.1.1,<6.0", 10 | ] 11 | 12 | [build-system] 13 | requires = ["hatchling"] 14 | build-backend = "hatchling.build" 15 | 16 | [project.entry-points.abx] 17 | abx_spec_django = "abx_spec_django" 18 | 
-------------------------------------------------------------------------------- /tests/test_util.py: -------------------------------------------------------------------------------- 1 | from archivebox import util 2 | 3 | def test_download_url_downloads_content(): 4 | text = util.download_url("http://127.0.0.1:8080/static/example.com.html") 5 | assert "Example Domain" in text 6 | 7 | def test_download_url_gets_encoding_from_body(): 8 | text = util.download_url("http://127.0.0.1:8080/static_no_content_type/shift_jis.html") 9 | assert "鹿児島のニュース|MBC南日本放送" in text 10 | assert "掲載された全ての記事・画像等の無断転載、二次利用をお断りいたします" in text -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-singlefile/abx_plugin_singlefile/models.py: -------------------------------------------------------------------------------- 1 | # from django.db import models 2 | 3 | # from core.models import ArchiveResult 4 | 5 | # class SinglefileResultManager(models.Manager): 6 | # def get_queryset(self): 7 | # return super().get_queryset().filter(extractor='singlefile') 8 | 9 | 10 | # class SinglefileResult(ArchiveResult): 11 | # objects = SinglefileResultManager() 12 | 13 | # class Meta: 14 | # proxy = True 15 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0057_rename_id_tag_old_id.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.0.6 on 2024-08-20 03:29 2 | 3 | from django.db import migrations 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('core', '0056_remove_tag_uuid'), 10 | ] 11 | 12 | operations = [ 13 | migrations.RenameField( 14 | model_name='tag', 15 | old_name='id', 16 | new_name='old_id', 17 | ), 18 | ] 19 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-mercury/pyproject.toml: 
-------------------------------------------------------------------------------- 1 | [project] 2 | name = "abx-plugin-mercury" 3 | version = "2024.10.28" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.10" 7 | dependencies = [ 8 | "abx>=0.1.0", 9 | "abx-spec-config>=0.1.0", 10 | ] 11 | 12 | [build-system] 13 | requires = ["hatchling"] 14 | build-backend = "hatchling.build" 15 | 16 | [project.entry-points.abx] 17 | abx_plugin_mercury = "abx_plugin_mercury" 18 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0065_remove_snapshottag_old_tag.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.0.6 on 2024-08-20 03:51 2 | 3 | from django.db import migrations 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('core', '0064_alter_snapshottag_unique_together_and_more'), 10 | ] 11 | 12 | operations = [ 13 | migrations.RemoveField( 14 | model_name='snapshottag', 15 | name='old_tag', 16 | ), 17 | ] 18 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-readwise/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "abx-plugin-readwise" 3 | version = "2024.10.28" 4 | description = "Readwise API Extractor" 5 | readme = "README.md" 6 | requires-python = ">=3.10" 7 | dependencies = [ 8 | "abx>=0.1.0", 9 | "abx-spec-config>=0.1.0", 10 | ] 11 | 12 | [build-system] 13 | requires = ["hatchling"] 14 | build-backend = "hatchling.build" 15 | 16 | [project.entry-points.abx] 17 | abx_plugin_readwise = "abx_plugin_readwise" 18 | 19 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0038_rename_uuid_snapshot_id.py: -------------------------------------------------------------------------------- 1 | # 
Generated by Django 5.0.6 on 2024-08-18 06:09 2 | 3 | from django.db import migrations 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('core', '0037_rename_id_snapshot_old_id'), 10 | ] 11 | 12 | operations = [ 13 | migrations.RenameField( 14 | model_name='snapshot', 15 | old_name='uuid', 16 | new_name='id', 17 | ), 18 | ] 19 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0042_remove_archiveresult_snapshot_old.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.0.6 on 2024-08-18 06:51 2 | 3 | from django.db import migrations 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('core', '0041_alter_archiveresult_snapshot_and_more'), 10 | ] 11 | 12 | operations = [ 13 | migrations.RemoveField( 14 | model_name='archiveresult', 15 | name='snapshot_old', 16 | ), 17 | ] 18 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0053_remove_snapshottag_snapshot_old.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.0.6 on 2024-08-20 02:38 2 | 3 | from django.db import migrations 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('core', '0052_alter_snapshottag_unique_together_and_more'), 10 | ] 11 | 12 | operations = [ 13 | migrations.RemoveField( 14 | model_name='snapshottag', 15 | name='snapshot_old', 16 | ), 17 | ] 18 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-htmltotext/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "abx-plugin-htmltotext" 3 | version = "2024.10.28" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.10" 7 | dependencies = [ 8 | 
"abx>=0.1.0", 9 | "abx-spec-config>=0.1.0", 10 | ] 11 | 12 | [build-system] 13 | requires = ["hatchling"] 14 | build-backend = "hatchling.build" 15 | 16 | [project.entry-points.abx] 17 | abx_plugin_htmltotext = "abx_plugin_htmltotext" 18 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-spec-abx-pkg/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "abx-spec-abx-pkg" 3 | version = "0.1.1" 4 | description = "The ABX plugin specification for Binaries and BinProviders" 5 | readme = "README.md" 6 | requires-python = ">=3.10" 7 | dependencies = [ 8 | "abx>=0.1.0", 9 | "abx-pkg>=0.6.0", 10 | ] 11 | 12 | [build-system] 13 | requires = ["hatchling"] 14 | build-backend = "hatchling.build" 15 | 16 | [project.entry-points.abx] 17 | abx_spec_abx_pkg = "abx_spec_abx_pkg" 18 | -------------------------------------------------------------------------------- /archivebox/api/migrations/0002_alter_apitoken_options.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.0.4 on 2024-04-26 05:28 2 | 3 | from django.db import migrations 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('api', '0001_initial'), 10 | ] 11 | 12 | operations = [ 13 | migrations.AlterModelOptions( 14 | name='apitoken', 15 | options={'verbose_name': 'API Key', 'verbose_name_plural': 'API Keys'}, 16 | ), 17 | ] 18 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-git/abx_plugin_git/extractors.py: -------------------------------------------------------------------------------- 1 | __package__ = 'abx_plugin_git' 2 | 3 | 4 | from abx_pkg import BinName 5 | 6 | from abx_spec_extractor import BaseExtractor, ExtractorName 7 | 8 | from .binaries import GIT_BINARY 9 | 10 | 11 | class GitExtractor(BaseExtractor): 12 | name: ExtractorName = 'git' 13 | 
binary: BinName = GIT_BINARY.name 14 | 15 | def get_output_path(self, snapshot) -> str: 16 | return 'git' 17 | 18 | GIT_EXTRACTOR = GitExtractor() 19 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-readability/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "abx-plugin-readability" 3 | version = "2024.10.28" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.10" 7 | dependencies = [ 8 | "abx>=0.1.0", 9 | "abx-spec-config>=0.1.0", 10 | ] 11 | 12 | [build-system] 13 | requires = ["hatchling"] 14 | build-backend = "hatchling.build" 15 | 16 | [project.entry-points.abx] 17 | abx_plugin_readability = "abx_plugin_readability" 18 | -------------------------------------------------------------------------------- /archivebox/templates/admin/app_index.html: -------------------------------------------------------------------------------- 1 | {% extends "admin/index.html" %} 2 | {% load i18n %} 3 | 4 | {% block bodyclass %}{{ block.super }} app-{{ app_label }}{% endblock %} 5 | 6 | {% if not is_popup %} 7 | {% block breadcrumbs %} 8 | 15 | {% endblock %} 16 | {% endif %} 17 | 18 | {% block sidebar %}{% endblock %} 19 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0033_rename_id_archiveresult_old_id.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.0.6 on 2024-08-18 05:34 2 | 3 | from django.db import migrations 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('core', '0032_alter_archiveresult_id'), 10 | ] 11 | 12 | operations = [ 13 | migrations.RenameField( 14 | model_name='archiveresult', 15 | old_name='id', 16 | new_name='old_id', 17 | ), 18 | ] 19 | -------------------------------------------------------------------------------- 
/archivebox/pkgs/abx-plugin-curl/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "abx-plugin-curl" 3 | version = "2024.10.24" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.10" 7 | dependencies = [ 8 | "abx>=0.1.0", 9 | "abx-spec-config>=0.1.0", 10 | "abx-spec-abx-pkg>=0.1.0", 11 | ] 12 | 13 | [build-system] 14 | requires = ["hatchling"] 15 | build-backend = "hatchling.build" 16 | 17 | [project.entry-points.abx] 18 | abx_plugin_curl = "abx_plugin_curl" 19 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-pocket/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "abx-plugin-pocket" 3 | version = "2024.10.28" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.10" 7 | dependencies = [ 8 | "abx>=0.1.0", 9 | "abx-spec-config>=0.1.0", 10 | "pocket>=0.3.6", 11 | ] 12 | 13 | [build-system] 14 | requires = ["hatchling"] 15 | build-backend = "hatchling.build" 16 | 17 | [project.entry-points.abx] 18 | abx_plugin_pocket = "abx_plugin_pocket" 19 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-title/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "abx-plugin-title" 3 | version = "2024.10.27" 4 | description = "Title Extractor" 5 | readme = "README.md" 6 | requires-python = ">=3.10" 7 | dependencies = [ 8 | "abx>=0.1.0", 9 | "abx-spec-config>=0.1.0", 10 | "abx-plugin-curl>=2024.10.28", 11 | ] 12 | 13 | [build-system] 14 | requires = ["hatchling"] 15 | build-backend = "hatchling.build" 16 | 17 | [project.entry-points.abx] 18 | abx_plugin_title = "abx_plugin_title" 19 | -------------------------------------------------------------------------------- 
/archivebox/pkgs/abx-plugin-wget/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "abx-plugin-wget" 3 | version = "2024.10.28" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.10" 7 | dependencies = [ 8 | "abx>=0.1.0", 9 | "abx-spec-config>=0.1.0", 10 | "abx-spec-abx-pkg>=0.1.0", 11 | ] 12 | 13 | [build-system] 14 | requires = ["hatchling"] 15 | build-backend = "hatchling.build" 16 | 17 | [project.entry-points.abx] 18 | abx_plugin_wget = "abx_plugin_wget" 19 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-spec-extractor/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "abx-spec-extractor" 3 | version = "0.1.0" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.10" 7 | dependencies = [ 8 | "abx>=0.1.0", 9 | "python-benedict>=0.26.0", 10 | "pydantic>=2.5.0", 11 | ] 12 | 13 | [build-system] 14 | requires = ["hatchling"] 15 | build-backend = "hatchling.build" 16 | 17 | [project.entry-points.abx] 18 | abx_spec_extractor = "abx_spec_extractor" 19 | -------------------------------------------------------------------------------- /bin/test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ### Bash Environment Setup 4 | # http://redsymbol.net/articles/unofficial-bash-strict-mode/ 5 | # https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html 6 | # set -o xtrace 7 | set -o errexit 8 | set -o errtrace 9 | set -o nounset 10 | set -o pipefail 11 | IFS=$'\n' 12 | 13 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. 
&& pwd )" 14 | 15 | source "$DIR/.venv/bin/activate" 16 | 17 | pytest -s --basetemp=tests/out "$@" 18 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Process 2 | 3 | import pytest 4 | from .mock_server.server import start 5 | 6 | server_process = None 7 | 8 | @pytest.hookimpl 9 | def pytest_sessionstart(session): 10 | global server_process 11 | server_process = Process(target=start) 12 | server_process.start() 13 | 14 | @pytest.hookimpl 15 | def pytest_sessionfinish(session): 16 | if server_process is not None: 17 | server_process.terminate() 18 | server_process.join() 19 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-chrome/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "abx-plugin-chrome" 3 | version = "2024.10.28" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.10" 7 | dependencies = [ 8 | "abx>=0.1.0", 9 | "abx-spec-config>=0.1.0", 10 | "abx-spec-abx-pkg>=0.1.0", 11 | ] 12 | 13 | [build-system] 14 | requires = ["hatchling"] 15 | build-backend = "hatchling.build" 16 | 17 | [project.entry-points.abx] 18 | abx_plugin_chrome = "abx_plugin_chrome" 19 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0019_auto_20210401_0654.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 3.1.3 on 2021-04-01 06:54 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('core', '0018_auto_20210327_0952'), 10 | ] 11 | 12 | operations = [ 13 | migrations.AlterField( 14 | model_name='snapshot', 15 | name='url', 16 | 
field=models.URLField(db_index=True, unique=True), 17 | ), 18 | ] 19 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-favicon/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "abx-plugin-favicon" 3 | version = "2024.10.28" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.10" 7 | dependencies = [ 8 | "abx>=0.1.0", 9 | "abx-spec-config>=0.1.0", 10 | "abx-plugin-curl>=2024.10.28", 11 | ] 12 | 13 | [build-system] 14 | requires = ["hatchling"] 15 | build-backend = "hatchling.build" 16 | 17 | [project.entry-points.abx] 18 | abx_plugin_favicon = "abx_plugin_favicon" 19 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-spec-archivebox/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "abx-spec-archivebox" 3 | version = "0.1.0" 4 | description = "The common shared interfaces for the ABX ArchiveBox plugin ecosystem." 
5 | readme = "README.md" 6 | requires-python = ">=3.10" 7 | dependencies = [ 8 | "abx>=0.1.0", 9 | "django>=5.1.1,<6.0", 10 | ] 11 | 12 | [build-system] 13 | requires = ["hatchling"] 14 | build-backend = "hatchling.build" 15 | 16 | [project.entry-points.abx] 17 | abx_spec_archivebox = "abx_spec_archivebox" 18 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-spec-searchbackend/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "abx-spec-searchbackend" 3 | version = "0.1.0" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.10" 7 | dependencies = [ 8 | "abx>=0.1.0", 9 | "python-benedict>=0.26.0", 10 | "pydantic>=2.5.0", 11 | ] 12 | 13 | [build-system] 14 | requires = ["hatchling"] 15 | build-backend = "hatchling.build" 16 | 17 | [project.entry-points.abx] 18 | abx_spec_searchbackend = "abx_spec_searchbackend" 19 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0010_auto_20210216_1055.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 3.1.3 on 2021-02-16 10:55 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('core', '0009_auto_20210216_1038'), 10 | ] 11 | 12 | operations = [ 13 | migrations.AlterField( 14 | model_name='archiveresult', 15 | name='start_ts', 16 | field=models.DateTimeField(db_index=True), 17 | ), 18 | ] 19 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0030_alter_archiveresult_uuid.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.0.6 on 2024-08-18 05:00 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | 
dependencies = [ 9 | ('core', '0029_alter_archiveresult_id'), 10 | ] 11 | 12 | operations = [ 13 | migrations.AlterField( 14 | model_name='archiveresult', 15 | name='uuid', 16 | field=models.UUIDField(unique=True), 17 | ), 18 | ] 19 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0037_rename_id_snapshot_old_id.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.0.6 on 2024-08-18 06:08 2 | 3 | from django.db import migrations 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('core', '0036_alter_archiveresult_id_alter_archiveresult_old_id'), 10 | ] 11 | 12 | operations = [ 13 | migrations.RenameField( 14 | model_name='snapshot', 15 | old_name='id', 16 | new_name='old_id', 17 | ), 18 | ] 19 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-npm/abx_plugin_npm/config.py: -------------------------------------------------------------------------------- 1 | from abx_spec_config import BaseConfigSet 2 | 3 | 4 | ###################### Config ########################## 5 | 6 | 7 | class NpmDependencyConfigs(BaseConfigSet): 8 | # USE_NPM: bool = True 9 | # NPM_BINARY: str = Field(default='npm') 10 | # NPM_ARGS: Optional[List[str]] = Field(default=None) 11 | # NPM_EXTRA_ARGS: List[str] = [] 12 | # NPM_DEFAULT_ARGS: List[str] = [] 13 | pass 14 | 15 | 16 | NPM_CONFIG = NpmDependencyConfigs() 17 | 18 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0002_auto_20200625_1521.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 3.0.7 on 2020-06-25 15:21 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('core', '0001_initial'), 10 | ] 11 | 12 | operations = [ 13 | 
migrations.AlterField( 14 | model_name='snapshot', 15 | name='timestamp', 16 | field=models.CharField(default=None, max_length=32, null=True), 17 | ), 18 | ] 19 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0016_auto_20210218_1204.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 3.1.3 on 2021-02-18 12:04 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('core', '0015_auto_20210218_0730'), 10 | ] 11 | 12 | operations = [ 13 | migrations.AlterField( 14 | model_name='snapshot', 15 | name='tags', 16 | field=models.ManyToManyField(blank=True, to='core.Tag'), 17 | ), 18 | ] 19 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0039_rename_snapshot_archiveresult_snapshot_old.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.0.6 on 2024-08-18 06:25 2 | 3 | from django.db import migrations 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('core', '0038_rename_uuid_snapshot_id'), 10 | ] 11 | 12 | operations = [ 13 | migrations.RenameField( 14 | model_name='archiveresult', 15 | old_name='snapshot', 16 | new_name='snapshot_old', 17 | ), 18 | ] 19 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0060_alter_tag_id.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.0.6 on 2024-08-20 03:42 2 | 3 | import uuid 4 | from django.db import migrations, models 5 | 6 | 7 | class Migration(migrations.Migration): 8 | 9 | dependencies = [ 10 | ('core', '0059_tag_id'), 11 | ] 12 | 13 | operations = [ 14 | migrations.AlterField( 15 | model_name='tag', 16 | name='id', 17 | field=models.UUIDField(default=uuid.uuid4, 
editable=False, unique=True), 18 | ), 19 | ] 20 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-git/abx_plugin_git/binaries.py: -------------------------------------------------------------------------------- 1 | __package__ = 'abx_plugin_git' 2 | 3 | from typing import List 4 | 5 | from pydantic import InstanceOf 6 | from abx_pkg import BinProvider, BinName, Binary 7 | 8 | from abx_plugin_default_binproviders import apt, brew, env 9 | 10 | from .config import GIT_CONFIG 11 | 12 | 13 | 14 | class GitBinary(Binary): 15 | name: BinName = GIT_CONFIG.GIT_BINARY 16 | binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env] 17 | 18 | GIT_BINARY = GitBinary() 19 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-ldap-auth/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "abx-plugin-ldap-auth" 3 | version = "2024.10.28" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.10" 7 | dependencies = [ 8 | "abx>=0.1.0", 9 | "abx-spec-config>=0.1.0", 10 | "abx-spec-django>=0.1.0", 11 | ] 12 | 13 | 14 | [build-system] 15 | requires = ["hatchling"] 16 | build-backend = "hatchling.build" 17 | 18 | 19 | [project.entry-points.abx] 20 | abx_plugin_ldap_auth = "abx_plugin_ldap_auth" 21 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-ytdlp/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "abx-plugin-ytdlp" 3 | version = "2024.10.28" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.10" 7 | dependencies = [ 8 | "abx>=0.1.0", 9 | "abx-spec-config>=0.1.0", 10 | "abx-spec-abx-pkg>=0.1.0", 11 | "abx-pkg>=0.5.4", 12 | ] 13 | 14 | [build-system] 15 | requires = 
["hatchling"] 16 | build-backend = "hatchling.build" 17 | 18 | [project.entry-points.abx] 19 | abx_plugin_ytdlp = "abx_plugin_ytdlp" 20 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0055_alter_tag_slug.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.0.6 on 2024-08-20 03:24 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('core', '0054_alter_snapshot_timestamp'), 10 | ] 11 | 12 | operations = [ 13 | migrations.AlterField( 14 | model_name='tag', 15 | name='slug', 16 | field=models.SlugField(editable=False, max_length=100, unique=True), 17 | ), 18 | ] 19 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0068_alter_archiveresult_options.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.0.6 on 2024-08-20 07:26 2 | 3 | from django.db import migrations 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('core', '0067_alter_snapshottag_tag'), 10 | ] 11 | 12 | operations = [ 13 | migrations.AlterModelOptions( 14 | name='archiveresult', 15 | options={'verbose_name': 'Archive Result', 'verbose_name_plural': 'Archive Results Log'}, 16 | ), 17 | ] 18 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-pip/abx_plugin_pip/config.py: -------------------------------------------------------------------------------- 1 | __package__ = 'pip' 2 | 3 | from typing import List, Optional 4 | from pydantic import Field 5 | 6 | from abx_spec_config.base_configset import BaseConfigSet 7 | 8 | 9 | class PipDependencyConfigs(BaseConfigSet): 10 | USE_PIP: bool = True 11 | PIP_BINARY: str = Field(default='pip') 12 | PIP_ARGS: Optional[List[str]] = Field(default=None) 13 | 
PIP_EXTRA_ARGS: List[str] = [] 14 | PIP_DEFAULT_ARGS: List[str] = [] 15 | 16 | PIP_CONFIG = PipDependencyConfigs() 17 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-spec-archivebox/abx_spec_archivebox/effects.py: -------------------------------------------------------------------------------- 1 | """ 2 | Hookspec for side-effects that ArchiveBox plugins can trigger. 3 | 4 | (e.g. network requests, binary execution, remote API calls, external library calls, etc.) 5 | """ 6 | 7 | __package__ = 'abx.archivebox' 8 | 9 | import abx 10 | 11 | 12 | @abx.hookspec 13 | def check_remote_seed_connection(urls, extractor, credentials, created_by): 14 | pass 15 | 16 | 17 | @abx.hookspec 18 | def exec_extractor(url, extractor, credentials, config): 19 | pass 20 | 21 | -------------------------------------------------------------------------------- /etc/README.md: -------------------------------------------------------------------------------- 1 | # Example etc files for deploying ArchiveBox 2 | 3 | In this folder are some example config files you can use for setting up ArchiveBox on your machine. 4 | 5 | E.g. see `nginx.conf` for an example nginx config to serve your archive with SSL, or `fly.toml` for an example deployment to the Fly.io hosting platform. 6 | 7 | Please contribute your etc files here! 
Example contributions 8 | 9 | - supervisord config 10 | - systemd config 11 | - apache webserver config 12 | - other init system, webservers, or scheduler configs 13 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0009_auto_20210216_1038.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 3.1.3 on 2021-02-16 10:38 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('core', '0008_auto_20210105_1421'), 10 | ] 11 | 12 | operations = [ 13 | migrations.AlterField( 14 | model_name='snapshot', 15 | name='updated', 16 | field=models.DateTimeField(auto_now=True, db_index=True, null=True), 17 | ), 18 | ] 19 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0028_alter_archiveresult_uuid.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.0.6 on 2024-08-18 04:28 2 | 3 | import uuid 4 | from django.db import migrations, models 5 | 6 | 7 | class Migration(migrations.Migration): 8 | 9 | dependencies = [ 10 | ('core', '0027_update_snapshot_ids'), 11 | ] 12 | 13 | operations = [ 14 | migrations.AlterField( 15 | model_name='archiveresult', 16 | name='uuid', 17 | field=models.UUIDField(default=uuid.uuid4), 18 | ), 19 | ] 20 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-archivedotorg/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "abx-plugin-archivedotorg" 3 | version = "2024.10.28" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.10" 7 | dependencies = [ 8 | "abx>=0.1.0", 9 | "abx-spec-config>=0.1.0", 10 | "abx-plugin-curl>=2024.10.24", 11 | ] 12 | 13 | [build-system] 14 | requires 
= ["hatchling"] 15 | build-backend = "hatchling.build" 16 | 17 | [project.entry-points.abx] 18 | abx_plugin_archivedotorg = "abx_plugin_archivedotorg" 19 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-curl/abx_plugin_curl/binaries.py: -------------------------------------------------------------------------------- 1 | __package__ = 'abx_plugin_curl' 2 | 3 | from typing import List 4 | 5 | from pydantic import InstanceOf 6 | from abx_pkg import BinProvider, BinName, Binary 7 | 8 | from abx_plugin_default_binproviders import apt, brew, env 9 | 10 | 11 | from .config import CURL_CONFIG 12 | 13 | 14 | class CurlBinary(Binary): 15 | name: BinName = CURL_CONFIG.CURL_BINARY 16 | binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env] 17 | 18 | CURL_BINARY = CurlBinary() 19 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-wget/abx_plugin_wget/binaries.py: -------------------------------------------------------------------------------- 1 | __package__ = 'abx_plugin_wget' 2 | 3 | from typing import List 4 | 5 | 6 | from pydantic import InstanceOf 7 | from abx_pkg import BinProvider, BinName, Binary 8 | 9 | from abx_plugin_default_binproviders import apt, brew, env 10 | 11 | from .config import WGET_CONFIG 12 | 13 | 14 | class WgetBinary(Binary): 15 | name: BinName = WGET_CONFIG.WGET_BINARY 16 | binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env] 17 | 18 | WGET_BINARY = WgetBinary() 19 | -------------------------------------------------------------------------------- /archivebox/core/apps.py: -------------------------------------------------------------------------------- 1 | __package__ = 'archivebox.core' 2 | 3 | from django.apps import AppConfig 4 | 5 | import archivebox 6 | 7 | 8 | class CoreConfig(AppConfig): 9 | name = 'core' 10 | 11 | def ready(self): 12 | """Register the archivebox.core.admin_site as the 
main django admin site""" 13 | from django.conf import settings 14 | archivebox.pm.hook.ready(settings=settings) 15 | 16 | from core.admin_site import register_admin_site 17 | register_admin_site() 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0013_auto_20210218_0729.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 3.1.3 on 2021-02-18 07:29 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('core', '0012_auto_20210216_1425'), 10 | ] 11 | 12 | operations = [ 13 | migrations.AlterField( 14 | model_name='snapshot', 15 | name='title', 16 | field=models.CharField(blank=True, db_index=True, max_length=256, null=True), 17 | ), 18 | ] 19 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0014_auto_20210218_0729.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 3.1.3 on 2021-02-18 07:29 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('core', '0013_auto_20210218_0729'), 10 | ] 11 | 12 | operations = [ 13 | migrations.AlterField( 14 | model_name='snapshot', 15 | name='title', 16 | field=models.CharField(blank=True, db_index=True, max_length=1024, null=True), 17 | ), 18 | ] 19 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0015_auto_20210218_0730.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 3.1.3 on 2021-02-18 07:30 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('core', '0014_auto_20210218_0729'), 10 | ] 11 | 12 | operations = [ 13 | 
migrations.AlterField( 14 | model_name='snapshot', 15 | name='title', 16 | field=models.CharField(blank=True, db_index=True, max_length=512, null=True), 17 | ), 18 | ] 19 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-htmltotext/abx_plugin_htmltotext/__init__.py: -------------------------------------------------------------------------------- 1 | __package__ = 'abx_plugin_htmltotext' 2 | __label__ = 'HTML-to-Text' 3 | 4 | import abx 5 | 6 | 7 | @abx.hookimpl 8 | def get_CONFIG(): 9 | from .config import HTMLTOTEXT_CONFIG 10 | 11 | return { 12 | 'HTMLTOTEXT_CONFIG': HTMLTOTEXT_CONFIG 13 | } 14 | 15 | 16 | # @abx.hookimpl 17 | # def get_EXTRACTORS(): 18 | # from .extractors import HTMLTOTEXT_EXTRACTOR 19 | 20 | # return { 21 | # 'htmltotext': HTMLTOTEXT_EXTRACTOR, 22 | # } 23 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-ripgrep-search/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "abx-plugin-ripgrep-search" 3 | version = "2024.10.28" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.10" 7 | dependencies = [ 8 | "abx>=0.1.0", 9 | "abx-spec-config>=0.1.0", 10 | "abx-spec-searchbackend>=0.1.0", 11 | ] 12 | 13 | [build-system] 14 | requires = ["hatchling"] 15 | build-backend = "hatchling.build" 16 | 17 | [project.entry-points.abx] 18 | abx_plugin_ripgrep_search = "abx_plugin_ripgrep_search" 19 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0008_auto_20210105_1421.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 3.1.3 on 2021-01-05 14:21 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('core', '0007_archiveresult'), 10 | ] 11 
| 12 | operations = [ 13 | migrations.AlterField( 14 | model_name='archiveresult', 15 | name='cmd_version', 16 | field=models.CharField(blank=True, default=None, max_length=32, null=True), 17 | ), 18 | ] 19 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0017_auto_20210219_0211.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 3.1.3 on 2021-02-19 02:11 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('core', '0016_auto_20210218_1204'), 10 | ] 11 | 12 | operations = [ 13 | migrations.AlterField( 14 | model_name='tag', 15 | name='slug', 16 | field=models.SlugField(blank=True, max_length=100, unique=True, verbose_name='slug'), 17 | ), 18 | ] 19 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-puppeteer/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "abx-plugin-puppeteer" 3 | version = "2024.10.28" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.10" 7 | dependencies = [ 8 | "abx>=0.1.0", 9 | "abx-spec-config>=0.1.0", 10 | "abx-spec-abx-pkg>=0.1.0", 11 | "abx-pkg>=0.5.4", 12 | ] 13 | 14 | [build-system] 15 | requires = ["hatchling"] 16 | build-backend = "hatchling.build" 17 | 18 | [project.entry-points.abx] 19 | abx_plugin_puppeteer = "abx_plugin_puppeteer" 20 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-singlefile/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "abx-plugin-singlefile" 3 | version = "2024.10.28" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.10" 7 | dependencies = [ 8 | 
"abx>=0.1.0", 9 | "abx-spec-config>=0.1.0", 10 | "abx-spec-abx-pkg>=0.1.0", 11 | "abx-pkg>=0.5.4", 12 | ] 13 | 14 | [build-system] 15 | requires = ["hatchling"] 16 | build-backend = "hatchling.build" 17 | 18 | [project.entry-points.abx] 19 | abx_plugin_singlefile = "abx_plugin_singlefile" 20 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Summary 4 | 5 | 6 | 7 | # Related issues 8 | 9 | 10 | 11 | # Changes these areas 12 | 13 | - [ ] Bugfixes 14 | - [ ] Feature behavior 15 | - [ ] Command line interface 16 | - [ ] Configuration options 17 | - [ ] Internal architecture 18 | - [ ] Snapshot data layout on disk 19 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-git/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "abx-plugin-git" 3 | version = "2024.10.28" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.10" 7 | dependencies = [ 8 | "abx>=0.1.0", 9 | "abx-spec-config>=0.1.0", 10 | "abx-spec-abx-pkg>=0.1.0", 11 | "abx-plugin-default-binproviders>=2024.10.24", 12 | ] 13 | 14 | [build-system] 15 | requires = ["hatchling"] 16 | build-backend = "hatchling.build" 17 | 18 | [project.entry-points.abx] 19 | abx_plugin_git = "abx_plugin_git" 20 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-sqlitefts-search/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "abx-plugin-sqlitefts-search" 3 | version = "2024.10.28" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.10" 7 | dependencies = [ 8 | "abx>=0.1.0", 9 | "abx-spec-config>=0.1.0", 10 | 
"abx-spec-searchbackend>=0.1.0", 11 | ] 12 | 13 | [build-system] 14 | requires = ["hatchling"] 15 | build-backend = "hatchling.build" 16 | 17 | [project.entry-points.abx] 18 | abx_plugin_sqlitefts_search = "abx_plugin_sqlitefts_search" 19 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0029_alter_archiveresult_id.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.0.6 on 2024-08-18 04:28 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('core', '0028_alter_archiveresult_uuid'), 10 | ] 11 | 12 | operations = [ 13 | migrations.AlterField( 14 | model_name='archiveresult', 15 | name='id', 16 | field=models.BigIntegerField(primary_key=True, serialize=False, verbose_name='ID'), 17 | ), 18 | ] 19 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0035_remove_archiveresult_uuid_archiveresult_id.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.0.6 on 2024-08-18 05:49 2 | 3 | import uuid 4 | from django.db import migrations, models 5 | 6 | 7 | class Migration(migrations.Migration): 8 | 9 | dependencies = [ 10 | ('core', '0034_alter_archiveresult_old_id_alter_archiveresult_uuid'), 11 | ] 12 | 13 | operations = [ 14 | migrations.RenameField( 15 | model_name='archiveresult', 16 | old_name='uuid', 17 | new_name='id', 18 | ), 19 | ] 20 | -------------------------------------------------------------------------------- /archivebox/core/wsgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | WSGI config for archivebox project. 3 | 4 | It exposes the WSGI callable as a module-level variable named ``application``. 
5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/2.1/howto/deployment/wsgi/ 8 | """ 9 | 10 | import archivebox # noqa 11 | from archivebox.config.django import setup_django 12 | 13 | setup_django(in_memory_db=False, check_db=True) 14 | 15 | from django.core.wsgi import get_wsgi_application 16 | 17 | application = get_wsgi_application() 18 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-archivedotorg/abx_plugin_archivedotorg/__init__.py: -------------------------------------------------------------------------------- 1 | __label__ = 'Archive.org' 2 | __homepage__ = 'https://archive.org' 3 | 4 | import abx 5 | 6 | @abx.hookimpl 7 | def get_CONFIG(): 8 | from .config import ARCHIVEDOTORG_CONFIG 9 | 10 | return { 11 | 'ARCHIVEDOTORG_CONFIG': ARCHIVEDOTORG_CONFIG 12 | } 13 | 14 | 15 | # @abx.hookimpl 16 | # def get_EXTRACTORS(): 17 | # from .extractors import ARCHIVEDOTORG_EXTRACTOR 18 | # 19 | # return { 20 | # 'archivedotorg': ARCHIVEDOTORG_EXTRACTOR, 21 | # } 22 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-sqlitefts-search/abx_plugin_sqlitefts_search/__init__.py: -------------------------------------------------------------------------------- 1 | __package__ = 'abx_plugin_sqlitefts_search' 2 | __label__ = 'SQLiteFTS Search' 3 | 4 | import abx 5 | 6 | @abx.hookimpl 7 | def get_CONFIG(): 8 | from .config import SQLITEFTS_CONFIG 9 | 10 | return { 11 | 'SQLITEFTS_CONFIG': SQLITEFTS_CONFIG 12 | } 13 | 14 | 15 | @abx.hookimpl 16 | def get_SEARCHBACKENDS(): 17 | from .searchbackend import SQLITEFTS_SEARCH_BACKEND 18 | 19 | return { 20 | 'sqlitefts': SQLITEFTS_SEARCH_BACKEND, 21 | } 22 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0025_alter_archiveresult_uuid.py: -------------------------------------------------------------------------------- 1 | # 
Generated by Django 5.0.6 on 2024-05-13 12:08 2 | 3 | import uuid 4 | from django.db import migrations, models 5 | 6 | 7 | class Migration(migrations.Migration): 8 | 9 | dependencies = [ 10 | ('core', '0024_auto_20240513_1143'), 11 | ] 12 | 13 | operations = [ 14 | migrations.AlterField( 15 | model_name='archiveresult', 16 | name='uuid', 17 | field=models.UUIDField(default=uuid.uuid4, editable=False, unique=True), 18 | ), 19 | ] 20 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0054_alter_snapshot_timestamp.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.0.6 on 2024-08-20 02:40 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('core', '0053_remove_snapshottag_snapshot_old'), 10 | ] 11 | 12 | operations = [ 13 | migrations.AlterField( 14 | model_name='snapshot', 15 | name='timestamp', 16 | field=models.CharField(db_index=True, editable=False, max_length=32, unique=True), 17 | ), 18 | ] 19 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-default-binproviders/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "abx-plugin-default-binproviders" 3 | version = "2024.10.24" 4 | description = "Default BinProviders for ABX (apt, brew, env)" 5 | readme = "README.md" 6 | requires-python = ">=3.10" 7 | dependencies = [ 8 | "abx>=0.1.0", 9 | "abx-pkg>=0.5.4", 10 | "abx-spec-abx-pkg>=0.1.0", 11 | ] 12 | 13 | [build-system] 14 | requires = ["hatchling"] 15 | build-backend = "hatchling.build" 16 | 17 | [project.entry-points.abx] 18 | abx_plugin_default_binproviders = "abx_plugin_default_binproviders" 19 | -------------------------------------------------------------------------------- /bin/release_pip.sh: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ### Bash Environment Setup 4 | # http://redsymbol.net/articles/unofficial-bash-strict-mode/ 5 | # https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html 6 | # set -o xtrace 7 | set -o errexit 8 | set -o errtrace 9 | set -o nounset 10 | set -o pipefail 11 | IFS=$'\n' 12 | 13 | REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )" 14 | cd "$REPO_DIR" 15 | source "$REPO_DIR/.venv/bin/activate" 16 | 17 | echo "[^] Publishing to PyPI..." 18 | rm -Rf dist 19 | uv build 20 | uv publish 21 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-playwright/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "abx-plugin-playwright" 3 | version = "2024.10.28" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.10" 7 | dependencies = [ 8 | "abx>=0.1.0", 9 | "pydantic>=2.4.2", 10 | "abx-pkg>=0.5.4", 11 | "abx-spec-abx-pkg>=0.1.0", 12 | "abx-spec-config>=0.1.0", 13 | ] 14 | 15 | [build-system] 16 | requires = ["hatchling"] 17 | build-backend = "hatchling.build" 18 | 19 | [project.entry-points.abx] 20 | abx_plugin_playwright = "abx_plugin_playwright" 21 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-npm/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "abx-plugin-npm" 3 | version = "2024.10.24" 4 | description = "NPM binary provider plugin for ABX" 5 | readme = "README.md" 6 | requires-python = ">=3.10" 7 | dependencies = [ 8 | "abx>=0.1.0", 9 | "abx-pkg>=0.5.4", 10 | "abx-spec-abx-pkg>=0.1.0", 11 | "abx-spec-config>=0.1.0", 12 | "abx-plugin-default-binproviders>=2024.10.24", 13 | ] 14 | 15 | [build-system] 16 | 
requires = ["hatchling"] 17 | build-backend = "hatchling.build" 18 | 19 | [project.entry-points.abx] 20 | abx_plugin_npm = "abx_plugin_npm" 21 | -------------------------------------------------------------------------------- /archivebox/api/urls.py: -------------------------------------------------------------------------------- 1 | __package__ = 'archivebox.api' 2 | 3 | from django.urls import path 4 | from django.views.generic.base import RedirectView 5 | 6 | from .v1_api import urls as v1_api_urls 7 | 8 | urlpatterns = [ 9 | path("", RedirectView.as_view(url='/api/v1')), 10 | 11 | path("v1/", v1_api_urls), 12 | path("v1", RedirectView.as_view(url='/api/v1/docs')), 13 | 14 | # ... v2 can be added here ... 15 | # path("v2/", v2_api_urls), 16 | # path("v2", RedirectView.as_view(url='/api/v2/docs')), 17 | ] 18 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0004_auto_20200713_1552.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 3.0.7 on 2020-07-13 15:52 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('core', '0003_auto_20200630_1034'), 10 | ] 11 | 12 | operations = [ 13 | migrations.AlterField( 14 | model_name='snapshot', 15 | name='timestamp', 16 | field=models.CharField(db_index=True, default=None, max_length=32, unique=True), 17 | preserve_default=False, 18 | ), 19 | ] 20 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-puppeteer/abx_plugin_puppeteer/config.py: -------------------------------------------------------------------------------- 1 | __package__ = 'abx_plugin_puppeteer' 2 | 3 | 4 | from abx_spec_config.base_configset import BaseConfigSet 5 | 6 | 7 | ###################### Config ########################## 8 | 9 | 10 | class PuppeteerConfig(BaseConfigSet): 11 | PUPPETEER_BINARY: str = 
'puppeteer' 12 | # PUPPETEER_ARGS: Optional[List[str]] = Field(default=None) 13 | # PUPPETEER_EXTRA_ARGS: List[str] = [] 14 | # PUPPETEER_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}'] 15 | pass 16 | 17 | 18 | PUPPETEER_CONFIG = PuppeteerConfig() 19 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-spec-config/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "abx-spec-config" 3 | version = "0.1.0" 4 | description = "The common shared interfaces for the ABX ArchiveBox plugin ecosystem." 5 | readme = "README.md" 6 | requires-python = ">=3.10" 7 | dependencies = [ 8 | "abx>=0.1.0", 9 | "python-benedict>=0.34.0", 10 | "pydantic>=2.9.2", 11 | "pydantic-settings>=2.6.0", 12 | "rich>=13.9.3", 13 | ] 14 | 15 | [build-system] 16 | requires = ["hatchling"] 17 | build-backend = "hatchling.build" 18 | 19 | [project.entry-points.abx] 20 | abx_spec_config = "abx_spec_config" 21 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-sonic-search/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "abx-plugin-sonic-search" 3 | version = "2024.10.28" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.10" 7 | dependencies = [ 8 | "abx>=0.1.0", 9 | "abx-spec-config>=0.1.0", 10 | "abx-spec-abx-pkg>=0.1.0", 11 | "abx-spec-searchbackend>=0.1.0", 12 | "abx-pkg>=0.5.4", 13 | ] 14 | 15 | [build-system] 16 | requires = ["hatchling"] 17 | build-backend = "hatchling.build" 18 | 19 | [project.entry-points.abx] 20 | abx_plugin_sonic_search = "abx_plugin_sonic_search" 21 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0074_alter_snapshot_downloaded_at.py: 
-------------------------------------------------------------------------------- 1 | # Generated by Django 5.1 on 2024-09-05 01:24 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('core', '0073_rename_created_archiveresult_created_at_and_more'), 10 | ] 11 | 12 | operations = [ 13 | migrations.AlterField( 14 | model_name='snapshot', 15 | name='downloaded_at', 16 | field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True), 17 | ), 18 | ] 19 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-favicon/abx_plugin_favicon/extractors.py: -------------------------------------------------------------------------------- 1 | __package__ = 'abx_plugin_favicon' 2 | 3 | from pathlib import Path 4 | 5 | from abx_pkg import BinName 6 | 7 | from abx_spec_extractor import BaseExtractor, ExtractorName 8 | 9 | from abx_plugin_curl.binaries import CURL_BINARY 10 | 11 | 12 | class FaviconExtractor(BaseExtractor): 13 | name: ExtractorName = 'favicon' 14 | binary: BinName = CURL_BINARY.name 15 | 16 | def get_output_path(self, snapshot) -> Path | None: 17 | return Path(snapshot.link_dir) / 'favicon.png' 18 | 19 | FAVICON_EXTRACTOR = FaviconExtractor() 20 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-pip/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "abx-plugin-pip" 3 | version = "2024.10.24" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.10" 7 | dependencies = [ 8 | "abx>=0.1.0", 9 | "abx-pkg>=0.5.4", 10 | "abx-spec-config>=0.1.0", 11 | "abx-spec-abx-pkg>=0.1.0", 12 | "abx-plugin-default-binproviders>=2024.10.24", 13 | "django>=5.0.0", 14 | ] 15 | 16 | 17 | [build-system] 18 | requires = ["hatchling"] 19 | build-backend = 
"hatchling.build" 20 | 21 | [project.entry-points.abx] 22 | abx_plugin_pip = "abx_plugin_pip" 23 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-mercury/abx_plugin_mercury/extractors.py: -------------------------------------------------------------------------------- 1 | __package__ = 'abx_plugin_mercury' 2 | 3 | from pathlib import Path 4 | 5 | from abx_pkg import BinName 6 | from abx_spec_extractor import BaseExtractor, ExtractorName 7 | 8 | from .binaries import MERCURY_BINARY 9 | 10 | 11 | 12 | class MercuryExtractor(BaseExtractor): 13 | name: ExtractorName = 'mercury' 14 | binary: BinName = MERCURY_BINARY.name 15 | 16 | def get_output_path(self, snapshot) -> Path | None: 17 | return snapshot.link_dir / 'mercury' / 'content.html' 18 | 19 | 20 | MERCURY_EXTRACTOR = MercuryExtractor() 21 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-git/abx_plugin_git/__init__.py: -------------------------------------------------------------------------------- 1 | __package__ = 'abx_plugin_git' 2 | __label__ = 'Git' 3 | 4 | import abx 5 | 6 | 7 | @abx.hookimpl 8 | def get_CONFIG(): 9 | from .config import GIT_CONFIG 10 | 11 | return { 12 | 'GIT_CONFIG': GIT_CONFIG 13 | } 14 | 15 | @abx.hookimpl 16 | def get_BINARIES(): 17 | from .binaries import GIT_BINARY 18 | 19 | return { 20 | 'git': GIT_BINARY, 21 | } 22 | 23 | @abx.hookimpl 24 | def get_EXTRACTORS(): 25 | from .extractors import GIT_EXTRACTOR 26 | 27 | return { 28 | 'git': GIT_EXTRACTOR, 29 | } 30 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0045_alter_snapshot_old_id.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.0.6 on 2024-08-20 01:54 2 | 3 | import uuid 4 | from django.db import migrations, models 5 | 6 | 7 | class Migration(migrations.Migration): 8 | 9 | 
dependencies = [ 10 | ('core', '0044_alter_archiveresult_snapshot_alter_tag_uuid_and_more'), 11 | ] 12 | 13 | operations = [ 14 | migrations.AlterField( 15 | model_name='snapshot', 16 | name='old_id', 17 | field=models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False, unique=True), 18 | ), 19 | ] 20 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | ._* 3 | *.pyc 4 | __pycache__/ 5 | .mypy_cache/ 6 | .pytest_cache/ 7 | .github/ 8 | .pdm-build/ 9 | .pdm-python 10 | .eggs/ 11 | .git/ 12 | .vscode/ 13 | !.git/HEAD 14 | !.git/refs/heads/* 15 | 16 | venv/ 17 | .venv/ 18 | .venv-old/ 19 | .docker_venv/ 20 | .docker-venv/ 21 | node_modules/ 22 | chrome/ 23 | chromeprofile/ 24 | chrome_profile/ 25 | 26 | pdm.dev.lock 27 | pdm.lock 28 | 29 | docs/ 30 | build/ 31 | dist/ 32 | brew_dist/ 33 | deb_dist/ 34 | pip_dist/ 35 | assets/ 36 | docker/ 37 | website/ 38 | typings/ 39 | 40 | tmp/ 41 | data/ 42 | data*/ 43 | output/ 44 | index.sqlite3 45 | index.sqlite3-wal 46 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-singlefile/abx_plugin_singlefile/extractors.py: -------------------------------------------------------------------------------- 1 | __package__ = 'abx_plugin_singlefile' 2 | 3 | 4 | from pathlib import Path 5 | 6 | from abx_pkg import BinName 7 | 8 | from abx_spec_extractor import BaseExtractor, ExtractorName 9 | 10 | from .binaries import SINGLEFILE_BINARY 11 | 12 | 13 | class SinglefileExtractor(BaseExtractor): 14 | name: ExtractorName = 'singlefile' 15 | binary: BinName = SINGLEFILE_BINARY.name 16 | 17 | def get_output_path(self, snapshot) -> Path: 18 | return Path(snapshot.link_dir) / 'singlefile.html' 19 | 20 | 21 | SINGLEFILE_EXTRACTOR = SinglefileExtractor() 22 | 
-------------------------------------------------------------------------------- /archivebox/api/migrations/0005_remove_apitoken_uuid_remove_outboundwebhook_uuid_and_more.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.1 on 2024-08-20 22:40 2 | 3 | import uuid 4 | from django.db import migrations, models 5 | 6 | 7 | class Migration(migrations.Migration): 8 | 9 | dependencies = [ 10 | ('api', '0004_alter_apitoken_id_alter_apitoken_uuid'), 11 | ] 12 | 13 | operations = [ 14 | migrations.RemoveField( 15 | model_name='apitoken', 16 | name='uuid', 17 | ), 18 | migrations.RemoveField( 19 | model_name='outboundwebhook', 20 | name='id', 21 | ), 22 | ] 23 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0062_alter_snapshottag_old_tag.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.0.6 on 2024-08-20 03:44 2 | 3 | import django.db.models.deletion 4 | from django.db import migrations, models 5 | 6 | 7 | class Migration(migrations.Migration): 8 | 9 | dependencies = [ 10 | ('core', '0061_rename_tag_snapshottag_old_tag_and_more'), 11 | ] 12 | 13 | operations = [ 14 | migrations.AlterField( 15 | model_name='snapshottag', 16 | name='old_tag', 17 | field=models.ForeignKey(db_column='old_tag_id', on_delete=django.db.models.deletion.CASCADE, to='core.tag'), 18 | ), 19 | ] 20 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0067_alter_snapshottag_tag.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.0.6 on 2024-08-20 03:53 2 | 3 | import django.db.models.deletion 4 | from django.db import migrations, models 5 | 6 | 7 | class Migration(migrations.Migration): 8 | 9 | dependencies = [ 10 | ('core', '0066_alter_snapshottag_tag_alter_tag_id_alter_tag_old_id'), 11 | ] 12 | 13 | 
operations = [ 14 | migrations.AlterField( 15 | model_name='snapshottag', 16 | name='tag', 17 | field=models.ForeignKey(db_column='tag_id', on_delete=django.db.models.deletion.CASCADE, to='core.tag'), 18 | ), 19 | ] 20 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-readability/abx_plugin_readability/extractors.py: -------------------------------------------------------------------------------- 1 | # __package__ = 'abx_plugin_readability' 2 | 3 | from pathlib import Path 4 | 5 | from abx_pkg import BinName 6 | 7 | from abx_spec_extractor import BaseExtractor, ExtractorName 8 | from .binaries import READABILITY_BINARY 9 | 10 | 11 | class ReadabilityExtractor(BaseExtractor): 12 | name: ExtractorName = 'readability' 13 | binary: BinName = READABILITY_BINARY.name 14 | 15 | def get_output_path(self, snapshot) -> Path: 16 | return Path(snapshot.link_dir) / 'readability' / 'content.html' 17 | 18 | 19 | READABILITY_EXTRACTOR = ReadabilityExtractor() 20 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0058_alter_tag_old_id.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.0.6 on 2024-08-20 03:30 2 | 3 | import random 4 | from django.db import migrations, models 5 | 6 | 7 | def rand_int_id(): 8 | return random.getrandbits(32) 9 | 10 | class Migration(migrations.Migration): 11 | 12 | dependencies = [ 13 | ('core', '0057_rename_id_tag_old_id'), 14 | ] 15 | 16 | operations = [ 17 | migrations.AlterField( 18 | model_name='tag', 19 | name='old_id', 20 | field=models.BigIntegerField(default=rand_int_id, primary_key=True, serialize=False, verbose_name='Old ID'), 21 | ), 22 | ] 23 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0061_rename_tag_snapshottag_old_tag_and_more.py: 
-------------------------------------------------------------------------------- 1 | # Generated by Django 5.0.6 on 2024-08-20 03:43 2 | 3 | from django.db import migrations 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('core', '0060_alter_tag_id'), 10 | ] 11 | 12 | operations = [ 13 | migrations.RenameField( 14 | model_name='snapshottag', 15 | old_name='tag', 16 | new_name='old_tag', 17 | ), 18 | migrations.AlterUniqueTogether( 19 | name='snapshottag', 20 | unique_together={('snapshot', 'old_tag')}, 21 | ), 22 | ] 23 | -------------------------------------------------------------------------------- /archivebox/workers/management/commands/orchestrator.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from django.core.management.base import BaseCommand 4 | 5 | from workers.orchestrator import ArchivingOrchestrator 6 | 7 | 8 | class Command(BaseCommand): 9 | help = 'Run the archivebox orchestrator' 10 | 11 | # def add_arguments(self, parser): 12 | # parser.add_argument('subcommand', type=str, help='The subcommand you want to run') 13 | # parser.add_argument('command_args', nargs='*', help='Arguments to pass to the subcommand') 14 | 15 | 16 | def handle(self, *args, **kwargs): 17 | orchestrator = ArchivingOrchestrator() 18 | orchestrator.start() 19 | -------------------------------------------------------------------------------- /archivebox/__main__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """This is the entrypoint for python -m archivebox ...""" 3 | __package__ = 'archivebox' 4 | 5 | import archivebox # noqa # make sure monkey patches are applied before anything else 6 | import sys 7 | 8 | from .cli import main 9 | 10 | ASCII_LOGO_MINI = r""" 11 | _ _ _ ____ 12 | / \ _ __ ___| |__ (_)_ _____| __ ) _____ __ 13 | / _ \ | '__/ __| '_ \| \ \ / / _ \ _ \ / _ \ \/ / 14 | / ___ \| | | (__| | | | |\ V / __/ |_) | (_) > 
< 15 | /_/ \_\_| \___|_| |_|_| \_/ \___|____/ \___/_/\_\ 16 | """ 17 | 18 | main(args=sys.argv[1:], stdin=sys.stdin) 19 | -------------------------------------------------------------------------------- /.github/.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Read the Docs config for https://docs.archivebox.io 2 | # https://docs.readthedocs.io/en/stable/config-file/v2.html 3 | 4 | version: 2 5 | 6 | submodules: 7 | include: all 8 | recursive: true 9 | 10 | build: 11 | os: ubuntu-22.04 12 | tools: 13 | python: "3.12" 14 | #nodejs: "20" # not needed unless we need the full archivebox to run while building docs for some reason 15 | 16 | sphinx: 17 | configuration: docs/conf.py 18 | 19 | formats: 20 | - pdf 21 | - epub 22 | 23 | # https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 24 | python: 25 | install: 26 | - requirements: docs/requirements.txt 27 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-default-binproviders/abx_plugin_default_binproviders.py: -------------------------------------------------------------------------------- 1 | 2 | import abx 3 | 4 | from typing import Dict 5 | 6 | from abx_pkg import ( 7 | AptProvider, 8 | BrewProvider, 9 | EnvProvider, 10 | BinProvider, 11 | ) 12 | apt = APT_BINPROVIDER = AptProvider() 13 | brew = BREW_BINPROVIDER = BrewProvider() 14 | env = ENV_BINPROVIDER = EnvProvider() 15 | apt.setup() 16 | brew.setup() 17 | env.setup() 18 | 19 | 20 | @abx.hookimpl(tryfirst=True) 21 | def get_BINPROVIDERS() -> Dict[str, BinProvider]: 22 | return { 23 | 'apt': APT_BINPROVIDER, 24 | 'brew': BREW_BINPROVIDER, 25 | 'env': ENV_BINPROVIDER, 26 | } 27 | -------------------------------------------------------------------------------- /archivebox/core/admin.py: -------------------------------------------------------------------------------- 1 | __package__ = 'archivebox.core' 2 | 3 | from 
django.contrib.auth import get_user_model 4 | 5 | 6 | from core.models import Snapshot, ArchiveResult, Tag 7 | from core.admin_tags import TagAdmin 8 | from core.admin_snapshots import SnapshotAdmin 9 | from core.admin_archiveresults import ArchiveResultAdmin 10 | from core.admin_users import UserAdmin 11 | 12 | import abx 13 | 14 | 15 | @abx.hookimpl 16 | def register_admin(admin_site): 17 | admin_site.register(get_user_model(), UserAdmin) 18 | admin_site.register(ArchiveResult, ArchiveResultAdmin) 19 | admin_site.register(Snapshot, SnapshotAdmin) 20 | admin_site.register(Tag, TagAdmin) 21 | -------------------------------------------------------------------------------- /archivebox/core/management/commands/archivebox.py: -------------------------------------------------------------------------------- 1 | __package__ = 'archivebox' 2 | 3 | from django.core.management.base import BaseCommand 4 | 5 | 6 | from .cli import run_subcommand 7 | 8 | 9 | class Command(BaseCommand): 10 | help = 'Run an ArchiveBox CLI subcommand (e.g. 
add, remove, list, etc)' 11 | 12 | def add_arguments(self, parser): 13 | parser.add_argument('subcommand', type=str, help='The subcommand you want to run') 14 | parser.add_argument('command_args', nargs='*', help='Arguments to pass to the subcommand') 15 | 16 | 17 | def handle(self, *args, **kwargs): 18 | run_subcommand(kwargs['subcommand'], args=kwargs['command_args']) 19 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | ._* 3 | *.pyc 4 | __pycache__/ 5 | .mypy_cache/ 6 | .eggs/ 7 | tests/out/ 8 | 9 | # Python and Node dependencies 10 | venv/ 11 | .venv/ 12 | .docker-venv/ 13 | node_modules/ 14 | typings/ 15 | 16 | # Ignore dev lockfiles (should always be built fresh) 17 | pdm.dev.lock 18 | requirements-dev.txt 19 | 20 | # Packaging artifacts 21 | requirements.txt 22 | .pdm-python 23 | .pdm-build 24 | archivebox.egg-info 25 | archivebox-*.tar.gz 26 | build/ 27 | dist/ 28 | 29 | # Data folders 30 | lib/ 31 | tmp/ 32 | data/ 33 | data*/ 34 | output/ 35 | index.sqlite3 36 | queue.sqlite3 37 | *.sqlite* 38 | data.* 39 | .archivebox_id 40 | 41 | # vim 42 | *.sw? 43 | .vscode 44 | -------------------------------------------------------------------------------- /bin/lint.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ### Bash Environment Setup 4 | # http://redsymbol.net/articles/unofficial-bash-strict-mode/ 5 | # https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html 6 | # set -o xtrace 7 | set -o errexit 8 | set -o errtrace 9 | set -o nounset 10 | set -o pipefail 11 | IFS=$'\n' 12 | 13 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )" 14 | 15 | source "$DIR/.venv/bin/activate" 16 | 17 | echo "[*] Running flake8..." 18 | cd "$DIR/archivebox" 19 | flake8 . && echo "√ No errors found." 
20 | 21 | echo 22 | 23 | echo "[*] Running mypy..." 24 | echo "(skipping for now, run 'mypy archivebox' to run it manually)" 25 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0050_alter_snapshottag_snapshot_old.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.0.6 on 2024-08-20 02:30 2 | 3 | import django.db.models.deletion 4 | from django.db import migrations, models 5 | 6 | 7 | class Migration(migrations.Migration): 8 | 9 | dependencies = [ 10 | ('core', '0049_rename_snapshot_snapshottag_snapshot_old_and_more'), 11 | ] 12 | 13 | operations = [ 14 | migrations.AlterField( 15 | model_name='snapshottag', 16 | name='snapshot_old', 17 | field=models.ForeignKey(db_column='snapshot_old_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot', to_field='old_id'), 18 | ), 19 | ] 20 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-ytdlp/abx_plugin_ytdlp/__init__.py: -------------------------------------------------------------------------------- 1 | __package__ = 'abx_plugin_ytdlp' 2 | __label__ = 'YT-DLP' 3 | __homepage__ = 'https://github.com/yt-dlp/yt-dlp' 4 | 5 | import abx 6 | 7 | @abx.hookimpl 8 | def get_CONFIG(): 9 | from .config import YTDLP_CONFIG 10 | 11 | return { 12 | 'YTDLP_CONFIG': YTDLP_CONFIG 13 | } 14 | 15 | @abx.hookimpl 16 | def get_BINARIES(): 17 | from .binaries import YTDLP_BINARY, FFMPEG_BINARY 18 | 19 | return { 20 | 'ytdlp': YTDLP_BINARY, 21 | 'ffmpeg': FFMPEG_BINARY, 22 | } 23 | 24 | @abx.hookimpl 25 | def ready(): 26 | from .config import YTDLP_CONFIG 27 | YTDLP_CONFIG.validate() 28 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0049_rename_snapshot_snapshottag_snapshot_old_and_more.py: -------------------------------------------------------------------------------- 1 | # Generated by 
Django 5.0.6 on 2024-08-20 02:26 2 | 3 | from django.db import migrations 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('core', '0048_alter_archiveresult_snapshot_and_more'), 10 | ] 11 | 12 | operations = [ 13 | migrations.RenameField( 14 | model_name='snapshottag', 15 | old_name='snapshot', 16 | new_name='snapshot_old', 17 | ), 18 | migrations.AlterUniqueTogether( 19 | name='snapshottag', 20 | unique_together={('snapshot_old', 'tag')}, 21 | ), 22 | ] 23 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-puppeteer/abx_plugin_puppeteer/binaries.py: -------------------------------------------------------------------------------- 1 | __package__ = 'abx_plugin_puppeteer' 2 | 3 | from typing import List 4 | 5 | from pydantic import InstanceOf 6 | from abx_pkg import BinProvider, BinName, Binary 7 | 8 | 9 | from abx_plugin_default_binproviders import env 10 | 11 | from abx_plugin_npm.binproviders import LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER 12 | 13 | 14 | ###################### Config ########################## 15 | 16 | 17 | class PuppeteerBinary(Binary): 18 | name: BinName = "puppeteer" 19 | 20 | binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env] 21 | 22 | 23 | PUPPETEER_BINARY = PuppeteerBinary() 24 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0018_auto_20210327_0952.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 3.1.3 on 2021-03-27 09:52 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('core', '0017_auto_20210219_0211'), 10 | ] 11 | 12 | operations = [ 13 | migrations.AlterField( 14 | model_name='tag', 15 | name='name', 16 | field=models.CharField(max_length=100, unique=True), 17 | ), 18 | migrations.AlterField( 
19 | model_name='tag', 20 | name='slug', 21 | field=models.SlugField(blank=True, max_length=100, unique=True), 22 | ), 23 | ] 24 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0043_alter_archiveresult_snapshot_alter_snapshot_id_and_more.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.0.6 on 2024-08-18 06:52 2 | 3 | import django.db.models.deletion 4 | import uuid 5 | from django.db import migrations, models 6 | 7 | 8 | class Migration(migrations.Migration): 9 | 10 | dependencies = [ 11 | ('core', '0042_remove_archiveresult_snapshot_old'), 12 | ] 13 | 14 | operations = [ 15 | migrations.AlterField( 16 | model_name='archiveresult', 17 | name='snapshot', 18 | field=models.ForeignKey(db_column='snapshot_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot', to_field='id'), 19 | ), 20 | ] 21 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0032_alter_archiveresult_id.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.0.6 on 2024-08-18 05:20 2 | 3 | import core.models 4 | import random 5 | from django.db import migrations, models 6 | 7 | 8 | def rand_int_id(): 9 | return random.getrandbits(32) 10 | 11 | class Migration(migrations.Migration): 12 | 13 | dependencies = [ 14 | ('core', '0031_alter_archiveresult_id_alter_archiveresult_uuid_and_more'), 15 | ] 16 | 17 | operations = [ 18 | migrations.AlterField( 19 | model_name='archiveresult', 20 | name='id', 21 | field=models.BigIntegerField(default=rand_int_id, primary_key=True, serialize=False, verbose_name='ID'), 22 | ), 23 | ] 24 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0072_rename_added_snapshot_bookmarked_at_and_more.py: 
-------------------------------------------------------------------------------- 1 | # Generated by Django 5.1 on 2024-09-05 00:05 2 | 3 | from django.db import migrations 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('core', '0071_remove_archiveresult_old_id_remove_snapshot_old_id_and_more'), 10 | ] 11 | 12 | operations = [ 13 | migrations.RenameField( 14 | model_name='snapshot', 15 | old_name='added', 16 | new_name='bookmarked_at', 17 | ), 18 | migrations.RenameField( 19 | model_name='snapshot', 20 | old_name='updated', 21 | new_name='downloaded_at', 22 | ), 23 | ] 24 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-readability/abx_plugin_readability/config.py: -------------------------------------------------------------------------------- 1 | from pydantic import Field 2 | 3 | from abx_spec_config.base_configset import BaseConfigSet 4 | 5 | from archivebox.config.common import ARCHIVING_CONFIG 6 | 7 | 8 | class ReadabilityConfig(BaseConfigSet): 9 | SAVE_READABILITY: bool = Field(default=True, alias='USE_READABILITY') 10 | 11 | READABILITY_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT) 12 | 13 | READABILITY_BINARY: str = Field(default='readability-extractor') 14 | # READABILITY_EXTRA_ARGS: List[str] = [] # readability-extractor doesn't take any extra args 15 | 16 | 17 | READABILITY_CONFIG = ReadabilityConfig() 18 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-favicon/abx_plugin_favicon/__init__.py: -------------------------------------------------------------------------------- 1 | __label__ = 'Favicon' 2 | __version__ = '2024.10.24' 3 | __author__ = 'ArchiveBox' 4 | __homepage__ = 'https://github.com/ArchiveBox/archivebox' 5 | __dependencies__ = [ 6 | 'abx>=0.1.0', 7 | 'abx-spec-config>=0.1.0', 8 | 'abx-plugin-curl-extractor>=2024.10.24', 9 | ] 10 | 11 | import abx 12 | 13 | 14 | 
@abx.hookimpl 15 | def get_CONFIG(): 16 | from .config import FAVICON_CONFIG 17 | 18 | return { 19 | 'FAVICON_CONFIG': FAVICON_CONFIG 20 | } 21 | 22 | 23 | @abx.hookimpl 24 | def get_EXTRACTORS(): 25 | from .extractors import FAVICON_EXTRACTOR 26 | 27 | return { 28 | 'favicon': FAVICON_EXTRACTOR, 29 | } 30 | -------------------------------------------------------------------------------- /bin/build_pip.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ### Bash Environment Setup 4 | # http://redsymbol.net/articles/unofficial-bash-strict-mode/ 5 | # https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html 6 | # set -o xtrace 7 | set -o errexit 8 | set -o errtrace 9 | set -o nounset 10 | set -o pipefail 11 | IFS=$'\n' 12 | 13 | REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )" 14 | cd "$REPO_DIR" 15 | 16 | # Generate pdm.lock, requirements.txt, and package-lock.json 17 | bash ./bin/lock_pkgs.sh 18 | source .venv/bin/activate 19 | 20 | echo "[+] Building sdist, bdist_wheel, and egg_info" 21 | rm -Rf build dist 22 | uv build 23 | 24 | echo 25 | echo "[√] Finished. 
Built package in dist/" 26 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-playwright/abx_plugin_playwright/__init__.py: -------------------------------------------------------------------------------- 1 | __label__ = 'Playwright' 2 | __homepage__ = 'https://github.com/microsoft/playwright-python' 3 | 4 | import abx 5 | 6 | 7 | @abx.hookimpl 8 | def get_CONFIG(): 9 | from .config import PLAYWRIGHT_CONFIG 10 | return { 11 | 'PLAYWRIGHT_CONFIG': PLAYWRIGHT_CONFIG 12 | } 13 | 14 | @abx.hookimpl 15 | def get_BINARIES(): 16 | from .binaries import PLAYWRIGHT_BINARY 17 | 18 | return { 19 | 'playwright': PLAYWRIGHT_BINARY, 20 | } 21 | 22 | @abx.hookimpl 23 | def get_BINPROVIDERS(): 24 | from .binproviders import PLAYWRIGHT_BINPROVIDER 25 | 26 | return { 27 | 'playwright': PLAYWRIGHT_BINPROVIDER, 28 | } 29 | -------------------------------------------------------------------------------- /bin/build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ### Bash Environment Setup 4 | # http://redsymbol.net/articles/unofficial-bash-strict-mode/ 5 | # https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html 6 | # set -o xtrace 7 | set -o errexit 8 | set -o errtrace 9 | set -o nounset 10 | set -o pipefail 11 | IFS=$'\n' 12 | 13 | REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )" 14 | 15 | cd "$REPO_DIR" 16 | 17 | # pipenv install --dev 18 | 19 | # the order matters 20 | ./bin/build_docs.sh 21 | ./bin/build_pip.sh 22 | ./bin/build_docker.sh 23 | 24 | echo "[√] Done. Install the built package by running:" 25 | echo " python3 setup.py install" 26 | echo " # or" 27 | echo " pip3 install ." 
28 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0012_auto_20210216_1425.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 3.1.3 on 2021-02-16 14:25 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('core', '0011_auto_20210216_1331'), 10 | ] 11 | 12 | operations = [ 13 | migrations.AlterField( 14 | model_name='archiveresult', 15 | name='cmd_version', 16 | field=models.CharField(blank=True, default=None, max_length=128, null=True), 17 | ), 18 | migrations.AlterField( 19 | model_name='archiveresult', 20 | name='output', 21 | field=models.CharField(max_length=1024), 22 | ), 23 | ] 24 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-mercury/abx_plugin_mercury/__init__.py: -------------------------------------------------------------------------------- 1 | __package__ = 'abx_plugin_mercury' 2 | __label__ = 'Postlight Parser' 3 | __homepage__ = 'https://github.com/postlight/mercury-parser' 4 | 5 | import abx 6 | 7 | @abx.hookimpl 8 | def get_CONFIG(): 9 | from .config import MERCURY_CONFIG 10 | 11 | return { 12 | 'MERCURY_CONFIG': MERCURY_CONFIG 13 | } 14 | 15 | @abx.hookimpl 16 | def get_BINARIES(): 17 | from .binaries import MERCURY_BINARY 18 | 19 | return { 20 | 'mercury': MERCURY_BINARY, 21 | } 22 | 23 | @abx.hookimpl 24 | def get_EXTRACTORS(): 25 | from .extractors import MERCURY_EXTRACTOR 26 | 27 | return { 28 | 'mercury': MERCURY_EXTRACTOR, 29 | } 30 | -------------------------------------------------------------------------------- /archivebox/cli/archivebox_shell.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | __package__ = 'archivebox.cli' 4 | 5 | from typing import Iterable 6 | 7 | import rich_click as click 8 | 9 | from 
archivebox.misc.util import docstring 10 | 11 | 12 | def shell(args: Iterable[str]=()) -> None: 13 | """Enter an interactive ArchiveBox Django shell""" 14 | 15 | from django.core.management import call_command 16 | call_command("shell_plus", *args) 17 | 18 | 19 | @click.command(add_help_option=False, context_settings=dict(ignore_unknown_options=True)) 20 | @click.argument('args', nargs=-1) 21 | @docstring(shell.__doc__) 22 | def main(args: Iterable[str]=()) -> None: 23 | shell(args=args) 24 | 25 | 26 | if __name__ == '__main__': 27 | main() 28 | -------------------------------------------------------------------------------- /etc/crontabs/archivebox: -------------------------------------------------------------------------------- 1 | # DO NOT EDIT THIS FILE - edit the master and reinstall. 2 | # (/tmp/tmpe3dawo9u installed on Tue Jun 13 23:21:48 2023) 3 | # (Cron version -- $Id: crontab.c,v 2.13 1994/01/17 03:20:37 vixie Exp $) 4 | 5 | @daily cd /data && /usr/local/bin/archivebox add --depth=0 "https://example.com/3" >> /data/logs/schedule.log 2>&1 # archivebox_schedule 6 | @daily cd /data && /usr/local/bin/archivebox add --depth=0 "https://example.com/2" >> /data/logs/schedule.log 2>&1 # archivebox_schedule 7 | @daily cd /data && /usr/local/bin/archivebox add --depth=0 "https://example.com" >> /data/logs/schedule.log 2>&1 # archivebox_schedule 8 | @daily cd /data && /usr/local/bin/archivebox add --depth=0 "update" >> /data/logs/schedule.log 2>&1 # archivebox_schedule 9 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0020_auto_20210410_1031.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 3.1.8 on 2021-04-10 10:31 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('core', '0019_auto_20210401_0654'), 10 | ] 11 | 12 | operations = [ 13 | migrations.AlterField( 14 | 
model_name='archiveresult', 15 | name='id', 16 | field=models.AutoField(primary_key=True, serialize=False, verbose_name='ID'), 17 | ), 18 | migrations.AlterField( 19 | model_name='tag', 20 | name='id', 21 | field=models.AutoField(primary_key=True, serialize=False, verbose_name='ID'), 22 | ), 23 | ] 24 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-playwright/abx_plugin_playwright/binaries.py: -------------------------------------------------------------------------------- 1 | __package__ = 'abx_plugin_playwright' 2 | 3 | from typing import List 4 | 5 | from pydantic import InstanceOf 6 | from abx_pkg import BinName, BinProvider, Binary 7 | 8 | 9 | from abx_plugin_pip.binproviders import LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER 10 | from abx_plugin_default_binproviders import env 11 | 12 | from .config import PLAYWRIGHT_CONFIG 13 | 14 | 15 | class PlaywrightBinary(Binary): 16 | name: BinName = PLAYWRIGHT_CONFIG.PLAYWRIGHT_BINARY 17 | 18 | binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER, env] 19 | 20 | 21 | PLAYWRIGHT_BINARY = PlaywrightBinary() 22 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-ripgrep-search/abx_plugin_ripgrep_search/binaries.py: -------------------------------------------------------------------------------- 1 | __package__ = 'abx_plugin_ripgrep_search' 2 | 3 | from typing import List 4 | 5 | from pydantic import InstanceOf 6 | from abx_pkg import BinProvider, BinaryOverrides, BinName, Binary 7 | 8 | from abx_plugin_default_binproviders import apt, brew, env 9 | 10 | 11 | from .config import RIPGREP_CONFIG 12 | 13 | 14 | class RipgrepBinary(Binary): 15 | name: BinName = RIPGREP_CONFIG.RIPGREP_BINARY 16 | binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env] 17 | 18 | overrides: BinaryOverrides = { 19 | apt.name: 
{'packages': ['ripgrep']}, 20 | brew.name: {'packages': ['ripgrep']}, 21 | } 22 | 23 | RIPGREP_BINARY = RipgrepBinary() 24 | -------------------------------------------------------------------------------- /bin/release_docs.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ### Bash Environment Setup 4 | # http://redsymbol.net/articles/unofficial-bash-strict-mode/ 5 | # https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html 6 | # set -o xtrace 7 | set -o errexit 8 | set -o errtrace 9 | set -o nounset 10 | set -o pipefail 11 | IFS=$'\n' 12 | 13 | REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )" 14 | VERSION="$(grep '^version = ' "${REPO_DIR}/pyproject.toml" | awk -F'"' '{print $2}')" 15 | cd "$REPO_DIR" 16 | 17 | 18 | echo "[^] Pushing docs to github" 19 | cd docs/ 20 | git add . 21 | git commit -am "$VERSION release" 22 | git push 23 | git tag -a "v$VERSION" -m "v$VERSION" 24 | git push origin 25 | git push origin --tags 26 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-puppeteer/abx_plugin_puppeteer/__init__.py: -------------------------------------------------------------------------------- 1 | __package__ = 'abx_plugin_puppeteer' 2 | __label__ = 'Puppeteer' 3 | __homepage__ = 'https://github.com/puppeteer/puppeteer' 4 | 5 | import abx 6 | 7 | 8 | @abx.hookimpl 9 | def get_CONFIG(): 10 | from .config import PUPPETEER_CONFIG 11 | 12 | return { 13 | 'PUPPETEER_CONFIG': PUPPETEER_CONFIG 14 | } 15 | 16 | @abx.hookimpl 17 | def get_BINARIES(): 18 | from .binaries import PUPPETEER_BINARY 19 | 20 | return { 21 | 'puppeteer': PUPPETEER_BINARY, 22 | } 23 | 24 | @abx.hookimpl 25 | def get_BINPROVIDERS(): 26 | from .binproviders import PUPPETEER_BINPROVIDER 27 | 28 | return { 29 | 'puppeteer': PUPPETEER_BINPROVIDER, 30 | } 31 | 
-------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-ripgrep-search/abx_plugin_ripgrep_search/__init__.py: -------------------------------------------------------------------------------- 1 | __package__ = 'abx_plugin_ripgrep_search' 2 | __label__ = 'Ripgrep Search' 3 | __homepage__ = 'https://github.com/BurntSushi/ripgrep' 4 | 5 | import abx 6 | 7 | @abx.hookimpl 8 | def get_CONFIG(): 9 | from .config import RIPGREP_CONFIG 10 | 11 | return { 12 | 'RIPGREP_CONFIG': RIPGREP_CONFIG 13 | } 14 | 15 | 16 | @abx.hookimpl 17 | def get_BINARIES(): 18 | from .binaries import RIPGREP_BINARY 19 | 20 | return { 21 | 'ripgrep': RIPGREP_BINARY 22 | } 23 | 24 | 25 | @abx.hookimpl 26 | def get_SEARCHBACKENDS(): 27 | from .searchbackend import RIPGREP_SEARCH_BACKEND 28 | 29 | return { 30 | 'ripgrep': RIPGREP_SEARCH_BACKEND, 31 | } 32 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-chrome/abx_plugin_chrome/extractors.py: -------------------------------------------------------------------------------- 1 | __package__ = 'abx_plugin_chrome' 2 | 3 | from abx_pkg import BinName 4 | 5 | from abx_spec_extractor import BaseExtractor, ExtractorName 6 | 7 | from .binaries import CHROME_BINARY 8 | 9 | 10 | class PDFExtractor(BaseExtractor): 11 | name: ExtractorName = 'pdf' 12 | binary: BinName = CHROME_BINARY.name 13 | 14 | PDF_EXTRACTOR = PDFExtractor() 15 | 16 | 17 | class ScreenshotExtractor(BaseExtractor): 18 | name: ExtractorName = 'screenshot' 19 | binary: BinName = CHROME_BINARY.name 20 | 21 | SCREENSHOT_EXTRACTOR = ScreenshotExtractor() 22 | 23 | class DOMExtractor(BaseExtractor): 24 | name: ExtractorName = 'dom' 25 | binary: BinName = CHROME_BINARY.name 26 | DOM_EXTRACTOR = DOMExtractor() 27 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-readability/abx_plugin_readability/__init__.py: 
-------------------------------------------------------------------------------- 1 | __package__ = 'abx_plugin_readability' 2 | __label__ = 'Readability' 3 | __homepage__ = 'https://github.com/ArchiveBox/readability-extractor' 4 | 5 | import abx 6 | 7 | 8 | @abx.hookimpl 9 | def get_CONFIG(): 10 | from .config import READABILITY_CONFIG 11 | 12 | return { 13 | 'READABILITY_CONFIG': READABILITY_CONFIG 14 | } 15 | 16 | @abx.hookimpl 17 | def get_BINARIES(): 18 | from .binaries import READABILITY_BINARY 19 | 20 | return { 21 | 'readability': READABILITY_BINARY, 22 | } 23 | 24 | @abx.hookimpl 25 | def get_EXTRACTORS(): 26 | from .extractors import READABILITY_EXTRACTOR 27 | 28 | return { 29 | 'readability': READABILITY_EXTRACTOR, 30 | } 31 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-wget/abx_plugin_wget/__init__.py: -------------------------------------------------------------------------------- 1 | __package__ = 'abx_plugin_wget' 2 | __label__ = 'WGET' 3 | 4 | import abx 5 | 6 | 7 | @abx.hookimpl 8 | def get_CONFIG(): 9 | from .config import WGET_CONFIG 10 | 11 | return { 12 | 'WGET_CONFIG': WGET_CONFIG 13 | } 14 | 15 | @abx.hookimpl 16 | def get_BINARIES(): 17 | from .binaries import WGET_BINARY 18 | 19 | return { 20 | 'wget': WGET_BINARY, 21 | } 22 | 23 | @abx.hookimpl 24 | def get_EXTRACTORS(): 25 | from .extractors import WGET_EXTRACTOR, WARC_EXTRACTOR 26 | 27 | return { 28 | 'wget': WGET_EXTRACTOR, 29 | 'warc': WARC_EXTRACTOR, 30 | } 31 | 32 | @abx.hookimpl 33 | def ready(): 34 | from .config import WGET_CONFIG 35 | WGET_CONFIG.validate() 36 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0021_auto_20220914_0934.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 3.1.14 on 2022-09-14 09:34 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class 
Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('core', '0020_auto_20210410_1031'), 10 | ] 11 | 12 | operations = [ 13 | migrations.AlterField( 14 | model_name='archiveresult', 15 | name='extractor', 16 | field=models.CharField(choices=[('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('title', 'title'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('archive_org', 'archive_org')], max_length=32), 17 | ), 18 | ] 19 | -------------------------------------------------------------------------------- /archivebox/workers/views.py: -------------------------------------------------------------------------------- 1 | 2 | from django.views.generic import TemplateView 3 | from django.contrib.auth.mixins import UserPassesTestMixin 4 | from django.utils import timezone 5 | from api.auth import get_or_create_api_token 6 | 7 | 8 | class JobsDashboardView(UserPassesTestMixin, TemplateView): 9 | template_name = "jobs_dashboard.html" 10 | 11 | 12 | def test_func(self): 13 | return self.request.user and self.request.user.is_superuser 14 | 15 | def get_context_data(self, **kwargs): 16 | api_token = get_or_create_api_token(self.request.user) 17 | context = super().get_context_data(**kwargs) 18 | context['api_token'] = api_token.token if api_token else 'UNABLE TO GENERATE API TOKEN' 19 | context['now'] = timezone.now().strftime("%H:%M:%S") 20 | return context 21 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 
3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "pip" 9 | directory: "/" 10 | target-branch: "dev" 11 | schedule: 12 | interval: "monthly" 13 | groups: 14 | pip: 15 | patterns: 16 | - "*" 17 | - package-ecosystem: "npm" 18 | directory: "/" 19 | target-branch: "dev" 20 | schedule: 21 | interval: "monthly" 22 | groups: 23 | npm: 24 | patterns: 25 | - "*" 26 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-npm/abx_plugin_npm/__init__.py: -------------------------------------------------------------------------------- 1 | __label__ = 'NPM' 2 | __author__ = 'ArchiveBox' 3 | __homepage__ = 'https://www.npmjs.com/' 4 | 5 | import abx 6 | 7 | 8 | @abx.hookimpl 9 | def get_CONFIG(): 10 | from .config import NPM_CONFIG 11 | return { 12 | 'NPM_CONFIG': NPM_CONFIG, 13 | } 14 | 15 | @abx.hookimpl 16 | def get_BINARIES(): 17 | from .binaries import NODE_BINARY, NPM_BINARY, NPX_BINARY 18 | 19 | return { 20 | 'node': NODE_BINARY, 21 | 'npm': NPM_BINARY, 22 | 'npx': NPX_BINARY, 23 | } 24 | 25 | @abx.hookimpl 26 | def get_BINPROVIDERS(): 27 | from .binproviders import LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER 28 | 29 | return { 30 | 'sys_npm': SYS_NPM_BINPROVIDER, 31 | 'lib_npm': LIB_NPM_BINPROVIDER, 32 | } 33 | -------------------------------------------------------------------------------- /bin/release_git.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ### Bash Environment Setup 4 | # http://redsymbol.net/articles/unofficial-bash-strict-mode/ 5 | # https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html 6 | # set -o xtrace 7 | set -o errexit 8 | set -o errtrace 9 | set -o nounset 10 | set -o pipefail 11 | IFS=$'\n' 12 | 13 | 
REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )" 14 | VERSION="$(grep '^version = ' "${REPO_DIR}/pyproject.toml" | awk -F'"' '{print $2}')" 15 | cd "$REPO_DIR" 16 | 17 | 18 | # Push build to github 19 | echo "[^] Pushing release commit + tag to Github" 20 | git tag -f -a "v$VERSION" -m "v$VERSION" 21 | git push origin -f --tags 22 | echo " To finish publishing the release go here:" 23 | echo " https://github.com/ArchiveBox/ArchiveBox/releases/new" 24 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | contact_links: 3 | - name: ❓ Ask a question or start a discussion 4 | url: https://github.com/ArchiveBox/ArchiveBox/discussions 5 | about: "Ask a question, get support, or start a design discussion (to report a problem please use '🐞 Bug report' instead)" 6 | - name: 💬 Chat with the dev team & community on Zulip 7 | url: https://zulip.archivebox.io 8 | about: "Join us on our Zulip forum to chat with the developers and other users (it's similar to Discord but self-hosted)." 9 | - name: 💁‍♂️ Hire us for professional support with fast response times 10 | url: https://docs.monadical.com/s/archivebox-consulting-services 11 | about: "We provide hosting, development, and support, including on-prem/cloud w/ SSO & storage, CAPTCHA-solving, proxies, etc." 
12 | -------------------------------------------------------------------------------- /archivebox/api/migrations/0004_alter_apitoken_id_alter_apitoken_uuid.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.1 on 2024-08-20 10:44 2 | 3 | import uuid 4 | from django.db import migrations, models 5 | 6 | 7 | class Migration(migrations.Migration): 8 | 9 | dependencies = [ 10 | ('api', '0003_rename_user_apitoken_created_by_apitoken_abid_and_more'), 11 | ] 12 | 13 | operations = [ 14 | migrations.AlterField( 15 | model_name='apitoken', 16 | name='id', 17 | field=models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False), 18 | ), 19 | migrations.AlterField( 20 | model_name='apitoken', 21 | name='uuid', 22 | field=models.UUIDField(blank=True, editable=False, null=True, unique=True), 23 | ), 24 | ] 25 | -------------------------------------------------------------------------------- /archivebox/parsers/generic_jsonl.py: -------------------------------------------------------------------------------- 1 | __package__ = 'archivebox.parsers' 2 | 3 | import json 4 | from typing import IO, Iterable 5 | 6 | from archivebox.misc.util import enforce_types 7 | 8 | from ..index.schema import Link 9 | from .generic_json import jsonObjectToLink 10 | 11 | def parse_line(line: str): 12 | if line.strip() != "": 13 | return json.loads(line) 14 | 15 | @enforce_types 16 | def parse_generic_jsonl_export(json_file: IO[str], **_kwargs) -> Iterable[Link]: 17 | """Parse JSONL format bookmarks export files""" 18 | 19 | json_file.seek(0) 20 | 21 | links = [ parse_line(line) for line in json_file ] 22 | 23 | for link in links: 24 | if link: 25 | yield jsonObjectToLink(link,json_file.name) 26 | 27 | KEY = 'jsonl' 28 | NAME = 'Generic JSONL' 29 | PARSER = parse_generic_jsonl_export 30 | -------------------------------------------------------------------------------- /etc/fly.toml: 
-------------------------------------------------------------------------------- 1 | # fly.toml file generated for archivebox on 2021-04-23T16:35:11-04:00 2 | 3 | app = "archivebox" 4 | 5 | kill_signal = "SIGINT" 6 | kill_timeout = 5 7 | 8 | [env] 9 | 10 | [mounts] 11 | source="archivebox_data" 12 | destination="/data" 13 | 14 | [experimental] 15 | auto_rollback = true 16 | 17 | [[services]] 18 | http_checks = [] 19 | internal_port = 8000 20 | protocol = "tcp" 21 | script_checks = [] 22 | 23 | [services.concurrency] 24 | hard_limit = 25 25 | soft_limit = 20 26 | type = "connections" 27 | 28 | [[services.ports]] 29 | handlers = ["http"] 30 | port = 80 31 | 32 | [[services.ports]] 33 | handlers = ["tls", "http"] 34 | port = 443 35 | 36 | [[services.tcp_checks]] 37 | grace_period = "1s" 38 | interval = "15s" 39 | restart_limit = 6 40 | timeout = "2s" 41 | -------------------------------------------------------------------------------- /tests/fixtures.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | 4 | import pytest 5 | 6 | @pytest.fixture 7 | def process(tmp_path): 8 | os.chdir(tmp_path) 9 | process = subprocess.run(['archivebox', 'init'], capture_output=True) 10 | return process 11 | 12 | @pytest.fixture 13 | def disable_extractors_dict(): 14 | env = os.environ.copy() 15 | env.update({ 16 | "USE_WGET": "false", 17 | "USE_SINGLEFILE": "false", 18 | "USE_READABILITY": "false", 19 | "USE_MERCURY": "false", 20 | "SAVE_HTMLTOTEXT": "false", 21 | "SAVE_PDF": "false", 22 | "SAVE_SCREENSHOT": "false", 23 | "SAVE_DOM": "false", 24 | "SAVE_HEADERS": "false", 25 | "USE_GIT": "false", 26 | "SAVE_MEDIA": "false", 27 | "SAVE_ARCHIVE_DOT_ORG": "false" 28 | }) 29 | return env 30 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0022_auto_20231023_2008.py: -------------------------------------------------------------------------------- 1 | # 
Generated by Django 3.1.14 on 2023-10-23 20:08 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('core', '0021_auto_20220914_0934'), 10 | ] 11 | 12 | operations = [ 13 | migrations.AlterField( 14 | model_name='archiveresult', 15 | name='extractor', 16 | field=models.CharField(choices=[('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('title', 'title'), ('readability', 'readability'), ('mercury', 'mercury'), ('htmltotext', 'htmltotext'), ('git', 'git'), ('media', 'media'), ('archive_org', 'archive_org')], max_length=32), 17 | ), 18 | ] 19 | -------------------------------------------------------------------------------- /archivebox/api/migrations/0007_alter_apitoken_created_by.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.1 on 2024-08-20 22:52 2 | 3 | import django.db.models.deletion 4 | from django.conf import settings 5 | from django.db import migrations, models 6 | 7 | import archivebox.base_models.models 8 | 9 | 10 | class Migration(migrations.Migration): 11 | 12 | dependencies = [ 13 | ('api', '0006_remove_outboundwebhook_uuid_apitoken_id_and_more'), 14 | migrations.swappable_dependency(settings.AUTH_USER_MODEL), 15 | ] 16 | 17 | operations = [ 18 | migrations.AlterField( 19 | model_name='apitoken', 20 | name='created_by', 21 | field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL), 22 | ), 23 | ] 24 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-sonic-search/abx_plugin_sonic_search/__init__.py: -------------------------------------------------------------------------------- 1 | __package__ = 'abx_plugin_sonic_search' 2 | __label__ = 
'Sonic Search' 3 | __homepage__ = 'https://github.com/valeriansaliou/sonic' 4 | 5 | import abx 6 | 7 | 8 | @abx.hookimpl 9 | def get_CONFIG(): 10 | from .config import SONIC_CONFIG 11 | 12 | return { 13 | 'SONIC_CONFIG': SONIC_CONFIG 14 | } 15 | 16 | 17 | @abx.hookimpl 18 | def get_BINARIES(): 19 | from .binaries import SONIC_BINARY 20 | 21 | return { 22 | 'sonic': SONIC_BINARY 23 | } 24 | 25 | 26 | @abx.hookimpl 27 | def get_SEARCHBACKENDS(): 28 | from .searchbackend import SONIC_SEARCH_BACKEND 29 | 30 | return { 31 | 'sonic': SONIC_SEARCH_BACKEND, 32 | } 33 | 34 | @abx.hookimpl 35 | def ready(): 36 | from .config import SONIC_CONFIG 37 | SONIC_CONFIG.validate() 38 | -------------------------------------------------------------------------------- /archivebox/workers/admin.py: -------------------------------------------------------------------------------- 1 | __package__ = 'archivebox.workers' 2 | 3 | import abx 4 | 5 | from django.contrib.auth import get_permission_codename 6 | 7 | from huey_monitor.apps import HueyMonitorConfig 8 | from huey_monitor.admin import TaskModel, TaskModelAdmin, SignalInfoModel, SignalInfoModelAdmin 9 | 10 | 11 | HueyMonitorConfig.verbose_name = 'Background Workers' 12 | 13 | 14 | class CustomTaskModelAdmin(TaskModelAdmin): 15 | actions = ["delete_selected"] 16 | 17 | def has_delete_permission(self, request, obj=None): 18 | codename = get_permission_codename("delete", self.opts) 19 | return request.user.has_perm("%s.%s" % (self.opts.app_label, codename)) 20 | 21 | 22 | 23 | @abx.hookimpl 24 | def register_admin(admin_site): 25 | admin_site.register(TaskModel, CustomTaskModelAdmin) 26 | admin_site.register(SignalInfoModel, SignalInfoModelAdmin) 27 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-git/abx_plugin_git/config.py: -------------------------------------------------------------------------------- 1 | __package__ = 'abx_plugin_git' 2 | 3 | from typing import List 4 | 5 
| from pydantic import Field 6 | 7 | from abx_spec_config.base_configset import BaseConfigSet 8 | 9 | from archivebox.config.common import ARCHIVING_CONFIG 10 | 11 | 12 | class GitConfig(BaseConfigSet): 13 | 14 | SAVE_GIT: bool = True 15 | 16 | GIT_DOMAINS: str = Field(default='github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht') 17 | 18 | GIT_BINARY: str = Field(default='git') 19 | GIT_ARGS: List[str] = [ 20 | '--recursive', 21 | ] 22 | GIT_EXTRA_ARGS: List[str] = [] 23 | 24 | GIT_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT) 25 | GIT_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY) 26 | 27 | 28 | GIT_CONFIG = GitConfig() 29 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-singlefile/abx_plugin_singlefile/__init__.py: -------------------------------------------------------------------------------- 1 | __package__ = 'abx_plugin_singlefile' 2 | __label__ = 'Singlefile' 3 | __homepage__ = 'https://github.com/gildas-lormeau/singlefile' 4 | 5 | import abx 6 | 7 | 8 | @abx.hookimpl 9 | def get_CONFIG(): 10 | from .config import SINGLEFILE_CONFIG 11 | 12 | return { 13 | 'SINGLEFILE_CONFIG': SINGLEFILE_CONFIG 14 | } 15 | 16 | @abx.hookimpl 17 | def get_BINARIES(): 18 | from .binaries import SINGLEFILE_BINARY 19 | 20 | return { 21 | 'singlefile': SINGLEFILE_BINARY, 22 | } 23 | 24 | @abx.hookimpl 25 | def get_EXTRACTORS(): 26 | from .extractors import SINGLEFILE_EXTRACTOR 27 | 28 | return { 29 | 'singlefile': SINGLEFILE_EXTRACTOR, 30 | } 31 | 32 | @abx.hookimpl 33 | def get_INSTALLED_APPS(): 34 | # needed to load ./models.py 35 | return [__package__] 36 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0041_alter_archiveresult_snapshot_and_more.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.0.6 on 
2024-08-18 06:50 2 | 3 | import django.db.models.deletion 4 | from django.db import migrations, models 5 | 6 | 7 | class Migration(migrations.Migration): 8 | 9 | dependencies = [ 10 | ('core', '0040_archiveresult_snapshot'), 11 | ] 12 | 13 | operations = [ 14 | migrations.AlterField( 15 | model_name='archiveresult', 16 | name='snapshot', 17 | field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.snapshot', to_field='id'), 18 | ), 19 | migrations.AlterField( 20 | model_name='archiveresult', 21 | name='snapshot_old', 22 | field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='archiveresults_old', to='core.snapshot'), 23 | ), 24 | ] 25 | -------------------------------------------------------------------------------- /bin/build_docs.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ### Bash Environment Setup 4 | # http://redsymbol.net/articles/unofficial-bash-strict-mode/ 5 | # https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html 6 | # set -o xtrace 7 | set -o errexit 8 | set -o errtrace 9 | set -o nounset 10 | set -o pipefail 11 | IFS=$'\n' 12 | 13 | REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )" 14 | 15 | if [[ -f "$REPO_DIR/.venv/bin/activate" ]]; then 16 | source "$REPO_DIR/.venv/bin/activate" 17 | else 18 | echo "[!] 
Warning: No virtualenv presesnt in $REPO_DIR.venv" 19 | fi 20 | cd "$REPO_DIR" 21 | 22 | 23 | echo "[*] Fetching latest docs version" 24 | cd "$REPO_DIR/docs" 25 | git pull 26 | cd "$REPO_DIR" 27 | 28 | echo "[+] Building docs" 29 | cd "$REPO_DIR/docs" 30 | make clean 31 | make html 32 | # open docs/_build/html/index.html to see the output 33 | cd "$REPO_DIR" 34 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: Run linters 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | 7 | env: 8 | MAX_LINE_LENGTH: 110 9 | 10 | jobs: 11 | lint: 12 | runs-on: ubuntu-20.04 13 | steps: 14 | - uses: actions/checkout@v2 15 | with: 16 | submodules: true 17 | fetch-depth: 1 18 | 19 | - name: Set up Python 20 | uses: actions/setup-python@v1 21 | with: 22 | python-version: 3.9 23 | architecture: x64 24 | 25 | - name: Install flake8 26 | run: | 27 | pip install flake8 28 | 29 | - name: Lint with flake8 30 | run: | 31 | cd archivebox 32 | # one pass for show-stopper syntax errors or undefined names 33 | flake8 . --count --show-source --statistics 34 | # one pass for small stylistic things 35 | flake8 . 
--count --max-line-length="$MAX_LINE_LENGTH" --statistics 36 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0047_alter_snapshottag_unique_together_and_more.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.0.6 on 2024-08-20 02:16 2 | 3 | import django.db.models.deletion 4 | from django.db import migrations, models 5 | 6 | 7 | class Migration(migrations.Migration): 8 | 9 | dependencies = [ 10 | ('core', '0046_alter_archiveresult_snapshot_alter_snapshot_id_and_more'), 11 | ] 12 | 13 | operations = [ 14 | migrations.AlterField( 15 | model_name='archiveresult', 16 | name='snapshot', 17 | field=models.ForeignKey(db_column='snapshot_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot', to_field='id'), 18 | ), 19 | migrations.AlterField( 20 | model_name='snapshottag', 21 | name='tag', 22 | field=models.ForeignKey(db_column='tag_id', on_delete=django.db.models.deletion.CASCADE, to='core.tag'), 23 | ), 24 | ] 25 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0048_alter_archiveresult_snapshot_and_more.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.0.6 on 2024-08-20 02:17 2 | 3 | import django.db.models.deletion 4 | from django.db import migrations, models 5 | 6 | 7 | class Migration(migrations.Migration): 8 | 9 | dependencies = [ 10 | ('core', '0047_alter_snapshottag_unique_together_and_more'), 11 | ] 12 | 13 | operations = [ 14 | migrations.AlterField( 15 | model_name='archiveresult', 16 | name='snapshot', 17 | field=models.ForeignKey(db_column='snapshot_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot'), 18 | ), 19 | migrations.AlterField( 20 | model_name='snapshottag', 21 | name='snapshot', 22 | field=models.ForeignKey(db_column='snapshot_id', 
on_delete=django.db.models.deletion.CASCADE, to='core.snapshot', to_field='old_id'), 23 | ), 24 | ] 25 | -------------------------------------------------------------------------------- /archivebox/core/__init__.py: -------------------------------------------------------------------------------- 1 | __package__ = 'archivebox.core' 2 | __order__ = 100 3 | import abx 4 | 5 | @abx.hookimpl 6 | def register_admin(admin_site): 7 | """Register the core.models views (Snapshot, ArchiveResult, Tag, etc.) with the admin site""" 8 | from core.admin import register_admin 9 | register_admin(admin_site) 10 | 11 | 12 | 13 | @abx.hookimpl 14 | def get_CONFIG(): 15 | from archivebox.config.common import ( 16 | SHELL_CONFIG, 17 | STORAGE_CONFIG, 18 | GENERAL_CONFIG, 19 | SERVER_CONFIG, 20 | ARCHIVING_CONFIG, 21 | SEARCH_BACKEND_CONFIG, 22 | ) 23 | return { 24 | 'SHELL_CONFIG': SHELL_CONFIG, 25 | 'STORAGE_CONFIG': STORAGE_CONFIG, 26 | 'GENERAL_CONFIG': GENERAL_CONFIG, 27 | 'SERVER_CONFIG': SERVER_CONFIG, 28 | 'ARCHIVING_CONFIG': ARCHIVING_CONFIG, 29 | 'SEARCHBACKEND_CONFIG': SEARCH_BACKEND_CONFIG, 30 | } 31 | 32 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0064_alter_snapshottag_unique_together_and_more.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.0.6 on 2024-08-20 03:50 2 | 3 | import django.db.models.deletion 4 | from django.db import migrations, models 5 | 6 | 7 | class Migration(migrations.Migration): 8 | 9 | dependencies = [ 10 | ('core', '0063_snapshottag_tag_alter_snapshottag_old_tag'), 11 | ] 12 | 13 | operations = [ 14 | migrations.AlterUniqueTogether( 15 | name='snapshottag', 16 | unique_together=set(), 17 | ), 18 | migrations.AlterField( 19 | model_name='snapshottag', 20 | name='tag', 21 | field=models.ForeignKey(db_column='tag_id', on_delete=django.db.models.deletion.CASCADE, to='core.tag', to_field='id'), 22 | ), 23 | 
migrations.AlterUniqueTogether( 24 | name='snapshottag', 25 | unique_together={('snapshot', 'tag')}, 26 | ), 27 | ] 28 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-spec-archivebox/abx_spec_archivebox/__init__.py: -------------------------------------------------------------------------------- 1 | __package__ = 'abx_spec_archivebox' 2 | __order__ = 400 3 | 4 | # from .effects import * 5 | # from .events import * 6 | # from .reads import * 7 | # from .writes import * 8 | # from .states import * 9 | 10 | from typing import cast 11 | 12 | import abx 13 | from abx_spec_config import ConfigPluginSpec 14 | from abx_spec_abx_pkg import AbxPkgPluginSpec 15 | from abx_spec_django import DjangoPluginSpec 16 | from abx_spec_searchbackend import SearchBackendPluginSpec 17 | 18 | class ArchiveBoxPluginSpec(ConfigPluginSpec, AbxPkgPluginSpec, DjangoPluginSpec, SearchBackendPluginSpec): 19 | """ 20 | ArchiveBox plugins can use any of the hooks from the Config, AbxPkg, Django, and SearchBackend plugin specs. 21 | """ 22 | pass 23 | 24 | PLUGIN_SPEC = ArchiveBoxPluginSpec 25 | 26 | 27 | TypedPluginManager = abx.ABXPluginManager[ArchiveBoxPluginSpec] 28 | pm = cast(TypedPluginManager, abx.pm) 29 | -------------------------------------------------------------------------------- /archivebox/workers/tests.py: -------------------------------------------------------------------------------- 1 | from django.test import TestCase 2 | 3 | # Create your tests here. 
4 | 5 | 6 | class CrawlActorTest(TestCase): 7 | 8 | def test_crawl_creation(self): 9 | seed = Seed.objects.create(uri='https://example.com') 10 | Event.dispatch('CRAWL_CREATE', {'seed_id': seed.id}) 11 | 12 | crawl_actor = CrawlActor() 13 | 14 | output_events = list(crawl_actor.process_next_event()) 15 | 16 | assert len(output_events) == 1 17 | assert output_events[0].get('name', 'unset') == 'FS_WRITE' 18 | assert output_events[0].get('path') == '/tmp/test_crawl/index.json' 19 | 20 | output_events = list(crawl_actor.process_next_event()) 21 | assert len(output_events) == 1 22 | assert output_events[0].get('name', 'unset') == 'CRAWL_CREATED' 23 | 24 | assert Crawl.objects.filter(seed_id=seed.id).exists(), 'Crawl was not created' 25 | 26 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0005_auto_20200728_0326.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 3.0.7 on 2020-07-28 03:26 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('core', '0004_auto_20200713_1552'), 10 | ] 11 | 12 | operations = [ 13 | migrations.AlterField( 14 | model_name='snapshot', 15 | name='tags', 16 | field=models.CharField(blank=True, db_index=True, max_length=256, null=True), 17 | ), 18 | migrations.AlterField( 19 | model_name='snapshot', 20 | name='title', 21 | field=models.CharField(blank=True, db_index=True, max_length=128, null=True), 22 | ), 23 | migrations.AlterField( 24 | model_name='snapshot', 25 | name='updated', 26 | field=models.DateTimeField(blank=True, db_index=True, null=True), 27 | ), 28 | ] 29 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0052_alter_snapshottag_unique_together_and_more.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.0.6 on 
2024-08-20 02:37 2 | 3 | import django.db.models.deletion 4 | from django.db import migrations, models 5 | 6 | 7 | class Migration(migrations.Migration): 8 | 9 | dependencies = [ 10 | ('core', '0051_snapshottag_snapshot_alter_snapshottag_snapshot_old'), 11 | ] 12 | 13 | operations = [ 14 | migrations.AlterUniqueTogether( 15 | name='snapshottag', 16 | unique_together=set(), 17 | ), 18 | migrations.AlterField( 19 | model_name='snapshottag', 20 | name='snapshot', 21 | field=models.ForeignKey(db_column='snapshot_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot'), 22 | ), 23 | migrations.AlterUniqueTogether( 24 | name='snapshottag', 25 | unique_together={('snapshot', 'tag')}, 26 | ), 27 | ] 28 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-singlefile/abx_plugin_singlefile/config.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import List, Optional 3 | 4 | from pydantic import Field 5 | 6 | from abx_spec_config.base_configset import BaseConfigSet 7 | 8 | from archivebox.config.common import ARCHIVING_CONFIG 9 | 10 | 11 | class SinglefileConfig(BaseConfigSet): 12 | SAVE_SINGLEFILE: bool = True 13 | 14 | SINGLEFILE_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT) 15 | SINGLEFILE_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT) 16 | SINGLEFILE_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY) 17 | SINGLEFILE_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE) 18 | 19 | SINGLEFILE_BINARY: str = Field(default='single-file') 20 | SINGLEFILE_EXTRA_ARGS: List[str] = [] 21 | 22 | 23 | SINGLEFILE_CONFIG = SinglefileConfig() 24 | -------------------------------------------------------------------------------- /archivebox/templates/core/minimal_index.html: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Archived Sites 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | {% for link in links %} 19 | {% include "index_row.html" with link=link %} 20 | {% endfor %} 21 | 22 |
BookmarkedSaved Link ({{num_links}})FilesOriginal URL
23 | 24 | 25 | -------------------------------------------------------------------------------- /bin/release.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ### Bash Environment Setup 4 | # http://redsymbol.net/articles/unofficial-bash-strict-mode/ 5 | # https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html 6 | # set -o xtrace 7 | set -o errexit 8 | set -o errtrace 9 | set -o nounset 10 | set -o pipefail 11 | IFS=$'\n' 12 | 13 | REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )" 14 | cd "$REPO_DIR" 15 | 16 | 17 | # Run the linters and tests 18 | # ./bin/lint.sh 19 | # ./bin/test.sh 20 | 21 | # # Run all the build scripts 22 | # ./bin/build_git.sh 23 | # ./bin/build_docs.sh 24 | # ./bin/build_pip.sh 25 | # ./bin/build_docker.sh 26 | 27 | # Push release to public repositories 28 | # ./bin/release_docs.sh 29 | ./bin/release_git.sh "$@" 30 | ./bin/release_pip.sh "$@" 31 | ./bin/release_docker.sh "$@" 32 | 33 | VERSION="$(grep '^version = ' "${REPO_DIR}/pyproject.toml" | awk -F'"' '{print $2}')" 34 | echo "[√] Done. 
Published version v$VERSION" 35 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0036_alter_archiveresult_id_alter_archiveresult_old_id.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.0.6 on 2024-08-18 05:59 2 | 3 | import core.models 4 | import uuid 5 | import random 6 | from django.db import migrations, models 7 | 8 | 9 | def rand_int_id(): 10 | return random.getrandbits(32) 11 | 12 | class Migration(migrations.Migration): 13 | 14 | dependencies = [ 15 | ('core', '0035_remove_archiveresult_uuid_archiveresult_id'), 16 | ] 17 | 18 | operations = [ 19 | migrations.AlterField( 20 | model_name='archiveresult', 21 | name='id', 22 | field=models.UUIDField(default=uuid.uuid4, primary_key=True, serialize=False, unique=True, verbose_name='ID'), 23 | ), 24 | migrations.AlterField( 25 | model_name='archiveresult', 26 | name='old_id', 27 | field=models.BigIntegerField(default=rand_int_id, serialize=False, verbose_name='Old ID'), 28 | ), 29 | ] 30 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-spec-archivebox/abx_spec_archivebox/events.py: -------------------------------------------------------------------------------- 1 | """ 2 | Hookspec for ArchiveBox system events that plugins can hook into. 3 | 4 | Loosely modeled after Django's signals architecture. 5 | https://docs.djangoproject.com/en/5.1/ref/signals/ 6 | """ 7 | 8 | __package__ = 'abx.archivebox' 9 | 10 | import abx 11 | 12 | 13 | 14 | @abx.hookspec 15 | def on_crawl_schedule_tick(crawl_schedule): 16 | pass 17 | 18 | 19 | 20 | 21 | @abx.hookspec 22 | def on_seed_post_save(seed, created=False): 23 | ... 24 | 25 | @abx.hookspec 26 | def on_crawl_post_save(crawl, created=False): 27 | ... 28 | 29 | 30 | @abx.hookspec 31 | def on_snapshot_post_save(snapshot, created=False): 32 | ... 
33 | 34 | # @abx.hookspec 35 | # def on_snapshot_post_delete(snapshot): 36 | # ... 37 | 38 | 39 | @abx.hookspec 40 | def on_archiveresult_post_save(archiveresult, created=False): 41 | ... 42 | 43 | # @abx.hookspec 44 | # def on_archiveresult_post_delete(archiveresult): 45 | # ... 46 | -------------------------------------------------------------------------------- /archivebox/parsers/url_list.py: -------------------------------------------------------------------------------- 1 | __package__ = 'archivebox.parsers' 2 | __description__ = 'URL list' 3 | 4 | import re 5 | 6 | from typing import IO, Iterable 7 | from datetime import datetime, timezone 8 | 9 | from ..index.schema import Link 10 | from archivebox.misc.util import ( 11 | enforce_types, 12 | URL_REGEX, 13 | ) 14 | 15 | 16 | @enforce_types 17 | def parse_url_list(text_file: IO[str], **_kwargs) -> Iterable[Link]: 18 | """Parse raw URLs from each line in a text file""" 19 | 20 | text_file.seek(0) 21 | for line in text_file.readlines(): 22 | url = line.strip() 23 | if (not url) or not re.findall(URL_REGEX, url): 24 | continue 25 | 26 | yield Link( 27 | url=url, 28 | timestamp=str(datetime.now(timezone.utc).timestamp()), 29 | title=None, 30 | tags=None, 31 | sources=[text_file.name], 32 | ) 33 | 34 | 35 | KEY = 'url_list' 36 | NAME = 'URL List' 37 | PARSER = parse_url_list 38 | -------------------------------------------------------------------------------- /etc/archivebox.service: -------------------------------------------------------------------------------- 1 | # This is an example systemd service config definition for ArchiveBox. 
2 | # 3 | # Link it into place on your system to use systemd to auto-start the ArchiveBox server on boot: 4 | # https://unix.stackexchange.com/questions/224992/where-do-i-put-my-systemd-unit-file 5 | # 6 | # Review and change these lines as-needed for your specific environment and needs: 7 | # WorkingDirectory, ExecStart, User, Group 8 | 9 | [Unit] 10 | Description=Open source self-hosted web archiving 11 | Documentation=https://github.com/ArchiveBox/ArchiveBox/wiki 12 | 13 | [Service] 14 | Type=simple 15 | WorkingDirectory=/home/archivebox/archivebox/ 16 | ExecStart=/usr/local/bin/archivebox server 0.0.0.0:8000 17 | ExecReload=/bin/kill -s HUP $MAINPID 18 | ExecStop=/bin/kill -s QUIT $MAINPID 19 | Restart=always 20 | RestartSec=2 21 | StandardOutput=syslog 22 | StandardError=syslog 23 | SyslogIdentifier=archivebox 24 | User=archivebox 25 | Group=archivebox 26 | 27 | 28 | [Install] 29 | WantedBy=multi-user.target 30 | -------------------------------------------------------------------------------- /website/assets/css/style.scss: -------------------------------------------------------------------------------- 1 | --- 2 | --- 3 | 4 | @import "{{ site.theme }}"; 5 | 6 | div.shell { 7 | width: 80%; 8 | max-width: 1300px; 9 | min-width: 300px; 10 | } 11 | 12 | span.banner-fix { 13 | width: 80%; 14 | max-width: 1300px; 15 | min-width: 300px; 16 | } 17 | 18 | header h1 { 19 | background-color: #aa1f55; 20 | padding-bottom: 15px; 21 | font-weight: 200px; 22 | } 23 | header h2 { 24 | background-color: #aa1f55; 25 | font-family: 'Open Sans'; 26 | } 27 | 28 | #main_content div[align=center] h1 { 29 | display: none; 30 | } 31 | #main_content img { 32 | box-shadow: 4px 4px 4px rgba(0,0,0,0.1); 33 | border-radius: 8px; 34 | border: 0px; 35 | vertical-align: top; 36 | } 37 | #main_content em img { 38 | display: block; 39 | margin-top: -83px; 40 | padding: 0px; 41 | margin-bottom: 20px; 42 | } 43 | 44 | #main_content img[alt=comparison] { 45 | margin: 25px; 46 | } 47 | 48 | 
#forkme_banner { 49 | opacity: 0.1; 50 | } 51 | -------------------------------------------------------------------------------- /archivebox/api/migrations/0006_remove_outboundwebhook_uuid_apitoken_id_and_more.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.1 on 2024-08-20 22:43 2 | 3 | import uuid 4 | from django.db import migrations, models 5 | 6 | 7 | class Migration(migrations.Migration): 8 | 9 | dependencies = [ 10 | ('api', '0005_remove_apitoken_uuid_remove_outboundwebhook_uuid_and_more'), 11 | ] 12 | 13 | operations = [ 14 | migrations.RenameField( 15 | model_name='outboundwebhook', 16 | old_name='uuid', 17 | new_name='id' 18 | ), 19 | migrations.AlterField( 20 | model_name='outboundwebhook', 21 | name='id', 22 | field=models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False), 23 | ), 24 | migrations.AlterField( 25 | model_name='apitoken', 26 | name='id', 27 | field=models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False), 28 | ), 29 | ] 30 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0011_auto_20210216_1331.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 3.1.3 on 2021-02-16 13:31 2 | 3 | from django.db import migrations, models 4 | import uuid 5 | 6 | 7 | class Migration(migrations.Migration): 8 | 9 | dependencies = [ 10 | ('core', '0010_auto_20210216_1055'), 11 | ] 12 | 13 | operations = [ 14 | migrations.AddField( 15 | model_name='archiveresult', 16 | name='uuid', 17 | field=models.UUIDField(default=uuid.uuid4, editable=False), 18 | ), 19 | migrations.AlterField( 20 | model_name='archiveresult', 21 | name='extractor', 22 | field=models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 
'dom'), ('wget', 'wget'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('archive_org', 'archive_org')], max_length=32), 23 | ), 24 | ] 25 | -------------------------------------------------------------------------------- /tests/mock_server/templates/example.atom: -------------------------------------------------------------------------------- 1 | 2 | 6 | http://www.example.com/ 7 | Example of an Atom feed 8 | 9 | 10 | 11 | Jim Winstead 12 | 13 | 2024-02-26T03:18:26Z 14 | 15 | Example 16 | 17 | tag:example.com,2024-02-25:3319 18 | 2024-02-26T03:18:26Z 19 | 2024-02-25T19:18:25-08:00 20 | 21 | 22 | This is some <b>content</b> 23 | 24 | 25 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0001_initial.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 2.2 on 2019-05-01 03:27 2 | 3 | from django.db import migrations, models 4 | import uuid 5 | 6 | 7 | class Migration(migrations.Migration): 8 | 9 | initial = True 10 | 11 | dependencies = [ 12 | ] 13 | 14 | operations = [ 15 | migrations.CreateModel( 16 | name='Snapshot', 17 | fields=[ 18 | ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)), 19 | ('url', models.URLField(unique=True)), 20 | ('timestamp', models.CharField(default=None, max_length=32, null=True, unique=True)), 21 | ('title', models.CharField(default=None, max_length=128, null=True)), 22 | ('tags', models.CharField(default=None, max_length=256, null=True)), 23 | ('added', models.DateTimeField(auto_now_add=True)), 24 | ('updated', models.DateTimeField(default=None, null=True)), 25 | ], 26 | ), 27 | ] 28 | -------------------------------------------------------------------------------- /archivebox/cli/archivebox_worker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | __package__ = 
'archivebox.cli' 4 | __command__ = 'archivebox worker' 5 | 6 | import sys 7 | import json 8 | 9 | import rich_click as click 10 | 11 | 12 | @click.command() 13 | @click.argument('worker_type') 14 | @click.option('--wait-for-first-event', is_flag=True) 15 | @click.option('--exit-on-idle', is_flag=True) 16 | def main(worker_type: str, wait_for_first_event: bool, exit_on_idle: bool): 17 | """Start an ArchiveBox worker process of the given type""" 18 | 19 | from workers.worker import get_worker_type 20 | 21 | # allow piping in events to process from stdin 22 | # if not sys.stdin.isatty(): 23 | # for line in sys.stdin.readlines(): 24 | # Event.dispatch(event=json.loads(line), parent=None) 25 | 26 | # run the actor 27 | Worker = get_worker_type(worker_type) 28 | for event in Worker.run(wait_for_first_event=wait_for_first_event, exit_on_idle=exit_on_idle): 29 | print(event) 30 | 31 | 32 | if __name__ == '__main__': 33 | main() 34 | -------------------------------------------------------------------------------- /archivebox/core/asgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | ASGI config for archivebox project. 3 | 4 | It exposes the ASGI callable as a module-level variable named ``application``. 
5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/5.1/howto/deployment/asgi/ 8 | """ 9 | 10 | from archivebox.config.django import setup_django 11 | 12 | setup_django(in_memory_db=False, check_db=True) 13 | 14 | 15 | # from channels.auth import AuthMiddlewareStack 16 | # from channels.security.websocket import AllowedHostsOriginValidator 17 | from channels.routing import ProtocolTypeRouter # , URLRouter 18 | from django.core.asgi import get_asgi_application 19 | 20 | # from core.routing import websocket_urlpatterns 21 | 22 | 23 | django_asgi_app = get_asgi_application() 24 | 25 | application = ProtocolTypeRouter( 26 | { 27 | "http": django_asgi_app, 28 | # only if we need websocket support later: 29 | # "websocket": AllowedHostsOriginValidator( 30 | # AuthMiddlewareStack(URLRouter(websocket_urlpatterns)) 31 | # ), 32 | } 33 | ) 34 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-pip/abx_plugin_pip/__init__.py: -------------------------------------------------------------------------------- 1 | __package__ = 'abx_plugin_pip' 2 | __label__ = 'PIP' 3 | __order__ = 200 4 | 5 | import abx 6 | 7 | 8 | @abx.hookimpl 9 | def get_CONFIG(): 10 | from .config import PIP_CONFIG 11 | 12 | return { 13 | 'PIP_CONFIG': PIP_CONFIG 14 | } 15 | 16 | @abx.hookimpl(tryfirst=True) 17 | def get_BINARIES(): 18 | from .binaries import ARCHIVEBOX_BINARY, PYTHON_BINARY, DJANGO_BINARY, SQLITE_BINARY, PIP_BINARY, PIPX_BINARY 19 | 20 | return { 21 | 'archivebox': ARCHIVEBOX_BINARY, 22 | 'python': PYTHON_BINARY, 23 | 'django': DJANGO_BINARY, 24 | 'sqlite': SQLITE_BINARY, 25 | 'pip': PIP_BINARY, 26 | 'pipx': PIPX_BINARY, 27 | } 28 | 29 | @abx.hookimpl 30 | def get_BINPROVIDERS(): 31 | from .binproviders import SYS_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, LIB_PIP_BINPROVIDER 32 | 33 | return { 34 | 'sys_pip': SYS_PIP_BINPROVIDER, 35 | 'venv_pip': VENV_PIP_BINPROVIDER, 36 | 'lib_pip': LIB_PIP_BINPROVIDER, 
37 | } 38 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-readability/abx_plugin_readability/binaries.py: -------------------------------------------------------------------------------- 1 | __package__ = 'abx_plugin_readability' 2 | 3 | from typing import List 4 | 5 | from pydantic import InstanceOf 6 | from abx_pkg import Binary, BinProvider, BinaryOverrides, BinName 7 | 8 | from abx_plugin_default_binproviders import env 9 | from abx_plugin_npm.binproviders import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER 10 | 11 | from .config import READABILITY_CONFIG 12 | 13 | 14 | READABILITY_PACKAGE_NAME = 'github:ArchiveBox/readability-extractor' 15 | 16 | class ReadabilityBinary(Binary): 17 | name: BinName = READABILITY_CONFIG.READABILITY_BINARY 18 | binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env] 19 | 20 | overrides: BinaryOverrides = { 21 | LIB_NPM_BINPROVIDER.name: {"packages": [READABILITY_PACKAGE_NAME]}, 22 | SYS_NPM_BINPROVIDER.name: {"packages": [READABILITY_PACKAGE_NAME], "install": lambda: None}, # prevent modifying system global npm packages 23 | } 24 | 25 | 26 | READABILITY_BINARY = ReadabilityBinary() 27 | -------------------------------------------------------------------------------- /bin/build_git.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ### Bash Environment Setup 4 | # http://redsymbol.net/articles/unofficial-bash-strict-mode/ 5 | # https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html 6 | # set -o xtrace 7 | set -o errexit 8 | set -o errtrace 9 | set -o nounset 10 | set -o pipefail 11 | IFS=$'\n' 12 | 13 | REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. 
&& pwd )" 14 | 15 | cd "$REPO_DIR" 16 | source "./.venv/bin/activate" 17 | 18 | 19 | # Make sure git is clean 20 | if [ -z "$(git status --porcelain)" ] && [[ "$(git branch --show-current)" == "master" ]]; then 21 | git pull 22 | else 23 | echo "[!] Warning: git status is dirty!" 24 | echo " Press Ctrl-C to cancel, or wait 10sec to continue..." 25 | sleep 10 26 | fi 27 | 28 | # Bump version number in source 29 | function bump_semver { 30 | echo "$1" | awk -F. '{$NF = $NF + 1;} 1' | sed 's/ /./g' 31 | } 32 | 33 | # OLD_VERSION="$(grep '^version = ' "${REPO_DIR}/pyproject.toml" | awk -F'"' '{print $2}')" 34 | # NEW_VERSION="$(bump_semver "$OLD_VERSION")" 35 | 36 | -------------------------------------------------------------------------------- /archivebox/index/csv.py: -------------------------------------------------------------------------------- 1 | __package__ = 'archivebox.index' 2 | 3 | from typing import List, Optional, Any 4 | 5 | from archivebox.misc.util import enforce_types 6 | from .schema import Link 7 | 8 | 9 | @enforce_types 10 | def links_to_csv(links: List[Link], 11 | cols: Optional[List[str]]=None, 12 | header: bool=True, 13 | separator: str=',', 14 | ljust: int=0) -> str: 15 | 16 | cols = cols or ['timestamp', 'is_archived', 'url'] 17 | 18 | header_str = '' 19 | if header: 20 | header_str = separator.join(col.ljust(ljust) for col in cols) 21 | 22 | row_strs = ( 23 | link.to_csv(cols=cols, ljust=ljust, separator=separator) 24 | for link in links 25 | ) 26 | 27 | return '\n'.join((header_str, *row_strs)) 28 | 29 | 30 | @enforce_types 31 | def to_csv(obj: Any, cols: List[str], separator: str=',', ljust: int=0) -> str: 32 | from .json import to_json 33 | 34 | return separator.join( 35 | to_json(getattr(obj, col), indent=None).ljust(ljust) 36 | for col in cols 37 | ) 38 | -------------------------------------------------------------------------------- /tests/mock_server/templates/example.jsonl: 
-------------------------------------------------------------------------------- 1 | {"href":"http://127.0.0.1:8080/static/example.com.html","description":"Example","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"Tag1 Tag2","trap":"http://www.example.com/should-not-exist"} 2 | {"href":"http://127.0.0.1:8080/static/iana.org.html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:43Z","shared":"no","toread":"no","tags":"Tag3,Tag4 with Space"} 3 | {"href":"http://127.0.0.1:8080/static/shift_jis.html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:44Z","shared":"no","toread":"no","tags":["Tag5","Tag6 with Space"]} 4 | {"href":"http://127.0.0.1:8080/static/title_og_with_html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:45Z","shared":"no","toread":"no"} 5 | -------------------------------------------------------------------------------- /archivebox/search/admin.py: -------------------------------------------------------------------------------- 1 | __package__ = 'archivebox.search' 2 | 3 | from django.contrib import messages 4 | from django.contrib import admin 5 | 6 | from archivebox.search import query_search_index 7 | 8 | class SearchResultsAdminMixin(admin.ModelAdmin): 9 | def get_search_results(self, request, queryset, search_term: str): 10 | """Enhances the search queryset with results from the search backend""" 11 | 12 | qs, use_distinct = super().get_search_results(request, queryset, search_term) 13 | 14 | search_term = search_term.strip() 15 | if not search_term: 16 | return qs.distinct(), use_distinct 17 | try: 18 | qsearch = 
query_search_index(search_term) 19 | qs = qs | qsearch 20 | except Exception as err: 21 | print(f'[!] Error while using search backend: {err.__class__.__name__} {err}') 22 | messages.add_message(request, messages.WARNING, f'Error from the search backend, only showing results from default admin search fields - Error: {err}') 23 | 24 | return qs.distinct(), use_distinct 25 | -------------------------------------------------------------------------------- /archivebox/api/migrations/0001_initial.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 4.2.11 on 2024-04-25 04:19 2 | 3 | import api.models 4 | from django.conf import settings 5 | from django.db import migrations, models 6 | import django.db.models.deletion 7 | import uuid 8 | 9 | 10 | class Migration(migrations.Migration): 11 | 12 | initial = True 13 | 14 | dependencies = [ 15 | migrations.swappable_dependency(settings.AUTH_USER_MODEL), 16 | ] 17 | 18 | operations = [ 19 | migrations.CreateModel( 20 | name='APIToken', 21 | fields=[ 22 | ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)), 23 | ('token', models.CharField(default=api.models.generate_secret_token, max_length=32, unique=True)), 24 | ('created', models.DateTimeField(auto_now_add=True)), 25 | ('expires', models.DateTimeField(blank=True, null=True)), 26 | ('user', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)), 27 | ], 28 | ), 29 | ] 30 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-ripgrep-search/abx_plugin_ripgrep_search/config.py: -------------------------------------------------------------------------------- 1 | __package__ = 'abx_plugin_ripgrep_search' 2 | 3 | from pathlib import Path 4 | from typing import List 5 | 6 | from pydantic import Field 7 | 8 | from abx_spec_config.base_configset import BaseConfigSet 9 | 10 | from 
archivebox.config import CONSTANTS 11 | from archivebox.config.common import SEARCH_BACKEND_CONFIG 12 | 13 | 14 | class RipgrepConfig(BaseConfigSet): 15 | RIPGREP_BINARY: str = Field(default='rg') 16 | 17 | RIPGREP_IGNORE_EXTENSIONS: str = Field(default='css,js,orig,svg') 18 | RIPGREP_ARGS_DEFAULT: List[str] = Field(default=lambda c: [ 19 | # https://github.com/BurntSushi/ripgrep/blob/master/GUIDE.md 20 | f'--type-add=ignore:*.{{{c.RIPGREP_IGNORE_EXTENSIONS}}}', 21 | '--type-not=ignore', 22 | '--ignore-case', 23 | '--files-with-matches', 24 | '--regexp', 25 | ]) 26 | RIPGREP_SEARCH_DIR: Path = CONSTANTS.ARCHIVE_DIR 27 | RIPGREP_TIMEOUT: int = Field(default=lambda: SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_TIMEOUT) 28 | 29 | RIPGREP_CONFIG = RipgrepConfig() 30 | -------------------------------------------------------------------------------- /tests/mock_server/templates/example.json: -------------------------------------------------------------------------------- 1 | [ 2 | {"href":"http://127.0.0.1:8080/static/example.com.html","description":"Example","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"Tag1 Tag2","trap":"http://www.example.com/should-not-exist"}, 3 | {"href":"http://127.0.0.1:8080/static/iana.org.html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:43Z","shared":"no","toread":"no","tags":"Tag3,Tag4 with Space"}, 4 | {"href":"http://127.0.0.1:8080/static/shift_jis.html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:44Z","shared":"no","toread":"no","tags":["Tag5","Tag6 with Space"]}, 5 | {"href":"http://127.0.0.1:8080/static/title_og_with_html","description":"Example 
2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:45Z","shared":"no","toread":"no"} 6 | ] 7 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0046_alter_archiveresult_snapshot_alter_snapshot_id_and_more.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.0.6 on 2024-08-20 01:55 2 | 3 | import django.db.models.deletion 4 | import uuid 5 | from django.db import migrations, models 6 | 7 | 8 | class Migration(migrations.Migration): 9 | 10 | dependencies = [ 11 | ('core', '0045_alter_snapshot_old_id'), 12 | ] 13 | 14 | operations = [ 15 | migrations.AlterField( 16 | model_name='archiveresult', 17 | name='snapshot', 18 | field=models.ForeignKey(db_column='snapshot_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot', to_field='id'), 19 | ), 20 | migrations.AlterField( 21 | model_name='snapshot', 22 | name='id', 23 | field=models.UUIDField(default=uuid.uuid4, primary_key=True, serialize=False, unique=True), 24 | ), 25 | migrations.AlterField( 26 | model_name='snapshot', 27 | name='old_id', 28 | field=models.UUIDField(default=uuid.uuid4, editable=False, unique=True), 29 | ), 30 | ] 31 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-readwise/abx_plugin_readwise.py: -------------------------------------------------------------------------------- 1 | __package__ = 'abx_plugin_readwise_extractor' 2 | __id__ = 'abx_plugin_readwise_extractor' 3 | __label__ = 'Readwise API' 4 | __version__ = '2024.10.27' 5 | __author__ = 'ArchiveBox' 6 | __homepage__ = 'https://github.com/ArchiveBox/ArchiveBox/tree/dev/archivebox/pkgs/abx-plugin-readwise-extractor' 7 | __dependencies__ = [] 8 | 9 | import abx 10 | 11 | from typing import Dict 12 | from pathlib import Path 13 | 14 | from pydantic import Field 15 | 16 | from 
abx_spec_config.base_configset import BaseConfigSet 17 | 18 | from archivebox.config import CONSTANTS 19 | 20 | class ReadwiseConfig(BaseConfigSet): 21 | READWISE_DB_PATH: Path = Field(default=CONSTANTS.SOURCES_DIR / "readwise_reader_api.db") 22 | READWISE_READER_TOKENS: Dict[str, str] = Field(default=lambda: {}) # {: , ...} 23 | 24 | 25 | @abx.hookimpl 26 | def get_CONFIG(): 27 | return { 28 | __id__: ReadwiseConfig() 29 | } 30 | 31 | @abx.hookimpl 32 | def ready(): 33 | READWISE_CONFIG = abx.pm.hook.get_CONFIG()[__id__] 34 | READWISE_CONFIG.validate() 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Nick Sweeting 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-curl/abx_plugin_curl/config.py: -------------------------------------------------------------------------------- 1 | __package__ = 'abx_plugin_curl' 2 | 3 | from typing import List, Optional 4 | from pathlib import Path 5 | 6 | from pydantic import Field 7 | 8 | from abx_spec_config.base_configset import BaseConfigSet 9 | 10 | from archivebox.config.common import ARCHIVING_CONFIG 11 | 12 | 13 | class CurlConfig(BaseConfigSet): 14 | 15 | SAVE_TITLE: bool = Field(default=True) 16 | SAVE_HEADERS: bool = Field(default=True) 17 | USE_CURL: bool = Field(default=True) 18 | 19 | CURL_BINARY: str = Field(default='curl') 20 | CURL_ARGS: List[str] = [ 21 | '--silent', 22 | '--location', 23 | '--compressed', 24 | ] 25 | CURL_EXTRA_ARGS: List[str] = [] 26 | 27 | CURL_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT) 28 | CURL_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY) 29 | CURL_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT) 30 | CURL_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE) 31 | 32 | 33 | CURL_CONFIG = CurlConfig() 34 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-sonic-search/abx_plugin_sonic_search/binaries.py: -------------------------------------------------------------------------------- 1 | __package__ = 'abx_plugin_sonic_search' 2 | 3 | from typing import List 4 | 5 | from pydantic import InstanceOf 6 | from abx_pkg import BinProvider, BinaryOverrides, BinName, Binary 7 | 8 | from abx_plugin_default_binproviders import brew, env 9 | 10 | from .config import SONIC_CONFIG 11 | 12 | 13 | class SonicBinary(Binary): 14 | name: BinName = SONIC_CONFIG.SONIC_BINARY 15 | binproviders_supported: List[InstanceOf[BinProvider]] = [brew, env] # TODO: add cargo 16 | 17 | overrides: 
BinaryOverrides = { 18 | brew.name: {'packages': ['sonic']}, 19 | # cargo.name: {'packages': ['sonic-server']}, # TODO: add cargo 20 | } 21 | 22 | # TODO: add version checking over protocol? for when sonic backend is on remote server and binary is not installed locally 23 | # def on_get_version(self): 24 | # with sonic.IngestClient(SONIC_CONFIG.SONIC_HOST, str(SONIC_CONFIG.SONIC_PORT), SONIC_CONFIG.SONIC_PASSWORD) as ingestcl: 25 | # return SemVer.parse(str(ingestcl.protocol)) 26 | 27 | SONIC_BINARY = SonicBinary() 28 | -------------------------------------------------------------------------------- /tests/test_update.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | from .fixtures import * 4 | 5 | def test_update_status_invalid(tmp_path, process, disable_extractors_dict): 6 | subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True, env=disable_extractors_dict) 7 | assert list((tmp_path / "archive").iterdir()) != [] 8 | 9 | a_process = subprocess.run(['archivebox', 'remove', 'http://127.0.0.1:8080/static/example.com.html', '--yes'], capture_output=True) 10 | 11 | conn = sqlite3.connect(str(tmp_path / "index.sqlite3")) 12 | c = conn.cursor() 13 | link = c.execute("SELECT * FROM core_snapshot").fetchone() 14 | conn.commit() 15 | conn.close() 16 | 17 | assert link is None 18 | 19 | update_process = subprocess.run(['archivebox', 'update', '--status=invalid'], capture_output=True, env=disable_extractors_dict) 20 | 21 | conn = sqlite3.connect(str(tmp_path / "index.sqlite3")) 22 | c = conn.cursor() 23 | url = c.execute("SELECT url FROM core_snapshot").fetchone()[0] 24 | conn.commit() 25 | conn.close() 26 | 27 | assert url == 'http://127.0.0.1:8080/static/example.com.html' 28 | -------------------------------------------------------------------------------- /archivebox/misc/debugging.py: 
-------------------------------------------------------------------------------- 1 | from functools import wraps 2 | from time import time 3 | 4 | def timed_function(func): 5 | """ 6 | Very simple profiling decorator for debugging. 7 | Usage: 8 | @timed_function 9 | def my_func(): 10 | ... 11 | 12 | More advanced alternatives: 13 | - viztracer ../.venv/bin/archivebox manage check # https://viztracer.readthedocs.io/en/latest/filter.html 14 | - python -m cProfile -o archivebox.prof ../.venv/bin/archivebox manage check; snakeviz archivebox.prof 15 | - Django Debug Toolbar + django-debug-toolbar-flamegraph 16 | + Django Requests Tracker (requests-tracker) 17 | """ 18 | @wraps(func) 19 | def wrap(*args, **kwargs): 20 | if args and hasattr(args[0], '__module__'): 21 | module = args[0].__module__ 22 | else: 23 | module = func.__module__ 24 | ts_start = time() 25 | result = func(*args, **kwargs) 26 | ts_end = time() 27 | ms_elapsed = int((ts_end-ts_start) * 1000) 28 | print(f'[DEBUG][{ms_elapsed}ms] {module}.{func.__name__}(...)') 29 | return result 30 | return wrap 31 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-mercury/abx_plugin_mercury/config.py: -------------------------------------------------------------------------------- 1 | __package__ = 'abx_plugin_mercury' 2 | 3 | from typing import List, Optional 4 | from pathlib import Path 5 | 6 | from pydantic import Field 7 | 8 | from abx_spec_config.base_configset import BaseConfigSet 9 | 10 | from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG 11 | 12 | 13 | 14 | class MercuryConfig(BaseConfigSet): 15 | 16 | SAVE_MERCURY: bool = Field(default=True, alias='USE_MERCURY') 17 | 18 | MERCURY_BINARY: str = Field(default='postlight-parser') 19 | MERCURY_EXTRA_ARGS: List[str] = [] 20 | 21 | SAVE_MERCURY_REQUISITES: bool = Field(default=True) 22 | MERCURY_RESTRICT_FILE_NAMES: str = Field(default=lambda: STORAGE_CONFIG.RESTRICT_FILE_NAMES) 23 | 24 
| MERCURY_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT) 25 | MERCURY_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY) 26 | MERCURY_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT) 27 | MERCURY_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE) 28 | 29 | 30 | 31 | MERCURY_CONFIG = MercuryConfig() 32 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-wget/abx_plugin_wget/extractors.py: -------------------------------------------------------------------------------- 1 | __package__ = 'abx_plugin_wget' 2 | 3 | from pathlib import Path 4 | 5 | from abx_pkg import BinName 6 | 7 | from abx_spec_extractor import BaseExtractor, ExtractorName 8 | 9 | from .binaries import WGET_BINARY 10 | from .wget_util import wget_output_path 11 | 12 | class WgetExtractor(BaseExtractor): 13 | name: ExtractorName = 'wget' 14 | binary: BinName = WGET_BINARY.name 15 | 16 | def get_output_path(self, snapshot) -> str: 17 | # wget_index_path = wget_output_path(snapshot.as_link()) 18 | # if wget_index_path: 19 | # return Path(wget_index_path) 20 | return 'wget' 21 | 22 | WGET_EXTRACTOR = WgetExtractor() 23 | 24 | 25 | class WarcExtractor(BaseExtractor): 26 | name: ExtractorName = 'warc' 27 | binary: BinName = WGET_BINARY.name 28 | 29 | def get_output_path(self, snapshot) -> Path | None: 30 | warc_files = list((Path(snapshot.link_dir) / 'warc').glob('*.warc.gz')) 31 | if warc_files: 32 | return sorted(warc_files, key=lambda x: x.stat().st_size, reverse=True)[0] 33 | return None 34 | 35 | 36 | WARC_EXTRACTOR = WarcExtractor() 37 | 38 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-spec-searchbackend/abx_spec_searchbackend.py: -------------------------------------------------------------------------------- 1 | import abc 2 | from typing import Iterable, List, Dict, cast 3 | 4 | 
class BaseSearchBackend(abc.ABC):
    """
    Abstract interface that every search backend has to implement.

    All three operations are static methods. Because static methods can be
    called on the class without instantiating it (which bypasses the ABC
    instantiation check), each abstract body raises NotImplementedError
    instead of silently doing nothing.
    """
    name: str

    @staticmethod
    @abc.abstractmethod
    def index(snapshot_id: str, texts: List[str]):
        """Index ``texts`` for the snapshot identified by ``snapshot_id``."""
        raise NotImplementedError("index method must be implemented by subclass")

    @staticmethod
    @abc.abstractmethod
    def flush(snapshot_ids: Iterable[str]):
        """Flush the given snapshot ids from the backend (exact semantics are up to the implementation)."""
        raise NotImplementedError("flush method must be implemented by subclass")

    @staticmethod
    @abc.abstractmethod
    def search(text: str) -> List[str]:
        """Search the backend for ``text`` and return a list of strings."""
        raise NotImplementedError("search method must be implemented by subclass")
29 | migrations.AlterField( 30 | model_name='tag', 31 | name='uuid', 32 | field=models.UUIDField(default=uuid.uuid4, null=True, unique=True), 33 | ), 34 | ] 35 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-chrome/abx_plugin_chrome/__init__.py: -------------------------------------------------------------------------------- 1 | __package__ = 'abx_plugin_chrome' 2 | __label__ = 'Chrome' 3 | __author__ = 'ArchiveBox' 4 | 5 | import abx 6 | 7 | @abx.hookimpl 8 | def get_CONFIG(): 9 | from .config import CHROME_CONFIG 10 | 11 | return { 12 | 'CHROME_CONFIG': CHROME_CONFIG 13 | } 14 | 15 | @abx.hookimpl 16 | def get_BINARIES(): 17 | from .binaries import CHROME_BINARY 18 | 19 | return { 20 | 'chrome': CHROME_BINARY, 21 | } 22 | 23 | @abx.hookimpl 24 | def ready(): 25 | from .config import CHROME_CONFIG 26 | CHROME_CONFIG.validate() 27 | 28 | 29 | @abx.hookimpl 30 | def get_EXTRACTORS(): 31 | """extractors that can be run for each URL, producing one or more ArchiveResults each""" 32 | from .extractors import PDF_EXTRACTOR, SCREENSHOT_EXTRACTOR, DOM_EXTRACTOR 33 | # dom -> ./output.html -> ./chrome_dom/index.html 34 | # screenshot -> ./screenshot.png -> ./chrome_screenshot/screenshot.png 35 | # pdf -> ./output.pdf -> ./chrome_pdf/pdf.pdf 36 | return { 37 | 'pdf': PDF_EXTRACTOR, 38 | 'screenshot': SCREENSHOT_EXTRACTOR, 39 | 'dom': DOM_EXTRACTOR, 40 | } 41 | -------------------------------------------------------------------------------- /archivebox/config/__init__.py: -------------------------------------------------------------------------------- 1 | __package__ = 'archivebox.config' 2 | __order__ = 200 3 | 4 | from .paths import ( 5 | PACKAGE_DIR, # noqa 6 | DATA_DIR, # noqa 7 | ARCHIVE_DIR, # noqa 8 | ) 9 | from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR # noqa 10 | from .version import VERSION # noqa 11 | 12 | # import abx 13 | 14 | # @abx.hookimpl 15 | # def 
class MercuryBinary(Binary):
    """Binary definition for the parser CLI named by MERCURY_CONFIG.MERCURY_BINARY."""
    name: BinName = MERCURY_CONFIG.MERCURY_BINARY
    binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]

    overrides: BinaryOverrides = {
        LIB_NPM_BINPROVIDER.name: {
            'packages': ['@postlight/parser@^2.2.3'],
        },
        SYS_NPM_BINPROVIDER.name: {
            'packages': ['@postlight/parser@^2.2.3'],
            'install': lambda: None,                          # never try to install things into global prefix
        },
        env.name: {
            # report a placeholder version when the *configured* binary is found
            # on env.PATH (previously the name was hard-coded, ignoring the config)
            'version': lambda: '999.999.999' if bin_abspath(MERCURY_CONFIG.MERCURY_BINARY, PATH=env.PATH) else None,
        },
    }
/archivebox/templates/core/navigation.html: -------------------------------------------------------------------------------- 1 | {% load i18n static %} 2 | 3 |
4 | Add ➕     5 | Snapshots | 6 | Tags | 7 | Log     8 | Docs | 9 | API | 10 | Public | 11 | Admin 12 |     13 | {% if user.is_authenticated %} 14 | {% block welcome-msg %} 15 | {% trans 'User' %} 16 | {% firstof user.get_short_name user.get_username %}     17 | {% endblock %} 18 | {% block userlinks %} 19 | {% if user.has_usable_password %} 20 | Account / 21 | {% endif %} 22 | {% trans 'Log out' %} 23 | {% endblock %} 24 | {% else %} 25 | {% trans 'Log in' %} 26 | {% endif %} 27 |
28 | -------------------------------------------------------------------------------- /etc/sonic.cfg: -------------------------------------------------------------------------------- 1 | # Sonic 2 | # Fast, lightweight and schema-less search backend 3 | # Configuration file 4 | # Example: https://github.com/valeriansaliou/sonic/blob/master/config.cfg 5 | 6 | 7 | [server] 8 | 9 | # log_level = "debug" 10 | log_level = "warn" 11 | 12 | 13 | [channel] 14 | 15 | inet = "0.0.0.0:1491" 16 | tcp_timeout = 300 17 | 18 | auth_password = "${env.SEARCH_BACKEND_PASSWORD}" 19 | 20 | [channel.search] 21 | 22 | query_limit_default = 65535 23 | query_limit_maximum = 65535 24 | query_alternates_try = 10 25 | 26 | suggest_limit_default = 5 27 | suggest_limit_maximum = 20 28 | 29 | 30 | [store] 31 | 32 | [store.kv] 33 | 34 | path = "/var/lib/sonic/store/kv/" 35 | 36 | retain_word_objects = 100000 37 | 38 | [store.kv.pool] 39 | 40 | inactive_after = 1800 41 | 42 | [store.kv.database] 43 | 44 | flush_after = 900 45 | 46 | compress = true 47 | parallelism = 2 48 | max_files = 100 49 | max_compactions = 1 50 | max_flushes = 1 51 | write_buffer = 16384 52 | write_ahead_log = true 53 | 54 | [store.fst] 55 | 56 | path = "/var/lib/sonic/store/fst/" 57 | 58 | [store.fst.pool] 59 | 60 | inactive_after = 300 61 | 62 | [store.fst.graph] 63 | 64 | consolidate_after = 180 65 | 66 | max_size = 2048 67 | max_words = 250000 68 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0066_alter_snapshottag_tag_alter_tag_id_alter_tag_old_id.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.0.6 on 2024-08-20 03:52 2 | 3 | import core.models 4 | import django.db.models.deletion 5 | import uuid 6 | import random 7 | from django.db import migrations, models 8 | 9 | def rand_int_id(): 10 | return random.getrandbits(32) 11 | 12 | class Migration(migrations.Migration): 13 | 14 | dependencies = 
[ 15 | ('core', '0065_remove_snapshottag_old_tag'), 16 | ] 17 | 18 | operations = [ 19 | migrations.AlterField( 20 | model_name='snapshottag', 21 | name='tag', 22 | field=models.ForeignKey(db_column='tag_id', on_delete=django.db.models.deletion.CASCADE, to='core.tag', to_field='id'), 23 | ), 24 | migrations.AlterField( 25 | model_name='tag', 26 | name='id', 27 | field=models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False, unique=True), 28 | ), 29 | migrations.AlterField( 30 | model_name='tag', 31 | name='old_id', 32 | field=models.BigIntegerField(default=rand_int_id, serialize=False, unique=True, verbose_name='Old ID'), 33 | ), 34 | ] 35 | -------------------------------------------------------------------------------- /archivebox/cli/archivebox_manage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | __package__ = 'archivebox.cli' 4 | 5 | import rich_click as click 6 | from archivebox.misc.util import docstring, enforce_types 7 | 8 | 9 | @enforce_types 10 | def manage(args: list[str] | None=None) -> None: 11 | """Run an ArchiveBox Django management command""" 12 | 13 | from archivebox.config.common import SHELL_CONFIG 14 | from archivebox.misc.logging import stderr 15 | 16 | if (args and "createsuperuser" in args) and (SHELL_CONFIG.IN_DOCKER and not SHELL_CONFIG.IS_TTY): 17 | stderr('[!] 
Warning: you need to pass -it to use interactive commands in docker', color='lightyellow') 18 | stderr(' docker run -it archivebox manage {}'.format(' '.join(args or ['...'])), color='lightyellow') 19 | stderr('') 20 | 21 | from django.core.management import execute_from_command_line 22 | execute_from_command_line(['manage.py', *(args or ['help'])]) 23 | 24 | 25 | @click.command(add_help_option=False, context_settings=dict(ignore_unknown_options=True)) 26 | @click.argument('args', nargs=-1) 27 | @docstring(manage.__doc__) 28 | def main(args: list[str] | None=None) -> None: 29 | manage(args=args) 30 | 31 | 32 | if __name__ == '__main__': 33 | main() 34 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-singlefile/abx_plugin_singlefile/actors.py: -------------------------------------------------------------------------------- 1 | # __package__ = 'abx_plugin_singlefile' 2 | 3 | # from typing import ClassVar 4 | # from django.db.models import QuerySet 5 | # from django.utils.functional import classproperty 6 | 7 | # from workers.actor import ActorType 8 | 9 | # from .models import SinglefileResult 10 | 11 | 12 | # class SinglefileActor(ActorType[SinglefileResult]): 13 | # CLAIM_ORDER: ClassVar[str] = 'created_at DESC' 14 | # CLAIM_WHERE: ClassVar[str] = 'status = "queued" AND extractor = "favicon"' 15 | # CLAIM_SET: ClassVar[str] = 'status = "started"' 16 | 17 | # @classproperty 18 | # def QUERYSET(cls) -> QuerySet: 19 | # return SinglefileResult.objects.filter(status='queued') 20 | 21 | # def tick(self, obj: SinglefileResult): 22 | # print(f'[grey53]{self}.tick({obj.abid or obj.id}, status={obj.status}) remaining:[/grey53]', self.get_queue().count()) 23 | # updated = SinglefileResult.objects.filter(id=obj.id, status='started').update(status='success') == 1 24 | # if not updated: 25 | # raise Exception(f'Failed to update {obj.abid or obj.id}, interrupted by another actor writing to the same object') 26 | # 
obj.refresh_from_db() 27 | # obj.save() 28 | -------------------------------------------------------------------------------- /archivebox/api/tests.py: -------------------------------------------------------------------------------- 1 | __package__ = 'archivebox.api' 2 | 3 | # from django.test import TestCase 4 | # from ninja.testing import TestClient 5 | 6 | # from .routes_cli import router 7 | 8 | # class ArchiveBoxCLIAPITestCase(TestCase): 9 | # def setUp(self): 10 | # self.client = TestClient(router) 11 | 12 | # def test_add_endpoint(self): 13 | # response = self.client.post("/add", json={"urls": ["http://example.com"], "tag": "testTag1,testTag2"}) 14 | # self.assertEqual(response.status_code, 200) 15 | # self.assertTrue(response.json()["success"]) 16 | 17 | # def test_remove_endpoint(self): 18 | # response = self.client.post("/remove", json={"filter_patterns": ["http://example.com"]}) 19 | # self.assertEqual(response.status_code, 200) 20 | # self.assertTrue(response.json()["success"]) 21 | 22 | # def test_update_endpoint(self): 23 | # response = self.client.post("/update", json={}) 24 | # self.assertEqual(response.status_code, 200) 25 | # self.assertTrue(response.json()["success"]) 26 | 27 | # def test_list_all_endpoint(self): 28 | # response = self.client.post("/list_all", json={}) 29 | # self.assertEqual(response.status_code, 200) 30 | # self.assertTrue(response.json()["success"]) 31 | -------------------------------------------------------------------------------- /tests/mock_server/templates/example.rss: -------------------------------------------------------------------------------- 1 | 2 | 7 | 8 | Sample Feed 9 | http://example.org/ 10 | For documentation only 11 | en-us 12 | Nobody (nobody@example.org) 13 | Public domain 14 | 2024-02-26T17:28:12-08:00 15 | 16 | 17 | 18 | 19 | First! 20 | http://127.0.0.1:8080/static/example.com.html 21 | just-an@example.org 22 | 23 | This has a description. 
class AccelleratedPaginator(Paginator):
    """
    Paginator with a cheaper total-row count for unfiltered querysets.

    When the paginated queryset has no filters, the total is taken from a
    plain ``model.objects.count()`` query instead of the queryset's own count
    (avoids the DISTINCT; reported as >20x faster on Admin views).
    https://hakibenita.com/optimizing-the-django-admin-paginator
    """

    @cached_property
    def count(self):
        queryset = self.object_list
        if not queryset._has_filters():  # type: ignore
            # unfiltered: count total rows in a separate fast query
            return queryset.model.objects.count()

        # filtered: fall back to the normal count of the filtered queryset
        return super().count

        # Alternative approach for PostgreSQL: fallback count takes > 200ms
        # from django.db import connection, transaction, OperationalError
        # with transaction.atomic(), connection.cursor() as cursor:
        #     cursor.execute('SET LOCAL statement_timeout TO 200;')
        #     try:
        #         return super().count
        #     except OperationalError:
        #         return 9999999999999
@enforce_types
def parse_generic_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
    """
    Parse RSS XML-format files into links.

    Rewinds ``rss_file``, parses its whole content with feedparser and yields
    one Link per feed entry. Entries without a link are skipped. Entries
    without a parseable date get the current time as their timestamp.
    """
    # only needed for the missing-date fallback below
    from time import time as current_time

    rss_file.seek(0)
    feed = feedparser(rss_file.read())
    for item in feed.entries:
        # feedparser entries raise AttributeError for missing attributes,
        # so use .get() to make the None check below reachable
        url = item.get('link')
        if url is None:
            # Yielding a Link with no URL will
            # crash on a URL validation assertion
            continue

        title = item.get('title')

        # NOTE(review): feedparser documents *_parsed values as UTC while
        # mktime() interprets them as local time; kept as-is so existing
        # timestamps do not shift -- confirm before changing
        updated = item.get('updated_parsed')
        timestamp = mktime(updated) if updated else current_time()

        # entries without tags, or tags without a term, contribute nothing
        terms = (tag.get('term') for tag in item.get('tags', []))
        tags = ','.join(term for term in terms if term)

        yield Link(
            url=htmldecode(url),
            timestamp=str(timestamp),
            title=(htmldecode(title) or None) if title else None,
            tags=tags,
            sources=[rss_file.name],
        )
| __package__ = 'archivebox.api' 2 | 3 | from signal_webhooks.admin import WebhookAdmin 4 | from signal_webhooks.utils import get_webhook_model 5 | 6 | from archivebox.base_models.admin import ABIDModelAdmin 7 | 8 | from api.models import APIToken 9 | 10 | 11 | class APITokenAdmin(ABIDModelAdmin): 12 | list_display = ('created_at', 'abid', 'created_by', 'token_redacted', 'expires') 13 | sort_fields = ('abid', 'created_at', 'created_by', 'expires') 14 | readonly_fields = ('created_at', 'modified_at', 'abid_info') 15 | search_fields = ('id', 'abid', 'created_by__username', 'token') 16 | fields = ('created_by', 'token', 'expires', *readonly_fields) 17 | 18 | list_filter = ('created_by',) 19 | ordering = ['-created_at'] 20 | list_per_page = 100 21 | 22 | 23 | class CustomWebhookAdmin(WebhookAdmin, ABIDModelAdmin): 24 | list_display = ('created_at', 'created_by', 'abid', *WebhookAdmin.list_display) 25 | sort_fields = ('created_at', 'created_by', 'abid', 'referenced_model', 'endpoint', 'last_success', 'last_error') 26 | readonly_fields = ('created_at', 'modified_at', 'abid_info', *WebhookAdmin.readonly_fields) 27 | 28 | 29 | def register_admin(admin_site): 30 | admin_site.register(APIToken, APITokenAdmin) 31 | admin_site.register(get_webhook_model(), CustomWebhookAdmin) 32 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-favicon/abx_plugin_favicon/actors.py: -------------------------------------------------------------------------------- 1 | __package__ = 'abx_plugin_favicon' 2 | 3 | from typing import ClassVar 4 | 5 | from core.actors import ActorType 6 | from core.statemachines import ArchiveResultMachine 7 | 8 | from statemachine import State 9 | 10 | from .models import FaviconResult 11 | 12 | 13 | class FaviconResultActor(ActorType[FaviconResult]): 14 | """ 15 | The primary actor for progressing ArchiveResult objects 16 | through their lifecycle using the ArchiveResultMachine. 
17 | """ 18 | Model = FaviconResult 19 | StateMachineClass = ArchiveResultMachine 20 | 21 | ACTIVE_STATE: ClassVar[State] = ArchiveResultMachine.started # 'started' 22 | FINAL_STATES: ClassVar[list[State]] = ArchiveResultMachine.final_states # ['succeeded', 'failed', 'skipped'] 23 | STATE_FIELD_NAME: ClassVar[str] = ArchiveResultMachine.state_field_name # status 24 | 25 | MAX_CONCURRENT_ACTORS: ClassVar[int] = 6 26 | MAX_TICK_TIME: ClassVar[int] = 60 27 | CLAIM_FROM_TOP_N: ClassVar[int] = MAX_CONCURRENT_ACTORS * 10 28 | 29 | # @classproperty 30 | # def qs(cls) -> QuerySet[ModelType]: 31 | # """Get the unfiltered and unsorted QuerySet of all objects that this Actor might care about.""" 32 | # return cls.Model.objects.filter(extractor='favicon') 33 | -------------------------------------------------------------------------------- /archivebox/pkgs/abx-plugin-npm/abx_plugin_npm/binproviders.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | from typing import Optional 4 | 5 | from abx_pkg import NpmProvider, PATHStr, BinProviderName 6 | 7 | import abx 8 | 9 | DEFAULT_LIB_NPM_DIR = Path('/usr/local/share/abx/npm') 10 | 11 | OLD_NODE_BIN_PATH = Path(os.getcwd()) / 'node_modules' / '.bin' 12 | NEW_NODE_BIN_PATH = DEFAULT_LIB_NPM_DIR / 'node_modules' / '.bin' 13 | 14 | 15 | class SystemNpmBinProvider(NpmProvider): 16 | name: BinProviderName = "sys_npm" 17 | 18 | npm_prefix: Optional[Path] = None 19 | 20 | 21 | class LibNpmBinProvider(NpmProvider): 22 | name: BinProviderName = "lib_npm" 23 | PATH: PATHStr = f'{NEW_NODE_BIN_PATH}:{OLD_NODE_BIN_PATH}' 24 | 25 | npm_prefix: Optional[Path] = DEFAULT_LIB_NPM_DIR 26 | 27 | def setup(self) -> None: 28 | # update paths from config at runtime 29 | LIB_DIR = abx.pm.hook.get_LIB_DIR() 30 | self.npm_prefix = LIB_DIR / 'npm' 31 | self.PATH = f'{LIB_DIR / "npm" / "node_modules" / ".bin"}:{NEW_NODE_BIN_PATH}:{OLD_NODE_BIN_PATH}' 32 | super().setup() 33 | 34 | 
35 | SYS_NPM_BINPROVIDER = SystemNpmBinProvider() 36 | LIB_NPM_BINPROVIDER = LibNpmBinProvider() 37 | LIB_NPM_BINPROVIDER.setup() 38 | npm = LIB_NPM_BINPROVIDER 39 | 40 | LIB_NPM_BINPROVIDER.setup() 41 | SYS_NPM_BINPROVIDER.setup() 42 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0003_auto_20200630_1034.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 3.0.7 on 2020-06-30 10:34 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('core', '0002_auto_20200625_1521'), 10 | ] 11 | 12 | operations = [ 13 | migrations.AlterField( 14 | model_name='snapshot', 15 | name='added', 16 | field=models.DateTimeField(auto_now_add=True, db_index=True), 17 | ), 18 | migrations.AlterField( 19 | model_name='snapshot', 20 | name='tags', 21 | field=models.CharField(db_index=True, default=None, max_length=256, null=True), 22 | ), 23 | migrations.AlterField( 24 | model_name='snapshot', 25 | name='timestamp', 26 | field=models.CharField(db_index=True, default=None, max_length=32, null=True), 27 | ), 28 | migrations.AlterField( 29 | model_name='snapshot', 30 | name='title', 31 | field=models.CharField(db_index=True, default=None, max_length=128, null=True), 32 | ), 33 | migrations.AlterField( 34 | model_name='snapshot', 35 | name='updated', 36 | field=models.DateTimeField(db_index=True, default=None, null=True), 37 | ), 38 | ] 39 | -------------------------------------------------------------------------------- /archivebox/core/migrations/0073_rename_created_archiveresult_created_at_and_more.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.1 on 2024-09-05 00:25 2 | 3 | from django.db import migrations 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('core', 
'0072_rename_added_snapshot_bookmarked_at_and_more'), 10 | ] 11 | 12 | operations = [ 13 | migrations.RenameField( 14 | model_name='archiveresult', 15 | old_name='created', 16 | new_name='created_at', 17 | ), 18 | migrations.RenameField( 19 | model_name='archiveresult', 20 | old_name='modified', 21 | new_name='modified_at', 22 | ), 23 | migrations.RenameField( 24 | model_name='snapshot', 25 | old_name='created', 26 | new_name='created_at', 27 | ), 28 | migrations.RenameField( 29 | model_name='snapshot', 30 | old_name='modified', 31 | new_name='modified_at', 32 | ), 33 | migrations.RenameField( 34 | model_name='tag', 35 | old_name='created', 36 | new_name='created_at', 37 | ), 38 | migrations.RenameField( 39 | model_name='tag', 40 | old_name='modified', 41 | new_name='modified_at', 42 | ), 43 | ] 44 | -------------------------------------------------------------------------------- /archivebox/parsers/medium_rss.py: -------------------------------------------------------------------------------- 1 | __package__ = 'archivebox.parsers' 2 | 3 | 4 | from typing import IO, Iterable 5 | from datetime import datetime 6 | 7 | from xml.etree import ElementTree 8 | 9 | from ..index.schema import Link 10 | from archivebox.misc.util import ( 11 | htmldecode, 12 | enforce_types, 13 | ) 14 | 15 | 16 | @enforce_types 17 | def parse_medium_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]: 18 | """Parse Medium RSS feed files into links""" 19 | 20 | rss_file.seek(0) 21 | root = ElementTree.parse(rss_file).getroot() 22 | items = root.find("channel").findall("item") # type: ignore 23 | for item in items: 24 | url = item.find("link").text # type: ignore 25 | title = item.find("title").text.strip() # type: ignore 26 | ts_str = item.find("pubDate").text # type: ignore 27 | time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %Z") # type: ignore 28 | 29 | yield Link( 30 | url=htmldecode(url), 31 | timestamp=str(time.timestamp()), 32 | title=htmldecode(title) or None, 33 | 
@register.simple_tag(takes_context=True)
def url_replace(context, **kwargs):
    """
    Return the current request's query string with the given params replaced.

    Copies ``context['request'].GET``, sets every keyword argument on the
    copy and returns the url-encoded result.
    """
    query = context['request'].GET.copy()
    # QueryDict.update() appends to existing keys instead of replacing them
    # (e.g. page=1&page=2), so assign each key to overwrite its old values
    for key, value in kwargs.items():
        query[key] = value
    return query.urlencode()
import archivebox.base_models.models 7 | 8 | 9 | class Migration(migrations.Migration): 10 | 11 | dependencies = [ 12 | ('core', '0068_alter_archiveresult_options'), 13 | ] 14 | 15 | operations = [ 16 | migrations.AlterField( 17 | model_name='archiveresult', 18 | name='created', 19 | field=archivebox.base_models.models.AutoDateTimeField(db_index=True, default=django.utils.timezone.now), 20 | ), 21 | migrations.AlterField( 22 | model_name='snapshot', 23 | name='added', 24 | field=archivebox.base_models.models.AutoDateTimeField(db_index=True, default=django.utils.timezone.now), 25 | ), 26 | migrations.AlterField( 27 | model_name='snapshot', 28 | name='created', 29 | field=archivebox.base_models.models.AutoDateTimeField(db_index=True, default=django.utils.timezone.now), 30 | ), 31 | migrations.AlterField( 32 | model_name='tag', 33 | name='created', 34 | field=archivebox.base_models.models.AutoDateTimeField(db_index=True, default=django.utils.timezone.now), 35 | ), 36 | ] 37 | -------------------------------------------------------------------------------- /archivebox/api/migrations/0009_rename_created_apitoken_created_at_and_more.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.1 on 2024-09-05 00:26 2 | 3 | from django.db import migrations, models 4 | 5 | import archivebox.base_models.models 6 | 7 | 8 | class Migration(migrations.Migration): 9 | 10 | dependencies = [ 11 | ('api', '0008_alter_apitoken_created_alter_apitoken_created_by_and_more'), 12 | ] 13 | 14 | operations = [ 15 | migrations.RenameField( 16 | model_name='apitoken', 17 | old_name='created', 18 | new_name='created_at', 19 | ), 20 | migrations.RenameField( 21 | model_name='apitoken', 22 | old_name='modified', 23 | new_name='modified_at', 24 | ), 25 | migrations.RenameField( 26 | model_name='outboundwebhook', 27 | old_name='modified', 28 | new_name='modified_at', 29 | ), 30 | migrations.AddField( 31 | model_name='outboundwebhook', 32 | 
name='created_at', 33 | field=archivebox.base_models.models.AutoDateTimeField(db_index=True, default=None), 34 | ), 35 | migrations.AlterField( 36 | model_name='outboundwebhook', 37 | name='created', 38 | field=models.DateTimeField(auto_now_add=True, help_text='When the webhook was created.', verbose_name='created'), 39 | ), 40 | ] 41 | -------------------------------------------------------------------------------- /tests/mock_server/templates/example.com.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Example Domain 5 | 6 | 7 | 8 | 9 | 37 | 38 | 39 | 40 |
41 |

Example Domain

42 |

This domain is for use in illustrative examples in documents. You may use this 43 | domain in literature without prior coordination or asking for permission.

44 |

45 | More information... 46 |

47 |
@enforce_types
def extract(archiveresult_id: str) -> Generator['ArchiveResult', None, None]:
    """Run the extractor of a single ArchiveResult, looked up by its id or abid"""

    # the module only imports ArchiveResult under TYPE_CHECKING, so it has to
    # be imported here to exist at runtime
    from core.models import ArchiveResult

    try:
        archiveresult = ArchiveResult.objects.get(Q(id=archiveresult_id) | Q(abid=archiveresult_id))
    except ArchiveResult.DoesNotExist:
        # .get() raises instead of returning None when nothing matches
        raise Exception(f'ArchiveResult {archiveresult_id} not found') from None

    return archiveresult.EXTRACTOR.extract()
@enforce_types
def parse_netscape_html_export(html_file: IO[str], **_kwargs) -> Iterable[Link]:
    """Parse netscape-format bookmarks export files (produced by all browsers)

    Reads the file line by line and yields one Link for every line containing
    an anchor whose HREF attribute is directly followed by ADD_DATE; all other
    lines are ignored.
    """

    html_file.seek(0)
    # Three capture groups, one per match.group() read below:
    #   1 = HREF value, 2 = ADD_DATE digits, 3 = anchor text (the title).
    pattern = re.compile(
        r'<a href="(.+?)" add_date="(\d+)"[^>]*>(.+)</a>',
        re.UNICODE | re.IGNORECASE,
    )
    for line in html_file:
        # example line
        # <DT><A HREF="https://example.com/?q=1+2" ADD_DATE="1497562974">example bookmark title</A>

        match = pattern.search(line)
        if not match:
            continue

        url = match.group(1)
        added = datetime.fromtimestamp(float(match.group(2)))
        title = match.group(3).strip()

        yield Link(
            url=htmldecode(url),
            timestamp=str(added.timestamp()),
            title=htmldecode(title) or None,
            tags=None,
            sources=[html_file.name],
        )


KEY = 'netscape_html'
NAME = 'Netscape HTML'
PARSER = parse_netscape_html_export
@enforce_types
def parse_pinboard_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
    """Parse Pinboard RSS feed files into links

    Yields one Link per feed entry that has a link; entries without one are
    skipped.
    """

    rss_file.seek(0)
    feed = feedparser(rss_file.read())
    for item in feed.entries:
        # Attribute access on a feedparser entry raises AttributeError for a
        # missing key, so use .get() to let the None check below actually fire.
        url = item.get('link')
        if url is None:
            # Yielding a Link with no URL will
            # crash on a URL validation assertion
            continue

        # title will start with "[priv] " if pin was marked private. useful?
        title = item.title
        # NOTE(review): feedparser's *_parsed values are documented as UTC,
        # while mktime() interprets its argument as local time. Left unchanged
        # because the timestamp is stored on the Link -- confirm before
        # switching to calendar.timegm().
        time = mktime(item.updated_parsed)

        # all tags are in one entry.tags with spaces in it. annoying!
        try:
            tags = item.tags[0].term.replace(' ', ',')
        except (AttributeError, IndexError):
            # AttributeError: no tags key / term is None; IndexError: empty tags list
            tags = ''

        yield Link(
            url=htmldecode(url),
            timestamp=str(time),
            title=htmldecode(title) or None,
            tags=htmldecode(tags) or None,
            sources=[rss_file.name],
        )


KEY = 'pinboard_rss'
NAME = 'Pinboard RSS'
PARSER = parse_pinboard_rss_export
9 | 10 | **Useful links:** 11 | 12 | - https://github.com/ArchiveBox/ArchiveBox/issues 13 | - https://github.com/ArchiveBox/ArchiveBox/pulls 14 | - https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap 15 | - https://github.com/ArchiveBox/ArchiveBox/wiki/Install#manual-setup 16 | 17 | ### Development Setup 18 | 19 | ```bash 20 | git clone https://github.com/ArchiveBox/ArchiveBox 21 | cd ArchiveBox 22 | # Ideally do this in a virtualenv 23 | pip install -e '.[dev]' # or use: pipenv install --dev 24 | ``` 25 | 26 | ### Running Tests 27 | 28 | ```bash 29 | ./bin/lint.sh 30 | ./bin/test.sh 31 | ./bin/build.sh 32 | ``` 33 | 34 | For more common tasks see the `Development` section at the bottom of the README. 35 | 36 | ### Getting Help 37 | 38 | Open issues on GitHub or message me at https://sweeting.me/#contact. 39 | --------------------------------------------------------------------------------