├── tests
├── __init__.py
├── mock_server
│ ├── __init__.py
│ └── templates
│ │ ├── shift_jis.html
│ │ ├── malformed.html
│ │ ├── example-single.jsonl
│ │ ├── example.json.bad
│ │ ├── example.atom
│ │ ├── example.jsonl
│ │ ├── example.json
│ │ ├── example.rss
│ │ └── example.com.html
├── tags_migration
│ └── index.sqlite3
├── test_util.py
├── conftest.py
├── fixtures.py
└── test_update.py
├── archivebox
├── core
│ ├── actors.py
│ ├── migrations
│ │ ├── __init__.py
│ │ ├── 0056_remove_tag_uuid.py
│ │ ├── 0057_rename_id_tag_old_id.py
│ │ ├── 0065_remove_snapshottag_old_tag.py
│ │ ├── 0038_rename_uuid_snapshot_id.py
│ │ ├── 0042_remove_archiveresult_snapshot_old.py
│ │ ├── 0053_remove_snapshottag_snapshot_old.py
│ │ ├── 0033_rename_id_archiveresult_old_id.py
│ │ ├── 0019_auto_20210401_0654.py
│ │ ├── 0010_auto_20210216_1055.py
│ │ ├── 0030_alter_archiveresult_uuid.py
│ │ ├── 0037_rename_id_snapshot_old_id.py
│ │ ├── 0002_auto_20200625_1521.py
│ │ ├── 0016_auto_20210218_1204.py
│ │ ├── 0039_rename_snapshot_archiveresult_snapshot_old.py
│ │ ├── 0060_alter_tag_id.py
│ │ ├── 0055_alter_tag_slug.py
│ │ ├── 0068_alter_archiveresult_options.py
│ │ ├── 0009_auto_20210216_1038.py
│ │ ├── 0028_alter_archiveresult_uuid.py
│ │ ├── 0013_auto_20210218_0729.py
│ │ ├── 0014_auto_20210218_0729.py
│ │ ├── 0015_auto_20210218_0730.py
│ │ ├── 0008_auto_20210105_1421.py
│ │ ├── 0017_auto_20210219_0211.py
│ │ ├── 0029_alter_archiveresult_id.py
│ │ ├── 0035_remove_archiveresult_uuid_archiveresult_id.py
│ │ ├── 0025_alter_archiveresult_uuid.py
│ │ ├── 0054_alter_snapshot_timestamp.py
│ │ ├── 0004_auto_20200713_1552.py
│ │ ├── 0074_alter_snapshot_downloaded_at.py
│ │ ├── 0045_alter_snapshot_old_id.py
│ │ ├── 0062_alter_snapshottag_old_tag.py
│ │ ├── 0067_alter_snapshottag_tag.py
│ │ ├── 0058_alter_tag_old_id.py
│ │ ├── 0061_rename_tag_snapshottag_old_tag_and_more.py
│ │ ├── 0050_alter_snapshottag_snapshot_old.py
│ │ ├── 0049_rename_snapshot_snapshottag_snapshot_old_and_more.py
│ │ ├── 0018_auto_20210327_0952.py
│ │ ├── 0043_alter_archiveresult_snapshot_alter_snapshot_id_and_more.py
│ │ ├── 0032_alter_archiveresult_id.py
│ │ ├── 0072_rename_added_snapshot_bookmarked_at_and_more.py
│ │ ├── 0012_auto_20210216_1425.py
│ │ ├── 0020_auto_20210410_1031.py
│ │ ├── 0021_auto_20220914_0934.py
│ │ ├── 0022_auto_20231023_2008.py
│ │ ├── 0041_alter_archiveresult_snapshot_and_more.py
│ │ ├── 0047_alter_snapshottag_unique_together_and_more.py
│ │ ├── 0048_alter_archiveresult_snapshot_and_more.py
│ │ ├── 0064_alter_snapshottag_unique_together_and_more.py
│ │ ├── 0005_auto_20200728_0326.py
│ │ ├── 0052_alter_snapshottag_unique_together_and_more.py
│ │ ├── 0036_alter_archiveresult_id_alter_archiveresult_old_id.py
│ │ ├── 0011_auto_20210216_1331.py
│ │ ├── 0001_initial.py
│ │ ├── 0046_alter_archiveresult_snapshot_alter_snapshot_id_and_more.py
│ │ ├── 0031_alter_archiveresult_id_alter_archiveresult_uuid_and_more.py
│ │ ├── 0066_alter_snapshottag_tag_alter_tag_id_alter_tag_old_id.py
│ │ ├── 0003_auto_20200630_1034.py
│ │ ├── 0073_rename_created_archiveresult_created_at_and_more.py
│ │ └── 0069_alter_archiveresult_created_alter_snapshot_added_and_more.py
│ ├── templatetags
│ │ ├── __init__.py
│ │ └── core_tags.py
│ ├── tests.py
│ ├── apps.py
│ ├── wsgi.py
│ ├── admin.py
│ ├── management
│ │ └── commands
│ │ │ └── archivebox.py
│ ├── __init__.py
│ └── asgi.py
├── tags
│ ├── __init__.py
│ ├── migrations
│ │ └── __init__.py
│ └── apps.py
├── README.md
├── personas
│ ├── __init__.py
│ ├── migrations
│ │ └── __init__.py
│ ├── admin.py
│ ├── tests.py
│ ├── views.py
│ └── apps.py
├── pkgs
│ ├── abx
│ │ ├── README.md
│ │ └── pyproject.toml
│ ├── abx-plugin-chrome
│ │ ├── README.md
│ │ ├── pyproject.toml
│ │ └── abx_plugin_chrome
│ │ │ ├── extractors.py
│ │ │ └── __init__.py
│ ├── abx-plugin-curl
│ │ ├── README.md
│ │ ├── abx_plugin_curl
│ │ │ ├── __init__.py
│ │ │ ├── binaries.py
│ │ │ └── config.py
│ │ └── pyproject.toml
│ ├── abx-plugin-git
│ │ ├── README.md
│ │ ├── abx_plugin_git
│ │ │ ├── extractors.py
│ │ │ ├── binaries.py
│ │ │ ├── __init__.py
│ │ │ └── config.py
│ │ └── pyproject.toml
│ ├── abx-plugin-npm
│ │ ├── README.md
│ │ ├── abx_plugin_npm
│ │ │ ├── config.py
│ │ │ ├── __init__.py
│ │ │ └── binproviders.py
│ │ └── pyproject.toml
│ ├── abx-plugin-pip
│ │ ├── README.md
│ │ ├── abx_plugin_pip
│ │ │ ├── .plugin_order
│ │ │ ├── config.py
│ │ │ └── __init__.py
│ │ └── pyproject.toml
│ ├── abx-plugin-pocket
│ │ ├── README.md
│ │ ├── abx_plugin_pocket
│ │ │ ├── __init__.py
│ │ │ └── config.py
│ │ └── pyproject.toml
│ ├── abx-plugin-title
│ │ ├── README.md
│ │ ├── abx_plugin_title
│ │ │ ├── extractors.py
│ │ │ └── __init__.py
│ │ └── pyproject.toml
│ ├── abx-plugin-wget
│ │ ├── README.md
│ │ ├── pyproject.toml
│ │ └── abx_plugin_wget
│ │ │ ├── binaries.py
│ │ │ ├── __init__.py
│ │ │ └── extractors.py
│ ├── abx-plugin-ytdlp
│ │ ├── README.md
│ │ ├── pyproject.toml
│ │ └── abx_plugin_ytdlp
│ │ │ └── __init__.py
│ ├── abx-spec-abx-pkg
│ │ ├── README.md
│ │ └── pyproject.toml
│ ├── abx-spec-config
│ │ ├── README.md
│ │ └── pyproject.toml
│ ├── abx-spec-django
│ │ ├── README.md
│ │ └── pyproject.toml
│ ├── abx-plugin-favicon
│ │ ├── README.md
│ │ ├── abx_plugin_favicon
│ │ │ ├── config.py
│ │ │ ├── models.py
│ │ │ ├── extractors.py
│ │ │ ├── __init__.py
│ │ │ └── actors.py
│ │ └── pyproject.toml
│ ├── abx-plugin-htmltotext
│ │ ├── README.md
│ │ ├── abx_plugin_htmltotext
│ │ │ ├── config.py
│ │ │ └── __init__.py
│ │ └── pyproject.toml
│ ├── abx-plugin-ldap-auth
│ │ ├── README.md
│ │ └── pyproject.toml
│ ├── abx-plugin-mercury
│ │ ├── README.md
│ │ ├── pyproject.toml
│ │ └── abx_plugin_mercury
│ │ │ ├── extractors.py
│ │ │ ├── __init__.py
│ │ │ ├── config.py
│ │ │ └── binaries.py
│ ├── abx-plugin-playwright
│ │ ├── README.md
│ │ ├── abx_plugin_playwright
│ │ │ ├── config.py
│ │ │ ├── __init__.py
│ │ │ └── binaries.py
│ │ └── pyproject.toml
│ ├── abx-plugin-puppeteer
│ │ ├── README.md
│ │ ├── pyproject.toml
│ │ └── abx_plugin_puppeteer
│ │ │ ├── config.py
│ │ │ ├── binaries.py
│ │ │ └── __init__.py
│ ├── abx-plugin-readability
│ │ ├── README.md
│ │ ├── pyproject.toml
│ │ └── abx_plugin_readability
│ │ │ ├── extractors.py
│ │ │ ├── config.py
│ │ │ ├── __init__.py
│ │ │ └── binaries.py
│ ├── abx-plugin-readwise
│ │ ├── README.md
│ │ ├── pyproject.toml
│ │ └── abx_plugin_readwise.py
│ ├── abx-plugin-singlefile
│ │ ├── README.md
│ │ ├── abx_plugin_singlefile
│ │ │ ├── models.py
│ │ │ ├── extractors.py
│ │ │ ├── __init__.py
│ │ │ ├── config.py
│ │ │ └── actors.py
│ │ └── pyproject.toml
│ ├── abx-spec-archivebox
│ │ ├── README.md
│ │ ├── pyproject.toml
│ │ └── abx_spec_archivebox
│ │ │ ├── effects.py
│ │ │ ├── __init__.py
│ │ │ └── events.py
│ ├── abx-spec-extractor
│ │ ├── README.md
│ │ └── pyproject.toml
│ ├── abx-spec-searchbackend
│ │ ├── README.md
│ │ ├── pyproject.toml
│ │ └── abx_spec_searchbackend.py
│ ├── abx-plugin-archivedotorg
│ │ ├── README.md
│ │ ├── abx_plugin_archivedotorg
│ │ │ ├── config.py
│ │ │ └── __init__.py
│ │ └── pyproject.toml
│ ├── abx-plugin-ripgrep-search
│ │ ├── README.md
│ │ ├── pyproject.toml
│ │ └── abx_plugin_ripgrep_search
│ │ │ ├── binaries.py
│ │ │ ├── __init__.py
│ │ │ └── config.py
│ ├── abx-plugin-sonic-search
│ │ ├── README.md
│ │ ├── pyproject.toml
│ │ └── abx_plugin_sonic_search
│ │ │ ├── __init__.py
│ │ │ └── binaries.py
│ ├── abx-plugin-sqlitefts-search
│ │ ├── README.md
│ │ ├── pyproject.toml
│ │ └── abx_plugin_sqlitefts_search
│ │ │ └── __init__.py
│ └── abx-plugin-default-binproviders
│ │ ├── README.md
│ │ ├── pyproject.toml
│ │ └── abx_plugin_default_binproviders.py
├── static
├── api
│ ├── migrations
│ │ ├── __init__.py
│ │ ├── 0002_alter_apitoken_options.py
│ │ ├── 0005_remove_apitoken_uuid_remove_outboundwebhook_uuid_and_more.py
│ │ ├── 0004_alter_apitoken_id_alter_apitoken_uuid.py
│ │ ├── 0007_alter_apitoken_created_by.py
│ │ ├── 0006_remove_outboundwebhook_uuid_apitoken_id_and_more.py
│ │ ├── 0001_initial.py
│ │ └── 0009_rename_created_apitoken_created_at_and_more.py
│ ├── __init__.py
│ ├── apps.py
│ ├── urls.py
│ ├── tests.py
│ └── admin.py
├── crawls
│ ├── migrations
│ │ └── __init__.py
│ ├── tests.py
│ ├── views.py
│ ├── apps.py
│ └── __init__.py
├── machine
│ ├── migrations
│ │ └── __init__.py
│ ├── __init__.py
│ └── apps.py
├── workers
│ ├── migrations
│ │ └── __init__.py
│ ├── apps.py
│ ├── __init__.py
│ ├── management
│ │ └── commands
│ │ │ └── orchestrator.py
│ ├── views.py
│ ├── admin.py
│ └── tests.py
├── base_models
│ ├── migrations
│ │ └── __init__.py
│ ├── __init__.py
│ └── apps.py
├── templates
│ ├── admin
│ │ ├── actions_as_select.html
│ │ └── app_index.html
│ ├── static
│ │ ├── robots.txt
│ │ ├── archive.png
│ │ ├── external.png
│ │ ├── favicon.ico
│ │ ├── sort_asc.png
│ │ ├── sort_both.png
│ │ ├── sort_desc.png
│ │ └── spinner.gif
│ └── core
│ │ ├── minimal_index.html
│ │ └── navigation.html
├── misc
│ ├── __init__.py
│ ├── debugging.py
│ └── paginators.py
├── mypy.ini
├── .flake8
├── __main__.py
├── cli
│ ├── archivebox_shell.py
│ ├── archivebox_worker.py
│ ├── archivebox_manage.py
│ └── archivebox_extract.py
├── parsers
│ ├── generic_jsonl.py
│ ├── url_list.py
│ ├── generic_rss.py
│ ├── medium_rss.py
│ ├── netscape_html.py
│ └── pinboard_rss.py
├── index
│ └── csv.py
├── search
│ └── admin.py
└── config
│ └── __init__.py
├── website
├── CNAME
├── README.md
├── icon.png
├── _config.yml
└── assets
│ ├── README.md
│ └── css
│ └── style.scss
├── .gitmodules
├── .github
├── FUNDING.yml
├── PULL_REQUEST_TEMPLATE.md
├── .readthedocs.yaml
├── dependabot.yml
├── ISSUE_TEMPLATE
│ └── config.yml
├── workflows
│ └── lint.yml
└── CONTRIBUTING.md
├── etc
├── uwsgi.ini
├── package.json
├── README.md
├── crontabs
│ └── archivebox
├── fly.toml
├── archivebox.service
├── sonic.cfg
└── nginx.conf
├── bin
├── test.sh
├── release_pip.sh
├── lint.sh
├── build_pip.sh
├── build.sh
├── release_docs.sh
├── release_git.sh
├── build_docs.sh
├── release.sh
└── build_git.sh
├── .dockerignore
├── .gitignore
└── LICENSE
/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/archivebox/core/actors.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/archivebox/tags/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/website/CNAME:
--------------------------------------------------------------------------------
1 | archivebox.io
--------------------------------------------------------------------------------
/archivebox/README.md:
--------------------------------------------------------------------------------
1 | ../README.md
--------------------------------------------------------------------------------
/archivebox/personas/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx/README.md:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/archivebox/static:
--------------------------------------------------------------------------------
1 | templates/static
--------------------------------------------------------------------------------
/tests/mock_server/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/website/README.md:
--------------------------------------------------------------------------------
1 | ../README.md
--------------------------------------------------------------------------------
/archivebox/api/migrations/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/archivebox/tags/migrations/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/archivebox/core/templatetags/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/archivebox/crawls/migrations/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/archivebox/machine/migrations/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/archivebox/personas/migrations/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-chrome/README.md:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-curl/README.md:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-git/README.md:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-npm/README.md:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-pip/README.md:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-pocket/README.md:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-title/README.md:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-wget/README.md:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-ytdlp/README.md:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-spec-abx-pkg/README.md:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-spec-config/README.md:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-spec-django/README.md:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/archivebox/workers/migrations/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/archivebox/base_models/migrations/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-favicon/README.md:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-htmltotext/README.md:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-ldap-auth/README.md:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-mercury/README.md:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-playwright/README.md:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-puppeteer/README.md:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-readability/README.md:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-readwise/README.md:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-singlefile/README.md:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-spec-archivebox/README.md:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-spec-extractor/README.md:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-spec-searchbackend/README.md:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-archivedotorg/README.md:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-ripgrep-search/README.md:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-sonic-search/README.md:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-sqlitefts-search/README.md:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/archivebox/templates/admin/actions_as_select.html:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-default-binproviders/README.md:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/archivebox/api/__init__.py:
--------------------------------------------------------------------------------
1 | __package__ = 'archivebox.api'
2 |
--------------------------------------------------------------------------------
/archivebox/misc/__init__.py:
--------------------------------------------------------------------------------
1 | __package__ = 'archivebox.misc'
2 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-pip/abx_plugin_pip/.plugin_order:
--------------------------------------------------------------------------------
1 | 400
2 |
--------------------------------------------------------------------------------
/archivebox/machine/__init__.py:
--------------------------------------------------------------------------------
1 | __package__ = 'archivebox.machine'
2 |
--------------------------------------------------------------------------------
/archivebox/templates/static/robots.txt:
--------------------------------------------------------------------------------
1 | User-agent: *
2 | Disallow: /
3 |
--------------------------------------------------------------------------------
/archivebox/base_models/__init__.py:
--------------------------------------------------------------------------------
1 | __package__ = 'archivebox.base_models'
2 |
--------------------------------------------------------------------------------
/archivebox/mypy.ini:
--------------------------------------------------------------------------------
1 | [mypy]
2 | plugins =
3 | mypy_django_plugin.main
4 |
--------------------------------------------------------------------------------
/website/icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/localnerve/ArchiveBox/dev/website/icon.png
--------------------------------------------------------------------------------
/archivebox/core/tests.py:
--------------------------------------------------------------------------------
1 | #from django.test import TestCase
2 |
3 | # Create your tests here.
4 |
--------------------------------------------------------------------------------
/archivebox/crawls/tests.py:
--------------------------------------------------------------------------------
1 | from django.test import TestCase
2 |
3 | # Create your tests here.
4 |
--------------------------------------------------------------------------------
/archivebox/crawls/views.py:
--------------------------------------------------------------------------------
1 | from django.shortcuts import render
2 |
3 | # Create your views here.
4 |
--------------------------------------------------------------------------------
/archivebox/personas/admin.py:
--------------------------------------------------------------------------------
1 | from django.contrib import admin
2 |
3 | # Register your models here.
4 |
--------------------------------------------------------------------------------
/archivebox/personas/tests.py:
--------------------------------------------------------------------------------
1 | from django.test import TestCase
2 |
3 | # Create your tests here.
4 |
--------------------------------------------------------------------------------
/archivebox/personas/views.py:
--------------------------------------------------------------------------------
1 | from django.shortcuts import render
2 |
3 | # Create your views here.
4 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "docs"]
2 | path = docs
3 | url = https://github.com/ArchiveBox/ArchiveBox.wiki.git
4 |
--------------------------------------------------------------------------------
/tests/tags_migration/index.sqlite3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/localnerve/ArchiveBox/dev/tests/tags_migration/index.sqlite3
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | github: ["ArchiveBox", "pirate"]
2 | custom: ["https://donate.archivebox.io", "https://swag.archivebox.io"]
3 |
--------------------------------------------------------------------------------
/archivebox/templates/static/archive.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/localnerve/ArchiveBox/dev/archivebox/templates/static/archive.png
--------------------------------------------------------------------------------
/archivebox/templates/static/external.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/localnerve/ArchiveBox/dev/archivebox/templates/static/external.png
--------------------------------------------------------------------------------
/archivebox/templates/static/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/localnerve/ArchiveBox/dev/archivebox/templates/static/favicon.ico
--------------------------------------------------------------------------------
/archivebox/templates/static/sort_asc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/localnerve/ArchiveBox/dev/archivebox/templates/static/sort_asc.png
--------------------------------------------------------------------------------
/archivebox/templates/static/sort_both.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/localnerve/ArchiveBox/dev/archivebox/templates/static/sort_both.png
--------------------------------------------------------------------------------
/archivebox/templates/static/sort_desc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/localnerve/ArchiveBox/dev/archivebox/templates/static/sort_desc.png
--------------------------------------------------------------------------------
/archivebox/templates/static/spinner.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/localnerve/ArchiveBox/dev/archivebox/templates/static/spinner.gif
--------------------------------------------------------------------------------
/tests/mock_server/templates/shift_jis.html:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/localnerve/ArchiveBox/dev/tests/mock_server/templates/shift_jis.html
--------------------------------------------------------------------------------
/website/_config.yml:
--------------------------------------------------------------------------------
1 | production_url: https://archivebox.io
2 | theme: jekyll-theme-merlot
3 | # Github Pages static site settings for https://archivebox.io
4 |
--------------------------------------------------------------------------------
/tests/mock_server/templates/malformed.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
5 | malformed document
7 |
8 |
9 |
--------------------------------------------------------------------------------
/archivebox/crawls/apps.py:
--------------------------------------------------------------------------------
from django.apps import AppConfig


class CrawlsConfig(AppConfig):
    """Django application configuration for the ``crawls`` app."""

    # Python path of the application this config applies to.
    name = "crawls"
    # Implicit primary keys on this app's models use BigAutoField.
    default_auto_field = "django.db.models.BigAutoField"
--------------------------------------------------------------------------------
/archivebox/tags/apps.py:
--------------------------------------------------------------------------------
from django.apps import AppConfig


class TagsConfig(AppConfig):
    """Django application configuration for the ``tags`` app."""

    # Python path of the application this config applies to.
    name = 'tags'
    # Implicit primary keys on this app's models use BigAutoField.
    default_auto_field = 'django.db.models.BigAutoField'
--------------------------------------------------------------------------------
/archivebox/personas/apps.py:
--------------------------------------------------------------------------------
from django.apps import AppConfig


# NOTE(review): the class is called ``SessionsConfig`` but configures the
# ``personas`` app; presumably a leftover from an earlier app name. Renaming it
# would change the public name, so confirm no settings reference it first.
class SessionsConfig(AppConfig):
    """Django application configuration for the ``personas`` app."""

    # Python path of the application this config applies to.
    name = "personas"
    # Implicit primary keys on this app's models use BigAutoField.
    default_auto_field = "django.db.models.BigAutoField"
--------------------------------------------------------------------------------
/archivebox/workers/apps.py:
--------------------------------------------------------------------------------
from django.apps import AppConfig


class WorkersConfig(AppConfig):
    """Django application configuration for the ``workers`` app."""

    # Python path of the application this config applies to.
    name = 'workers'
    # Implicit primary keys on this app's models use BigAutoField.
    default_auto_field = 'django.db.models.BigAutoField'
--------------------------------------------------------------------------------
/archivebox/base_models/apps.py:
--------------------------------------------------------------------------------
1 | # from django.apps import AppConfig
2 |
3 |
4 | # class AbidUtilsConfig(AppConfig):
5 | # default_auto_field = 'django.db.models.BigAutoField'
6 |
7 | # name = 'base_models'
8 |
--------------------------------------------------------------------------------
/website/assets/README.md:
--------------------------------------------------------------------------------
1 | # assets/
2 |
3 | This folder contains assets used by the Jekyll Static Site Generator for ArchiveBox.io.
4 |
5 | It cannot be moved or renamed or the custom CSS on ArchiveBox.io will break.
6 |
--------------------------------------------------------------------------------
/archivebox/workers/__init__.py:
--------------------------------------------------------------------------------
__package__ = 'archivebox.workers'
__order__ = 100

import abx


@abx.hookimpl
def register_admin(admin_site):
    """abx hook implementation: delegate to ``workers.admin.register_admin``.

    ``admin_site`` is passed through unchanged.
    """
    # Imported inside the hook rather than at module level, and aliased so the
    # import does not rebind this function's own name in the local scope.
    from workers.admin import register_admin as register_workers_admin

    register_workers_admin(admin_site)
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-playwright/abx_plugin_playwright/config.py:
--------------------------------------------------------------------------------
from abx_spec_config import BaseConfigSet


class PlaywrightConfigs(BaseConfigSet):
    """Config set holding the playwright plugin's settings."""

    # Presumably the executable name looked up for playwright; verify
    # against the plugin's binaries module.
    PLAYWRIGHT_BINARY: str = 'playwright'


# Module-level instance created at import time.
PLAYWRIGHT_CONFIG = PlaywrightConfigs()
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-htmltotext/abx_plugin_htmltotext/config.py:
--------------------------------------------------------------------------------
from abx_spec_config.base_configset import BaseConfigSet


class HtmltotextConfig(BaseConfigSet):
    """Config set holding the htmltotext plugin's settings."""

    # Boolean switch, on by default; presumably toggles the htmltotext
    # output -- confirm against the code that reads it.
    SAVE_HTMLTOTEXT: bool = True


# Module-level instance created at import time.
HTMLTOTEXT_CONFIG = HtmltotextConfig()
--------------------------------------------------------------------------------
/archivebox/crawls/__init__.py:
--------------------------------------------------------------------------------
__package__ = 'archivebox.crawls'
__order__ = 100

import abx


@abx.hookimpl
def register_admin(admin_site):
    """abx hook implementation: delegate to this package's ``admin.register_admin``.

    ``admin_site`` is passed through unchanged.
    """
    # Imported inside the hook rather than at module level; the sibling
    # ``admin`` module is resolved relative to this package.
    from . import admin as crawls_admin

    crawls_admin.register_admin(admin_site)
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-archivedotorg/abx_plugin_archivedotorg/config.py:
--------------------------------------------------------------------------------
from abx_spec_config.base_configset import BaseConfigSet


class ArchivedotorgConfig(BaseConfigSet):
    """Config set holding the archivedotorg plugin's settings."""

    # Boolean switch, on by default; presumably toggles submission to
    # archive.org -- confirm against the code that reads it.
    SAVE_ARCHIVE_DOT_ORG: bool = True


# Module-level instance created at import time.
ARCHIVEDOTORG_CONFIG = ArchivedotorgConfig()
--------------------------------------------------------------------------------
/archivebox/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | ignore = D100,D101,D102,D103,D104,D105,D202,D203,D205,D400,E131,E241,E252,E266,E272,E701,E731,W293,W503,W291,W391
3 | select = F,E9,W
4 | max-line-length = 130
5 | max-complexity = 10
6 | exclude = migrations,tests,node_modules,vendor,venv,.venv,.venv2,.docker-venv,data,data*
7 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-title/abx_plugin_title/extractors.py:
--------------------------------------------------------------------------------
1 | __package__ = 'abx_plugin_title'
2 |
3 | from abx_spec_extractor import BaseExtractor, ExtractorName
4 |
5 |
6 |
class TitleExtractor(BaseExtractor):
    """Extractor declaration for the title plugin; only sets the extractor name."""

    name: ExtractorName = 'title'


# Module-level instance (imported by the plugin's get_EXTRACTORS hook).
TITLE_EXTRACTOR = TitleExtractor()
11 |
--------------------------------------------------------------------------------
/archivebox/api/apps.py:
--------------------------------------------------------------------------------
1 | __package__ = 'archivebox.api'
2 |
3 | from django.apps import AppConfig
4 |
5 | import abx
6 |
7 |
class APIConfig(AppConfig):
    """Django ``AppConfig`` for the ``api`` app."""

    name = 'api'
10 |
11 |
@abx.hookimpl
def register_admin(admin_site):
    """abx hook: delegate admin registration to ``api.admin``.

    The import is performed at call time, inside the hook, rather than when
    this module is imported.
    """
    from api.admin import register_admin as register_api_admin

    register_api_admin(admin_site)
16 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-favicon/abx_plugin_favicon/config.py:
--------------------------------------------------------------------------------
1 | from abx_spec_config.base_configset import BaseConfigSet
2 |
3 |
class FaviconConfig(BaseConfigSet):
    """Config set for the favicon plugin."""

    # On by default.
    SAVE_FAVICON: bool = True

    # URL template with a single '{}' placeholder in the ``domain`` query
    # parameter (presumably filled with the snapshot's domain -- confirm at
    # the call site).
    FAVICON_PROVIDER: str = 'https://www.google.com/s2/favicons?domain={}'


# Module-level instance built from the defaults above.
FAVICON_CONFIG = FaviconConfig()
11 |
--------------------------------------------------------------------------------
/tests/mock_server/templates/example-single.jsonl:
--------------------------------------------------------------------------------
1 | {"href":"http://127.0.0.1:8080/static/example.com.html","description":"Example","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"Tag1 Tag2","trap":"http://www.example.com/should-not-exist"}
2 |
--------------------------------------------------------------------------------
/etc/uwsgi.ini:
--------------------------------------------------------------------------------
1 | [uwsgi]
2 | socket = 127.0.0.1:3031
3 | chdir = ../
4 | http = 0.0.0.0:8001
5 | env = DATA_DIR=./data
6 | wsgi-file = archivebox/core/wsgi.py
7 | processes = 4
8 | threads = 1
9 | stats = 127.0.0.1:9191
10 | static-map = /static=./archivebox/templates/static
11 | harakiri = 172800
12 | post-buffering = 1
13 | disable-logging = True
14 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-curl/abx_plugin_curl/__init__.py:
--------------------------------------------------------------------------------
1 | import abx
2 |
3 |
@abx.hookimpl
def get_CONFIG():
    """abx hook: return this plugin's config set keyed by ``'curl'``."""
    from .config import CURL_CONFIG

    configs = {}
    configs['curl'] = CURL_CONFIG
    return configs
11 |
@abx.hookimpl
def get_BINARIES():
    """abx hook: return this plugin's binary keyed by ``'curl'``."""
    from .binaries import CURL_BINARY

    binaries = {}
    binaries['curl'] = CURL_BINARY
    return binaries
19 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "abx"
3 | version = "0.1.0"
4 | description = "The common shared interfaces for the ABX ArchiveBox plugin ecosystem."
5 | readme = "README.md"
6 | requires-python = ">=3.10"
7 | dependencies = [
8 | "pluggy>=1.5.0",
9 | "django>=5.1.1,<6.0",
10 | ]
11 |
12 | [build-system]
13 | requires = ["hatchling"]
14 | build-backend = "hatchling.build"
15 |
--------------------------------------------------------------------------------
/etc/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "archivebox",
3 | "version": "0.0.1",
4 | "repository": "github:ArchiveBox/ArchiveBox",
5 | "license": "MIT",
6 | "dependencies": {
7 | "@postlight/parser": "^2.2.3",
8 | "readability-extractor": "github:ArchiveBox/readability-extractor",
9 | "single-file-cli": "^1.1.54",
10 | "puppeteer": "^23.5.0",
11 | "@puppeteer/browsers": "^2.4.0"
12 | }
13 | }
14 |
--------------------------------------------------------------------------------
/tests/mock_server/templates/example.json.bad:
--------------------------------------------------------------------------------
1 | this line would cause problems but --parser=json will actually skip it
2 | [{"href":"http://127.0.0.1:8080/static/example.com.html","description":"Example","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"Tag1 Tag2","trap":"http://www.example.com/should-not-exist"}]
3 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0056_remove_tag_uuid.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.0.6 on 2024-08-20 03:25
2 |
3 | from django.db import migrations
4 |
5 |
class Migration(migrations.Migration):
    """Remove the ``uuid`` field from the ``tag`` model."""

    # Runs after core.0055_alter_tag_slug.
    dependencies = [('core', '0055_alter_tag_slug')]

    operations = [
        migrations.RemoveField(model_name='tag', name='uuid'),
    ]
18 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-pocket/abx_plugin_pocket/__init__.py:
--------------------------------------------------------------------------------
1 | __package__ = 'abx_plugin_pocket'
2 | __label__ = 'Pocket'
3 |
4 | import abx
5 |
6 |
@abx.hookimpl
def get_CONFIG():
    """abx hook: return this plugin's config set keyed by ``'POCKET_CONFIG'``."""
    from .config import POCKET_CONFIG

    configs = {}
    configs['POCKET_CONFIG'] = POCKET_CONFIG
    return configs
14 |
@abx.hookimpl
def ready():
    """abx hook: call ``validate()`` on the Pocket config set."""
    from . import config as pocket_config

    pocket_config.POCKET_CONFIG.validate()
19 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-title/abx_plugin_title/__init__.py:
--------------------------------------------------------------------------------
1 | import abx
2 |
3 | # @abx.hookimpl
4 | # def get_CONFIG():
5 | # from .config import TITLE_EXTRACTOR_CONFIG
6 |
7 | # return {
8 | # 'title_extractor': TITLE_EXTRACTOR_CONFIG
9 | # }
10 |
11 |
@abx.hookimpl
def get_EXTRACTORS():
    """abx hook: return this plugin's extractor keyed by ``'title'``."""
    from .extractors import TITLE_EXTRACTOR

    extractors = {}
    extractors['title'] = TITLE_EXTRACTOR
    return extractors
18 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-favicon/abx_plugin_favicon/models.py:
--------------------------------------------------------------------------------
1 | # from django.db import models
2 |
3 | # from core.models import ArchiveResult
4 |
5 | # class FaviconResultManager(models.Manager):
6 | # def get_queryset(self):
7 | # return super().get_queryset().filter(extractor='favicon')
8 |
9 |
10 | # class FaviconResult(ArchiveResult):
11 | # objects = FaviconResultManager()
12 |
13 | # class Meta:
14 | # proxy = True
15 |
--------------------------------------------------------------------------------
/archivebox/machine/apps.py:
--------------------------------------------------------------------------------
1 | __package__ = 'archivebox.machine'
2 |
3 | from django.apps import AppConfig
4 |
5 | import abx
6 |
7 |
class MachineConfig(AppConfig):
    """Django ``AppConfig`` for the ``machine`` app."""

    name = 'machine'
    verbose_name = 'Machine Info'

    # Primary-key field type used for models that do not declare one.
    default_auto_field = 'django.db.models.BigAutoField'
13 |
14 |
@abx.hookimpl
def register_admin(admin_site):
    """abx hook: delegate admin registration to ``machine.admin``.

    The import is performed at call time, inside the hook, rather than when
    this module is imported.
    """
    from machine.admin import register_admin as register_machine_admin

    register_machine_admin(admin_site)
19 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-pocket/abx_plugin_pocket/config.py:
--------------------------------------------------------------------------------
1 | from typing import Dict
2 | from pydantic import Field
3 |
4 | from abx_spec_config import BaseConfigSet
5 |
6 |
class PocketConfig(BaseConfigSet):
    """Config set for the Pocket plugin."""

    # Unset (None) by default.
    POCKET_CONSUMER_KEY: str | None = Field(default=None)

    # Use default_factory so each instance gets a fresh empty dict.
    # `Field(default=dict)` would make the `dict` *class* the default value,
    # which does not match the declared Dict[str, str] type.
    # Presumably shaped like {<username>: <access_token>, ...} -- the original
    # comment's placeholders were lost; confirm against the Pocket parser.
    POCKET_ACCESS_TOKENS: Dict[str, str] = Field(default_factory=dict)


# Module-level instance built from the defaults above.
POCKET_CONFIG = PocketConfig()
13 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-spec-django/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "abx-spec-django"
3 | version = "0.1.0"
4 | description = "Add your description here"
5 | readme = "README.md"
6 | requires-python = ">=3.10"
7 | dependencies = [
8 | "abx>=0.1.0",
9 | "django>=5.1.1,<6.0",
10 | ]
11 |
12 | [build-system]
13 | requires = ["hatchling"]
14 | build-backend = "hatchling.build"
15 |
16 | [project.entry-points.abx]
17 | abx_spec_django = "abx_spec_django"
18 |
--------------------------------------------------------------------------------
/tests/test_util.py:
--------------------------------------------------------------------------------
1 | from archivebox import util
2 |
def test_download_url_downloads_content():
    """``util.download_url`` returns the body of the mock server's example page."""
    body = util.download_url("http://127.0.0.1:8080/static/example.com.html")
    assert "Example Domain" in body
6 |
def test_download_url_gets_encoding_from_body():
    """The Japanese text of ``shift_jis.html`` is present in the downloaded text.

    The page is fetched from the mock server's ``static_no_content_type`` path.
    """
    body = util.download_url("http://127.0.0.1:8080/static_no_content_type/shift_jis.html")
    expected_fragments = (
        "鹿児島のニュース|MBC南日本放送",
        "掲載された全ての記事・画像等の無断転載、二次利用をお断りいたします",
    )
    for fragment in expected_fragments:
        assert fragment in body
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-singlefile/abx_plugin_singlefile/models.py:
--------------------------------------------------------------------------------
1 | # from django.db import models
2 |
3 | # from core.models import ArchiveResult
4 |
5 | # class SinglefileResultManager(models.Manager):
6 | # def get_queryset(self):
7 | # return super().get_queryset().filter(extractor='singlefile')
8 |
9 |
10 | # class SinglefileResult(ArchiveResult):
11 | # objects = SinglefileResultManager()
12 |
13 | # class Meta:
14 | # proxy = True
15 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0057_rename_id_tag_old_id.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.0.6 on 2024-08-20 03:29
2 |
3 | from django.db import migrations
4 |
5 |
class Migration(migrations.Migration):
    """Rename ``tag.id`` to ``tag.old_id``."""

    # Runs after core.0056_remove_tag_uuid.
    dependencies = [('core', '0056_remove_tag_uuid')]

    operations = [
        migrations.RenameField(model_name='tag', old_name='id', new_name='old_id'),
    ]
19 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-mercury/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "abx-plugin-mercury"
3 | version = "2024.10.28"
4 | description = "Add your description here"
5 | readme = "README.md"
6 | requires-python = ">=3.10"
7 | dependencies = [
8 | "abx>=0.1.0",
9 | "abx-spec-config>=0.1.0",
10 | ]
11 |
12 | [build-system]
13 | requires = ["hatchling"]
14 | build-backend = "hatchling.build"
15 |
16 | [project.entry-points.abx]
17 | abx_plugin_mercury = "abx_plugin_mercury"
18 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0065_remove_snapshottag_old_tag.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.0.6 on 2024-08-20 03:51
2 |
3 | from django.db import migrations
4 |
5 |
class Migration(migrations.Migration):
    """Remove the ``old_tag`` field from the ``snapshottag`` model."""

    # Runs after core.0064_alter_snapshottag_unique_together_and_more.
    dependencies = [('core', '0064_alter_snapshottag_unique_together_and_more')]

    operations = [
        migrations.RemoveField(model_name='snapshottag', name='old_tag'),
    ]
18 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-readwise/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "abx-plugin-readwise"
3 | version = "2024.10.28"
4 | description = "Readwise API Extractor"
5 | readme = "README.md"
6 | requires-python = ">=3.10"
7 | dependencies = [
8 | "abx>=0.1.0",
9 | "abx-spec-config>=0.1.0",
10 | ]
11 |
12 | [build-system]
13 | requires = ["hatchling"]
14 | build-backend = "hatchling.build"
15 |
16 | [project.entry-points.abx]
17 | abx_plugin_readwise = "abx_plugin_readwise"
18 |
19 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0038_rename_uuid_snapshot_id.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.0.6 on 2024-08-18 06:09
2 |
3 | from django.db import migrations
4 |
5 |
class Migration(migrations.Migration):
    """Rename ``snapshot.uuid`` to ``snapshot.id``."""

    # Runs after core.0037_rename_id_snapshot_old_id.
    dependencies = [('core', '0037_rename_id_snapshot_old_id')]

    operations = [
        migrations.RenameField(model_name='snapshot', old_name='uuid', new_name='id'),
    ]
19 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0042_remove_archiveresult_snapshot_old.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.0.6 on 2024-08-18 06:51
2 |
3 | from django.db import migrations
4 |
5 |
class Migration(migrations.Migration):
    """Remove the ``snapshot_old`` field from the ``archiveresult`` model."""

    # Runs after core.0041_alter_archiveresult_snapshot_and_more.
    dependencies = [('core', '0041_alter_archiveresult_snapshot_and_more')]

    operations = [
        migrations.RemoveField(model_name='archiveresult', name='snapshot_old'),
    ]
18 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0053_remove_snapshottag_snapshot_old.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.0.6 on 2024-08-20 02:38
2 |
3 | from django.db import migrations
4 |
5 |
class Migration(migrations.Migration):
    """Remove the ``snapshot_old`` field from the ``snapshottag`` model."""

    # Runs after core.0052_alter_snapshottag_unique_together_and_more.
    dependencies = [('core', '0052_alter_snapshottag_unique_together_and_more')]

    operations = [
        migrations.RemoveField(model_name='snapshottag', name='snapshot_old'),
    ]
18 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-htmltotext/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "abx-plugin-htmltotext"
3 | version = "2024.10.28"
4 | description = "Add your description here"
5 | readme = "README.md"
6 | requires-python = ">=3.10"
7 | dependencies = [
8 | "abx>=0.1.0",
9 | "abx-spec-config>=0.1.0",
10 | ]
11 |
12 | [build-system]
13 | requires = ["hatchling"]
14 | build-backend = "hatchling.build"
15 |
16 | [project.entry-points.abx]
17 | abx_plugin_htmltotext = "abx_plugin_htmltotext"
18 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-spec-abx-pkg/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "abx-spec-abx-pkg"
3 | version = "0.1.1"
4 | description = "The ABX plugin specification for Binaries and BinProviders"
5 | readme = "README.md"
6 | requires-python = ">=3.10"
7 | dependencies = [
8 | "abx>=0.1.0",
9 | "abx-pkg>=0.6.0",
10 | ]
11 |
12 | [build-system]
13 | requires = ["hatchling"]
14 | build-backend = "hatchling.build"
15 |
16 | [project.entry-points.abx]
17 | abx_spec_abx_pkg = "abx_spec_abx_pkg"
18 |
--------------------------------------------------------------------------------
/archivebox/api/migrations/0002_alter_apitoken_options.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.0.4 on 2024-04-26 05:28
2 |
3 | from django.db import migrations
4 |
5 |
class Migration(migrations.Migration):
    """Set the verbose names of the ``apitoken`` model to 'API Key' / 'API Keys'."""

    # Runs after api.0001_initial.
    dependencies = [('api', '0001_initial')]

    operations = [
        migrations.AlterModelOptions(
            name='apitoken',
            options={
                'verbose_name': 'API Key',
                'verbose_name_plural': 'API Keys',
            },
        ),
    ]
18 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-git/abx_plugin_git/extractors.py:
--------------------------------------------------------------------------------
1 | __package__ = 'abx_plugin_git'
2 |
3 |
4 | from abx_pkg import BinName
5 |
6 | from abx_spec_extractor import BaseExtractor, ExtractorName
7 |
8 | from .binaries import GIT_BINARY
9 |
10 |
class GitExtractor(BaseExtractor):
    """Extractor declaration for the git plugin."""

    name: ExtractorName = 'git'
    # Takes its value from the plugin's GIT_BINARY definition.
    binary: BinName = GIT_BINARY.name

    def get_output_path(self, snapshot) -> str:
        """Return the fixed output path ``'git'``; ``snapshot`` is not consulted."""
        output_path = 'git'
        return output_path


# Module-level instance of the extractor.
GIT_EXTRACTOR = GitExtractor()
19 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-readability/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "abx-plugin-readability"
3 | version = "2024.10.28"
4 | description = "Add your description here"
5 | readme = "README.md"
6 | requires-python = ">=3.10"
7 | dependencies = [
8 | "abx>=0.1.0",
9 | "abx-spec-config>=0.1.0",
10 | ]
11 |
12 | [build-system]
13 | requires = ["hatchling"]
14 | build-backend = "hatchling.build"
15 |
16 | [project.entry-points.abx]
17 | abx_plugin_readability = "abx_plugin_readability"
18 |
--------------------------------------------------------------------------------
/archivebox/templates/admin/app_index.html:
--------------------------------------------------------------------------------
1 | {% extends "admin/index.html" %}
2 | {% load i18n %}
3 |
4 | {% block bodyclass %}{{ block.super }} app-{{ app_label }}{% endblock %}
5 |
6 | {% if not is_popup %}
7 | {% block breadcrumbs %}
8 | <div class="breadcrumbs">
9 | <a href="{% url 'admin:index' %}">{% trans 'Home' %}</a>
10 | &rsaquo;
11 | {% for app in app_list %}
12 | {{ app.name }}
13 | {% endfor %}
14 | </div>
15 | {% endblock %}
16 | {% endif %}
17 |
18 | {% block sidebar %}{% endblock %}
19 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0033_rename_id_archiveresult_old_id.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.0.6 on 2024-08-18 05:34
2 |
3 | from django.db import migrations
4 |
5 |
class Migration(migrations.Migration):
    """Rename ``archiveresult.id`` to ``archiveresult.old_id``."""

    # Runs after core.0032_alter_archiveresult_id.
    dependencies = [('core', '0032_alter_archiveresult_id')]

    operations = [
        migrations.RenameField(model_name='archiveresult', old_name='id', new_name='old_id'),
    ]
19 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-curl/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "abx-plugin-curl"
3 | version = "2024.10.28"
4 | description = "Add your description here"
5 | readme = "README.md"
6 | requires-python = ">=3.10"
7 | dependencies = [
8 | "abx>=0.1.0",
9 | "abx-spec-config>=0.1.0",
10 | "abx-spec-abx-pkg>=0.1.0",
11 | ]
12 |
13 | [build-system]
14 | requires = ["hatchling"]
15 | build-backend = "hatchling.build"
16 |
17 | [project.entry-points.abx]
18 | abx_plugin_curl = "abx_plugin_curl"
19 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-pocket/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "abx-plugin-pocket"
3 | version = "2024.10.28"
4 | description = "Add your description here"
5 | readme = "README.md"
6 | requires-python = ">=3.10"
7 | dependencies = [
8 | "abx>=0.1.0",
9 | "abx-spec-config>=0.1.0",
10 | "pocket>=0.3.6",
11 | ]
12 |
13 | [build-system]
14 | requires = ["hatchling"]
15 | build-backend = "hatchling.build"
16 |
17 | [project.entry-points.abx]
18 | abx_plugin_pocket = "abx_plugin_pocket"
19 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-title/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "abx-plugin-title"
3 | version = "2024.10.27"
4 | description = "Title Extractor"
5 | readme = "README.md"
6 | requires-python = ">=3.10"
7 | dependencies = [
8 | "abx>=0.1.0",
9 | "abx-spec-config>=0.1.0",
10 | "abx-plugin-curl>=2024.10.28",
11 | ]
12 |
13 | [build-system]
14 | requires = ["hatchling"]
15 | build-backend = "hatchling.build"
16 |
17 | [project.entry-points.abx]
18 | abx_plugin_title = "abx_plugin_title"
19 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-wget/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "abx-plugin-wget"
3 | version = "2024.10.28"
4 | description = "Add your description here"
5 | readme = "README.md"
6 | requires-python = ">=3.10"
7 | dependencies = [
8 | "abx>=0.1.0",
9 | "abx-spec-config>=0.1.0",
10 | "abx-spec-abx-pkg>=0.1.0",
11 | ]
12 |
13 | [build-system]
14 | requires = ["hatchling"]
15 | build-backend = "hatchling.build"
16 |
17 | [project.entry-points.abx]
18 | abx_plugin_wget = "abx_plugin_wget"
19 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-spec-extractor/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "abx-spec-extractor"
3 | version = "0.1.0"
4 | description = "Add your description here"
5 | readme = "README.md"
6 | requires-python = ">=3.10"
7 | dependencies = [
8 | "abx>=0.1.0",
9 | "python-benedict>=0.26.0",
10 | "pydantic>=2.5.0",
11 | ]
12 |
13 | [build-system]
14 | requires = ["hatchling"]
15 | build-backend = "hatchling.build"
16 |
17 | [project.entry-points.abx]
18 | abx_spec_extractor = "abx_spec_extractor"
19 |
--------------------------------------------------------------------------------
/bin/test.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

# Activate the repository's .venv and run pytest, forwarding all script
# arguments to pytest unchanged.

### Bash Environment Setup
# http://redsymbol.net/articles/unofficial-bash-strict-mode/
# https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html
# set -o xtrace
set -o errexit -o errtrace -o nounset -o pipefail
IFS=$'\n'

# DIR is the parent of the directory that contains this script.
DIR="$(
    cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 \
        && cd .. \
        && pwd
)"

source "$DIR/.venv/bin/activate"

# -s disables output capture; temp files go under tests/out (relative to the
# current working directory).
pytest -s --basetemp=tests/out "$@"
18 |
--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | from multiprocessing import Process
2 |
3 | import pytest
4 | from .mock_server.server import start
5 |
6 | server_process = None
7 |
@pytest.hookimpl
def pytest_sessionstart(session):
    """Start the mock server in a separate process when the test session starts.

    The process handle is stored in the module-level ``server_process`` so
    that ``pytest_sessionfinish`` can stop it.
    """
    global server_process

    server_process = Process(target=start)
    server_process.start()
13 |
@pytest.hookimpl
def pytest_sessionfinish(session):
    """Terminate the mock server process, if one was started, and wait for it to exit."""
    if server_process is None:
        # Nothing was started, so there is nothing to stop.
        return

    server_process.terminate()
    server_process.join()
19 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-chrome/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "abx-plugin-chrome"
3 | version = "2024.10.28"
4 | description = "Add your description here"
5 | readme = "README.md"
6 | requires-python = ">=3.10"
7 | dependencies = [
8 | "abx>=0.1.0",
9 | "abx-spec-config>=0.1.0",
10 | "abx-spec-abx-pkg>=0.1.0",
11 | ]
12 |
13 | [build-system]
14 | requires = ["hatchling"]
15 | build-backend = "hatchling.build"
16 |
17 | [project.entry-points.abx]
18 | abx_plugin_chrome = "abx_plugin_chrome"
19 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0019_auto_20210401_0654.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 3.1.3 on 2021-04-01 06:54
2 |
3 | from django.db import migrations, models
4 |
5 |
class Migration(migrations.Migration):
    """Alter ``snapshot.url`` to an indexed, unique ``URLField``."""

    # Runs after core.0018_auto_20210327_0952.
    dependencies = [('core', '0018_auto_20210327_0952')]

    operations = [
        migrations.AlterField(
            model_name='snapshot',
            name='url',
            field=models.URLField(db_index=True, unique=True),
        ),
    ]
19 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-favicon/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "abx-plugin-favicon"
3 | version = "2024.10.28"
4 | description = "Add your description here"
5 | readme = "README.md"
6 | requires-python = ">=3.10"
7 | dependencies = [
8 | "abx>=0.1.0",
9 | "abx-spec-config>=0.1.0",
10 | "abx-plugin-curl>=2024.10.28",
11 | ]
12 |
13 | [build-system]
14 | requires = ["hatchling"]
15 | build-backend = "hatchling.build"
16 |
17 | [project.entry-points.abx]
18 | abx_plugin_favicon = "abx_plugin_favicon"
19 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-spec-archivebox/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "abx-spec-archivebox"
3 | version = "0.1.0"
4 | description = "The common shared interfaces for the ABX ArchiveBox plugin ecosystem."
5 | readme = "README.md"
6 | requires-python = ">=3.10"
7 | dependencies = [
8 | "abx>=0.1.0",
9 | "django>=5.1.1,<6.0",
10 | ]
11 |
12 | [build-system]
13 | requires = ["hatchling"]
14 | build-backend = "hatchling.build"
15 |
16 | [project.entry-points.abx]
17 | abx_spec_archivebox = "abx_spec_archivebox"
18 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-spec-searchbackend/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "abx-spec-searchbackend"
3 | version = "0.1.0"
4 | description = "Add your description here"
5 | readme = "README.md"
6 | requires-python = ">=3.10"
7 | dependencies = [
8 | "abx>=0.1.0",
9 | "python-benedict>=0.26.0",
10 | "pydantic>=2.5.0",
11 | ]
12 |
13 | [build-system]
14 | requires = ["hatchling"]
15 | build-backend = "hatchling.build"
16 |
17 | [project.entry-points.abx]
18 | abx_spec_searchbackend = "abx_spec_searchbackend"
19 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0010_auto_20210216_1055.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 3.1.3 on 2021-02-16 10:55
2 |
3 | from django.db import migrations, models
4 |
5 |
class Migration(migrations.Migration):
    """Alter ``archiveresult.start_ts`` to an indexed ``DateTimeField``."""

    # Runs after core.0009_auto_20210216_1038.
    dependencies = [('core', '0009_auto_20210216_1038')]

    operations = [
        migrations.AlterField(
            model_name='archiveresult',
            name='start_ts',
            field=models.DateTimeField(db_index=True),
        ),
    ]
19 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0030_alter_archiveresult_uuid.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.0.6 on 2024-08-18 05:00
2 |
3 | from django.db import migrations, models
4 |
5 |
class Migration(migrations.Migration):
    """Alter ``archiveresult.uuid`` to a unique ``UUIDField``."""

    # Runs after core.0029_alter_archiveresult_id.
    dependencies = [('core', '0029_alter_archiveresult_id')]

    operations = [
        migrations.AlterField(
            model_name='archiveresult',
            name='uuid',
            field=models.UUIDField(unique=True),
        ),
    ]
19 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0037_rename_id_snapshot_old_id.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.0.6 on 2024-08-18 06:08
2 |
3 | from django.db import migrations
4 |
5 |
class Migration(migrations.Migration):
    """Rename ``snapshot.id`` to ``snapshot.old_id``."""

    # Runs after core.0036_alter_archiveresult_id_alter_archiveresult_old_id.
    dependencies = [('core', '0036_alter_archiveresult_id_alter_archiveresult_old_id')]

    operations = [
        migrations.RenameField(model_name='snapshot', old_name='id', new_name='old_id'),
    ]
19 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-npm/abx_plugin_npm/config.py:
--------------------------------------------------------------------------------
1 | from abx_spec_config import BaseConfigSet
2 |
3 |
4 | ###################### Config ##########################
5 |
6 |
class NpmDependencyConfigs(BaseConfigSet):
    """Config set for the npm plugin; currently declares no options."""

    # Options below are disabled in the original source and kept for reference:
    # USE_NPM: bool = True
    # NPM_BINARY: str = Field(default='npm')
    # NPM_ARGS: Optional[List[str]] = Field(default=None)
    # NPM_EXTRA_ARGS: List[str] = []
    # NPM_DEFAULT_ARGS: List[str] = []


# Module-level instance of the (empty) config set.
NPM_CONFIG = NpmDependencyConfigs()
17 |
18 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0002_auto_20200625_1521.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 3.0.7 on 2020-06-25 15:21
2 |
3 | from django.db import migrations, models
4 |
5 |
class Migration(migrations.Migration):
    """Alter ``snapshot.timestamp`` to a nullable ``CharField`` (max 32 chars, default None)."""

    # Runs after core.0001_initial.
    dependencies = [('core', '0001_initial')]

    operations = [
        migrations.AlterField(
            model_name='snapshot',
            name='timestamp',
            field=models.CharField(default=None, max_length=32, null=True),
        ),
    ]
19 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0016_auto_20210218_1204.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 3.1.3 on 2021-02-18 12:04
2 |
3 | from django.db import migrations, models
4 |
5 |
class Migration(migrations.Migration):
    """Alter ``snapshot.tags`` to a ``ManyToManyField`` to ``core.Tag`` with ``blank=True``."""

    # Runs after core.0015_auto_20210218_0730.
    dependencies = [('core', '0015_auto_20210218_0730')]

    operations = [
        migrations.AlterField(
            model_name='snapshot',
            name='tags',
            field=models.ManyToManyField(blank=True, to='core.Tag'),
        ),
    ]
19 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0039_rename_snapshot_archiveresult_snapshot_old.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.0.6 on 2024-08-18 06:25
2 |
3 | from django.db import migrations
4 |
5 |
class Migration(migrations.Migration):
    """Rename ``archiveresult.snapshot`` to ``archiveresult.snapshot_old``."""

    # Runs after core.0038_rename_uuid_snapshot_id.
    dependencies = [('core', '0038_rename_uuid_snapshot_id')]

    operations = [
        migrations.RenameField(
            model_name='archiveresult',
            old_name='snapshot',
            new_name='snapshot_old',
        ),
    ]
19 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0060_alter_tag_id.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.0.6 on 2024-08-20 03:42
2 |
3 | import uuid
4 | from django.db import migrations, models
5 |
6 |
class Migration(migrations.Migration):
    """Alter ``tag.id`` to a unique, non-editable ``UUIDField`` defaulting to ``uuid.uuid4``."""

    # Runs after core.0059_tag_id.
    dependencies = [('core', '0059_tag_id')]

    operations = [
        migrations.AlterField(
            model_name='tag',
            name='id',
            field=models.UUIDField(default=uuid.uuid4, editable=False, unique=True),
        ),
    ]
20 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-git/abx_plugin_git/binaries.py:
--------------------------------------------------------------------------------
1 | __package__ = 'abx_plugin_git'
2 |
3 | from typing import List
4 |
5 | from pydantic import InstanceOf
6 | from abx_pkg import BinProvider, BinName, Binary
7 |
8 | from abx_plugin_default_binproviders import apt, brew, env
9 |
10 | from .config import GIT_CONFIG
11 |
12 |
13 |
# Binary definition for git: the name comes from GIT_CONFIG.GIT_BINARY and the
# supported providers are apt, brew and env.
class GitBinary(Binary):
    name: BinName = GIT_CONFIG.GIT_BINARY
    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]


# Shared module-level instance imported by the rest of the plugin.
GIT_BINARY = GitBinary()
19 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-ldap-auth/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "abx-plugin-ldap-auth"
3 | version = "2024.10.28"
4 | description = "Add your description here"
5 | readme = "README.md"
6 | requires-python = ">=3.10"
7 | dependencies = [
8 | "abx>=0.1.0",
9 | "abx-spec-config>=0.1.0",
10 | "abx-spec-django>=0.1.0",
11 | ]
12 |
13 |
14 | [build-system]
15 | requires = ["hatchling"]
16 | build-backend = "hatchling.build"
17 |
18 |
19 | [project.entry-points.abx]
20 | abx_plugin_ldap_auth = "abx_plugin_ldap_auth"
21 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-ytdlp/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "abx-plugin-ytdlp"
3 | version = "2024.10.28"
4 | description = "Add your description here"
5 | readme = "README.md"
6 | requires-python = ">=3.10"
7 | dependencies = [
8 | "abx>=0.1.0",
9 | "abx-spec-config>=0.1.0",
10 | "abx-spec-abx-pkg>=0.1.0",
11 | "abx-pkg>=0.5.4",
12 | ]
13 |
14 | [build-system]
15 | requires = ["hatchling"]
16 | build-backend = "hatchling.build"
17 |
18 | [project.entry-points.abx]
19 | abx_plugin_ytdlp = "abx_plugin_ytdlp"
20 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0055_alter_tag_slug.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.0.6 on 2024-08-20 03:24
2 |
3 | from django.db import migrations, models
4 |
5 |
class Migration(migrations.Migration):
    """Alter ``Tag.slug`` to a unique, non-editable ``SlugField`` (max 100 chars)."""

    dependencies = [('core', '0054_alter_snapshot_timestamp')]

    operations = [
        migrations.AlterField(
            model_name='tag',
            name='slug',
            field=models.SlugField(editable=False, max_length=100, unique=True),
        ),
    ]
19 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0068_alter_archiveresult_options.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.0.6 on 2024-08-20 07:26
2 |
3 | from django.db import migrations
4 |
5 |
class Migration(migrations.Migration):
    """Set the verbose names shown for the ``ArchiveResult`` model."""

    dependencies = [('core', '0067_alter_snapshottag_tag')]

    operations = [
        migrations.AlterModelOptions(
            name='archiveresult',
            options={
                'verbose_name': 'Archive Result',
                'verbose_name_plural': 'Archive Results Log',
            },
        ),
    ]
18 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-pip/abx_plugin_pip/config.py:
--------------------------------------------------------------------------------
# Must name the package that contains this module (as the sibling plugins do);
# the previous value 'pip' pointed at the unrelated pip distribution and would
# misdirect any relative import added to this file.
__package__ = 'abx_plugin_pip'
2 |
3 | from typing import List, Optional
4 | from pydantic import Field
5 |
6 | from abx_spec_config.base_configset import BaseConfigSet
7 |
8 |
# Config set for the pip plugin: an on/off switch, the binary name, and the
# argument lists passed to it.
class PipDependencyConfigs(BaseConfigSet):
    USE_PIP: bool = True
    PIP_BINARY: str = 'pip'
    PIP_ARGS: Optional[List[str]] = None
    PIP_EXTRA_ARGS: List[str] = []
    PIP_DEFAULT_ARGS: List[str] = []


# Shared module-level instance of the config set.
PIP_CONFIG = PipDependencyConfigs()
17 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-spec-archivebox/abx_spec_archivebox/effects.py:
--------------------------------------------------------------------------------
1 | """
2 | Hookspec for side-effects that ArchiveBox plugins can trigger.
3 |
4 | (e.g. network requests, binary execution, remote API calls, external library calls, etc.)
5 | """
6 |
7 | __package__ = 'abx.archivebox'
8 |
9 | import abx
10 |
11 |
@abx.hookspec
def check_remote_seed_connection(urls, extractor, credentials, created_by):
    """Hook specification only; the body is intentionally empty."""
15 |
16 |
@abx.hookspec
def exec_extractor(url, extractor, credentials, config):
    """Hook specification only; the body is intentionally empty."""
20 |
21 |
--------------------------------------------------------------------------------
/etc/README.md:
--------------------------------------------------------------------------------
1 | # Example etc files for deploying ArchiveBox
2 |
3 | In this folder are some example config files you can use for setting up ArchiveBox on your machine.
4 |
5 | E.g. see `nginx.conf` for an example nginx config to serve your archive with SSL, or `fly.toml` for an example deployment to the Fly.io hosting platform.
6 |
7 | Please contribute your etc files here! Example contributions:
8 |
9 | - supervisord config
10 | - systemd config
11 | - apache webserver config
12 | - other init system, webservers, or scheduler configs
13 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0009_auto_20210216_1038.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 3.1.3 on 2021-02-16 10:38
2 |
3 | from django.db import migrations, models
4 |
5 |
class Migration(migrations.Migration):
    """Alter ``Snapshot.updated`` to a nullable, indexed ``DateTimeField`` with ``auto_now=True``."""

    dependencies = [('core', '0008_auto_20210105_1421')]

    operations = [
        migrations.AlterField(
            model_name='snapshot',
            name='updated',
            field=models.DateTimeField(auto_now=True, db_index=True, null=True),
        ),
    ]
19 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0028_alter_archiveresult_uuid.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.0.6 on 2024-08-18 04:28
2 |
3 | import uuid
4 | from django.db import migrations, models
5 |
6 |
class Migration(migrations.Migration):
    """Alter ``ArchiveResult.uuid`` to a plain ``UUIDField`` defaulting to ``uuid.uuid4``."""

    dependencies = [('core', '0027_update_snapshot_ids')]

    operations = [
        migrations.AlterField(
            model_name='archiveresult',
            name='uuid',
            field=models.UUIDField(default=uuid.uuid4),
        ),
    ]
20 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-archivedotorg/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "abx-plugin-archivedotorg"
3 | version = "2024.10.28"
4 | description = "Add your description here"
5 | readme = "README.md"
6 | requires-python = ">=3.10"
7 | dependencies = [
8 | "abx>=0.1.0",
9 | "abx-spec-config>=0.1.0",
10 | "abx-plugin-curl>=2024.10.24",
11 | ]
12 |
13 | [build-system]
14 | requires = ["hatchling"]
15 | build-backend = "hatchling.build"
16 |
17 | [project.entry-points.abx]
18 | abx_plugin_archivedotorg = "abx_plugin_archivedotorg"
19 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-curl/abx_plugin_curl/binaries.py:
--------------------------------------------------------------------------------
1 | __package__ = 'abx_plugin_curl'
2 |
3 | from typing import List
4 |
5 | from pydantic import InstanceOf
6 | from abx_pkg import BinProvider, BinName, Binary
7 |
8 | from abx_plugin_default_binproviders import apt, brew, env
9 |
10 |
11 | from .config import CURL_CONFIG
12 |
13 |
# Binary definition for curl: the name comes from CURL_CONFIG.CURL_BINARY and
# the supported providers are apt, brew and env.
class CurlBinary(Binary):
    name: BinName = CURL_CONFIG.CURL_BINARY
    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]


# Shared module-level instance imported by the rest of the plugin.
CURL_BINARY = CurlBinary()
19 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-wget/abx_plugin_wget/binaries.py:
--------------------------------------------------------------------------------
1 | __package__ = 'abx_plugin_wget'
2 |
3 | from typing import List
4 |
5 |
6 | from pydantic import InstanceOf
7 | from abx_pkg import BinProvider, BinName, Binary
8 |
9 | from abx_plugin_default_binproviders import apt, brew, env
10 |
11 | from .config import WGET_CONFIG
12 |
13 |
# Binary definition for wget: the name comes from WGET_CONFIG.WGET_BINARY and
# the supported providers are apt, brew and env.
class WgetBinary(Binary):
    name: BinName = WGET_CONFIG.WGET_BINARY
    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]


# Shared module-level instance imported by the rest of the plugin.
WGET_BINARY = WgetBinary()
19 |
--------------------------------------------------------------------------------
/archivebox/core/apps.py:
--------------------------------------------------------------------------------
1 | __package__ = 'archivebox.core'
2 |
3 | from django.apps import AppConfig
4 |
5 | import archivebox
6 |
7 |
class CoreConfig(AppConfig):
    """Django ``AppConfig`` for the ``core`` app."""

    name = 'core'

    def ready(self):
        """Call the plugin ``ready`` hook with Django settings, then register the admin site."""
        from django.conf import settings as django_settings

        archivebox.pm.hook.ready(settings=django_settings)

        # Imported only after the plugin ready hook above has been called.
        from core.admin_site import register_admin_site

        register_admin_site()
18 |
19 |
20 |
21 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0013_auto_20210218_0729.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 3.1.3 on 2021-02-18 07:29
2 |
3 | from django.db import migrations, models
4 |
5 |
class Migration(migrations.Migration):
    """Alter ``Snapshot.title`` to an optional, indexed ``CharField`` with ``max_length=256``."""

    dependencies = [('core', '0012_auto_20210216_1425')]

    operations = [
        migrations.AlterField(
            model_name='snapshot',
            name='title',
            field=models.CharField(blank=True, db_index=True, max_length=256, null=True),
        ),
    ]
19 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0014_auto_20210218_0729.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 3.1.3 on 2021-02-18 07:29
2 |
3 | from django.db import migrations, models
4 |
5 |
class Migration(migrations.Migration):
    """Alter ``Snapshot.title`` to an optional, indexed ``CharField`` with ``max_length=1024``."""

    dependencies = [('core', '0013_auto_20210218_0729')]

    operations = [
        migrations.AlterField(
            model_name='snapshot',
            name='title',
            field=models.CharField(blank=True, db_index=True, max_length=1024, null=True),
        ),
    ]
19 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0015_auto_20210218_0730.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 3.1.3 on 2021-02-18 07:30
2 |
3 | from django.db import migrations, models
4 |
5 |
class Migration(migrations.Migration):
    """Alter ``Snapshot.title`` to an optional, indexed ``CharField`` with ``max_length=512``."""

    dependencies = [('core', '0014_auto_20210218_0729')]

    operations = [
        migrations.AlterField(
            model_name='snapshot',
            name='title',
            field=models.CharField(blank=True, db_index=True, max_length=512, null=True),
        ),
    ]
19 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-htmltotext/abx_plugin_htmltotext/__init__.py:
--------------------------------------------------------------------------------
1 | __package__ = 'abx_plugin_htmltotext'
2 | __label__ = 'HTML-to-Text'
3 |
4 | import abx
5 |
6 |
@abx.hookimpl
def get_CONFIG():
    """Return this plugin's config set under the key ``HTMLTOTEXT_CONFIG``."""
    # Imported at call time rather than at module import time.
    from .config import HTMLTOTEXT_CONFIG

    return dict(HTMLTOTEXT_CONFIG=HTMLTOTEXT_CONFIG)
14 |
15 |
16 | # @abx.hookimpl
17 | # def get_EXTRACTORS():
18 | # from .extractors import FAVICON_EXTRACTOR
19 |
20 | # return {
21 | # 'htmltotext': FAVICON_EXTRACTOR,
22 | # }
23 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-ripgrep-search/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "abx-plugin-ripgrep-search"
3 | version = "2024.10.28"
4 | description = "Add your description here"
5 | readme = "README.md"
6 | requires-python = ">=3.10"
7 | dependencies = [
8 | "abx>=0.1.0",
9 | "abx-spec-config>=0.1.0",
10 | "abx-spec-searchbackend>=0.1.0",
11 | ]
12 |
13 | [build-system]
14 | requires = ["hatchling"]
15 | build-backend = "hatchling.build"
16 |
17 | [project.entry-points.abx]
18 | abx_plugin_ripgrep_search = "abx_plugin_ripgrep_search"
19 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0008_auto_20210105_1421.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 3.1.3 on 2021-01-05 14:21
2 |
3 | from django.db import migrations, models
4 |
5 |
class Migration(migrations.Migration):
    """Alter ``ArchiveResult.cmd_version`` to an optional ``CharField`` (max 32 chars, default ``None``)."""

    dependencies = [('core', '0007_archiveresult')]

    operations = [
        migrations.AlterField(
            model_name='archiveresult',
            name='cmd_version',
            field=models.CharField(blank=True, default=None, max_length=32, null=True),
        ),
    ]
19 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0017_auto_20210219_0211.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 3.1.3 on 2021-02-19 02:11
2 |
3 | from django.db import migrations, models
4 |
5 |
class Migration(migrations.Migration):
    """Alter ``Tag.slug`` to a unique ``SlugField`` (max 100 chars) that may be left blank."""

    dependencies = [('core', '0016_auto_20210218_1204')]

    operations = [
        migrations.AlterField(
            model_name='tag',
            name='slug',
            field=models.SlugField(blank=True, max_length=100, unique=True, verbose_name='slug'),
        ),
    ]
19 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-puppeteer/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "abx-plugin-puppeteer"
3 | version = "2024.10.28"
4 | description = "Add your description here"
5 | readme = "README.md"
6 | requires-python = ">=3.10"
7 | dependencies = [
8 | "abx>=0.1.0",
9 | "abx-spec-config>=0.1.0",
10 | "abx-spec-abx-pkg>=0.1.0",
11 | "abx-pkg>=0.5.4",
12 | ]
13 |
14 | [build-system]
15 | requires = ["hatchling"]
16 | build-backend = "hatchling.build"
17 |
18 | [project.entry-points.abx]
19 | abx_plugin_puppeteer = "abx_plugin_puppeteer"
20 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-singlefile/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "abx-plugin-singlefile"
3 | version = "2024.10.28"
4 | description = "Add your description here"
5 | readme = "README.md"
6 | requires-python = ">=3.10"
7 | dependencies = [
8 | "abx>=0.1.0",
9 | "abx-spec-config>=0.1.0",
10 | "abx-spec-abx-pkg>=0.1.0",
11 | "abx-pkg>=0.5.4",
12 | ]
13 |
14 | [build-system]
15 | requires = ["hatchling"]
16 | build-backend = "hatchling.build"
17 |
18 | [project.entry-points.abx]
19 | abx_plugin_singlefile = "abx_plugin_singlefile"
20 |
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # Summary
4 |
5 |
6 |
7 | # Related issues
8 |
9 |
10 |
11 | # Areas changed
12 |
13 | - [ ] Bugfixes
14 | - [ ] Feature behavior
15 | - [ ] Command line interface
16 | - [ ] Configuration options
17 | - [ ] Internal architecture
18 | - [ ] Snapshot data layout on disk
19 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-git/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "abx-plugin-git"
3 | version = "2024.10.28"
4 | description = "Add your description here"
5 | readme = "README.md"
6 | requires-python = ">=3.10"
7 | dependencies = [
8 | "abx>=0.1.0",
9 | "abx-spec-config>=0.1.0",
10 | "abx-spec-abx-pkg>=0.1.0",
11 | "abx-plugin-default-binproviders>=2024.10.24",
12 | ]
13 |
14 | [build-system]
15 | requires = ["hatchling"]
16 | build-backend = "hatchling.build"
17 |
18 | [project.entry-points.abx]
19 | abx_plugin_git = "abx_plugin_git"
20 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-sqlitefts-search/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "abx-plugin-sqlitefts-search"
3 | version = "2024.10.28"
4 | description = "Add your description here"
5 | readme = "README.md"
6 | requires-python = ">=3.10"
7 | dependencies = [
8 | "abx>=0.1.0",
9 | "abx-spec-config>=0.1.0",
10 | "abx-spec-searchbackend>=0.1.0",
11 | ]
12 |
13 | [build-system]
14 | requires = ["hatchling"]
15 | build-backend = "hatchling.build"
16 |
17 | [project.entry-points.abx]
18 | abx_plugin_sqlitefts_search = "abx_plugin_sqlitefts_search"
19 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0029_alter_archiveresult_id.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.0.6 on 2024-08-18 04:28
2 |
3 | from django.db import migrations, models
4 |
5 |
class Migration(migrations.Migration):
    """Alter ``ArchiveResult.id`` to a ``BigIntegerField`` primary key."""

    dependencies = [('core', '0028_alter_archiveresult_uuid')]

    operations = [
        migrations.AlterField(
            model_name='archiveresult',
            name='id',
            field=models.BigIntegerField(primary_key=True, serialize=False, verbose_name='ID'),
        ),
    ]
19 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0035_remove_archiveresult_uuid_archiveresult_id.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.0.6 on 2024-08-18 05:49
2 |
3 | import uuid
4 | from django.db import migrations, models
5 |
6 |
class Migration(migrations.Migration):
    """Rename ``ArchiveResult.uuid`` to ``ArchiveResult.id``."""

    dependencies = [('core', '0034_alter_archiveresult_old_id_alter_archiveresult_uuid')]

    operations = [
        migrations.RenameField(
            model_name='archiveresult',
            old_name='uuid',
            new_name='id',
        ),
    ]
20 |
--------------------------------------------------------------------------------
/archivebox/core/wsgi.py:
--------------------------------------------------------------------------------
"""
WSGI config for archivebox project.

It exposes the WSGI callable as a module-level variable named ``application``.

For more information on this file, see
https://docs.djangoproject.com/en/2.1/howto/deployment/wsgi/
"""

# NOTE(review): the name is unused below; presumably imported for its
# import-time side effects -- confirm before removing.
import archivebox  # noqa
from archivebox.config.django import setup_django

# Django is configured here, before the WSGI handler is imported below.
setup_django(in_memory_db=False, check_db=True)

from django.core.wsgi import get_wsgi_application

# Module-level WSGI callable.
application = get_wsgi_application()
18 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-archivedotorg/abx_plugin_archivedotorg/__init__.py:
--------------------------------------------------------------------------------
1 | __label__ = 'Archive.org'
2 | __homepage__ = 'https://archive.org'
3 |
4 | import abx
5 |
@abx.hookimpl
def get_CONFIG():
    """Return this plugin's config set under the key ``ARCHIVEDOTORG_CONFIG``."""
    # Imported at call time rather than at module import time.
    from .config import ARCHIVEDOTORG_CONFIG

    return dict(ARCHIVEDOTORG_CONFIG=ARCHIVEDOTORG_CONFIG)
13 |
14 |
15 | # @abx.hookimpl
16 | # def get_EXTRACTORS():
17 | # from .extractors import ARCHIVEDOTORG_EXTRACTOR
18 | #
19 | # return {
20 | # 'archivedotorg': ARCHIVEDOTORG_EXTRACTOR,
21 | # }
22 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-sqlitefts-search/abx_plugin_sqlitefts_search/__init__.py:
--------------------------------------------------------------------------------
1 | __package__ = 'abx_plugin_sqlitefts_search'
2 | __label__ = 'SQLiteFTS Search'
3 |
4 | import abx
5 |
@abx.hookimpl
def get_CONFIG():
    """Return this plugin's config set under the key ``SQLITEFTS_CONFIG``."""
    # Imported at call time rather than at module import time.
    from .config import SQLITEFTS_CONFIG

    return dict(SQLITEFTS_CONFIG=SQLITEFTS_CONFIG)
13 |
14 |
@abx.hookimpl
def get_SEARCHBACKENDS():
    """Return this plugin's search backend under the key ``sqlitefts``."""
    # Imported at call time rather than at module import time.
    from .searchbackend import SQLITEFTS_SEARCH_BACKEND

    return dict(sqlitefts=SQLITEFTS_SEARCH_BACKEND)
22 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0025_alter_archiveresult_uuid.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.0.6 on 2024-05-13 12:08
2 |
3 | import uuid
4 | from django.db import migrations, models
5 |
6 |
class Migration(migrations.Migration):
    """Alter ``ArchiveResult.uuid`` to a unique, non-editable ``UUIDField`` defaulting to ``uuid.uuid4``."""

    dependencies = [('core', '0024_auto_20240513_1143')]

    operations = [
        migrations.AlterField(
            model_name='archiveresult',
            name='uuid',
            field=models.UUIDField(default=uuid.uuid4, editable=False, unique=True),
        ),
    ]
20 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0054_alter_snapshot_timestamp.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.0.6 on 2024-08-20 02:40
2 |
3 | from django.db import migrations, models
4 |
5 |
class Migration(migrations.Migration):
    """Alter ``Snapshot.timestamp`` to a unique, indexed, non-editable ``CharField`` (max 32 chars)."""

    dependencies = [('core', '0053_remove_snapshottag_snapshot_old')]

    operations = [
        migrations.AlterField(
            model_name='snapshot',
            name='timestamp',
            field=models.CharField(db_index=True, editable=False, max_length=32, unique=True),
        ),
    ]
19 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-default-binproviders/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "abx-plugin-default-binproviders"
3 | version = "2024.10.24"
4 | description = "Default BinProviders for ABX (apt, brew, env)"
5 | readme = "README.md"
6 | requires-python = ">=3.10"
7 | dependencies = [
8 | "abx>=0.1.0",
9 | "abx-pkg>=0.5.4",
10 | "abx-spec-abx-pkg>=0.1.0",
11 | ]
12 |
13 | [build-system]
14 | requires = ["hatchling"]
15 | build-backend = "hatchling.build"
16 |
17 | [project.entry-points.abx]
18 | abx_plugin_default_binproviders = "abx_plugin_default_binproviders"
19 |
--------------------------------------------------------------------------------
/bin/release_pip.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | ### Bash Environment Setup
4 | # http://redsymbol.net/articles/unofficial-bash-strict-mode/
5 | # https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html
6 | # set -o xtrace
7 | set -o errexit
8 | set -o errtrace
9 | set -o nounset
10 | set -o pipefail
11 | IFS=$'\n'
12 |
13 | REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )"
14 | cd "$REPO_DIR"
15 | source "$REPO_DIR/.venv/bin/activate"
16 |
17 | echo "[^] Publishing to PyPI..."
18 | rm -Rf dist
19 | uv build
20 | uv publish
21 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-playwright/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "abx-plugin-playwright"
3 | version = "2024.10.28"
4 | description = "Add your description here"
5 | readme = "README.md"
6 | requires-python = ">=3.10"
7 | dependencies = [
8 | "abx>=0.1.0",
9 | "pydantic>=2.4.2",
10 | "abx-pkg>=0.5.4",
11 | "abx-spec-abx-pkg>=0.1.0",
12 | "abx-spec-config>=0.1.0",
13 | ]
14 |
15 | [build-system]
16 | requires = ["hatchling"]
17 | build-backend = "hatchling.build"
18 |
19 | [project.entry-points.abx]
20 | abx_plugin_playwright = "abx_plugin_playwright"
21 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-npm/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "abx-plugin-npm"
3 | version = "2024.10.24"
4 | description = "NPM binary provider plugin for ABX"
5 | readme = "README.md"
6 | requires-python = ">=3.10"
7 | dependencies = [
8 | "abx>=0.1.0",
9 | "abx-pkg>=0.5.4",
10 | "abx-spec-abx-pkg>=0.1.0",
11 | "abx-spec-config>=0.1.0",
12 | "abx-plugin-default-binproviders>=2024.10.24",
13 | ]
14 |
15 | [build-system]
16 | requires = ["hatchling"]
17 | build-backend = "hatchling.build"
18 |
19 | [project.entry-points.abx]
20 | abx_plugin_npm = "abx_plugin_npm"
21 |
--------------------------------------------------------------------------------
/archivebox/api/urls.py:
--------------------------------------------------------------------------------
1 | __package__ = 'archivebox.api'
2 |
3 | from django.urls import path
4 | from django.views.generic.base import RedirectView
5 |
6 | from .v1_api import urls as v1_api_urls
7 |
urlpatterns = [
    # The empty path redirects to /api/v1.
    path("",                 RedirectView.as_view(url='/api/v1')),

    # "v1/" is handled by the v1 API urls; "v1" without the trailing slash
    # redirects to /api/v1/docs.
    path("v1/",              v1_api_urls),
    path("v1",               RedirectView.as_view(url='/api/v1/docs')),

    # ... v2 can be added here ...
    # path("v2/",              v2_api_urls),
    # path("v2",               RedirectView.as_view(url='/api/v2/docs')),
]
18 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0004_auto_20200713_1552.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 3.0.7 on 2020-07-13 15:52
2 |
3 | from django.db import migrations, models
4 |
5 |
class Migration(migrations.Migration):
    """Alter ``Snapshot.timestamp`` to a unique, indexed ``CharField`` (max 32 chars).

    ``default=None`` is supplied for the operation only; ``preserve_default=False``
    keeps it out of the resulting field definition.
    """

    dependencies = [('core', '0003_auto_20200630_1034')]

    operations = [
        migrations.AlterField(
            model_name='snapshot',
            name='timestamp',
            field=models.CharField(db_index=True, default=None, max_length=32, unique=True),
            preserve_default=False,
        ),
    ]
20 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-puppeteer/abx_plugin_puppeteer/config.py:
--------------------------------------------------------------------------------
1 | __package__ = 'abx_plugin_puppeteer'
2 |
3 |
4 | from abx_spec_config.base_configset import BaseConfigSet
5 |
6 |
7 | ###################### Config ##########################
8 |
9 |
# Config set for the puppeteer plugin; only the binary name is active, the
# argument options below are still commented out.
class PuppeteerConfig(BaseConfigSet):
    PUPPETEER_BINARY: str = 'puppeteer'
    # PUPPETEER_ARGS: Optional[List[str]] = Field(default=None)
    # PUPPETEER_EXTRA_ARGS: List[str] = []
    # PUPPETEER_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}']


# Shared module-level instance of the config set.
PUPPETEER_CONFIG = PuppeteerConfig()
19 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-spec-config/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "abx-spec-config"
3 | version = "0.1.0"
4 | description = "The common shared interfaces for the ABX ArchiveBox plugin ecosystem."
5 | readme = "README.md"
6 | requires-python = ">=3.10"
7 | dependencies = [
8 | "abx>=0.1.0",
9 | "python-benedict>=0.34.0",
10 | "pydantic>=2.9.2",
11 | "pydantic-settings>=2.6.0",
12 | "rich>=13.9.3",
13 | ]
14 |
15 | [build-system]
16 | requires = ["hatchling"]
17 | build-backend = "hatchling.build"
18 |
19 | [project.entry-points.abx]
20 | abx_spec_config = "abx_spec_config"
21 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-sonic-search/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "abx-plugin-sonic-search"
3 | version = "2024.10.28"
4 | description = "Add your description here"
5 | readme = "README.md"
6 | requires-python = ">=3.10"
7 | dependencies = [
8 | "abx>=0.1.0",
9 | "abx-spec-config>=0.1.0",
10 | "abx-spec-abx-pkg>=0.1.0",
11 | "abx-spec-searchbackend>=0.1.0",
12 | "abx-pkg>=0.5.4",
13 | ]
14 |
15 | [build-system]
16 | requires = ["hatchling"]
17 | build-backend = "hatchling.build"
18 |
19 | [project.entry-points.abx]
20 | abx_plugin_sonic_search = "abx_plugin_sonic_search"
21 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0074_alter_snapshot_downloaded_at.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.1 on 2024-09-05 01:24
2 |
3 | from django.db import migrations, models
4 |
5 |
class Migration(migrations.Migration):
    """Alter ``Snapshot.downloaded_at`` to an optional, indexed, non-editable ``DateTimeField`` (default ``None``)."""

    dependencies = [('core', '0073_rename_created_archiveresult_created_at_and_more')]

    operations = [
        migrations.AlterField(
            model_name='snapshot',
            name='downloaded_at',
            field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True),
        ),
    ]
19 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-favicon/abx_plugin_favicon/extractors.py:
--------------------------------------------------------------------------------
1 | __package__ = 'abx_plugin_favicon'
2 |
3 | from pathlib import Path
4 |
5 | from abx_pkg import BinName
6 |
7 | from abx_spec_extractor import BaseExtractor, ExtractorName
8 |
9 | from abx_plugin_curl.binaries import CURL_BINARY
10 |
11 |
# Extractor definition named 'favicon' that uses the curl binary's name.
class FaviconExtractor(BaseExtractor):
    name: ExtractorName = 'favicon'
    binary: BinName = CURL_BINARY.name

    def get_output_path(self, snapshot) -> Path | None:
        """Return ``<snapshot.link_dir>/favicon.png`` as a ``Path``."""
        link_dir = Path(snapshot.link_dir)
        return link_dir / 'favicon.png'


# Shared module-level instance of the extractor.
FAVICON_EXTRACTOR = FaviconExtractor()
20 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-pip/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "abx-plugin-pip"
3 | version = "2024.10.24"
4 | description = "Add your description here"
5 | readme = "README.md"
6 | requires-python = ">=3.10"
7 | dependencies = [
8 | "abx>=0.1.0",
9 | "abx-pkg>=0.5.4",
10 | "abx-spec-config>=0.1.0",
11 | "abx-spec-abx-pkg>=0.1.0",
12 | "abx-plugin-default-binproviders>=2024.10.24",
13 | "django>=5.0.0",
14 | ]
15 |
16 |
17 | [build-system]
18 | requires = ["hatchling"]
19 | build-backend = "hatchling.build"
20 |
21 | [project.entry-points.abx]
22 | abx_plugin_pip = "abx_plugin_pip"
23 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-mercury/abx_plugin_mercury/extractors.py:
--------------------------------------------------------------------------------
1 | __package__ = 'abx_plugin_mercury'
2 |
3 | from pathlib import Path
4 |
5 | from abx_pkg import BinName
6 | from abx_spec_extractor import BaseExtractor, ExtractorName
7 |
8 | from .binaries import MERCURY_BINARY
9 |
10 |
11 |
# Extractor definition named 'mercury' that uses MERCURY_BINARY's name.
class MercuryExtractor(BaseExtractor):
    name: ExtractorName = 'mercury'
    binary: BinName = MERCURY_BINARY.name

    def get_output_path(self, snapshot) -> Path | None:
        """Return ``<snapshot.link_dir>/mercury/content.html`` as a ``Path``.

        ``snapshot.link_dir`` is wrapped in ``Path`` first (as the favicon
        extractor does) so the ``/`` joins work whether it is a ``str`` or
        already a ``Path``; ``str / str`` would raise ``TypeError``.
        """
        return Path(snapshot.link_dir) / 'mercury' / 'content.html'


# Shared module-level instance of the extractor.
MERCURY_EXTRACTOR = MercuryExtractor()
21 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-git/abx_plugin_git/__init__.py:
--------------------------------------------------------------------------------
1 | __package__ = 'abx_plugin_git'
2 | __label__ = 'Git'
3 |
4 | import abx
5 |
6 |
@abx.hookimpl
def get_CONFIG():
    """Return this plugin's config set, keyed by 'GIT_CONFIG'."""
    # The import is deferred until the hook is invoked.
    from .config import GIT_CONFIG as git_config

    return dict(GIT_CONFIG=git_config)
14 |
@abx.hookimpl
def get_BINARIES():
    """Return the binaries provided by this plugin, keyed by 'git'."""
    # The import is deferred until the hook is invoked.
    from .binaries import GIT_BINARY as git_binary

    binaries = {'git': git_binary}
    return binaries
22 |
@abx.hookimpl
def get_EXTRACTORS():
    """Return the extractors provided by this plugin, keyed by 'git'."""
    # The import is deferred until the hook is invoked.
    from .extractors import GIT_EXTRACTOR as git_extractor

    extractors = {'git': git_extractor}
    return extractors
30 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0045_alter_snapshot_old_id.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.0.6 on 2024-08-20 01:54
2 |
3 | import uuid
4 | from django.db import migrations, models
5 |
6 |
class Migration(migrations.Migration):
    # Alters the definition of the `old_id` field on the `snapshot` model.

    dependencies = [
        ('core', '0044_alter_archiveresult_snapshot_alter_tag_uuid_and_more'),
    ]

    operations = [
        # old_id becomes a non-editable, unique UUID primary key that
        # defaults to uuid.uuid4.
        migrations.AlterField(
            model_name='snapshot',
            name='old_id',
            field=models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False, unique=True),
        ),
    ]
20 |
--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | ._*
3 | *.pyc
4 | __pycache__/
5 | .mypy_cache/
6 | .pytest_cache/
7 | .github/
8 | .pdm-build/
9 | .pdm-python
10 | .eggs/
11 | .git/
12 | .vscode/
13 | !.git/HEAD
14 | !.git/refs/heads/*
15 |
16 | venv/
17 | .venv/
18 | .venv-old/
19 | .docker_venv/
20 | .docker-venv/
21 | node_modules/
22 | chrome/
23 | chromeprofile/
24 | chrome_profile/
25 |
26 | pdm.dev.lock
27 | pdm.lock
28 |
29 | docs/
30 | build/
31 | dist/
32 | brew_dist/
33 | deb_dist/
34 | pip_dist/
35 | assets/
36 | docker/
37 | website/
38 | typings/
39 |
40 | tmp/
41 | data/
42 | data*/
43 | output/
44 | index.sqlite3
45 | index.sqlite3-wal
46 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-singlefile/abx_plugin_singlefile/extractors.py:
--------------------------------------------------------------------------------
1 | __package__ = 'abx_plugin_singlefile'
2 |
3 |
4 | from pathlib import Path
5 |
6 | from abx_pkg import BinName
7 |
8 | from abx_spec_extractor import BaseExtractor, ExtractorName
9 |
10 | from .binaries import SINGLEFILE_BINARY
11 |
12 |
class SinglefileExtractor(BaseExtractor):
    # Registered under the extractor name 'singlefile'; uses the singlefile binary's name.
    name: ExtractorName = 'singlefile'
    binary: BinName = SINGLEFILE_BINARY.name

    def get_output_path(self, snapshot) -> Path:
        """Return the path of singlefile.html inside the snapshot's link_dir."""
        return Path(snapshot.link_dir, 'singlefile.html')


# Module-level singleton instance of the extractor.
SINGLEFILE_EXTRACTOR = SinglefileExtractor()
22 |
--------------------------------------------------------------------------------
/archivebox/api/migrations/0005_remove_apitoken_uuid_remove_outboundwebhook_uuid_and_more.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.1 on 2024-08-20 22:40
2 |
3 | import uuid
4 | from django.db import migrations, models
5 |
6 |
class Migration(migrations.Migration):
    # Removes the `uuid` field from both the `apitoken` and the
    # `outboundwebhook` models.

    dependencies = [
        ('api', '0004_alter_apitoken_id_alter_apitoken_uuid'),
    ]

    operations = [
        migrations.RemoveField(
            model_name='apitoken',
            name='uuid',
        ),
        # The field dropped here is `uuid` (as this migration's file name
        # states), not `id`: removing `id` would drop the model's key column.
        migrations.RemoveField(
            model_name='outboundwebhook',
            name='uuid',
        ),
    ]
23 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0062_alter_snapshottag_old_tag.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.0.6 on 2024-08-20 03:44
2 |
3 | import django.db.models.deletion
4 | from django.db import migrations, models
5 |
6 |
class Migration(migrations.Migration):
    # Alters the `old_tag` foreign key on the `snapshottag` model.

    dependencies = [
        ('core', '0061_rename_tag_snapshottag_old_tag_and_more'),
    ]

    operations = [
        # old_tag points at core.tag, is stored in column `old_tag_id`,
        # and cascades on delete.
        migrations.AlterField(
            model_name='snapshottag',
            name='old_tag',
            field=models.ForeignKey(db_column='old_tag_id', on_delete=django.db.models.deletion.CASCADE, to='core.tag'),
        ),
    ]
20 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0067_alter_snapshottag_tag.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.0.6 on 2024-08-20 03:53
2 |
3 | import django.db.models.deletion
4 | from django.db import migrations, models
5 |
6 |
class Migration(migrations.Migration):
    # Alters the `tag` foreign key on the `snapshottag` model.

    dependencies = [
        ('core', '0066_alter_snapshottag_tag_alter_tag_id_alter_tag_old_id'),
    ]

    operations = [
        # tag points at core.tag, is stored in column `tag_id`,
        # and cascades on delete.
        migrations.AlterField(
            model_name='snapshottag',
            name='tag',
            field=models.ForeignKey(db_column='tag_id', on_delete=django.db.models.deletion.CASCADE, to='core.tag'),
        ),
    ]
20 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-readability/abx_plugin_readability/extractors.py:
--------------------------------------------------------------------------------
1 | # __package__ = 'abx_plugin_readability'
2 |
3 | from pathlib import Path
4 |
5 | from abx_pkg import BinName
6 |
7 | from abx_spec_extractor import BaseExtractor, ExtractorName
8 | from .binaries import READABILITY_BINARY
9 |
10 |
class ReadabilityExtractor(BaseExtractor):
    # Registered under the extractor name 'readability'; uses the readability binary's name.
    name: ExtractorName = 'readability'
    binary: BinName = READABILITY_BINARY.name

    def get_output_path(self, snapshot) -> Path:
        """Return the path of readability/content.html inside the snapshot's link_dir."""
        link_dir = Path(snapshot.link_dir)
        return link_dir.joinpath('readability', 'content.html')


# Module-level singleton instance of the extractor.
READABILITY_EXTRACTOR = ReadabilityExtractor()
20 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0058_alter_tag_old_id.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.0.6 on 2024-08-20 03:30
2 |
3 | import random
4 | from django.db import migrations, models
5 |
6 |
def rand_int_id():
    """Return a random non-negative integer built from 32 random bits."""
    id_bits = 32
    return random.getrandbits(id_bits)
9 |
class Migration(migrations.Migration):
    # Alters the definition of the `old_id` field on the `tag` model.

    dependencies = [
        ('core', '0057_rename_id_tag_old_id'),
    ]

    operations = [
        # old_id is a BigIntegerField primary key whose default is produced
        # by rand_int_id() (defined above in this module).
        migrations.AlterField(
            model_name='tag',
            name='old_id',
            field=models.BigIntegerField(default=rand_int_id, primary_key=True, serialize=False, verbose_name='Old ID'),
        ),
    ]
23 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0061_rename_tag_snapshottag_old_tag_and_more.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.0.6 on 2024-08-20 03:43
2 |
3 | from django.db import migrations
4 |
5 |
class Migration(migrations.Migration):
    # Renames snapshottag.tag to old_tag and updates the unique constraint
    # so that it refers to the renamed field.

    dependencies = [
        ('core', '0060_alter_tag_id'),
    ]

    operations = [
        migrations.RenameField(
            model_name='snapshottag',
            old_name='tag',
            new_name='old_tag',
        ),
        # Uniqueness is enforced on the (snapshot, old_tag) pair.
        migrations.AlterUniqueTogether(
            name='snapshottag',
            unique_together={('snapshot', 'old_tag')},
        ),
    ]
23 |
--------------------------------------------------------------------------------
/archivebox/workers/management/commands/orchestrator.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | from django.core.management.base import BaseCommand
4 |
5 | from workers.orchestrator import ArchivingOrchestrator
6 |
7 |
class Command(BaseCommand):
    help = 'Run the archivebox orchestrator'

    # def add_arguments(self, parser):
    #     parser.add_argument('subcommand', type=str, help='The subcommand you want to run')
    #     parser.add_argument('command_args', nargs='*', help='Arguments to pass to the subcommand')

    def handle(self, *args, **kwargs):
        """Create an ArchivingOrchestrator and start it."""
        ArchivingOrchestrator().start()
19 |
--------------------------------------------------------------------------------
/archivebox/__main__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """This is the entrypoint for python -m archivebox ..."""
3 | __package__ = 'archivebox'
4 |
5 | import archivebox # noqa # make sure monkey patches are applied before anything else
6 | import sys
7 |
8 | from .cli import main
9 |
10 | ASCII_LOGO_MINI = r"""
11 | _ _ _ ____
12 | / \ _ __ ___| |__ (_)_ _____| __ ) _____ __
13 | / _ \ | '__/ __| '_ \| \ \ / / _ \ _ \ / _ \ \/ /
14 | / ___ \| | | (__| | | | |\ V / __/ |_) | (_) > <
15 | /_/ \_\_| \___|_| |_|_| \_/ \___|____/ \___/_/\_\
16 | """
17 |
18 | main(args=sys.argv[1:], stdin=sys.stdin)
19 |
--------------------------------------------------------------------------------
/.github/.readthedocs.yaml:
--------------------------------------------------------------------------------
1 | # Read the Docs config for https://docs.archivebox.io
2 | # https://docs.readthedocs.io/en/stable/config-file/v2.html
3 |
4 | version: 2
5 |
6 | submodules:
7 | include: all
8 | recursive: true
9 |
10 | build:
11 | os: ubuntu-22.04
12 | tools:
13 | python: "3.12"
14 | #nodejs: "20" # not needed unless we need the full archivebox to run while building docs for some reason
15 |
16 | sphinx:
17 | configuration: docs/conf.py
18 |
19 | formats:
20 | - pdf
21 | - epub
22 |
23 | # https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
24 | python:
25 | install:
26 | - requirements: docs/requirements.txt
27 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-default-binproviders/abx_plugin_default_binproviders.py:
--------------------------------------------------------------------------------
1 |
2 | import abx
3 |
4 | from typing import Dict
5 |
6 | from abx_pkg import (
7 | AptProvider,
8 | BrewProvider,
9 | EnvProvider,
10 | BinProvider,
11 | )
# One instance per provider type; each is bound both to a short alias
# (apt / brew / env) and to an upper-case *_BINPROVIDER constant.
apt = APT_BINPROVIDER = AptProvider()
brew = BREW_BINPROVIDER = BrewProvider()
env = ENV_BINPROVIDER = EnvProvider()
# setup() is called on every provider at import time of this module.
apt.setup()
brew.setup()
env.setup()
18 |
19 |
@abx.hookimpl(tryfirst=True)
def get_BINPROVIDERS() -> Dict[str, BinProvider]:
    """Return the apt, brew and env binproviders keyed by their short names."""
    binproviders: Dict[str, BinProvider] = dict(
        apt=APT_BINPROVIDER,
        brew=BREW_BINPROVIDER,
        env=ENV_BINPROVIDER,
    )
    return binproviders
27 |
--------------------------------------------------------------------------------
/archivebox/core/admin.py:
--------------------------------------------------------------------------------
1 | __package__ = 'archivebox.core'
2 |
3 | from django.contrib.auth import get_user_model
4 |
5 |
6 | from core.models import Snapshot, ArchiveResult, Tag
7 | from core.admin_tags import TagAdmin
8 | from core.admin_snapshots import SnapshotAdmin
9 | from core.admin_archiveresults import ArchiveResultAdmin
10 | from core.admin_users import UserAdmin
11 |
12 | import abx
13 |
14 |
@abx.hookimpl
def register_admin(admin_site):
    """Register the user, ArchiveResult, Snapshot and Tag admins on admin_site."""
    # (model, admin class) pairs, registered in this order.
    registrations = (
        (get_user_model(), UserAdmin),
        (ArchiveResult, ArchiveResultAdmin),
        (Snapshot, SnapshotAdmin),
        (Tag, TagAdmin),
    )
    for model, model_admin in registrations:
        admin_site.register(model, model_admin)
21 |
--------------------------------------------------------------------------------
/archivebox/core/management/commands/archivebox.py:
--------------------------------------------------------------------------------
1 | __package__ = 'archivebox'
2 |
3 | from django.core.management.base import BaseCommand
4 |
5 |
6 | from .cli import run_subcommand
7 |
8 |
class Command(BaseCommand):
    help = 'Run an ArchiveBox CLI subcommand (e.g. add, remove, list, etc)'

    def add_arguments(self, parser):
        """Declare the subcommand name and its pass-through arguments."""
        arguments = (
            ('subcommand', dict(type=str, help='The subcommand you want to run')),
            ('command_args', dict(nargs='*', help='Arguments to pass to the subcommand')),
        )
        for arg_name, arg_options in arguments:
            parser.add_argument(arg_name, **arg_options)

    def handle(self, *args, **kwargs):
        """Run the requested subcommand with the collected arguments."""
        subcommand = kwargs['subcommand']
        subcommand_args = kwargs['command_args']
        run_subcommand(subcommand, args=subcommand_args)
19 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | ._*
3 | *.pyc
4 | __pycache__/
5 | .mypy_cache/
6 | .eggs/
7 | tests/out/
8 |
9 | # Python and Node dependencies
10 | venv/
11 | .venv/
12 | .docker-venv/
13 | node_modules/
14 | typings/
15 |
16 | # Ignore dev lockfiles (should always be built fresh)
17 | pdm.dev.lock
18 | requirements-dev.txt
19 |
20 | # Packaging artifacts
21 | requirements.txt
22 | .pdm-python
23 | .pdm-build
24 | archivebox.egg-info
25 | archivebox-*.tar.gz
26 | build/
27 | dist/
28 |
29 | # Data folders
30 | lib/
31 | tmp/
32 | data/
33 | data*/
34 | output/
35 | index.sqlite3
36 | queue.sqlite3
37 | *.sqlite*
38 | data.*
39 | .archivebox_id
40 |
41 | # vim
42 | *.sw?
43 | .vscode
44 |
--------------------------------------------------------------------------------
/bin/lint.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | ### Bash Environment Setup
4 | # http://redsymbol.net/articles/unofficial-bash-strict-mode/
5 | # https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html
6 | # set -o xtrace
set -o errexit
set -o errtrace
set -o nounset
set -o pipefail
IFS=$'\n'

# Repository root: the parent of the directory that contains this script.
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )"

source "$DIR/.venv/bin/activate"

echo "[*] Running flake8..."
cd "$DIR/archivebox"
# Capture flake8's exit status explicitly. In `flake8 . && echo ...` a flake8
# failure does not trigger errexit (it is not the last command of the && list),
# so the script previously exited 0 even when lint errors were reported.
flake8_status=0
flake8 . || flake8_status=$?
if [ "$flake8_status" -eq 0 ]; then
    echo "√ No errors found."
fi

echo

echo "[*] Running mypy..."
echo "(skipping for now, run 'mypy archivebox' to run it manually)"

# Propagate the lint result to the caller (e.g. CI).
exit "$flake8_status"
25 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0050_alter_snapshottag_snapshot_old.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.0.6 on 2024-08-20 02:30
2 |
3 | import django.db.models.deletion
4 | from django.db import migrations, models
5 |
6 |
class Migration(migrations.Migration):
    # Alters the `snapshot_old` foreign key on the `snapshottag` model.

    dependencies = [
        ('core', '0049_rename_snapshot_snapshottag_snapshot_old_and_more'),
    ]

    operations = [
        # snapshot_old references core.snapshot through its `old_id` field,
        # is stored in column `snapshot_old_id`, and cascades on delete.
        migrations.AlterField(
            model_name='snapshottag',
            name='snapshot_old',
            field=models.ForeignKey(db_column='snapshot_old_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot', to_field='old_id'),
        ),
    ]
20 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-ytdlp/abx_plugin_ytdlp/__init__.py:
--------------------------------------------------------------------------------
1 | __package__ = 'abx_plugin_ytdlp'
2 | __label__ = 'YT-DLP'
3 | __homepage__ = 'https://github.com/yt-dlp/yt-dlp'
4 |
5 | import abx
6 |
@abx.hookimpl
def get_CONFIG():
    """Return this plugin's config set, keyed by 'YTDLP_CONFIG'."""
    # The import is deferred until the hook is invoked.
    from .config import YTDLP_CONFIG as ytdlp_config

    return dict(YTDLP_CONFIG=ytdlp_config)
14 |
@abx.hookimpl
def get_BINARIES():
    """Return the binaries provided by this plugin, keyed by 'ytdlp' and 'ffmpeg'."""
    # The import is deferred until the hook is invoked.
    from . import binaries as plugin_binaries

    return dict(
        ytdlp=plugin_binaries.YTDLP_BINARY,
        ffmpeg=plugin_binaries.FFMPEG_BINARY,
    )
23 |
@abx.hookimpl
def ready():
    """Call validate() on the plugin's config set; the result is discarded."""
    from .config import YTDLP_CONFIG as ytdlp_config

    ytdlp_config.validate()
28 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0049_rename_snapshot_snapshottag_snapshot_old_and_more.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.0.6 on 2024-08-20 02:26
2 |
3 | from django.db import migrations
4 |
5 |
class Migration(migrations.Migration):
    # Renames snapshottag.snapshot to snapshot_old and updates the unique
    # constraint so that it refers to the renamed field.

    dependencies = [
        ('core', '0048_alter_archiveresult_snapshot_and_more'),
    ]

    operations = [
        migrations.RenameField(
            model_name='snapshottag',
            old_name='snapshot',
            new_name='snapshot_old',
        ),
        # Uniqueness is enforced on the (snapshot_old, tag) pair.
        migrations.AlterUniqueTogether(
            name='snapshottag',
            unique_together={('snapshot_old', 'tag')},
        ),
    ]
23 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-puppeteer/abx_plugin_puppeteer/binaries.py:
--------------------------------------------------------------------------------
1 | __package__ = 'abx_plugin_puppeteer'
2 |
3 | from typing import List
4 |
5 | from pydantic import InstanceOf
6 | from abx_pkg import BinProvider, BinName, Binary
7 |
8 |
9 | from abx_plugin_default_binproviders import env
10 |
11 | from abx_plugin_npm.binproviders import LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER
12 |
13 |
14 | ###################### Config ##########################
15 |
16 |
class PuppeteerBinary(Binary):
    # Binary definition for the "puppeteer" executable.
    name: BinName = "puppeteer"

    # Providers that may supply this binary: the two npm providers, then env.
    # NOTE(review): presumably the list order is the lookup order -- confirm in abx_pkg.
    binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]


# Module-level singleton instance of the binary definition.
PUPPETEER_BINARY = PuppeteerBinary()
24 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0018_auto_20210327_0952.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 3.1.3 on 2021-03-27 09:52
2 |
3 | from django.db import migrations, models
4 |
5 |
class Migration(migrations.Migration):
    # Alters the `name` and `slug` fields of the `tag` model.

    dependencies = [
        ('core', '0017_auto_20210219_0211'),
    ]

    operations = [
        # name: unique CharField limited to 100 characters.
        migrations.AlterField(
            model_name='tag',
            name='name',
            field=models.CharField(max_length=100, unique=True),
        ),
        # slug: unique SlugField limited to 100 characters; may be left blank.
        migrations.AlterField(
            model_name='tag',
            name='slug',
            field=models.SlugField(blank=True, max_length=100, unique=True),
        ),
    ]
24 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0043_alter_archiveresult_snapshot_alter_snapshot_id_and_more.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.0.6 on 2024-08-18 06:52
2 |
3 | import django.db.models.deletion
4 | import uuid
5 | from django.db import migrations, models
6 |
7 |
class Migration(migrations.Migration):
    # Alters the `snapshot` foreign key on the `archiveresult` model.
    # NOTE(review): the module imports `uuid`, but nothing in this class uses it.

    dependencies = [
        ('core', '0042_remove_archiveresult_snapshot_old'),
    ]

    operations = [
        # snapshot references core.snapshot through its `id` field, is stored
        # in column `snapshot_id`, and cascades on delete.
        migrations.AlterField(
            model_name='archiveresult',
            name='snapshot',
            field=models.ForeignKey(db_column='snapshot_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot', to_field='id'),
        ),
    ]
21 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0032_alter_archiveresult_id.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.0.6 on 2024-08-18 05:20
2 |
3 | import core.models
4 | import random
5 | from django.db import migrations, models
6 |
7 |
def rand_int_id():
    """Return a random non-negative integer built from 32 random bits."""
    id_bits = 32
    return random.getrandbits(id_bits)
10 |
class Migration(migrations.Migration):
    # Alters the definition of the `id` field on the `archiveresult` model.
    # NOTE(review): the module imports `core.models`, but nothing in this class uses it.

    dependencies = [
        ('core', '0031_alter_archiveresult_id_alter_archiveresult_uuid_and_more'),
    ]

    operations = [
        # id is a BigIntegerField primary key whose default is produced by
        # rand_int_id() (defined above in this module).
        migrations.AlterField(
            model_name='archiveresult',
            name='id',
            field=models.BigIntegerField(default=rand_int_id, primary_key=True, serialize=False, verbose_name='ID'),
        ),
    ]
24 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0072_rename_added_snapshot_bookmarked_at_and_more.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.1 on 2024-09-05 00:05
2 |
3 | from django.db import migrations
4 |
5 |
class Migration(migrations.Migration):
    # Renames two timestamp fields on the `snapshot` model.

    dependencies = [
        ('core', '0071_remove_archiveresult_old_id_remove_snapshot_old_id_and_more'),
    ]

    operations = [
        # added -> bookmarked_at
        migrations.RenameField(
            model_name='snapshot',
            old_name='added',
            new_name='bookmarked_at',
        ),
        # updated -> downloaded_at
        migrations.RenameField(
            model_name='snapshot',
            old_name='updated',
            new_name='downloaded_at',
        ),
    ]
24 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-readability/abx_plugin_readability/config.py:
--------------------------------------------------------------------------------
1 | from pydantic import Field
2 |
3 | from abx_spec_config.base_configset import BaseConfigSet
4 |
5 | from archivebox.config.common import ARCHIVING_CONFIG
6 |
7 |
class ReadabilityConfig(BaseConfigSet):
    # Config options for the readability plugin.

    # Defaults to True; also addressable through the alias 'USE_READABILITY'.
    SAVE_READABILITY: bool = Field(default=True, alias='USE_READABILITY')

    # The default is a lambda that reads ARCHIVING_CONFIG.TIMEOUT when called.
    # NOTE(review): this presumably relies on BaseConfigSet resolving callable
    # defaults; plain pydantic would store the lambda itself -- confirm.
    READABILITY_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)

    # Name of the readability executable.
    READABILITY_BINARY: str = Field(default='readability-extractor')
    # READABILITY_EXTRA_ARGS: List[str] = []  # readability-extractor doesn't take any extra args


# Module-level singleton instance of the config set.
READABILITY_CONFIG = ReadabilityConfig()
18 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-favicon/abx_plugin_favicon/__init__.py:
--------------------------------------------------------------------------------
1 | __label__ = 'Favicon'
2 | __version__ = '2024.10.24'
3 | __author__ = 'ArchiveBox'
4 | __homepage__ = 'https://github.com/ArchiveBox/archivebox'
5 | __dependencies__ = [
6 | 'abx>=0.1.0',
7 | 'abx-spec-config>=0.1.0',
8 | 'abx-plugin-curl-extractor>=2024.10.24',
9 | ]
10 |
11 | import abx
12 |
13 |
@abx.hookimpl
def get_CONFIG():
    """Return this plugin's config set, keyed by 'FAVICON_CONFIG'."""
    # The import is deferred until the hook is invoked.
    from .config import FAVICON_CONFIG as favicon_config

    return dict(FAVICON_CONFIG=favicon_config)
21 |
22 |
@abx.hookimpl
def get_EXTRACTORS():
    """Return the extractors provided by this plugin, keyed by 'favicon'."""
    # The import is deferred until the hook is invoked.
    from .extractors import FAVICON_EXTRACTOR as favicon_extractor

    extractors = {'favicon': favicon_extractor}
    return extractors
30 |
--------------------------------------------------------------------------------
/bin/build_pip.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | ### Bash Environment Setup
4 | # http://redsymbol.net/articles/unofficial-bash-strict-mode/
5 | # https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html
6 | # set -o xtrace
7 | set -o errexit
8 | set -o errtrace
9 | set -o nounset
10 | set -o pipefail
11 | IFS=$'\n'
12 |
13 | REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )"
14 | cd "$REPO_DIR"
15 |
16 | # Generate pdm.lock, requirements.txt, and package-lock.json
17 | bash ./bin/lock_pkgs.sh
18 | source .venv/bin/activate
19 |
20 | echo "[+] Building sdist, bdist_wheel, and egg_info"
21 | rm -Rf build dist
22 | uv build
23 |
24 | echo
25 | echo "[√] Finished. Built package in dist/"
26 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-playwright/abx_plugin_playwright/__init__.py:
--------------------------------------------------------------------------------
1 | __label__ = 'Playwright'
2 | __homepage__ = 'https://github.com/microsoft/playwright-python'
3 |
4 | import abx
5 |
6 |
@abx.hookimpl
def get_CONFIG():
    """Return this plugin's config set, keyed by 'PLAYWRIGHT_CONFIG'."""
    # The import is deferred until the hook is invoked.
    from .config import PLAYWRIGHT_CONFIG as playwright_config

    return dict(PLAYWRIGHT_CONFIG=playwright_config)
13 |
@abx.hookimpl
def get_BINARIES():
    """Return the binaries provided by this plugin, keyed by 'playwright'."""
    # The import is deferred until the hook is invoked.
    from .binaries import PLAYWRIGHT_BINARY as playwright_binary

    binaries = {'playwright': playwright_binary}
    return binaries
21 |
@abx.hookimpl
def get_BINPROVIDERS():
    """Return the binproviders provided by this plugin, keyed by 'playwright'."""
    # The import is deferred until the hook is invoked.
    from .binproviders import PLAYWRIGHT_BINPROVIDER as playwright_binprovider

    binproviders = {'playwright': playwright_binprovider}
    return binproviders
29 |
--------------------------------------------------------------------------------
/bin/build.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | ### Bash Environment Setup
4 | # http://redsymbol.net/articles/unofficial-bash-strict-mode/
5 | # https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html
6 | # set -o xtrace
7 | set -o errexit
8 | set -o errtrace
9 | set -o nounset
10 | set -o pipefail
11 | IFS=$'\n'
12 |
13 | REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )"
14 |
15 | cd "$REPO_DIR"
16 |
17 | # pipenv install --dev
18 |
19 | # the order matters
20 | ./bin/build_docs.sh
21 | ./bin/build_pip.sh
22 | ./bin/build_docker.sh
23 |
24 | echo "[√] Done. Install the built package by running:"
25 | echo " python3 setup.py install"
26 | echo " # or"
27 | echo " pip3 install ."
28 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0012_auto_20210216_1425.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 3.1.3 on 2021-02-16 14:25
2 |
3 | from django.db import migrations, models
4 |
5 |
class Migration(migrations.Migration):
    # Alters the `cmd_version` and `output` fields of the `archiveresult` model.

    dependencies = [
        ('core', '0011_auto_20210216_1331'),
    ]

    operations = [
        # cmd_version: optional (blank/null allowed, default None), up to 128 characters.
        migrations.AlterField(
            model_name='archiveresult',
            name='cmd_version',
            field=models.CharField(blank=True, default=None, max_length=128, null=True),
        ),
        # output: CharField of up to 1024 characters.
        migrations.AlterField(
            model_name='archiveresult',
            name='output',
            field=models.CharField(max_length=1024),
        ),
    ]
24 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-mercury/abx_plugin_mercury/__init__.py:
--------------------------------------------------------------------------------
1 | __package__ = 'abx_plugin_mercury'
2 | __label__ = 'Postlight Parser'
3 | __homepage__ = 'https://github.com/postlight/mercury-parser'
4 |
5 | import abx
6 |
@abx.hookimpl
def get_CONFIG():
    """Return this plugin's config set, keyed by 'MERCURY_CONFIG'."""
    # The import is deferred until the hook is invoked.
    from .config import MERCURY_CONFIG as mercury_config

    return dict(MERCURY_CONFIG=mercury_config)
14 |
@abx.hookimpl
def get_BINARIES():
    """Return the binaries provided by this plugin, keyed by 'mercury'."""
    # The import is deferred until the hook is invoked.
    from .binaries import MERCURY_BINARY as mercury_binary

    binaries = {'mercury': mercury_binary}
    return binaries
22 |
@abx.hookimpl
def get_EXTRACTORS():
    """Return the extractors provided by this plugin, keyed by 'mercury'."""
    # The import is deferred until the hook is invoked.
    from .extractors import MERCURY_EXTRACTOR as mercury_extractor

    extractors = {'mercury': mercury_extractor}
    return extractors
30 |
--------------------------------------------------------------------------------
/archivebox/cli/archivebox_shell.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | __package__ = 'archivebox.cli'
4 |
5 | from typing import Iterable
6 |
7 | import rich_click as click
8 |
9 | from archivebox.misc.util import docstring
10 |
11 |
def shell(args: Iterable[str]=()) -> None:
    """Enter an interactive ArchiveBox Django shell"""

    # Django is imported only when the command actually runs; every argument
    # is forwarded unchanged to the "shell_plus" management command.
    from django.core import management

    management.call_command("shell_plus", *args)
17 |
18 |
# CLI entrypoint. click is configured with add_help_option=False and
# ignore_unknown_options=True, and the `args` argument accepts any number of
# values (nargs=-1). The help text is taken from shell()'s docstring.
@click.command(add_help_option=False, context_settings=dict(ignore_unknown_options=True))
@click.argument('args', nargs=-1)
@docstring(shell.__doc__)
def main(args: Iterable[str]=()) -> None:
    # Thin wrapper: forward the collected arguments to shell().
    shell(args=args)
24 |
25 |
26 | if __name__ == '__main__':
27 | main()
28 |
--------------------------------------------------------------------------------
/etc/crontabs/archivebox:
--------------------------------------------------------------------------------
1 | # DO NOT EDIT THIS FILE - edit the master and reinstall.
2 | # (/tmp/tmpe3dawo9u installed on Tue Jun 13 23:21:48 2023)
3 | # (Cron version -- $Id: crontab.c,v 2.13 1994/01/17 03:20:37 vixie Exp $)
4 |
5 | @daily cd /data && /usr/local/bin/archivebox add --depth=0 "https://example.com/3" >> /data/logs/schedule.log 2>&1 # archivebox_schedule
6 | @daily cd /data && /usr/local/bin/archivebox add --depth=0 "https://example.com/2" >> /data/logs/schedule.log 2>&1 # archivebox_schedule
7 | @daily cd /data && /usr/local/bin/archivebox add --depth=0 "https://example.com" >> /data/logs/schedule.log 2>&1 # archivebox_schedule
8 | @daily cd /data && /usr/local/bin/archivebox add --depth=0 "update" >> /data/logs/schedule.log 2>&1 # archivebox_schedule
9 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0020_auto_20210410_1031.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 3.1.8 on 2021-04-10 10:31
2 |
3 | from django.db import migrations, models
4 |
5 |
class Migration(migrations.Migration):
    # Alters the `id` primary keys of the `archiveresult` and `tag` models.

    dependencies = [
        ('core', '0019_auto_20210401_0654'),
    ]

    operations = [
        # archiveresult.id: AutoField primary key.
        migrations.AlterField(
            model_name='archiveresult',
            name='id',
            field=models.AutoField(primary_key=True, serialize=False, verbose_name='ID'),
        ),
        # tag.id: AutoField primary key.
        migrations.AlterField(
            model_name='tag',
            name='id',
            field=models.AutoField(primary_key=True, serialize=False, verbose_name='ID'),
        ),
    ]
24 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-playwright/abx_plugin_playwright/binaries.py:
--------------------------------------------------------------------------------
1 | __package__ = 'abx_plugin_playwright'
2 |
3 | from typing import List
4 |
5 | from pydantic import InstanceOf
6 | from abx_pkg import BinName, BinProvider, Binary
7 |
8 |
9 | from abx_plugin_pip.binproviders import LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER
10 | from abx_plugin_default_binproviders import env
11 |
12 | from .config import PLAYWRIGHT_CONFIG
13 |
14 |
class PlaywrightBinary(Binary):
    # The binary name is read from the plugin config when this class is defined.
    name: BinName = PLAYWRIGHT_CONFIG.PLAYWRIGHT_BINARY

    # Providers that may supply this binary: the three pip providers, then env.
    # NOTE(review): presumably the list order is the lookup order -- confirm in abx_pkg.
    binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER, env]


# Module-level singleton instance of the binary definition.
PLAYWRIGHT_BINARY = PlaywrightBinary()
22 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-ripgrep-search/abx_plugin_ripgrep_search/binaries.py:
--------------------------------------------------------------------------------
1 | __package__ = 'abx_plugin_ripgrep_search'
2 |
3 | from typing import List
4 |
5 | from pydantic import InstanceOf
6 | from abx_pkg import BinProvider, BinaryOverrides, BinName, Binary
7 |
8 | from abx_plugin_default_binproviders import apt, brew, env
9 |
10 |
11 | from .config import RIPGREP_CONFIG
12 |
13 |
class RipgrepBinary(Binary):
    # The binary name is read from the plugin config when this class is defined.
    name: BinName = RIPGREP_CONFIG.RIPGREP_BINARY
    # Providers that may supply this binary: apt, brew, then env.
    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]

    # For apt and brew the package list is set explicitly to 'ripgrep'.
    overrides: BinaryOverrides = {
        apt.name: {'packages': ['ripgrep']},
        brew.name: {'packages': ['ripgrep']},
    }

# Module-level singleton instance of the binary definition.
RIPGREP_BINARY = RipgrepBinary()
24 |
--------------------------------------------------------------------------------
/bin/release_docs.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | ### Bash Environment Setup
4 | # http://redsymbol.net/articles/unofficial-bash-strict-mode/
5 | # https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html
6 | # set -o xtrace
7 | set -o errexit
8 | set -o errtrace
9 | set -o nounset
10 | set -o pipefail
11 | IFS=$'\n'
12 |
13 | REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )"
14 | VERSION="$(grep '^version = ' "${REPO_DIR}/pyproject.toml" | awk -F'"' '{print $2}')"
15 | cd "$REPO_DIR"
16 |
17 |
18 | echo "[^] Pushing docs to github"
19 | cd docs/
20 | git add .
21 | git commit -am "$VERSION release"
22 | git push
23 | git tag -a "v$VERSION" -m "v$VERSION"
24 | git push origin
25 | git push origin --tags
26 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-puppeteer/abx_plugin_puppeteer/__init__.py:
--------------------------------------------------------------------------------
1 | __package__ = 'abx_plugin_puppeteer'
2 | __label__ = 'Puppeteer'
3 | __homepage__ = 'https://github.com/puppeteer/puppeteer'
4 |
5 | import abx
6 |
7 |
@abx.hookimpl
def get_CONFIG():
    """Return the Puppeteer config set keyed by its config name."""
    from .config import PUPPETEER_CONFIG as puppeteer_config

    return dict(PUPPETEER_CONFIG=puppeteer_config)
15 |
@abx.hookimpl
def get_BINARIES():
    """Return the binaries this plugin contributes, keyed by name."""
    from .binaries import PUPPETEER_BINARY as puppeteer_binary

    binaries = {}
    binaries['puppeteer'] = puppeteer_binary
    return binaries
23 |
@abx.hookimpl
def get_BINPROVIDERS():
    """Return the binary providers this plugin contributes, keyed by name."""
    from .binproviders import PUPPETEER_BINPROVIDER as provider

    return dict(puppeteer=provider)
31 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-ripgrep-search/abx_plugin_ripgrep_search/__init__.py:
--------------------------------------------------------------------------------
1 | __package__ = 'abx_plugin_ripgrep_search'
2 | __label__ = 'Ripgrep Search'
3 | __homepage__ = 'https://github.com/BurntSushi/ripgrep'
4 |
5 | import abx
6 |
@abx.hookimpl
def get_CONFIG():
    """Return the ripgrep config set keyed by its config name."""
    from .config import RIPGREP_CONFIG as ripgrep_config

    return dict(RIPGREP_CONFIG=ripgrep_config)
14 |
15 |
@abx.hookimpl
def get_BINARIES():
    """Return the binaries this plugin contributes, keyed by name."""
    from .binaries import RIPGREP_BINARY as ripgrep_binary

    return dict(ripgrep=ripgrep_binary)
23 |
24 |
@abx.hookimpl
def get_SEARCHBACKENDS():
    """Return the search backends this plugin contributes, keyed by name."""
    from .searchbackend import RIPGREP_SEARCH_BACKEND as backend

    backends = {}
    backends['ripgrep'] = backend
    return backends
32 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-chrome/abx_plugin_chrome/extractors.py:
--------------------------------------------------------------------------------
1 | __package__ = 'abx_plugin_chrome'
2 |
3 | from abx_pkg import BinName
4 |
5 | from abx_spec_extractor import BaseExtractor, ExtractorName
6 |
7 | from .binaries import CHROME_BINARY
8 |
9 |
class PDFExtractor(BaseExtractor):
    # Extractor registered under the name 'pdf'; its binary is the Chrome binary's name.
    name: ExtractorName = 'pdf'
    binary: BinName = CHROME_BINARY.name
13 |
14 | PDF_EXTRACTOR = PDFExtractor()
15 |
16 |
class ScreenshotExtractor(BaseExtractor):
    # Extractor registered under the name 'screenshot'; its binary is the Chrome binary's name.
    name: ExtractorName = 'screenshot'
    binary: BinName = CHROME_BINARY.name
20 |
21 | SCREENSHOT_EXTRACTOR = ScreenshotExtractor()
22 |
class DOMExtractor(BaseExtractor):
    # Extractor registered under the name 'dom'; its binary is the Chrome binary's name.
    name: ExtractorName = 'dom'
    binary: BinName = CHROME_BINARY.name
26 | DOM_EXTRACTOR = DOMExtractor()
27 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-readability/abx_plugin_readability/__init__.py:
--------------------------------------------------------------------------------
1 | __package__ = 'abx_plugin_readability'
2 | __label__ = 'Readability'
3 | __homepage__ = 'https://github.com/ArchiveBox/readability-extractor'
4 |
5 | import abx
6 |
7 |
@abx.hookimpl
def get_CONFIG():
    """Return the Readability config set keyed by its config name."""
    from .config import READABILITY_CONFIG as readability_config

    return dict(READABILITY_CONFIG=readability_config)
15 |
@abx.hookimpl
def get_BINARIES():
    """Return the binaries this plugin contributes, keyed by name."""
    from .binaries import READABILITY_BINARY as readability_binary

    return dict(readability=readability_binary)
23 |
@abx.hookimpl
def get_EXTRACTORS():
    """Return the extractors this plugin contributes, keyed by name."""
    from .extractors import READABILITY_EXTRACTOR as extractor

    extractors = {}
    extractors['readability'] = extractor
    return extractors
31 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-wget/abx_plugin_wget/__init__.py:
--------------------------------------------------------------------------------
1 | __package__ = 'abx_plugin_wget'
2 | __label__ = 'WGET'
3 |
4 | import abx
5 |
6 |
@abx.hookimpl
def get_CONFIG():
    """Return the wget config set keyed by its config name."""
    from .config import WGET_CONFIG as wget_config

    return dict(WGET_CONFIG=wget_config)
14 |
@abx.hookimpl
def get_BINARIES():
    """Return the binaries this plugin contributes, keyed by name."""
    from .binaries import WGET_BINARY as wget_binary

    return dict(wget=wget_binary)
22 |
@abx.hookimpl
def get_EXTRACTORS():
    """Return the wget and warc extractors, keyed by name."""
    from . import extractors as wget_extractors

    return dict(
        wget=wget_extractors.WGET_EXTRACTOR,
        warc=wget_extractors.WARC_EXTRACTOR,
    )
31 |
@abx.hookimpl
def ready():
    """Validate the wget config when the `ready` hook fires."""
    from .config import WGET_CONFIG as wget_config

    wget_config.validate()
36 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0021_auto_20220914_0934.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 3.1.14 on 2022-09-14 09:34
2 |
3 | from django.db import migrations, models
4 |
5 |
6 | class Migration(migrations.Migration):
7 |
8 | dependencies = [
9 | ('core', '0020_auto_20210410_1031'),
10 | ]
11 |
12 | operations = [
13 | migrations.AlterField(
14 | model_name='archiveresult',
15 | name='extractor',
16 | field=models.CharField(choices=[('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('title', 'title'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('archive_org', 'archive_org')], max_length=32),
17 | ),
18 | ]
19 |
--------------------------------------------------------------------------------
/archivebox/workers/views.py:
--------------------------------------------------------------------------------
1 |
2 | from django.views.generic import TemplateView
3 | from django.contrib.auth.mixins import UserPassesTestMixin
4 | from django.utils import timezone
5 | from api.auth import get_or_create_api_token
6 |
7 |
class JobsDashboardView(UserPassesTestMixin, TemplateView):
    """Template view for the jobs dashboard, gated on the superuser flag."""

    template_name = "jobs_dashboard.html"

    def test_func(self):
        """Access check used by UserPassesTestMixin: truthy only for superusers."""
        user = self.request.user
        return user and user.is_superuser

    def get_context_data(self, **kwargs):
        """Extend the template context with an API token and the current time."""
        token = get_or_create_api_token(self.request.user)
        ctx = super().get_context_data(**kwargs)
        if token:
            ctx['api_token'] = token.token
        else:
            # Placeholder shown when no token object was returned.
            ctx['api_token'] = 'UNABLE TO GENERATE API TOKEN'
        ctx['now'] = timezone.now().strftime("%H:%M:%S")
        return ctx
21 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | # To get started with Dependabot version updates, you'll need to specify which
2 | # package ecosystems to update and where the package manifests are located.
3 | # Please see the documentation for all configuration options:
4 | # https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
5 |
6 | version: 2
7 | updates:
8 | - package-ecosystem: "pip"
9 | directory: "/"
10 | target-branch: "dev"
11 | schedule:
12 | interval: "monthly"
13 | groups:
14 | pip:
15 | patterns:
16 | - "*"
17 | - package-ecosystem: "npm"
18 | directory: "/"
19 | target-branch: "dev"
20 | schedule:
21 | interval: "monthly"
22 | groups:
23 | npm:
24 | patterns:
25 | - "*"
26 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-npm/abx_plugin_npm/__init__.py:
--------------------------------------------------------------------------------
1 | __label__ = 'NPM'
2 | __author__ = 'ArchiveBox'
3 | __homepage__ = 'https://www.npmjs.com/'
4 |
5 | import abx
6 |
7 |
@abx.hookimpl
def get_CONFIG():
    """Return the NPM config set keyed by its config name."""
    from .config import NPM_CONFIG as npm_config

    return dict(NPM_CONFIG=npm_config)
14 |
@abx.hookimpl
def get_BINARIES():
    """Return the node, npm and npx binaries, keyed by name."""
    from . import binaries as npm_binaries

    return dict(
        node=npm_binaries.NODE_BINARY,
        npm=npm_binaries.NPM_BINARY,
        npx=npm_binaries.NPX_BINARY,
    )
24 |
@abx.hookimpl
def get_BINPROVIDERS():
    """Return the system-wide and lib-local npm binary providers, keyed by name."""
    from . import binproviders as npm_providers

    return dict(
        sys_npm=npm_providers.SYS_NPM_BINPROVIDER,
        lib_npm=npm_providers.LIB_NPM_BINPROVIDER,
    )
33 |
--------------------------------------------------------------------------------
/bin/release_git.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | ### Bash Environment Setup
4 | # http://redsymbol.net/articles/unofficial-bash-strict-mode/
5 | # https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html
6 | # set -o xtrace
7 | set -o errexit
8 | set -o errtrace
9 | set -o nounset
10 | set -o pipefail
11 | IFS=$'\n'
12 |
13 | REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )"
14 | VERSION="$(grep '^version = ' "${REPO_DIR}/pyproject.toml" | awk -F'"' '{print $2}')"
15 | cd "$REPO_DIR"
16 |
17 |
18 | # Push build to github
19 | echo "[^] Pushing release commit + tag to Github"
20 | git tag -f -a "v$VERSION" -m "v$VERSION"
21 | git push origin -f --tags
22 | echo " To finish publishing the release go here:"
23 | echo " https://github.com/ArchiveBox/ArchiveBox/releases/new"
24 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
1 | blank_issues_enabled: false
2 | contact_links:
3 | - name: ❓ Ask a question or start a discussion
4 | url: https://github.com/ArchiveBox/ArchiveBox/discussions
5 | about: "Ask a question, get support, or start a design discussion (to report a problem please use '🐞 Bug report' instead)"
6 | - name: 💬 Chat with the dev team & community on Zulip
7 | url: https://zulip.archivebox.io
8 | about: "Join us on our Zulip forum to chat with the developers and other users (it's similar to Discord but self-hosted)."
9 | - name: 💁♂️ Hire us for professional support with fast response times
10 | url: https://docs.monadical.com/s/archivebox-consulting-services
11 | about: "We provide hosting, development, and support, including on-prem/cloud w/ SSO & storage, CAPTCHA-solving, proxies, etc."
12 |
--------------------------------------------------------------------------------
/archivebox/api/migrations/0004_alter_apitoken_id_alter_apitoken_uuid.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.1 on 2024-08-20 10:44
2 |
3 | import uuid
4 | from django.db import migrations, models
5 |
6 |
7 | class Migration(migrations.Migration):
8 |
9 | dependencies = [
10 | ('api', '0003_rename_user_apitoken_created_by_apitoken_abid_and_more'),
11 | ]
12 |
13 | operations = [
14 | migrations.AlterField(
15 | model_name='apitoken',
16 | name='id',
17 | field=models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False),
18 | ),
19 | migrations.AlterField(
20 | model_name='apitoken',
21 | name='uuid',
22 | field=models.UUIDField(blank=True, editable=False, null=True, unique=True),
23 | ),
24 | ]
25 |
--------------------------------------------------------------------------------
/archivebox/parsers/generic_jsonl.py:
--------------------------------------------------------------------------------
1 | __package__ = 'archivebox.parsers'
2 |
3 | import json
4 | from typing import IO, Iterable
5 |
6 | from archivebox.misc.util import enforce_types
7 |
8 | from ..index.schema import Link
9 | from .generic_json import jsonObjectToLink
10 |
def parse_line(line: str):
    """Decode one JSONL line; return None when the line is blank or whitespace-only."""
    if line.strip() == "":
        return None
    return json.loads(line)
14 |
@enforce_types
def parse_generic_jsonl_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:
    """Parse JSONL format bookmarks export files

    Rewinds ``json_file`` and yields one Link per line that decodes to a
    truthy JSON value. Blank lines and falsy values (``null``, ``{}``,
    ``[]``, ``0``, ...) are skipped.

    Lines are decoded one at a time as the generator is consumed, so the
    whole export is never held in memory at once. A malformed line raises
    ``json.JSONDecodeError`` when the iteration reaches it.
    """

    json_file.seek(0)

    for line in json_file:
        entry = parse_line(line)
        if entry:
            yield jsonObjectToLink(entry, json_file.name)
26 |
27 | KEY = 'jsonl'
28 | NAME = 'Generic JSONL'
29 | PARSER = parse_generic_jsonl_export
30 |
--------------------------------------------------------------------------------
/etc/fly.toml:
--------------------------------------------------------------------------------
1 | # fly.toml file generated for archivebox on 2021-04-23T16:35:11-04:00
2 |
3 | app = "archivebox"
4 |
5 | kill_signal = "SIGINT"
6 | kill_timeout = 5
7 |
8 | [env]
9 |
10 | [mounts]
11 | source="archivebox_data"
12 | destination="/data"
13 |
14 | [experimental]
15 | auto_rollback = true
16 |
17 | [[services]]
18 | http_checks = []
19 | internal_port = 8000
20 | protocol = "tcp"
21 | script_checks = []
22 |
23 | [services.concurrency]
24 | hard_limit = 25
25 | soft_limit = 20
26 | type = "connections"
27 |
28 | [[services.ports]]
29 | handlers = ["http"]
30 | port = 80
31 |
32 | [[services.ports]]
33 | handlers = ["tls", "http"]
34 | port = 443
35 |
36 | [[services.tcp_checks]]
37 | grace_period = "1s"
38 | interval = "15s"
39 | restart_limit = 6
40 | timeout = "2s"
41 |
--------------------------------------------------------------------------------
/tests/fixtures.py:
--------------------------------------------------------------------------------
1 | import os
2 | import subprocess
3 |
4 | import pytest
5 |
@pytest.fixture
def process(tmp_path):
    """Change into the per-test temp dir, run `archivebox init`, and return the completed process."""
    os.chdir(tmp_path)
    init_cmd = ['archivebox', 'init']
    return subprocess.run(init_cmd, capture_output=True)
11 |
@pytest.fixture
def disable_extractors_dict():
    """Return a copy of the current environment with every extractor toggle set to "false"."""
    extractor_toggles = (
        "USE_WGET",
        "USE_SINGLEFILE",
        "USE_READABILITY",
        "USE_MERCURY",
        "SAVE_HTMLTOTEXT",
        "SAVE_PDF",
        "SAVE_SCREENSHOT",
        "SAVE_DOM",
        "SAVE_HEADERS",
        "USE_GIT",
        "SAVE_MEDIA",
        "SAVE_ARCHIVE_DOT_ORG",
    )
    env = os.environ.copy()
    for toggle in extractor_toggles:
        env[toggle] = "false"
    return env
30 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0022_auto_20231023_2008.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 3.1.14 on 2023-10-23 20:08
2 |
3 | from django.db import migrations, models
4 |
5 |
6 | class Migration(migrations.Migration):
7 |
8 | dependencies = [
9 | ('core', '0021_auto_20220914_0934'),
10 | ]
11 |
12 | operations = [
13 | migrations.AlterField(
14 | model_name='archiveresult',
15 | name='extractor',
16 | field=models.CharField(choices=[('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('title', 'title'), ('readability', 'readability'), ('mercury', 'mercury'), ('htmltotext', 'htmltotext'), ('git', 'git'), ('media', 'media'), ('archive_org', 'archive_org')], max_length=32),
17 | ),
18 | ]
19 |
--------------------------------------------------------------------------------
/archivebox/api/migrations/0007_alter_apitoken_created_by.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.1 on 2024-08-20 22:52
2 |
3 | import django.db.models.deletion
4 | from django.conf import settings
5 | from django.db import migrations, models
6 |
7 | import archivebox.base_models.models
8 |
9 |
10 | class Migration(migrations.Migration):
11 |
12 | dependencies = [
13 | ('api', '0006_remove_outboundwebhook_uuid_apitoken_id_and_more'),
14 | migrations.swappable_dependency(settings.AUTH_USER_MODEL),
15 | ]
16 |
17 | operations = [
18 | migrations.AlterField(
19 | model_name='apitoken',
20 | name='created_by',
21 | field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
22 | ),
23 | ]
24 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-sonic-search/abx_plugin_sonic_search/__init__.py:
--------------------------------------------------------------------------------
1 | __package__ = 'abx_plugin_sonic_search'
2 | __label__ = 'Sonic Search'
3 | __homepage__ = 'https://github.com/valeriansaliou/sonic'
4 |
5 | import abx
6 |
7 |
@abx.hookimpl
def get_CONFIG():
    """Return the Sonic config set keyed by its config name."""
    from .config import SONIC_CONFIG as sonic_config

    return dict(SONIC_CONFIG=sonic_config)
15 |
16 |
@abx.hookimpl
def get_BINARIES():
    """Return the binaries this plugin contributes, keyed by name."""
    from .binaries import SONIC_BINARY as sonic_binary

    return dict(sonic=sonic_binary)
24 |
25 |
@abx.hookimpl
def get_SEARCHBACKENDS():
    """Return the search backends this plugin contributes, keyed by name."""
    from .searchbackend import SONIC_SEARCH_BACKEND as backend

    backends = {}
    backends['sonic'] = backend
    return backends
33 |
@abx.hookimpl
def ready():
    """Validate the Sonic config when the `ready` hook fires."""
    from .config import SONIC_CONFIG as sonic_config

    sonic_config.validate()
38 |
--------------------------------------------------------------------------------
/archivebox/workers/admin.py:
--------------------------------------------------------------------------------
1 | __package__ = 'archivebox.workers'
2 |
3 | import abx
4 |
5 | from django.contrib.auth import get_permission_codename
6 |
7 | from huey_monitor.apps import HueyMonitorConfig
8 | from huey_monitor.admin import TaskModel, TaskModelAdmin, SignalInfoModel, SignalInfoModelAdmin
9 |
10 |
11 | HueyMonitorConfig.verbose_name = 'Background Workers'
12 |
13 |
class CustomTaskModelAdmin(TaskModelAdmin):
    """TaskModelAdmin variant that offers the delete_selected action."""

    actions = ["delete_selected"]

    def has_delete_permission(self, request, obj=None):
        """Allow deletion when the user holds this model's `delete` permission."""
        opts = self.opts
        delete_codename = get_permission_codename("delete", opts)
        permission = "%s.%s" % (opts.app_label, delete_codename)
        return request.user.has_perm(permission)
20 |
21 |
22 |
@abx.hookimpl
def register_admin(admin_site):
    """Register the huey-monitor task and signal-info models on `admin_site`."""
    registrations = (
        (TaskModel, CustomTaskModelAdmin),
        (SignalInfoModel, SignalInfoModelAdmin),
    )
    for model, model_admin in registrations:
        admin_site.register(model, model_admin)
27 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-git/abx_plugin_git/config.py:
--------------------------------------------------------------------------------
1 | __package__ = 'abx_plugin_git'
2 |
3 | from typing import List
4 |
5 | from pydantic import Field
6 |
7 | from abx_spec_config.base_configset import BaseConfigSet
8 |
9 | from archivebox.config.common import ARCHIVING_CONFIG
10 |
11 |
class GitConfig(BaseConfigSet):
    # Config set for the git plugin.

    # Toggle for the git extractor; enabled by default.
    SAVE_GIT: bool = True

    # Comma-separated list of domains, stored as a single string.
    GIT_DOMAINS: str = Field(default='github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht')

    # Name of the git executable, plus default and user-supplied extra arguments.
    GIT_BINARY: str = Field(default='git')
    GIT_ARGS: List[str] = [
        '--recursive',
    ]
    GIT_EXTRA_ARGS: List[str] = []

    # Timeout and SSL-check defaults are taken from the global ARCHIVING_CONFIG.
    # NOTE(review): a lambda is passed as `default` (not `default_factory`);
    # presumably BaseConfigSet resolves callable defaults lazily — confirm.
    GIT_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
    GIT_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
26 |
27 |
28 | GIT_CONFIG = GitConfig()
29 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-singlefile/abx_plugin_singlefile/__init__.py:
--------------------------------------------------------------------------------
1 | __package__ = 'abx_plugin_singlefile'
2 | __label__ = 'Singlefile'
3 | __homepage__ = 'https://github.com/gildas-lormeau/singlefile'
4 |
5 | import abx
6 |
7 |
@abx.hookimpl
def get_CONFIG():
    """Return the Singlefile config set keyed by its config name."""
    from .config import SINGLEFILE_CONFIG as singlefile_config

    return dict(SINGLEFILE_CONFIG=singlefile_config)
15 |
@abx.hookimpl
def get_BINARIES():
    """Return the binaries this plugin contributes, keyed by name."""
    from .binaries import SINGLEFILE_BINARY as singlefile_binary

    return dict(singlefile=singlefile_binary)
23 |
@abx.hookimpl
def get_EXTRACTORS():
    """Return the extractors this plugin contributes, keyed by name."""
    from .extractors import SINGLEFILE_EXTRACTOR as extractor

    extractors = {}
    extractors['singlefile'] = extractor
    return extractors
31 |
@abx.hookimpl
def get_INSTALLED_APPS():
    """Return this package's name as the only app to install."""
    # Listing the package is needed to load ./models.py.
    installed_apps = [__package__]
    return installed_apps
36 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0041_alter_archiveresult_snapshot_and_more.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.0.6 on 2024-08-18 06:50
2 |
3 | import django.db.models.deletion
4 | from django.db import migrations, models
5 |
6 |
7 | class Migration(migrations.Migration):
8 |
9 | dependencies = [
10 | ('core', '0040_archiveresult_snapshot'),
11 | ]
12 |
13 | operations = [
14 | migrations.AlterField(
15 | model_name='archiveresult',
16 | name='snapshot',
17 | field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.snapshot', to_field='id'),
18 | ),
19 | migrations.AlterField(
20 | model_name='archiveresult',
21 | name='snapshot_old',
22 | field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='archiveresults_old', to='core.snapshot'),
23 | ),
24 | ]
25 |
--------------------------------------------------------------------------------
/bin/build_docs.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | ### Bash Environment Setup
4 | # http://redsymbol.net/articles/unofficial-bash-strict-mode/
5 | # https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html
6 | # set -o xtrace
7 | set -o errexit
8 | set -o errtrace
9 | set -o nounset
10 | set -o pipefail
11 | IFS=$'\n'
12 |
13 | REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )"
14 |
# Activate the repo-local virtualenv when it exists; otherwise warn and
# continue with whatever Python environment is already active.
if [[ -f "$REPO_DIR/.venv/bin/activate" ]]; then
    source "$REPO_DIR/.venv/bin/activate"
else
    echo "[!] Warning: No virtualenv present in $REPO_DIR/.venv"
fi
20 | cd "$REPO_DIR"
21 |
22 |
23 | echo "[*] Fetching latest docs version"
24 | cd "$REPO_DIR/docs"
25 | git pull
26 | cd "$REPO_DIR"
27 |
28 | echo "[+] Building docs"
29 | cd "$REPO_DIR/docs"
30 | make clean
31 | make html
32 | # open docs/_build/html/index.html to see the output
33 | cd "$REPO_DIR"
34 |
--------------------------------------------------------------------------------
/.github/workflows/lint.yml:
--------------------------------------------------------------------------------
1 | name: Run linters
2 |
3 | on:
4 | workflow_dispatch:
5 | push:
6 |
7 | env:
8 | MAX_LINE_LENGTH: 110
9 |
10 | jobs:
11 | lint:
12 | runs-on: ubuntu-20.04
13 | steps:
14 | - uses: actions/checkout@v2
15 | with:
16 | submodules: true
17 | fetch-depth: 1
18 |
19 | - name: Set up Python
20 | uses: actions/setup-python@v1
21 | with:
22 | python-version: 3.9
23 | architecture: x64
24 |
25 | - name: Install flake8
26 | run: |
27 | pip install flake8
28 |
29 | - name: Lint with flake8
30 | run: |
31 | cd archivebox
32 | # one pass for show-stopper syntax errors or undefined names
33 | flake8 . --count --show-source --statistics
34 | # one pass for small stylistic things
35 | flake8 . --count --max-line-length="$MAX_LINE_LENGTH" --statistics
36 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0047_alter_snapshottag_unique_together_and_more.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.0.6 on 2024-08-20 02:16
2 |
3 | import django.db.models.deletion
4 | from django.db import migrations, models
5 |
6 |
7 | class Migration(migrations.Migration):
8 |
9 | dependencies = [
10 | ('core', '0046_alter_archiveresult_snapshot_alter_snapshot_id_and_more'),
11 | ]
12 |
13 | operations = [
14 | migrations.AlterField(
15 | model_name='archiveresult',
16 | name='snapshot',
17 | field=models.ForeignKey(db_column='snapshot_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot', to_field='id'),
18 | ),
19 | migrations.AlterField(
20 | model_name='snapshottag',
21 | name='tag',
22 | field=models.ForeignKey(db_column='tag_id', on_delete=django.db.models.deletion.CASCADE, to='core.tag'),
23 | ),
24 | ]
25 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0048_alter_archiveresult_snapshot_and_more.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.0.6 on 2024-08-20 02:17
2 |
3 | import django.db.models.deletion
4 | from django.db import migrations, models
5 |
6 |
7 | class Migration(migrations.Migration):
8 |
9 | dependencies = [
10 | ('core', '0047_alter_snapshottag_unique_together_and_more'),
11 | ]
12 |
13 | operations = [
14 | migrations.AlterField(
15 | model_name='archiveresult',
16 | name='snapshot',
17 | field=models.ForeignKey(db_column='snapshot_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot'),
18 | ),
19 | migrations.AlterField(
20 | model_name='snapshottag',
21 | name='snapshot',
22 | field=models.ForeignKey(db_column='snapshot_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot', to_field='old_id'),
23 | ),
24 | ]
25 |
--------------------------------------------------------------------------------
/archivebox/core/__init__.py:
--------------------------------------------------------------------------------
1 | __package__ = 'archivebox.core'
2 | __order__ = 100
3 | import abx
4 |
@abx.hookimpl
def register_admin(admin_site):
    """Register the core.models views (Snapshot, ArchiveResult, Tag, etc.) with the admin site"""
    # Aliased so the imported function does not shadow this hook's own name.
    from core.admin import register_admin as register_core_admin

    register_core_admin(admin_site)
10 |
11 |
12 |
@abx.hookimpl
def get_CONFIG():
    """Return the core config sets from archivebox.config.common, keyed by name."""
    from archivebox.config import common

    return dict(
        SHELL_CONFIG=common.SHELL_CONFIG,
        STORAGE_CONFIG=common.STORAGE_CONFIG,
        GENERAL_CONFIG=common.GENERAL_CONFIG,
        SERVER_CONFIG=common.SERVER_CONFIG,
        ARCHIVING_CONFIG=common.ARCHIVING_CONFIG,
        # Key is spelled without the underscore that the imported name carries.
        SEARCHBACKEND_CONFIG=common.SEARCH_BACKEND_CONFIG,
    )
31 |
32 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0064_alter_snapshottag_unique_together_and_more.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.0.6 on 2024-08-20 03:50
2 |
3 | import django.db.models.deletion
4 | from django.db import migrations, models
5 |
6 |
7 | class Migration(migrations.Migration):
8 |
9 | dependencies = [
10 | ('core', '0063_snapshottag_tag_alter_snapshottag_old_tag'),
11 | ]
12 |
13 | operations = [
14 | migrations.AlterUniqueTogether(
15 | name='snapshottag',
16 | unique_together=set(),
17 | ),
18 | migrations.AlterField(
19 | model_name='snapshottag',
20 | name='tag',
21 | field=models.ForeignKey(db_column='tag_id', on_delete=django.db.models.deletion.CASCADE, to='core.tag', to_field='id'),
22 | ),
23 | migrations.AlterUniqueTogether(
24 | name='snapshottag',
25 | unique_together={('snapshot', 'tag')},
26 | ),
27 | ]
28 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-spec-archivebox/abx_spec_archivebox/__init__.py:
--------------------------------------------------------------------------------
1 | __package__ = 'abx_spec_archivebox'
2 | __order__ = 400
3 |
4 | # from .effects import *
5 | # from .events import *
6 | # from .reads import *
7 | # from .writes import *
8 | # from .states import *
9 |
10 | from typing import cast
11 |
12 | import abx
13 | from abx_spec_config import ConfigPluginSpec
14 | from abx_spec_abx_pkg import AbxPkgPluginSpec
15 | from abx_spec_django import DjangoPluginSpec
16 | from abx_spec_searchbackend import SearchBackendPluginSpec
17 |
class ArchiveBoxPluginSpec(ConfigPluginSpec, AbxPkgPluginSpec, DjangoPluginSpec, SearchBackendPluginSpec):
    """
    ArchiveBox plugins can use any of the hooks from the Config, AbxPkg, Django, and SearchBackend plugin specs.
    """
    pass
23 |
24 | PLUGIN_SPEC = ArchiveBoxPluginSpec
25 |
26 |
27 | TypedPluginManager = abx.ABXPluginManager[ArchiveBoxPluginSpec]
28 | pm = cast(TypedPluginManager, abx.pm)
29 |
--------------------------------------------------------------------------------
/archivebox/workers/tests.py:
--------------------------------------------------------------------------------
1 | from django.test import TestCase
2 |
3 | # Create your tests here.
4 |
5 |
class CrawlActorTest(TestCase):
    # Exercises the crawl-creation flow: dispatch a CRAWL_CREATE event, then
    # step a CrawlActor twice and check the events it emits.
    #
    # NOTE(review): Seed, Event, CrawlActor and Crawl are used below but the
    # only import visible in this module is django.test.TestCase, so this test
    # would fail with NameError as written — confirm the intended import paths.

    def test_crawl_creation(self):
        seed = Seed.objects.create(uri='https://example.com')
        Event.dispatch('CRAWL_CREATE', {'seed_id': seed.id})

        crawl_actor = CrawlActor()

        # First step is expected to emit a single FS_WRITE event for the crawl index file.
        output_events = list(crawl_actor.process_next_event())

        assert len(output_events) == 1
        assert output_events[0].get('name', 'unset') == 'FS_WRITE'
        assert output_events[0].get('path') == '/tmp/test_crawl/index.json'

        # Second step is expected to emit a single CRAWL_CREATED event.
        output_events = list(crawl_actor.process_next_event())
        assert len(output_events) == 1
        assert output_events[0].get('name', 'unset') == 'CRAWL_CREATED'

        # A Crawl row for the seed should exist after both steps.
        assert Crawl.objects.filter(seed_id=seed.id).exists(), 'Crawl was not created'
25 |
26 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0005_auto_20200728_0326.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 3.0.7 on 2020-07-28 03:26
2 |
3 | from django.db import migrations, models
4 |
5 |
6 | class Migration(migrations.Migration):
7 |
8 | dependencies = [
9 | ('core', '0004_auto_20200713_1552'),
10 | ]
11 |
12 | operations = [
13 | migrations.AlterField(
14 | model_name='snapshot',
15 | name='tags',
16 | field=models.CharField(blank=True, db_index=True, max_length=256, null=True),
17 | ),
18 | migrations.AlterField(
19 | model_name='snapshot',
20 | name='title',
21 | field=models.CharField(blank=True, db_index=True, max_length=128, null=True),
22 | ),
23 | migrations.AlterField(
24 | model_name='snapshot',
25 | name='updated',
26 | field=models.DateTimeField(blank=True, db_index=True, null=True),
27 | ),
28 | ]
29 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0052_alter_snapshottag_unique_together_and_more.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.0.6 on 2024-08-20 02:37
2 |
3 | import django.db.models.deletion
4 | from django.db import migrations, models
5 |
6 |
7 | class Migration(migrations.Migration):
8 |
9 | dependencies = [
10 | ('core', '0051_snapshottag_snapshot_alter_snapshottag_snapshot_old'),
11 | ]
12 |
13 | operations = [
14 | migrations.AlterUniqueTogether(
15 | name='snapshottag',
16 | unique_together=set(),
17 | ),
18 | migrations.AlterField(
19 | model_name='snapshottag',
20 | name='snapshot',
21 | field=models.ForeignKey(db_column='snapshot_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot'),
22 | ),
23 | migrations.AlterUniqueTogether(
24 | name='snapshottag',
25 | unique_together={('snapshot', 'tag')},
26 | ),
27 | ]
28 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-singlefile/abx_plugin_singlefile/config.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | from typing import List, Optional
3 |
4 | from pydantic import Field
5 |
6 | from abx_spec_config.base_configset import BaseConfigSet
7 |
8 | from archivebox.config.common import ARCHIVING_CONFIG
9 |
10 |
class SinglefileConfig(BaseConfigSet):
    # Config set for the singlefile plugin.

    # Toggle for the singlefile extractor; enabled by default.
    SAVE_SINGLEFILE: bool = True

    # These four defaults are taken from the global ARCHIVING_CONFIG.
    # NOTE(review): a lambda is passed as `default` (not `default_factory`);
    # presumably BaseConfigSet resolves callable defaults lazily — confirm.
    SINGLEFILE_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
    SINGLEFILE_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
    SINGLEFILE_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
    SINGLEFILE_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)

    # Name of the single-file executable and any user-supplied extra arguments.
    SINGLEFILE_BINARY: str = Field(default='single-file')
    SINGLEFILE_EXTRA_ARGS: List[str] = []
21 |
22 |
23 | SINGLEFILE_CONFIG = SinglefileConfig()
24 |
--------------------------------------------------------------------------------
/archivebox/templates/core/minimal_index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Archived Sites
5 |
6 |
7 |
8 |
9 |
10 |
11 | Bookmarked
12 | Saved Link ({{num_links}})
13 | Files
14 | Original URL
15 |
16 |
17 |
18 | {% for link in links %}
19 | {% include "index_row.html" with link=link %}
20 | {% endfor %}
21 |
22 |
23 |
24 |
25 |
--------------------------------------------------------------------------------
/bin/release.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | ### Bash Environment Setup
4 | # http://redsymbol.net/articles/unofficial-bash-strict-mode/
5 | # https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html
6 | # set -o xtrace
7 | set -o errexit
8 | set -o errtrace
9 | set -o nounset
10 | set -o pipefail
11 | IFS=$'\n'
12 |
13 | REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )"
14 | cd "$REPO_DIR"
15 |
16 |
17 | # Run the linters and tests
18 | # ./bin/lint.sh
19 | # ./bin/test.sh
20 |
21 | # # Run all the build scripts
22 | # ./bin/build_git.sh
23 | # ./bin/build_docs.sh
24 | # ./bin/build_pip.sh
25 | # ./bin/build_docker.sh
26 |
27 | # Push release to public repositories
28 | # ./bin/release_docs.sh
29 | ./bin/release_git.sh "$@"
30 | ./bin/release_pip.sh "$@"
31 | ./bin/release_docker.sh "$@"
32 |
33 | VERSION="$(grep '^version = ' "${REPO_DIR}/pyproject.toml" | awk -F'"' '{print $2}')"
34 | echo "[√] Done. Published version v$VERSION"
35 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0036_alter_archiveresult_id_alter_archiveresult_old_id.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.0.6 on 2024-08-18 05:59
2 |
3 | import core.models
4 | import uuid
5 | import random
6 | from django.db import migrations, models
7 |
8 |
def rand_int_id():
    """Return a random non-negative integer drawn from 32 random bits."""
    id_bits = 32
    return random.getrandbits(id_bits)
11 |
class Migration(migrations.Migration):
    """
    Alter ``archiveresult.id`` and ``archiveresult.old_id``.

    ``id`` becomes a unique UUID primary key defaulting to ``uuid.uuid4``;
    ``old_id`` becomes a BigIntegerField defaulting to ``rand_int_id``.
    """

    dependencies = [
        ('core', '0035_remove_archiveresult_uuid_archiveresult_id'),
    ]

    operations = [
        # id: UUID primary key
        migrations.AlterField(
            model_name='archiveresult',
            name='id',
            field=models.UUIDField(default=uuid.uuid4, primary_key=True, serialize=False, unique=True, verbose_name='ID'),
        ),
        # old_id: big integer whose default comes from the module-level rand_int_id()
        migrations.AlterField(
            model_name='archiveresult',
            name='old_id',
            field=models.BigIntegerField(default=rand_int_id, serialize=False, verbose_name='Old ID'),
        ),
    ]
30 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-spec-archivebox/abx_spec_archivebox/events.py:
--------------------------------------------------------------------------------
1 | """
2 | Hookspec for ArchiveBox system events that plugins can hook into.
3 |
4 | Loosely modeled after Django's signals architecture.
5 | https://docs.djangoproject.com/en/5.1/ref/signals/
6 | """
7 |
8 | __package__ = 'abx.archivebox'
9 |
10 | import abx
11 |
12 |
13 |
@abx.hookspec
def on_crawl_schedule_tick(crawl_schedule):
    # Hook specification only (no behavior here): implementations receive `crawl_schedule`.
    # NOTE(review): presumably fired each time a crawl schedule ticks - confirm against the caller
    pass
17 |
18 |
19 |
20 |
@abx.hookspec
def on_seed_post_save(seed, created=False):
    # Hook specification only: implementations receive `seed` and a `created` flag (default False).
    # NOTE(review): presumably mirrors Django's post_save signal for seeds - confirm against the caller
    ...
24 |
@abx.hookspec
def on_crawl_post_save(crawl, created=False):
    # Hook specification only: implementations receive `crawl` and a `created` flag (default False).
    # NOTE(review): presumably mirrors Django's post_save signal for crawls - confirm against the caller
    ...
28 |
29 |
@abx.hookspec
def on_snapshot_post_save(snapshot, created=False):
    # Hook specification only: implementations receive `snapshot` and a `created` flag (default False).
    # NOTE(review): presumably mirrors Django's post_save signal for snapshots - confirm against the caller
    ...
33 |
34 | # @abx.hookspec
35 | # def on_snapshot_post_delete(snapshot):
36 | # ...
37 |
38 |
@abx.hookspec
def on_archiveresult_post_save(archiveresult, created=False):
    # Hook specification only: implementations receive `archiveresult` and a `created` flag (default False).
    # NOTE(review): presumably mirrors Django's post_save signal for archive results - confirm against the caller
    ...
42 |
43 | # @abx.hookspec
44 | # def on_archiveresult_post_delete(archiveresult):
45 | # ...
46 |
--------------------------------------------------------------------------------
/archivebox/parsers/url_list.py:
--------------------------------------------------------------------------------
1 | __package__ = 'archivebox.parsers'
2 | __description__ = 'URL list'
3 |
4 | import re
5 |
6 | from typing import IO, Iterable
7 | from datetime import datetime, timezone
8 |
9 | from ..index.schema import Link
10 | from archivebox.misc.util import (
11 | enforce_types,
12 | URL_REGEX,
13 | )
14 |
15 |
@enforce_types
def parse_url_list(text_file: IO[str], **_kwargs) -> Iterable[Link]:
    """Yield one Link per line of text_file in which URL_REGEX finds a match."""

    text_file.seek(0)
    for raw_line in text_file.readlines():
        candidate = raw_line.strip()
        # blank lines and lines without any URL match are skipped
        if candidate and re.search(URL_REGEX, candidate):
            yield Link(
                url=candidate,
                timestamp=str(datetime.now(timezone.utc).timestamp()),
                title=None,
                tags=None,
                sources=[text_file.name],
            )
33 |
34 |
35 | KEY = 'url_list'
36 | NAME = 'URL List'
37 | PARSER = parse_url_list
38 |
--------------------------------------------------------------------------------
/etc/archivebox.service:
--------------------------------------------------------------------------------
1 | # This is an example systemd service config definition for ArchiveBox.
2 | #
3 | # Link it into place on your system to use systemd to auto-start the ArchiveBox server on boot:
4 | # https://unix.stackexchange.com/questions/224992/where-do-i-put-my-systemd-unit-file
5 | #
6 | # Review and change these lines as-needed for your specific environment and needs:
7 | # WorkingDirectory, ExecStart, User, Group
8 |
9 | [Unit]
10 | Description=Open source self-hosted web archiving
11 | Documentation=https://github.com/ArchiveBox/ArchiveBox/wiki
12 |
[Service]
Type=simple
WorkingDirectory=/home/archivebox/archivebox/
ExecStart=/usr/local/bin/archivebox server 0.0.0.0:8000
ExecReload=/bin/kill -s HUP $MAINPID
ExecStop=/bin/kill -s QUIT $MAINPID
Restart=always
RestartSec=2
# "syslog" is a deprecated StandardOutput=/StandardError= target in current
# systemd releases; log to the journal instead (SyslogIdentifier= still applies).
StandardOutput=journal
StandardError=journal
SyslogIdentifier=archivebox
User=archivebox
Group=archivebox
26 |
27 |
28 | [Install]
29 | WantedBy=multi-user.target
30 |
--------------------------------------------------------------------------------
/website/assets/css/style.scss:
--------------------------------------------------------------------------------
1 | ---
2 | ---
3 |
4 | @import "{{ site.theme }}";
5 |
6 | div.shell {
7 | width: 80%;
8 | max-width: 1300px;
9 | min-width: 300px;
10 | }
11 |
12 | span.banner-fix {
13 | width: 80%;
14 | max-width: 1300px;
15 | min-width: 300px;
16 | }
17 |
header h1 {
  background-color: #aa1f55;
  padding-bottom: 15px;
  // font-weight takes a unitless number; the previous "200px" was invalid and ignored
  font-weight: 200;
}
23 | header h2 {
24 | background-color: #aa1f55;
25 | font-family: 'Open Sans';
26 | }
27 |
28 | #main_content div[align=center] h1 {
29 | display: none;
30 | }
31 | #main_content img {
32 | box-shadow: 4px 4px 4px rgba(0,0,0,0.1);
33 | border-radius: 8px;
34 | border: 0px;
35 | vertical-align: top;
36 | }
37 | #main_content em img {
38 | display: block;
39 | margin-top: -83px;
40 | padding: 0px;
41 | margin-bottom: 20px;
42 | }
43 |
44 | #main_content img[alt=comparison] {
45 | margin: 25px;
46 | }
47 |
48 | #forkme_banner {
49 | opacity: 0.1;
50 | }
51 |
--------------------------------------------------------------------------------
/archivebox/api/migrations/0006_remove_outboundwebhook_uuid_apitoken_id_and_more.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.1 on 2024-08-20 22:43
2 |
3 | import uuid
4 | from django.db import migrations, models
5 |
6 |
class Migration(migrations.Migration):
    """
    Rename ``outboundwebhook.uuid`` to ``id`` and make ``id`` a UUID primary key
    (default ``uuid.uuid4``, not editable) on both ``outboundwebhook`` and ``apitoken``.
    """

    dependencies = [
        ('api', '0005_remove_apitoken_uuid_remove_outboundwebhook_uuid_and_more'),
    ]

    operations = [
        # the renamed column is altered into the primary key by the next operation
        migrations.RenameField(
            model_name='outboundwebhook',
            old_name='uuid',
            new_name='id'
        ),
        migrations.AlterField(
            model_name='outboundwebhook',
            name='id',
            field=models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False),
        ),
        # apitoken.id gets the same UUID primary-key definition
        migrations.AlterField(
            model_name='apitoken',
            name='id',
            field=models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False),
        ),
    ]
30 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0011_auto_20210216_1331.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 3.1.3 on 2021-02-16 13:31
2 |
3 | from django.db import migrations, models
4 | import uuid
5 |
6 |
class Migration(migrations.Migration):
    """
    Add a non-editable ``uuid`` field (default ``uuid.uuid4``) to ``archiveresult``
    and redefine the choices of ``archiveresult.extractor``.
    """

    dependencies = [
        ('core', '0010_auto_20210216_1055'),
    ]

    operations = [
        migrations.AddField(
            model_name='archiveresult',
            name='uuid',
            field=models.UUIDField(default=uuid.uuid4, editable=False),
        ),
        # extractor stays a 32-char CharField; each choice uses the same string as value and label
        migrations.AlterField(
            model_name='archiveresult',
            name='extractor',
            field=models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('archive_org', 'archive_org')], max_length=32),
        ),
    ]
25 |
--------------------------------------------------------------------------------
/tests/mock_server/templates/example.atom:
--------------------------------------------------------------------------------
1 |
2 |
6 | http://www.example.com/
7 | Example of an Atom feed
8 |
9 |
10 |
11 | Jim Winstead
12 |
13 | 2024-02-26T03:18:26Z
14 |
15 | Example
16 |
17 | tag:example.com,2024-02-25:3319
18 | 2024-02-26T03:18:26Z
19 | 2024-02-25T19:18:25-08:00
20 |
21 |
22 | This is some <b>content</b>
23 |
24 |
25 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0001_initial.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 2.2 on 2019-05-01 03:27
2 |
3 | from django.db import migrations, models
4 | import uuid
5 |
6 |
class Migration(migrations.Migration):
    """Initial migration: creates the ``Snapshot`` model."""

    initial = True

    # no dependencies: this is the first migration
    dependencies = [
    ]

    operations = [
        migrations.CreateModel(
            name='Snapshot',
            fields=[
                # UUID primary key generated with uuid.uuid4
                ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
                # url and timestamp are both unique; timestamp may be NULL
                ('url', models.URLField(unique=True)),
                ('timestamp', models.CharField(default=None, max_length=32, null=True, unique=True)),
                ('title', models.CharField(default=None, max_length=128, null=True)),
                # tags are stored as a single nullable string of up to 256 chars here
                ('tags', models.CharField(default=None, max_length=256, null=True)),
                # added is set automatically on creation; updated is nullable with no default value
                ('added', models.DateTimeField(auto_now_add=True)),
                ('updated', models.DateTimeField(default=None, null=True)),
            ],
        ),
    ]
28 |
--------------------------------------------------------------------------------
/archivebox/cli/archivebox_worker.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | __package__ = 'archivebox.cli'
4 | __command__ = 'archivebox worker'
5 |
6 | import sys
7 | import json
8 |
9 | import rich_click as click
10 |
11 |
@click.command()
@click.argument('worker_type')
@click.option('--wait-for-first-event', is_flag=True)
@click.option('--exit-on-idle', is_flag=True)
def main(worker_type: str, wait_for_first_event: bool, exit_on_idle: bool):
    """Start an ArchiveBox worker process of the given type"""
    # CLI entry point: resolves `worker_type` to a worker class via get_worker_type(),
    # runs it with the two flags, and prints every event that run() yields.

    # imported inside the command rather than at module import time
    from workers.worker import get_worker_type

    # allow piping in events to process from stdin
    # if not sys.stdin.isatty():
    #    for line in sys.stdin.readlines():
    #        Event.dispatch(event=json.loads(line), parent=None)

    # run the worker
    Worker = get_worker_type(worker_type)
    for event in Worker.run(wait_for_first_event=wait_for_first_event, exit_on_idle=exit_on_idle):
        print(event)
30 |
31 |
32 | if __name__ == '__main__':
33 | main()
34 |
--------------------------------------------------------------------------------
/archivebox/core/asgi.py:
--------------------------------------------------------------------------------
"""
ASGI config for archivebox project.

It exposes the ASGI callable as a module-level variable named ``application``.

For more information on this file, see
https://docs.djangoproject.com/en/5.1/howto/deployment/asgi/
"""

from archivebox.config.django import setup_django

# Called before the channels/Django imports below.
# NOTE(review): presumably those imports need Django to be configured first - confirm
setup_django(in_memory_db=False, check_db=True)


# from channels.auth import AuthMiddlewareStack
# from channels.security.websocket import AllowedHostsOriginValidator
from channels.routing import ProtocolTypeRouter  # , URLRouter
from django.core.asgi import get_asgi_application

# from core.routing import websocket_urlpatterns


django_asgi_app = get_asgi_application()

# Only the "http" protocol is routed (to the Django ASGI app); websocket routing is left commented out.
application = ProtocolTypeRouter(
    {
        "http": django_asgi_app,
        # only if we need websocket support later:
        # "websocket": AllowedHostsOriginValidator(
        #     AuthMiddlewareStack(URLRouter(websocket_urlpatterns))
        # ),
    }
)
34 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-pip/abx_plugin_pip/__init__.py:
--------------------------------------------------------------------------------
1 | __package__ = 'abx_plugin_pip'
2 | __label__ = 'PIP'
3 | __order__ = 200
4 |
5 | import abx
6 |
7 |
@abx.hookimpl
def get_CONFIG():
    """Return this plugin's config set keyed by 'PIP_CONFIG'."""
    from .config import PIP_CONFIG

    return dict(PIP_CONFIG=PIP_CONFIG)
15 |
@abx.hookimpl(tryfirst=True)
def get_BINARIES():
    """Return the binaries declared in .binaries, keyed by short name."""
    from .binaries import (
        ARCHIVEBOX_BINARY,
        PYTHON_BINARY,
        DJANGO_BINARY,
        SQLITE_BINARY,
        PIP_BINARY,
        PIPX_BINARY,
    )

    # keyword order matches the original dict literal, so key order is unchanged
    return dict(
        archivebox=ARCHIVEBOX_BINARY,
        python=PYTHON_BINARY,
        django=DJANGO_BINARY,
        sqlite=SQLITE_BINARY,
        pip=PIP_BINARY,
        pipx=PIPX_BINARY,
    )
28 |
@abx.hookimpl
def get_BINPROVIDERS():
    """Return the pip bin providers declared in .binproviders, keyed by short name."""
    from .binproviders import (
        SYS_PIP_BINPROVIDER,
        VENV_PIP_BINPROVIDER,
        LIB_PIP_BINPROVIDER,
    )

    return dict(
        sys_pip=SYS_PIP_BINPROVIDER,
        venv_pip=VENV_PIP_BINPROVIDER,
        lib_pip=LIB_PIP_BINPROVIDER,
    )
38 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-readability/abx_plugin_readability/binaries.py:
--------------------------------------------------------------------------------
1 | __package__ = 'abx_plugin_readability'
2 |
3 | from typing import List
4 |
5 | from pydantic import InstanceOf
6 | from abx_pkg import Binary, BinProvider, BinaryOverrides, BinName
7 |
8 | from abx_plugin_default_binproviders import env
9 | from abx_plugin_npm.binproviders import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
10 |
11 | from .config import READABILITY_CONFIG
12 |
13 |
14 | READABILITY_PACKAGE_NAME = 'github:ArchiveBox/readability-extractor'
15 |
class ReadabilityBinary(Binary):
    # Binary definition for the readability extractor; instantiated once below as READABILITY_BINARY.

    # binary name is taken from the plugin config
    name: BinName = READABILITY_CONFIG.READABILITY_BINARY
    # providers listed as lib npm, then system npm, then env
    binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]

    # both npm providers use the same package; the system provider's install is replaced by a no-op
    overrides: BinaryOverrides = {
        LIB_NPM_BINPROVIDER.name: {"packages": [READABILITY_PACKAGE_NAME]},
        SYS_NPM_BINPROVIDER.name: {"packages": [READABILITY_PACKAGE_NAME], "install": lambda: None},    # prevent modifying system global npm packages
    }
24 |
25 |
26 | READABILITY_BINARY = ReadabilityBinary()
27 |
--------------------------------------------------------------------------------
/bin/build_git.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | ### Bash Environment Setup
4 | # http://redsymbol.net/articles/unofficial-bash-strict-mode/
5 | # https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html
6 | # set -o xtrace
7 | set -o errexit
8 | set -o errtrace
9 | set -o nounset
10 | set -o pipefail
11 | IFS=$'\n'
12 |
13 | REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )"
14 |
15 | cd "$REPO_DIR"
16 | source "./.venv/bin/activate"
17 |
18 |
19 | # Make sure git is clean
20 | if [ -z "$(git status --porcelain)" ] && [[ "$(git branch --show-current)" == "master" ]]; then
21 | git pull
22 | else
23 | echo "[!] Warning: git status is dirty!"
24 | echo " Press Ctrl-C to cancel, or wait 10sec to continue..."
25 | sleep 10
26 | fi
27 |
28 | # Bump version number in source
# Print "$1" with its last dot-separated field incremented by one (e.g. 1.2.3 -> 1.2.4).
# awk splits on ".", adds 1 to the final field and re-joins the fields with
# spaces; sed then turns those spaces back into dots.
function bump_semver {
    echo "$1" | awk -F. '{$NF = $NF + 1;} 1' | sed 's/ /./g'
}
32 |
33 | # OLD_VERSION="$(grep '^version = ' "${REPO_DIR}/pyproject.toml" | awk -F'"' '{print $2}')"
34 | # NEW_VERSION="$(bump_semver "$OLD_VERSION")"
35 |
36 |
--------------------------------------------------------------------------------
/archivebox/index/csv.py:
--------------------------------------------------------------------------------
1 | __package__ = 'archivebox.index'
2 |
3 | from typing import List, Optional, Any
4 |
5 | from archivebox.misc.util import enforce_types
6 | from .schema import Link
7 |
8 |
@enforce_types
def links_to_csv(links: List[Link],
                 cols: Optional[List[str]]=None,
                 header: bool=True,
                 separator: str=',',
                 ljust: int=0) -> str:
    """
    Render links as newline-separated CSV rows.

    cols:      attribute names to output (defaults to timestamp, is_archived, url)
    header:    when True, the first line is the column names joined by `separator`
    separator: string placed between columns
    ljust:     minimum width each header cell is left-justified to; also passed to Link.to_csv

    Returns the joined text without a trailing newline.
    """

    cols = cols or ['timestamp', 'is_archived', 'url']

    lines = []
    if header:
        lines.append(separator.join(col.ljust(ljust) for col in cols))

    # Only add the header line when requested: joining an empty header string
    # used to put a spurious blank line in front of the rows when header=False.
    lines.extend(
        link.to_csv(cols=cols, ljust=ljust, separator=separator)
        for link in links
    )

    return '\n'.join(lines)
28 |
29 |
@enforce_types
def to_csv(obj: Any, cols: List[str], separator: str=',', ljust: int=0) -> str:
    """Join the JSON-encoded values of obj's `cols` attributes with `separator`, each left-justified to `ljust`."""
    from .json import to_json

    cells = []
    for col in cols:
        encoded = to_json(getattr(obj, col), indent=None)
        cells.append(encoded.ljust(ljust))
    return separator.join(cells)
38 |
--------------------------------------------------------------------------------
/tests/mock_server/templates/example.jsonl:
--------------------------------------------------------------------------------
1 | {"href":"http://127.0.0.1:8080/static/example.com.html","description":"Example","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"Tag1 Tag2","trap":"http://www.example.com/should-not-exist"}
2 | {"href":"http://127.0.0.1:8080/static/iana.org.html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:43Z","shared":"no","toread":"no","tags":"Tag3,Tag4 with Space"}
3 | {"href":"http://127.0.0.1:8080/static/shift_jis.html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:44Z","shared":"no","toread":"no","tags":["Tag5","Tag6 with Space"]}
4 | {"href":"http://127.0.0.1:8080/static/title_og_with_html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:45Z","shared":"no","toread":"no"}
5 |
--------------------------------------------------------------------------------
/archivebox/search/admin.py:
--------------------------------------------------------------------------------
1 | __package__ = 'archivebox.search'
2 |
3 | from django.contrib import messages
4 | from django.contrib import admin
5 |
6 | from archivebox.search import query_search_index
7 |
class SearchResultsAdminMixin(admin.ModelAdmin):
    def get_search_results(self, request, queryset, search_term: str):
        """Combine the default admin search results with matches from the search backend"""

        results, use_distinct = super().get_search_results(request, queryset, search_term)

        query = search_term.strip()
        if query:
            # backend failures are reported but never break the default admin search
            try:
                backend_matches = query_search_index(query)
                results = results | backend_matches
            except Exception as err:
                print(f'[!] Error while using search backend: {err.__class__.__name__} {err}')
                messages.add_message(request, messages.WARNING, f'Error from the search backend, only showing results from default admin search fields - Error: {err}')

        return results.distinct(), use_distinct
25 |
--------------------------------------------------------------------------------
/archivebox/api/migrations/0001_initial.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 4.2.11 on 2024-04-25 04:19
2 |
3 | import api.models
4 | from django.conf import settings
5 | from django.db import migrations, models
6 | import django.db.models.deletion
7 | import uuid
8 |
9 |
class Migration(migrations.Migration):
    """Initial migration of the ``api`` app: creates the ``APIToken`` model."""

    initial = True

    # depends on whichever user model settings.AUTH_USER_MODEL points at
    dependencies = [
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
    ]

    operations = [
        migrations.CreateModel(
            name='APIToken',
            fields=[
                ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
                # unique token of at most 32 chars, generated by api.models.generate_secret_token
                ('token', models.CharField(default=api.models.generate_secret_token, max_length=32, unique=True)),
                ('created', models.DateTimeField(auto_now_add=True)),
                # expires is optional (blank/NULL allowed)
                ('expires', models.DateTimeField(blank=True, null=True)),
                # tokens are deleted together with their user (CASCADE)
                ('user', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
            ],
        ),
    ]
30 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-ripgrep-search/abx_plugin_ripgrep_search/config.py:
--------------------------------------------------------------------------------
1 | __package__ = 'abx_plugin_ripgrep_search'
2 |
3 | from pathlib import Path
4 | from typing import List
5 |
6 | from pydantic import Field
7 |
8 | from abx_spec_config.base_configset import BaseConfigSet
9 |
10 | from archivebox.config import CONSTANTS
11 | from archivebox.config.common import SEARCH_BACKEND_CONFIG
12 |
13 |
class RipgrepConfig(BaseConfigSet):
    # Config set for the ripgrep search plugin; instantiated once below as RIPGREP_CONFIG.
    RIPGREP_BINARY: str = Field(default='rg')

    # comma-separated extensions that get registered as the "ignore" file type below
    RIPGREP_IGNORE_EXTENSIONS: str = Field(default='css,js,orig,svg')
    # the default is a callable taking the config `c`, so it can read c.RIPGREP_IGNORE_EXTENSIONS
    RIPGREP_ARGS_DEFAULT: List[str] = Field(default=lambda c: [
        # https://github.com/BurntSushi/ripgrep/blob/master/GUIDE.md
        # formats to --type-add=ignore:*.{css,js,orig,svg} with the default extensions
        f'--type-add=ignore:*.{{{c.RIPGREP_IGNORE_EXTENSIONS}}}',
        '--type-not=ignore',
        '--ignore-case',
        '--files-with-matches',
        '--regexp',
    ])
    RIPGREP_SEARCH_DIR: Path = CONSTANTS.ARCHIVE_DIR
    # default is a callable that reads SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_TIMEOUT
    RIPGREP_TIMEOUT: int = Field(default=lambda: SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_TIMEOUT)
28 |
29 | RIPGREP_CONFIG = RipgrepConfig()
30 |
--------------------------------------------------------------------------------
/tests/mock_server/templates/example.json:
--------------------------------------------------------------------------------
1 | [
2 | {"href":"http://127.0.0.1:8080/static/example.com.html","description":"Example","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"Tag1 Tag2","trap":"http://www.example.com/should-not-exist"},
3 | {"href":"http://127.0.0.1:8080/static/iana.org.html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:43Z","shared":"no","toread":"no","tags":"Tag3,Tag4 with Space"},
4 | {"href":"http://127.0.0.1:8080/static/shift_jis.html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:44Z","shared":"no","toread":"no","tags":["Tag5","Tag6 with Space"]},
5 | {"href":"http://127.0.0.1:8080/static/title_og_with_html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:45Z","shared":"no","toread":"no"}
6 | ]
7 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0046_alter_archiveresult_snapshot_alter_snapshot_id_and_more.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.0.6 on 2024-08-20 01:55
2 |
3 | import django.db.models.deletion
4 | import uuid
5 | from django.db import migrations, models
6 |
7 |
class Migration(migrations.Migration):
    """
    Alter ``archiveresult.snapshot``, ``snapshot.id`` and ``snapshot.old_id``.

    ``archiveresult.snapshot`` becomes a cascading ForeignKey targeting
    ``core.snapshot.id``; ``snapshot.id`` becomes the unique UUID primary key;
    ``snapshot.old_id`` becomes a unique, non-editable UUID field.
    """

    dependencies = [
        ('core', '0045_alter_snapshot_old_id'),
    ]

    operations = [
        # foreign key stored in the `snapshot_id` column, pointing at snapshot.id
        migrations.AlterField(
            model_name='archiveresult',
            name='snapshot',
            field=models.ForeignKey(db_column='snapshot_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot', to_field='id'),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='id',
            field=models.UUIDField(default=uuid.uuid4, primary_key=True, serialize=False, unique=True),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='old_id',
            field=models.UUIDField(default=uuid.uuid4, editable=False, unique=True),
        ),
    ]
31 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-readwise/abx_plugin_readwise.py:
--------------------------------------------------------------------------------
1 | __package__ = 'abx_plugin_readwise_extractor'
2 | __id__ = 'abx_plugin_readwise_extractor'
3 | __label__ = 'Readwise API'
4 | __version__ = '2024.10.27'
5 | __author__ = 'ArchiveBox'
6 | __homepage__ = 'https://github.com/ArchiveBox/ArchiveBox/tree/dev/archivebox/pkgs/abx-plugin-readwise-extractor'
7 | __dependencies__ = []
8 |
9 | import abx
10 |
11 | from typing import Dict
12 | from pathlib import Path
13 |
14 | from pydantic import Field
15 |
16 | from abx_spec_config.base_configset import BaseConfigSet
17 |
18 | from archivebox.config import CONSTANTS
19 |
class ReadwiseConfig(BaseConfigSet):
    # Config set for the Readwise plugin; returned by get_CONFIG() under the key __id__.

    # SQLite-style .db file located inside CONSTANTS.SOURCES_DIR
    READWISE_DB_PATH: Path = Field(default=CONSTANTS.SOURCES_DIR / "readwise_reader_api.db")
    # string -> string mapping, empty by default
    # NOTE(review): presumably {<account label>: <Readwise Reader API token>, ...} - the original comment was garbled; confirm
    READWISE_READER_TOKENS: Dict[str, str] = Field(default=lambda: {})
23 |
24 |
@abx.hookimpl
def get_CONFIG():
    """Return a fresh ReadwiseConfig keyed by this plugin's __id__."""
    readwise_config = ReadwiseConfig()
    return {__id__: readwise_config}
30 |
@abx.hookimpl
def ready():
    """Look up this plugin's config among the registered configs and validate it."""
    all_configs = abx.pm.hook.get_CONFIG()
    all_configs[__id__].validate()
35 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Nick Sweeting
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-curl/abx_plugin_curl/config.py:
--------------------------------------------------------------------------------
1 | __package__ = 'abx_plugin_curl'
2 |
3 | from typing import List, Optional
4 | from pathlib import Path
5 |
6 | from pydantic import Field
7 |
8 | from abx_spec_config.base_configset import BaseConfigSet
9 |
10 | from archivebox.config.common import ARCHIVING_CONFIG
11 |
12 |
class CurlConfig(BaseConfigSet):
    # Config set for the curl plugin; instantiated once below as CURL_CONFIG.

    # feature toggles, all enabled by default
    SAVE_TITLE: bool = Field(default=True)
    SAVE_HEADERS: bool = Field(default=True)
    USE_CURL: bool = Field(default=True)

    CURL_BINARY: str = Field(default='curl')
    # default curl arguments, plus an (empty) list of extra arguments
    CURL_ARGS: List[str] = [
        '--silent',
        '--location',
        '--compressed',
    ]
    CURL_EXTRA_ARGS: List[str] = []

    # the defaults below are zero-argument callables that read the matching value from ARCHIVING_CONFIG
    CURL_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
    CURL_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
    CURL_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
    CURL_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
31 |
32 |
33 | CURL_CONFIG = CurlConfig()
34 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-sonic-search/abx_plugin_sonic_search/binaries.py:
--------------------------------------------------------------------------------
1 | __package__ = 'abx_plugin_sonic_search'
2 |
3 | from typing import List
4 |
5 | from pydantic import InstanceOf
6 | from abx_pkg import BinProvider, BinaryOverrides, BinName, Binary
7 |
8 | from abx_plugin_default_binproviders import brew, env
9 |
10 | from .config import SONIC_CONFIG
11 |
12 |
class SonicBinary(Binary):
    # Binary definition for the sonic search backend; instantiated once below as SONIC_BINARY.

    # binary name is taken from the plugin config
    name: BinName = SONIC_CONFIG.SONIC_BINARY
    binproviders_supported: List[InstanceOf[BinProvider]] = [brew, env]   # TODO: add cargo

    # the brew provider uses the 'sonic' package
    overrides: BinaryOverrides = {
        brew.name: {'packages': ['sonic']},
        # cargo.name: {'packages': ['sonic-server']},                     # TODO: add cargo
    }

    # TODO: add version checking over protocol? for when sonic backend is on remote server and binary is not installed locally
    # def on_get_version(self):
    #     with sonic.IngestClient(SONIC_CONFIG.SONIC_HOST, str(SONIC_CONFIG.SONIC_PORT), SONIC_CONFIG.SONIC_PASSWORD) as ingestcl:
    #         return SemVer.parse(str(ingestcl.protocol))
26 |
27 | SONIC_BINARY = SonicBinary()
28 |
--------------------------------------------------------------------------------
/tests/test_update.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 |
3 | from .fixtures import *
4 |
def _fetch_first_row(db_path, query):
    """Run `query` against the SQLite database at `db_path` and return its first row (None if there is none)."""
    conn = sqlite3.connect(str(db_path))
    try:
        return conn.execute(query).fetchone()
    finally:
        # always release the database handle, even when the query raises
        conn.close()


def test_update_status_invalid(tmp_path, process, disable_extractors_dict):
    """
    A snapshot removed from the index is re-added by `archivebox update --status=invalid`.

    Steps: add a URL, remove it (the core_snapshot table must then be empty),
    run the update, and check that the URL is back in core_snapshot.
    """
    url = 'http://127.0.0.1:8080/static/example.com.html'
    index_db = tmp_path / "index.sqlite3"

    subprocess.run(['archivebox', 'add', url], capture_output=True, env=disable_extractors_dict)
    assert list((tmp_path / "archive").iterdir()) != []

    subprocess.run(['archivebox', 'remove', url, '--yes'], capture_output=True)

    assert _fetch_first_row(index_db, "SELECT * FROM core_snapshot") is None

    subprocess.run(['archivebox', 'update', '--status=invalid'], capture_output=True, env=disable_extractors_dict)

    row = _fetch_first_row(index_db, "SELECT url FROM core_snapshot")
    # fail with a readable assertion instead of a TypeError from indexing None
    assert row is not None, 'update --status=invalid did not restore any snapshot'
    assert row[0] == url
28 |
--------------------------------------------------------------------------------
/archivebox/misc/debugging.py:
--------------------------------------------------------------------------------
1 | from functools import wraps
2 | from time import time
3 |
def timed_function(func):
    """
    Minimal profiling decorator: prints how long each call to func took.

    Usage:
        @timed_function
        def my_func():
            ...

    Each call prints one line of the form
        [DEBUG][<elapsed>ms] <module>.<function name>(...)
    and then returns the wrapped function's result unchanged.

    More advanced alternatives:
        - viztracer ../.venv/bin/archivebox manage check          # https://viztracer.readthedocs.io/en/latest/filter.html
        - python -m cProfile -o archivebox.prof ../.venv/bin/archivebox manage check; snakeviz archivebox.prof
        - Django Debug Toolbar + django-debug-toolbar-flamegraph
        + Django Requests Tracker (requests-tracker)
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        # report the module of the first positional argument when it has one
        # (e.g. `self` for methods), otherwise the module of the function itself
        owner = args[0] if args and hasattr(args[0], '__module__') else func
        module = owner.__module__

        ts_start = time()
        result = func(*args, **kwargs)
        ms_elapsed = int((time() - ts_start) * 1000)

        print(f'[DEBUG][{ms_elapsed}ms] {module}.{func.__name__}(...)')
        return result

    return wrapper
31 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-mercury/abx_plugin_mercury/config.py:
--------------------------------------------------------------------------------
1 | __package__ = 'abx_plugin_mercury'
2 |
3 | from typing import List, Optional
4 | from pathlib import Path
5 |
6 | from pydantic import Field
7 |
8 | from abx_spec_config.base_configset import BaseConfigSet
9 |
10 | from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG
11 |
12 |
13 |
class MercuryConfig(BaseConfigSet):
    # Config options for the mercury plugin (default binary: 'postlight-parser').
    # Several defaults are lambdas that read the shared ARCHIVING_CONFIG /
    # STORAGE_CONFIG objects; presumably BaseConfigSet resolves these callables
    # when the value is needed -- TODO confirm against BaseConfigSet.

    # on/off switch for the extractor; also accepted under the alias USE_MERCURY
    SAVE_MERCURY: bool = Field(default=True, alias='USE_MERCURY')

    # name of the parser binary, and extra CLI args (empty by default)
    MERCURY_BINARY: str = Field(default='postlight-parser')
    MERCURY_EXTRA_ARGS: List[str] = []

    SAVE_MERCURY_REQUISITES: bool = Field(default=True)
    MERCURY_RESTRICT_FILE_NAMES: str = Field(default=lambda: STORAGE_CONFIG.RESTRICT_FILE_NAMES)

    # the options below default to the corresponding global archiving settings
    MERCURY_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
    MERCURY_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
    MERCURY_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
    MERCURY_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
28 |
29 |
30 |
31 | MERCURY_CONFIG = MercuryConfig()
32 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-wget/abx_plugin_wget/extractors.py:
--------------------------------------------------------------------------------
1 | __package__ = 'abx_plugin_wget'
2 |
3 | from pathlib import Path
4 |
5 | from abx_pkg import BinName
6 |
7 | from abx_spec_extractor import BaseExtractor, ExtractorName
8 |
9 | from .binaries import WGET_BINARY
10 | from .wget_util import wget_output_path
11 |
class WgetExtractor(BaseExtractor):
    # Extractor entry named 'wget', bound to the wget binary's name.
    name: ExtractorName = 'wget'
    binary: BinName = WGET_BINARY.name

    def get_output_path(self, snapshot) -> str:
        # Always returns the fixed relative path 'wget'; `snapshot` is currently unused.
        # The disabled lines below would instead resolve the path via wget_output_path().
        # wget_index_path = wget_output_path(snapshot.as_link())
        # if wget_index_path:
        #     return Path(wget_index_path)
        return 'wget'
21 |
22 | WGET_EXTRACTOR = WgetExtractor()
23 |
24 |
class WarcExtractor(BaseExtractor):
    # Extractor entry named 'warc'; it uses the same binary as the wget extractor.
    name: ExtractorName = 'warc'
    binary: BinName = WGET_BINARY.name

    def get_output_path(self, snapshot) -> Path | None:
        """Return the largest ``*.warc.gz`` file in ``<snapshot.link_dir>/warc``.

        Returns None when the directory contains no matching file.
        """
        warc_dir = Path(snapshot.link_dir) / 'warc'
        # max() picks the biggest file in one pass (no full sort needed) and
        # default=None covers the "no warc files" case.
        return max(
            warc_dir.glob('*.warc.gz'),
            key=lambda warc_file: warc_file.stat().st_size,
            default=None,
        )
34 |
35 |
36 | WARC_EXTRACTOR = WarcExtractor()
37 |
38 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-spec-searchbackend/abx_spec_searchbackend.py:
--------------------------------------------------------------------------------
1 | import abc
2 | from typing import Iterable, List, Dict, cast
3 |
4 | import abx
5 | from abx_spec_config import ConfigPluginSpec
6 |
7 |
class BaseSearchBackend(abc.ABC):
    """Abstract interface that a search backend has to implement.

    All three operations are static and abstract, so a subclass must
    override each of them before it can be instantiated.
    """
    name: str

    @staticmethod
    @abc.abstractmethod
    def index(snapshot_id: str, texts: List[str]):
        """Index `texts` under `snapshot_id`; the base implementation does nothing."""
        return None

    @staticmethod
    @abc.abstractmethod
    def flush(snapshot_ids: Iterable[str]):
        """Flush the given snapshot ids; the base implementation does nothing."""
        return None

    @staticmethod
    @abc.abstractmethod
    def search(text: str) -> List[str]:
        """Search for `text`; the base implementation always raises NotImplementedError."""
        raise NotImplementedError("search method must be implemented by subclass")
25 |
26 |
class SearchBackendPluginSpec:
    # Plugin spec exposing a single hook, get_SEARCHBACKENDS.
    __order__ = 10

    @staticmethod
    @abx.hookspec
    @abx.hookimpl
    def get_SEARCHBACKENDS() -> Dict[abx.PluginId, BaseSearchBackend]:
        # Declared as both hookspec and hookimpl; this default implementation
        # contributes no backends (empty dict).
        return {}
35 |
36 |
class ExpectedPluginSpec(SearchBackendPluginSpec, ConfigPluginSpec):
    # Combines the search-backend and config specs; used only as the type
    # parameter for TypedPluginManager below.
    pass

PLUGIN_SPEC = SearchBackendPluginSpec

# cast() only changes the static type seen by type checkers;
# at runtime `pm` is the same object as abx.pm.
TypedPluginManager = abx.ABXPluginManager[ExpectedPluginSpec]
pm = cast(TypedPluginManager, abx.pm)
44 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0031_alter_archiveresult_id_alter_archiveresult_uuid_and_more.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.0.6 on 2024-08-18 05:09
2 |
3 | import uuid
4 | from django.db import migrations, models
5 |
6 |
class Migration(migrations.Migration):
    # Alters ArchiveResult.id and the uuid fields of ArchiveResult, Snapshot and Tag.

    dependencies = [
        ('core', '0030_alter_archiveresult_uuid'),
    ]

    operations = [
        # NOTE(review): an IntegerField primary key with default=uuid.uuid4 looks
        # inconsistent; this is generated migration history, so it is left untouched.
        migrations.AlterField(
            model_name='archiveresult',
            name='id',
            field=models.IntegerField(default=uuid.uuid4, primary_key=True, serialize=False, verbose_name='ID'),
        ),
        # uuid becomes unique on ArchiveResult and Snapshot ...
        migrations.AlterField(
            model_name='archiveresult',
            name='uuid',
            field=models.UUIDField(default=uuid.uuid4, unique=True),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='uuid',
            field=models.UUIDField(default=uuid.uuid4, unique=True),
        ),
        # ... and unique but still nullable on Tag
        migrations.AlterField(
            model_name='tag',
            name='uuid',
            field=models.UUIDField(default=uuid.uuid4, null=True, unique=True),
        ),
    ]
35 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-chrome/abx_plugin_chrome/__init__.py:
--------------------------------------------------------------------------------
1 | __package__ = 'abx_plugin_chrome'
2 | __label__ = 'Chrome'
3 | __author__ = 'ArchiveBox'
4 |
5 | import abx
6 |
@abx.hookimpl
def get_CONFIG():
    """Hook: return this plugin's config object keyed by 'CHROME_CONFIG'."""
    # imported inside the hook so the config module is only loaded when the hook runs
    from .config import CHROME_CONFIG

    return dict(CHROME_CONFIG=CHROME_CONFIG)
14 |
@abx.hookimpl
def get_BINARIES():
    """Hook: return the binaries provided by this plugin, keyed by name."""
    from .binaries import CHROME_BINARY

    return dict(chrome=CHROME_BINARY)
22 |
@abx.hookimpl
def ready():
    # Hook: validate the Chrome config by calling CHROME_CONFIG.validate().
    from .config import CHROME_CONFIG
    CHROME_CONFIG.validate()
27 |
28 |
@abx.hookimpl
def get_EXTRACTORS():
    """extractors that can be run for each URL, producing one or more ArchiveResults each"""
    from .extractors import PDF_EXTRACTOR, SCREENSHOT_EXTRACTOR, DOM_EXTRACTOR

    # output layout notes:
    #   dom        -> ./output.html    -> ./chrome_dom/index.html
    #   screenshot -> ./screenshot.png -> ./chrome_screenshot/screenshot.png
    #   pdf        -> ./output.pdf     -> ./chrome_pdf/pdf.pdf
    return dict(
        pdf=PDF_EXTRACTOR,
        screenshot=SCREENSHOT_EXTRACTOR,
        dom=DOM_EXTRACTOR,
    )
41 |
--------------------------------------------------------------------------------
/archivebox/config/__init__.py:
--------------------------------------------------------------------------------
1 | __package__ = 'archivebox.config'
2 | __order__ = 200
3 |
4 | from .paths import (
5 | PACKAGE_DIR, # noqa
6 | DATA_DIR, # noqa
7 | ARCHIVE_DIR, # noqa
8 | )
9 | from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR # noqa
10 | from .version import VERSION # noqa
11 |
12 | # import abx
13 |
14 | # @abx.hookimpl
15 | # def get_CONFIG():
16 | # from .common import (
17 | # SHELL_CONFIG,
18 | # STORAGE_CONFIG,
19 | # GENERAL_CONFIG,
20 | # SERVER_CONFIG,
21 | # ARCHIVING_CONFIG,
22 | # SEARCH_BACKEND_CONFIG,
23 | # )
24 | # return {
25 | # 'SHELL_CONFIG': SHELL_CONFIG,
26 | # 'STORAGE_CONFIG': STORAGE_CONFIG,
27 | # 'GENERAL_CONFIG': GENERAL_CONFIG,
28 | # 'SERVER_CONFIG': SERVER_CONFIG,
29 | # 'ARCHIVING_CONFIG': ARCHIVING_CONFIG,
30 | # 'SEARCHBACKEND_CONFIG': SEARCH_BACKEND_CONFIG,
31 | # }
32 |
33 | # @abx.hookimpl
34 | # def ready():
35 | # for config in get_CONFIG().values():
36 | # config.validate()
37 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-mercury/abx_plugin_mercury/binaries.py:
--------------------------------------------------------------------------------
1 | __package__ = 'abx_plugin_mercury'
2 |
3 | from typing import List
4 |
5 | from pydantic import InstanceOf
6 | from abx_pkg import BinProvider, BinName, BinaryOverrides, bin_abspath, Binary
7 |
8 | from abx_plugin_default_binproviders import env
9 |
10 | from abx_plugin_npm.binproviders import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
11 |
12 | from .config import MERCURY_CONFIG
13 |
14 |
class MercuryBinary(Binary):
    # Binary definition for the mercury parser; the name comes from MERCURY_CONFIG.
    name: BinName = MERCURY_CONFIG.MERCURY_BINARY
    # providers listed in this order: lib npm, system npm, then env
    binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]

    # per-provider overrides, keyed by provider name
    overrides: BinaryOverrides = {
        LIB_NPM_BINPROVIDER.name: {
            'packages': ['@postlight/parser@^2.2.3'],
        },
        SYS_NPM_BINPROVIDER.name: {
            'packages': ['@postlight/parser@^2.2.3'],
            'install': lambda: None, # never try to install things into global prefix
        },
        env.name: {
            # reports the placeholder version '999.999.999' when the binary is found on env.PATH, else None
            # NOTE(review): the lookup hard-codes 'postlight-parser' rather than using
            # MERCURY_CONFIG.MERCURY_BINARY -- confirm this is intended when the name is overridden.
            'version': lambda: '999.999.999' if bin_abspath('postlight-parser', PATH=env.PATH) else None,
        },
    }
31 |
32 | MERCURY_BINARY = MercuryBinary()
33 |
--------------------------------------------------------------------------------
/archivebox/templates/core/navigation.html:
--------------------------------------------------------------------------------
1 | {% load i18n static %}
2 |
3 |
28 |
--------------------------------------------------------------------------------
/etc/sonic.cfg:
--------------------------------------------------------------------------------
1 | # Sonic
2 | # Fast, lightweight and schema-less search backend
3 | # Configuration file
4 | # Example: https://github.com/valeriansaliou/sonic/blob/master/config.cfg
5 |
6 |
7 | [server]
8 |
9 | # log_level = "debug"
10 | log_level = "warn"
11 |
12 |
13 | [channel]
14 |
15 | inet = "0.0.0.0:1491"
16 | tcp_timeout = 300
17 |
18 | auth_password = "${env.SEARCH_BACKEND_PASSWORD}"
19 |
20 | [channel.search]
21 |
22 | query_limit_default = 65535
23 | query_limit_maximum = 65535
24 | query_alternates_try = 10
25 |
26 | suggest_limit_default = 5
27 | suggest_limit_maximum = 20
28 |
29 |
30 | [store]
31 |
32 | [store.kv]
33 |
34 | path = "/var/lib/sonic/store/kv/"
35 |
36 | retain_word_objects = 100000
37 |
38 | [store.kv.pool]
39 |
40 | inactive_after = 1800
41 |
42 | [store.kv.database]
43 |
44 | flush_after = 900
45 |
46 | compress = true
47 | parallelism = 2
48 | max_files = 100
49 | max_compactions = 1
50 | max_flushes = 1
51 | write_buffer = 16384
52 | write_ahead_log = true
53 |
54 | [store.fst]
55 |
56 | path = "/var/lib/sonic/store/fst/"
57 |
58 | [store.fst.pool]
59 |
60 | inactive_after = 300
61 |
62 | [store.fst.graph]
63 |
64 | consolidate_after = 180
65 |
66 | max_size = 2048
67 | max_words = 250000
68 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0066_alter_snapshottag_tag_alter_tag_id_alter_tag_old_id.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.0.6 on 2024-08-20 03:52
2 |
3 | import core.models
4 | import django.db.models.deletion
5 | import uuid
6 | import random
7 | from django.db import migrations, models
8 |
def rand_int_id():
    """Return a random non-negative integer drawn from 32 random bits."""
    id_bits = 32
    return random.getrandbits(id_bits)
11 |
class Migration(migrations.Migration):
    # Points SnapshotTag.tag at Tag.id, makes Tag.id a UUID primary key,
    # and turns Tag.old_id into a unique BigIntegerField.

    dependencies = [
        ('core', '0065_remove_snapshottag_old_tag'),
    ]

    operations = [
        # FK stored in column 'tag_id', targeting core.tag.id, cascading on delete
        migrations.AlterField(
            model_name='snapshottag',
            name='tag',
            field=models.ForeignKey(db_column='tag_id', on_delete=django.db.models.deletion.CASCADE, to='core.tag', to_field='id'),
        ),
        migrations.AlterField(
            model_name='tag',
            name='id',
            field=models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False, unique=True),
        ),
        # old_id defaults to rand_int_id(), defined at the top of this migration module
        migrations.AlterField(
            model_name='tag',
            name='old_id',
            field=models.BigIntegerField(default=rand_int_id, serialize=False, unique=True, verbose_name='Old ID'),
        ),
    ]
35 |
--------------------------------------------------------------------------------
/archivebox/cli/archivebox_manage.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | __package__ = 'archivebox.cli'
4 |
5 | import rich_click as click
6 | from archivebox.misc.util import docstring, enforce_types
7 |
8 |
@enforce_types
def manage(args: list[str] | None=None) -> None:
    """Run an ArchiveBox Django management command"""

    from archivebox.config.common import SHELL_CONFIG
    from archivebox.misc.logging import stderr

    if args and "createsuperuser" in args:
        # createsuperuser was requested: warn when running in docker without a TTY
        if SHELL_CONFIG.IN_DOCKER and not SHELL_CONFIG.IS_TTY:
            stderr('[!] Warning: you need to pass -it to use interactive commands in docker', color='lightyellow')
            attempted_command = ' '.join(args or ['...'])
            stderr(' docker run -it archivebox manage {}'.format(attempted_command), color='lightyellow')
            stderr('')

    from django.core.management import execute_from_command_line

    # with no arguments, fall back to Django's `help` command
    forwarded_args = args or ['help']
    execute_from_command_line(['manage.py', *forwarded_args])
23 |
24 |
# CLI entry point: collects every positional argument (unknown options included,
# and no built-in --help option) and forwards them to manage().
# The help text is taken from manage.__doc__ via the @docstring decorator.
@click.command(add_help_option=False, context_settings=dict(ignore_unknown_options=True))
@click.argument('args', nargs=-1)
@docstring(manage.__doc__)
def main(args: list[str] | None=None) -> None:
    manage(args=args)
30 |
31 |
32 | if __name__ == '__main__':
33 | main()
34 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-singlefile/abx_plugin_singlefile/actors.py:
--------------------------------------------------------------------------------
1 | # __package__ = 'abx_plugin_singlefile'
2 |
3 | # from typing import ClassVar
4 | # from django.db.models import QuerySet
5 | # from django.utils.functional import classproperty
6 |
7 | # from workers.actor import ActorType
8 |
9 | # from .models import SinglefileResult
10 |
11 |
12 | # class SinglefileActor(ActorType[SinglefileResult]):
13 | # CLAIM_ORDER: ClassVar[str] = 'created_at DESC'
14 | # CLAIM_WHERE: ClassVar[str] = 'status = "queued" AND extractor = "favicon"'
15 | # CLAIM_SET: ClassVar[str] = 'status = "started"'
16 |
17 | # @classproperty
18 | # def QUERYSET(cls) -> QuerySet:
19 | # return SinglefileResult.objects.filter(status='queued')
20 |
21 | # def tick(self, obj: SinglefileResult):
22 | # print(f'[grey53]{self}.tick({obj.abid or obj.id}, status={obj.status}) remaining:[/grey53]', self.get_queue().count())
23 | # updated = SinglefileResult.objects.filter(id=obj.id, status='started').update(status='success') == 1
24 | # if not updated:
25 | # raise Exception(f'Failed to update {obj.abid or obj.id}, interrupted by another actor writing to the same object')
26 | # obj.refresh_from_db()
27 | # obj.save()
28 |
--------------------------------------------------------------------------------
/archivebox/api/tests.py:
--------------------------------------------------------------------------------
1 | __package__ = 'archivebox.api'
2 |
3 | # from django.test import TestCase
4 | # from ninja.testing import TestClient
5 |
6 | # from .routes_cli import router
7 |
8 | # class ArchiveBoxCLIAPITestCase(TestCase):
9 | # def setUp(self):
10 | # self.client = TestClient(router)
11 |
12 | # def test_add_endpoint(self):
13 | # response = self.client.post("/add", json={"urls": ["http://example.com"], "tag": "testTag1,testTag2"})
14 | # self.assertEqual(response.status_code, 200)
15 | # self.assertTrue(response.json()["success"])
16 |
17 | # def test_remove_endpoint(self):
18 | # response = self.client.post("/remove", json={"filter_patterns": ["http://example.com"]})
19 | # self.assertEqual(response.status_code, 200)
20 | # self.assertTrue(response.json()["success"])
21 |
22 | # def test_update_endpoint(self):
23 | # response = self.client.post("/update", json={})
24 | # self.assertEqual(response.status_code, 200)
25 | # self.assertTrue(response.json()["success"])
26 |
27 | # def test_list_all_endpoint(self):
28 | # response = self.client.post("/list_all", json={})
29 | # self.assertEqual(response.status_code, 200)
30 | # self.assertTrue(response.json()["success"])
31 |
--------------------------------------------------------------------------------
/tests/mock_server/templates/example.rss:
--------------------------------------------------------------------------------
1 |
2 |
7 |
8 | Sample Feed
9 | http://example.org/
10 | For documentation only
11 | en-us
12 | Nobody (nobody@example.org)
13 | Public domain
14 | 2024-02-26T17:28:12-08:00
15 |
16 |
17 |
18 | -
19 |
First!
20 | http://127.0.0.1:8080/static/example.com.html
21 | just-an@example.org
22 |
23 | This has a description.
24 |
25 | Tag1 Tag2
26 | 2024-02-26T17:28:12-08:00
27 | description.]]>
29 |
30 |
31 |
32 |
33 |
--------------------------------------------------------------------------------
/archivebox/misc/paginators.py:
--------------------------------------------------------------------------------
1 | __package__ = 'archivebox.misc'
2 |
3 | from django.core.paginator import Paginator
4 | from django.utils.functional import cached_property
5 |
6 |
class AccelleratedPaginator(Paginator):
    """
    Accelerated Paginator ignores DISTINCT when counting the total number of rows.
    Speeds up SELECT Count(*) on Admin views by >20x.
    https://hakibenita.com/optimizing-the-django-admin-paginator
    """

    @cached_property
    def count(self):
        queryset = self.object_list
        if not queryset._has_filters():  # type: ignore
            # unfiltered: count total rows in a separate fast query on the model
            return queryset.model.objects.count()
        # filtered queryset: fall back to the normal count method
        return super().count
22 |
23 | # Alternative approach for PostgreSQL: fallback count takes > 200ms
24 | # from django.db import connection, transaction, OperationalError
25 | # with transaction.atomic(), connection.cursor() as cursor:
26 | # cursor.execute('SET LOCAL statement_timeout TO 200;')
27 | # try:
28 | # return super().count
29 | # except OperationalError:
30 | # return 9999999999999
31 |
--------------------------------------------------------------------------------
/etc/nginx.conf:
--------------------------------------------------------------------------------
1 | user www-data;
2 | pid /var/run/nginx.pid;
3 |
4 | worker_processes auto;
5 | timer_resolution 100ms;
6 |
7 | worker_rlimit_nofile 40000;
8 | events {
9 | worker_connections 40000;
10 | use epoll;
11 | multi_accept on;
12 | }
13 |
14 | http {
15 | sendfile on;
16 | tcp_nopush on;
17 | tcp_nodelay on;
18 | server_tokens off;
19 | send_timeout 20;
20 | keepalive_timeout 65;
21 | types_hash_max_size 2048;
22 | client_max_body_size 25m;
23 |
24 | include mime.types;
25 | default_type application/octet-stream;
26 |
27 | access_log /dev/stdout;
28 | error_log stderr;
29 |
30 | reset_timedout_connection on;
31 |
32 | server_names_hash_bucket_size 64;
33 |
34 | server {
35 | listen 80 default_server;
36 | server_name _;
37 |
38 | index index.html;
39 | autoindex on;
40 | try_files $uri $uri/ $uri.html =404;
41 |
42 | location /archive {
43 | root /var/www/archive;
44 | }
45 | }
46 | }
47 |
48 |
--------------------------------------------------------------------------------
/archivebox/parsers/generic_rss.py:
--------------------------------------------------------------------------------
1 | __package__ = 'archivebox.parsers'
2 |
3 |
4 | from typing import IO, Iterable
5 | from time import mktime
6 | from feedparser import parse as feedparser
7 |
8 | from ..index.schema import Link
9 | from archivebox.misc.util import (
10 | htmldecode,
11 | enforce_types
12 | )
13 |
@enforce_types
def parse_generic_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
    """Parse RSS XML-format files into links"""

    rss_file.seek(0)
    feed = feedparser(rss_file.read())
    for item in feed.entries:
        # Entries are dict-like: .get() yields None for a missing element, whereas
        # attribute access (item.link) raises AttributeError and would abort the
        # whole import before the None check below could ever run.
        url = item.get('link')
        if url is None:
            # Yielding a Link with no URL will
            # crash on a URL validation assertion
            continue

        title = item.get('title')
        # NOTE(review): mktime() interprets the parsed struct_time as local time;
        # kept as-is so generated timestamps do not change.
        time = mktime(item.updated_parsed)

        try:
            tags = ','.join(tag.term for tag in item.tags)
        except AttributeError:
            # entry has no tags (or a tag without a term)
            tags = ''

        yield Link(
            url=htmldecode(url),
            timestamp=str(time),
            # entries without a title (or with an empty one) get title=None
            title=(htmldecode(title) or None) if title else None,
            tags=tags,
            sources=[rss_file.name],
        )
42 |
43 |
44 | KEY = 'rss'
45 | NAME = 'Generic RSS'
46 | PARSER = parse_generic_rss_export
47 |
--------------------------------------------------------------------------------
/archivebox/api/admin.py:
--------------------------------------------------------------------------------
1 | __package__ = 'archivebox.api'
2 |
3 | from signal_webhooks.admin import WebhookAdmin
4 | from signal_webhooks.utils import get_webhook_model
5 |
6 | from archivebox.base_models.admin import ABIDModelAdmin
7 |
8 | from api.models import APIToken
9 |
10 |
class APITokenAdmin(ABIDModelAdmin):
    # Admin options for APIToken (registered in register_admin() below).
    list_display = ('created_at', 'abid', 'created_by', 'token_redacted', 'expires')
    sort_fields = ('abid', 'created_at', 'created_by', 'expires')
    readonly_fields = ('created_at', 'modified_at', 'abid_info')
    search_fields = ('id', 'abid', 'created_by__username', 'token')
    # editable fields first, followed by every read-only field
    fields = ('created_by', 'token', 'expires', *readonly_fields)

    list_filter = ('created_by',)
    ordering = ['-created_at']  # descending created_at
    list_per_page = 100
21 |
22 |
class CustomWebhookAdmin(WebhookAdmin, ABIDModelAdmin):
    # Extends WebhookAdmin's own list_display / readonly_fields with the
    # created_at / created_by / abid style columns listed here.
    list_display = ('created_at', 'created_by', 'abid', *WebhookAdmin.list_display)
    sort_fields = ('created_at', 'created_by', 'abid', 'referenced_model', 'endpoint', 'last_success', 'last_error')
    readonly_fields = ('created_at', 'modified_at', 'abid_info', *WebhookAdmin.readonly_fields)
27 |
28 |
def register_admin(admin_site):
    # Register APIToken and the webhook model (looked up via get_webhook_model())
    # on the given admin site with their admin classes.
    admin_site.register(APIToken, APITokenAdmin)
    admin_site.register(get_webhook_model(), CustomWebhookAdmin)
32 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-favicon/abx_plugin_favicon/actors.py:
--------------------------------------------------------------------------------
1 | __package__ = 'abx_plugin_favicon'
2 |
3 | from typing import ClassVar
4 |
5 | from core.actors import ActorType
6 | from core.statemachines import ArchiveResultMachine
7 |
8 | from statemachine import State
9 |
10 | from .models import FaviconResult
11 |
12 |
class FaviconResultActor(ActorType[FaviconResult]):
    """
    Actor for FaviconResult objects; it reuses the ArchiveResultMachine
    state machine (states and state field name) for their lifecycle.
    """
    Model = FaviconResult
    StateMachineClass = ArchiveResultMachine

    # state configuration is taken straight from ArchiveResultMachine
    ACTIVE_STATE: ClassVar[State] = ArchiveResultMachine.started # 'started'
    FINAL_STATES: ClassVar[list[State]] = ArchiveResultMachine.final_states # ['succeeded', 'failed', 'skipped']
    STATE_FIELD_NAME: ClassVar[str] = ArchiveResultMachine.state_field_name # status

    MAX_CONCURRENT_ACTORS: ClassVar[int] = 6
    MAX_TICK_TIME: ClassVar[int] = 60
    # ten times the concurrency limit (60)
    CLAIM_FROM_TOP_N: ClassVar[int] = MAX_CONCURRENT_ACTORS * 10

    # @classproperty
    # def qs(cls) -> QuerySet[ModelType]:
    #     """Get the unfiltered and unsorted QuerySet of all objects that this Actor might care about."""
    #     return cls.Model.objects.filter(extractor='favicon')
33 |
--------------------------------------------------------------------------------
/archivebox/pkgs/abx-plugin-npm/abx_plugin_npm/binproviders.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pathlib import Path
3 | from typing import Optional
4 |
5 | from abx_pkg import NpmProvider, PATHStr, BinProviderName
6 |
7 | import abx
8 |
9 | DEFAULT_LIB_NPM_DIR = Path('/usr/local/share/abx/npm')
10 |
11 | OLD_NODE_BIN_PATH = Path(os.getcwd()) / 'node_modules' / '.bin'
12 | NEW_NODE_BIN_PATH = DEFAULT_LIB_NPM_DIR / 'node_modules' / '.bin'
13 |
14 |
class SystemNpmBinProvider(NpmProvider):
    # npm provider named "sys_npm" with no explicit npm prefix set.
    name: BinProviderName = "sys_npm"

    npm_prefix: Optional[Path] = None
19 |
20 |
class LibNpmBinProvider(NpmProvider):
    # npm provider named "lib_npm"; until setup() runs it uses the default lib dir.
    name: BinProviderName = "lib_npm"
    PATH: PATHStr = f'{NEW_NODE_BIN_PATH}:{OLD_NODE_BIN_PATH}'

    npm_prefix: Optional[Path] = DEFAULT_LIB_NPM_DIR

    def setup(self) -> None:
        """Point npm_prefix and PATH at the configured LIB_DIR, then run the parent setup."""
        # update paths from config at runtime
        runtime_npm_dir = abx.pm.hook.get_LIB_DIR() / 'npm'
        runtime_bin_dir = runtime_npm_dir / 'node_modules' / '.bin'

        self.npm_prefix = runtime_npm_dir
        # runtime bin dir is searched first, then the default and legacy locations
        self.PATH = f'{runtime_bin_dir}:{NEW_NODE_BIN_PATH}:{OLD_NODE_BIN_PATH}'
        super().setup()
33 |
34 |
# Module-level provider singletons; `npm` is an alias for the lib provider.
SYS_NPM_BINPROVIDER = SystemNpmBinProvider()
LIB_NPM_BINPROVIDER = LibNpmBinProvider()
npm = LIB_NPM_BINPROVIDER

# Run each provider's setup() exactly once at import time
# (the lib provider used to be set up twice in a row).
LIB_NPM_BINPROVIDER.setup()
SYS_NPM_BINPROVIDER.setup()
42 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0003_auto_20200630_1034.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 3.0.7 on 2020-06-30 10:34
2 |
3 | from django.db import migrations, models
4 |
5 |
class Migration(migrations.Migration):
    # Alters five Snapshot fields: added, tags, timestamp, title and updated.
    # All of them are indexed; every field except `added` becomes nullable with default None.

    dependencies = [
        ('core', '0002_auto_20200625_1521'),
    ]

    operations = [
        # `added` is filled automatically on creation (auto_now_add)
        migrations.AlterField(
            model_name='snapshot',
            name='added',
            field=models.DateTimeField(auto_now_add=True, db_index=True),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='tags',
            field=models.CharField(db_index=True, default=None, max_length=256, null=True),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='timestamp',
            field=models.CharField(db_index=True, default=None, max_length=32, null=True),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='title',
            field=models.CharField(db_index=True, default=None, max_length=128, null=True),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='updated',
            field=models.DateTimeField(db_index=True, default=None, null=True),
        ),
    ]
39 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0073_rename_created_archiveresult_created_at_and_more.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.1 on 2024-09-05 00:25
2 |
3 | from django.db import migrations
4 |
5 |
class Migration(migrations.Migration):
    # Renames created -> created_at and modified -> modified_at
    # on the ArchiveResult, Snapshot and Tag models.

    dependencies = [
        ('core', '0072_rename_added_snapshot_bookmarked_at_and_more'),
    ]

    operations = [
        # ArchiveResult
        migrations.RenameField(
            model_name='archiveresult',
            old_name='created',
            new_name='created_at',
        ),
        migrations.RenameField(
            model_name='archiveresult',
            old_name='modified',
            new_name='modified_at',
        ),
        # Snapshot
        migrations.RenameField(
            model_name='snapshot',
            old_name='created',
            new_name='created_at',
        ),
        migrations.RenameField(
            model_name='snapshot',
            old_name='modified',
            new_name='modified_at',
        ),
        # Tag
        migrations.RenameField(
            model_name='tag',
            old_name='created',
            new_name='created_at',
        ),
        migrations.RenameField(
            model_name='tag',
            old_name='modified',
            new_name='modified_at',
        ),
    ]
44 |
--------------------------------------------------------------------------------
/archivebox/parsers/medium_rss.py:
--------------------------------------------------------------------------------
1 | __package__ = 'archivebox.parsers'
2 |
3 |
4 | from typing import IO, Iterable
5 | from datetime import datetime
6 |
7 | from xml.etree import ElementTree
8 |
9 | from ..index.schema import Link
10 | from archivebox.misc.util import (
11 | htmldecode,
12 | enforce_types,
13 | )
14 |
15 |
@enforce_types
def parse_medium_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
    """Parse Medium RSS feed files into links"""

    rss_file.seek(0)
    channel = ElementTree.parse(rss_file).getroot().find("channel")
    for entry in channel.findall("item"):                # type: ignore
        link_text = entry.find("link").text              # type: ignore
        title_text = entry.find("title").text.strip()    # type: ignore
        published_text = entry.find("pubDate").text      # type: ignore
        # strptime() returns a naive datetime here, so .timestamp() below
        # interprets it in the local timezone
        published = datetime.strptime(published_text, "%a, %d %b %Y %H:%M:%S %Z")  # type: ignore

        yield Link(
            url=htmldecode(link_text),
            timestamp=str(published.timestamp()),
            title=htmldecode(title_text) or None,
            tags=None,
            sources=[rss_file.name],
        )
36 |
37 |
38 | KEY = 'medium_rss'
39 | NAME = 'Medium RSS'
40 | PARSER = parse_medium_rss_export
41 |
--------------------------------------------------------------------------------
/archivebox/core/templatetags/core_tags.py:
--------------------------------------------------------------------------------
1 | from django import template
2 | from django.contrib.admin.templatetags.base import InclusionAdminNode
3 |
4 |
5 | from typing import Union
6 |
7 |
8 | register = template.Library()
9 |
@register.filter(name='split')
def split(value, separator: str=','):
    """Template filter: split `value` on `separator`; falsy values are treated as ''."""
    text = value if value else ''
    return text.split(separator)
13 |
@register.filter
def file_size(num_bytes: Union[int, float]) -> str:
    """Template filter: format a byte count as '<value> <unit>' with one decimal.

    The value is divided by 1024 per unit step (Bytes, KB, MB, GB) until its
    magnitude drops below 1024; anything still larger is reported in TB.
    """
    remaining = num_bytes
    for unit in ('Bytes', 'KB', 'MB', 'GB'):
        if -1024.0 < remaining < 1024.0:
            return '%3.1f %s' % (remaining, unit)
        remaining = remaining / 1024.0
    return '%3.1f %s' % (remaining, 'TB')
21 |
def result_list(cl):
    """
    Build the context dict for the snapshots grid from `cl`:
    the object itself, a fixed sorted-field count of 0, and `cl.result_list`.
    """
    return dict(
        cl=cl,
        num_sorted_fields=0,
        results=cl.result_list,
    )
32 |
# Registers the `snapshots_grid` template tag: an InclusionAdminNode that
# renders 'snapshots_grid.html' with the dict returned by result_list().
@register.tag(name='snapshots_grid')
def result_list_tag(parser, token):
    return InclusionAdminNode(
        parser, token,
        func=result_list,
        template_name='snapshots_grid.html',
        takes_context=False,
    )
41 |
@register.simple_tag(takes_context=True)
def url_replace(context, **kwargs):
    """Return the request's GET parameters, updated with `kwargs`, as a URL-encoded string."""
    # work on a copy so the request's own GET data is left untouched
    query_params = context['request'].GET.copy()
    query_params.update(**kwargs)
    return query_params.urlencode()
47 |
--------------------------------------------------------------------------------
/archivebox/core/migrations/0069_alter_archiveresult_created_alter_snapshot_added_and_more.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.1 on 2024-08-28 09:40
2 |
3 | import django.utils.timezone
4 | from django.db import migrations
5 |
6 | import archivebox.base_models.models
7 |
8 |
class Migration(migrations.Migration):
    # Switches four timestamp fields to the project's AutoDateTimeField,
    # each indexed and defaulting to django.utils.timezone.now:
    # ArchiveResult.created, Snapshot.added, Snapshot.created and Tag.created.

    dependencies = [
        ('core', '0068_alter_archiveresult_options'),
    ]

    operations = [
        migrations.AlterField(
            model_name='archiveresult',
            name='created',
            field=archivebox.base_models.models.AutoDateTimeField(db_index=True, default=django.utils.timezone.now),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='added',
            field=archivebox.base_models.models.AutoDateTimeField(db_index=True, default=django.utils.timezone.now),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='created',
            field=archivebox.base_models.models.AutoDateTimeField(db_index=True, default=django.utils.timezone.now),
        ),
        migrations.AlterField(
            model_name='tag',
            name='created',
            field=archivebox.base_models.models.AutoDateTimeField(db_index=True, default=django.utils.timezone.now),
        ),
    ]
37 |
--------------------------------------------------------------------------------
/archivebox/api/migrations/0009_rename_created_apitoken_created_at_and_more.py:
--------------------------------------------------------------------------------
1 | # Generated by Django 5.1 on 2024-09-05 00:26
2 |
3 | from django.db import migrations, models
4 |
5 | import archivebox.base_models.models
6 |
7 |
class Migration(migrations.Migration):
    # Renames the api app's timestamp fields to the *_at naming scheme and
    # adds a created_at column to OutboundWebhook.

    dependencies = [
        ('api', '0008_alter_apitoken_created_alter_apitoken_created_by_and_more'),
    ]

    operations = [
        # APIToken: created -> created_at
        migrations.RenameField(
            model_name='apitoken',
            old_name='created',
            new_name='created_at',
        ),
        # APIToken: modified -> modified_at
        migrations.RenameField(
            model_name='apitoken',
            old_name='modified',
            new_name='modified_at',
        ),
        # OutboundWebhook: modified -> modified_at
        migrations.RenameField(
            model_name='outboundwebhook',
            old_name='modified',
            new_name='modified_at',
        ),
        # OutboundWebhook gains a new indexed created_at column.
        # NOTE(review): default=None with no null=True visible here --
        # presumably AutoDateTimeField fills the value on save; confirm this
        # applies cleanly to tables that already contain rows.
        migrations.AddField(
            model_name='outboundwebhook',
            name='created_at',
            field=archivebox.base_models.models.AutoDateTimeField(db_index=True, default=None),
        ),
        # The pre-existing OutboundWebhook.created field is altered, not
        # renamed, so after this migration the model has both `created` and
        # `created_at`.
        # NOTE(review): looks intentional but confirm the duplication is wanted.
        migrations.AlterField(
            model_name='outboundwebhook',
            name='created',
            field=models.DateTimeField(auto_now_add=True, help_text='When the webhook was created.', verbose_name='created'),
        ),
    ]
41 |
--------------------------------------------------------------------------------
/tests/mock_server/templates/example.com.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Example Domain
5 |
6 |
7 |
8 |
9 |
37 |
38 |
39 |
40 |
41 |
Example Domain
42 |
This domain is for use in illustrative examples in documents. You may use this
43 | domain in literature without prior coordination or asking for permission.
44 |
45 | More information...
46 |
47 |
48 |
49 |
50 |
--------------------------------------------------------------------------------
/archivebox/cli/archivebox_extract.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | __package__ = 'archivebox.cli'
4 | __command__ = 'archivebox extract'
5 |
6 |
7 | import sys
8 | from typing import TYPE_CHECKING, Generator
9 |
10 | import rich_click as click
11 |
12 | from django.db.models import Q
13 |
14 | from archivebox.misc.util import enforce_types, docstring
15 |
16 |
17 | if TYPE_CHECKING:
18 | from core.models import ArchiveResult
19 |
20 |
# NOTE(review): not referenced anywhere else in this module -- presumably a
# placeholder for a shared orchestrator instance; confirm before removing.
ORCHESTRATOR = None
22 |
@enforce_types
def extract(archiveresult_id: str) -> Generator['ArchiveResult', None, None]:
    """Run the extractor for the ArchiveResult matching the given ID or ABID"""
    # NOTE: main() passes this function's docstring to its @docstring
    # decorator, so keep the docstring above short and user-facing.

    # The module-level import of ArchiveResult is guarded by TYPE_CHECKING and
    # therefore does not exist at runtime; import it here, at call time.
    from core.models import ArchiveResult

    # .filter().first() yields None when nothing matches (unlike .get(), which
    # raises DoesNotExist), so the not-found branch below is actually reachable.
    archiveresult = ArchiveResult.objects.filter(
        Q(id=archiveresult_id) | Q(abid=archiveresult_id)
    ).first()
    if archiveresult is None:
        raise Exception(f'ArchiveResult {archiveresult_id} not found')

    # Hand back whatever the result's extractor produces.
    return archiveresult.EXTRACTOR.extract()
30 |
31 | # @#/absolute/path/to/binary
32 | # 2014.24.01
33 |
@click.command()
@click.argument('archiveresult_ids', nargs=-1, type=str)
@docstring(extract.__doc__)
def main(archiveresult_ids: list[str]):
    """Run the extractor for each given ArchiveResult ID (or for IDs piped in via stdin, one per line)"""

    # With no positional IDs, fall back to reading one ID per line from stdin.
    for raw_id in (archiveresult_ids or sys.stdin):
        # Lines read from stdin keep their trailing newline, which would never
        # match a stored ID; strip it and skip blank lines.
        archiveresult_id = str(raw_id).strip()
        if not archiveresult_id:
            continue

        print(f'Extracting {archiveresult_id}...')
        archiveresult = extract(archiveresult_id)
        print(archiveresult.as_json())
45 |
46 |
# Allow running this file directly as a script: invokes the click command.
if __name__ == '__main__':
    main()
49 |
50 |
--------------------------------------------------------------------------------
/archivebox/parsers/netscape_html.py:
--------------------------------------------------------------------------------
1 | __package__ = 'archivebox.parsers'
2 |
3 |
4 | import re
5 |
6 | from typing import IO, Iterable
7 | from datetime import datetime
8 |
9 | from ..index.schema import Link
10 | from archivebox.misc.util import (
11 | htmldecode,
12 | enforce_types,
13 | )
14 |
15 |
@enforce_types
def parse_netscape_html_export(html_file: IO[str], **_kwargs) -> Iterable[Link]:
    """Parse netscape-format bookmarks export files (produced by all browsers)

    Scans the file line by line and yields one Link for every line holding an
    anchor tag with both an href and an add_date attribute. Lines that do not
    match are ignored.
    """

    html_file.seek(0)
    # Three capture groups are consumed below:
    #   1: the href value, 2: the add_date value (digits, read as epoch seconds),
    #   3: the link text between the opening and closing anchor tags.
    pattern = re.compile(
        r'<a href="(.+?)" add_date="(\d+)"[^>]*>(.+)</a>',
        re.UNICODE | re.IGNORECASE,
    )
    for line in html_file:
        # example line
        # <DT><A HREF="https://example.com/?q=1+2" ADD_DATE="1497562974" LAST_MODIFIED="1497562974">example bookmark title</A>

        match = pattern.search(line)
        if match:
            url = match.group(1)
            time = datetime.fromtimestamp(float(match.group(2)))
            title = match.group(3).strip()

            yield Link(
                url=htmldecode(url),
                timestamp=str(time.timestamp()),
                title=htmldecode(title) or None,
                tags=None,
                sources=[html_file.name],
            )
39 |
40 |
# Module-level metadata describing this parser: a short key, a display name,
# and the parse function itself.
# NOTE(review): presumably read by a parser registry elsewhere -- confirm.
KEY = 'netscape_html'
NAME = 'Netscape HTML'
PARSER = parse_netscape_html_export
44 |
--------------------------------------------------------------------------------
/archivebox/parsers/pinboard_rss.py:
--------------------------------------------------------------------------------
1 | __package__ = 'archivebox.parsers'
2 |
3 |
4 | from typing import IO, Iterable
5 | from time import mktime
6 | from feedparser import parse as feedparser
7 |
8 | from ..index.schema import Link
9 | from archivebox.misc.util import (
10 | htmldecode,
11 | enforce_types
12 | )
13 |
@enforce_types
def parse_pinboard_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
    """Parse Pinboard RSS feed files into links

    Reads the whole file, parses it with feedparser, and yields one Link per
    feed entry that has a URL. Entries without a URL are skipped.
    """

    rss_file.seek(0)
    feed = feedparser(rss_file.read())
    for item in feed.entries:
        # Check for the URL first: yielding a Link with no URL will crash on a
        # URL validation assertion, and a plain `item.link` would raise
        # AttributeError on an entry with no link before any None check ran.
        url = getattr(item, 'link', None)
        if url is None:
            continue

        # title will start with "[priv] " if pin was marked private. useful?
        title = item.title
        time = mktime(item.updated_parsed)

        # all tags are in one entry.tags with spaces in it. annoying!
        try:
            tags = item.tags[0].term.replace(' ', ',')
        except (AttributeError, IndexError):
            # no tags attribute, an empty tags list, or a tag without a term
            tags = ''

        yield Link(
            url=htmldecode(url),
            timestamp=str(time),
            title=htmldecode(title) or None,
            tags=htmldecode(tags) or None,
            sources=[rss_file.name],
        )
44 |
45 |
# Module-level metadata describing this parser: a short key, a display name,
# and the parse function itself.
# NOTE(review): presumably read by a parser registry elsewhere -- confirm.
KEY = 'pinboard_rss'
NAME = 'Pinboard RSS'
PARSER = parse_pinboard_rss_export
49 |
--------------------------------------------------------------------------------
/.github/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contribution Process
2 |
3 | 1. Confirm that your desired features fit into our bigger project goals (see the [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap)).
4 | 2. Open an issue describing your planned implementation so it can be discussed.
5 | 3. Check in with me before starting development to make sure your work won't conflict with or duplicate existing work.
6 | 4. Setup your dev environment, make some changes, and test using the test input files
7 | 5. Commit, push, and submit a PR and wait for review feedback
8 | 6. Have patience, and don't abandon your PR! We love contributors, but we all have day jobs and don't always have time to respond to notifications instantly. If you want a faster response, ping @theSquashSH on Twitter or Patreon.
9 |
10 | **Useful links:**
11 |
12 | - https://github.com/ArchiveBox/ArchiveBox/issues
13 | - https://github.com/ArchiveBox/ArchiveBox/pulls
14 | - https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap
15 | - https://github.com/ArchiveBox/ArchiveBox/wiki/Install#manual-setup
16 |
17 | ### Development Setup
18 |
19 | ```bash
20 | git clone https://github.com/ArchiveBox/ArchiveBox
21 | cd ArchiveBox
22 | # Ideally do this in a virtualenv
23 | pip install -e '.[dev]' # or use: pipenv install --dev
24 | ```
25 |
26 | ### Running Tests
27 |
28 | ```bash
29 | ./bin/lint.sh
30 | ./bin/test.sh
31 | ./bin/build.sh
32 | ```
33 |
34 | For more common tasks see the `Development` section at the bottom of the README.
35 |
36 | ### Getting Help
37 |
38 | Open an issue on GitHub or message me at https://sweeting.me/#contact.
39 |
--------------------------------------------------------------------------------