├── .gitattributes
├── .gitignore
├── dev_requirements.in
├── CHANGELOG.md
├── .github
├── dependabot.yml
└── workflows
│ ├── test.yml
│ └── upload_binaries.yml
├── tests
├── conftest.py
├── utils.py
├── fixtures
│ └── example.com.html
└── test_save_safari_webarchive.py
├── CONTRIBUTING.md
├── dev_requirements.txt
├── LICENSE
├── README.md
└── save_safari_webarchive.swift
/.gitattributes:
--------------------------------------------------------------------------------
1 | tests/fixtures/*.html linguist-generated=true
2 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | *.webarchive
3 | save_safari_webarchive
4 |
--------------------------------------------------------------------------------
/dev_requirements.in:
--------------------------------------------------------------------------------
1 | pytest
2 | pytest-httpserver
3 | pytest-xdist
4 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # CHANGELOG
2 |
3 | ## v1.0.1 - 2024-06-05
4 |
5 | Fix version number returned by `--version`.
6 |
7 | ## v1.0.0 - 2024-05-17
8 |
9 | Initial release.
10 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 | - package-ecosystem: "github-actions"
4 | directory: "/"
5 | schedule:
6 | interval: weekly
7 | day: monday
8 | time: "09:00"
9 | - package-ecosystem: "pip"
10 | directory: "/"
11 | schedule:
12 | interval: weekly
13 | day: monday
14 | time: "09:00"
15 |
--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | import pathlib
2 |
3 | import pytest
4 |
5 |
@pytest.fixture
def out_path(tmp_path: pathlib.Path) -> pathlib.Path:
    """
    Return a path inside pytest's ``tmp_path`` where a test can write
    a webarchive.

    Only the path is built here; the file itself is not created.  Because
    it lives under ``tmp_path``, pytest manages the directory's lifetime.
    """
    return tmp_path / "example.webarchive"
14 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # CONTRIBUTING
2 |
3 | ## Creating a new release
4 |
5 | 1. Bump the version number in `save_safari_webarchive.swift`
6 | 2. Add a changelog entry in `CHANGELOG.md`
7 | 3. Create a Git tag with your new version number
8 | 4. Push your changes and Git tag to GitHub
9 |
10 | GitHub Actions will create a new release, including compiled binaries.
11 |
12 | These binaries aren't notarised -- see https://github.com/alexwlchan/safari-webarchiver/issues/6
13 |
--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
1 | name: Run tests
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 |
8 | pull_request:
9 | branches:
10 | - main
11 |
12 | jobs:
13 | test:
14 | runs-on: macos-latest
15 |
16 | steps:
17 | - uses: actions/checkout@v4
18 |
19 | - name: Set up Python
20 | uses: actions/setup-python@v5
21 | with:
22 | python-version: "3.12"
23 | cache: 'pip'
24 | cache-dependency-path: 'dev_requirements.txt'
25 |
26 | - name: Install dependencies
27 | run: pip install -r dev_requirements.txt
28 |
29 | - name: Run tests
30 | run: python3 -m pytest -n 5 tests
31 |
--------------------------------------------------------------------------------
/dev_requirements.txt:
--------------------------------------------------------------------------------
1 | # This file was autogenerated by uv via the following command:
2 | # uv pip compile dev_requirements.in --output-file dev_requirements.txt
3 | exceptiongroup==1.2.2
4 | # via pytest
5 | execnet==2.1.1
6 | # via pytest-xdist
7 | iniconfig==2.0.0
8 | # via pytest
9 | markupsafe==2.1.5
10 | # via werkzeug
11 | packaging==24.0
12 | # via pytest
13 | pluggy==1.5.0
14 | # via pytest
15 | pygments==2.19.1
16 | # via pytest
17 | pytest==8.4.1
18 | # via
19 | # -r dev_requirements.in
20 | # pytest-xdist
21 | pytest-httpserver==1.1.3
22 | # via -r dev_requirements.in
23 | pytest-xdist==3.8.0
24 | # via -r dev_requirements.in
25 | tomli==2.2.1
26 | # via pytest
27 | werkzeug==3.0.3
28 | # via pytest-httpserver
29 |
--------------------------------------------------------------------------------
/tests/utils.py:
--------------------------------------------------------------------------------
1 | import pathlib
2 | import subprocess
3 | import typing
4 |
5 |
6 | class CommandOutput(typing.TypedDict):
7 | returncode: int
8 | stdout: str | None
9 | stderr: str | None
10 |
11 |
def save_safari_webarchive(
    argv: typing.Sequence[str | pathlib.Path],
) -> CommandOutput:
    """
    Run the ``save_safari_webarchive.swift`` script and return the result.

    ``argv`` holds the arguments to pass to the script; each one is
    converted with ``str()``.  It is typed as a ``Sequence`` rather than a
    ``list`` so that callers may pass a plain ``list[str]`` without
    tripping over list invariance in type checkers.

    The script is referenced by a relative path, so this must be called
    with the repository root as the current working directory.

    Returns the exit code together with stdout and stderr decoded as
    UTF-8; a stream that produced no output is reported as ``None``.
    """
    cmd = ["swift", "save_safari_webarchive.swift", *(str(av) for av in argv)]

    # A non-zero exit code is an expected outcome in these tests, so it is
    # returned to the caller instead of being raised as an exception.
    proc = subprocess.run(cmd, capture_output=True, check=False)

    stdout = proc.stdout.decode("utf8")
    stderr = proc.stderr.decode("utf8")

    return CommandOutput(
        returncode=proc.returncode,
        stdout=stdout or None,
        stderr=stderr or None,
    )
36 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2024 Alex Chan
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a
4 | copy of this software and associated documentation files (the "Software"),
5 | to deal in the Software without restriction, including without limitation
6 | the rights to use, copy, modify, merge, publish, distribute, sublicense,
7 | and/or sell copies of the Software, and to permit persons to whom the Software
8 | is furnished to do so, subject to the following conditions:
9 |
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
16 | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
17 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
18 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
19 | OTHER DEALINGS IN THE SOFTWARE.
20 |
--------------------------------------------------------------------------------
/tests/fixtures/example.com.html:
--------------------------------------------------------------------------------
1 |
2 | Example Domain
3 |
4 |
5 |
6 |
7 |
34 |
35 |
36 |
37 |
38 |
Example Domain
39 |
This domain is for use in illustrative examples in documents. You may use this
40 | domain in literature without prior coordination or asking for permission.
41 |
More information...
42 |
43 |
44 |
45 |
--------------------------------------------------------------------------------
/.github/workflows/upload_binaries.yml:
--------------------------------------------------------------------------------
1 | name: Upload binaries
2 |
3 | on:
4 | push:
5 | tags:
6 | - v[0-9]+.*
7 |
8 | jobs:
9 | create-release:
10 | runs-on: ubuntu-latest
11 | permissions:
12 | contents: write
13 | steps:
14 | - uses: actions/checkout@v4
15 | - uses: taiki-e/create-gh-release-action@v1
16 | with:
17 | changelog: CHANGELOG.md
18 | env:
19 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
20 |
21 | upload-assets:
22 | strategy:
23 | matrix:
24 |
25 | # See https://docs.github.com/en/actions/using-github-hosted-runners/about-github-hosted-runners/about-github-hosted-runners#standard-github-hosted-runners-for-public-repositories
26 | include:
27 | - target: aarch64-apple-darwin
28 | os: macos-latest
29 | - target: x86_64-apple-darwin
30 | os: macos-13
31 |
32 | runs-on: ${{ matrix.os }}
33 |
34 | permissions:
35 | contents: write
36 |
37 | steps:
38 | - uses: actions/checkout@v4
39 |
40 | - name: Compile the Swift script
41 | run: swiftc save_safari_webarchive.swift
42 |
43 | - name: Create the zip archive
44 | run: zip "save_safari_webarchive.${{ matrix.target }}.zip" save_safari_webarchive README.md LICENSE
45 |
46 | - name: Push the zip file to the GitHub release
47 | run: gh release upload "$GITHUB_REF_NAME" "save_safari_webarchive.${{ matrix.target }}.zip" --clobber
48 | env:
49 | GH_TOKEN: ${{ github.token }}
50 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # safari-webarchiver
2 |
3 | This tool creates [Safari webarchive files](https://en.wikipedia.org/wiki/Webarchive) on the command line.
4 | This gives you an offline archive of web pages, which can be stored and backed up independently of any cloud services.
5 |
6 | ```console
7 | $ save_safari_webarchive "https://example.com" "example.webarchive"
8 | ```
9 |
10 | These archives are the same as those created by the `File > Save As…` menu item, but now you can create them programmatically and in bulk.
11 |
12 | ## How it works
13 |
14 | It opens the given URL in a `WKWebView`, calls `createWebArchiveData` to create a webarchive file, and saves it to disk.
15 | That's the core workflow, and then there's a bunch of error handling around that.
16 |
17 | For a more detailed explanation of this code, see <https://alexwlchan.net/2024/creating-a-safari-webarchive/>.
18 |
19 | ## Installation
20 |
21 | ### Install from source
22 |
23 | 1. Install the Xcode Command Line Tools
24 | 2. Download the `save_safari_webarchive.swift` script from this repo
25 | 3. Compile the script into a binary:
26 |
27 | ```console
28 | $ swiftc save_safari_webarchive.swift
29 | ```
30 |
31 | 4. Copy the compiled binary `save_safari_webarchive` to somewhere in your PATH.
32 |
33 | ### Install a compiled binary
34 |
35 | 1. Find the latest [GitHub release](https://github.com/alexwlchan/safari-webarchiver/releases)
36 | 2. Download the zip file which is appropriate for your system (Intel = `x86_64`, Apple Silicon = `aarch64`)
37 | 3. Open the zip file, and add the `save_safari_webarchive` app to your PATH
38 |
39 | The app is just a compiled version of the Swift script.
40 | It isn't notarised, so when you run it, you may get a warning that this app is from an unidentified developer.
41 | You can get around this by right-clicking the app icon in Finder, and choosing `Open` from the shortcut menu.
42 |
43 | ## Usage
44 |
45 | Run the script passing two arguments: the URL you want to archive, and the path where you want to save the webarchive file.
46 |
47 | For example, this command will save the URL to this GitHub repo to `safari-webarchiver.webarchive`:
48 |
49 | ```console
50 | $ save_safari_webarchive "https://github.com/alexwlchan/safari-webarchiver" "safari-webarchiver.webarchive"
51 | ```
52 |
53 | ## Acknowledgements
54 |
55 | This is partially inspired by [a similar script](https://github.com/newzealandpaul/webarchiver) written in 2008 by newzealandpaul.
56 |
--------------------------------------------------------------------------------
/save_safari_webarchive.swift:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env swift
2 | /// Save a web page as a Safari webarchive.
3 | ///
4 | /// Usage: save_safari_webarchive [URL] [OUTPUT_PATH]
5 | ///
6 | /// This will save the page to the desired file, but may fail for
7 | /// several reasons:
8 | ///
9 | /// - the web page can't be loaded
10 | /// - the web page returns a non-200 status code
11 | /// - there's already a file at that path (it won't overwrite an existing
12 | /// webarchive)
13 | ///
14 | /// For a detailed explanation of the code in this script, see
15 | /// https://alexwlchan.net/2024/creating-a-safari-webarchive/
16 | ///
17 | /// The canonical copy of this script lives in GitHub, see
18 | /// https://github.com/alexwlchan/safari-webarchiver
19 |
20 | import WebKit
21 |
22 | let SCRIPT_VERSION = "1.0.1"
23 |
/// A navigation delegate that treats every page-load problem as fatal:
/// it writes a "Failed to load …" message to stderr and exits with
/// status 1.
class ExitOnFailureDelegate: NSObject, WKNavigationDelegate {
  var urlString: String

  init(_ urlString: String) {
    self.urlString = urlString
  }

  /// Report why `urlString` could not be loaded, then terminate.
  private func exitWithFailure(_ reason: String) -> Never {
    fputs("Failed to load \(self.urlString): \(reason)\n", stderr)
    exit(1)
  }

  /// Called when a navigation fails.
  func webView(
    _: WKWebView,
    didFail: WKNavigation!,
    withError error: Error
  ) {
    exitWithFailure(error.localizedDescription)
  }

  /// Called when a provisional navigation fails.
  func webView(
    _: WKWebView,
    didFailProvisionalNavigation: WKNavigation!,
    withError error: Error
  ) {
    exitWithFailure(error.localizedDescription)
  }

  /// Inspect the response before it is accepted: any HTTP status code
  /// other than 200 is a failure.  Non-HTTP responses are allowed through.
  func webView(
    _: WKWebView,
    decidePolicyFor navigationResponse: WKNavigationResponse,
    decisionHandler: (WKNavigationResponsePolicy) -> Void
  ) {
    if let httpUrlResponse = navigationResponse.response as? HTTPURLResponse,
       httpUrlResponse.statusCode != 200 {
      exitWithFailure("got status code \(httpUrlResponse.statusCode)")
    }

    decisionHandler(.allow)
  }
}
66 |
extension WKWebView {

  /// Load the given URL in the web view.
  ///
  /// This method will block until the URL has finished loading.  If the
  /// string can't be parsed as a URL, or the load fails, an error is
  /// printed to stderr and the process exits with status 1.
  func load(_ urlString: String) {
    guard let url = URL(string: urlString) else {
      fputs("Unable to use \(urlString) as a URL\n", stderr)
      exit(1)
    }

    // Attach the delegate to *this* web view, not to a global instance.
    let delegate = ExitOnFailureDelegate(urlString)
    self.navigationDelegate = delegate

    // `navigationDelegate` is a weak reference, so the local constant is
    // the only thing keeping the delegate alive.  Pin its lifetime for
    // the whole of the load so failures are always reported.
    withExtendedLifetime(delegate) {
      self.load(URLRequest(url: url))

      // Pump the main run loop in short slices until loading is done.
      while self.isLoading {
        RunLoop.main.run(until: Date(timeIntervalSinceNow: 0.1))
      }
    }
  }

  /// Save a copy of the web view's contents as a webarchive file.
  ///
  /// This method will block until the webarchive has been saved.
  /// If creating or writing the archive fails (including when a file
  /// already exists at `savePath`, which is never overwritten), an error
  /// is printed to stderr and the process exits with status 1.
  func saveAsWebArchive(savePath: URL) {
    var isSaving = true

    self.createWebArchiveData(completionHandler: { result in
      do {
        let data = try result.get()
        try data.write(
          to: savePath,
          options: [Data.WritingOptions.withoutOverwriting]
        )
        isSaving = false
      } catch {
        fputs("Unable to save webarchive file: \(error.localizedDescription)\n", stderr)
        exit(1)
      }
    })

    // The completion handler flips `isSaving`; keep the run loop turning
    // until it has done so.
    while isSaving {
      RunLoop.main.run(until: Date(timeIntervalSinceNow: 0.1))
    }
  }
}
115 |
// Everything after the script/binary name.
let positionalArguments = Array(CommandLine.arguments.dropFirst())

// `--version` on its own prints "<name> <version>" and stops.
if positionalArguments == ["--version"] {
  let filename = (CommandLine.arguments[0] as NSString).lastPathComponent
  print("\(filename) \(SCRIPT_VERSION)")
  exit(0)
}

// Otherwise exactly two arguments are required: the URL and the output path.
guard positionalArguments.count == 2 else {
  fputs("Usage: \(CommandLine.arguments[0]) \n", stderr)
  exit(1)
}

let urlString = positionalArguments[0]
let savePath = URL(fileURLWithPath: positionalArguments[1])

let webView = WKWebView()

// Both calls block until they finish, and exit the process on failure.
webView.load(urlString)
webView.saveAsWebArchive(savePath: savePath)

print("Saved webarchive to \(savePath)")
136 |
--------------------------------------------------------------------------------
/tests/test_save_safari_webarchive.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import os
4 | import pathlib
5 | import plistlib
6 | import re
7 |
8 | import pytest
9 | from pytest_httpserver import HTTPServer
10 |
11 | from utils import save_safari_webarchive
12 |
13 |
def test_creates_a_single_archive(out_path: pathlib.Path) -> None:
    """
    Archiving https://example.com succeeds and writes a webarchive whose
    main resource matches the stored HTML fixture.
    """
    result = save_safari_webarchive(["https://example.com", out_path])

    assert result["returncode"] == 0
    assert result["stdout"] is not None
    assert result["stderr"] is None
    assert out_path.exists()

    # The archive is read back as a property list.
    webarchive = plistlib.loads(out_path.read_bytes())
    main_resource = webarchive["WebMainResource"]

    # ``read_bytes`` opens and closes the fixture, so no file handle is
    # left dangling.  The path is relative to the repository root.
    expected_html = pathlib.Path("tests/fixtures/example.com.html").read_bytes()

    assert main_resource["WebResourceURL"] == "https://example.com/"
    assert main_resource["WebResourceData"] == expected_html
32 |
33 |
def test_does_not_overwrite_existing_archive(out_path: pathlib.Path) -> None:
    """
    If a file already exists at the output path, the script fails with an
    error and leaves the existing file untouched.
    """
    existing_text = "This should still be here later"
    out_path.write_text(existing_text)

    result = save_safari_webarchive(["https://example.com", out_path])

    # The error message names the file and its containing folder.  Both are
    # taken from ``out_path`` so the test doesn't depend on the exact name
    # pytest picks for the temporary directory.
    assert result == {
        "returncode": 1,
        "stdout": None,
        "stderr": (
            "Unable to save webarchive file: "
            f"The file “{out_path.name}” couldn’t be saved in the folder "
            f"“{out_path.parent.name}” because a file with "
            "the same name already exists.\n"
        ),
    }

    assert out_path.read_text() == existing_text
51 |
52 |
@pytest.mark.parametrize(
    "argv",
    [
        [],
        ["https://example.com"],
        ["https://example.com", "example.webarchive", "--debug"],
    ],
    ids=["no_arguments", "not_enough_arguments", "too_many_arguments"],
)
def test_it_fails_if_you_supply_the_wrong_arguments(argv: list[str]) -> None:
    """
    Any argument count other than two makes the script print its usage
    line to stderr and exit with status 1.
    """
    expected = {
        "returncode": 1,
        "stdout": None,
        "stderr": "Usage: save_safari_webarchive.swift \n",
    }

    assert save_safari_webarchive(argv) == expected
72 |
73 |
@pytest.mark.parametrize("status_code", ["403", "404", "410", "500"])
def test_it_fails_if_non_200_status_code(
    httpserver: HTTPServer, status_code: str, out_path: pathlib.Path
) -> None:
    """
    When the server answers with an error status, the script reports that
    status on stderr, exits with status 1 and writes no archive.
    """
    # Serve a plain-text body with the status code under test.
    error_handler = httpserver.expect_request("/error")
    error_handler.respond_with_data(
        "Boom!", status=int(status_code), content_type="text/plain"
    )
    url = f"http://localhost:{httpserver.port}/error"

    outcome = save_safari_webarchive([url, out_path])

    expected = {
        "returncode": 1,
        "stdout": None,
        "stderr": f"Failed to load {url}: got status code {status_code}\n",
    }
    assert outcome == expected

    assert not out_path.exists()
92 |
93 |
def test_it_fails_if_cannot_load_domain(out_path: pathlib.Path) -> None:
    """
    A hostname that cannot be resolved makes the script report the failure
    on stderr, exit with status 1 and write no archive.
    """
    # ``.invalid`` is a reserved top-level domain, so this name cannot be
    # registered later and start resolving.
    # NOTE(review): confirm the error text is unchanged on the CI runner.
    url = "https://doesnotexist.invalid/"

    result = save_safari_webarchive([url, out_path])

    assert result == {
        "returncode": 1,
        "stdout": None,
        "stderr": f"Failed to load {url}: A server with the specified hostname could not be found.\n",
    }

    assert not out_path.exists()
104 |
105 |
# This test is skipped in GitHub Actions.  When it runs there, a warning
# is written to stderr but the archive is saved correctly:
#
#     CFURLCopyResourcePropertyForKey failed because it was passed a URL which
#     has no scheme
#
# It passes locally, so it stays enabled outside CI until that is
# investigated.
@pytest.mark.skipif(
    os.environ.get("CI") == "true",
    reason="This test doesn’t work correctly in GitHub Actions",
)
def test_it_fails_if_url_is_invalid(out_path: pathlib.Path) -> None:
    """
    A string that can't be used as a URL makes the script print an error,
    exit with status 1 and write no archive.
    """
    outcome = save_safari_webarchive([">", out_path])

    expected = {
        "returncode": 1,
        "stdout": None,
        "stderr": "Unable to use > as a URL\n",
    }
    assert outcome == expected

    assert not out_path.exists()
127 |
128 |
def test_prints_the_version() -> None:
    """
    ``--version`` prints ``save_safari_webarchive.swift X.Y.Z`` followed by
    a single newline on stdout, and exits successfully.
    """
    result = save_safari_webarchive(["--version"])

    assert result["returncode"] == 0
    assert result["stderr"] is None

    # Check for missing output explicitly so the failure is a clear
    # assertion rather than a TypeError from the regex call.
    stdout = result["stdout"]
    assert stdout is not None

    # The dot in the filename is escaped so it only matches a literal ".",
    # and ``fullmatch`` rejects any extra leading or trailing output.
    assert re.fullmatch(
        r"save_safari_webarchive\.swift [0-9]+\.[0-9]+\.[0-9]+\n", stdout
    ), stdout
137 |
--------------------------------------------------------------------------------