├── .gitattributes ├── .gitignore ├── dev_requirements.in ├── CHANGELOG.md ├── .github ├── dependabot.yml └── workflows │ ├── test.yml │ └── upload_binaries.yml ├── tests ├── conftest.py ├── utils.py ├── fixtures │ └── example.com.html └── test_save_safari_webarchive.py ├── CONTRIBUTING.md ├── dev_requirements.txt ├── LICENSE ├── README.md └── save_safari_webarchive.swift /.gitattributes: -------------------------------------------------------------------------------- 1 | tests/fixtures/*.html linguist-generated=true 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.webarchive 3 | save_safari_webarchive 4 | -------------------------------------------------------------------------------- /dev_requirements.in: -------------------------------------------------------------------------------- 1 | pytest 2 | pytest-httpserver 3 | pytest-xdist 4 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # CHANGELOG 2 | 3 | ## v1.0.1 - 2024-06-05 4 | 5 | Fix version number returned by `--version`. 6 | 7 | ## v1.0.0 - 2024-05-17 8 | 9 | Initial release. 
10 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "github-actions" 4 | directory: "/" 5 | schedule: 6 | interval: weekly 7 | day: monday 8 | time: "09:00" 9 | - package-ecosystem: "pip" 10 | directory: "/" 11 | schedule: 12 | interval: weekly 13 | day: monday 14 | time: "09:00" 15 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | import pytest 4 | 5 | 6 | @pytest.fixture 7 | def out_path(tmp_path: pathlib.Path) -> None: 8 | """ 9 | Returns a temporary path where we can write a webarchive. 10 | 11 | Any files written to this path will be cleaned up at the end of the test. 12 | """ 13 | return tmp_path / "example.webarchive" 14 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # CONTRIBUTING 2 | 3 | ## Creating a new release 4 | 5 | 1. Bump the version number in `save_safari_webarchive.swift` 6 | 2. Add a changelog entry in `CHANGELOG.md` 7 | 3. Create a Git tag with your new version number 8 | 4. Push your changes and Git tag to GitHub 9 | 10 | GitHub Actions will create a new release, including compiled binaries. 
11 | 12 | These binaries aren't notarised -- see https://github.com/alexwlchan/safari-webarchiver/issues/6 13 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Run tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | pull_request: 9 | branches: 10 | - main 11 | 12 | jobs: 13 | test: 14 | runs-on: macos-latest 15 | 16 | steps: 17 | - uses: actions/checkout@v4 18 | 19 | - name: Set up Python 20 | uses: actions/setup-python@v5 21 | with: 22 | python-version: "3.12" 23 | cache: 'pip' 24 | cache-dependency-path: 'dev_requirements.txt' 25 | 26 | - name: Install dependencies 27 | run: pip install -r dev_requirements.txt 28 | 29 | - name: Run tests 30 | run: python3 -m pytest -n 5 tests 31 | -------------------------------------------------------------------------------- /dev_requirements.txt: -------------------------------------------------------------------------------- 1 | # This file was autogenerated by uv via the following command: 2 | # uv pip compile dev_requirements.in --output-file dev_requirements.txt 3 | exceptiongroup==1.2.2 4 | # via pytest 5 | execnet==2.1.1 6 | # via pytest-xdist 7 | iniconfig==2.0.0 8 | # via pytest 9 | markupsafe==2.1.5 10 | # via werkzeug 11 | packaging==24.0 12 | # via pytest 13 | pluggy==1.5.0 14 | # via pytest 15 | pygments==2.19.1 16 | # via pytest 17 | pytest==8.4.1 18 | # via 19 | # -r dev_requirements.in 20 | # pytest-xdist 21 | pytest-httpserver==1.1.3 22 | # via -r dev_requirements.in 23 | pytest-xdist==3.8.0 24 | # via -r dev_requirements.in 25 | tomli==2.2.1 26 | # via pytest 27 | werkzeug==3.0.3 28 | # via pytest-httpserver 29 | -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import subprocess 3 | import typing 4 | 
5 | 6 | class CommandOutput(typing.TypedDict): 7 | returncode: int 8 | stdout: str | None 9 | stderr: str | None 10 | 11 | 12 | def save_safari_webarchive(argv: list[str | pathlib.Path]) -> CommandOutput: 13 | """ 14 | Run the ``save_safari_webarchive.swift`` script and return the result. 15 | """ 16 | cmd = ["swift", "save_safari_webarchive.swift"] + [str(av) for av in argv] 17 | 18 | proc = subprocess.Popen( 19 | cmd, 20 | stdout=subprocess.PIPE, 21 | stderr=subprocess.PIPE, 22 | ) 23 | stdout, stderr = proc.communicate() 24 | 25 | if stdout is not None: 26 | stdout = stdout.decode("utf8") 27 | 28 | if stderr is not None: 29 | stderr = stderr.decode("utf8") 30 | 31 | return CommandOutput( 32 | returncode=proc.returncode, 33 | stdout=stdout or None, 34 | stderr=stderr or None, 35 | ) 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2024 Alex Chan 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a 4 | copy of this software and associated documentation files (the "Software"), 5 | to deal in the Software without restriction, including without limitation 6 | the rights to use, copy, modify, merge, publish, distribute, sublicense, 7 | and/or sell copies of the Software, and to permit persons to whom the Software 8 | is furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL 16 | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR 17 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 18 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 19 | OTHER DEALINGS IN THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /tests/fixtures/example.com.html: -------------------------------------------------------------------------------- 1 | 2 | Example Domain 3 | 4 | 5 | 6 | 7 | 34 | 35 | 36 | 37 |
38 |

Example Domain

39 |

This domain is for use in illustrative examples in documents. You may use this 40 | domain in literature without prior coordination or asking for permission.

41 |

More information...

42 |
43 | 44 | 45 | -------------------------------------------------------------------------------- /.github/workflows/upload_binaries.yml: -------------------------------------------------------------------------------- 1 | name: Upload binaries 2 | 3 | on: 4 | push: 5 | tags: 6 | - v[0-9]+.* 7 | 8 | jobs: 9 | create-release: 10 | runs-on: ubuntu-latest 11 | permissions: 12 | contents: write 13 | steps: 14 | - uses: actions/checkout@v4 15 | - uses: taiki-e/create-gh-release-action@v1 16 | with: 17 | changelog: CHANGELOG.md 18 | env: 19 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 20 | 21 | upload-assets: 22 | strategy: 23 | matrix: 24 | 25 | # See https://docs.github.com/en/actions/using-github-hosted-runners/about-github-hosted-runners/about-github-hosted-runners#standard-github-hosted-runners-for-public-repositories 26 | include: 27 | - target: aarch64-apple-darwin 28 | os: macos-latest 29 | - target: x86_64-apple-darwin 30 | os: macos-13 31 | 32 | runs-on: ${{ matrix.os }} 33 | 34 | permissions: 35 | contents: write 36 | 37 | steps: 38 | - uses: actions/checkout@v4 39 | 40 | - name: Compile the Swift script 41 | run: swiftc save_safari_webarchive.swift 42 | 43 | - name: Create the zip archive 44 | run: zip "save_safari_webarchive.${{ matrix.target }}.zip" save_safari_webarchive README.md LICENSE 45 | 46 | - name: Push the zip file to the GitHub release 47 | run: gh release upload "$GITHUB_REF_NAME" "save_safari_webarchive.${{ matrix.target }}.zip" --clobber 48 | env: 49 | GH_TOKEN: ${{ github.token }} 50 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # safari-webarchiver 2 | 3 | This tool creates [Safari webarchive files](https://en.wikipedia.org/wiki/Webarchive) on the command line. 4 | This gives you an offline archive of web pages, which can be stored and backed up independently of any cloud services. 
5 | 6 | ```console 7 | $ save_safari_webarchive "https://example.com" "example.webarchive" 8 | ``` 9 | 10 | These archives are the same as those created by the `File > Save As…` menu item, but now you can create them programmatically and in bulk. 11 | 12 | ## How it works 13 | 14 | It opens the given URL in a `WKWebView`, calls `createWebArchiveData` to create a webarchive file, and saves it to disk. 15 | That's the core workflow, and then there's a bunch of error handling around that. 16 | 17 | For a more detailed explanation of this code, see <https://alexwlchan.net/2024/creating-a-safari-webarchive/> 18 | 19 | ## Installation 20 | 21 | ### Install from source 22 | 23 | 1. Install the Xcode Command Line Tools 24 | 2. Download the `save_safari_webarchive.swift` script from this repo 25 | 3. Compile the script into a binary: 26 | 27 | ```console 28 | $ swiftc save_safari_webarchive.swift 29 | ``` 30 | 31 | 4. Copy the compiled binary `save_safari_webarchive` to somewhere in your PATH. 32 | 33 | ### Install a compiled binary 34 | 35 | 1. Find the latest [GitHub release](https://github.com/alexwlchan/safari-webarchiver/releases) 36 | 2. Download the zip file which is appropriate for your system (Intel = `x86_64`, Apple Silicon = `aarch64`) 37 | 3. Open the zip file, and add the `save_safari_webarchive` app to your PATH 38 | 39 | The app is just a compiled version of the Swift script. 40 | It isn't notarised, so when you run it, you may get a warning that this app is from an unidentified developer. 41 | You can get around this by right-clicking the app icon in Finder, and choosing `Open` from the shortcut menu. 42 | 43 | ## Usage 44 | 45 | Run the script passing two arguments: the URL you want to archive, and the path where you want to save the webarchive file. 
46 | 47 | For example, this command will save the URL to this GitHub repo to `safari-webarchiver.webarchive`: 48 | 49 | ```console 50 | $ save_safari_webarchive "https://github.com/alexwlchan/safari-webarchiver" "safari-webarchiver.webarchive" 51 | ``` 52 | 53 | ## Acknowledgements 54 | 55 | This is partially inspired by [a similar script](https://github.com/newzealandpaul/webarchiver) written in 2008 by newzealandpaul. 56 | -------------------------------------------------------------------------------- /save_safari_webarchive.swift: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env swift 2 | /// Save a web page as a Safari webarchive. 3 | /// 4 | /// Usage: save_safari_webarchive [URL] [OUTPUT_PATH] 5 | /// 6 | /// This will save the page to the desired file, but may fail for 7 | /// several reasons: 8 | /// 9 | /// - the web page can't be loaded 10 | /// - the web page returns a non-200 status code 11 | /// - there's already a file at that path (it won't overwrite an existing 12 | /// webarchive) 13 | /// 14 | /// For a detailed explanation of the code in this script, see 15 | /// https://alexwlchan.net/2024/creating-a-safari-webarchive/ 16 | /// 17 | /// The canonical copy of this script lives in GitHub, see 18 | /// https://github.com/alexwlchan/safari-webarchiver 19 | 20 | import WebKit 21 | 22 | let SCRIPT_VERSION = "1.0.1" 23 | 24 | /// Print an error message and terminate the process if there are 25 | /// any errors while loading a page. 
26 | class ExitOnFailureDelegate: NSObject, WKNavigationDelegate { 27 | var urlString: String 28 | 29 | init(_ urlString: String) { 30 | self.urlString = urlString 31 | } 32 | 33 | func webView( 34 | _: WKWebView, 35 | didFail: WKNavigation!, 36 | withError error: Error 37 | ) { 38 | fputs("Failed to load \(self.urlString): \(error.localizedDescription)\n", stderr) 39 | exit(1) 40 | } 41 | 42 | func webView( 43 | _: WKWebView, 44 | didFailProvisionalNavigation: WKNavigation!, 45 | withError error: Error 46 | ) { 47 | fputs("Failed to load \(self.urlString): \(error.localizedDescription)\n", stderr) 48 | exit(1) 49 | } 50 | 51 | func webView( 52 | _: WKWebView, 53 | decidePolicyFor navigationResponse: WKNavigationResponse, 54 | decisionHandler: (WKNavigationResponsePolicy) -> Void 55 | ) { 56 | if let httpUrlResponse = (navigationResponse.response as? HTTPURLResponse) { 57 | if httpUrlResponse.statusCode != 200 { 58 | fputs("Failed to load \(self.urlString): got status code \(httpUrlResponse.statusCode)\n", stderr) 59 | exit(1) 60 | } 61 | } 62 | 63 | decisionHandler(.allow) 64 | } 65 | } 66 | 67 | extension WKWebView { 68 | 69 | /// Load the given URL in the web view. 70 | /// 71 | /// This method will block until the URL has finished loading. 72 | func load(_ urlString: String) { 73 | let delegate = ExitOnFailureDelegate(urlString) 74 | self.navigationDelegate = delegate 75 | 76 | if let url = URL(string: urlString) { 77 | let request = URLRequest(url: url) 78 | self.load(request) 79 | 80 | while (self.isLoading) { 81 | RunLoop.main.run(until: Date(timeIntervalSinceNow: 0.1)) 82 | } 83 | } else { 84 | fputs("Unable to use \(urlString) as a URL\n", stderr) 85 | exit(1) 86 | } 87 | } 88 | 89 | /// Save a copy of the web view's contents as a webarchive file. 90 | /// 91 | /// This method will block until the webarchive has been saved, 92 | /// or the save has failed for some reason. 
93 | func saveAsWebArchive(savePath: URL) { 94 | var isSaving = true 95 | 96 | self.createWebArchiveData(completionHandler: { result in 97 | do { 98 | let data = try result.get() 99 | try data.write( 100 | to: savePath, 101 | options: [Data.WritingOptions.withoutOverwriting] 102 | ) 103 | isSaving = false 104 | } catch { 105 | fputs("Unable to save webarchive file: \(error.localizedDescription)\n", stderr) 106 | exit(1) 107 | } 108 | }) 109 | 110 | while (isSaving) { 111 | RunLoop.main.run(until: Date(timeIntervalSinceNow: 0.1)) 112 | } 113 | } 114 | } 115 | 116 | if CommandLine.arguments.count == 2 && CommandLine.arguments[1] == "--version" { 117 | let filename = (CommandLine.arguments[0] as NSString).lastPathComponent 118 | print("\(filename) \(SCRIPT_VERSION)") 119 | exit(0) 120 | } 121 | 122 | guard CommandLine.arguments.count == 3 else { 123 | fputs("Usage: \(CommandLine.arguments[0]) \n", stderr) 124 | exit(1) 125 | } 126 | 127 | let urlString = CommandLine.arguments[1] 128 | let savePath = URL(fileURLWithPath: CommandLine.arguments[2]) 129 | 130 | let webView = WKWebView() 131 | 132 | webView.load(urlString) 133 | webView.saveAsWebArchive(savePath: savePath) 134 | 135 | print("Saved webarchive to \(savePath)") 136 | -------------------------------------------------------------------------------- /tests/test_save_safari_webarchive.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import pathlib 5 | import plistlib 6 | import re 7 | 8 | import pytest 9 | from pytest_httpserver import HTTPServer 10 | 11 | from utils import save_safari_webarchive 12 | 13 | 14 | def test_creates_a_single_archive(out_path: pathlib.Path) -> None: 15 | result = save_safari_webarchive(["https://example.com", out_path]) 16 | 17 | assert result["returncode"] == 0 18 | assert result["stdout"] is not None 19 | assert result["stderr"] is None 20 | assert out_path.exists() 21 | 22 | with open(out_path, 
"rb") as in_file: 23 | webarchive = plistlib.load(in_file) 24 | 25 | main_resource = webarchive["WebMainResource"] 26 | 27 | assert main_resource["WebResourceURL"] == "https://example.com/" 28 | assert ( 29 | main_resource["WebResourceData"] 30 | == open("tests/fixtures/example.com.html", "rb").read() 31 | ) 32 | 33 | 34 | def test_does_not_overwrite_existing_archive(out_path: pathlib.Path) -> None: 35 | out_path.write_text("This should still be here later") 36 | 37 | result = save_safari_webarchive(["https://example.com", out_path]) 38 | 39 | assert result == { 40 | "returncode": 1, 41 | "stdout": None, 42 | "stderr": ( 43 | "Unable to save webarchive file: " 44 | "The file “example.webarchive” couldn’t be saved in the folder " 45 | "“test_does_not_overwrite_existi0” because a file with " 46 | "the same name already exists.\n" 47 | ), 48 | } 49 | 50 | assert out_path.read_text() == "This should still be here later" 51 | 52 | 53 | @pytest.mark.parametrize( 54 | "argv", 55 | [ 56 | pytest.param([], id="no_arguments"), 57 | pytest.param(["https://example.com"], id="not_enough_arguments"), 58 | pytest.param( 59 | ["https://example.com", "example.webarchive", "--debug"], 60 | id="too_many_arguments", 61 | ), 62 | ], 63 | ) 64 | def test_it_fails_if_you_supply_the_wrong_arguments(argv: list[str]) -> None: 65 | result = save_safari_webarchive(argv) 66 | 67 | assert result == { 68 | "returncode": 1, 69 | "stdout": None, 70 | "stderr": "Usage: save_safari_webarchive.swift \n", 71 | } 72 | 73 | 74 | @pytest.mark.parametrize("status_code", ["403", "404", "410", "500"]) 75 | def test_it_fails_if_non_200_status_code( 76 | httpserver: HTTPServer, status_code: str, out_path: pathlib.Path 77 | ) -> None: 78 | httpserver.expect_request("/error").respond_with_data( 79 | "Boom!", status=int(status_code), content_type="text/plain" 80 | ) 81 | url = f"http://localhost:{httpserver.port}/error" 82 | 83 | result = save_safari_webarchive([url, out_path]) 84 | 85 | assert result == { 86 | 
"returncode": 1, 87 | "stdout": None, 88 | "stderr": f"Failed to load {url}: got status code {status_code}\n", 89 | } 90 | 91 | assert not out_path.exists() 92 | 93 | 94 | def test_it_fails_if_cannot_load_domain(out_path: pathlib.Path) -> None: 95 | result = save_safari_webarchive(["https://doesnotexist.tk/", out_path]) 96 | 97 | assert result == { 98 | "returncode": 1, 99 | "stdout": None, 100 | "stderr": "Failed to load https://doesnotexist.tk/: A server with the specified hostname could not be found.\n", 101 | } 102 | 103 | assert not out_path.exists() 104 | 105 | 106 | # If I run this test in GitHub Actions, I get a warning to stderr but 107 | # the archive is saved correctly: 108 | # 109 | # CFURLCopyResourcePropertyForKey failed because it was passed a URL which 110 | # has no scheme 111 | # 112 | # This test passes locally; leave it for now -- I can come back to this. 113 | @pytest.mark.skipif( 114 | os.getenv("CI") == "true", 115 | reason="This test doesn’t work correctly in GitHub Actions", 116 | ) 117 | def test_it_fails_if_url_is_invalid(out_path: pathlib.Path) -> None: 118 | result = save_safari_webarchive([">", out_path]) 119 | 120 | assert result == { 121 | "returncode": 1, 122 | "stdout": None, 123 | "stderr": "Unable to use > as a URL\n", 124 | } 125 | 126 | assert not out_path.exists() 127 | 128 | 129 | def test_prints_the_version() -> None: 130 | result = save_safari_webarchive(["--version"]) 131 | 132 | assert result["returncode"] == 0 133 | assert result["stderr"] is None 134 | assert re.match( 135 | r"^save_safari_webarchive.swift [0-9]+\.[0-9]+\.[0-9]+\n$", result["stdout"] 136 | ), result["stdout"] 137 | --------------------------------------------------------------------------------